In [1]:
import os
import re
import pandas as pd
from pathlib  import Path

from sklearn.model_selection import train_test_split
from datasets import load_dataset, concatenate_datasets

In [2]:
# Walk up from the working directory until the project root is found. The root
# is recognised by its directory name ending with the repository name.
current_dir = Path().resolve()
while not current_dir.name.endswith("xlm-roberta-base-cls-depression"):
    # At the filesystem root `parent` returns the same path, so without this
    # guard the loop would never terminate when run outside the project tree.
    if current_dir.parent == current_dir:
        raise FileNotFoundError(
            "Could not find a parent directory whose name ends with "
            "'xlm-roberta-base-cls-depression'; run this notebook from inside the project."
        )
    current_dir = current_dir.parent

# Make the project root the working directory for the rest of the notebook.
os.chdir(current_dir)

# Inputs: a local pipe-separated CSV and a dataset identifier passed to `load_dataset`.
input_mental_health_texts_data = current_dir / "data/raw/mental_health_texts.csv"
input_sentiment_analysis_dataset_data = 'tyqiangz/multilingual-sentiments'

# Outputs: one CSV per split.
output_train_data = current_dir / "data/clean/train.csv"
output_val_data = current_dir / "data/clean/val.csv"
output_test_data = current_dir / "data/clean/test.csv"

In [3]:
# Languages kept when filtering the multilingual sentiment dataset.
LANGUAGES = [
    "english",
    "german",
    "french",
    "italian",
    "portuguese",
    "spanish",
]

# Dataset splits that are loaded and merged together.
SPLITS = ["train", "validation", "test"]

# Target share of each word-count bucket in the balanced sample.
# Keys are the bucket labels; the shares add up to 1.0.
TEXTS_LENGTHS = dict([
    ("between 3 and 8 words", 0.2),
    ("between 9 and 15 words", 0.1),
    ("between 16 and 35 words", 0.4),
    ("between 36 and 60 words", 0.2),
    ("between 61 and 90 words", 0.1),
])

### utils

In [None]:
def get_text_length_category(text):
    """Map a text to the label of its word-count bucket.

    Words are the whitespace-separated tokens returned by ``text.split()``.

    Args:
        text: The text to classify. Values without a ``split`` method
            (``None``, ``NaN``, numbers) are treated as having no category.

    Returns:
        One of "between 3 and 8 words", "between 9 and 15 words",
        "between 16 and 35 words", "between 36 and 60 words",
        "between 61 and 90 words"; or ``None`` when the word count is
        below 3 or above 90, or when ``text`` is a missing value.
    """
    # This function is mapped over a column before missing values are dropped,
    # so it must tolerate None/NaN instead of raising AttributeError.
    try:
        word_count = len(text.split())
    except AttributeError:
        return None

    # Inclusive (lower, upper) word-count bounds of each bucket.
    buckets = ((3, 8), (9, 15), (16, 35), (36, 60), (61, 90))
    for lower, upper in buckets:
        if lower <= word_count <= upper:
            return f"between {lower} and {upper} words"

    return None

In [None]:
def balance_text_length_proportions(df, proportions):
    """Down-sample ``df`` so its ``text_length`` categories follow ``proportions``.

    The category that is scarcest relative to its requested share fixes the
    overall sample size; every other category is sampled down to match.

    Args:
        df: DataFrame with a ``text_length`` column holding category labels.
        proportions: Mapping from category label to its desired share.
            Categories with a share of 0 are left out of the result.

    Returns:
        A new DataFrame (fresh 0..n-1 index) with the sampled rows grouped by
        category in the order of ``proportions``. An empty frame with the
        columns of ``df`` is returned when there is nothing to sample from.

    Raises:
        KeyError: If ``df`` contains a category that is not in ``proportions``.
    """
    current_counts = df['text_length'].value_counts()

    # The smallest count/share ratio is the largest total size every category
    # can still satisfy. Zero shares are skipped: they would divide by zero and
    # cannot constrain the total anyway.
    limiting_ratio = float('inf')
    for category, count in current_counts.items():
        target_prop = proportions[category]
        if target_prop > 0:
            limiting_ratio = min(limiting_ratio, count / target_prop)

    # No usable category (e.g. empty input): int(share * inf) would raise.
    if limiting_ratio == float('inf'):
        return df.iloc[0:0].reset_index(drop=True)

    balanced_dfs = []
    for category, target_prop in proportions.items():
        category_df = df[df['text_length'] == category]
        # Clamp to the rows available so float rounding can never ask
        # `sample` for more rows than exist.
        target_size = min(int(target_prop * limiting_ratio), len(category_df))
        if target_size <= 0:
            continue
        # Fixed seed keeps the sample reproducible across runs.
        balanced_dfs.append(category_df.sample(n=target_size, random_state=42))

    if not balanced_dfs:
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(balanced_dfs, ignore_index=True)

### sentiment_analysis_dataset

In [4]:
# Load the 'all' configuration of the sentiment dataset (non-streaming).
dataset_subset = load_dataset(input_sentiment_analysis_dataset_data, 'all', streaming=False)

# For each split, keep only the rows whose language is listed in LANGUAGES.
datasets = [
    dataset_subset[split].filter(lambda row: row["language"] in LANGUAGES)
    for split in SPLITS
]

# Merge the filtered splits into one dataset and convert it to a DataFrame.
all_datasets = concatenate_datasets(datasets)
all_datasets_df = all_datasets.to_pandas()

In [5]:
# Work on a copy of the sentiment data in which every row gets label 0.
neutral_df = all_datasets_df.assign(label=0)

In [6]:
# Tag every text with its word-count bucket (None when outside all buckets).
neutral_df['text_length'] = neutral_df['text'].apply(get_text_length_category)

In [7]:
# Discard rows with a missing value in any column; this removes the texts
# whose word count fell outside every bucket.
neutral_df = neutral_df.dropna()

In [8]:
# Down-sample so the word-count buckets follow the shares in TEXTS_LENGTHS.
balanced_df = balance_text_length_proportions(
    df=neutral_df,
    proportions=TEXTS_LENGTHS,
)

In [9]:
def multi_column_stratified_split(df, strat_columns, random_state=42):
    """
    Performs stratified split on multiple columns with fixed proportions:
    train=0.7, val=0.2, test=0.1

    The input dataframe is not modified.

    Args:
        df: Input dataframe
        strat_columns: List of column names to stratify on
        random_state: Random seed for reproducibility

    Returns:
        Tuple ``(train_df, val_df, test_df)`` with the same columns as ``df``.
    """

    # Build the combined stratification key as a standalone Series rather than
    # as a new column on `df`, so the caller's frame is left untouched.
    combined_strat = df[strat_columns].apply(lambda row: '_'.join(row.astype(str)), axis=1)

    # First split: separate test set (10%). The key is split alongside the
    # frame so the matching labels are available for the second split.
    temp_df, test_df, temp_strat, _ = train_test_split(
        df,
        combined_strat,
        test_size=0.1,  # 10% for test
        stratify=combined_strat,
        random_state=random_state
    )

    # Second split: split remaining 90% into train (77.78%) and validation (22.22%)
    # 77.78% of 90% = 70% of total
    # 22.22% of 90% = 20% of total
    train_df, val_df = train_test_split(
        temp_df,
        test_size=0.2222,  # 20% / 90% ≈ 0.2222
        stratify=temp_strat,
        random_state=random_state
    )

    return train_df, val_df, test_df

# Stratify jointly on language and word-count bucket.
strat_columns = ['language', 'text_length']
neutral_train_df, neutral_val_df, neutral_test_df = multi_column_stratified_split(
    balanced_df, strat_columns
)

# Keep only the text and label columns in each split.
featured_columns = ['text','label']
neutral_train_df, neutral_val_df, neutral_test_df = (
    split_df[featured_columns]
    for split_df in (neutral_train_df, neutral_val_df, neutral_test_df)
)

# Report the size of each split and its share of the balanced data.
total_samples = len(balanced_df)
for split_name, split_df in (
    ("Training set", neutral_train_df),
    ("Validation set", neutral_val_df),
    ("Test set", neutral_test_df),
):
    print(f"{split_name}: {len(split_df)} samples ({len(split_df)/total_samples:.1%})")

Training set: 8028 samples (70.0%)
Validation set: 2294 samples (20.0%)
Test set: 1147 samples (10.0%)


### mental_health_texts

In [10]:
# Read the raw mental-health texts (pipe-separated, UTF-8).
depression_texts_full_df = pd.read_csv(
    input_mental_health_texts_data,
    sep='|',
    encoding='utf-8',
)

In [11]:
# Keep only the rows that have a mental_state value.
has_mental_state = depression_texts_full_df['mental_state'].notna()
depression_texts_full_df = depression_texts_full_df[has_mental_state]

In [12]:
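# NOTE(review): 'text_length' is used as a stratification column for this frame further down,
# but it is not computed in this section, so it is presumably already present in the raw CSV (confirm).
# To recompute it from the text instead, uncomment:
# depression_texts_full_df['text_length'] = depression_texts_full_df['text'].map(get_text_length_category)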

In [13]:
# Discard rows with a missing value in any column.
depression_texts_full_df = depression_texts_full_df.dropna()

In [14]:
# Derive the numeric label from mental_state: Healthy -> 0, Unhealthy -> 1.
# Rows with any other mental_state value are not assigned here.
for state_name, state_label in (('Healthy', 0), ('Unhealthy', 1)):
    state_mask = depression_texts_full_df['mental_state'] == state_name
    depression_texts_full_df.loc[state_mask, 'label'] = state_label

In [15]:
# Stratify jointly on all descriptive columns and on the label.
strat_columns = ['language', 'category', 'mental_state', 'text_length', 'label']
base_train_df, base_val_df, base_test_df = multi_column_stratified_split(
    depression_texts_full_df, strat_columns
)

# Keep only the text and label columns in each split.
featured_columns = ['text','label']
base_train_df, base_val_df, base_test_df = (
    split_df[featured_columns]
    for split_df in (base_train_df, base_val_df, base_test_df)
)

# Report the size of each split and its share of the full data.
total_samples = len(depression_texts_full_df)
for split_name, split_df in (
    ("Training set", base_train_df),
    ("Validation set", base_val_df),
    ("Test set", base_test_df),
):
    print(f"{split_name}: {len(split_df)} samples ({len(split_df)/total_samples:.1%})")

Training set: 26208 samples (70.0%)
Validation set: 7488 samples (20.0%)
Test set: 3745 samples (10.0%)


In [16]:
# Merge the mental-health and sentiment splits pairwise, then shuffle each
# merged split with a fixed seed so the two sources are interleaved reproducibly.
total_train_df = pd.concat([base_train_df, neutral_train_df], ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)
total_val_df = pd.concat([base_val_df, neutral_val_df], ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)
total_test_df = pd.concat([base_test_df, neutral_test_df], ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)

# Cast labels to the nullable integer dtype so they are stored as integers.
total_train_df['label'] = total_train_df['label'].astype('Int64')
total_val_df['label'] = total_val_df['label'].astype('Int64')
total_test_df['label'] = total_test_df['label'].astype('Int64')

# Write each split as a pipe-separated UTF-8 CSV. The target directory is
# created first so the export also works on a checkout where it does not exist.
for split_df, output_path in (
    (total_train_df, output_train_data),
    (total_val_df, output_val_data),
    (total_test_df, output_test_data),
):
    output_path.parent.mkdir(parents=True, exist_ok=True)
    split_df.to_csv(output_path, index=False, encoding='utf-8', sep='|')