In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import json
import os

print("="*70)
print("MULTI-CLASS MENTAL HEALTH DATA PREPARATION")
print("="*70)

DATA_PATH = 'mental_health_data.csv'

if not os.path.exists(DATA_PATH):
    print(f"\n? Error: {DATA_PATH} not found!")
    exit(1)



MULTI-CLASS MENTAL HEALTH DATA PREPARATION


In [9]:
# ============= LOAD DATA =============
print(f"\n[1/8] Loading dataset from {DATA_PATH}...")

try:
    df = pd.read_csv(DATA_PATH)
    print(f"+ Loaded {len(df)} samples")
    print(f"Columns: {df.columns.tolist()}")
except Exception as e:
    print(f"? Error loading data: {e}")
    exit(1)




[1/8] Loading dataset from mental_health_data.csv...
+ Loaded 53043 samples
Columns: ['Unnamed: 0', 'statement', 'status']


In [10]:
# ============= INITIAL EXPLORATION =============
print("\n[2/8] Exploring dataset...")

print(f"\n+ Dataset Info:")
print(f"Total samples: {len(df)}")
print(f"\nFirst few rows:")
print(df.head())

print(f"\n+ Mental Health Status Distribution:")
status_counts = df['status'].value_counts()
print(status_counts)
print(f"\n+ Class percentages:")
for status, count in status_counts.items():
    print(f"  {status:<25} {count:>6} ({count/len(df)*100:>5.2f}%)")




[2/8] Exploring dataset...

+ Dataset Info:
Total samples: 53043

First few rows:
   Unnamed: 0                                          statement   status
0           0                                         oh my gosh  Anxiety
1           1  trouble sleeping, confused mind, restless hear...  Anxiety
2           2  All wrong, back off dear, forward doubt. Stay ...  Anxiety
3           3  I've shifted my focus to something else but I'...  Anxiety
4           4  I'm restless and restless, it's been a month n...  Anxiety

+ Mental Health Status Distribution:
status
Normal                  16351
Depression              15404
Suicidal                10653
Anxiety                  3888
Bipolar                  2877
Stress                   2669
Personality disorder     1201
Name: count, dtype: int64

+ Class percentages:
  Normal                     16351 (30.83%)
  Depression                 15404 (29.04%)
  Suicidal                   10653 (20.08%)
  Anxiety                     3888 ( 7

In [11]:
# ============= DEFINE CLASS MAPPING =============
print("\n[3/8] Setting up 7-class classification...")

# Map to standardized class names
CLASS_MAPPING = {
    'Normal': 0,
    'Anxiety': 1,
    'Depression': 2,
    'Stress': 3,
    'Suicidal': 4,
    'Bipolar': 5,
    'Personality disorder': 6
}

# Create label column
df['label'] = df['status'].map(CLASS_MAPPING)

# Remove any unmapped classes
unmapped = df[df['label'].isna()]
if len(unmapped) > 0:
    print(f"?  Found {len(unmapped)} samples with unmapped classes:")
    print(unmapped['status'].value_counts())
    df = df.dropna(subset=['label'])

df['label'] = df['label'].astype(int)

print(f"\n+ Class mapping complete:")
for name, idx in sorted(CLASS_MAPPING.items(), key=lambda x: x[1]):
    count = (df['label'] == idx).sum()
    print(f"  [{idx}] {name:<25} {count:>6} samples")

# Save class mapping
os.makedirs('models/mental_health_roberta', exist_ok=True)
with open('models/mental_health_roberta/label_mapping.json', 'w') as f:
    json.dump(CLASS_MAPPING, f, indent=2)
print("\n+ Saved: models/mental_health_roberta/label_mapping.json")




[3/8] Setting up 7-class classification...

+ Class mapping complete:
  [0] Normal                     16351 samples
  [1] Anxiety                     3888 samples
  [2] Depression                 15404 samples
  [3] Stress                      2669 samples
  [4] Suicidal                   10653 samples
  [5] Bipolar                     2877 samples
  [6] Personality disorder        1201 samples

+ Saved: models/mental_health_roberta/label_mapping.json


In [13]:
# ============= DATA CLEANING =============
print("\n[4/8] Cleaning data...")

original_size = len(df)

# Remove nulls
df = df.dropna(subset=['statement', 'label'])
print(f"After removing nulls: {len(df)} ({len(df)/original_size*100:.1f}%)")

# Remove duplicates
df = df.drop_duplicates(subset=['statement'])
print(f"After removing duplicates: {len(df)} ({len(df)/original_size*100:.1f}%)")

# Add text statistics
df['text_length'] = df['statement'].str.len()
df['word_count'] = df['statement'].str.split().str.len()
df['avg_word_length'] = df['statement'].apply(
    lambda x: np.mean([len(word) for word in str(x).split()]) if len(str(x).split()) > 0 else 0
)

# Filter by text length
df = df[(df['text_length'] >= 10) & (df['text_length'] <= 5000)]
print(f"After length filtering (10-5000 chars): {len(df)} ({len(df)/original_size*100:.1f}%)")

# Filter by word count
df = df[df['word_count'] >= 3]
print(f"After word count filtering (>=3 words): {len(df)} ({len(df)/original_size*100:.1f}%)")

print(f"\n+ Cleaned dataset: {len(df)} samples ({len(df)/original_size*100:.1f}% retained)")





[4/8] Cleaning data...
After removing nulls: 50083 (100.0%)
After removing duplicates: 50083 (100.0%)
After length filtering (10-5000 chars): 50083 (100.0%)
After word count filtering (>=3 words): 50083 (100.0%)

+ Cleaned dataset: 50083 samples (100.0% retained)


In [14]:
# ============= TEXT STATISTICS PER CLASS =============
print("\n[5/8] Analyzing text statistics per class...")

print(f"\n+ Text Statistics by Mental Health Category:")
stats_by_class = df.groupby('status')[['text_length', 'word_count', 'avg_word_length']].agg(['mean', 'std'])
print(stats_by_class)



[5/8] Analyzing text statistics per class...

+ Text Statistics by Mental Health Category:
                     text_length              word_count              \
                            mean         std        mean         std   
status                                                                 
Anxiety               744.754801  742.284513  140.118842  139.865082   
Bipolar               930.185393  774.520113  173.260032  144.943391   
Depression            796.182068  755.923456  158.749816  149.712414   
Normal                 95.038673  122.305803   18.187418   23.053253   
Personality disorder  895.494938  751.040234  167.446569  141.282430   
Stress                584.583224  486.220051  109.937090   90.419784   
Suicidal              693.490868  731.846244  138.819343  144.698839   

                     avg_word_length            
                                mean       std  
status                                          
Anxiety                     4.392177  0.

In [15]:
# ============= CLASS BALANCING ANALYSIS =============
print("\n[6/8] Analyzing class balance...")

label_counts = df['label'].value_counts().sort_index()
max_count = label_counts.max()
min_count = label_counts.min()
imbalance_ratio = max_count / min_count

print(f"\n+ Class Balance Analysis:")
print(f"Largest class:  {label_counts.idxmax()} with {max_count} samples")
print(f"Smallest class: {label_counts.idxmin()} with {min_count} samples")
print(f"Imbalance ratio: {imbalance_ratio:.2f}:1")

# Calculate class weights for training
class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.unique(df['label']),
    y=df['label']
)

print(f"\n+  Computed class weights (for training):")
for idx, weight in enumerate(class_weights):
    class_name = [k for k, v in CLASS_MAPPING.items() if v == idx][0]
    print(f"  [{idx}] {class_name:<25} weight: {weight:.3f}")

# Save class weights
class_weights_dict = {int(idx): float(weight) for idx, weight in enumerate(class_weights)}
with open('models/mental_health_roberta/class_weights.json', 'w') as f:
    json.dump(class_weights_dict, f, indent=2)
print("\n+ Saved: models/mental_health_roberta/class_weights.json")




[6/8] Analyzing class balance...

+ Class Balance Analysis:
Largest class:  0 with 15308 samples
Smallest class: 6 with 889 samples
Imbalance ratio: 17.22:1

+  Computed class weights (for training):
  [0] Normal                    weight: 0.467
  [1] Anxiety                   weight: 1.991
  [2] Depression                weight: 0.479
  [3] Stress                    weight: 3.126
  [4] Suicidal                  weight: 0.677
  [5] Bipolar                   weight: 2.871
  [6] Personality disorder      weight: 8.048

+ Saved: models/mental_health_roberta/class_weights.json


In [16]:
# ============= STRATIFIED SPLIT =============
print("\n[7/8] Creating stratified train/val/test splits...")

# 70% train, 15% val, 15% test
train_df, temp_df = train_test_split(
    df, 
    test_size=0.3, 
    stratify=df['label'], 
    random_state=42
)

val_df, test_df = train_test_split(
    temp_df, 
    test_size=0.5, 
    stratify=temp_df['label'], 
    random_state=42
)

print(f"\n+ Split complete:")
print(f"{'Train:':<12} {len(train_df):>6} samples ({len(train_df)/len(df)*100:>5.1f}%)")
print(f"{'Validation:':<12} {len(val_df):>6} samples ({len(val_df)/len(df)*100:>5.1f}%)")
print(f"{'Test:':<12} {len(test_df):>6} samples ({len(test_df)/len(df)*100:>5.1f}%)")

# Verify stratification
print("\n+ Class distribution across splits:")
print(f"\n{'Class':<25} {'Train':>8} {'Val':>8} {'Test':>8}")
print("-" * 55)
for class_name, class_idx in sorted(CLASS_MAPPING.items(), key=lambda x: x[1]):
    train_count = (train_df['label'] == class_idx).sum()
    val_count = (val_df['label'] == class_idx).sum()
    test_count = (test_df['label'] == class_idx).sum()
    print(f"{class_name:<25} {train_count:>8} {val_count:>8} {test_count:>8}")




[7/8] Creating stratified train/val/test splits...

+ Split complete:
Train:        35058 samples ( 70.0%)
Validation:    7512 samples ( 15.0%)
Test:          7513 samples ( 15.0%)

+ Class distribution across splits:

Class                        Train      Val     Test
-------------------------------------------------------
Normal                       10716     2296     2296
Anxiety                       2515      539      539
Depression                   10462     2241     2242
Stress                        1602      343      344
Suicidal                      7397     1585     1585
Bipolar                       1744      374      374
Personality disorder           622      134      133


In [17]:
# ============= SAVE DATA =============
print("\n[8/8] Saving processed data...")

os.makedirs('data', exist_ok=True)

# Save only text and label columns
train_df[['statement', 'label']].rename(columns={'statement': 'text'}).to_csv('data/train.csv', index=False)
val_df[['statement', 'label']].rename(columns={'statement': 'text'}).to_csv('data/val.csv', index=False)
test_df[['statement', 'label']].rename(columns={'statement': 'text'}).to_csv('data/test.csv', index=False)

print("+ Saved:")
print("   - data/train.csv")
print("   - data/val.csv")
print("   - data/test.csv")





[8/8] Saving processed data...
+ Saved:
   - data/train.csv
   - data/val.csv
   - data/test.csv
