<a href="https://colab.research.google.com/github/oxayavongsa/aai-590-capstone-mental-health/blob/main/split_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Split Data
To prevent data leakage, we separate the target and split the data into train and test sets before fitting any encoders or scalers.

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [24]:
# Read the cleaned, risk-labelled mental health dataset from the data-assets folder.
# NOTE(review): the path is relative, so this presumably expects the working directory
# to contain a checkout of the aai-590-capstone-mental-health repository — confirm.
file_path = "aai-590-capstone-mental-health/data-assets/target_health_data.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Last expression of the cell: show the names of the loaded columns
df.columns

Index(['Timestamp', 'Gender', 'Country', 'Occupation', 'self_employed',
       'family_history', 'treatment', 'Days_Indoors', 'Growing_Stress',
       'Changes_Habits', 'Mental_Health_History', 'Mood_Swings',
       'Coping_Struggles', 'Work_Interest', 'Social_Weakness',
       'mental_health_interview', 'care_options', 'risk_cluster',
       'risk_label'],
      dtype='object')

In [25]:
# Show the (rows, columns) dimensions of the loaded frame
df.shape

(260986, 19)

In [26]:
# Pull the label column out of the frame so that X holds only the remaining columns
target_col = 'risk_label'
X, y = df.drop(columns=target_col), df[target_col]
# NOTE(review): 'risk_cluster' stays in X; if risk_label was derived from it,
# that would leak the target into the features — confirm.

In [27]:
# Hold out 30% of the rows as a test set, stratified on y, with a fixed seed.
# The split is done before any encoder or scaler is fitted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

print(f"Train set: {X_train.shape}, Test set: {X_test.shape}")

Train set: (182690, 18), Test set: (78296, 18)


## Preprocess Data

In [28]:
# Encode every object-dtype feature column as integer codes.
# Each encoder is fitted on the training rows only and then reused for the
# test rows, so both splits share one mapping per column.
# NOTE(review): LabelEncoder.transform is expected to raise if the test rows
# contain a category that never appeared in training — confirm this is acceptable.
label_encoders = {}
object_columns = X_train.select_dtypes(include='object').columns
for col in object_columns:
    le = LabelEncoder().fit(X_train[col])
    X_train[col] = le.transform(X_train[col])
    X_test[col] = le.transform(X_test[col])
    label_encoders[col] = le  # kept so the mapping can be looked up later

# The target is encoded the same way, but only when it is stored as object dtype
if y_train.dtype == 'object':
    target_encoder = LabelEncoder().fit(y_train)
    y_train, y_test = target_encoder.transform(y_train), target_encoder.transform(y_test)

print("Encoding completed")

Encoding completed


In [31]:
# Identify the numeric feature columns of X_train, list their names and preview them.
#
# The selection is computed once and `numeric_cols` is bound to a single kind of
# object (a DataFrame) for the whole cell; the scaling cell reads
# `numeric_cols.columns`, so it must stay a DataFrame.
#
# 'number' is used instead of an explicit ['int64', 'float64'] list so that
# numeric columns stored with another width are not silently left out of scaling.
# NOTE(review): label-encoded columns may come back as int32 on some
# platforms / NumPy versions — confirm.
numeric_cols = X_train.select_dtypes(include='number')

print("Numerical columns:")
for col in numeric_cols.columns:
    print("-", col)

# Rich preview of the first rows of the numeric columns
display(numeric_cols.head())

Numerical columns:
- Timestamp
- Gender
- Country
- Occupation
- self_employed
- family_history
- treatment
- Days_Indoors
- Growing_Stress
- Changes_Habits
- Mental_Health_History
- Mood_Swings
- Coping_Struggles
- Work_Interest
- Social_Weakness
- mental_health_interview
- care_options
- risk_cluster


Unnamed: 0,Timestamp,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options,risk_cluster
97181,81,1,1,0,1,0,0,4,0,1,2,2,1,1,0,2,1,0
132736,492,1,9,1,0,0,0,1,1,2,1,0,0,0,0,0,1,2
145990,192,1,1,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0
30480,409,0,9,0,0,1,0,1,1,2,2,0,0,2,1,1,1,0
118493,294,1,9,0,1,1,1,0,0,0,2,0,0,2,1,1,2,1


In [20]:
# Standardise the numeric columns.
# The scaler is fitted on the training rows only; the fitted scaler is then
# applied to both splits.
scale_cols = numeric_cols.columns
scaler = StandardScaler()
scaler.fit(X_train[scale_cols])

X_train[scale_cols] = scaler.transform(X_train[scale_cols])
X_test[scale_cols] = scaler.transform(X_test[scale_cols])

print("Numerical columns scaled successfully.")

Numerical columns scaled successfully.


In [21]:
# Write the four preprocessed splits to CSV in the data-assets folder.
# The targets are wrapped in a DataFrame so they can be written with to_csv;
# index=False leaves the row index out of every file.
outputs = (
    (X_train, "aai-590-capstone-mental-health/data-assets/X_train_encoded_scaled.csv"),
    (X_test, "aai-590-capstone-mental-health/data-assets/X_test_encoded_scaled.csv"),
    (pd.DataFrame(y_train), "aai-590-capstone-mental-health/data-assets/y_train.csv"),
    (pd.DataFrame(y_test), "aai-590-capstone-mental-health/data-assets/y_test.csv"),
)
for frame, csv_path in outputs:
    frame.to_csv(csv_path, index=False)

In [42]:
# Report the final shape of each split, features first and then targets
for shape_label, split_obj in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(shape_label, split_obj.shape)

X_train shape: (182690, 18)
X_test shape: (78296, 18)
y_train shape: (182690,)
y_test shape: (78296,)
