<a href="https://colab.research.google.com/github/oxayavongsa/aai-590-capstone-mental-health/blob/main/notebook-pipeline/split_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Split Data
To prevent data leakage, we separate the target and then split the data before any preprocessing.

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [15]:
# Read the cleaned mental health survey data (risk labels included) into `df`.
# Default path: relative to a working directory that contains the repository folder.
file_path = "aai-590-capstone-mental-health/data-assets/cleaned_mental_health_data.csv"
# Alternative when the working directory is the repository root:
# file_path = "./data-assets/cleaned_mental_health_data.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Show the column names so the schema is visible in the notebook output.
df.columns

Index(['Timestamp', 'Gender', 'Country', 'Occupation', 'self_employed',
       'family_history', 'treatment', 'Days_Indoors', 'Growing_Stress',
       'Changes_Habits', 'Mental_Health_History', 'Mood_Swings',
       'Coping_Struggles', 'Work_Interest', 'Social_Weakness',
       'mental_health_interview', 'care_options', 'risk_label'],
      dtype='object')

In [16]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(260986, 18)

In [17]:
# Drop the Timestamp column so it is not carried into the feature matrix.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# has already been removed is a no-op instead of raising KeyError.
df = df.drop(columns=["Timestamp"], errors="ignore")

In [18]:
# Pull the label column out of the frame before any splitting or encoding:
# `X` holds every remaining column, `y` holds only the target.
target_col = 'risk_label'
X = df.drop(columns=[target_col])
y = df[target_col]

In [19]:
# Hold out 30% of the rows as a test set *before* fitting any encoder/scaler.
# Stratifying on `y` keeps the class proportions the same in both partitions;
# the fixed random_state makes the split reproducible.
splits = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = splits

print(f"Train set: {X_train.shape}, Test set: {X_test.shape}")

Train set: (182690, 16), Test set: (78296, 16)


## Preprocess Data

In [20]:
# Label encode features.
# Work on explicit copies: X_train / X_test are row subsets produced by
# train_test_split, and assigning columns on them may emit pandas'
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# One fitted encoder per categorical column, keyed by column name.
# NOTE: re-running this cell after encoding finds no object columns left,
# so label_encoders is reset to an empty dict.
label_encoders = {}
for col in X_train.select_dtypes(include='object').columns:
    le = LabelEncoder()
    # Learn the category -> integer mapping from the training data only.
    X_train[col] = le.fit_transform(X_train[col])

    # LabelEncoder.transform raises ValueError on labels it has not seen during
    # fit, so check the test column against the fitted classes first.
    seen = X_test[col].isin(le.classes_)
    if seen.all():
        X_test[col] = le.transform(X_test[col])  # Use same mapping
    else:
        print(
            f"Warning: {int((~seen).sum())} test rows in '{col}' have categories "
            "unseen in training; encoded as -1"
        )
        # -1 is a sentinel outside the 0..n_classes-1 range produced by the encoder.
        codes = pd.Series(-1, index=X_test.index, dtype="int64")
        codes.loc[seen] = le.transform(X_test.loc[seen, col])
        X_test[col] = codes

    label_encoders[col] = le

# Encode target (only when it is stored as strings/objects).
if y_train.dtype == 'object':
    target_encoder = LabelEncoder()
    y_train = target_encoder.fit_transform(y_train)
    y_test = target_encoder.transform(y_test)

print("Encoding completed")

Encoding completed


In [21]:
# Inspect the dtype of each column in X_train.
X_train.dtypes

Unnamed: 0,0
Gender,int64
Country,int64
Occupation,int64
self_employed,int64
family_history,int64
treatment,int64
Days_Indoors,int64
Growing_Stress,int64
Changes_Habits,int64
Mental_Health_History,int64


In [23]:
# Preview the encoded training features.
# Lift the column-display limit so no column is elided from the preview.
pd.options.display.max_columns = None
X_train.head(n=5)

Unnamed: 0,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
198840,1,8,2,0,1,1,0,2,1,0,1,0,1,1,1,2
66941,1,9,2,0,0,0,2,0,1,0,1,0,1,1,1,0
88246,1,9,0,1,1,1,2,0,2,1,1,1,0,1,1,2
4058,0,8,2,0,1,1,3,1,0,0,1,0,0,1,0,2
62721,1,9,3,0,0,0,0,0,2,2,1,0,1,0,1,1


In [24]:
# Standardize the numeric (int64 / float64) feature columns.
scaler = StandardScaler()
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

# Learn the scaling statistics from the training partition only, then apply
# that same fitted transform to both partitions.
scaler.fit(X_train[numeric_cols])
for frame in (X_train, X_test):
    frame[numeric_cols] = scaler.transform(frame[numeric_cols])

print("Scaling and normalization completed.")

Scaling and normalization completed.


In [25]:
# Persist the preprocessed splits as CSV files (row index omitted).
# When running from the repository root instead, replace the
# "aai-590-capstone-mental-health/data-assets/" prefix with "./data-assets/".
csv_outputs = {
    "aai-590-capstone-mental-health/data-assets/X_train_encoded_scaled.csv": X_train,
    "aai-590-capstone-mental-health/data-assets/X_test_encoded_scaled.csv": X_test,
    # The targets are wrapped in a DataFrame so they can be written with to_csv.
    "aai-590-capstone-mental-health/data-assets/y_train.csv": pd.DataFrame(y_train),
    "aai-590-capstone-mental-health/data-assets/y_test.csv": pd.DataFrame(y_test),
}
for csv_path, frame in csv_outputs.items():
    frame.to_csv(csv_path, index=False)

In [26]:
# Final sanity check: report the shape of each saved partition.
for label, data in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(label, data.shape)

X_train shape: (182690, 16)
X_test shape: (78296, 16)
y_train shape: (182690,)
y_test shape: (78296,)
