<a href="https://colab.research.google.com/github/oxayavongsa/aai-590-capstone-mental-health/blob/main/split_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Split Data
To prevent data leakage, we separate the target and then split the data before any preprocessing.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the cleaned mental health dataset (risk labels already attached).
# NOTE(review): an alternate location used previously was
# "aai-590-capstone-mental-health/data-assets/cleaned_mental_health_data.csv",
# presumably for runs started one level above the repository folder.
file_path = "./data-assets/cleaned_mental_health_data.csv"
df = pd.read_csv(filepath_or_buffer=file_path)
df.columns

Index(['Timestamp', 'Gender', 'Country', 'Occupation', 'self_employed',
       'family_history', 'treatment', 'Days_Indoors', 'Growing_Stress',
       'Changes_Habits', 'Mental_Health_History', 'Mood_Swings',
       'Coping_Struggles', 'Work_Interest', 'Social_Weakness',
       'mental_health_interview', 'care_options', 'risk_label'],
      dtype='object')

In [4]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(260986, 18)

In [5]:
# Pull the label column out first so features and target are handled separately
target_col = 'risk_label'
X = df.drop(columns=target_col)
y = df[target_col]

In [6]:
# Hold out 30% of the rows before any encoding is fitted.
# The split is stratified on the target and uses a fixed seed.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print(f"Train set: {X_train.shape}, Test set: {X_test.shape}")

Train set: (182690, 17), Test set: (78296, 17)


## Preprocess Data

In [7]:
# Label encode features.
# Each encoder is fitted on the training split only and stored in
# `label_encoders` (keyed by column name) so the same mapping can be reused.
label_encoders = {}
for col in X_train.select_dtypes(include='object').columns:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])

    # LabelEncoder.transform raises ValueError for categories that were not
    # present during fit, so a rare value that landed only in the test split
    # would abort the notebook. Apply the fitted mapping by lookup instead and
    # send unseen categories to the sentinel code -1.
    code_by_category = {category: code for code, category in enumerate(le.classes_)}
    test_codes = X_test[col].map(code_by_category)
    n_unseen = int(test_codes.isna().sum())
    if n_unseen:
        print(f"{col}: {n_unseen} test value(s) not seen in training were encoded as -1")
    # Cast back to the training column's integer dtype so both splits agree.
    X_test[col] = test_codes.fillna(-1).astype(X_train[col].dtype)

    label_encoders[col] = le

# Encode target (only needed when the labels are stored as strings).
# An unseen target class in the test split is left as a hard error on purpose.
if y_train.dtype == 'object':
    target_encoder = LabelEncoder()
    y_train = target_encoder.fit_transform(y_train)
    y_test = target_encoder.transform(y_test)

print("Encoding completed")

Encoding completed


In [9]:
# Inspect the per-column dtypes of the training features after label encoding
X_train.dtypes

Timestamp                  int32
Gender                     int32
Country                    int32
Occupation                 int32
self_employed              int32
family_history             int32
treatment                  int32
Days_Indoors               int32
Growing_Stress             int32
Changes_Habits             int32
Mental_Health_History      int32
Mood_Swings                int32
Coping_Struggles           int32
Work_Interest              int32
Social_Weakness            int32
mental_health_interview    int32
care_options               int32
dtype: object

In [11]:
# Identify and display numerical columns in X_train.
# Select by the generic 'number' kind rather than by exact dtype names: the
# integer width of the encoded columns depends on the platform (int32 in the
# recorded run, int64 where NumPy's default integer is 64-bit), and an
# ['int32', 'float64'] filter would select no columns at all in that case.
numeric_col_names = X_train.select_dtypes(include='number').columns
print("Numerical columns:")
for col in numeric_col_names:
    print("-", col)

# Preview the numerical columns. `numeric_cols` stays a DataFrame because the
# scaling step reads the column list from `numeric_cols.columns`.
numeric_cols = X_train[numeric_col_names]
display(numeric_cols.head())

Numerical columns:
- Timestamp
- Gender
- Country
- Occupation
- self_employed
- family_history
- treatment
- Days_Indoors
- Growing_Stress
- Changes_Habits
- Mental_Health_History
- Mood_Swings
- Coping_Struggles
- Work_Interest
- Social_Weakness
- mental_health_interview
- care_options


Unnamed: 0,Timestamp,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
198840,40,1,8,2,0,1,1,0,2,1,0,1,0,1,1,1,2
66941,126,1,9,2,0,0,0,2,0,1,0,1,0,1,1,1,0
88246,347,1,9,0,1,1,1,2,0,2,1,1,1,0,1,1,2
4058,516,0,8,2,0,1,1,3,1,0,0,1,0,0,1,0,2
62721,245,1,9,3,0,0,0,0,0,2,2,1,0,1,0,1,1


In [12]:
# Standardise the numerical columns.
# The scaler learns its statistics from the training split only; the same
# fitted scaler is then applied to both splits.
cols_to_scale = numeric_cols.columns
scaler = StandardScaler()
scaler.fit(X_train[cols_to_scale])
X_train[cols_to_scale] = scaler.transform(X_train[cols_to_scale])
X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

print("Numerical columns scaled successfully.")

Numerical columns scaled successfully.


In [13]:
# Write the encoded and scaled splits to CSV without the index column.
# The targets are wrapped in a DataFrame before being written.
outputs = {
    "./data-assets/X_train_encoded_scaled.csv": X_train,
    "./data-assets/X_test_encoded_scaled.csv": X_test,
    "./data-assets/y_train.csv": pd.DataFrame(y_train),
    "./data-assets/y_test.csv": pd.DataFrame(y_test),
}
for out_path, frame in outputs.items():
    frame.to_csv(out_path, index=False)

In [14]:
# Final sanity check: report the shape of every split
shape_report = [
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
]
for label, data in shape_report:
    print(label, data.shape)

X_train shape: (182690, 17)
X_test shape: (78296, 17)
y_train shape: (182690,)
y_test shape: (78296,)
