In [6]:
#imports
import pandas as pd
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Read CSV file
df = pd.read_csv(os.environ['TRAIN_LOCATION'])

#rename columns
df.rename(columns={'Working Professional or Student': 'Working Student', 'Have you ever had suicidal thoughts ?': 'Suicidal Thoughts', 'Family History of Mental Illness':'Family Mental Illness' }, inplace=True)

# Convert all column names to snake_case
df.columns = (
    df.columns
    .str.strip()                              # remove leading/trailing spaces
    .str.replace(' ', '_')                    # replace spaces with underscores
    .str.replace('[^A-Za-z0-9_]+', '', regex=True)  # remove special characters
    .str.lower()            
                      # convert to lowercase (optional)
)

# View result
df.head()

Unnamed: 0,id,name,gender,age,city,working_student,profession,academic_pressure,work_pressure,cgpa,study_satisfaction,job_satisfaction,sleep_duration,dietary_habits,degree,suicidal_thoughts,workstudy_hours,financial_stress,family_mental_illness,depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


In [7]:
#Check for missing and duplicates
print(df.duplicated().sum()) # no duplicates
df.isna().sum() #We have alot of missing data that we will deal with in different ways

0


id                            0
name                          0
gender                        0
age                           0
city                          0
working_student               0
profession                36630
academic_pressure        112803
work_pressure             27918
cgpa                     112802
study_satisfaction       112803
job_satisfaction          27910
sleep_duration                0
dietary_habits                4
degree                        2
suicidal_thoughts             0
workstudy_hours               0
financial_stress              4
family_mental_illness         0
depression                    0
dtype: int64

In [8]:
# Preprocessing
# Convert Yes/No columns to binary (1/0)
df = df.map(lambda x: 1 if x == 'Yes' else 0 if x == 'No' else x)
# Make working_student binary: 1 if working, 0 if student
df['working_student'] = df['working_student'].map({'Working Professional': 1, 'Student': 0})
# Convert gender to binary: Male = 1, Female = 0
df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})
# remove sleep_duration occurences that have less than 10 appearences
df = df[df['sleep_duration'].isin(df['sleep_duration'].value_counts()[df['sleep_duration'].value_counts() >= 10].index)]
#if profession is student then make profession "Student"
df.loc[df["working_student"] == 0, "profession"] = "Student"
#If profession is still NaN, set to "Unemployed"
df.loc[df["profession"].isna(), "profession"] = "Unemployed"
null_professions = df[df['profession'].isnull()]

# Print null value counts
display(df.isna().sum())
display(df.sample(10))

id                            0
name                          0
gender                        0
age                           0
city                          0
working_student               0
profession                    0
academic_pressure        112752
work_pressure             27902
cgpa                     112751
study_satisfaction       112752
job_satisfaction          27894
sleep_duration                0
dietary_habits                4
degree                        2
suicidal_thoughts             0
workstudy_hours               0
financial_stress              4
family_mental_illness         0
depression                    0
dtype: int64

Unnamed: 0,id,name,gender,age,city,working_student,profession,academic_pressure,work_pressure,cgpa,study_satisfaction,job_satisfaction,sleep_duration,dietary_habits,degree,suicidal_thoughts,workstudy_hours,financial_stress,family_mental_illness,depression
126267,126267,Pratyush,1,33.0,Vasai-Virar,1,Chemist,,3.0,,,5.0,5-6 hours,Moderate,MBBS,0,4.0,2.0,0,0
31851,31851,Kiran,1,38.0,Agra,1,Business Analyst,,1.0,,,5.0,7-8 hours,Moderate,BSc,1,6.0,3.0,1,0
101295,101295,Aanchal,1,40.0,Kalyan,1,Teacher,,1.0,,,3.0,5-6 hours,Unhealthy,M.Ed,0,8.0,1.0,1,0
74349,74349,Riya,0,48.0,Mumbai,1,Mechanical Engineer,,5.0,,,1.0,5-6 hours,Moderate,BSc,0,9.0,3.0,1,0
74879,74879,Anand,1,20.0,Mumbai,1,Unemployed,,5.0,,,4.0,Less than 5 hours,Healthy,Class 12,1,10.0,2.0,0,0
1871,1871,Rupak,1,34.0,Kanpur,0,Student,4.0,,5.64,4.0,,More than 8 hours,Moderate,M.Com,0,10.0,1.0,1,0
127834,127834,Vidya,0,45.0,Vadodara,1,Teacher,,4.0,,,2.0,More than 8 hours,Unhealthy,LLM,0,6.0,4.0,1,0
20575,20575,Shreya,0,56.0,Faridabad,1,Chef,,3.0,,,2.0,5-6 hours,Moderate,BHM,1,5.0,5.0,1,0
5262,5262,Radhika,0,44.0,Patna,1,Pilot,,4.0,,,3.0,Less than 5 hours,Unhealthy,BCA,0,3.0,4.0,0,0
21774,21774,Vanya,0,40.0,Pune,1,Graphic Designer,,3.0,,,5.0,Less than 5 hours,Moderate,B.Tech,1,6.0,5.0,1,0


In [9]:
#Now we want to split it into 2 datasets, students and workers
students_df = df.loc[df["working_student"] == 0].drop({"working_student", "work_pressure", "job_satisfaction"}, axis=1)
working_df = df.loc[df["working_student"] == 1].drop({"working_student", "study_satisfaction", "cgpa", "academic_pressure"}, axis=1)

display(students_df)

Unnamed: 0,id,name,gender,age,city,profession,academic_pressure,cgpa,study_satisfaction,sleep_duration,dietary_habits,degree,suicidal_thoughts,workstudy_hours,financial_stress,family_mental_illness,depression
2,2,Yuvraj,1,33.0,Visakhapatnam,Student,5.0,8.97,2.0,5-6 hours,Healthy,B.Pharm,1,3.0,1.0,0,1
8,8,Aishwarya,0,24.0,Bangalore,Student,2.0,5.90,5.0,5-6 hours,Moderate,BSc,0,3.0,2.0,1,0
26,26,Aditya,1,31.0,Srinagar,Student,3.0,7.03,5.0,Less than 5 hours,Healthy,BA,0,9.0,1.0,1,0
30,30,Prisha,0,28.0,Varanasi,Student,3.0,5.59,2.0,7-8 hours,Moderate,BCA,1,4.0,5.0,1,1
32,32,Chhavi,0,25.0,Jaipur,Student,4.0,8.13,3.0,5-6 hours,Moderate,M.Tech,1,1.0,1.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140685,140685,Aaradhya,0,27.0,Surat,Student,5.0,5.75,5.0,5-6 hours,Unhealthy,Class 12,1,7.0,1.0,1,0
140686,140686,Rohan,1,27.0,Ludhiana,Student,2.0,9.40,3.0,Less than 5 hours,Healthy,MSc,0,0.0,3.0,1,0
140689,140689,Ayaan,1,31.0,Faridabad,Student,3.0,6.61,4.0,5-6 hours,Unhealthy,MD,0,12.0,2.0,0,0
140690,140690,Rashi,0,18.0,Ludhiana,Student,5.0,6.88,2.0,Less than 5 hours,Healthy,Class 12,1,10.0,5.0,0,1


In [10]:
#scale our data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

student_numeric_cols = students_df.select_dtypes(include=["number"]).columns
students_df[student_numeric_cols] = scaler.fit_transform(students_df[student_numeric_cols])

working_numeric_cols = working_df.select_dtypes(include=["number"]).columns
working_df[working_numeric_cols] = scaler.fit_transform(working_df[working_numeric_cols])

display(students_df)
display(working_df)

Unnamed: 0,id,name,gender,age,city,profession,academic_pressure,cgpa,study_satisfaction,sleep_duration,dietary_habits,degree,suicidal_thoughts,workstudy_hours,financial_stress,family_mental_illness,depression
2,-1.733370,Yuvraj,0.891357,1.463328,Visakhapatnam,Student,1.345541,0.895492,-0.694531,5-6 hours,Healthy,B.Pharm,0.761738,-1.121451,-1.488980,-0.968403,0.841310
8,-1.733222,Aishwarya,-1.121885,-0.371260,Bangalore,Student,-0.827216,-1.200905,1.511072,5-6 hours,Moderate,BSc,-1.312787,-1.121451,-0.793248,1.032628,-1.188622
26,-1.732779,Aditya,0.891357,1.055642,Srinagar,Student,-0.102964,-0.429267,1.511072,Less than 5 hours,Healthy,BA,-1.312787,0.496997,-1.488980,1.032628,-1.188622
30,-1.732681,Prisha,-1.121885,0.444113,Varanasi,Student,-0.102964,-1.412594,-0.694531,7-8 hours,Moderate,BCA,0.761738,-0.851709,1.293950,1.032628,0.841310
32,-1.732631,Chhavi,-1.121885,-0.167417,Jaipur,Student,0.621289,0.321885,0.040670,5-6 hours,Moderate,M.Tech,0.761738,-1.660933,-1.488980,-0.968403,-1.188622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140685,1.728154,Aaradhya,-1.121885,0.240270,Surat,Student,1.345541,-1.303335,1.511072,5-6 hours,Unhealthy,Class 12,0.761738,-0.042485,-1.488980,1.032628,-1.188622
140686,1.728179,Rohan,0.891357,0.240270,Ludhiana,Student,-0.827216,1.189125,0.040670,Less than 5 hours,Healthy,MSc,-1.312787,-1.930675,-0.097515,1.032628,-1.188622
140689,1.728253,Ayaan,0.891357,1.055642,Faridabad,Student,-0.102964,-0.716071,0.775871,5-6 hours,Unhealthy,MD,-1.312787,1.306221,-0.793248,-0.968403,-1.188622
140690,1.728277,Rashi,-1.121885,-1.594319,Ludhiana,Student,1.345541,-0.531697,-0.694531,Less than 5 hours,Healthy,Class 12,0.761738,0.766739,1.293950,-0.968403,0.841310


Unnamed: 0,id,name,gender,age,city,profession,work_pressure,job_satisfaction,sleep_duration,dietary_habits,degree,suicidal_thoughts,workstudy_hours,financial_stress,family_mental_illness,depression
0,-1.731621,Aaradhya,-1.103140,0.457479,Ludhiana,Chef,1.423408,-0.688153,More than 8 hours,Healthy,BHM,-0.923188,-1.303958,-0.677174,-1.000763,-0.298567
1,-1.731596,Vivan,0.906503,-1.642980,Varanasi,Teacher,0.712051,0.018009,Less than 5 hours,Unhealthy,LLB,1.083203,0.251875,0.034443,-1.000763,3.349333
3,-1.731547,Yuvraj,0.906503,-2.008277,Mumbai,Teacher,1.423408,-1.394315,Less than 5 hours,Moderate,BBA,1.083203,1.029791,-1.388791,0.999238,3.349333
4,-1.731522,Rhea,-1.103140,-1.277683,Kanpur,Business Analyst,-1.422020,-1.394315,5-6 hours,Unhealthy,BBA,1.083203,0.770486,0.746059,0.999238,-0.298567
5,-1.731498,Vani,-1.103140,1.370723,Ahmedabad,Finanancial Analyst,-0.710663,1.430333,5-6 hours,Healthy,MCA,-0.923188,0.251875,1.457676,-1.000763,-0.298567
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140694,1.732657,Ishaani,-1.103140,0.092182,Ahmedabad,Teacher,-0.710663,1.430333,Less than 5 hours,Moderate,B.Ed,1.083203,-1.303958,1.457676,-1.000763,-0.298567
140695,1.732681,Vidya,-1.103140,-2.373575,Ahmedabad,Unemployed,1.423408,0.724171,5-6 hours,Unhealthy,Class 12,-0.923188,-1.044653,0.746059,0.999238,3.349333
140696,1.732706,Lata,-1.103140,-0.273115,Hyderabad,Content Writer,1.423408,0.724171,7-8 hours,Moderate,B.Tech,1.083203,-0.007431,1.457676,0.999238,-0.298567
140697,1.732731,Aanchal,-1.103140,-1.825629,Kolkata,Marketing Manager,0.000694,-1.394315,More than 8 hours,Moderate,B.Com,-0.923188,-0.526042,0.746059,-1.000763,-0.298567
