In [184]:
#imports
import pandas as pd
import os
from dotenv import load_dotenv

# Load environment variables (TRAIN_LOCATION must be defined, e.g. via a .env file)
load_dotenv()

# Column headers that are too long to work with comfortably -> shorter labels
short_labels = {
    'Working Professional or Student': 'Working Student',
    'Have you ever had suicidal thoughts ?': 'Suicidal Thoughts',
    'Family History of Mental Illness': 'Family Mental Illness',
}

# Read the training CSV and apply the shorter labels in one chained step
df = pd.read_csv(os.environ['TRAIN_LOCATION']).rename(columns=short_labels)

# Normalise every column name to snake_case, one step at a time
normalized_names = df.columns.str.strip()                    # trim leading/trailing spaces
normalized_names = normalized_names.str.replace(' ', '_')    # spaces -> underscores
normalized_names = normalized_names.str.replace(
    '[^A-Za-z0-9_]+', '', regex=True                         # drop any other special characters
)
df.columns = normalized_names.str.lower()                    # lowercase everything

# Preview the first rows with the cleaned-up headers
df.head()

Unnamed: 0,id,name,gender,age,city,working_student,profession,academic_pressure,work_pressure,cgpa,study_satisfaction,job_satisfaction,sleep_duration,dietary_habits,degree,suicidal_thoughts,workstudy_hours,financial_stress,family_mental_illness,depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


In [185]:
# Data-quality check: fully duplicated rows and per-column missing values
n_duplicate_rows = df.duplicated().sum()
print(n_duplicate_rows)  # no duplicates

# A lot of data is missing; the affected columns will be dealt with in different ways
missing_per_column = df.isna().sum()
missing_per_column

0


id                            0
name                          0
gender                        0
age                           0
city                          0
working_student               0
profession                36630
academic_pressure        112803
work_pressure             27918
cgpa                     112802
study_satisfaction       112803
job_satisfaction          27910
sleep_duration                0
dietary_habits                4
degree                        2
suicidal_thoughts             0
workstudy_hours               0
financial_stress              4
family_mental_illness         0
depression                    0
dtype: int64

In [186]:
# Pre-processing

# Convert the yes/no survey answers to binary (Yes -> 1, No -> 0).
# Only the two yes/no columns are touched: a frame-wide replace would also
# rewrite any stray 'Yes'/'No' strings in the other categorical columns and
# leave those columns with mixed str/int values.
yes_no_columns = ['suicidal_thoughts', 'family_mental_illness']
df[yes_no_columns] = (
    df[yes_no_columns]
    .replace({'Yes': 1, 'No': 0})
    .infer_objects(copy=False)
)

# Make working_student binary: working professional -> 1, student -> 0
df['working_student'] = (
    df['working_student']
    .replace({'Working Professional': 1, 'Student': 0})
    .infer_objects(copy=False)
)

# NOTE: a bare `df.dropna()` used to sit here. Its result was discarded, so it
# did nothing; and actually dropping every row with a NaN would remove almost
# the whole dataset, because academic_pressure and work_pressure are missing
# for complementary groups of rows. Nulls are handled per column below instead.

# Dealing with null values
is_student = df['working_student'] == 0

# Students have no job, so their profession is recorded as "Student"
df.loc[is_student, 'profession'] = 'Student'

# Students are not exposed to work pressure, so set it to 0 for them
df.loc[is_student, 'work_pressure'] = 0

# NOTE(review): the mirror-image fill (academic_pressure = 0 for working
# professionals) is not applied in this cell - confirm it is handled elsewhere.

# Remaining rows with no profession (all of them working professionals, since
# every student row was filled in above)
df[df['profession'].isna()]


Unnamed: 0,id,name,gender,age,city,working_student,profession,academic_pressure,work_pressure,cgpa,study_satisfaction,job_satisfaction,sleep_duration,dietary_habits,degree,suicidal_thoughts,workstudy_hours,financial_stress,family_mental_illness,depression
29,29,Kashish,Female,19.0,Agra,1,,,1.0,,,5.0,More than 8 hours,Healthy,Class 12,0,5.0,2.0,0,0
36,36,Anvi,Female,50.0,Kalyan,1,,,4.0,,,4.0,7-8 hours,Unhealthy,Class 12,0,0.0,2.0,0,0
43,43,Vidya,Female,18.0,Vadodara,1,,,5.0,,,4.0,Less than 5 hours,Moderate,Class 12,1,10.0,5.0,0,1
48,48,Anand,Male,38.0,Ghaziabad,1,,,2.0,,,5.0,5-6 hours,Moderate,Class 12,0,1.0,1.0,1,0
50,50,Raunak,Male,21.0,Pune,1,,,4.0,,,5.0,5-6 hours,Healthy,Class 12,1,8.0,5.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140647,140647,Yuvraj,Male,35.0,Rajkot,1,,,5.0,,,1.0,7-8 hours,Unhealthy,Class 12,1,11.0,2.0,1,1
140655,140655,Kiran,Male,44.0,Meerut,1,,,1.0,,,3.0,More than 8 hours,Healthy,PhD,0,0.0,4.0,0,0
140666,140666,Abhishek,Male,18.0,Vadodara,1,,,5.0,,,4.0,Less than 5 hours,Moderate,Class 12,0,2.0,5.0,0,1
140667,140667,Kashish,Female,19.0,Rajkot,1,,,4.0,,,2.0,More than 8 hours,Unhealthy,MHM,1,12.0,4.0,1,1
