In [270]:
#imports
import pandas as pd
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Read CSV file
df = pd.read_csv(os.environ['TRAIN_LOCATION'])

#rename columns
df.rename(columns={'Working Professional or Student': 'Working Student', 'Have you ever had suicidal thoughts ?': 'Suicidal Thoughts', 'Family History of Mental Illness':'Family Mental Illness' }, inplace=True)

# Convert all column names to snake_case
df.columns = (
    df.columns
    .str.strip()                              # remove leading/trailing spaces
    .str.replace(' ', '_')                    # replace spaces with underscores
    .str.replace('[^A-Za-z0-9_]+', '', regex=True)  # remove special characters
    .str.lower()            
                      # convert to lowercase (optional)
)

# View result
df.head()

Unnamed: 0,id,name,gender,age,city,working_student,profession,academic_pressure,work_pressure,cgpa,study_satisfaction,job_satisfaction,sleep_duration,dietary_habits,degree,suicidal_thoughts,workstudy_hours,financial_stress,family_mental_illness,depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


In [271]:
#Check for missing and duplicates
print(df.duplicated().sum()) # no duplicates
df.isna().sum() #We have alot of missing data that we will deal with in different ways

0


id                            0
name                          0
gender                        0
age                           0
city                          0
working_student               0
profession                36630
academic_pressure        112803
work_pressure             27918
cgpa                     112802
study_satisfaction       112803
job_satisfaction          27910
sleep_duration                0
dietary_habits                4
degree                        2
suicidal_thoughts             0
workstudy_hours               0
financial_stress              4
family_mental_illness         0
depression                    0
dtype: int64

In [272]:
# Pre processing

#Convert yes/no to binary
df = df.replace({'Yes': 1, 'No': 0}).infer_objects(copy=False)
#Make working_student binary, if they are working it will be 1, student 0 
df['working_student'] = df['working_student'].replace({'Working Professional':1, 'Student': 0})
#Male 1 Female 0 
df['gender'] = df['gender'].replace({'Male': 1, 'Female': 0})
#Dealing with Null values
#For profession if they are a student their profession is "Student"
# df.loc[df['working_student'] == 0, 'profession'] = 'Student'
display(df.sample(10))
print(df.isna().sum())
print(df['sleep_duration'].unique())

Unnamed: 0,id,name,gender,age,city,working_student,profession,academic_pressure,work_pressure,cgpa,study_satisfaction,job_satisfaction,sleep_duration,dietary_habits,degree,suicidal_thoughts,workstudy_hours,financial_stress,family_mental_illness,depression
81790,81790,Shaurya,1,40.0,Visakhapatnam,1,Pharmacist,,3.0,,,3.0,Less than 5 hours,Healthy,MBBS,1,2.0,3.0,0,0
88655,88655,Pranav,1,53.0,Meerut,1,Business Analyst,,4.0,,,5.0,Less than 5 hours,Unhealthy,M.Tech,1,5.0,5.0,0,0
139922,139922,Bhavesh,1,33.0,Vasai-Virar,0,,1.0,,8.44,4.0,,7-8 hours,Moderate,B.Tech,1,11.0,1.0,0,0
13313,13313,Vidhi,0,40.0,Agra,1,Entrepreneur,,5.0,,,2.0,7-8 hours,Moderate,MSc,1,1.0,4.0,0,0
91968,91968,Kartikeya,1,24.0,Thane,0,,1.0,,8.13,3.0,,Less than 5 hours,Moderate,B.Ed,1,5.0,1.0,0,0
71081,71081,Ishwar,1,38.0,Faridabad,1,Teacher,,1.0,,,2.0,More than 8 hours,Moderate,B.Ed,0,8.0,2.0,1,0
118168,118168,Aarya,0,21.0,Ghaziabad,1,Architect,,4.0,,,3.0,Less than 5 hours,Moderate,B.Arch,1,6.0,5.0,0,1
5817,5817,Shlok,1,49.0,Kalyan,1,Chemist,,3.0,,,5.0,More than 8 hours,Moderate,MD,0,6.0,5.0,0,0
24407,24407,Saanvi,0,33.0,Indore,0,,3.0,,9.93,4.0,,5-6 hours,Moderate,BBA,1,2.0,2.0,0,0
94363,94363,Ritik,1,45.0,Mumbai,1,Teacher,,5.0,,,4.0,Less than 5 hours,Moderate,LLB,1,5.0,1.0,1,0


id                            0
name                          0
gender                        0
age                           0
city                          0
working_student               0
profession                36630
academic_pressure        112803
work_pressure             27918
cgpa                     112802
study_satisfaction       112803
job_satisfaction          27910
sleep_duration                0
dietary_habits                4
degree                        2
suicidal_thoughts             0
workstudy_hours               0
financial_stress              4
family_mental_illness         0
depression                    0
dtype: int64
['More than 8 hours' 'Less than 5 hours' '5-6 hours' '7-8 hours'
 'Sleep_Duration' '1-2 hours' '6-8 hours' '4-6 hours' '6-7 hours'
 '10-11 hours' '8-9 hours' '40-45 hours' '9-11 hours' '2-3 hours'
 '3-4 hours' 'Moderate' '55-66 hours' '4-5 hours' '9-6 hours' '1-3 hours'
 'Indore' '45' '1-6 hours' '35-36 hours' '8 hours' 0 '10-6 hours'
 'than 5 h