In [10]:
#imports
import pandas as pd
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Read CSV file
df = pd.read_csv(os.environ['TRAIN_LOCATION'])

#rename columns
df.rename(columns={'Working Professional or Student': 'Working Student', 'Have you ever had suicidal thoughts ?': 'Suicidal Thoughts', 'Family History of Mental Illness':'Family Mental Illness' }, inplace=True)

# Convert all column names to snake_case
df.columns = (
    df.columns
    .str.strip()                              # remove leading/trailing spaces
    .str.replace(' ', '_')                    # replace spaces with underscores
    .str.replace('[^A-Za-z0-9_]+', '', regex=True)  # remove special characters
    .str.lower()            
                      # convert to lowercase (optional)
)

# View result
df.head()

Unnamed: 0,id,name,gender,age,city,working_student,profession,academic_pressure,work_pressure,cgpa,study_satisfaction,job_satisfaction,sleep_duration,dietary_habits,degree,suicidal_thoughts,workstudy_hours,financial_stress,family_mental_illness,depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


In [11]:
#Check for missing and duplicates
print(df.duplicated().sum()) # no duplicates
df.isna().sum() #We have alot of missing data that we will deal with in different ways

0


id                            0
name                          0
gender                        0
age                           0
city                          0
working_student               0
profession                36630
academic_pressure        112803
work_pressure             27918
cgpa                     112802
study_satisfaction       112803
job_satisfaction          27910
sleep_duration                0
dietary_habits                4
degree                        2
suicidal_thoughts             0
workstudy_hours               0
financial_stress              4
family_mental_illness         0
depression                    0
dtype: int64

In [12]:
# Preprocessing

# Convert Yes/No columns to binary (1/0)
df = df.map(lambda x: 1 if x == 'Yes' else 0 if x == 'No' else x)

# Make working_student binary: 1 if working, 0 if student
df['working_student'] = df['working_student'].map({'Working Professional': 1, 'Student': 0})

# Convert gender to binary: Male = 1, Female = 0
df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})

# Handle Null values if needed
# For example, set profession to 'Student' if working_student is 0
# df.loc[df['working_student'] == 0, 'profession'] = 'Student'

# Display random sample

# Print null value counts
print(df.isna().sum())

# remove sleep_duration occurences that have less than 10 appearences
df = df[df['sleep_duration'].isin(df['sleep_duration'].value_counts()[df['sleep_duration'].value_counts() >= 10].index)]
#if profession is student then make profession "Student"
df.loc[df["working_student"] == 0, "profession"] = "Student"



display(df.sample(10))

id                            0
name                          0
gender                        0
age                           0
city                          0
working_student               0
profession                36630
academic_pressure        112803
work_pressure             27918
cgpa                     112802
study_satisfaction       112803
job_satisfaction          27910
sleep_duration                0
dietary_habits                4
degree                        2
suicidal_thoughts             0
workstudy_hours               0
financial_stress              4
family_mental_illness         0
depression                    0
dtype: int64


Unnamed: 0,id,name,gender,age,city,working_student,profession,academic_pressure,work_pressure,cgpa,study_satisfaction,job_satisfaction,sleep_duration,dietary_habits,degree,suicidal_thoughts,workstudy_hours,financial_stress,family_mental_illness,depression
38048,38048,Abhishek,1,37.0,Srinagar,1,Pharmacist,,3.0,,,5.0,7-8 hours,Unhealthy,M.Pharm,0,0.0,3.0,0,0
97533,97533,Zara,0,59.0,Surat,1,Travel Consultant,,2.0,,,2.0,7-8 hours,Moderate,MHM,1,1.0,2.0,0,0
34827,34827,Ivaan,1,39.0,Faridabad,1,Doctor,,2.0,,,2.0,More than 8 hours,Moderate,B.Pharm,0,5.0,1.0,1,0
127018,127018,Aariv,1,56.0,Varanasi,1,,,2.0,,,2.0,5-6 hours,Unhealthy,Class 12,1,10.0,4.0,0,0
69090,69090,Prachi,0,34.0,Ahmedabad,0,Student,3.0,,7.49,3.0,,More than 8 hours,Unhealthy,B.Com,0,12.0,4.0,0,0
3947,3947,Mithila,0,18.0,Chennai,1,,,1.0,,,4.0,5-6 hours,Healthy,Class 12,0,6.0,5.0,0,0
135495,135495,Himani,0,18.0,Nagpur,1,,,3.0,,,5.0,Less than 5 hours,Moderate,Class 12,0,9.0,5.0,1,0
50737,50737,Kartikeya,1,33.0,Srinagar,0,Student,5.0,,7.83,3.0,,Less than 5 hours,Unhealthy,BSc,1,12.0,2.0,1,1
73649,73649,Harsha,1,32.0,Nagpur,0,Student,3.0,,7.38,1.0,,Less than 5 hours,Moderate,B.Ed,1,8.0,2.0,0,1
46616,46616,Naina,0,22.0,Faridabad,1,Doctor,,5.0,,,2.0,Less than 5 hours,Healthy,B.Pharm,0,4.0,4.0,1,0


In [None]:
#normalize data

In [13]:
from OurModels.LinReg import LinReg

# Create an instance
model = LinReg(df)

# Call the instance method
model.output() 



ðŸ“Š Results for linear:
Mean Absolute Error (MAE): 0.18908353996727065
Mean Squared Error (MSE): 0.06676429710237661
Root Mean Squared Error (RMSE): 0.06676429710237661
RÂ² Score: 0.5483729391723782
