In [1]:
#imports
import pandas as pd
import os
from dotenv import load_dotenv

# Load variables from a .env file (python-dotenv) so TRAIN_LOCATION can be
# supplied without hard-coding a path in the notebook
load_dotenv()

# Read the training CSV and shorten the three longest survey headers in one go
df = pd.read_csv(os.environ['TRAIN_LOCATION']).rename(
    columns={
        'Working Professional or Student': 'Working Student',
        'Have you ever had suicidal thoughts ?': 'Suicidal Thoughts',
        'Family History of Mental Illness': 'Family Mental Illness',
    }
)

# Convert all column names to snake_case, one transformation per line
df.columns = df.columns.str.strip()                                       # drop leading/trailing spaces
df.columns = df.columns.str.replace(' ', '_')                             # spaces -> underscores
df.columns = df.columns.str.replace('[^A-Za-z0-9_]+', '', regex=True)     # strip special characters
df.columns = df.columns.str.lower()                                       # lowercase everything

# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,id,name,gender,age,city,working_student,profession,academic_pressure,work_pressure,cgpa,study_satisfaction,job_satisfaction,sleep_duration,dietary_habits,degree,suicidal_thoughts,workstudy_hours,financial_stress,family_mental_illness,depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


In [2]:
# Data-quality check: fully duplicated rows first, then nulls per column
duplicate_count = df.duplicated().sum()
print(duplicate_count)  # prints 0 -> the frame has no duplicated rows

# Per-column null counts; several columns have a lot of missing data, which
# will be dealt with in different ways
df.isna().sum()

0


id                            0
name                          0
gender                        0
age                           0
city                          0
working_student               0
profession                36630
academic_pressure        112803
work_pressure             27918
cgpa                     112802
study_satisfaction       112803
job_satisfaction          27910
sleep_duration                0
dietary_habits                4
degree                        2
suicidal_thoughts             0
workstudy_hours               0
financial_stress              4
family_mental_illness         0
depression                    0
dtype: int64

In [3]:
# Preprocessing
#
# Encodes the binary survey answers as integers and drops rows whose
# sleep_duration label is too rare to be useful.
# NOTE: run this cell once on a freshly loaded `df`. The dict-based `.map`
# calls turn values that are not keys of the mapping (including values that
# were already encoded) into NaN, so executing the cell twice wipes the columns.

# Labels shared by the Yes/No columns
YES_NO_CODES = {'Yes': 1, 'No': 0}

# Minimum number of rows a sleep_duration label needs in order to be kept
MIN_SLEEP_DURATION_COUNT = 10

# Encode only the columns that really are binary. Mapping the whole frame
# element by element would also rewrite any 'Yes'/'No' that appears in a text
# column, leaving that column with mixed str/int values.
df = df.assign(
    # Yes = 1, No = 0
    suicidal_thoughts=df['suicidal_thoughts'].map(YES_NO_CODES),
    family_mental_illness=df['family_mental_illness'].map(YES_NO_CODES),
    # 1 if working professional, 0 if student
    working_student=df['working_student'].map({'Working Professional': 1, 'Student': 0}),
    # Male = 1, Female = 0
    gender=df['gender'].map({'Male': 1, 'Female': 0}),
)

# TODO: handle null values if needed.
# For example, set profession to 'Student' if working_student is 0:
# df.loc[df['working_student'] == 0, 'profession'] = 'Student'

# Print null value counts (taken before the rare-label filter below)
print(df.isna().sum())

# Remove sleep_duration occurrences that appear fewer than
# MIN_SLEEP_DURATION_COUNT times; the counts are computed once and reused
sleep_duration_counts = df['sleep_duration'].value_counts()
common_sleep_durations = sleep_duration_counts[
    sleep_duration_counts >= MIN_SLEEP_DURATION_COUNT
].index
# .copy() makes the result an independent frame, so later column assignments
# do not trigger SettingWithCopyWarning
df = df[df['sleep_duration'].isin(common_sleep_durations)].copy()

# Display a random sample; the fixed seed keeps the shown rows reproducible
display(df.sample(10, random_state=42))



id                            0
name                          0
gender                        0
age                           0
city                          0
working_student               0
profession                36630
academic_pressure        112803
work_pressure             27918
cgpa                     112802
study_satisfaction       112803
job_satisfaction          27910
sleep_duration                0
dietary_habits                4
degree                        2
suicidal_thoughts             0
workstudy_hours               0
financial_stress              4
family_mental_illness         0
depression                    0
dtype: int64


Unnamed: 0,id,name,gender,age,city,working_student,profession,academic_pressure,work_pressure,cgpa,study_satisfaction,job_satisfaction,sleep_duration,dietary_habits,degree,suicidal_thoughts,workstudy_hours,financial_stress,family_mental_illness,depression
134719,134719,Monika,0,60.0,Ahmedabad,1,Teacher,,1.0,,,2.0,5-6 hours,Healthy,B.Arch,0,2.0,5.0,0,0
123206,123206,Mahi,0,37.0,Ludhiana,1,Content Writer,,3.0,,,4.0,More than 8 hours,Healthy,B.Ed,1,11.0,3.0,0,0
122891,122891,Vihaan,1,43.0,Meerut,1,Lawyer,,1.0,,,4.0,Less than 5 hours,Moderate,LLM,1,10.0,2.0,0,0
102926,102926,Krishna,1,27.0,Visakhapatnam,1,Lawyer,,3.0,,,1.0,Less than 5 hours,Moderate,LLM,0,0.0,2.0,1,0
24630,24630,Shaurya,1,18.0,Vadodara,1,,,5.0,,,4.0,Less than 5 hours,Healthy,Class 12,1,4.0,3.0,0,1
101857,101857,Shivam,1,57.0,Delhi,1,Data Scientist,,5.0,,,1.0,7-8 hours,Moderate,MCA,1,9.0,4.0,0,0
6798,6798,Armaan,1,56.0,Patna,1,Architect,,5.0,,,3.0,7-8 hours,Healthy,MSc,0,2.0,5.0,0,0
109123,109123,Nikhil,1,38.0,Pune,1,Teacher,,2.0,,,1.0,More than 8 hours,Moderate,BBA,1,5.0,4.0,1,0
130732,130732,Gagan,1,28.0,Patna,0,,4.0,,8.97,2.0,,More than 8 hours,Healthy,MSc,0,7.0,1.0,0,0
21836,21836,Vivan,1,31.0,Indore,1,,,4.0,,,4.0,Less than 5 hours,Healthy,Class 12,0,2.0,5.0,0,0


In [4]:
from OurModels.LinReg import LinReg

# Create an instance of the project's LinReg class from the preprocessed frame
model = LinReg(df)

# Call the instance method. The recorded output of this cell shows it printing
# MAE / MSE / RMSE for "linear" and then failing with
# NameError: name 'median_absolute_error' is not defined.
# NOTE(review): that name is not referenced in this cell, so the missing
# import is presumably inside OurModels.LinReg — confirm and fix it there.
# NOTE(review): the printed RMSE is identical to the MSE, which suggests the
# square root is never applied — verify in LinReg.output().
model.output()  # prints regression metrics, not df.head()



📊 Results for linear:
Mean Absolute Error (MAE): 0.18908353996727065
Mean Squared Error (MSE): 0.06676429710237661
Root Mean Squared Error (RMSE): 0.06676429710237661


NameError: name 'median_absolute_error' is not defined