In [1]:
#imports
import os

import pandas as pd
from dotenv import load_dotenv

from OurModels.LinReg import LinReg

# Populate os.environ from a local .env file, if one is present
load_dotenv()

# The training CSV path is supplied through the TRAIN_LOCATION environment variable
train_csv_path = os.environ['TRAIN_LOCATION']
df = pd.read_csv(train_csv_path)

# Shorten the three wordiest survey column labels before normalising
short_labels = {
    'Working Professional or Student': 'Working Student',
    'Have you ever had suicidal thoughts ?': 'Suicidal Thoughts',
    'Family History of Mental Illness': 'Family Mental Illness',
}
df = df.rename(columns=short_labels)

# Normalise every column label to snake_case, one step at a time
normalised = df.columns.str.strip()                                      # trim leading/trailing spaces
normalised = normalised.str.replace(' ', '_')                            # spaces -> underscores
normalised = normalised.str.replace('[^A-Za-z0-9_]+', '', regex=True)    # drop remaining special characters
df.columns = normalised.str.lower()                                      # lowercase everything

# Preview the first rows with the cleaned-up labels
df.head()

Unnamed: 0,id,name,gender,age,city,working_student,profession,academic_pressure,work_pressure,cgpa,study_satisfaction,job_satisfaction,sleep_duration,dietary_habits,degree,suicidal_thoughts,workstudy_hours,financial_stress,family_mental_illness,depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


In [2]:
# Check for duplicates and missing values
print(df.duplicated().sum())  # full-row duplicates (0 in the recorded run)

# The full-row comparison above also compares `id`. If `id` is unique per row
# (as the previewed rows suggest - confirm), that check can never fire, so also
# count records that are identical once the identifier is ignored.
print(df.duplicated(subset=[col for col in df.columns if col != 'id']).sum())

# Per-column count of missing values
df.isna().sum()  # We have a lot of missing data that we will deal with in different ways

0


id                            0
name                          0
gender                        0
age                           0
city                          0
working_student               0
profession                36630
academic_pressure        112803
work_pressure             27918
cgpa                     112802
study_satisfaction       112803
job_satisfaction          27910
sleep_duration                0
dietary_habits                4
degree                        2
suicidal_thoughts             0
workstudy_hours               0
financial_stress              4
family_mental_illness         0
depression                    0
dtype: int64

In [3]:
# Preprocessing

# Encode the two Yes/No survey answers as 1/0.
# Only these columns are converted: an element-wise pass over the whole frame
# would also rewrite a literal 'Yes'/'No' appearing in any other column, and
# DataFrame.map is only available from pandas 2.1 onwards.
YES_NO_CODES = {'Yes': 1, 'No': 0}
for column in ['suicidal_thoughts', 'family_mental_illness']:
    # Any value other than 'Yes'/'No' (including NaN) passes through unchanged
    df[column] = df[column].map(lambda answer: YES_NO_CODES.get(answer, answer))

# NOTE: the two dict-based maps below turn every value that is not a key into
# NaN, so re-running this cell on an already-encoded frame blanks both columns.
# Re-run from the loading cell instead.

# Make working_student binary: 1 if working, 0 if student
df['working_student'] = df['working_student'].map({'Working Professional': 1, 'Student': 0})

# Convert gender to binary: Male = 1, Female = 0
df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})

# TODO: decide how to handle the remaining nulls, e.g. set profession to
# 'Student' wherever working_student is 0.

# Print null value counts
print(df.isna().sum())

# Remove rows whose sleep_duration value occurs fewer than
# MIN_SLEEP_DURATION_COUNT times. value_counts() ignores NaN, so rows with a
# missing sleep_duration are dropped as well.
MIN_SLEEP_DURATION_COUNT = 10
sleep_counts = df['sleep_duration'].value_counts()
frequent_sleep_values = sleep_counts[sleep_counts >= MIN_SLEEP_DURATION_COUNT].index
# .copy() gives an independent frame so later column assignments do not act on
# a slice of the pre-filter data.
df = df[df['sleep_duration'].isin(frequent_sleep_values)].copy()

# Display a random sample; seeded so the preview is the same on every run
display(df.sample(10, random_state=0))



id                            0
name                          0
gender                        0
age                           0
city                          0
working_student               0
profession                36630
academic_pressure        112803
work_pressure             27918
cgpa                     112802
study_satisfaction       112803
job_satisfaction          27910
sleep_duration                0
dietary_habits                4
degree                        2
suicidal_thoughts             0
workstudy_hours               0
financial_stress              4
family_mental_illness         0
depression                    0
dtype: int64


Unnamed: 0,id,name,gender,age,city,working_student,profession,academic_pressure,work_pressure,cgpa,study_satisfaction,job_satisfaction,sleep_duration,dietary_habits,degree,suicidal_thoughts,workstudy_hours,financial_stress,family_mental_illness,depression
30807,30807,Pratham,1,46.0,Patna,1,Architect,,1.0,,,4.0,7-8 hours,Moderate,B.Arch,1,10.0,5.0,1,0
101293,101293,Shivam,1,47.0,Ahmedabad,1,Teacher,,2.0,,,1.0,7-8 hours,Healthy,B.Arch,1,11.0,2.0,0,0
117079,117079,Tushar,1,55.0,Rajkot,1,Pilot,,3.0,,,2.0,5-6 hours,Unhealthy,MCA,1,5.0,3.0,0,0
35334,35334,Yash,1,59.0,Varanasi,1,Teacher,,3.0,,,1.0,7-8 hours,Unhealthy,LLB,1,1.0,5.0,1,0
71846,71846,Aarush,1,40.0,Mumbai,1,Business Analyst,,4.0,,,2.0,5-6 hours,Moderate,BCA,1,2.0,3.0,0,0
37770,37770,Raghavendra,1,36.0,Ludhiana,1,Teacher,,1.0,,,1.0,5-6 hours,Healthy,BCA,0,4.0,5.0,1,0
67166,67166,Raghavendra,1,32.0,Nagpur,1,Pilot,,3.0,,,4.0,More than 8 hours,Healthy,M.Tech,1,5.0,5.0,1,0
90929,90929,Aariv,1,42.0,Vasai-Virar,1,Content Writer,,1.0,,,5.0,Less than 5 hours,Moderate,B.Ed,0,2.0,3.0,0,0
116061,116061,Aariv,1,35.0,Delhi,1,Content Writer,,4.0,,,1.0,More than 8 hours,Moderate,MA,1,0.0,2.0,0,0
117477,117477,Arnav,1,20.0,Ludhiana,0,,2.0,,5.67,2.0,,Less than 5 hours,Healthy,Class 12,1,11.0,3.0,1,1


In [4]:
# Fit the project's linear-regression wrapper on the preprocessed frame.
# LinReg is imported together with the other dependencies in the first cell.
model = LinReg(df)

# NOTE(review): the recorded coefficient table lists `id` as a feature. It
# looks like a row identifier, so consider excluding it before fitting -
# confirm how LinReg selects its feature columns first.

# Report the fit. Per the recorded output this prints the mean squared error,
# the R² score and a feature/coefficient table (it does not print df.head()).
model.output()


Mean Squared Error: 0.06676429710237661
R² Score: 0.5483729391723782
                  feature   coefficient
0                      id  3.040435e-08
1                  gender  6.100191e-03
2                     age -1.047155e-02
3         working_student -2.677455e-01
4       academic_pressure  1.393265e-01
5           work_pressure  2.943334e-02
6                    cgpa  7.964445e-03
7      study_satisfaction -3.560930e-02
8        job_satisfaction -2.430195e-02
9       suicidal_thoughts  1.619961e-01
10        workstudy_hours  9.056009e-03
11       financial_stress  3.867007e-02
12  family_mental_illness  1.270471e-02
