In [36]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectFromModel

In [44]:
# Load the survey data and prepare it for analysis in one non-mutating chain:
#   1. discard rows that have no 'Financial Stress' value,
#   2. encode the two Yes/No answers as integers ('Yes' -> 1, anything else -> 0).
YES_NO_COLUMNS = ['Family History of Mental Illness',
                  'Have you ever had suicidal thoughts ?']

df_raw = (
    pd.read_csv('Student Depression Dataset.csv')
    .dropna(subset=['Financial Stress'])
    .assign(**{
        # `col=col` binds the current column name for each generated callable.
        col: (lambda frame, col=col: frame[col].eq('Yes').astype('int64'))
        for col in YES_NO_COLUMNS
    })
)

df_raw.head()

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,1,3.0,1.0,0,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,5-6 hours,Moderate,BSc,0,3.0,2.0,1,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,Less than 5 hours,Healthy,BA,0,9.0,1.0,1,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,7-8 hours,Moderate,BCA,1,4.0,5.0,1,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,5-6 hours,Moderate,M.Tech,1,1.0,1.0,0,0


In [32]:
# Build the modelling frame for the lasso regression: drop the identifier and the
# categorical columns that are not modelled, then one-hot encode the two
# remaining categorical columns.
df_lasso = pd.get_dummies(
    df_raw.drop(columns=["Gender", 'City', "Profession", 'Degree', 'id']),
    columns=['Dietary Habits', 'Sleep Duration'],
)

# Features are every column except the target; the target is the Depression flag.
y = df_lasso["Depression"]
X = df_lasso.drop(columns="Depression")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize: the scaler learns its statistics from the training split only and
# the same transformation is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Adjust alpha as needed
lasso = Lasso(alpha=.1).fit(X_train, y_train)
y_pred = lasso.predict(X_test)

# Evaluate the model on the held-out split
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 0.15376489511203154
R-squared: 0.3691063597196873


In [43]:
# Feature selection with an L1-penalised logistic regression: features whose
# coefficient is shrunk to (effectively) zero are reported as removed.
X = df_lasso.drop("Depression", axis=1)
y = df_lasso["Depression"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize: fit on the training split only, then apply to both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

sel_ = SelectFromModel(
    LogisticRegression(C=0.1, penalty='l1', solver='liblinear', random_state=10))
# X_train is already standardized above. Passing it through scaler.transform a
# second time would shift and rescale every feature again and distort the
# per-feature strength of the L1 penalty, so fit on it directly.
sel_.fit(X_train, y_train)

# Boolean mask over X.columns: True for the features the selector keeps.
lst = sel_.get_support()

# The complement of the support mask is exactly what the selector discards.
removed_feats = X.columns[~lst]
removed_feats




Index(['Dietary Habits_Healthy', 'Sleep Duration_7-8 hours'], dtype='object')

In [None]:
# Rows of df_raw whose Depression flag is 1.
df_dep = df_raw[df_raw['Depression'] == 1]

# Report the mean study satisfaction of that group; the ": " keeps the label
# from running straight into the number.
print(
    "Average Stats for depressed participants:\n"
    f"  Study satisfaction: {df_dep['Study Satisfaction'].mean()}"
)

Stats for depressed participants:
2.7514691478942215


In [13]:
# Distinct degree labels present in the data, in order of first appearance.
pd.unique(df_raw['Degree'])

array(['B.Pharm', 'BSc', 'BA', 'BCA', 'M.Tech', 'PhD', 'Class 12', 'B.Ed',
       'LLB', 'BE', 'M.Ed', 'MSc', 'BHM', 'M.Pharm', 'MCA', 'MA', 'B.Com',
       'MD', 'MBA', 'MBBS', 'M.Com', 'B.Arch', 'LLM', 'B.Tech', 'BBA',
       'ME', 'MHM', 'Others'], dtype=object)

In [5]:
# Distinct sleep-duration labels present in the data, in order of first appearance.
pd.unique(df_raw['Sleep Duration'])

array(['5-6 hours', 'Less than 5 hours', '7-8 hours', 'More than 8 hours',
       'Others'], dtype=object)

In [18]:
# Descriptive statistics for CGPA and how it (and depression) varies by group.
print("CGPA:")
print(df_raw['CGPA'].describe())

# df_dep contains only the rows with Depression == 1.
cgpa_phd_dep = df_dep[df_dep['Degree'] == 'PhD']['CGPA'].mean()
print(f"\nAverage CGPA of PhD students who are depressed: {cgpa_phd_dep}")

cgpa_sleep = df_raw.groupby('Sleep Duration')['CGPA'].mean()
print("\nAverage CGPA by Sleep Duration:")
print(cgpa_sleep)

# Depression is a 0/1 flag, so its group mean is the share of depressed students.
depression_by_sleep = df_raw.groupby('Sleep Duration')['Depression'].mean()
print("\nProportion of students with depression by Sleep Duration:")
print(depression_by_sleep)


avg_cgpa_by_degree = df_raw.groupby('Degree')['CGPA'].mean()
print("\nAverage CGPA by Degree:")
print(avg_cgpa_by_degree)

# 'Sleep Duration' is deliberately left out of the correlation: it holds text
# labels such as '5-6 hours', which cannot enter a numeric correlation. Recent
# pandas versions raise on such a column, while older ones silently dropped it,
# so omitting it gives the same matrix everywhere.
correlation_columns = ['Age', 'Academic Pressure', 'Work Pressure', 'CGPA',
                       'Study Satisfaction', 'Job Satisfaction',
                       'Work/Study Hours', 'Financial Stress', 'Depression']
correlation_matrix = df_raw[correlation_columns].corr()
print("\nCorrelation Matrix:")
print(correlation_matrix)


CGPA:
count    27901.000000
mean         7.656104
std          1.470707
min          0.000000
25%          6.290000
50%          7.770000
75%          8.920000
max         10.000000
Name: CGPA, dtype: float64

Average CGPA of PhD students who are depressed: 7.769265734265735

Average CGPA by Sleep Duration:
Sleep Duration
5-6 hours            7.688737
7-8 hours            7.686127
Less than 5 hours    7.640473
More than 8 hours    7.607902
Others               7.595556
Name: CGPA, dtype: float64

Proportion of students with depression by Sleep Duration:
Sleep Duration
5-6 hours            0.568818
7-8 hours            0.595018
Less than 5 hours    0.645126
More than 8 hours    0.509265
Others               0.500000
Name: Depression, dtype: float64

Average CGPA by Degree:
Degree
B.Arch      7.616502
B.Com       7.671826
B.Ed        7.850911
B.Pharm     7.707753
B.Tech      7.636513
BA          7.761300
BBA         7.714468
BCA         7.769281
BE          7.484617
BHM         7.649859


In [None]:

# How CGPA, depression, study satisfaction and sleep vary with financial stress.
cgpa_by_financial_stress = df_raw.groupby('Financial Stress')['CGPA'].mean()
print("\nAverage CGPA by Financial Stress Level:")
print(cgpa_by_financial_stress)


# Depression is a 0/1 flag, so its group mean is the share of depressed students.
depression_by_financial_stress = df_raw.groupby('Financial Stress')['Depression'].mean()
print("\nProportion of students with depression by Financial Stress Level:")
print(depression_by_financial_stress)


study_satisfaction_by_financial_stress = df_raw.groupby('Financial Stress')['Study Satisfaction'].mean()
print("\nAverage Study Satisfaction by Financial Stress Level:")
print(study_satisfaction_by_financial_stress)


# normalize=True turns the counts into within-group proportions.
sleep_duration_distribution = df_raw.groupby('Financial Stress')['Sleep Duration'].value_counts(normalize=True)
print("\nSleep Duration proportions by Financial Stress Level:")
print(sleep_duration_distribution)

# df_raw still carries text columns (e.g. Gender, City, Degree). Recent pandas
# versions raise when .corr() meets them, so restrict to the numeric columns
# before correlating; this works on old and new pandas alike.
correlation_with_financial_stress = (
    df_raw.select_dtypes(include='number').corr()['Financial Stress']
)
print("\nCorrelation of Financial Stress with other numerical variables:")
print(correlation_with_financial_stress)


Average CGPA by Financial Stress Level:
Financial Stress
1.0    7.656335
2.0    7.630041
3.0    7.640758
4.0    7.691988
5.0    7.656886
Name: CGPA, dtype: float64

Proportion of students with depression by Financial Stress Level:
Financial Stress
1.0    0.318688
2.0    0.429757
3.0    0.589361
4.0    0.690909
5.0    0.812807
Name: Depression, dtype: float64

Average Study Satisfaction by Financial Stress Level:
Financial Stress
1.0    3.134739
2.0    2.972337
3.0    2.889591
4.0    2.906494
5.0    2.851675
Name: Study Satisfaction, dtype: float64

Sleep Duration distribution by Financial Stress Level (proportion):
Financial Stress  Sleep Duration   
1.0               Less than 5 hours    0.319469
                  7-8 hours            0.247998
                  More than 8 hours    0.217926
                  5-6 hours            0.213630
                  Others               0.000976
2.0               Less than 5 hours    0.274649
                  7-8 hours            0.262201
    