# Import Libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lazypredict.Supervised import LazyClassifier
import joblib

In [4]:
# Read the raw survey CSV into a DataFrame and preview its first rows
data = pd.read_csv('Student Depression Dataset.csv')
data.head()

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,7-8 hours,Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,5-6 hours,Moderate,M.Tech,Yes,1.0,1.0,No,0


In [5]:
# Print the column names, non-null counts and dtypes of the raw frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     27901 non-null  int64  
 1   Gender                                 27901 non-null  object 
 2   Age                                    27901 non-null  float64
 3   City                                   27901 non-null  object 
 4   Profession                             27901 non-null  object 
 5   Academic Pressure                      27901 non-null  float64
 6   Work Pressure                          27901 non-null  float64
 7   CGPA                                   27901 non-null  float64
 8   Study Satisfaction                     27901 non-null  float64
 9   Job Satisfaction                       27901 non-null  float64
 10  Sleep Duration                         27901 non-null  object 
 11  Di

In [6]:
# Discard the 'id' column; the result is rebound to `data`, so this cell
# raises KeyError if it is run a second time on the same kernel.
data = data.drop(columns='id')

# Statistics Summary

In [7]:
# Summary statistics: describe() reports count, mean, std, min, quartiles and
# max for the numeric columns; the object columns are not included.
print("\nSummary Statistics (Numerical Columns):")
data.describe()


Summary Statistics (Numerical Columns):


Unnamed: 0,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Work/Study Hours,Financial Stress,Depression
count,27901.0,27901.0,27901.0,27901.0,27901.0,27901.0,27901.0,27898.0,27901.0
mean,25.82,3.14,0.0,7.66,2.94,0.0,7.16,3.14,0.59
std,4.91,1.38,0.04,1.47,1.36,0.04,3.71,1.44,0.49
min,18.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,21.0,2.0,0.0,6.29,2.0,0.0,4.0,2.0,0.0
50%,25.0,3.0,0.0,7.77,3.0,0.0,8.0,3.0,1.0
75%,30.0,4.0,0.0,8.92,4.0,0.0,10.0,4.0,1.0
max,59.0,5.0,5.0,10.0,5.0,4.0,12.0,5.0,1.0


In [8]:
# Count the distinct values in each column (a quick cardinality check
# before the categorical columns are encoded)
unique_counts = data.nunique()
print("\nUnique Values Count:\n", unique_counts)


Unique Values Count:
 Gender                                     2
Age                                       34
City                                      52
Profession                                14
Academic Pressure                          6
Work Pressure                              3
CGPA                                     332
Study Satisfaction                         6
Job Satisfaction                           5
Sleep Duration                             5
Dietary Habits                             4
Degree                                    28
Have you ever had suicidal thoughts ?      2
Work/Study Hours                          13
Financial Stress                           5
Family History of Mental Illness           2
Depression                                 2
dtype: int64


In [9]:
# DataFrame.mode() returns one row per tied mode; the first row holds one
# most-frequent value for every column.
print("\nMode (Most Frequent Values):")
print(data.mode().iloc[0])


Mode (Most Frequent Values):
Gender                                                Male
Age                                                  24.00
City                                                Kalyan
Profession                                         Student
Academic Pressure                                     3.00
Work Pressure                                         0.00
CGPA                                                  8.04
Study Satisfaction                                    4.00
Job Satisfaction                                      0.00
Sleep Duration                           Less than 5 hours
Dietary Habits                                   Unhealthy
Degree                                            Class 12
Have you ever had suicidal thoughts ?                  Yes
Work/Study Hours                                     10.00
Financial Stress                                      5.00
Family History of Mental Illness                        No
Depression                

In [10]:
# Impute the missing 'Financial Stress' values with the column median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected Series: that chained in-place form
# acts on an intermediate object, triggers a FutureWarning in pandas 2.x and
# leaves `data` unchanged once copy-on-write is enabled.
data['Financial Stress'] = data['Financial Stress'].fillna(data['Financial Stress'].median())

In [11]:
# Re-inspect dtypes and non-null counts after dropping 'id' and imputing
# 'Financial Stress'
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 17 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Gender                                 27901 non-null  object 
 1   Age                                    27901 non-null  float64
 2   City                                   27901 non-null  object 
 3   Profession                             27901 non-null  object 
 4   Academic Pressure                      27901 non-null  float64
 5   Work Pressure                          27901 non-null  float64
 6   CGPA                                   27901 non-null  float64
 7   Study Satisfaction                     27901 non-null  float64
 8   Job Satisfaction                       27901 non-null  float64
 9   Sleep Duration                         27901 non-null  object 
 10  Dietary Habits                         27901 non-null  object 
 11  De

In [12]:
from sklearn.preprocessing import LabelEncoder

In [13]:
# NOTE(review): `lb` is not referenced by any later cell visible in this
# notebook (the encoding loop builds its own encoder per column) -- confirm
# whether it can be removed.
lb = LabelEncoder()

In [14]:
# Names of every non-numeric column; these are the ones that get label-encoded
cat_col = data.select_dtypes(exclude=np.number).columns

In [15]:
# Fitted encoders, keyed by column name, so the same category -> integer
# mapping can be reapplied to new inputs later
encoders = {}

# Give each categorical column its own LabelEncoder: fit it on the column,
# remember it, then overwrite the column in `data` with the integer codes
for col in cat_col:
    le = LabelEncoder().fit(data[col])
    encoders[col] = le
    data[col] = le.transform(data[col])

# Persist the whole dictionary of encoders with joblib
joblib.dump(encoders, 'encoders.pkl')

data.head()

Unnamed: 0,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,1,33.0,51,11,5.0,0.0,8.97,2.0,0.0,0,0,3,1,3.0,1.0,0,1
1,0,24.0,3,11,2.0,0.0,5.9,5.0,0.0,0,1,10,0,3.0,2.0,1,0
2,1,31.0,44,11,3.0,0.0,7.03,5.0,0.0,2,0,5,0,9.0,1.0,1,0
3,0,28.0,49,11,3.0,0.0,5.59,2.0,0.0,1,1,7,1,4.0,5.0,1,1
4,0,25.0,16,11,4.0,0.0,8.13,3.0,0.0,0,1,17,1,1.0,1.0,0,0


In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardise every feature column (everything except the 'Depression'
# target) so that no single column dominates the principal components.
# NOTE(review): the scaler and PCA are fitted on the full dataset, before the
# train/test split performed later in the notebook, so held-out rows influence
# the fitted transforms -- consider fitting on the training rows only.
scaler = StandardScaler()
scaled_df = scaler.fit_transform(data.drop("Depression",axis=1))


# Project the scaled features onto the first two principal components.
# random_state pins the randomized SVD solver that PCA's default 'auto'
# setting may select for data of this shape, so a Restart & Run All
# reproduces the same components (42 matches the seed used for the split).
pca = PCA(n_components=2, random_state=42)
principal_components = pca.fit_transform(scaled_df)

In [17]:
# Display the projected array: one row per sample, one column per component
principal_components

array([[-0.04763295, -0.48190431],
       [ 0.22248599, -1.92854308],
       [ 0.13633925, -2.18136921],
       ...,
       [ 0.12979183, -0.89404637],
       [-0.14750572,  2.00509174],
       [-0.10912622,  0.2805841 ]])

In [18]:
# Wrap the two projected components in a labelled DataFrame
pca_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])
# Report the share of total variance captured by each retained component
print("Explained Variance Ratio:", pca.explained_variance_ratio_)

Explained Variance Ratio: [0.11132236 0.10178261]


In [19]:
# Preview the first rows of the principal-component frame
pca_df.head()

Unnamed: 0,PC1,PC2
0,-0.05,-0.48
1,0.22,-1.93
2,0.14,-2.18
3,0.07,0.63
4,-0.0,-0.5


In [20]:
from sklearn.model_selection import train_test_split
# Target labels; the features are the two principal components in pca_df
y = data['Depression']
# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split
x_train, x_test, y_train, y_test = train_test_split(
    pca_df, y, test_size=0.2, random_state=42
)

In [21]:
# Benchmark lazypredict's suite of classifiers (default settings) on the split;
# `models` is the per-model score table displayed in the next cell
clf = LazyClassifier()
models, predictions = clf.fit(x_train, x_test, y_train, y_test)
# presumably a mapping of model name -> fitted estimator; confirm against the
# lazypredict documentation
model_dictionary = clf.provide_models(x_train, x_test, y_train, y_test)

 97%|███████████████████████████████████████████████████████████████████████████████▍  | 31/32 [02:31<00:02,  2.91s/it]

[LightGBM] [Info] Number of positive: 13098, number of negative: 9222
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001256 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 22320, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.586828 -> initscore=0.350868
[LightGBM] [Info] Start training from score 0.350868


100%|██████████████████████████████████████████████████████████████████████████████████| 32/32 [02:32<00:00,  4.75s/it]


In [22]:
# Display the score table returned by LazyClassifier.fit
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Perceptron,0.83,0.83,0.83,0.83,0.06
BernoulliNB,0.82,0.83,0.83,0.83,0.05
AdaBoostClassifier,0.83,0.83,0.83,0.83,1.11
PassiveAggressiveClassifier,0.83,0.83,0.83,0.83,0.08
NearestCentroid,0.83,0.83,0.83,0.83,1.19
RidgeClassifierCV,0.83,0.83,0.83,0.83,0.06
RidgeClassifier,0.83,0.83,0.83,0.83,0.05
CalibratedClassifierCV,0.83,0.83,0.83,0.83,0.21
LabelPropagation,0.83,0.83,0.83,0.83,40.81
LogisticRegression,0.83,0.83,0.83,0.83,0.09


In [23]:
## We will go with logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [24]:
# Train a default-configured logistic regression on the two PCA features
model = LogisticRegression().fit(x_train, y_train)
# Predict the held-out rows; the accuracy is the cell's displayed value
y_pred = model.predict(x_test)
accuracy_score(y_test, y_pred)

0.8331840172012184

In [25]:
def user_predict(new_data):
    """Predict the depression label for new, already-encoded samples.

    Runs ``new_data`` through the same steps used for training, using the
    notebook-level fitted objects ``scaler``, ``pca`` and ``model``; those
    must exist in the kernel before this function is called.

    Parameters
    ----------
    new_data : array-like of shape (n_samples, n_features)
        Feature rows laid out like the frame the scaler was fitted on
        (``data`` without the ``Depression`` column). This function applies
        no label encoding, so categorical columns must already be numeric --
        presumably encoded with the encoders saved to ``encoders.pkl``
        (confirm against the caller).

    Returns
    -------
    The value returned by ``model.predict``: one predicted label per row.
    """
    scaled_new_data = scaler.transform(new_data)
    new_principal_components = pca.transform(scaled_new_data)
    # Use the same column names the model saw during fitting.
    new_pca_df = pd.DataFrame(data=new_principal_components, columns=['PC1', 'PC2'])
    return model.predict(new_pca_df)

In [26]:
# Write the fitted classifier, scaler and PCA to disk with joblib
joblib.dump(model, 'model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(pca, 'pca.pkl')

['pca.pkl']

In [27]:

# Reload the original CSV into `df`: `data` has been label-encoded in place,
# so the raw category values are read again from disk
df = pd.read_csv('Student Depression Dataset.csv')
df.head()

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,7-8 hours,Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,5-6 hours,Moderate,M.Tech,Yes,1.0,1.0,No,0


In [28]:
# Frequency of each 'Financial Stress' level in the freshly loaded raw data
df['Financial Stress'].value_counts()

Financial Stress
5.00    6715
4.00    5775
3.00    5226
1.00    5121
2.00    5061
Name: count, dtype: int64

In [29]:
import json

# Collect the distinct values of the 'Degree' column from the raw DataFrame `df`
degree = df['Degree'].unique().tolist()

# Save the list of degrees to a JSON file
with open('degree.json', 'w') as f:
    json.dump(degree, f)