In [1]:
## LIBS
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

## READ DATA
# Load the student depression CSV from the Kaggle input directory and preview
# the first three rows.
DATASET_CSV = r'/kaggle/input/student-depression-dataset/Student Depression Dataset.csv'
main = pd.read_csv(DATASET_CSV)
main.iloc[:3]

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0


In [None]:
## REMOVING ID
# The row identifier is not used as a feature, so the column is dropped.
main = main.drop(columns=['id'])

## CHANGING CATEGORICAL TO NUMERICAL (GENDER)
# Build the encoded column with `map` and attach it with `assign` instead of
# writing integers into the text column through `.loc`: element-wise writes
# leave the column as object dtype and are rejected by pandas' newer string
# dtype. Values missing from the mapping become NaN (the `.loc` version left
# them untouched).
gender_codes = {'Male': 0, 'Female': 1}
main = main.assign(Gender=main['Gender'].map(gender_codes))
main.head(3)


In [None]:
## CITY NAMES AND HOW OFTEN EACH ONE APPEARS
# Frequency table used to decide which cities to keep.
main.loc[:, 'City'].value_counts()

In [None]:
## DROPPING CITIES REPRESENTED BY FEWER THAN 400 STUDENTS
# Count once, pick the under-represented cities, then keep only rows whose
# city is not in that set.
city_counts = main['City'].value_counts()
cities_to_remove = city_counts[city_counts < 400]
in_large_city = ~main['City'].isin(cities_to_remove.index)
main = main[in_large_city]
main['City'].value_counts()

In [None]:
# Preview the first three rows after the city filter.
main.iloc[:3]

In [None]:
## PROFESSION NAMES AND HOW OFTEN EACH ONE APPEARS
# Frequency table of the Profession column.
main.loc[:, 'Profession'].value_counts()

In [None]:
## KEEPING ONLY STUDENTS (OTHER PROFESSIONS HAVE LOW COUNTS)
# Boolean mask selecting the rows whose profession is exactly 'Student'.
is_student = main['Profession'].eq('Student')
main = main[is_student]
main['Profession'].value_counts()

In [None]:
## WITH A SINGLE PROFESSION LEFT, THE COLUMN CARRIES NO INFORMATION
main = main.drop(columns=['Profession'])

In [None]:
## WORK PRESSURE VALUES AND HOW OFTEN EACH ONE APPEARS
# Frequency table of the Work Pressure column.
main.loc[:, 'Work Pressure'].value_counts()

In [None]:
## DROPPING WORK PRESSURE BECAUSE MOST STUDENTS REPORT NONE
main = main.drop(columns=['Work Pressure'])
main.iloc[:3]

In [None]:
## AGE DISTRIBUTION
# Frequency table of the Age column.
main.loc[:, 'Age'].value_counts()

In [None]:
## KEEPING STUDENTS AGED 30 OR YOUNGER
# Rows with a missing age compare as False and are therefore dropped too.
at_most_30 = main['Age'].le(30)
main = main[at_most_30]
main['Age'].value_counts()

In [None]:
## ACADEMIC PRESSURE VALUES AND HOW OFTEN EACH ONE APPEARS
# Frequency table of the Academic Pressure column.
main.loc[:, 'Academic Pressure'].value_counts()

In [None]:
## DROPPING STUDENTS WITH NO ACADEMIC PRESSURE (LOW COUNTS)
# Keep only rows whose academic pressure is strictly positive.
has_academic_pressure = main['Academic Pressure'].gt(0)
main = main[has_academic_pressure]
main['Academic Pressure'].value_counts()

In [None]:
## STUDY SATISFACTION VALUES AND HOW OFTEN EACH ONE APPEARS
# Frequency table of the Study Satisfaction column.
main.loc[:, 'Study Satisfaction'].value_counts()

In [None]:
## DROPPING STUDENTS WITH NO STUDY SATISFACTION (LOW COUNTS)
# Keep only rows whose study satisfaction is strictly positive.
has_study_satisfaction = main['Study Satisfaction'].gt(0)
main = main[has_study_satisfaction]
main['Study Satisfaction'].value_counts()

In [None]:
# Preview the first three rows after the row filters above.
main.iloc[:3]

In [None]:
## DROPPING JOB SATISFACTION, SINCE THE PROFESSION COLUMN WAS DROPPED
main = main.drop(columns=['Job Satisfaction'])
main.iloc[:3]

In [None]:
## CHECKING THE SLEEP DURATION AND REMOVING 'OTHERS' DUE TO LOW COUNTS
# Only the last expression of a cell is rendered, so the "before" counts are
# printed explicitly; as a bare mid-cell expression they were computed and
# thrown away.
print(main['Sleep Duration'].value_counts())
main = main.loc[main['Sleep Duration'] != 'Others']
main['Sleep Duration'].value_counts()

In [None]:
## CHANGING SLEEP DURATION TO NUMERICAL
# Codes grow with the amount of sleep. The column is rebuilt with `map` and
# attached with `assign` (which returns a new frame) rather than writing
# integers into the text column through `.loc`: that leaves object dtype behind
# and is rejected by pandas' newer string dtype. Labels missing from the
# mapping become NaN.
sleep_codes = {
    'Less than 5 hours': 0,
    '5-6 hours': 1,
    '7-8 hours': 2,
    'More than 8 hours': 3,
}
main = main.assign(**{'Sleep Duration': main['Sleep Duration'].map(sleep_codes)})
main['Sleep Duration'].value_counts()

In [None]:
## CHECKING DIETARY HABITS AND REMOVING 'OTHERS' DUE TO LOW COUNTS
# Only the last expression of a cell is rendered, so the "before" counts are
# printed explicitly; as a bare mid-cell expression they were computed and
# thrown away.
print(main['Dietary Habits'].value_counts())
main = main.loc[main['Dietary Habits'] != 'Others']
main['Dietary Habits'].value_counts()

In [None]:
## CHANGING DIETARY HABITS TO NUMERICAL
# The column is rebuilt with `map` and attached with `assign` rather than
# writing integers into the text column through `.loc`: that leaves object
# dtype behind and is rejected by pandas' newer string dtype. Labels missing
# from the mapping become NaN.
# NOTE(review): 'Moderate' is coded above 'Unhealthy', so the codes are labels
# rather than an ordered scale - confirm this is intended before reading
# coefficients or correlations as monotone effects.
diet_codes = {'Healthy': 0, 'Unhealthy': 1, 'Moderate': 2}
main = main.assign(**{'Dietary Habits': main['Dietary Habits'].map(diet_codes)})
main['Dietary Habits'].value_counts()

In [None]:
## DISTINCT DEGREE NAMES
# List the unique values of the Degree column before grouping them.
main.loc[:, 'Degree'].unique()

In [None]:
## CREATING NEW DEGREE COLUMN WITH ONLY GRADUATED, POST GRADUATED AND HIGHER SECONDARY
# Dots are escaped so that e.g. "B.Ed" matches a literal dot instead of any
# character, and na=False keeps missing degrees out of the masks (a mask
# containing NaN cannot be used for boolean selection).
graduate_pattern = r'BSc|BCA|B\.Ed|BHM|B\.Pharm|B\.Com|BE|BA|B\.Arch|B\.Tech|BBA|LLB'
postgraduate_pattern = r'MSc|MCA|M\.Ed|M\.Pharm|M\.Com|ME|MA|M\.Arch|M\.Tech|MBA|LLM'

degree = main['Degree']
is_graduate = degree.str.contains(graduate_pattern, regex=True, na=False)
is_postgraduate = degree.str.contains(postgraduate_pattern, regex=True, na=False)
is_higher_secondary = degree == 'Class 12'

# Order matters: the match is a substring search, so "MBA" also hits the
# graduate alternative "BA" and has to be overwritten by the post-graduate
# label applied afterwards. Degrees matching none of the groups stay NaN.
new_degree = (
    pd.Series(index=main.index, dtype=object)
    .mask(is_graduate, 'Graduated')
    .mask(is_postgraduate, 'Post Graduated')
    .mask(is_higher_secondary, 'Higher Secondary')
)

# assign() returns a new frame, so the column is not created through a partial
# .loc write on a previously filtered frame.
main = main.assign(New_Degree=new_degree)
main = main.loc[main['Degree'] != 'Others']
main['New_Degree'].value_counts()

In [None]:
## CHANGING NEW DEGREE TO NUMERICAL
# The column is rebuilt with `map` and attached with `assign` rather than
# writing integers into the text column through `.loc`: that leaves object
# dtype behind and is rejected by pandas' newer string dtype. Rows without a
# degree group (NaN) or with a label missing from the mapping stay/become NaN.
# NOTE(review): 'Higher Secondary' gets the highest code, so the codes are
# labels rather than an ordered education level - confirm this is intended.
degree_codes = {'Graduated': 0, 'Post Graduated': 1, 'Higher Secondary': 2}
main = main.assign(New_Degree=main['New_Degree'].map(degree_codes))
main['New_Degree'].value_counts()

In [None]:
## SUICIDAL THOUGHTS ANSWERS AND HOW OFTEN EACH ONE APPEARS
# Frequency table of the suicidal-thoughts question.
main.loc[:, 'Have you ever had suicidal thoughts ?'].value_counts()

In [None]:
## CHANGING SUICIDAL THOUGHTS TO NUMERICAL
# Yes -> 1, No -> 0. The column is rebuilt with `map` and attached with
# `assign` rather than writing integers into the text column through `.loc`:
# that leaves object dtype behind and is rejected by pandas' newer string
# dtype. Answers missing from the mapping become NaN.
suicidal_col = 'Have you ever had suicidal thoughts ?'
yes_no_codes = {'Yes': 1, 'No': 0}
main = main.assign(**{suicidal_col: main[suicidal_col].map(yes_no_codes)})
main[suicidal_col].value_counts()

In [None]:
## WORK/STUDY HOURS VALUES AND HOW OFTEN EACH ONE APPEARS
# Frequency table of the Work/Study Hours column.
main.loc[:, 'Work/Study Hours'].value_counts()

In [None]:
## FINANCIAL STRESS VALUES AND HOW OFTEN EACH ONE APPEARS
# Frequency table of the Financial Stress column.
main.loc[:, 'Financial Stress'].value_counts()

In [None]:
## FAMILY HISTORY OF MENTAL ILLNESS ANSWERS AND HOW OFTEN EACH ONE APPEARS
# Frequency table of the family-history question.
main.loc[:, 'Family History of Mental Illness'].value_counts()

In [None]:
## CHANGING FAMILY HISTORY OF MENTAL ILLNESS TO NUMERICAL
# Yes -> 1, No -> 0. The column is rebuilt with `map` and attached with
# `assign` rather than writing integers into the text column through `.loc`:
# that leaves object dtype behind and is rejected by pandas' newer string
# dtype. Answers missing from the mapping become NaN.
family_col = 'Family History of Mental Illness'
family_codes = {'Yes': 1, 'No': 0}
main = main.assign(**{family_col: main[family_col].map(family_codes)})
main[family_col].value_counts()

In [None]:
# Preview the first three rows once the categorical columns are encoded.
main.iloc[:3]

In [None]:
## CHECKING FOR NULL VALUES AND REMOVING
# The per-column null counts are printed explicitly: as a bare mid-cell
# expression the result was computed and discarded, so the check showed nothing.
print(main.isnull().sum())
main = main.dropna()

## CREATING A NEW DF TO CHECK IF THERE IS ANY CORRELATION BETWEEN THE NUMERIC VARIABLES
# City and the original Degree text column are left out; every column listed
# here has been numerically encoded by the cells above.
correlation_columns = [
    'Gender', 'Age', 'Academic Pressure', 'CGPA',
    'Study Satisfaction', 'Sleep Duration', 'Dietary Habits',
    'Have you ever had suicidal thoughts ?', 'Work/Study Hours',
    'Financial Stress', 'Family History of Mental Illness', 'Depression',
    'New_Degree',
]
main_data = main[correlation_columns]
main_data.head(3)

In [None]:
## PLOTTING THE CORRELATION MATRIX
# Columns encoded through element-wise writes can still be object dtype. Cast
# to float first so every selected column takes part in the correlation;
# depending on the pandas version, non-numeric columns are otherwise dropped
# from corr() with only a FutureWarning, which this notebook silences.
plt.figure(figsize=(20, 10))
sns.heatmap(main_data.astype(float).corr(), annot=True, cmap='coolwarm')
plt.show()

In [None]:
## ONE-HOT ENCODING THE CITY COLUMN

# Fit the encoder on the single City column; dense output so the result can be
# wrapped in a DataFrame directly.
encoder = OneHotEncoder(sparse_output=False)
encoded = encoder.fit_transform(main[['City']])

# Reuse main's index so the concat below aligns row by row (main's index has
# gaps after the earlier filtering).
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(['City']),
    index=main.index,
)
main_encoded = pd.concat([main, encoded_df], axis='columns')

## COLUMNS OF THE COMBINED FRAME
main_encoded.columns

In [None]:
## DROPPING THE TEXT COLUMNS NOW REPLACED BY THEIR ENCODED VERSIONS
main_encoded = main_encoded.drop(columns=['City', 'Degree'])

In [None]:
## FEATURES / TARGET
# Everything except the Depression column is a feature.
X = main_encoded.drop(columns='Depression').to_numpy()
y = main_encoded['Depression'].to_numpy()

## TRAIN / TEST SPLIT (80 / 20, FIXED SEED)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## STANDARDISING THE FEATURES
# The scaler is fitted on the training split only and then applied to both
# splits, so no test statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## LOGISTIC REGRESSION BASELINE
model = LogisticRegression().fit(X_train_scaled, y_train)

## TEST ACCURACY
score = model.score(X_test_scaled, y_test)
print(f"Accuracy: {score*100:.2f}%")

In [None]:
## PLOTTING THE CONFUSION MATRIX
# Predictions of the fitted `model` on the scaled test split.
y_pred = model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)

# Rows are the real classes, columns the predicted ones.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Predicted: 0', 'Predicted: 1'],
    yticklabels=['Real: 0', 'Real: 1'],
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Real Values')
plt.show()

In [None]:
## COMPARING MULTIPLE MODELS TO CHECK THE BEST ONE

## MODELS
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import accuracy_score

## MODELS TO BE COMPARED
# Every estimator that accepts a seed gets the same one so the ranking is
# reproducible.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "SVM": SVC(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "XGBoost": xgb.XGBClassifier(random_state=42),
    "LightGBM": lgb.LGBMClassifier(random_state=42, verbose=-1)
}

## TRAINING THE MODELS AND STORING THE ACCURACY
accuracy_results = {}

# The loop uses its own names (`candidate`, `candidate_pred`) so it does not
# overwrite the fitted logistic regression `model` and its `y_pred` from the
# earlier cells; otherwise re-running the confusion-matrix cell afterwards
# would silently describe the last model of this loop instead.
for name, candidate in models.items():
    candidate.fit(X_train_scaled, y_train)
    candidate_pred = candidate.predict(X_test_scaled)
    accuracy_results[name] = accuracy_score(y_test, candidate_pred)

## ORDERING THE ACCURACY RESULTS FROM HIGHEST TO LOWEST
accuracy_results_ordened = dict(sorted(accuracy_results.items(), key=lambda item: item[1], reverse=True))


## PLOTTING THE ACCURACY OF EACH MODEL, BEST FIRST, WITH THE VALUE WRITTEN AT THE END OF EACH BAR
plt.figure(figsize=(15, 6))
sns.barplot(x=list(accuracy_results_ordened.values()),
            y=list(accuracy_results_ordened.keys()),
            palette='viridis')
plt.xlabel('Accuracy')
plt.title('Model Accuracy')

## LABELS
# Bars are horizontal: x is the accuracy, y is the bar position.
for i, v in enumerate(accuracy_results_ordened.values()):
    plt.text(v, i, f'{v*100:.2f}%', color='black', va='center')
plt.show()
