# Feature Selection and Model Training


In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [3]:
# read preprocessed dataset
df = pd.read_csv(r"C:\Nishanthi\Hope AI\Projects\Depression Detection\Ordered_Formate\2.Data- Preprocessing\Preprocessed_data.csv")

# Feature Selection

In [4]:

X = df[['Gender', 'self_employed', 'family_history', 'treatment','Days_Indoors', 'Changes_Habits','Mental_Health_History', 'Mood_Swings', 'Coping_Struggles',
       'Work_Interest', 'Social_Weakness', 'mental_health_interview','care_options', 'Month', 'Occupation_Corporate', 'Occupation_Housewife','Occupation_Others', 'Occupation_Student']]  # Features
y = df['Growing_Stress']  # Target

clf = DecisionTreeClassifier(criterion="gini", max_depth=14, random_state=42)
clf.fit(X, y)
print(X.shape, y.shape)

# Get Feature Importances
feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': clf.feature_importances_}).sort_values(by='Importance', ascending=False)  # Sort by importance
print("\nTop Features:\n", feature_importance)

(73925, 18) (73925,)

Top Features:
                     Feature  Importance
10          Social_Weakness    0.155631
6     Mental_Health_History    0.135394
4              Days_Indoors    0.125958
7               Mood_Swings    0.116158
5            Changes_Habits    0.104852
0                    Gender    0.098638
8          Coping_Struggles    0.071198
9             Work_Interest    0.046691
14     Occupation_Corporate    0.043229
17       Occupation_Student    0.042702
15     Occupation_Housewife    0.035269
16        Occupation_Others    0.023385
11  mental_health_interview    0.000293
1             self_employed    0.000211
13                    Month    0.000156
12             care_options    0.000151
3                 treatment    0.000079
2            family_history    0.000005


# Training and Testing model

In [5]:
# Define features and target
X = df[['Social_Weakness','Changes_Habits','Days_Indoors','Mental_Health_History','Gender','Mood_Swings',
         'Occupation_Student','Occupation_Corporate','Occupation_Housewife','Occupation_Others','Work_Interest']]  # Features
y = df['Growing_Stress']  # Target
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [6]:
# Create Decision Tree Classifier
clf = DecisionTreeClassifier(criterion="gini", max_depth=15, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7115
           1       0.99      0.99      0.99      7670

    accuracy                           0.99     14785
   macro avg       0.99      0.99      0.99     14785
weighted avg       0.99      0.99      0.99     14785



# Save model

In [7]:
# save model
import pickle 

pickle.dump(clf, open('Depression_detection_model.sav', 'wb'))