In [174]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report

from sklearn.ensemble import RandomForestClassifier

In [175]:
df = pd.read_csv('Mental Health Dataset.csv')

In [176]:
df.head()

Unnamed: 0,Timestamp,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
0,2014-08-27 11:29:31,Female,United States,Corporate,,No,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Not sure
1,2014-08-27 11:31:50,Female,United States,Corporate,,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,No
2,2014-08-27 11:32:39,Female,United States,Corporate,,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes
3,2014-08-27 11:37:59,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,Maybe,Yes
4,2014-08-27 11:43:36,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes


# EDA and Data Preparation

In [177]:
df.isnull().sum()

Timestamp                     0
Gender                        0
Country                       0
Occupation                    0
self_employed              5202
family_history                0
treatment                     0
Days_Indoors                  0
Growing_Stress                0
Changes_Habits                0
Mental_Health_History         0
Mood_Swings                   0
Coping_Struggles              0
Work_Interest                 0
Social_Weakness               0
mental_health_interview       0
care_options                  0
dtype: int64

In [178]:
df = df.dropna()

In [179]:
df.isnull().sum()

Timestamp                  0
Gender                     0
Country                    0
Occupation                 0
self_employed              0
family_history             0
treatment                  0
Days_Indoors               0
Growing_Stress             0
Changes_Habits             0
Mental_Health_History      0
Mood_Swings                0
Coping_Struggles           0
Work_Interest              0
Social_Weakness            0
mental_health_interview    0
care_options               0
dtype: int64

In [180]:
df.head()

Unnamed: 0,Timestamp,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
3,2014-08-27 11:37:59,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,Maybe,Yes
4,2014-08-27 11:43:36,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes
5,2014-08-27 11:49:51,Female,Poland,Corporate,No,No,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,Maybe,Not sure
6,2014-08-27 11:51:34,Female,Australia,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Not sure
7,2014-08-27 11:52:41,Female,United States,Corporate,No,No,No,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,No


In [181]:
df = pd.get_dummies(df , columns = ['Gender', 'Country' , 'Occupation' , 'self_employed' , 'family_history' , 'treatment', 'Days_Indoors', 'Growing_Stress' , 'Changes_Habits', 'Mood_Swings' , 'Coping_Struggles', 'Work_Interest', 'Social_Weakness','mental_health_interview', 'care_options' ], dtype = int )

In [182]:
df.head()

Unnamed: 0,Timestamp,Mental_Health_History,Gender_Female,Gender_Male,Country_Australia,Country_Belgium,Country_Bosnia and Herzegovina,Country_Brazil,Country_Canada,Country_Colombia,...,Work_Interest_Yes,Social_Weakness_Maybe,Social_Weakness_No,Social_Weakness_Yes,mental_health_interview_Maybe,mental_health_interview_No,mental_health_interview_Yes,care_options_No,care_options_Not sure,care_options_Yes
3,2014-08-27 11:37:59,Yes,1,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,1
4,2014-08-27 11:43:36,Yes,1,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,1
5,2014-08-27 11:49:51,Yes,1,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,1,0
6,2014-08-27 11:51:34,Yes,1,0,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
7,2014-08-27 11:52:41,Yes,1,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,0


In [183]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

df['Timestamp_'] = df['Timestamp'].apply(lambda x: x.timestamp())

In [184]:
df.drop('Timestamp', axis=1, inplace=True)

In [185]:
df['Mental_Health_History'].value_counts()

Mental_Health_History
No       102179
Maybe     93664
Yes       91319
Name: count, dtype: int64

# Modeling

## Logistic Regression

In [186]:
X = df.drop(columns ='Mental_Health_History')
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Mental_Health_History'])

In [187]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.20,
                                                    random_state = 42,
                                                    stratify = y)

In [188]:
logreg = LogisticRegression(multi_class='ovr')

In [189]:
logreg.fit(X_train, y_train)

## Random Forest

In [190]:
RandFor = RandomForestClassifier(random_state=42)
RandFor.fit(X_train, y_train)

# Evaluation

## Logistic Regression

In [191]:
y_pred = logreg.predict(X_test)

In [192]:
cross_val_score(logreg, X, y, cv=5) 

array([0.35582331, 0.35582331, 0.3558295 , 0.35581209, 0.3558295 ])

In [193]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [194]:
print(report)

              precision    recall  f1-score   support

       Maybe       0.00      0.00      0.00     18733
          No       0.36      1.00      0.52     20436
         Yes       0.00      0.00      0.00     18264

    accuracy                           0.36     57433
   macro avg       0.12      0.33      0.17     57433
weighted avg       0.13      0.36      0.19     57433



In [195]:
accuracy

0.3558233071579057

## Random Forest

In [196]:
cross_val_score(RandFor, X, y, cv=5)

array([0.39541727, 0.4453363 , 0.335475  , 0.37183103, 0.34827274])

In [197]:
y_pred = RandFor.predict(X_test)

In [198]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

In [199]:
accuracy

0.9762331760486131

In [200]:
print(report)

              precision    recall  f1-score   support

       Maybe       0.98      0.98      0.98     18733
          No       0.98      0.98      0.98     20436
         Yes       0.97      0.97      0.97     18264

    accuracy                           0.98     57433
   macro avg       0.98      0.98      0.98     57433
weighted avg       0.98      0.98      0.98     57433



In [201]:
train_score = RandFor.score(X_train, y_train)

test_score = RandFor.score(X_test, y_test)

In [202]:
train_score

0.9891785538612887

In [203]:
test_score

0.9762331760486131