# Mental Health in the Tech Industry: Modeling

In [78]:
import os
import math 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,roc_curve, auc, roc_auc_score
from sklearn.model_selection import KFold
from sklearn import preprocessing

In [79]:
df = pd.read_csv('../data/therapy_data_cleaned3.csv')
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Age,Female,Male,interference_never,interference_often,interference_rarely,interference_sometimes,ment_health_cons_maybe,ment_health_cons_no,ment_health_cons_yes,no_family history,family_history_yes,no_treatment,yes_treatment
0,0,0,0,37,1,0,0,1,0,0,0,1,0,1,0,0,1
1,1,3,3,31,0,1,0,1,0,0,0,0,1,0,1,0,1
2,2,6,6,35,1,0,0,0,0,1,1,0,0,0,1,0,1
3,3,8,8,42,1,0,0,0,0,1,1,0,0,0,1,0,1
4,4,11,11,29,0,1,1,0,0,0,0,1,0,1,0,1,0


In [80]:
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Age,Female,Male,interference_never,interference_often,interference_rarely,interference_sometimes,ment_health_cons_maybe,ment_health_cons_no,ment_health_cons_yes,no_family history,family_history_yes,no_treatment,yes_treatment
0,0,0,0,37,1,0,0,1,0,0,0,1,0,1,0,0,1
1,1,3,3,31,0,1,0,1,0,0,0,0,1,0,1,0,1
2,2,6,6,35,1,0,0,0,0,1,1,0,0,0,1,0,1
3,3,8,8,42,1,0,0,0,0,1,1,0,0,0,1,0,1
4,4,11,11,29,0,1,1,0,0,0,0,1,0,1,0,1,0


In [81]:
drop_cols = ['Unnamed: 0', 'Unnamed: 0.1.1', 'Unnamed: 0.1']
df = df.drop(drop_cols, axis=1)

In [82]:
df.head()

Unnamed: 0,Age,Female,Male,interference_never,interference_often,interference_rarely,interference_sometimes,ment_health_cons_maybe,ment_health_cons_no,ment_health_cons_yes,no_family history,family_history_yes,no_treatment,yes_treatment
0,37,1,0,0,1,0,0,0,1,0,1,0,0,1
1,31,0,1,0,1,0,0,0,0,1,0,1,0,1
2,35,1,0,0,0,0,1,1,0,0,0,1,0,1
3,42,1,0,0,0,0,1,1,0,0,0,1,0,1
4,29,0,1,1,0,0,0,0,1,0,1,0,1,0


## Machine Learning methods
1. I am going to start with a Train Test Split I completed in Pre-Processing to analyze who goes to therapy

In [83]:
#import machine learning models
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer

I was going to open the pickle file for the train_test_split that was completed in the pre-processing step, then start modeling, but the training set was larger than the dataset itself, so I will recreate the training and testing sets then work on different modeling techniques and compare them to see which one is the best

In [84]:
X = df.drop(columns = 'yes_treatment')
y = df['yes_treatment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=42, stratify=y)

In [85]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Since X has the age column in it, we can scale the data to fit well for logistic regression, I am going to be using StandardScaler to do this. After the scaling is complete, I will complete the logistic regression and check the accuracy to see how the model fares

In [116]:
#logistic Regression
logreg_train = LogisticRegression()
logreg_train.fit(X_train, y_train)
y_pred_train_logreg = logreg_train.predict(X_train)
y_pred_test_logreg = logreg_train.predict(X_test)

In [118]:
#get accuracy for the logistic regression
train_accuracy = accuracy_score(y_train, y_pred_train_logreg)
test_accuracy = accuracy_score(y_test, y_pred_test_logreg)
print('The training accuracy is ', train_accuracy,'and the testing accuracy is ', test_accuracy, 'when we complete a logistic regression of the data')

The training accuracy is  1.0 and the testing accuracy is  1.0 when we complete a logistic regression of the data


In [88]:
classification_report_logreg = classification_report(y_test, y_pred_test_logreg)
print(classification_report_logreg)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        20
           1       1.00      1.00      1.00        29

    accuracy                           1.00        49
   macro avg       1.00      1.00      1.00        49
weighted avg       1.00      1.00      1.00        49



For some reason, the logistic regression gives training and testing accuracy of 1.0, so I will try random forest next

In [100]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(bootstrap=True,n_estimators=100,criterion='entropy')
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

In [107]:
#check accuracy
cnf_matrix_rf = confusion_matrix(y_test, rf_pred)
print(cnf_matrix_rf)
accuracy_rf = rf.score(X_test,y_test)
print(accuracy_rf)

[[20  0]
 [ 0 29]]
1.0


In [108]:
classification_report_rf = classification_report(y_test, rf_pred)
print(classification_report_rf)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        20
           1       1.00      1.00      1.00        29

    accuracy                           1.00        49
   macro avg       1.00      1.00      1.00        49
weighted avg       1.00      1.00      1.00        49



Maybe GradientBoost will provide better results

In [109]:
#gradientBoost
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
y_pred_gradient = gb.predict(X_test)

In [111]:
gradient_accuracy = accuracy_score(y_test, y_pred_gradient)
print(gradient_accuracy)

1.0


In [112]:
#try knn 
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(p=2,weights='distance',n_neighbors=50)
knn.fit(X_train,y_train)
y_pred_knn = knn.predict(X_test)
accuracy_knn=knn.score(X_test,y_test)
print(accuracy_knn)

0.9795918367346939


In [114]:
#SVM
from sklearn.svm import SVC
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
accuracy_svm=svm.score(X_test,y_test)
print(accuracy_svm)

1.0


In [119]:
#naive bayes
from sklearn.naive_bayes import GaussianNB
naivebayes = GaussianNB()
naivebayes.fit(X_train, y_train)
y_pred_nb = naivebayes.predict(X_test)
accuracy_svm=svm.score(X_test,y_test)
print(accuracy_svm)

1.0
