In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, f1_score
from cleanup import cleanup


In [2]:
df = pd.read_csv('../mental-heath-in-tech-2016_20161114.csv')
df = cleanup(df)

y = df['Label']
X = df.drop(['Label'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=22)


In [33]:
clf = RandomForestClassifier(min_samples_leaf=4, n_estimators=800)
print(cross_val_score(clf, X, y, cv=3))

clf = RandomForestClassifier(min_samples_leaf=4, n_estimators=800)
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))


[0.65425532 0.59308511 0.61702128]
              precision    recall  f1-score   support

           0       0.85      0.80      0.82        98
           1       0.55      0.72      0.62        82
           2       0.26      0.15      0.19        46

    accuracy                           0.64       226
   macro avg       0.55      0.56      0.55       226
weighted avg       0.62      0.64      0.62       226



In [4]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier, XGBRFClassifier

from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn import preprocessing

In [5]:
def model_assess(model, name='Default'):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print('---', name, '---', '\n',
          confusion_matrix(y_test, preds), '\n',
          'Accuracy:', round(accuracy_score(y_test, preds), 5), '\n')

In [6]:
# Naive Bayes
nb = GaussianNB()
model_assess(nb, name='Naive Bayes')

# Stochastic Gradient Descent
sgd = SGDClassifier(max_iter=5000, random_state=0)
model_assess(sgd, name='SGD')

# KNN
knn = KNeighborsClassifier(n_neighbors=19)
model_assess(knn, name='KNN')

# Decission trees
tree = DecisionTreeClassifier()
model_assess(tree, 'Decission Trees')

# Random Forest
rforest = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0)
model_assess(rforest, 'Random Forest')

# Support Vector Machine
svm = SVC(decision_function_shape="ovo")
model_assess(svm, 'SVM')

# Logistic Regression
lg = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
model_assess(lg, 'Logistic Regression')

# Neural Nets
nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(150, 10), random_state=1)
model_assess(nn, 'Neural Nets')

# Cross Gradient Booster
xgb = XGBClassifier(n_estimators=1000, learning_rate=0.05)
model_assess(xgb, 'XGBoost')

# Cross Gradient Booster (Random Forest) <=================== BEST
xgbrf = XGBRFClassifier(objective= 'multi:softmax')
model_assess(xgbrf, 'XGBoost RF')

--- Naive Bayes --- 
 [[79 14  5]
 [13 54 15]
 [ 9 27 10]] 
 Accuracy: 0.63274 

--- SGD --- 
 [[59 39  0]
 [ 5 77  0]
 [ 3 43  0]] 
 Accuracy: 0.60177 

--- KNN --- 
 [[77 17  4]
 [ 7 56 19]
 [10 31  5]] 
 Accuracy: 0.61062 

--- Decission Trees --- 
 [[77 13  8]
 [14 36 32]
 [ 7 20 19]] 
 Accuracy: 0.58407 

--- Random Forest --- 
 [[79 16  3]
 [ 9 58 15]
 [ 7 32  7]] 
 Accuracy: 0.63717 

--- SVM --- 
 [[75 23  0]
 [ 6 76  0]
 [ 6 40  0]] 
 Accuracy: 0.66814 

--- Logistic Regression --- 
 [[77 16  5]
 [11 58 13]
 [ 9 30  7]] 
 Accuracy: 0.62832 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


--- Neural Nets --- 
 [[77 17  4]
 [12 56 14]
 [ 9 29  8]] 
 Accuracy: 0.62389 

--- XGBoost --- 
 [[78 13  7]
 [12 47 23]
 [10 25 11]] 
 Accuracy: 0.60177 

--- XGBoost RF --- 
 [[77 18  3]
 [ 7 60 15]
 [ 7 34  5]] 
 Accuracy: 0.62832 



In [10]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [11]:
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X, y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


  warn(


In [13]:
rf_random.best_estimator_

In [14]:
rf_random.best_score_

0.6303191489361701

In [None]:
RandomForestClassifier()

In [29]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np

n_estimators = [400, 600, 800, 1000]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [30, 40, 50, 60, None]
# Minimum number of samples required to split a node
min_samples_split = [2, 4]
# Minimum number of samples required at each leaf node
min_samples_leaf = [2, 4, 6]
bootstrap = [True, False]

# Method of selecting samples for training each tree
full_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

In [30]:
rf = RandomForestClassifier()
rf_grid = GridSearchCV(estimator = rf, param_grid=full_grid, cv = 3, verbose=2, n_jobs = -1)
rf_grid.fit(X, y)

Fitting 3 folds for each of 240 candidates, totalling 720 fits


In [31]:
rf_grid.best_estimator_

In [32]:
rf_grid.best_score_

0.6320921985815603