In [50]:
# Import dependencies
import os
import pickle

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sqlalchemy import create_engine
from sklearn.compose import ColumnTransformer

In [51]:
# Load the already-encoded survey data from the working directory into a DataFrame.
mental_df = pd.read_csv(filepath_or_buffer='final_after_encode.csv')

In [52]:
# Define the target set.
# The target is the single encoded column for the combined
# "anxiety disorder | mood disorder" diagnosis answer
# (presumably a 0/1 indicator -- the model's predictions are 0./1.).
TARGET_COLUMN = "If so, what condition(s) were you diagnosed with?_anxiety disorder (generalized, social, phobia, etc)|mood disorder (depression, bipolar disorder, etc)"
y = mental_df[TARGET_COLUMN]

# Define the features set.
# One column name per line so every string literal is complete and the list
# can be reviewed/diffed. The order is significant: feature importances are
# later paired with X.columns positionally.
FEATURE_COLUMNS = [
    "Are you self-employed?",
    "Do you work remotely?_always",
    "Do you work remotely?_never",
    "Do you work remotely?_sometimes",
    "Have you had a mental health disorder in the past?_maybe",
    "Have you had a mental health disorder in the past?_no",
    "Have you had a mental health disorder in the past?_yes",
    "Do you believe your productivity is ever affected by a mental health issue?_no",
    "Do you believe your productivity is ever affected by a mental health issue?_not applicable to me",
    "Do you believe your productivity is ever affected by a mental health issue?_unsure",
    "Do you believe your productivity is ever affected by a mental health issue?_yes",
    "Do you believe your productivity is ever affected by a mental health issue?_0",
    "Do you have a family history of mental illness?_i don't know",
    "Do you have a family history of mental illness?_no",
    "Do you have a family history of mental illness?_yes",
    "do you feel comfortable in your working environment?_0",
    "do you feel comfortable in your working environment?_maybe",
    "do you feel comfortable in your working environment?_no",
    "do you feel comfortable in your working environment?_yes",
    "Do you feel that your organisation takes mental health as seriously as physical health?_0",
    "Do you feel that your organisation takes mental health as seriously as physical health?_i don't know",
    "Do you feel that your organisation takes mental health as seriously as physical health?_no",
    "Do you feel that your organisation takes mental health as seriously as physical health?_yes",
    "Have you observed or experienced an unsupportive or badly handled response to an issue in your current workplace?_0",
    "Have you observed or experienced an unsupportive or badly handled response to an issue in your current workplace?_maybe/not sure",
    "Have you observed or experienced an unsupportive or badly handled response to an issue in your current workplace?_no",
    "Have you observed or experienced an unsupportive or badly handled response to an issue in your current workplace?_yes, i experienced",
    "Have you observed or experienced an unsupportive or badly handled response to an issue in your current workplace?_yes, i observed",
    "are you stressed about your career?_maybe",
    "are you stressed about your career?_no, i don't think it would",
    "are you stressed about your career?_no, it has not",
    "are you stressed about your career?_yes, i think it would",
    "are you stressed about your career?_yes, it has",
    "Have you ever sought treatment for a mental health issue from a mental health professional?",
    "How willing would you be to share with friends and family about your work stress?_neutral",
    "How willing would you be to share with friends and family about your work stress?_not applicable to me (i do not have a mental illness)",
    "How willing would you be to share with friends and family about your work stress?_not open at all",
    "How willing would you be to share with friends and family about your work stress?_somewhat not open",
    "How willing would you be to share with friends and family about your work stress?_somewhat open",
]
X = mental_df[FEATURE_COLUMNS]

In [53]:
# Partition features and target into train/test subsets with a fixed seed.
# No test_size is passed, so scikit-learn's default proportion applies.
# NOTE(review): the split is not stratified; the reported test support
# (329 vs 30) shows an imbalanced target, so `stratify=y` may be worth
# considering -- confirm before changing, as it alters every downstream result.
split_parts = train_test_split(X, y, random_state=78)
X_train, X_test, y_train, y_test = split_parts

In [54]:
# Configure the random forest: 128 trees and a fixed seed for repeatable runs.
rf_settings = dict(n_estimators=128, random_state=78)
rf_model = RandomForestClassifier(**rf_settings)

In [55]:
# Train the forest on the training partition; the result of fit() is
# bound back to the same name.
rf_model = rf_model.fit(X=X_train, y=y_train)

In [56]:
# Predict class labels for the held-out test partition and echo them.
predictions = rf_model.predict(X=X_test)
predictions

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0.

In [57]:
# Build the confusion matrix (rows = actual class, columns = predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=predictions)

# Wrap the raw matrix in a labelled DataFrame so it renders as a readable table.
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(data=cm, index=actual_labels, columns=predicted_labels)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,318,11
Actual 1,28,2


In [58]:
# Fraction of test samples whose predicted label matches the true label.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [59]:
# Summarise the evaluation: confusion matrix table, accuracy, then the
# per-class precision/recall/f1 report.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
report_text = classification_report(y_true=y_test, y_pred=predictions)
print(report_text)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,318,11
Actual 1,28,2


Accuracy Score : 0.8913649025069638
Classification Report
              precision    recall  f1-score   support

         0.0       0.92      0.97      0.94       329
         1.0       0.15      0.07      0.09        30

    accuracy                           0.89       359
   macro avg       0.54      0.52      0.52       359
weighted avg       0.86      0.89      0.87       359



In [60]:
# Calculate feature importance in the Random Forest model.
# `feature_importances_` is read from the fitted estimator; the array has one
# entry per feature column, in the same order as the columns of X used for fit.
importances = rf_model.feature_importances_
# Echo the raw array as the cell output.
importances

array([0.00545667, 0.03348899, 0.03346446, 0.0434212 , 0.01340029,
       0.0196678 , 0.04645545, 0.00415636, 0.00017862, 0.00376785,
       0.01023123, 0.00499056, 0.02775443, 0.02398435, 0.03554997,
       0.00566108, 0.03909794, 0.03287894, 0.02558247, 0.00540891,
       0.03120712, 0.03528425, 0.03368205, 0.00887293, 0.04217581,
       0.04300775, 0.03481669, 0.0314641 , 0.04091291, 0.01449148,
       0.01195248, 0.04305335, 0.02784359, 0.04449938, 0.02244029,
       0.00354887, 0.01548735, 0.03361171, 0.06705032])

In [61]:
# Pair each importance score with its column name, then order the pairs
# from most to least important.
ranked_features = list(zip(rf_model.feature_importances_, X.columns))
ranked_features.sort(reverse=True)
ranked_features

[(0.06705032304534621,
  'How willing would you be to share with friends and family about your work stress?_somewhat open'),
 (0.0464554528947043,
  'Have you had a mental health disorder in the past?_yes'),
 (0.04449938419716179,
  'Have you ever sought treatment for a mental health issue from a mental health professional?'),
 (0.043421197036973225, 'Do you work remotely?_sometimes'),
 (0.043053354741269145,
  'are you stressed about your career?_yes, i think it would'),
 (0.0430077501529369,
  'Have you observed or experienced an unsupportive or badly handled response to an issue in your current workplace?_no'),
 (0.04217581000743199,
  'Have you observed or experienced an unsupportive or badly handled response to an issue in your current workplace?_maybe/not sure'),
 (0.04091291190986767, 'are you stressed about your career?_maybe'),
 (0.03909794476635362,
  'do you feel comfortable in your working environment?_maybe'),
 (0.035549970604861536, 'Do you have a family history of mental

In [62]:
# Save our model to use later, then reload it to confirm the round trip works.

# Save to file in the current working directory
pkl_filename = "random_forest_pickle_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(rf_model, file)

# Load from file.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# It is acceptable here only because the file was written by this same cell;
# never unpickle a model file obtained from an untrusted source.
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)

# Calculate the accuracy score and predict target values with the reloaded
# model; the score should match the accuracy reported for rf_model.
score = pickle_model.score(X_test, y_test)
print("Test score: {0:.2f} %".format(100 * score))
Ypredict = pickle_model.predict(X_test)

Test score: 89.14 %


In [63]:
import joblib

In [64]:
# Save the trained model to disk.
# The path is held in one variable so the dump and the size check below
# cannot drift apart.
joblib_filename = 'rf_model.joblib'
joblib.dump(rf_model, joblib_filename)

# Check the size of the saved model file
model_size = os.path.getsize(joblib_filename)
print(f"Model size: {model_size} bytes")

Model size: 3658025 bytes
