In [31]:
# Import dependencies
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sqlalchemy import create_engine
from sklearn.compose import ColumnTransformer

In [32]:
# Read the data into a Pandas dataframe
mental_df = pd.read_csv('final_after_encode.csv')

In [33]:
# Define the target set.
y = mental_df["If so, what condition(s) were you diagnosed with?_anxiety disorder (generalized, social, phobia, etc)|mood disorder (depression, bipolar disorder, etc)"]
# Define the features set.
X = mental_df.drop(columns=["If so, what condition(s) were you diagnosed with?_anxiety disorder (generalized, social, phobia, etc)","If so, what condition(s) were you diagnosed with?_anxiety disorder (generalized, social, phobia, etc)|mood disorder (depression, bipolar disorder, etc)"])

In [34]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [35]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

In [36]:
# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

In [37]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test)
predictions

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [38]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,329,0
Actual 1,28,2


In [39]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [40]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,329,0
Actual 1,28,2


Accuracy Score : 0.9220055710306406
Classification Report
              precision    recall  f1-score   support

         0.0       0.92      1.00      0.96       329
         1.0       1.00      0.07      0.12        30

    accuracy                           0.92       359
   macro avg       0.96      0.53      0.54       359
weighted avg       0.93      0.92      0.89       359



In [41]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.00045887, 0.00245694, 0.0033022 , ..., 0.00194279, 0.00277488,
       0.00331082])

In [42]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.12212670300146024,
  'If yes, what condition(s) have you been diagnosed with?_anxiety disorder (generalized, social, phobia, etc)|mood disorder (depression, bipolar disorder, etc)'),
 (0.01627815217325935,
  'Have you been diagnosed with a mental health condition by a medical professional?'),
 (0.015727818397738333, 'If so, what condition(s) were you diagnosed with?_0'),
 (0.013571383022466708, 'What is your age?'),
 (0.010710102334662252,
  'If so, what condition(s) were you diagnosed with?_mood disorder (depression, bipolar disorder, etc)'),
 (0.010220479166078215, 'Do you currently have a mental health disorder?_1'),
 (0.009003900202518593,
  'Have you had a mental health disorder in the past?_1'),
 (0.008096263814048899, 'Do you have a family history of mental illness?'),
 (0.007480538699286177,
  'If you have a mental health issue, do you feel that it interferes with your work when being treated effectively?_rarely'),
 (0.007298010419380741,
  'If yes, what condition(s) have y

In [43]:
# save our model to use later
import pickle

# Save to file in the current working directory
pkl_filename = "pickle_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(rf_model, file)

# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
    
# Calculate the accuracy score and predict target values
score = pickle_model.score(X_test, y_test)
print("Test score: {0:.2f} %".format(100 * score))
Ypredict = pickle_model.predict(X_test)

Test score: 92.20 %
