In [2]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error, confusion_matrix, r2_score
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler


# Load the dataset from mental_health_clean.csv and preview the first five rows.
df = pd.read_csv('mental_health_clean.csv')
df.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,Employee_ID,Age,Gender,Job_Role,Industry,Years_of_Experience,Work_Location,Hours_Worked_Per_Week,Number_of_Virtual_Meetings,Work_Life_Balance_Rating,Stress_Level,Mental_Health_Condition,Access_to_Mental_Health_Resources,Productivity_Change,Social_Isolation_Rating,Satisfaction_with_Remote_Work,Company_Support_for_Remote_Work,Physical_Activity,Sleep_Quality,Region
0,EMP0001,32,Non-binary,HR,Healthcare,13,Hybrid,47,7,2,Medium,Depression,No,Decrease,1,Unsatisfied,1,Weekly,Good,Europe
1,EMP0002,40,Female,Data Scientist,IT,3,Remote,52,4,1,Medium,Anxiety,No,Increase,3,Satisfied,2,Weekly,Good,Asia
2,EMP0003,59,Non-binary,Software Engineer,Education,22,Hybrid,46,11,5,Medium,Anxiety,No,No Change,4,Unsatisfied,5,,Poor,North America
3,EMP0004,27,Male,Software Engineer,Finance,20,Onsite,32,8,4,High,Depression,Yes,Increase,3,Unsatisfied,3,,Poor,Europe
4,EMP0005,49,Male,Sales,Consulting,32,Onsite,35,12,2,High,,Yes,Decrease,3,Unsatisfied,3,Weekly,Average,North America


In [3]:
# List the column labels of the loaded frame.
df.columns

Index(['Employee_ID', 'Age', 'Gender', 'Job_Role', 'Industry',
       'Years_of_Experience', 'Work_Location', 'Hours_Worked_Per_Week',
       'Number_of_Virtual_Meetings', 'Work_Life_Balance_Rating',
       'Stress_Level', 'Mental_Health_Condition',
       'Access_to_Mental_Health_Resources', 'Productivity_Change',
       'Social_Isolation_Rating', 'Satisfaction_with_Remote_Work',
       'Company_Support_for_Remote_Work', 'Physical_Activity', 'Sleep_Quality',
       'Region'],
      dtype='object')

In [4]:
# Count missing values per column.
df.isna().sum()

Employee_ID                             0
Age                                     0
Gender                                  0
Job_Role                                0
Industry                                0
Years_of_Experience                     0
Work_Location                           0
Hours_Worked_Per_Week                   0
Number_of_Virtual_Meetings              0
Work_Life_Balance_Rating                0
Stress_Level                            0
Mental_Health_Condition              1100
Access_to_Mental_Health_Resources       0
Productivity_Change                     0
Social_Isolation_Rating                 0
Satisfaction_with_Remote_Work           0
Company_Support_for_Remote_Work         0
Physical_Activity                    1502
Sleep_Quality                           0
Region                                  0
dtype: int64

In [5]:
# Only Mental_Health_Condition and Physical_Activity contain missing values
# (see the null counts above). For both, a missing entry presumably stands for
# a genuine "None" category that read_csv parsed as NaN — TODO confirm against
# the raw CSV.
# Filling per column (rather than the whole frame) means a stray NaN in a
# numeric column can never be silently replaced by the string "None", which
# would turn that column into object dtype.
df = df.fillna({'Mental_Health_Condition': "None", 'Physical_Activity': "None"})

In [6]:
# Re-count missing values per column to verify the fill left none behind.
df.isna().sum()

Employee_ID                          0
Age                                  0
Gender                               0
Job_Role                             0
Industry                             0
Years_of_Experience                  0
Work_Location                        0
Hours_Worked_Per_Week                0
Number_of_Virtual_Meetings           0
Work_Life_Balance_Rating             0
Stress_Level                         0
Mental_Health_Condition              0
Access_to_Mental_Health_Resources    0
Productivity_Change                  0
Social_Isolation_Rating              0
Satisfaction_with_Remote_Work        0
Company_Support_for_Remote_Work      0
Physical_Activity                    0
Sleep_Quality                        0
Region                               0
dtype: int64

In [7]:
# Preview the first five rows again after filling missing values.
df.head()

Unnamed: 0,Employee_ID,Age,Gender,Job_Role,Industry,Years_of_Experience,Work_Location,Hours_Worked_Per_Week,Number_of_Virtual_Meetings,Work_Life_Balance_Rating,Stress_Level,Mental_Health_Condition,Access_to_Mental_Health_Resources,Productivity_Change,Social_Isolation_Rating,Satisfaction_with_Remote_Work,Company_Support_for_Remote_Work,Physical_Activity,Sleep_Quality,Region
0,EMP0001,32,Non-binary,HR,Healthcare,13,Hybrid,47,7,2,Medium,Depression,No,Decrease,1,Unsatisfied,1,Weekly,Good,Europe
1,EMP0002,40,Female,Data Scientist,IT,3,Remote,52,4,1,Medium,Anxiety,No,Increase,3,Satisfied,2,Weekly,Good,Asia
2,EMP0003,59,Non-binary,Software Engineer,Education,22,Hybrid,46,11,5,Medium,Anxiety,No,No Change,4,Unsatisfied,5,,Poor,North America
3,EMP0004,27,Male,Software Engineer,Finance,20,Onsite,32,8,4,High,Depression,Yes,Increase,3,Unsatisfied,3,,Poor,Europe
4,EMP0005,49,Male,Sales,Consulting,32,Onsite,35,12,2,High,,Yes,Decrease,3,Unsatisfied,3,Weekly,Average,North America


In [8]:
# Employee_ID is a per-row identifier (EMP0001, EMP0002, ...), so it is removed
# before encoding. errors='ignore' keeps this cell re-runnable: once the column
# is gone, executing the cell again is a no-op instead of raising KeyError.
df = df.drop(columns='Employee_ID', errors='ignore')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4629 entries, 0 to 4628
Data columns (total 19 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   Age                                4629 non-null   int64 
 1   Gender                             4629 non-null   object
 2   Job_Role                           4629 non-null   object
 3   Industry                           4629 non-null   object
 4   Years_of_Experience                4629 non-null   int64 
 5   Work_Location                      4629 non-null   object
 6   Hours_Worked_Per_Week              4629 non-null   int64 
 7   Number_of_Virtual_Meetings         4629 non-null   int64 
 8   Work_Life_Balance_Rating           4629 non-null   int64 
 9   Stress_Level                       4629 non-null   object
 10  Mental_Health_Condition            4629 non-null   object
 11  Access_to_Mental_Health_Resources  4629 non-null   object
 12  Produc

In [9]:
# Names of every column stored with object dtype.
is_object = df.dtypes == 'object'
cat_cols = df.columns[is_object].tolist()

# Frame restricted to those columns, for inspecting their values.
cat_df = df.loc[:, cat_cols]
cat_df.head()

Unnamed: 0,Gender,Job_Role,Industry,Work_Location,Stress_Level,Mental_Health_Condition,Access_to_Mental_Health_Resources,Productivity_Change,Satisfaction_with_Remote_Work,Physical_Activity,Sleep_Quality,Region
0,Non-binary,HR,Healthcare,Hybrid,Medium,Depression,No,Decrease,Unsatisfied,Weekly,Good,Europe
1,Female,Data Scientist,IT,Remote,Medium,Anxiety,No,Increase,Satisfied,Weekly,Good,Asia
2,Non-binary,Software Engineer,Education,Hybrid,Medium,Anxiety,No,No Change,Unsatisfied,,Poor,North America
3,Male,Software Engineer,Finance,Onsite,High,Depression,Yes,Increase,Unsatisfied,,Poor,Europe
4,Male,Sales,Consulting,Onsite,High,,Yes,Decrease,Unsatisfied,Weekly,Average,North America


In [10]:
# Print the distinct values found in each object-dtype column.
for name, values in cat_df.items():
    print(f'{name}: \n{values.unique()}')

Gender: 
['Non-binary' 'Female' 'Male' 'Prefer not to say']
Job_Role: 
['HR' 'Data Scientist' 'Software Engineer' 'Sales' 'Marketing' 'Designer'
 'Project Manager']
Industry: 
['Healthcare' 'IT' 'Education' 'Finance' 'Consulting' 'Manufacturing'
 'Retail']
Work_Location: 
['Hybrid' 'Remote' 'Onsite']
Stress_Level: 
['Medium' 'High' 'Low']
Mental_Health_Condition: 
['Depression' 'Anxiety' 'None' 'Burnout']
Access_to_Mental_Health_Resources: 
['No' 'Yes']
Productivity_Change: 
['Decrease' 'Increase' 'No Change']
Satisfaction_with_Remote_Work: 
['Unsatisfied' 'Satisfied' 'Neutral']
Physical_Activity: 
['Weekly' 'None' 'Daily']
Sleep_Quality: 
['Good' 'Poor' 'Average']
Region: 
['Europe' 'Asia' 'North America' 'South America' 'Oceania' 'Africa']


In [11]:
# One-hot encode the unordered categoricals. drop_first=True removes the first
# level of each variable, leaving k-1 indicator columns for k levels.
nominal_cols = [
    'Gender',
    'Job_Role',
    'Industry',
    'Work_Location',
    'Mental_Health_Condition',
    'Region',
]
df = pd.get_dummies(df, columns=nominal_cols, drop_first=True)

# Binary flag: 'No' -> 0, 'Yes' -> 1.
yes_no_to_int = {'No': 0, 'Yes': 1}
df['Access_to_Mental_Health_Resources'] = (
    df['Access_to_Mental_Health_Resources'].map(yes_no_to_int)
)

In [12]:
# Ordered categoricals and their levels, listed lowest to highest. Keeping each
# column next to its own level order in a single mapping means the two cannot
# drift out of alignment the way two positionally-matched lists can.
ordinal_levels = {
    'Stress_Level': ['Low', 'Medium', 'High'],
    'Productivity_Change': ['Decrease', 'No Change', 'Increase'],
    'Satisfaction_with_Remote_Work': ['Unsatisfied', 'Neutral', 'Satisfied'],
    'Sleep_Quality': ['Poor', 'Average', 'Good'],
    'Physical_Activity': ['None', 'Weekly', 'Daily'],
}
# Column names and level lists in matching order (dicts preserve insertion order).
ordinal_vars = list(ordinal_levels)
ordinal_mappings = list(ordinal_levels.values())

# Each value is replaced by the position of its level in the list above,
# returned as a float (0.0, 1.0, 2.0).
encoder = OrdinalEncoder(categories=ordinal_mappings)
df[ordinal_vars] = encoder.fit_transform(df[ordinal_vars])

In [13]:
# Preview the first five rows of the encoded frame.
df.head()

Unnamed: 0,Age,Years_of_Experience,Hours_Worked_Per_Week,Number_of_Virtual_Meetings,Work_Life_Balance_Rating,Stress_Level,Access_to_Mental_Health_Resources,Productivity_Change,Social_Isolation_Rating,Satisfaction_with_Remote_Work,...,Work_Location_Onsite,Work_Location_Remote,Mental_Health_Condition_Burnout,Mental_Health_Condition_Depression,Mental_Health_Condition_None,Region_Asia,Region_Europe,Region_North America,Region_Oceania,Region_South America
0,32,13,47,7,2,1.0,0,0.0,1,0.0,...,False,False,False,True,False,False,True,False,False,False
1,40,3,52,4,1,1.0,0,2.0,3,2.0,...,False,True,False,False,False,True,False,False,False,False
2,59,22,46,11,5,1.0,0,1.0,4,0.0,...,False,False,False,False,False,False,False,True,False,False
3,27,20,32,8,4,2.0,1,2.0,3,0.0,...,True,False,False,True,False,False,True,False,False,False
4,49,32,35,12,2,2.0,1,0.0,3,0.0,...,True,False,False,False,True,False,False,True,False,False


In [17]:
# Summarise column dtypes and non-null counts of the encoded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4629 entries, 0 to 4628
Data columns (total 38 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Age                                 4629 non-null   int64  
 1   Years_of_Experience                 4629 non-null   int64  
 2   Hours_Worked_Per_Week               4629 non-null   int64  
 3   Number_of_Virtual_Meetings          4629 non-null   int64  
 4   Work_Life_Balance_Rating            4629 non-null   int64  
 5   Stress_Level                        4629 non-null   float64
 6   Access_to_Mental_Health_Resources   4629 non-null   int64  
 7   Productivity_Change                 4629 non-null   float64
 8   Social_Isolation_Rating             4629 non-null   int64  
 9   Satisfaction_with_Remote_Work       4629 non-null   float64
 10  Company_Support_for_Remote_Work     4629 non-null   int64  
 11  Physical_Activity                   4629 no

In [14]:
# Write the encoded frame to MH_encoded.csv; index=False omits the row index column.
df.to_csv('MH_encoded.csv', index=False)