# Mental Health in the Tech Industry: Pre-Processing
In this pre-processing portion of my capstone, I will create dummy variables for my qualitative data, then create a train/test set for modeling. Since my variables are primarily qualitative, they will not need to be scaled.

In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from collections import Counter
from statistics import stdev, mean
import operator
import pickle
import os
from sklearn.preprocessing import StandardScaler

In [53]:
# Load the cleaned survey CSV into a DataFrame and preview the first five rows.
# NOTE(review): the loaded frame carries 'Unnamed: 0' / 'Unnamed: 0.1' columns,
# which look like row indexes written by earlier to_csv calls — confirm before
# treating them as model features.
df = pd.read_csv(filepath_or_buffer='../data/therapy_data_cleaned2.csv')
df.head(n=5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,0,0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,3,3,2014-08-27 11:29:46,31,Male,United Kingdom,Not American,,Yes,Yes,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
2,6,6,2014-08-27 11:31:50,35,Female,United States,MI,,Yes,Yes,...,Somewhat difficult,Maybe,Maybe,Some of them,No,No,No,Don't know,No,
3,8,8,2014-08-27 11:32:39,42,Female,United States,IL,,Yes,Yes,...,Very difficult,Maybe,No,Yes,Yes,No,Maybe,No,No,
4,11,11,2014-08-27 11:32:49,29,Male,Bulgaria,Not American,,No,No,...,Don't know,No,No,Yes,Yes,Yes,Yes,Don't know,No,


In [54]:
# (rows, columns) of the frame as loaded, before any rows or columns are dropped.
df.shape

(509, 29)

In [55]:
# List the distinct values present in the Gender column.
print(df['Gender'].unique())

['Female' 'Male' 'Male-ish' 'Transgender Female'
 'Male Questioning Gender Identity' 'Non-binary' 'Agender'
 'Genderfluid/Androgynous' 'Unknown']


Since we are only analyzing males and females in this study, I am dropping the rows with other gender identities, which makes creating the gender dummy variables simpler.

In [56]:
# Keep only the rows whose Gender is exactly 'Male' or 'Female'.
# Rows with any other value (including missing values) are excluded; the
# surviving rows keep their original index labels.
df = df[(df['Gender'] == 'Male') | (df['Gender'] == 'Female')]

In [57]:
# Confirm that only 'Female' and 'Male' remain after the filter.
print(df['Gender'].unique())

['Female' 'Male']


Create dummy variables for the qualitative (categorical) data we are concerned with: work interference, gender, mental health consequences, family history, and whether they actually seek treatment.

In [58]:
# One-hot encode Gender into one indicator column per category.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 returns booleans
# by default, which would end up in the saved CSV as True/False instead of 1/0.
gender_dummies = pd.get_dummies(df['Gender'], dtype=int)
gender_dummies.head()

Unnamed: 0,Female,Male
0,1,0
1,0,1
2,1,0
3,1,0
4,0,1


In [59]:
# Attach the gender indicator columns to the main frame, matching rows on the
# index of both frames (inner join, which is merge's default).
df = df.merge(
    right=gender_dummies,
    how='inner',
    left_index=True,
    right_index=True,
)
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,...,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments,Female,Male
0,0,0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,...,No,Some of them,Yes,No,Maybe,Yes,No,,1,0
1,3,3,2014-08-27 11:29:46,31,Male,United Kingdom,Not American,,Yes,Yes,...,Yes,Some of them,No,Maybe,Maybe,No,Yes,,0,1
2,6,6,2014-08-27 11:31:50,35,Female,United States,MI,,Yes,Yes,...,Maybe,Some of them,No,No,No,Don't know,No,,1,0
3,8,8,2014-08-27 11:32:39,42,Female,United States,IL,,Yes,Yes,...,No,Yes,Yes,No,Maybe,No,No,,1,0
4,11,11,2014-08-27 11:32:49,29,Male,Bulgaria,Not American,,No,No,...,No,Yes,Yes,Yes,Yes,Don't know,No,,0,1


In [60]:
# One-hot encode work_interfere into one indicator column per category.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 returns booleans
# by default, which would end up in the saved CSV as True/False instead of 1/0.
interference_dummies = pd.get_dummies(df['work_interfere'], dtype=int)
interference_dummies.head()

Unnamed: 0,Never,Often,Rarely,Sometimes
0,0,1,0,0
1,0,1,0,0
2,0,0,0,1
3,0,0,0,1
4,1,0,0,0


In [61]:
# Give the work_interfere indicators descriptive names.
# Renaming through an explicit category -> name mapping (instead of assigning a
# positional list) guarantees each label ends up on the matching category even
# if the category order ever changes.
interference_dummies = interference_dummies.rename(columns={
    'Never': 'interference_never',
    'Often': 'interference_often',
    'Rarely': 'interference_rarely',
    'Sometimes': 'interference_sometimes',
})
# Fail loudly if a category is missing or an unexpected one appeared.
assert list(interference_dummies.columns) == [
    'interference_never', 'interference_often',
    'interference_rarely', 'interference_sometimes',
], f"unexpected work_interfere categories: {list(interference_dummies.columns)}"

In [62]:
# Preview the work_interfere indicators under their new column names.
interference_dummies.head()

Unnamed: 0,interference_never,interference_often,interference_rarely,interference_sometimes
0,0,1,0,0
1,0,1,0,0
2,0,0,0,1
3,0,0,0,1
4,1,0,0,0


In [63]:
# Append the work_interfere indicator columns to the main frame; concat aligns
# the two frames on their row index.
df = pd.concat(objs=[df, interference_dummies], axis='columns')
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,...,phys_health_interview,mental_vs_physical,obs_consequence,comments,Female,Male,interference_never,interference_often,interference_rarely,interference_sometimes
0,0,0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,...,Maybe,Yes,No,,1,0,0,1,0,0
1,3,3,2014-08-27 11:29:46,31,Male,United Kingdom,Not American,,Yes,Yes,...,Maybe,No,Yes,,0,1,0,1,0,0
2,6,6,2014-08-27 11:31:50,35,Female,United States,MI,,Yes,Yes,...,No,Don't know,No,,1,0,0,0,0,1
3,8,8,2014-08-27 11:32:39,42,Female,United States,IL,,Yes,Yes,...,Maybe,No,No,,1,0,0,0,0,1
4,11,11,2014-08-27 11:32:49,29,Male,Bulgaria,Not American,,No,No,...,Yes,Don't know,No,,0,1,1,0,0,0


In [64]:
# One-hot encode mental_health_consequence into one indicator column per category.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 returns booleans
# by default, which would end up in the saved CSV as True/False instead of 1/0.
consequence_mental_dummies = pd.get_dummies(df['mental_health_consequence'], dtype=int)
consequence_mental_dummies.head()

Unnamed: 0,Maybe,No,Yes
0,0,1,0
1,0,0,1
2,1,0,0
3,1,0,0
4,0,1,0


In [65]:
# Give the mental_health_consequence indicators descriptive names.
# An explicit category -> name mapping (instead of a positional list) guarantees
# each label ends up on the matching category.
consequence_mental_dummies = consequence_mental_dummies.rename(columns={
    'Maybe': 'ment_health_cons_maybe',
    'No': 'ment_health_cons_no',
    'Yes': 'ment_health_cons_yes',
})
# Fail loudly if a category is missing or an unexpected one appeared.
assert list(consequence_mental_dummies.columns) == [
    'ment_health_cons_maybe', 'ment_health_cons_no', 'ment_health_cons_yes',
], f"unexpected mental_health_consequence categories: {list(consequence_mental_dummies.columns)}"
consequence_mental_dummies.head()

Unnamed: 0,ment_health_cons_maybe,ment_health_cons_no,ment_health_cons_yes
0,0,1,0
1,0,0,1
2,1,0,0
3,1,0,0
4,0,1,0


In [66]:
# Append the mental_health_consequence indicator columns to the main frame;
# concat aligns the two frames on their row index.
df = pd.concat(objs=[df, consequence_mental_dummies], axis='columns')
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,...,comments,Female,Male,interference_never,interference_often,interference_rarely,interference_sometimes,ment_health_cons_maybe,ment_health_cons_no,ment_health_cons_yes
0,0,0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,...,,1,0,0,1,0,0,0,1,0
1,3,3,2014-08-27 11:29:46,31,Male,United Kingdom,Not American,,Yes,Yes,...,,0,1,0,1,0,0,0,0,1
2,6,6,2014-08-27 11:31:50,35,Female,United States,MI,,Yes,Yes,...,,1,0,0,0,0,1,1,0,0
3,8,8,2014-08-27 11:32:39,42,Female,United States,IL,,Yes,Yes,...,,1,0,0,0,0,1,1,0,0
4,11,11,2014-08-27 11:32:49,29,Male,Bulgaria,Not American,,No,No,...,,0,1,1,0,0,0,0,1,0


In [67]:
# Dummy variables for family history: one indicator column per category.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 returns booleans
# by default, which would end up in the saved CSV as True/False instead of 1/0.
family_dummies = pd.get_dummies(df['family_history'], dtype=int)
family_dummies.head()

Unnamed: 0,No,Yes
0,1,0
1,0,1
2,0,1
3,0,1
4,1,0


In [68]:
# Give the family_history indicators descriptive names.
# An explicit category -> name mapping (instead of a positional list) guarantees
# each label ends up on the matching category.
# NOTE(review): 'no_family history' contains a space, unlike the other dummy
# names; it is kept as-is because later steps may reference it — confirm before
# renaming.
family_dummies = family_dummies.rename(columns={
    'No': 'no_family history',
    'Yes': 'family_history_yes',
})
# Fail loudly if a category is missing or an unexpected one appeared.
assert list(family_dummies.columns) == [
    'no_family history', 'family_history_yes',
], f"unexpected family_history categories: {list(family_dummies.columns)}"
family_dummies.head()

Unnamed: 0,no_family history,family_history_yes
0,1,0
1,0,1
2,0,1
3,0,1
4,1,0


In [69]:
# Append the family_history indicator columns to the main frame; concat aligns
# the two frames on their row index.
df = pd.concat(objs=[df, family_dummies], axis='columns')
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,...,Male,interference_never,interference_often,interference_rarely,interference_sometimes,ment_health_cons_maybe,ment_health_cons_no,ment_health_cons_yes,no_family history,family_history_yes
0,0,0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,...,0,0,1,0,0,0,1,0,1,0
1,3,3,2014-08-27 11:29:46,31,Male,United Kingdom,Not American,,Yes,Yes,...,1,0,1,0,0,0,0,1,0,1
2,6,6,2014-08-27 11:31:50,35,Female,United States,MI,,Yes,Yes,...,0,0,0,0,1,1,0,0,0,1
3,8,8,2014-08-27 11:32:39,42,Female,United States,IL,,Yes,Yes,...,0,0,0,0,1,1,0,0,0,1
4,11,11,2014-08-27 11:32:49,29,Male,Bulgaria,Not American,,No,No,...,1,1,0,0,0,0,1,0,1,0


In [70]:
# Dummy variables for whether participants seek treatment: one indicator column
# per category.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 returns booleans
# by default, which would end up in the saved CSV as True/False instead of 1/0.
treatment_dummies = pd.get_dummies(df['treatment'], dtype=int)
treatment_dummies.head()

Unnamed: 0,No,Yes
0,0,1
1,0,1
2,0,1
3,0,1
4,1,0


In [71]:
# Give the treatment indicators descriptive names.
# An explicit category -> name mapping (instead of a positional list) guarantees
# each label ends up on the matching category.
treatment_dummies = treatment_dummies.rename(columns={
    'No': 'no_treatment',
    'Yes': 'yes_treatment',
})
# Fail loudly if a category is missing or an unexpected one appeared.
assert list(treatment_dummies.columns) == [
    'no_treatment', 'yes_treatment',
], f"unexpected treatment categories: {list(treatment_dummies.columns)}"
treatment_dummies.head()

Unnamed: 0,no_treatment,yes_treatment
0,0,1
1,0,1
2,0,1
3,0,1
4,1,0


In [72]:
# Append the treatment indicator columns to the main frame; concat aligns the
# two frames on their row index.
df = pd.concat(objs=[df, treatment_dummies], axis='columns')
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,...,interference_often,interference_rarely,interference_sometimes,ment_health_cons_maybe,ment_health_cons_no,ment_health_cons_yes,no_family history,family_history_yes,no_treatment,yes_treatment
0,0,0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,...,1,0,0,0,1,0,1,0,0,1
1,3,3,2014-08-27 11:29:46,31,Male,United Kingdom,Not American,,Yes,Yes,...,1,0,0,0,0,1,0,1,0,1
2,6,6,2014-08-27 11:31:50,35,Female,United States,MI,,Yes,Yes,...,0,0,1,1,0,0,0,1,0,1
3,8,8,2014-08-27 11:32:39,42,Female,United States,IL,,Yes,Yes,...,0,0,1,1,0,0,0,1,0,1
4,11,11,2014-08-27 11:32:49,29,Male,Bulgaria,Not American,,No,No,...,0,0,0,0,1,0,1,0,1,0


I am now going to drop unnecessary columns.

In [73]:
# List every column (original survey fields plus the new dummies) to decide
# which ones to drop.
df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Timestamp', 'Age', 'Gender', 'Country',
       'state', 'self_employed', 'family_history', 'treatment',
       'work_interfere', 'no_employees', 'remote_work', 'tech_company',
       'benefits', 'care_options', 'wellness_program', 'seek_help',
       'anonymity', 'leave', 'mental_health_consequence',
       'phys_health_consequence', 'coworkers', 'supervisor',
       'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence', 'comments', 'Female', 'Male',
       'interference_never', 'interference_often', 'interference_rarely',
       'interference_sometimes', 'ment_health_cons_maybe',
       'ment_health_cons_no', 'ment_health_cons_yes', 'no_family history',
       'family_history_yes', 'no_treatment', 'yes_treatment'],
      dtype='object')

In [74]:
# Survey fields that are not used further in this notebook.
# NOTE(review): the 'Unnamed: 0' / 'Unnamed: 0.1' columns are not in this list
# and therefore stay in the frame — confirm whether that is intended.
df = df.drop(
    columns=[
        'Timestamp', 'Country', 'state', 'self_employed', 'no_employees',
        'remote_work', 'tech_company', 'benefits', 'care_options',
        'wellness_program', 'seek_help', 'anonymity', 'leave',
        'phys_health_consequence', 'coworkers', 'supervisor',
        'mental_health_interview', 'phys_health_interview',
        'mental_vs_physical', 'obs_consequence', 'comments',
    ]
)

In [75]:
# Preview the frame after the unused survey columns have been dropped.
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Age,Gender,family_history,treatment,work_interfere,mental_health_consequence,Female,Male,...,interference_often,interference_rarely,interference_sometimes,ment_health_cons_maybe,ment_health_cons_no,ment_health_cons_yes,no_family history,family_history_yes,no_treatment,yes_treatment
0,0,0,37,Female,No,Yes,Often,No,1,0,...,1,0,0,0,1,0,1,0,0,1
1,3,3,31,Male,Yes,Yes,Often,Yes,0,1,...,1,0,0,0,0,1,0,1,0,1
2,6,6,35,Female,Yes,Yes,Sometimes,Maybe,1,0,...,0,0,1,1,0,0,0,1,0,1
3,8,8,42,Female,Yes,Yes,Sometimes,Maybe,1,0,...,0,0,1,1,0,0,0,1,0,1
4,11,11,29,Male,No,No,Never,No,0,1,...,0,0,0,0,1,0,1,0,1,0


In [76]:
# Check which columns remain after the drop.
df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Age', 'Gender', 'family_history',
       'treatment', 'work_interfere', 'mental_health_consequence', 'Female',
       'Male', 'interference_never', 'interference_often',
       'interference_rarely', 'interference_sometimes',
       'ment_health_cons_maybe', 'ment_health_cons_no', 'ment_health_cons_yes',
       'no_family history', 'family_history_yes', 'no_treatment',
       'yes_treatment'],
      dtype='object')

In [77]:
# The original work_interfere column is now represented by the interference_*
# dummies, so remove it.
df = df.drop(columns='work_interfere')

In [78]:
# Confirm that work_interfere is no longer among the columns.
df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Age', 'Gender', 'family_history',
       'treatment', 'mental_health_consequence', 'Female', 'Male',
       'interference_never', 'interference_often', 'interference_rarely',
       'interference_sometimes', 'ment_health_cons_maybe',
       'ment_health_cons_no', 'ment_health_cons_yes', 'no_family history',
       'family_history_yes', 'no_treatment', 'yes_treatment'],
      dtype='object')

In [79]:
# The original mental_health_consequence column is now represented by the
# ment_health_cons_* dummies, so remove it.
df = df.drop(columns='mental_health_consequence')

In [80]:
# Preview the frame after removing mental_health_consequence.
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Age,Gender,family_history,treatment,Female,Male,interference_never,interference_often,interference_rarely,interference_sometimes,ment_health_cons_maybe,ment_health_cons_no,ment_health_cons_yes,no_family history,family_history_yes,no_treatment,yes_treatment
0,0,0,37,Female,No,Yes,1,0,0,1,0,0,0,1,0,1,0,0,1
1,3,3,31,Male,Yes,Yes,0,1,0,1,0,0,0,0,1,0,1,0,1
2,6,6,35,Female,Yes,Yes,1,0,0,0,0,1,1,0,0,0,1,0,1
3,8,8,42,Female,Yes,Yes,1,0,0,0,0,1,1,0,0,0,1,0,1
4,11,11,29,Male,No,No,0,1,1,0,0,0,0,1,0,1,0,1,0


Creating Test and Train values using whether or not someone seeks treatment. The train_test_split will be saved as pickle files so they can be easily accessed when we model the data

Now that we have the test and training data, I will pickle the files for easier access when working on the modeling section

In [81]:
# Preview the frame; Gender, family_history and treatment are still present here
# alongside their dummy columns.
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Age,Gender,family_history,treatment,Female,Male,interference_never,interference_often,interference_rarely,interference_sometimes,ment_health_cons_maybe,ment_health_cons_no,ment_health_cons_yes,no_family history,family_history_yes,no_treatment,yes_treatment
0,0,0,37,Female,No,Yes,1,0,0,1,0,0,0,1,0,1,0,0,1
1,3,3,31,Male,Yes,Yes,0,1,0,1,0,0,0,0,1,0,1,0,1
2,6,6,35,Female,Yes,Yes,1,0,0,0,0,1,1,0,0,0,1,0,1
3,8,8,42,Female,Yes,Yes,1,0,0,0,0,1,1,0,0,0,1,0,1
4,11,11,29,Male,No,No,0,1,1,0,0,0,0,1,0,1,0,1,0


Now I drop the remaining original qualitative columns, since they've been replaced with dummies.

In [82]:
# Remove the remaining original text columns; each one is already represented by
# its dummy columns.
df = df.drop(columns=['Gender', 'family_history', 'treatment'])

In [83]:
# Final preview: Age, the leftover 'Unnamed' columns, and the dummy indicators.
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Age,Female,Male,interference_never,interference_often,interference_rarely,interference_sometimes,ment_health_cons_maybe,ment_health_cons_no,ment_health_cons_yes,no_family history,family_history_yes,no_treatment,yes_treatment
0,0,0,37,1,0,0,1,0,0,0,1,0,1,0,0,1
1,3,3,31,0,1,0,1,0,0,0,0,1,0,1,0,1
2,6,6,35,1,0,0,0,0,1,1,0,0,0,1,0,1
3,8,8,42,1,0,0,0,0,1,1,0,0,0,1,0,1
4,11,11,29,0,1,1,0,0,0,0,1,0,1,0,1,0


In [85]:
# Persist the pre-processed frame for the modeling step.
# NOTE(review): to_csv writes the row index as an extra unnamed first column by
# default; this appears to be how the 'Unnamed: 0*' columns accumulated. Consider
# index=False once it is confirmed that nothing downstream depends on that column.
df.to_csv(path_or_buf='../data/therapy_data_cleaned3.csv')