# Mental Health in the  Tech Industry: Pre-Processing
In this Pre-processing portion of my capstone, I will create dummy variables for my qualitative data, then create a train/test set for modeling. Since my variables are primarily qualitative, they will not need to be scaled

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from collections import Counter
from statistics import stdev, mean
import operator

In [2]:
df = pd.read_csv('../data/therapy_data_cleaned.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,1,2014-08-27 11:29:37,44,Male,United States,IN,,No,No,Rarely,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,2,2014-08-27 11:29:44,32,Male,Canada,Not American,,No,No,Rarely,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,3,2014-08-27 11:29:46,31,Male,United Kingdom,Not American,,Yes,Yes,Often,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,


In [4]:
print(df['Gender'].unique())

['Female' 'Male' 'Male-ish' 'Transgender Female'
 'Male Questioning Gender Identity' 'Non-binary' 'Agender'
 'Genderfluid/Androgynous' 'Unknown']


Since we are only analyzing males and females in this study, I am dropping the other gender identities so as to get the dummy values faster and easier

In [5]:
df.drop(df[(df['Gender'] != 'Male') & (df['Gender'] != 'Female')].index, inplace=True)

In [6]:
print(df['Gender'].unique())

['Female' 'Male']


Create Dummy variables for the quantitative data we are concerned with (work interference, gender, mental health consequences, family history, and whether they actually seek treatment)

In [7]:
gender_dummies = pd.get_dummies(df['Gender'])
gender_dummies.head()

Unnamed: 0,Female,Male
0,1,0
1,0,1
2,0,1
3,0,1
4,0,1


In [8]:
df = df.merge(gender_dummies, left_index = True, right_index = True)
df.head()

Unnamed: 0.1,Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,...,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments,Female,Male
0,0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,...,No,Some of them,Yes,No,Maybe,Yes,No,,1,0
1,1,2014-08-27 11:29:37,44,Male,United States,IN,,No,No,Rarely,...,No,No,No,No,No,Don't know,No,,0,1
2,2,2014-08-27 11:29:44,32,Male,Canada,Not American,,No,No,Rarely,...,No,Yes,Yes,Yes,Yes,No,No,,0,1
3,3,2014-08-27 11:29:46,31,Male,United Kingdom,Not American,,Yes,Yes,Often,...,Yes,Some of them,No,Maybe,Maybe,No,Yes,,0,1
4,4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,...,No,Some of them,Yes,Yes,Yes,Don't know,No,,0,1


In [9]:
interference_dummies = pd.get_dummies(df['work_interfere'])
interference_dummies.head()

Unnamed: 0,Never,Often,Rarely,Sometimes
0,0,1,0,0
1,0,0,1,0
2,0,0,1,0
3,0,1,0,0
4,1,0,0,0


In [10]:
interference_dummies.columns = ['interference_never', 'interference_often', 'interference_rarely', 'interference_sometimes']

In [11]:
interference_dummies.head()

Unnamed: 0,interference_never,interference_often,interference_rarely,interference_sometimes
0,0,1,0,0
1,0,0,1,0
2,0,0,1,0
3,0,1,0,0
4,1,0,0,0


In [12]:
df = pd.concat([df, interference_dummies], axis=1)
df.head()

Unnamed: 0.1,Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,...,phys_health_interview,mental_vs_physical,obs_consequence,comments,Female,Male,interference_never,interference_often,interference_rarely,interference_sometimes
0,0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,...,Maybe,Yes,No,,1,0,0,1,0,0
1,1,2014-08-27 11:29:37,44,Male,United States,IN,,No,No,Rarely,...,No,Don't know,No,,0,1,0,0,1,0
2,2,2014-08-27 11:29:44,32,Male,Canada,Not American,,No,No,Rarely,...,Yes,No,No,,0,1,0,0,1,0
3,3,2014-08-27 11:29:46,31,Male,United Kingdom,Not American,,Yes,Yes,Often,...,Maybe,No,Yes,,0,1,0,1,0,0
4,4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,...,Yes,Don't know,No,,0,1,1,0,0,0


In [13]:
consequence_mental_dummies = pd.get_dummies(df['mental_health_consequence'])
consequence_mental_dummies.head()

Unnamed: 0,Maybe,No,Yes
0,0,1,0
1,1,0,0
2,0,1,0
3,0,0,1
4,0,1,0


In [14]:
consequence_mental_dummies.columns = ['ment_health_cons_maybe', 'ment_health_cons_no', 'ment_health_cons_yes']
consequence_mental_dummies.head()

Unnamed: 0,ment_health_cons_maybe,ment_health_cons_no,ment_health_cons_yes
0,0,1,0
1,1,0,0
2,0,1,0
3,0,0,1
4,0,1,0


In [15]:
df = pd.concat([df, consequence_mental_dummies], axis=1)
df.head()

Unnamed: 0.1,Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,...,comments,Female,Male,interference_never,interference_often,interference_rarely,interference_sometimes,ment_health_cons_maybe,ment_health_cons_no,ment_health_cons_yes
0,0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,...,,1,0,0,1,0,0,0,1,0
1,1,2014-08-27 11:29:37,44,Male,United States,IN,,No,No,Rarely,...,,0,1,0,0,1,0,1,0,0
2,2,2014-08-27 11:29:44,32,Male,Canada,Not American,,No,No,Rarely,...,,0,1,0,0,1,0,0,1,0
3,3,2014-08-27 11:29:46,31,Male,United Kingdom,Not American,,Yes,Yes,Often,...,,0,1,0,1,0,0,0,0,1
4,4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,...,,0,1,1,0,0,0,0,1,0


In [16]:
#dummy variables for family history
family_dummies = pd.get_dummies(df['family_history'])
family_dummies.head()

Unnamed: 0,No,Yes
0,1,0
1,1,0
2,1,0
3,0,1
4,1,0


In [17]:
family_dummies.columns = ['no_family history', 'family_history_yes']
family_dummies.head()

Unnamed: 0,no_family history,family_history_yes
0,1,0
1,1,0
2,1,0
3,0,1
4,1,0


In [18]:
df = pd.concat([df, family_dummies], axis=1)
df.head()

Unnamed: 0.1,Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,...,Male,interference_never,interference_often,interference_rarely,interference_sometimes,ment_health_cons_maybe,ment_health_cons_no,ment_health_cons_yes,no_family history,family_history_yes
0,0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,...,0,0,1,0,0,0,1,0,1,0
1,1,2014-08-27 11:29:37,44,Male,United States,IN,,No,No,Rarely,...,1,0,0,1,0,1,0,0,1,0
2,2,2014-08-27 11:29:44,32,Male,Canada,Not American,,No,No,Rarely,...,1,0,0,1,0,0,1,0,1,0
3,3,2014-08-27 11:29:46,31,Male,United Kingdom,Not American,,Yes,Yes,Often,...,1,0,1,0,0,0,0,1,0,1
4,4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,...,1,1,0,0,0,0,1,0,1,0


In [19]:
#dummy variables for whether participants seek treatment
treatment_dummies = pd.get_dummies(df['treatment'])
treatment_dummies.head()

Unnamed: 0,No,Yes
0,0,1
1,1,0
2,1,0
3,0,1
4,1,0


In [20]:
treatment_dummies.columns = ['no_treatment', 'yes_treatment']
treatment_dummies.head()

Unnamed: 0,no_treatment,yes_treatment
0,0,1
1,1,0
2,1,0
3,0,1
4,1,0


In [21]:
df = pd.concat([df, treatment_dummies], axis=1)
df.head()

Unnamed: 0.1,Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,...,interference_often,interference_rarely,interference_sometimes,ment_health_cons_maybe,ment_health_cons_no,ment_health_cons_yes,no_family history,family_history_yes,no_treatment,yes_treatment
0,0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,...,1,0,0,0,1,0,1,0,0,1
1,1,2014-08-27 11:29:37,44,Male,United States,IN,,No,No,Rarely,...,0,1,0,1,0,0,1,0,1,0
2,2,2014-08-27 11:29:44,32,Male,Canada,Not American,,No,No,Rarely,...,0,1,0,0,1,0,1,0,1,0
3,3,2014-08-27 11:29:46,31,Male,United Kingdom,Not American,,Yes,Yes,Often,...,1,0,0,0,0,1,0,1,0,1
4,4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,...,0,0,0,0,1,0,1,0,1,0


I am now going to drop unnessecarry columns

In [22]:
df.columns

Index(['Unnamed: 0', 'Timestamp', 'Age', 'Gender', 'Country', 'state',
       'self_employed', 'family_history', 'treatment', 'work_interfere',
       'no_employees', 'remote_work', 'tech_company', 'benefits',
       'care_options', 'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence', 'comments', 'Female', 'Male',
       'interference_never', 'interference_often', 'interference_rarely',
       'interference_sometimes', 'ment_health_cons_maybe',
       'ment_health_cons_no', 'ment_health_cons_yes', 'no_family history',
       'family_history_yes', 'no_treatment', 'yes_treatment'],
      dtype='object')

In [23]:
df = df.drop(['Timestamp', 'Country', 'state', 'self_employed', 'no_employees', 'remote_work', 'tech_company', 'benefits',
       'care_options', 'wellness_program', 'seek_help', 'anonymity', 'leave', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence', 'comments'], axis=1)

In [24]:
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Gender,family_history,treatment,work_interfere,mental_health_consequence,Female,Male,interference_never,interference_often,interference_rarely,interference_sometimes,ment_health_cons_maybe,ment_health_cons_no,ment_health_cons_yes,no_family history,family_history_yes,no_treatment,yes_treatment
0,0,37,Female,No,Yes,Often,No,1,0,0,1,0,0,0,1,0,1,0,0,1
1,1,44,Male,No,No,Rarely,Maybe,0,1,0,0,1,0,1,0,0,1,0,1,0
2,2,32,Male,No,No,Rarely,No,0,1,0,0,1,0,0,1,0,1,0,1,0
3,3,31,Male,Yes,Yes,Often,Yes,0,1,0,1,0,0,0,0,1,0,1,0,1
4,4,31,Male,No,No,Never,No,0,1,1,0,0,0,0,1,0,1,0,1,0


In [25]:
df.columns

Index(['Unnamed: 0', 'Age', 'Gender', 'family_history', 'treatment',
       'work_interfere', 'mental_health_consequence', 'Female', 'Male',
       'interference_never', 'interference_often', 'interference_rarely',
       'interference_sometimes', 'ment_health_cons_maybe',
       'ment_health_cons_no', 'ment_health_cons_yes', 'no_family history',
       'family_history_yes', 'no_treatment', 'yes_treatment'],
      dtype='object')

In [26]:
df = df.drop(['work_interfere'], axis=1)

In [27]:
df.columns

Index(['Unnamed: 0', 'Age', 'Gender', 'family_history', 'treatment',
       'mental_health_consequence', 'Female', 'Male', 'interference_never',
       'interference_often', 'interference_rarely', 'interference_sometimes',
       'ment_health_cons_maybe', 'ment_health_cons_no', 'ment_health_cons_yes',
       'no_family history', 'family_history_yes', 'no_treatment',
       'yes_treatment'],
      dtype='object')

In [28]:
df = df.drop(['mental_health_consequence'], axis=1)

In [29]:
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Gender,family_history,treatment,Female,Male,interference_never,interference_often,interference_rarely,interference_sometimes,ment_health_cons_maybe,ment_health_cons_no,ment_health_cons_yes,no_family history,family_history_yes,no_treatment,yes_treatment
0,0,37,Female,No,Yes,1,0,0,1,0,0,0,1,0,1,0,0,1
1,1,44,Male,No,No,0,1,0,0,1,0,1,0,0,1,0,1,0
2,2,32,Male,No,No,0,1,0,0,1,0,0,1,0,1,0,1,0
3,3,31,Male,Yes,Yes,0,1,0,1,0,0,0,0,1,0,1,0,1
4,4,31,Male,No,No,0,1,1,0,0,0,0,1,0,1,0,1,0


Creating Test and Train values using whether or not someone seeks treatment

In [30]:
from sklearn.model_selection import train_test_split
X = df.drop(columns = 'yes_treatment')
y = df['yes_treatment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [31]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(828, 17) (409, 17) (828,) (409,)


In [32]:
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Gender,family_history,treatment,Female,Male,interference_never,interference_often,interference_rarely,interference_sometimes,ment_health_cons_maybe,ment_health_cons_no,ment_health_cons_yes,no_family history,family_history_yes,no_treatment,yes_treatment
0,0,37,Female,No,Yes,1,0,0,1,0,0,0,1,0,1,0,0,1
1,1,44,Male,No,No,0,1,0,0,1,0,1,0,0,1,0,1,0
2,2,32,Male,No,No,0,1,0,0,1,0,0,1,0,1,0,1,0
3,3,31,Male,Yes,Yes,0,1,0,1,0,0,0,0,1,0,1,0,1
4,4,31,Male,No,No,0,1,1,0,0,0,0,1,0,1,0,1,0


Now I drop the rest of the qualitative data points, since they've been replaced with dummies

In [33]:
df = df.drop(['Gender', 'family_history', 'treatment'], axis=1)

In [34]:
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Female,Male,interference_never,interference_often,interference_rarely,interference_sometimes,ment_health_cons_maybe,ment_health_cons_no,ment_health_cons_yes,no_family history,family_history_yes,no_treatment,yes_treatment
0,0,37,1,0,0,1,0,0,0,1,0,1,0,0,1
1,1,44,0,1,0,0,1,0,1,0,0,1,0,1,0
2,2,32,0,1,0,0,1,0,0,1,0,1,0,1,0
3,3,31,0,1,0,1,0,0,0,0,1,0,1,0,1
4,4,31,0,1,1,0,0,0,0,1,0,1,0,1,0


In [35]:
df.to_csv('../data/therapy_data_cleaned2.csv')