# DATA WRANGLING

In [2]:
#import packages

#data wrangling packages
import pandas as pd
import numpy as np

#visualization and EDA packages
from matplotlib import pyplot as plt
import seaborn as sns
import scipy, scipy.stats
import statistics as stats

In [5]:
# Load the raw survey export and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is made relative/configurable.
survey_csv_path = "/Users/jamesthompson/Documents/GitHub/Final-Project/Data/dataMH.csv"
mentalhealth_original = pd.read_csv(survey_csv_path)
mentalhealth_original.head()

Unnamed: 0,*Are you self-employed?*,How many employees does your company or organization have?,Is your employer primarily a tech company/organization?,Is your primary role within your company related to tech/IT?,Does your employer provide mental health benefits as part of healthcare coverage?,Do you know the options for mental health care available under your employer-provided health coverage?,"Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?",Does your employer offer resources to learn more about mental health disorders and options for seeking help?,Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?,"If a mental health issue prompted you to request a medical leave from work, how easy or difficult would it be to ask for that leave?",...,Briefly describe what you think the industry as a whole and/or employers could do to improve mental health support for employees.,"If there is anything else you would like to tell us that has not been covered by the survey questions, please use this space to do so.",Would you be willing to talk to one of us more extensively about your experiences with mental health issues in the tech industry? (Note that all interview responses would be used _anonymously_ and only with your permission.),What is your age?,What is your gender?,What country do you *live* in?,What US state or territory do you *live* in?,What is your race?,What country do you *work* in?,What US state or territory do you *work* in?
0,False,26-100,True,True,I don't know,No,Yes,Yes,I don't know,Very easy,...,,,False,25,Male,United States of America,Nebraska,White,United States of America,Nebraska
1,False,26-100,True,True,Yes,No,No,Yes,Yes,I don't know,...,,,False,51,male,United States of America,Nebraska,White,United States of America,Nebraska
2,False,26-100,True,True,I don't know,No,No,I don't know,I don't know,Somewhat difficult,...,I think opening up more conversation around th...,Thank you,True,27,Male,United States of America,Illinois,White,United States of America,Illinois
3,False,100-500,True,True,I don't know,No,Yes,Yes,Yes,Very easy,...,,,False,37,male,United States of America,Nebraska,White,United States of America,Nebraska
4,False,26-100,True,True,I don't know,No,I don't know,I don't know,I don't know,I don't know,...,,,False,46,m,United States of America,Nebraska,White,United States of America,Nebraska


## Recode Column Names

In [6]:
# Map each verbose survey question to the short column name used in the rest of the notebook.
column_renames = {
    "*Are you self-employed?*": "SelfEmployed",
    "How many employees does your company or organization have?": "CompanySize",
    "Is your employer primarily a tech company/organization?": "TechCompany",
    "Is your primary role within your company related to tech/IT?": "TechRole",
    "Does your employer provide mental health benefits as part of healthcare coverage?": "MHcoverage",
    "Do you know the options for mental health care available under your employer-provided health coverage?": "AwarenessMHcareResources",
    "Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?": "ERdiscussedMHformally",
    "Does your employer offer resources to learn more about mental health disorders and options for seeking help?": "ERofferMHresources",
    "Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?": "AnonymityProtected",
    "If a mental health issue prompted you to request a medical leave from work, how easy or difficult would it be to ask for that leave?": "MHpromptWorkLeave",
    "Would you feel more comfortable talking to your coworkers about your physical health or your mental health?": "MoreComfortCoworkerDisc",
    "Would you feel comfortable discussing a mental health issue with your direct supervisor(s)?": "ComfortMHdiscSupervisor",
    "Have you ever discussed your mental health with your employer?": "DiscussMHwithER",
    "Would you feel comfortable discussing a mental health issue with your coworkers?": "ComfortMHdiscCoworker",
    "Have you ever discussed your mental health with coworkers?": "DiscussMHwithCoworker",
    "Overall, how much importance does your employer place on mental health?": "ERimportanceMH",
    "Do you have medical coverage (private insurance or state-provided) that includes treatment of mental health disorders?": "MedCovMH",
    "If you have been diagnosed or treated for a mental health disorder, do you ever reveal this to coworkers or employees?": "RevealMHdxCoworker",
    "Do you *currently* have a mental health disorder?": "CurrentMHdisorder",
    "Have you ever been *diagnosed* with a mental health disorder?": "MHdx",
    "*What disorder(s) have you been diagnosed with?*": "Dxdisorders",
    "*If possibly, what disorder(s) do you believe you have?*": "BelievedDisorders",
    "*If so, what disorder(s) were you diagnosed with?*": "ConfirmedDisorderDx",
    "If you have a mental health disorder, how often do you feel that it interferes with your work *when being treated effectively?*": "MHdoInterfereTreated",
    "If you have a mental health disorder, how often do you feel that it interferes with your work *when* _*NOT*_* being treated effectively (i.e., when you are experiencing symptoms)?*": "MHdoInterfereNotTreated",
    "Have your observations of how another individual who discussed a mental health issue made you less likely to reveal a mental health issue yourself in your current workplace?": "ObsLesslikelyRevealMHdx",
    "Are you openly identified at work as a person with a mental health issue?": "IdWithMHatWork",
    "Have you observed or experienced an *unsupportive or badly handled response* to a mental health issue in your current or previous workplace?": "MHunsupportedWP",
    "Have you observed or experienced a *supportive or well handled response* to a mental health issue in your current or previous workplace?": "MHsupportedWP",
    "Overall, how well do you think the tech industry supports employees with mental health issues?": "TechSupportedMH",
    "What is your age?": "Age",
    "What is your gender?": "Gender",
    "What country do you *live* in?": "LivingCountry",
    "What US state or territory do you *live* in?": "LivingState_Territory",
    "What is your race?": "Race",
    "What country do you *work* in?": "WorkingCountry",
    "What US state or territory do you *work* in?": "WorkingState_Territory",
    "Have you had a mental health disorder in the past?": "PastMHdx",
}

# Renames in place; questions that are not keys of the mapping keep their original header.
mentalhealth_original.rename(columns=column_renames, inplace=True)

## Subset Columns within Scope of Project

In [7]:
# Columns kept for this project, in the order they should appear in the working frame.
columns_in_scope = [
    "SelfEmployed", "CompanySize", "TechCompany", "TechRole", "MHcoverage",
    "AwarenessMHcareResources", "ERdiscussedMHformally", "ERofferMHresources",
    "AnonymityProtected", "MHpromptWorkLeave", "MoreComfortCoworkerDisc",
    "ComfortMHdiscSupervisor", "DiscussMHwithER", "ComfortMHdiscCoworker",
    "DiscussMHwithCoworker", "ERimportanceMH", "MedCovMH", "RevealMHdxCoworker",
    "CurrentMHdisorder", "MHdx", "Dxdisorders", "BelievedDisorders",
    "ConfirmedDisorderDx", "PastMHdx", "MHdoInterfereTreated",
    "MHdoInterfereNotTreated", "ObsLesslikelyRevealMHdx", "IdWithMHatWork",
    "MHunsupportedWP", "MHsupportedWP", "TechSupportedMH",
    "Age", "Gender", "LivingCountry", "LivingState_Territory", "Race",
    "WorkingCountry", "WorkingState_Territory",
]
mentalhealth = mentalhealth_original[columns_in_scope]

In [8]:
# Check for accuracy in subsetting: list every retained column name, one per line.
for column_name in mentalhealth.columns.tolist():
    print(column_name)

SelfEmployed
CompanySize
TechCompany
TechRole
MHcoverage
AwarenessMHcareResources
ERdiscussedMHformally
ERofferMHresources
AnonymityProtected
MHpromptWorkLeave
MoreComfortCoworkerDisc
ComfortMHdiscSupervisor
DiscussMHwithER
ComfortMHdiscCoworker
DiscussMHwithCoworker
ERimportanceMH
MedCovMH
RevealMHdxCoworker
CurrentMHdisorder
MHdx
Dxdisorders
BelievedDisorders
ConfirmedDisorderDx
PastMHdx
MHdoInterfereTreated
MHdoInterfereNotTreated
ObsLesslikelyRevealMHdx
IdWithMHatWork
MHunsupportedWP
MHsupportedWP
TechSupportedMH
Age
Gender
LivingCountry
LivingState_Territory
Race
WorkingCountry
WorkingState_Territory


## Missing values

In [9]:
# Count missing values per column to decide which columns to drop.
mentalhealth.isna().sum()

SelfEmployed                  0
CompanySize                  48
TechCompany                  48
TechRole                     48
MHcoverage                   48
AwarenessMHcareResources     74
ERdiscussedMHformally        48
ERofferMHresources           48
AnonymityProtected           48
MHpromptWorkLeave            48
MoreComfortCoworkerDisc      48
ComfortMHdiscSupervisor      48
DiscussMHwithER              48
ComfortMHdiscCoworker        48
DiscussMHwithCoworker        51
ERimportanceMH               48
MedCovMH                    304
RevealMHdxCoworker          304
CurrentMHdisorder             0
MHdx                        205
Dxdisorders                 352
BelievedDisorders           280
ConfirmedDisorderDx         208
PastMHdx                      3
MHdoInterfereTreated          0
MHdoInterfereNotTreated       0
ObsLesslikelyRevealMHdx      70
IdWithMHatWork                0
MHunsupportedWP               0
MHsupportedWP                 0
TechSupportedMH               0
Age     

#### Will drop the following columns due to low value counts: 
1. Dxdisorders
2. IdWithMHatWork
3. MedCovMH 
4. RevealMHdxCoworker 
5. BelievedDisorders 
6. MHdx 
7. ConfirmedDisorderDx



#### Also dropping the LivingCountry and LivingState_Territory columns to stay within the scope of the project 

In [10]:
# Keep only the columns that survive the missing-value review.
# .copy() makes this an independent frame: later cells add new columns to
# `mentalhealth`, and assigning into a bare column selection of another frame
# is what triggers pandas' SettingWithCopyWarning.
mentalhealth = mentalhealth[["SelfEmployed", "CompanySize", "TechCompany", "TechRole", "MHcoverage", "AwarenessMHcareResources", 
                                      "ERdiscussedMHformally", "ERofferMHresources", "AnonymityProtected", "MHpromptWorkLeave", "MoreComfortCoworkerDisc", 
                                      "ComfortMHdiscSupervisor", "DiscussMHwithER", "ComfortMHdiscCoworker", "DiscussMHwithCoworker", "ERimportanceMH", 
                                      "CurrentMHdisorder","PastMHdx", "MHdoInterfereTreated",
                                      "MHdoInterfereNotTreated", "ObsLesslikelyRevealMHdx", "MHunsupportedWP", "MHsupportedWP", "TechSupportedMH",
                                      "Age", "Gender", "Race", "WorkingCountry", "WorkingState_Territory"]].copy()

In [11]:
# Confirm the columns were dropped and re-check the per-column missing counts.
mentalhealth.isna().sum()

SelfEmployed                  0
CompanySize                  48
TechCompany                  48
TechRole                     48
MHcoverage                   48
AwarenessMHcareResources     74
ERdiscussedMHformally        48
ERofferMHresources           48
AnonymityProtected           48
MHpromptWorkLeave            48
MoreComfortCoworkerDisc      48
ComfortMHdiscSupervisor      48
DiscussMHwithER              48
ComfortMHdiscCoworker        48
DiscussMHwithCoworker        51
ERimportanceMH               48
CurrentMHdisorder             0
PastMHdx                      3
MHdoInterfereTreated          0
MHdoInterfereNotTreated       0
ObsLesslikelyRevealMHdx      70
MHunsupportedWP               0
MHsupportedWP                 0
TechSupportedMH               0
Age                           0
Gender                        5
Race                        148
WorkingCountry                0
WorkingState_Territory      148
dtype: int64

## Recode Columns

In [12]:
# Recode the coverage-style answers (Yes / No / I don't know / ...) to integer codes.
def recode (MHcoverage):
    """Return the integer code for a coverage-style answer.

    Any value that matches none of the known answers (including NaN)
    yields None.
    """
    answer_codes = (
        ("Yes", 0),
        ("No", 1),
        ("I don't know", 2),
        ("Not eligible for coverage / NA", 3),
        ("Maybe", 4),
    )
    for answer, code in answer_codes:
        if MHcoverage == answer:
            return code
    return None
# Uses the `recode` defined directly above (the name is redefined in later cells).
# Missing or unmatched answers make recode return None, which becomes NaN in the new column.
mentalhealth['MHcoverageR'] = mentalhealth['MHcoverage'].apply(recode)

#### Verifying Recode

In [13]:
# Frequency of each raw answer, to compare against the recoded column.
mentalhealth['MHcoverage'].value_counts()

Yes                               164
I don't know                       87
No                                 35
Not eligible for coverage / NA     18
Name: MHcoverage, dtype: int64

In [14]:
# Frequency of each numeric code; counts should mirror the raw answers.
mentalhealth['MHcoverageR'].value_counts()

0.0    164
2.0     87
1.0     35
3.0     18
Name: MHcoverageR, dtype: int64

In [15]:
# These four columns reuse the coverage-answer `recode` from the earlier cell
# (Yes / No / I don't know / Not eligible for coverage / NA / Maybe).
# This cell must run before `recode` is redefined further down the notebook.
mentalhealth['AwarenessMHcareResourcesR'] = mentalhealth['AwarenessMHcareResources'].apply(recode)
mentalhealth['ERdiscussedMHformallyR'] = mentalhealth['ERdiscussedMHformally'].apply(recode)
mentalhealth['ERofferMHresourcesR'] = mentalhealth['ERofferMHresources'].apply(recode)
mentalhealth['AnonymityProtectedR'] = mentalhealth['AnonymityProtected'].apply(recode)

In [16]:
# Recode the medical-leave difficulty answers to integer codes.
# The codes are category labels only; they do not follow the easy-to-difficult order.
def recode (MHpromptWorkLeave):
    """Return the integer code for a leave-difficulty answer, or None if unrecognised."""
    answer_codes = (
        ("Very easy", 0),
        ("Difficult", 1),
        ("I don't know", 2),
        ("Neither easy nor difficult", 3),
        ("Somewhat difficult", 4),
        ("Somewhat easy", 5),
    )
    for answer, code in answer_codes:
        if MHpromptWorkLeave == answer:
            return code
    return None
# Uses the leave-difficulty `recode` defined directly above; unmatched/missing answers become NaN.
mentalhealth['MHpromptWorkLeaveR'] = mentalhealth['MHpromptWorkLeave'].apply(recode)

#### Verifying Recode

In [17]:
# Frequency of each raw leave-difficulty answer.
mentalhealth['MHpromptWorkLeave'].value_counts()

Somewhat easy                 83
Very easy                     70
I don't know                  52
Somewhat difficult            49
Neither easy nor difficult    31
Difficult                     19
Name: MHpromptWorkLeave, dtype: int64

In [18]:
# Frequency of each leave-difficulty code; counts should mirror the raw answers.
mentalhealth['MHpromptWorkLeaveR'].value_counts()

5.0    83
0.0    70
2.0    52
4.0    49
3.0    31
1.0    19
Name: MHpromptWorkLeaveR, dtype: int64

In [19]:
# Recode the "unsupportive response observed/experienced" answers to integer codes.
def recode (MHunsupportedWP):
    """Return the integer code for an unsupportive-workplace answer, or None if unrecognised."""
    answer_codes = (
        ("Maybe/Not sure", 0),
        ("Yes, I experienced", 1),
        ("Yes, I observed", 2),
        ("No", 3),
        ("I've always been self-employed", 4),
    )
    for answer, code in answer_codes:
        if MHunsupportedWP == answer:
            return code
    return None
# Uses the unsupportive-workplace `recode` defined directly above.
mentalhealth['MHunsupportedWPR'] = mentalhealth['MHunsupportedWP'].apply(recode)

#### Verifying Recode

In [21]:
# Frequency of each unsupportive-workplace code.
mentalhealth['MHunsupportedWPR'].value_counts()

3    139
0     90
2     67
1     53
4      3
Name: MHunsupportedWPR, dtype: int64

In [22]:
# Frequency of each raw unsupportive-workplace answer, for comparison with the codes.
mentalhealth['MHunsupportedWP'].value_counts()

No                                139
Maybe/Not sure                     90
Yes, I observed                    67
Yes, I experienced                 53
I've always been self-employed      3
Name: MHunsupportedWP, dtype: int64

In [23]:
# Recode Race answers to integer codes.
# NOTE(review): "Caucasian", "European American" and "White" receive three different
# codes; confirm whether they are meant to stay separate categories.
def recode (Race):
    """Return the integer code for a Race answer, or None if unrecognised or missing."""
    race_codes = (
        ("Asian", 0),
        ("Black or African American", 1),
        ("Caucasian", 2),
        ("European American", 3),
        ("Hispanic", 4),
        ("I prefer not to answer", 5),
        ("More than one of the above", 6),
        ("White", 7),
        ("White Hispanic", 8),
    )
    for label, code in race_codes:
        if Race == label:
            return code
    return None
# Uses the Race `recode` defined directly above; missing Race values stay NaN in RaceR.
mentalhealth['RaceR'] = mentalhealth['Race'].apply(recode)

In [24]:
#Recode columns: WorkingCountry (United States of America = 0, any other country = 1)
def recode (WorkingCountry):
    """Return 0 for the United States, 1 for any other country, None if missing.

    The previous version enumerated 34 specific countries and silently
    returned None (NaN in the frame) for any country not on the list. The
    recode is a binary USA / Other split, so every non-empty country name
    other than the US now maps to 1.

    Parameters
    ----------
    WorkingCountry : str or missing value
        Country name as it appears in the survey. The already-recoded
        label "USA" is also accepted so an out-of-order run does not
        misclassify US rows.

    Returns
    -------
    int or None
        0 for the US, 1 for any other country, None for NaN/None/blank input.
    """
    # Non-strings (NaN, None) and blank strings are treated as missing, not as "Other".
    if not isinstance(WorkingCountry, str) or not WorkingCountry.strip():
        return None
    if WorkingCountry in ("United States of America", "USA"):
        return 0
    return 1
# Uses the numeric WorkingCountry `recode` defined directly above.
# Must run while WorkingCountry still holds the raw country names (a later cell overwrites it).
mentalhealth['WorkingCountryR'] = mentalhealth['WorkingCountry'].apply(recode)

#### Verifying Recode

In [26]:
# Distinct codes present after the recode.
mentalhealth['WorkingCountryR'].unique()

array([0, 1])

In [25]:
#recoding WorkingCountry into two categories: USA and Other
def recode (WorkingCountry):
    """Return "USA" for the United States, "Other" for any other country, None if missing.

    The previous version enumerated 34 specific countries and silently
    returned None (NaN in the frame) for any country not on the list. It
    also had no branch for "USA", so re-running the cell that overwrites
    the column with this function's output turned every US row into NaN.

    Parameters
    ----------
    WorkingCountry : str or missing value
        Country name as it appears in the survey, or an already-recoded
        "USA" / "Other" label.

    Returns
    -------
    str or None
        "USA", "Other", or None for NaN/None/blank input.
    """
    # Non-strings (NaN, None) and blank strings are treated as missing, not as "Other".
    if not isinstance(WorkingCountry, str) or not WorkingCountry.strip():
        return None
    if WorkingCountry in ("United States of America", "USA"):
        return "USA"
    return "Other"
    
# Overwrites the raw country names in place with the labels returned by `recode`.
# On a second run of this cell the function receives those labels, not country names.
mentalhealth['WorkingCountry'] = mentalhealth['WorkingCountry'].apply(recode)

#### Verifying Recode

In [28]:
# Distinct labels present after the recode.
mentalhealth['WorkingCountry'].unique()

array(['USA', 'Other'], dtype=object)

In [29]:
#Recode columns: WorkingState (Northeast = 1, Southeast = 2, West = 3, Southwest = 4, Midwest = 5)
def recode (WorkingState):
    if WorkingState == "Alabama":
        return 2
    if WorkingState == "Alaska":
        return 3
    if WorkingState == "Arizona":
        return 4
    if WorkingState == "California":
        return 3
    if WorkingState == "Colorado":
        return 5
    if WorkingState == "Connecticut":
        return 1
    if WorkingState == "District of Columbia":
        return 1
    if WorkingState == "Florida":
        return 2
    if WorkingState == "Georgia":
        return 2
    if WorkingState == "Idaho":
        return 3
    if WorkingState == "Illinois":
        return 5
    if WorkingState == "Indiana":
        return 5
    if WorkingState == "Iowa":
        return 5
    if WorkingState == "Kansas":
        return 5
    if WorkingState == "Louisiana":
        return 2
    if WorkingState == "Maine":
        return 1
    if WorkingState == "Maryland":
        return 1
    if WorkingState == "Massachusetts":
        return 1
    if WorkingState == "Michigan":
        return 5
    if WorkingState == "Minnesota":
        return 5
    if WorkingState == "Missouri":
        return 5
    if WorkingState == "Nebraska":
        return 5
    if WorkingState == "New Jersey":
        return 1
    if WorkingState == "New York":
        return 1
    if WorkingState == "North Carolina":
        return 2
    if WorkingState == "Ohio":
        return 5
    if WorkingState == "Oklahoma":
        return 4
    if WorkingState == "Oregon":
        return 3
    if WorkingState == "Pennsylvania":
        return 1
    if WorkingState == "South Carolina":
        return 2
    if WorkingState == "Tennessee":
        return 2
    if WorkingState == "Texas":
        return 4
    if WorkingState == "Utah":
        return 3
    if WorkingState == "Vermont":
        return 1
    if WorkingState == "Virginia":
        return 4
    if WorkingState == "Washington":
        return 3
    if WorkingState == "Wisconsin":
        return 5
    if WorkingState == "Wyoming":
        return 3
    else:
        return 0

# Uses the numeric region `recode` defined directly above.
# Must run while WorkingState_Territory still holds state names (a later cell overwrites it).
mentalhealth['WorkingStateR'] = mentalhealth['WorkingState_Territory'].apply(recode)

In [30]:
#recoding WorkingState_Territory into regions

def recode (WorkingState):
    if WorkingState == "Alabama":
        return "southeast"
    if WorkingState == "Alaska":
        return "west"
    if WorkingState == "Arizona":
        return "southwest"
    if WorkingState == "California":
        return "west"
    if WorkingState == "Colorado":
        return "midwest"
    if WorkingState == "Connecticut":
        return "northeast"
    if WorkingState == "District of Columbia":
        return "northeast"
    if WorkingState == "Florida":
        return "southeast"
    if WorkingState == "Georgia":
        return "southeast"
    if WorkingState == "Idaho":
        return "west"
    if WorkingState == "Illinois":
        return "midwest"
    if WorkingState == "Indiana":
        return "midwest"
    if WorkingState == "Iowa":
        return "midwest"
    if WorkingState == "Kansas":
        return "midwest"
    if WorkingState == "Louisiana":
        return "southeast"
    if WorkingState == "Maine":
        return "northeast"
    if WorkingState == "Maryland":
        return "northeast"
    if WorkingState == "Massachusetts":
        return "northeast"
    if WorkingState == "Michigan":
        return "midwest"
    if WorkingState == "Minnesota":
        return "midwest"
    if WorkingState == "Missouri":
        return "midwest"
    if WorkingState == "Nebraska":
        return "midwest"
    if WorkingState == "New Jersey":
        return "northeast"
    if WorkingState == "New York":
        return "northeast"
    if WorkingState == "North Carolina":
        return "southeast"
    if WorkingState == "Ohio":
        return "midwest"
    if WorkingState == "Oklahoma":
        return "southwest"
    if WorkingState == "Oregon":
        return "west"
    if WorkingState == "Pennsylvania":
        return "northeast"
    if WorkingState == "South Carolina":
        return "southeast"
    if WorkingState == "Tennessee":
        return "southeast"
    if WorkingState == "Texas":
        return "southwest"
    if WorkingState == "Utah":
        return "west"
    if WorkingState == "Vermont":
        return "northeast"
    if WorkingState == "Virginia":
        return "southwest"
    if WorkingState == "Washington":
        return "west"
    if WorkingState == "Wisconsin":
        return "midwest"
    if WorkingState == "Wyoming":
        return "west"
    else:
        return "unknown"

# Overwrites the raw state names in place with the region labels returned by `recode`.
# On a second run of this cell the function receives those labels, not state names.
mentalhealth['WorkingState_Territory'] = mentalhealth['WorkingState_Territory'].apply(recode)

#### Verifying Recode

In [31]:
# Distinct region labels present after the recode.
mentalhealth['WorkingState_Territory'].unique()

array(['midwest', 'unknown', 'northeast', 'southeast', 'west',
       'southwest'], dtype=object)

In [32]:
# Recode CurrentMHdisorder answers (Yes, No, Don't Know, Possibly) to integer codes.
def recode (Current):
    """Return the integer code for a current-disorder answer, or None if unrecognised.

    Matching is exact and case-sensitive.
    """
    answer_codes = (
        ("Yes", 0),
        ("No", 1),
        ("Don't Know", 2),
        ("Possibly", 3),
    )
    for answer, code in answer_codes:
        if Current == answer:
            return code
    return None
# Uses the CurrentMHdisorder `recode` defined directly above.
# Any answer spelled differently from the four matched strings becomes NaN here.
mentalhealth['CurrentMHdisorderR'] = mentalhealth['CurrentMHdisorder'].apply(recode)

In [33]:
# Recode ObsLesslikelyRevealMHdx answers to integer codes (code 3 is not used).
# NOTE(review): pd.read_csv's default NA parsing converts the literal text "N/A"
# to NaN, so the "N/A" entry below may never match; confirm against the raw file.
def recode (Current):
    """Return the integer code for the answer, or None if unrecognised or missing."""
    answer_codes = (
        ("Yes", 0),
        ("No", 1),
        ("N/A", 2),
        ("Maybe", 4),
    )
    for answer, code in answer_codes:
        if Current == answer:
            return code
    return None
# Uses the ObsLesslikelyRevealMHdx `recode` defined directly above; missing answers stay NaN.
mentalhealth['ObsLesslikelyRevealMHdxR'] = mentalhealth['ObsLesslikelyRevealMHdx'].apply(recode)

In [34]:
# Recode True/False Columns to 0/1.
# The result is assigned back to the frame instead of calling
# replace(..., inplace=True) on a column selection: that chained form acts on
# an intermediate Series and is not guaranteed to update `mentalhealth`
# (it does nothing under pandas Copy-on-Write).
for flag_column in ("SelfEmployed", "TechCompany", "TechRole"):
    mentalhealth[flag_column] = mentalhealth[flag_column].replace({False: 0, True: 1})

In [35]:
#Compile Gender: collapse the free-text answers into Male / Female / Other.
# The result is assigned back to the frame instead of calling
# replace(..., inplace=True) on a column selection: that chained form acts on
# an intermediate Series and is not guaranteed to update `mentalhealth`.
# NOTE(review): 'nan' and 'None' in the last list are strings, so they do not
# match real missing values; missing Gender stays NaN. Confirm this is intended.
gender_compiled = mentalhealth['Gender'].replace(['Male', 'male', 'm', 'M', 'Let\'s keep it simple and say "male"', 'man', 'masculino', 'Identify as male', 'Male ', 'Masculine', 'Cishet male', 'Man', 'Cis Male', 'cis male', 'I have a penis', 'Make', 'CIS Male'], 'Male')
gender_compiled = gender_compiled.replace(['female', 'F', 'Woman', 'f', 'Female ', 'Femina', 'cis woman', 'woman', 'Femile','femmina','Female-identified', 'Female','Female (cis)'], 'Female')
gender_compiled = gender_compiled.replace(['nan', 'Non-binary', 'Non binary', 'None', 'Nonbinary', 'agender', 'Questioning', '43', 'rr', 'Agender trans woman', 'Trans man', 'I am a Wookie', 'Trans non-binary/genderfluid','Non-binary and gender fluid'], 'Other')
mentalhealth['Gender'] = gender_compiled

#### Verifying Recode

In [36]:
# Distinct gender labels remaining after compilation.
mentalhealth['Gender'].unique()

array(['Male', 'Female', nan, 'Other'], dtype=object)

In [37]:
# Recode the compiled gender categories to integer codes.
def recode (Gender):
    """Return 0 for "Male", 1 for "Female", 2 for "Other", otherwise None."""
    gender_codes = (
        ("Male", 0),
        ("Female", 1),
        ("Other", 2),
    )
    for label, code in gender_codes:
        if Gender == label:
            return code
    return None
    
# Uses the gender `recode` defined directly above; missing Gender stays NaN in GenderR.
mentalhealth['GenderR'] = mentalhealth['Gender'].apply(recode)

In [38]:
# Recode Company Size brackets to integer codes, smallest bracket first.
def recode (CompanySize):
    """Return the integer code (0-5) for a company-size bracket, or None if unrecognised."""
    size_codes = (
        ("1-5", 0),
        ("6-25", 1),
        ("26-100", 2),
        ("100-500", 3),
        ("500-1000", 4),
        ("More than 1000", 5),
    )
    for bracket, code in size_codes:
        if CompanySize == bracket:
            return code
    return None

    
# Uses the company-size `recode` defined directly above; missing sizes stay NaN.
mentalhealth['CompanySizeR'] = mentalhealth['CompanySize'].apply(recode)

#### Verifying entire dataset recoded properly

In [40]:
# Print the dtype and non-null count of every column to eyeball the recoded columns.
mentalhealth.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 352 entries, 0 to 351
Data columns (total 43 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   SelfEmployed               352 non-null    int64  
 1   CompanySize                304 non-null    object 
 2   TechCompany                304 non-null    float64
 3   TechRole                   304 non-null    float64
 4   MHcoverage                 304 non-null    object 
 5   AwarenessMHcareResources   278 non-null    object 
 6   ERdiscussedMHformally      304 non-null    object 
 7   ERofferMHresources         304 non-null    object 
 8   AnonymityProtected         304 non-null    object 
 9   MHpromptWorkLeave          304 non-null    object 
 10  MoreComfortCoworkerDisc    304 non-null    object 
 11  ComfortMHdiscSupervisor    304 non-null    object 
 12  DiscussMHwithER            304 non-null    object 
 13  ComfortMHdiscCoworker      304 non-null    object 

## Dropping "0" in Age

In [41]:
# Distinct ages, to spot implausible entries.
mentalhealth['Age'].unique()

array([25, 51, 27, 37, 46, 36, 39, 35, 49, 45, 40, 31, 26, 43, 32, 41, 53,
       29, 28, 24, 42, 34, 23, 30, 58, 50, 52, 33, 54, 38, 64, 55, 60, 47,
       48, 20,  0, 22, 63, 44, 59, 19, 56, 21])

In [42]:
# Remove, in place, every row whose Age is below 2 (the placeholder 0 entries).
invalid_age_rows = mentalhealth.index[mentalhealth['Age'] < 2]
mentalhealth.drop(index=invalid_age_rows, inplace=True)

In [43]:
# Distinct ages remaining after the invalid rows were removed.
unique_age = mentalhealth['Age'].unique()

In [44]:
# Show the remaining ages in ascending order to confirm the 0 entry is gone.
ages_ascending = sorted(unique_age)
print(ages_ascending)

[19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 58, 59, 60, 63, 64]


## Evaluating Missing values further

##### Since we might want to analyze Race as a factor later, but it has a large number of missing values, we will subset Race into its own dataframe and then drop it from the main dataframe

In [45]:
# Re-check the per-column missing counts now that the recoded columns exist.
mentalhealth.isna().sum()

SelfEmployed                   0
CompanySize                   48
TechCompany                   48
TechRole                      48
MHcoverage                    48
AwarenessMHcareResources      74
ERdiscussedMHformally         48
ERofferMHresources            48
AnonymityProtected            48
MHpromptWorkLeave             48
MoreComfortCoworkerDisc       48
ComfortMHdiscSupervisor       48
DiscussMHwithER               48
ComfortMHdiscCoworker         48
DiscussMHwithCoworker         51
ERimportanceMH                48
CurrentMHdisorder              0
PastMHdx                       3
MHdoInterfereTreated           0
MHdoInterfereNotTreated        0
ObsLesslikelyRevealMHdx       70
MHunsupportedWP                0
MHsupportedWP                  0
TechSupportedMH                0
Age                            0
Gender                         5
Race                         148
WorkingCountry                 0
WorkingState_Territory         0
MHcoverageR                   48
AwarenessM

### Subsetting Race into separate dataframe & dropping remaining null values

In [46]:
# Keep every column (Race included) but only the rows with no missing value in any column.
mh_race = mentalhealth.dropna(axis=0, how="any")

In [47]:
# Every count should be 0 after dropping incomplete rows.
mh_race.isna().sum()

SelfEmployed                 0
CompanySize                  0
TechCompany                  0
TechRole                     0
MHcoverage                   0
AwarenessMHcareResources     0
ERdiscussedMHformally        0
ERofferMHresources           0
AnonymityProtected           0
MHpromptWorkLeave            0
MoreComfortCoworkerDisc      0
ComfortMHdiscSupervisor      0
DiscussMHwithER              0
ComfortMHdiscCoworker        0
DiscussMHwithCoworker        0
ERimportanceMH               0
CurrentMHdisorder            0
PastMHdx                     0
MHdoInterfereTreated         0
MHdoInterfereNotTreated      0
ObsLesslikelyRevealMHdx      0
MHunsupportedWP              0
MHsupportedWP                0
TechSupportedMH              0
Age                          0
Gender                       0
Race                         0
WorkingCountry               0
WorkingState_Territory       0
MHcoverageR                  0
AwarenessMHcareResourcesR    0
ERdiscussedMHformallyR       0
ERofferM

### Subsetting data to drop race and then drop remaining na values

In [48]:
# Main analysis frame: same rows as `mentalhealth`, without the two Race columns.
mh = mentalhealth.drop(columns=['Race', 'RaceR'])

In [49]:
# Confirm Race and RaceR were dropped and review the remaining missing counts.
mh.isna().sum()

SelfEmployed                  0
CompanySize                  48
TechCompany                  48
TechRole                     48
MHcoverage                   48
AwarenessMHcareResources     74
ERdiscussedMHformally        48
ERofferMHresources           48
AnonymityProtected           48
MHpromptWorkLeave            48
MoreComfortCoworkerDisc      48
ComfortMHdiscSupervisor      48
DiscussMHwithER              48
ComfortMHdiscCoworker        48
DiscussMHwithCoworker        51
ERimportanceMH               48
CurrentMHdisorder             0
PastMHdx                      3
MHdoInterfereTreated          0
MHdoInterfereNotTreated       0
ObsLesslikelyRevealMHdx      70
MHunsupportedWP               0
MHsupportedWP                 0
TechSupportedMH               0
Age                           0
Gender                        5
WorkingCountry                0
WorkingState_Territory        0
MHcoverageR                  48
AwarenessMHcareResourcesR    74
ERdiscussedMHformallyR       48
ERofferM

In [50]:
# Drop, in place, every row of `mh` that still has a missing value in any column.
mh.dropna(axis=0, how="any", inplace=True)

In [51]:
# Confirm no missing values remain in `mh`.
mh.isna().sum()

SelfEmployed                 0
CompanySize                  0
TechCompany                  0
TechRole                     0
MHcoverage                   0
AwarenessMHcareResources     0
ERdiscussedMHformally        0
ERofferMHresources           0
AnonymityProtected           0
MHpromptWorkLeave            0
MoreComfortCoworkerDisc      0
ComfortMHdiscSupervisor      0
DiscussMHwithER              0
ComfortMHdiscCoworker        0
DiscussMHwithCoworker        0
ERimportanceMH               0
CurrentMHdisorder            0
PastMHdx                     0
MHdoInterfereTreated         0
MHdoInterfereNotTreated      0
ObsLesslikelyRevealMHdx      0
MHunsupportedWP              0
MHsupportedWP                0
TechSupportedMH              0
Age                          0
Gender                       0
WorkingCountry               0
WorkingState_Territory       0
MHcoverageR                  0
AwarenessMHcareResourcesR    0
ERdiscussedMHformallyR       0
ERofferMHresourcesR          0
Anonymit

## Data Wrangling Summary:

There are two datasets that are ready to be utilized for data exploration and analysis:

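1. mh = all columns (excluding Race and RaceR due to missing values), with rows containing any remaining missing values dropped
2. mh_race = all columns including Race, limited to rows with no missing values in any column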


# EXPLORATORY DATA ANALYSIS