# Exploratory Data Analysis - 1

In [1]:
import pandas as pd
import missingno as msn

import plotly.graph_objects as go
from ipywidgets import widgets

In [2]:
# Load the dataset used by every cell below. The path is relative, so it is
# resolved against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='./../../../datasets/consistent_data.csv')

## Univariate analysis

### Company-employed candidates

In [3]:
# Columns analysed for company-employed candidates. The first entry is the
# self-employment flag that the following cell filters on; the remaining
# entries are the employer-related questions offered in the dropdown.
ce_columns = [
    'are you self-employed?',
    'how many employees does your company or organization have?',
    'is your employer primarily a tech company/organization?',
    'is your primary role within your company related to tech/it?',
    'does your employer provide mental health benefits as part of healthcare coverage?',
    'do you know the options for mental health care available under your employer-provided health coverage?',
    'has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?',
    'does your employer offer resources to learn more about mental health disorders and options for seeking help?',
    'is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?',
    'if a mental health issue prompted you to request a medical leave from work, how easy or difficult would it be to ask for that leave?',
    'would you feel more comfortable talking to your coworkers about your physical health or your mental health?',
    'would you feel comfortable discussing a mental health issue with your direct supervisor(s)?',
    'have you ever discussed your mental health with your employer?',
    'would you feel comfortable discussing a mental health issue with your coworkers?',
    'have you ever discussed your mental health with coworkers?',
    "have you ever had a coworker discuss their or another coworker's mental health with you?",
    'overall, how much importance does your employer place on physical health?',
    'overall, how much importance does your employer place on mental health?',
]

In [4]:
# Rows where the self-employment flag is 0 (candidates working for an
# employer), restricted to the company-employed question columns.
df_ce = df.loc[df['are you self-employed?'] == 0, ce_columns]

In [5]:
# Row count of the company-employed subset.
print("Number of candidates who work for employers: " + str(len(df_ce)))

Number of candidates who work for employers: 1308


In [6]:
# Interactive bar chart for the company-employed subset: the dropdown picks a
# column of df_ce and the chart shows how often each answer occurs in it.
columnbox1 = widgets.Dropdown(
    description='Company employed candidate specifics',
    value='how many employees does your company or organization have?',
    options=list(df_ce.columns),
)

# Initial trace: answer counts for the dropdown's default column.
trace1 = go.Bar(
    x=tuple(df_ce['how many employees does your company or organization have?'].value_counts().index),
    y=tuple(df_ce['how many employees does your company or organization have?'].value_counts().values),
)

g1 = go.FigureWidget(
    data=[trace1],
    layout=go.Layout(
        title="Distribution",
        xaxis_title="how many employees does your company or organization have?",
        height=600,
    ),
)


def response1(change):
    """Redraw g1 for the column currently selected in columnbox1.

    `change` is the event passed by the widget observer; it is not read,
    the selected column is taken from columnbox1.value instead.
    """
    selected = columnbox1.value
    answer_counts = df_ce[selected].value_counts()
    with g1.batch_update():
        bar = g1.data[0]
        bar.x = tuple(answer_counts.index)
        bar.y = list(answer_counts.values)
        g1.layout.xaxis.title = selected


# Re-run the callback whenever the dropdown's selected value changes.
columnbox1.observe(response1, names="value")

container1 = widgets.HBox([columnbox1])
chart1 = widgets.VBox([container1, g1])

display(chart1)

VBox(children=(HBox(children=(Dropdown(description='Company employed candidate specifics', index=1, options=('…

### Self-employed candidates

In [7]:
# Columns analysed for self-employed candidates. The first entry is the
# self-employment flag that the following cell filters on; the remaining
# entries are the questions offered in the self-employed dropdown.
se_columns = [
    'are you self-employed?',
    'do you have medical coverage (private insurance or state-provided) that includes treatment of mental health disorders?',
    'do you know local or online resources to seek help for a mental health issue?',
    'if you have been diagnosed or treated for a mental health disorder, do you ever reveal this to clients or business contacts?',
    'if you have revealed a mental health disorder to a client or business contact, how has this affected you or the relationship?',
    'if you have been diagnosed or treated for a mental health disorder, do you ever reveal this to coworkers or employees?',
    'if you have revealed a mental health disorder to a coworker or employee, how has this impacted you or the relationship?',
    'do you believe your productivity is ever affected by a mental health issue?',
    'if yes, what percentage of your work time (time performing primary or secondary job functions) is affected by a mental health issue?',
]

In [8]:
# Rows where the self-employment flag is 1, restricted to the self-employed
# question columns.
df_se = df.loc[df['are you self-employed?'] == 1, se_columns]

In [9]:
# df_se contains only the rows whose self-employment flag is 1, so the label
# must describe self-employed candidates (the previous text was copied from
# the company-employed cell and described the opposite group).
print(f"Number of candidates who are self-employed: {df_se.shape[0]}")

Number of candidates who work for employers: 217


In [10]:
# Interactive bar chart for the self-employed subset: the dropdown picks a
# column of df_se and the chart shows how often each answer occurs in it.
columnbox2 = widgets.Dropdown(
    description='Self-employed candidate specifics',
    value='do you have medical coverage (private insurance or state-provided) that includes treatment of mental health disorders?',
    options=list(df_se.columns),
)

# Initial trace: answer counts for the dropdown's default column.
trace2 = go.Bar(
    x=tuple(df_se['do you have medical coverage (private insurance or state-provided) that includes treatment of mental health disorders?'].value_counts().index),
    y=tuple(df_se['do you have medical coverage (private insurance or state-provided) that includes treatment of mental health disorders?'].value_counts().values),
)

g2 = go.FigureWidget(
    data=[trace2],
    layout=go.Layout(
        title="Distribution",
        xaxis_title="do you have medical coverage (private insurance or state-provided) that includes treatment of mental health disorders?",
        height=600,
    ),
)


def response2(change):
    """Redraw g2 for the column currently selected in columnbox2.

    `change` is the event passed by the widget observer; it is not read,
    the selected column is taken from columnbox2.value instead.
    """
    selected = columnbox2.value
    answer_counts = df_se[selected].value_counts()
    with g2.batch_update():
        bar = g2.data[0]
        bar.x = tuple(answer_counts.index)
        bar.y = list(answer_counts.values)
        g2.layout.xaxis.title = selected


# Re-run the callback whenever the dropdown's selected value changes.
columnbox2.observe(response2, names="value")

container2 = widgets.HBox([columnbox2])
chart2 = widgets.VBox([container2, g2])

display(chart2)

VBox(children=(HBox(children=(Dropdown(description='Self-employed candidate specifics', index=1, options=('are…

### Common questions

In [11]:
# Columns common to all respondents: every column of df that is in neither
# ce_columns nor se_columns, with the age column left out as well.
#
# The list is built by filtering df.columns in order instead of converting a
# set difference to a list: the iteration order of a set of strings is not
# guaranteed to be the same from one interpreter session to the next, which
# made the dropdown built from these columns list its options in a different
# order after a kernel restart. Filtering keeps the column order of df.
_cc_excluded = set(ce_columns) | set(se_columns) | {'what is your age?'}
cc_columns = [column for column in df.columns if column not in _cc_excluded]

In [12]:
# All rows of df, restricted to the common-question columns.
df_cc = df.loc[:, cc_columns]

In [13]:
# Interactive bar chart for the common questions: the dropdown picks a column
# of df_cc and the chart shows how often each answer occurs in it.
# The dropdown label names the common-question group; it previously carried
# the label of the self-employed chart.
columnbox3 = widgets.Dropdown(description='Common question specifics',
                              value='do you have previous employers?',
                              options=list(df_cc.columns))

# Initial trace: answer counts for the dropdown's default column.
trace3 = go.Bar(x=tuple(df_cc['do you have previous employers?'].value_counts().index),
                y=tuple(df_cc['do you have previous employers?'].value_counts().values))

g3 = go.FigureWidget(data=[trace3],
                     layout=go.Layout(title="Distribution",
                                      xaxis_title="do you have previous employers?",
                                      height=600))


def response3(change):
    """Redraw g3 for the column currently selected in columnbox3.

    `change` is the event passed by the widget observer; it is not read,
    the selected column is taken from columnbox3.value instead.
    """
    selected = columnbox3.value
    # Count once and reuse the result for both axes.
    answer_counts = df_cc[selected].value_counts()
    with g3.batch_update():
        g3.data[0].x = tuple(answer_counts.index)
        g3.data[0].y = list(answer_counts.values)
        g3.layout.xaxis.title = selected


# Re-run the callback whenever the dropdown's selected value changes.
columnbox3.observe(response3, names="value")

container3 = widgets.HBox([columnbox3])
chart3 = widgets.VBox([container3, g3])

display(chart3)

VBox(children=(HBox(children=(Dropdown(description='Self-employed candidate specifics', options=('do you have …

## Multivariate Analysis

In [14]:
# Columns offered in the multivariate dropdown, where each one is compared
# between the treated and non-treated groups.
# Every entry must end with a comma: adjacent string literals are concatenated
# by Python, so a missing comma silently merges two column names into a single
# name that does not exist in df.
relevant_columns = [
    'are you self-employed?',
    'do you know the options for mental health care available under your employer-provided health coverage?',
    'do you currently have a mental health disorder?',
    'do you have medical coverage (private insurance or state-provided) that includes treatment of mental health disorders?',
    'do you know local or online resources to seek help for a mental health issue?',
    'have you ever been diagnosed with a mental health disorder?',
    'have you had a mental health disorder in the past?',
    'do you have a family history of mental illness?',
    'if you have a mental health disorder, how often do you feel that it interferes with your work when being treated effectively?',
    'how willing would you be to share with friends and family that you have a mental illness?',
    'would you bring up your mental health with a potential employer in an interview?',
    'are you openly identified at work as a person with a mental health issue?',
    'have you observed or experienced an unsupportive or badly handled response to a mental health issue in your current or previous workplace?',
    'would you be willing to talk to one of us more extensively about your experiences with mental health issues in the tech industry? (note that all interview responses would be used anonymously and only with your permission.)',
    'what is your gender?',
    'what country do you work in?',
]

In [15]:
# Interactive horizontal bar chart comparing, for the selected column, the
# answer counts of respondents who sought professional treatment for a mental
# health disorder against those who did not.
columnbox = widgets.Dropdown(description='Specific columns for analysis',
                             value='are you self-employed?',
                             options=relevant_columns)

treated_df = df[df['have you ever sought treatment for a mental health disorder from a mental health professional?']==1]
non_treated_df = df[df['have you ever sought treatment for a mental health disorder from a mental health professional?']==0]


def _top_answer_counts(frame, column, limit=7):
    """Return (answers, counts) for the `limit` most frequent answers.

    value_counts() sorts by descending frequency, so taking the head keeps
    the most common answers. Both results are tuples so they can be assigned
    directly to a bar trace.
    """
    counts = frame[column].value_counts().head(limit)
    return tuple(counts.index), tuple(counts.values)


# The initial traces go through the same helper as the callback, so the
# first render and later updates apply the same truncation.
_treated_answers, _treated_counts = _top_answer_counts(treated_df, columnbox.value)
_untreated_answers, _untreated_counts = _top_answer_counts(non_treated_df, columnbox.value)

trace1 = go.Bar(y=_treated_answers, x=_treated_counts,
                name="Took treatment", orientation="h")
trace2 = go.Bar(y=_untreated_answers, x=_untreated_counts,
                name="Did not take treatment", orientation="h")

# The bars are horizontal: answers sit on the y axis and counts on the x axis,
# so the question text labels the y axis.
g = go.FigureWidget(data=[trace1, trace2],
                    layout=go.Layout(title="Distribution with respect to treatment",
                                     xaxis_title="Number of people",
                                     yaxis_title=columnbox.value,
                                     height=600))


def response4(change, figure=g):
    """Redraw the treatment comparison for the column selected in columnbox.

    `change` is the event passed by the widget observer; it is not read.
    `figure` is bound to this cell's FigureWidget when the function is
    defined, so the callback keeps updating this chart even if the
    module-level name `g` is assigned to another figure afterwards.
    """
    selected = columnbox.value
    treated_answers, treated_counts = _top_answer_counts(treated_df, selected)
    untreated_answers, untreated_counts = _top_answer_counts(non_treated_df, selected)
    with figure.batch_update():
        figure.data[0].y = treated_answers
        figure.data[0].x = treated_counts
        figure.data[1].y = untreated_answers
        figure.data[1].x = untreated_counts
        figure.layout.yaxis.title = selected
        figure.layout.xaxis.title = "Number of people"


# Re-run the callback whenever the dropdown's selected value changes.
columnbox.observe(response4, names="value")

container = widgets.HBox([columnbox])
chart = widgets.VBox([container, g])

display(chart)

VBox(children=(HBox(children=(Dropdown(description='Specific columns for analysis', options=('are you self-emp…

In [16]:
# Importance-rating columns, physical health first and mental health second,
# for the current employer and for the previous employer respectively.
current_em_columns = [
    'overall, how much importance does your employer place on physical health?',
    'overall, how much importance does your employer place on mental health?',
]
previous_em_columns = [
    'overall, how much importance did your previous employer place on physical health?',
    'overall, how much importance did your previous employer place on mental health?',
]

In [17]:
# Interactive horizontal bar chart of employer importance ratings (physical
# vs. mental health). One dropdown selects the treated or non-treated group,
# the other selects the current or the previous employer.
treatmentbox = widgets.Dropdown(description='Treated',
                                value='Took treatment',
                                options=["Took treatment", "Did not take treatment"])

emtimebox = widgets.Dropdown(description='Time of employment',
                             value='Current',
                             options=["Current", "Previous"])

# The two groups are filtered once here and reused by the callback instead of
# re-filtering df on every dropdown change.
treated_df = df[df['have you ever sought treatment for a mental health disorder from a mental health professional?']==1]
non_treated_df = df[df['have you ever sought treatment for a mental health disorder from a mental health professional?']==0]

# Initial view matches the dropdown defaults: treated group, current employer.
_initial_physical = treated_df[current_em_columns[0]].value_counts()
_initial_mental = treated_df[current_em_columns[1]].value_counts()

trace1 = go.Bar(y=tuple(_initial_physical.index),
                x=tuple(_initial_physical.values),
                name="Physical health importance", orientation="h")
trace2 = go.Bar(y=tuple(_initial_mental.index),
                x=tuple(_initial_mental.values),
                name="Mental health importance", orientation="h")

g = go.FigureWidget(data=[trace1, trace2],
                    layout=go.Layout(title="Distribution with respect to importance",
                                     yaxis_title="Importance",
                                     xaxis_title="Number of people",
                                     height=600))


def response5(change, figure=g):
    """Redraw the importance chart for the current dropdown selections.

    `change` is the event passed by the widget observer; it is not read,
    both selections are taken from the dropdown widgets. `figure` is bound
    to this cell's FigureWidget when the function is defined, so the callback
    keeps targeting this chart even if `g` is assigned again later.
    """
    if emtimebox.value == "Current":
        columns = current_em_columns
    else:
        columns = previous_em_columns

    if treatmentbox.value == "Took treatment":
        temp_df = treated_df
    else:
        temp_df = non_treated_df

    # columns[0] is the physical-health rating, columns[1] the mental-health one.
    physical_counts = temp_df[columns[0]].value_counts()
    mental_counts = temp_df[columns[1]].value_counts()

    with figure.batch_update():
        figure.data[0].y = tuple(physical_counts.index)
        figure.data[0].x = tuple(physical_counts.values)
        figure.data[1].y = tuple(mental_counts.index)
        figure.data[1].x = tuple(mental_counts.values)
        figure.layout.yaxis.title = "Importance"
        figure.layout.xaxis.title = "Number of people"


# Either dropdown changing triggers a redraw.
treatmentbox.observe(response5, names="value")
emtimebox.observe(response5, names="value")

container = widgets.HBox([treatmentbox, emtimebox])
chart = widgets.VBox([container, g])

display(chart)

VBox(children=(HBox(children=(Dropdown(description='Treated', options=('Took treatment', 'Did not take treatme…