# Importing necessary libraries.

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings as ws
ws.filterwarnings("ignore")

# Importing the required dataset.

In [6]:
data=pd.read_csv('./survey.csv')

In [7]:
data.shape

(1259, 27)

We have 1259 rows of data with each of 27 columns including features.

In [8]:
data.head(5)

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,2014-08-27 11:29:37,44,M,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,


# Finding the null values in the dataset.

In [9]:
data.isnull().sum()

Timestamp                       0
Age                             0
Gender                          0
Country                         0
state                         515
self_employed                  18
family_history                  0
treatment                       0
work_interfere                264
no_employees                    0
remote_work                     0
tech_company                    0
benefits                        0
care_options                    0
wellness_program                0
seek_help                       0
anonymity                       0
leave                           0
mental_health_consequence       0
phys_health_consequence         0
coworkers                       0
supervisor                      0
mental_health_interview         0
phys_health_interview           0
mental_vs_physical              0
obs_consequence                 0
comments                     1095
dtype: int64

We shall not replace null values in column "comments" because more than 85% of data is missing.Hence it will not contribute to our analysis.Even column 'State' has 50% null values,hence we will drop those two columns.

# Dropping the columns.

In [10]:
data.drop(columns=['state','comments'],inplace=True)

# Imputing the null values.

In [11]:
print(data['self_employed'].value_counts())
data['self_employed'].fillna('No',inplace=True)

No     1095
Yes     146
Name: self_employed, dtype: int64


In [12]:
print(data['work_interfere'].value_counts())
data['work_interfere'].fillna('Sometimes',inplace=True)

Sometimes    465
Never        213
Rarely       173
Often        144
Name: work_interfere, dtype: int64


In [13]:
data.isnull().sum()

Timestamp                    0
Age                          0
Gender                       0
Country                      0
self_employed                0
family_history               0
treatment                    0
work_interfere               0
no_employees                 0
remote_work                  0
tech_company                 0
benefits                     0
care_options                 0
wellness_program             0
seek_help                    0
anonymity                    0
leave                        0
mental_health_consequence    0
phys_health_consequence      0
coworkers                    0
supervisor                   0
mental_health_interview      0
phys_health_interview        0
mental_vs_physical           0
obs_consequence              0
dtype: int64

In [14]:
data.columns

Index(['Timestamp', 'Age', 'Gender', 'Country', 'self_employed',
       'family_history', 'treatment', 'work_interfere', 'no_employees',
       'remote_work', 'tech_company', 'benefits', 'care_options',
       'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence'],
      dtype='object')

# Exploratory Data Analysis.

## 1.Univariate Analysis.

In [15]:
sns.boxplot(data['Age'])

<AxesSubplot:xlabel='Age'>

From this we can clearly see that 'Age' column has many outliers.

# Removing outliers.

In [16]:
data.drop(data[data['Age'] < 0].index, inplace = True) 
data.drop(data[data['Age'] > 100].index, inplace = True) 

In [17]:
data.shape

(1254, 25)

We find that we have removed 5 columns proved to be outliers.

In [18]:
sns.boxplot(data['Age'])

<AxesSubplot:xlabel='Age'>

This shows us that the median age is around 30 which we have taken for analysis.

In [19]:
data['Gender']=[m.lower() for m in data['Gender']]
data['Gender'].value_counts()

male                                              818
female                                            183
m                                                 150
f                                                  53
woman                                               4
make                                                4
male                                                3
cis male                                            3
female (trans)                                      2
man                                                 2
female                                              2
a little about you                                  1
neuter                                              1
cis female                                          1
fluid                                               1
queer                                               1
femake                                              1
enby                                                1
male leaning androgynous    

We have a majority of males in our analysis compared to females and other categories.

In [20]:
plt.pie(data['Country'].value_counts(),labels=data['Country'].unique(),labeldistance=1.1)

([<matplotlib.patches.Wedge at 0x7f8d59aa2650>,
  <matplotlib.patches.Wedge at 0x7f8d59a85bd0>,
  <matplotlib.patches.Wedge at 0x7f8d59aa2950>,
  <matplotlib.patches.Wedge at 0x7f8d59aa2e90>,
  <matplotlib.patches.Wedge at 0x7f8d59ab3490>,
  <matplotlib.patches.Wedge at 0x7f8d59ab3a10>,
  <matplotlib.patches.Wedge at 0x7f8d59ab3f10>,
  <matplotlib.patches.Wedge at 0x7f8d59ac1490>,
  <matplotlib.patches.Wedge at 0x7f8d59ab3350>,
  <matplotlib.patches.Wedge at 0x7f8d59ab3990>,
  <matplotlib.patches.Wedge at 0x7f8d48f86950>,
  <matplotlib.patches.Wedge at 0x7f8d59ad1850>,
  <matplotlib.patches.Wedge at 0x7f8d59ad1d90>,
  <matplotlib.patches.Wedge at 0x7f8d59ade310>,
  <matplotlib.patches.Wedge at 0x7f8d59ade850>,
  <matplotlib.patches.Wedge at 0x7f8d59aded90>,
  <matplotlib.patches.Wedge at 0x7f8d59aed310>,
  <matplotlib.patches.Wedge at 0x7f8d59aed850>,
  <matplotlib.patches.Wedge at 0x7f8d59aedd90>,
  <matplotlib.patches.Wedge at 0x7f8d59afc310>,
  <matplotlib.patches.Wedge at 0x7f8d59a

We find that the majority of our cases of analysis is from United States and Canada.So we are more focussing on the western world.

# Bivariate and Multivariate Analysis.

Using country as the first criteria for analysis.

In [21]:
country=data.groupby(data['Country'])

In [22]:
country['Age'].aggregate(np.median).sort_values()

Country
Bahamas, The               8.0
Slovenia                  19.0
Georgia                   20.0
India                     23.0
Austria                   24.0
Bosnia and Herzegovina    25.0
Nigeria                   25.0
Uruguay                   26.0
Russia                    26.0
Moldova                   26.0
Israel                    26.0
Sweden                    27.0
Hungary                   27.0
Australia                 27.0
Colombia                  27.0
Bulgaria                  27.5
France                    28.0
Brazil                    28.0
Canada                    29.0
Romania                   29.0
Latvia                    29.0
Poland                    29.0
Singapore                 29.5
Spain                     30.0
Belgium                   30.0
Switzerland               30.0
Germany                   30.0
New Zealand               30.0
Philippines               31.0
Mexico                    31.0
United Kingdom            31.0
Ireland                   32.0


We find that the median age of Japan is the highest with 49.0 yrs followed by Thialand and China.Lets see if this has any affect on mental health.

In [23]:
country['treatment','remote_work','self_employed'].describe()

Unnamed: 0_level_0,treatment,treatment,treatment,treatment,remote_work,remote_work,remote_work,remote_work,self_employed,self_employed,self_employed,self_employed
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
Country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Australia,21,2,Yes,13,21,2,No,13,21,2,No,19
Austria,3,1,No,3,3,2,Yes,2,3,2,No,2
"Bahamas, The",1,1,Yes,1,1,1,Yes,1,1,1,Yes,1
Belgium,6,2,No,5,6,1,No,6,6,1,No,6
Bosnia and Herzegovina,1,1,No,1,1,1,Yes,1,1,1,No,1
Brazil,6,2,No,4,6,2,No,5,6,2,No,4
Bulgaria,4,2,No,2,4,2,Yes,3,4,1,No,4
Canada,72,2,Yes,37,72,2,No,50,72,2,No,61
China,1,1,No,1,1,1,Yes,1,1,1,Yes,1
Colombia,2,1,No,2,2,1,No,2,2,1,No,2


From this we find out that countries like United states,Australia,United Kingdom and Canada have more than 50% of surveyed people who undergo a treatment for mental problems.

We also find that in the above countries, majority of people dont work at home or remotely which could be the reason for boredom ,anxiety and hence degrading mental health.

In [24]:
s_employ=data.groupby(['self_employed'])
s_employ['treatment'].describe()

Unnamed: 0_level_0,count,unique,top,freq
self_employed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,1110,2,Yes,557
Yes,144,2,Yes,76


We find regardless of whether a person is self_employed or not,it has no affect on the person recieving treatment for mental trauma.

In [25]:
treat=data.groupby(['treatment'])
treat['Age'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,621.0,31.458937,7.084081,5.0,27.0,31.0,35.0,65.0
Yes,633.0,32.56872,7.615281,8.0,27.0,32.0,37.0,72.0


We find that mean age of people recieving treatment is 32.5 years and there are some people who recieve aid at a small age of even 8yrs and sometimes even as old as 72yrs.

So we can see that the adult age faces more mental health issues compared to other phases but other phases are also not too free from this.

In [26]:
data.columns

Index(['Timestamp', 'Age', 'Gender', 'Country', 'self_employed',
       'family_history', 'treatment', 'work_interfere', 'no_employees',
       'remote_work', 'tech_company', 'benefits', 'care_options',
       'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence'],
      dtype='object')

In [27]:
treat['tech_company','work_interfere'].describe()

Unnamed: 0_level_0,tech_company,tech_company,tech_company,tech_company,work_interfere,work_interfere,work_interfere,work_interfere
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
treatment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
No,621,2,Yes,517,621,4,Sometimes,366
Yes,633,2,Yes,511,633,4,Sometimes,361


From this,we get to know that people who work in tech are not so much mentally affected by their work.So type of work does not affect the mental health.

It is important to note that we are considering people who take medications or treatment as the people who are deeply mentally stressed .There may be cases where the health has degraded but may not prefer any medication.We are ignoring such cases.

We also find that both people who may or may not undergo treatment both agree that their mental health affect their productivity in work sometimes.

In [28]:
sns.barplot(data['leave'].unique(),data['leave'].value_counts())

<AxesSubplot:xlabel='Age', ylabel='leave'>

From this we can find that people find it somewhat on a easier side to get a leave sanctioned for mental health reasons because employers feel that mental health demands immense importance than work.
The company may sometimes deem to be responsible if its employeers health degrades.Hence companies dont take any risks which could be one of the prime reasons.

In [29]:
sns.barplot(data['mental_health_consequence'].unique(),data['mental_health_consequence'].value_counts())

<AxesSubplot:xlabel='Age', ylabel='mental_health_consequence'>

In [30]:
sns.barplot(data['phys_health_consequence'].unique(),data['phys_health_consequence'].value_counts())

<AxesSubplot:xlabel='Age', ylabel='phys_health_consequence'>

It is a good to see that people dont feel that discussing a mental health issue with your employer would have negative consequences. So people are ready to share their issues so that they feel light on heart.
The same is the case with physical health too.

In [31]:
plt.pie(data['coworkers'].value_counts(),labels=data['coworkers'].unique())
data['coworkers'].value_counts()

Some of them    772
No              258
Yes             224
Name: coworkers, dtype: int64

So people prefer to share about their mental health with only some of their coworkers or sometimes dont even want to share sometimes because there may be coworkers who would just empathize and demotivate even more.

In [32]:
print(data['mental_vs_physical'].value_counts())
plt.hist(data['mental_vs_physical'],histtype='step')

Don't know    574
Yes           342
No            338
Name: mental_vs_physical, dtype: int64


(array([342.,   0.,   0.,   0.,   0., 574.,   0.,   0.,   0., 338.]),
 array([0. , 0.2, 0.4, 0.6, 0.8, 1. , 1.2, 1.4, 1.6, 1.8, 2. ]),
 [<matplotlib.patches.Polygon at 0x7f8d49042a50>])

So people dont know exactly whether employer would consider mental health as serious as a physical one .The ambiguity remains how people in a office would react to mental illness.

# Conclusion and Inferences.

1.This case study is majorly based on cases in the Western world.

2.Cases show that more than 50% of people surveyed in countries like US,Australia and Canada undergo treatment for mental ailments.

3.People who are not more prone to work at home are usually bored and filled with anxiety leading to degradation in mental health.

4.People who are in the early 30's usually undergo treatment but there are extreme cases like 8 years and 72 years people recieving the same treatment.

5.It is interesting to find that people face mental trauma regardless of whether they are self employed or not.

6.The surveyed people agree that their mental health somewhat affects their productivity at work. 

7.People feel that their employers somewhat easily sanction leave for mental health issues.The reason maybe that the employer does not want to take any risk of overloading the patient with work.

8.People feel that sharing about their mental or physical health with employers would help them a bit but they are reluctant to share the same with their coworkers.They would prefer to share with only some of the coworkers.

9.People dont know whether the employer considers mental health issues as seriously as the physical ones.The ambiguity still remains about people's reaction towards mental health.

## PLEASE DONT FORGET TO UPVOTE!!!