In [41]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline

# Libraries

Data preparation

In [19]:
import chardet

with open(r"C:\Users\rabbi\DataAnalysis\Parental Leave Policies\Data\parental_leave.csv", 'rb') as f:
    result = chardet.detect(f.read(10000))  
    print(result)

# First, let's check what encoding I need to read the data in with
# Reads the first 10,000 bytes to detect encoding

{'encoding': 'Windows-1252', 'confidence': 0.7279660342311264, 'language': ''}


In [23]:
data = pd.read_csv(r"C:\Users\rabbi\DataAnalysis\Parental Leave Policies\Data\parental_leave.csv", 
                   encoding='windows-1252', index_col=0)
# Reading the data


In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1601 entries, Epsilon to Merrimac Industrial Sales
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Industry                1598 non-null   object 
 1   Paid Maternity Leave    1601 non-null   float64
 2   Unpaid Maternity Leave  1494 non-null   float64
 3   Paid Paternity Leave    289 non-null    float64
 4   Unpaid Paternity Leave  64 non-null     float64
 5   Unnamed: 6              0 non-null      float64
 6   Unnamed: 7              0 non-null      float64
 7   Unnamed: 8              0 non-null      float64
 8   Unnamed: 9              0 non-null      float64
dtypes: float64(8), object(1)
memory usage: 125.1+ KB


Here we have four unnamed columns with all NaN values. Looking at the csv, it's likely because of the way its formatted, with 4 extra commas at the end of each line. Let's get rid of it.

In [34]:
data = pd.read_csv(r"C:\Users\rabbi\DataAnalysis\Parental Leave Policies\Data\parental_leave.csv", 
                   encoding='windows-1252', index_col=None)
data = data.dropna(axis=1, how='all')  
# Drop columns where all values are NaN
# Make sure Company column isn't used as the index


In [31]:
data.head()

Unnamed: 0,Company,Industry,Paid Maternity Leave,Unpaid Maternity Leave,Paid Paternity Leave,Unpaid Paternity Leave
0,Epsilon,Advertising,6.0,6.0,6.0,6.0
1,The Walt Disney Company,Arts & Entertainment,5.0,4.0,4.5,4.0
2,Guild Education,Business Services: Other,14.0,0.0,8.0,4.0
3,WeWork,Business Services: Other,14.0,2.0,16.0,4.0
4,Randstad USA,Business Services: Staffing & Outsourcing,5.0,7.0,0.0,0.0


In [28]:
data.columns

Index(['Industry', 'Paid Maternity Leave', 'Unpaid Maternity Leave',
       'Paid Paternity Leave', 'Unpaid Paternity Leave',
       'Total Paid Parental Leave'],
      dtype='object')

This is what the data looks like, we can start analysis.

-- Which companies offer the most paid parental leave weeks?

In [32]:
data['Total Paid Parental Leave'] = data['Paid Maternity Leave'] + data['Paid Paternity Leave']
top_companies = data[['Company', 'Total Paid Parental Leave']].sort_values(by='Total Paid Parental Leave', ascending=False).head(5)

In [33]:
top_companies

Unnamed: 0,Company,Total Paid Parental Leave
47,Grant Thornton,102.0
148,LAC-Group,67.5
129,Flatiron Health,60.0
248,Salesforce,52.0
188,Bill and Melinda Gates Foundation,52.0


-- Is maternity leave typically longer than paternity leave?

In [37]:
avg_paid_maternity = data['Paid Maternity Leave'].mean()
avg_paid_paternity = data['Paid Paternity Leave'].mean()

averages = pd.DataFrame({
    'Avg Paid Maternity Leave': [avg_paid_maternity],
    'Avg Paid Paternity Leave': [avg_paid_paternity]
})

In [38]:
averages

Unnamed: 0,Avg Paid Maternity Leave,Avg Paid Paternity Leave
0,10.909119,7.33218


-- What is the distribution of parental leave weeks offered?

In [39]:
distribution_data = data[['Company', 'Paid Maternity Leave', 'Unpaid Maternity Leave']]
sorted_data = distribution_data.sort_values(by='Company', ascending=True)

In [40]:
sorted_data

Unnamed: 0,Company,Paid Maternity Leave,Unpaid Maternity Leave
465,Braum's Ice Cream & Dairy Store,3.0,1.0
1498,Charter Communications,8.0,12.0
870,1010data,12.0,8.0
499,10up,4.0,12.0
373,24-7 Intouch,0.0,
...,...,...,...
587,iRobot,15.0,0.0
386,nFuzion,4.0,0.0
144,nVent,6.0,0.0
1458,panOpen,2.0,0.0


In [42]:
stats = data[['Paid Maternity Leave', 'Unpaid Maternity Leave']].describe()

In [43]:
stats

Unnamed: 0,Paid Maternity Leave,Unpaid Maternity Leave
count,1601.0,1494.0
mean,10.909119,6.628849
std,8.024514,9.274953
min,0.0,0.0
25%,6.0,0.0
50%,11.0,4.0
75%,12.0,10.0
max,52.0,52.0
