# Day 7 - `14-JUL-2020`

# HR Attrition Case Study

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## Reading the Data

In [None]:
# Load the raw HR dataset and preview the first rows,
# transposed so that every column is visible as a row.
df = pd.read_csv('general_data.csv')
df.head().transpose()

# Features of the Data

In [None]:
# Column dtypes and non-null counts; info() prints its report and returns None.
df.info()

There are 4410 rows and 24 columns

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# Summary statistics of the numeric columns, transposed so each column is one row.
df.describe().transpose()

In [None]:
# Names of all columns in the frame
df.columns

# Cleaning the Data

In [None]:
# Heatmap of the null mask (df.isnull()) to see where missing values sit.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df.isnull(), cbar=False, cmap="YlGnBu", ax=ax);

In [None]:
# Missing-value count per column, largest first.
df.isna().sum().sort_values(ascending=False)

#### We can see that `NumCompaniesWorked` and `TotalWorkingYears` have missing values.

We handle the missing values by forward-filling: each missing entry is replaced with the last valid observation that precedes it in the same column.

In [None]:
# Columns inspected below for missing values
cols_na = ["NumCompaniesWorked","TotalWorkingYears"]

#### Below are the row indices that contain missing values

In [None]:
# Rows holding at least one null in any column, restricted to the columns of interest.
rows_with_null = df.isnull().any(axis=1)
df_na = df.loc[rows_with_null, cols_na]
df_na.index

In [None]:
# Forward-fill: replace each missing value with the last valid observation that
# precedes it in the same column.
# `DataFrame.ffill()` replaces the deprecated `fillna(method='ffill')`, and
# rebinding `df` avoids `inplace=True` while keeping the name later cells use.
# Note: a null in the very first row has no preceding value and stays null.
df = df.ffill()

In [None]:
# Re-display the rows that previously contained nulls to confirm they were filled.
# `df_na.index` holds index *labels*, so select with label-based `.loc`;
# positional `.iloc` only matched because the frame uses the default RangeIndex.
df.loc[df_na.index, cols_na]

We check further on the counts using describe to see if counts are same for all columns or not :

In [None]:
# Non-null count of every numeric column, ascending.
df.describe().loc['count'].sort_values()

We can see now that there are no more missing values. <br> Now we will check if there are any duplicate rows - if yes those rows need to be removed.

In [None]:
# True if at least one row is an exact duplicate of an earlier row
df.duplicated().any()

# Univariate Analysis

## Attrition distribution

In [None]:
# Row count per Attrition value
df['Attrition'].value_counts()

In [None]:
# Fraction of rows per Attrition value (class count divided by the non-null total).
attrition_percent = df['Attrition'].value_counts().div(df['Attrition'].count())

In [None]:
# Fraction of rows per Attrition value
attrition_percent

In [None]:
# Pie chart of the Attrition split. The trailing semicolon suppresses the Axes
# repr in the cell output, consistent with the other plotting cells.
df['Attrition'].value_counts().plot(kind='pie', autopct="%1.1f%%", shadow=True, startangle=90);

### We can see that in the dataset 16.1% of employees attrited and the rest did not.

In [None]:
# Gender split as a pie chart with percentage labels.
df['Gender'].value_counts().plot.pie(autopct="%1.1f%%");

### In the dataset, 60% of employees are male and 40% are female.


In [None]:
# Row count per gender, split by attrition outcome.
sns.countplot(data=df, x='Gender', hue='Attrition');

In [None]:
# Share of each stock option level as a pie chart with percentage labels.
df['StockOptionLevel'].value_counts().plot.pie(autopct="%1.1f%%");

In [None]:
# Row count per stock option level, split by attrition outcome.
sns.countplot(data=df, x='StockOptionLevel', hue='Attrition');

#### It can be seen that attrition is high if StockOptionLevel is less than 2.

In [None]:
# Share of each department as a pie chart with percentage labels.
df['Department'].value_counts().plot.pie(autopct="%1.1f%%");

In [None]:
# Row count per department, split by attrition outcome.
sns.countplot(data=df, x='Department', hue='Attrition');

#### Employees from the Research & Development department have a higher attrition rate compared to other departments.

##### Select columns and analyze
<br> Select columns that have non-categorical and non-ordinal data like - <br> 'Age','DistanceFromHome','MonthlyIncome', 'NumCompaniesWorked', 'PercentSalaryHike','TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany','YearsSinceLastPromotion', 'YearsWithCurrManager'
 

In [None]:
# Numeric columns used for the univariate statistics below.
uni_cols = [
    'Age',
    'DistanceFromHome',
    'MonthlyIncome',
    'NumCompaniesWorked',
    'PercentSalaryHike',
    'TotalWorkingYears',
    'TrainingTimesLastYear',
    'YearsAtCompany',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]

In [None]:
# Descriptive statistics of the selected columns (one row per column),
# extended with the inter-quartile range (75th minus 25th percentile).
df_desc = (
    df[uni_cols]
    .describe()
    .T
    .assign(IQR=lambda stats: stats['75%'] - stats['25%'])
)
df_desc

#### Now below we choose only those cases where the `Attrition` = Yes

In [None]:
# Same descriptive statistics, restricted to rows where Attrition is 'Yes',
# again extended with the inter-quartile range.
df_desc_yes = (
    df.loc[df['Attrition'] == 'Yes', uni_cols]
    .describe()
    .T
    .assign(IQR=lambda stats: stats['75%'] - stats['25%'])
)
df_desc_yes

#####  - The inter-quartile range calculated above shows that attrition occurs across a wide range of salaries (IQR ~ 42K)
#####  - Also, it can be seen that the mean salary of those who attrited is ~61K

### Median

In [None]:
# Median of every selected column across all rows.
df_med = df.loc[:, uni_cols].median()
df_med

In [None]:
# Median of every selected column for rows where Attrition is 'Yes'.
df_med_yes = df.loc[df['Attrition'] == 'Yes', uni_cols].median()
df_med_yes

In [None]:
# Median of every selected column for rows where Attrition is 'No'.
df_med_no = df.loc[df['Attrition'] == 'No', uni_cols].median()
df_med_no

### Mode

In [None]:
# Mode(s) of each selected column; transposed so each column becomes one row.
df_mode = df.loc[:, uni_cols].mode()
df_mode.T

In [None]:
# Mode values for rows where Attrition is 'Yes'; only the first mode of each
# column is shown.
df_mode_yes = df.loc[df['Attrition'] == 'Yes', uni_cols].mode()
df_mode_yes.T.iloc[:, 0]


##### We can see that most employees are leaving at age of 29 and who have a `PercentSalaryHike` (13) lower than median (14)

### Variance

In [None]:
# Variance of each selected column, smallest first
df_var=df[uni_cols].var().sort_values()
df_var

### Skewness

In [None]:
# Skewness of each selected column, smallest first (positive = longer right tail)
df_skew=df[uni_cols].skew().sort_values()
df_skew

### Kurtosis

In [None]:
# Kurtosis of each selected column, smallest first.
# pandas reports excess (Fisher) kurtosis, so a normal distribution scores 0.
df_kurt = df[uni_cols].kurt().sort_values()
df_kurt

# Inference 

- It can be noted above that we calculate the Inter Quartile range and see that attrition is among a wide range of salary ~ 42K

- All the above variables show positive skewness; while `Age` & `DistanceFromHome` are leptokurtic and all other variables are platykurtic.

- We can see that most employees are leaving at age of 29 and who have a PercentSalaryHike (13) lower than median (14)

- Mean age forms a near normal distribution with 13 years of IQR for the overall sample and 11 years of IQR for those who attrited


# Further Outliers Check

In [None]:
# Box plot of Age for all rows.
# The Series is passed as `x=`: seaborn 0.11 warns on positional vectors and
# seaborn >= 0.12 treats the first positional argument as `data`.
box_plot = df.Age
sns.boxplot(x=box_plot);

##### Age is normally distributed without any outliers 

In [None]:
# Box plot of Age for rows where Attrition is 'Yes'.
# The Series is passed as `x=`: seaborn 0.11 warns on positional vectors and
# seaborn >= 0.12 treats the first positional argument as `data`.
box_plot_attrited = df[df['Attrition'] == 'Yes'].Age
sns.boxplot(x=box_plot_attrited);

##### Age of attrited employees is right-skewed, and there are a few outliers between 55 and 60. The same is depicted in the distribution plot below.


In [None]:
# Distribution (density histogram + KDE curve) of Age for rows where Attrition is 'Yes'.
# `sns.distplot` is deprecated since seaborn 0.11; seaborn's recommended
# replacement is `histplot(..., kde=True, stat="density")` (needs seaborn >= 0.11).
sns.histplot(
    x=df.loc[df['Attrition'] == 'Yes', 'Age'],
    kde=True,
    stat="density",
    kde_kws=dict(cut=3),
);

In [None]:
# Box plot of MonthlyIncome for all rows.
# The Series is passed as `x=`: seaborn 0.11 warns on positional vectors and
# seaborn >= 0.12 treats the first positional argument as `data`.
box_plot = df.MonthlyIncome
sns.boxplot(x=box_plot);

##### Monthly Income is Right skewed with several outliers 

In [None]:
# Box plot of MonthlyIncome for rows where Attrition is 'Yes'.
# The Series is passed as `x=`: seaborn 0.11 warns on positional vectors and
# seaborn >= 0.12 treats the first positional argument as `data`.
box_plot_attrited = df[df['Attrition'] == 'Yes'].MonthlyIncome
sns.boxplot(x=box_plot_attrited);

##### Monthly income of attrited individuals looks normally distributed - but there are outliers, showing that many individuals with high salaries are leaving the company

In [None]:
# Box plot of YearsAtCompany for all rows.
# The Series is passed as `x=`: seaborn 0.11 warns on positional vectors and
# seaborn >= 0.12 treats the first positional argument as `data`.
box_plot = df.YearsAtCompany
sns.boxplot(x=box_plot);

##### Years at company is also Right Skewed with several outliers observed. 

In [None]:
# Box plot of YearsAtCompany for rows where Attrition is 'Yes'.
# The Series is passed as `x=`: seaborn 0.11 warns on positional vectors and
# seaborn >= 0.12 treats the first positional argument as `data`.
box_plot_attrited = df[df['Attrition'] == 'Yes'].YearsAtCompany
sns.boxplot(x=box_plot_attrited);

##### Even for attrited employees, the distribution of years at company is right-skewed with several outliers

<br>