In [1]:
# Required imports for the Project
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Loading in the Dataset
df_attrition_data = pd.read_csv(r'./IBM Attrition Data.csv')

In [None]:
df_attrition_data.head()

In [None]:
# Exploring to better understand the Dataset
print(f'size: {df_attrition_data.size}')
print(f'shape: {df_attrition_data.shape}')
print(f'columns: {df_attrition_data.columns}')
# From exploring we observe that the response is the attrition column, whilst the others are the features

### EXPLORATORY DATA ANALYSIS 

##### 1)Find the age distribution of employees in IBM: Through an Histogram

In [None]:
# Extracting the age column from the dataset
ages_of_employees = df_attrition_data['Age']

In [None]:
# define the x_axis
x_axis = ages_of_employees

In [None]:
# Additional imports for the plot
from matplotlib import style
%matplotlib inline

In [None]:
# Configuration for the Histogram plot and creating the plot
style.use('ggplot')
plt.figure(figsize=(10, 10))
h = plt.hist(x_axis, bins=42, facecolor='g')
plt.xlim(17, 61)
plt.ylim(0, 80)
plt.title('AGE DISTRIBUTION OF EMPLOYEES IN IBM')
plt.xlabel('Age')
plt.ylabel('Number of Employees')
plt.show()

##### 2)Explore attrition by age: Through a Grouped Bar Chart

In [None]:
# First the age and the Attrition columns are extracted
df_attrition_by_age = df_attrition_data[['Attrition', 'Age']]
df_attrition_by_age.head()

In [None]:
# Extracting the age and the corresponding number of attrition status(Yes and No) into lists.
employee_ages = df_attrition_by_age.groupby('Age')
age_labels = []
attrition_value_yes = []
attrition_value_no = []
for age in range(18, 61):
    each_age = employee_ages.get_group(age)
    age_labels.append(age)
    attrition_value_yes.append(len(each_age[each_age.Attrition == "Yes"]))
    attrition_value_no.append(len(each_age[each_age.Attrition == "No"]))

In [None]:
# Addtional imports for the plot
from matplotlib import style
%matplotlib inline

In [None]:
# Creating and configuring the Grouped Bar Chart
x = np.arange(len(age_labels))
width = 0.5
style.use('ggplot')
plt.figure(figsize=(13, 7), tight_layout=True)
plt.ylim(0, 70)
yesBar = plt.bar(x - width/2, attrition_value_yes, width, label='Yes')
noBar = plt.bar(x + width/2, attrition_value_no, width, label='No')
plt.xticks(x, age_labels)
plt.legend()
plt.title('LEVEL OF ATTRITION OF EMPLOYEES PER AGE')
plt.xlabel('Age')
plt.ylabel('Number of Employees')
plt.bar_label(yesBar, padding=3)
plt.bar_label(noBar, padding=3)
plt.show()

##### 3)EXPLORE THE DATA FOR LEFT EMPLOYEES

In [None]:
# Interpreting this as the data for employees that have left the company that is Employees with an attrtion status of YES.
# Extracting the data for employees with an attrition status 'Yes'
employee_attrition_status = df_attrition_data.groupby('Attrition')
left_employee = employee_attrition_status.get_group('Yes')

In [None]:
# Viewing the shape
left_employee.shape

In [None]:
# Viewing the size
left_employee.size

In [None]:
# Viewing the first five records
left_employee.head()

In [None]:
# Exploring various stats of each column
left_employee.describe()

In [None]:
# Exploring the correlations between the columns
correlations = left_employee.corr()
correlations

In [None]:
# Generating a heatmap to better explore the data of the left employees
plt.figure(figsize=(8, 8))
sns.heatmap(data=correlations, square=True, cmap='bwr')
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.show()

##### 4)DISTRIBUTION OF EMPLOYEES BY THE EDUCATION FIELD

In [None]:
# Ordering the data by the Education fields
employee_education_field = df_attrition_data.groupby('EducationField')
# Viewing all fields
employee_education_field.size()

In [None]:
fields = ['Human Resources', 'Life Sciences',
          'Marketing', 'Medical', 'Technical Degree', 'Other']
no_of_employees = [len(employee_education_field.get_group(i)) for i in fields]

In [None]:
# Additional imports for the plot
from matplotlib import style
%matplotlib inline

In [None]:
style.use('ggplot')

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
edu_bar = ax.bar(fields, no_of_employees, facecolor='g')

# Set title
ax.set_title("DISTRIBUTION OF EMPLOYEES BY EDUCATION FIELD")
ax.set_xlabel('EDUCATION FIELD')
ax.set_ylabel('NUMBER OF EMPLOYEES')

ax.bar_label(edu_bar, padding=1)

x = np.arange(len(fields))
ax.set_xticks(x, fields)

plt.show()

##### 5)BAR CHART FOR THE NUMBER OF MARRIED AND UNMARRIED COUPLES

In [3]:
employee_marital_status = df_attrition_data.groupby('MaritalStatus')
employee_marital_status.size()

MaritalStatus
Divorced    327
Married     673
Single      470
dtype: int64

In [4]:
status = ['Married', 'Unmarried']
# To obtain the number of married employees, we get the total number of employees with their marital status as 'Married'
married_employees = len(employee_marital_status.get_group('Married'))

# To obtain the number of unmarried employees, we get the total number of employees with their marital status as 'Divorced' and 'Single'
unmarried_employees = len(employee_marital_status.get_group(
    'Divorced')) + len(employee_marital_status.get_group('Single'))

number_of_employees = [married_employees, unmarried_employees]

In [5]:
style.use('ggplot')

fig, ax = plt.subplots(1, 1, figsize=(7, 7))
bar = ax.bar(status, number_of_employees, facecolor='g')

# Set title
ax.set_title("BAR CHART FOR THE NUMBER OF MARRIED AND UNMARRIED EMPLOYEES")
ax.set_xlabel('MARITAL STATUS')
ax.set_ylabel('NUMBER OF EMPLOYEES')

ax.bar_label(bar, padding=1)

x = np.arange(len(status))
ax.set_xticks(x, status)

plt.show()

NameError: name 'style' is not defined