In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Load the HR dataset from CSV and preview its first five rows
# (DataFrame.head() returns five rows by default).
df = pd.read_csv("hrdataset.csv")
df.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [None]:
# Preview the last five rows (DataFrame.tail() returns five rows by default).
df.tail()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
54803,3030,Technology,region_14,Bachelor's,m,sourcing,1,48,3.0,17,0,0,78,0
54804,74592,Operations,region_27,Master's & above,f,other,1,37,2.0,6,0,0,56,0
54805,13918,Analytics,region_1,Bachelor's,m,other,1,27,5.0,3,1,0,79,0
54806,13614,Sales & Marketing,region_9,,m,sourcing,1,29,1.0,2,0,0,45,0
54807,51526,HR,region_22,Bachelor's,m,other,1,27,1.0,5,0,0,49,0


In [None]:
# Determine the size of the dataset as a (rows, columns) tuple.
df.shape

(54808, 14)

In [None]:
# Inspect the data type pandas inferred for each column.
df.dtypes

employee_id               int64
department               object
region                   object
education                object
gender                   object
recruitment_channel      object
no_of_trainings           int64
age                       int64
previous_year_rating    float64
length_of_service         int64
KPIs_met >80%             int64
awards_won?               int64
avg_training_score        int64
is_promoted               int64
dtype: object

In [None]:
# Convert the rating to an integer dtype and drop unnecessary columns.
#
# previous_year_rating still contains missing values at this point, and a
# plain NumPy int64 cannot represent NaN (the cast raises). pandas' nullable
# "Int64" dtype keeps the gaps as <NA> so they can be imputed later.
df['previous_year_rating'] = df['previous_year_rating'].astype('Int64')

columns_to_drop = [
    'recruitment_channel',
]

# Reassign instead of mutating in place; errors='ignore' keeps this cell
# re-runnable after the column has already been removed.
df = df.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Normalise column names: strip leading/trailing whitespace and lowercase them.
# The original referenced `df_clean`, which is never defined in this notebook
# (NameError); the frame used everywhere else is `df`.
df.columns = df.columns.str.strip().str.lower()
df.head(1)

In [None]:
# Report whether the frame contains at least one missing value anywhere.
print(df.isna().to_numpy().any())

True


In [None]:
# Flag, column by column, whether any value is missing.
df.isna().any()

employee_id             False
department              False
region                  False
education                True
gender                  False
recruitment_channel     False
no_of_trainings         False
age                     False
previous_year_rating     True
length_of_service       False
KPIs_met >80%           False
awards_won?             False
avg_training_score      False
is_promoted             False
dtype: bool

In [None]:
# Impute the missing values (no rows are dropped):
#   - previous_year_rating -> column median
#   - education            -> most frequent value (mode)
# The result is assigned back to the column. Calling fillna(inplace=True) on a
# column selection is chained assignment: it emits a FutureWarning in recent
# pandas and does not modify `df` at all when Copy-on-Write is enabled.
df['previous_year_rating'] = df['previous_year_rating'].fillna(
    df['previous_year_rating'].median()
)
df['education'] = df['education'].fillna(df['education'].mode()[0])

# Confirm that no column has missing values left.
df.isnull().any()

employee_id             False
department              False
region                  False
education               False
gender                  False
recruitment_channel     False
no_of_trainings         False
age                     False
previous_year_rating    False
length_of_service       False
KPIs_met >80%           False
awards_won?             False
avg_training_score      False
is_promoted             False
dtype: bool

In [None]:
# Remove duplicate rows, keeping the first occurrence of each, then confirm
# that none remain (expected result: 0).
# The original `df[df.duplicated()]` did the opposite: it kept only the
# duplicated rows and threw away every unique one.
df = df.drop_duplicates()
int(df.duplicated().sum())

0

In [None]:
# Number of employees who met the KPI target, split by promotion outcome.
df.groupby('is_promoted')['kpis_met >80%'].sum()

is_promoted
0    16029
1     3262
Name: kpis_met >80%, dtype: int64

In [None]:
# Promotion rate (in percent) by whether the employee met the KPI target.

# Head-count for every (KPI flag, promotion outcome) pair.
prob_kpi = (
    df.groupby(['kpis_met >80%', 'is_promoted'])['employee_id']
    .count()
    .reset_index()
)

# One row per KPI flag, one column per promotion outcome.
kpi = pd.pivot_table(
    prob_kpi,
    index='kpis_met >80%',
    columns='is_promoted',
    values='employee_id',
).reset_index()
kpi.columns = ['kpi', 'not_promoted', 'promoted']

# Derive the totals and the rate, then list the highest rate first.
kpi = (
    kpi
    .assign(total=lambda t: t['not_promoted'] + t['promoted'])
    .assign(probability=lambda t: (t['promoted'] / t['total'] * 100).round(2))
    .sort_values('probability', ascending=False)
)
kpi

Unnamed: 0,kpi,not_promoted,promoted,total,probability
1,1,16029,3262,19291,16.91
0,0,34111,1406,35517,3.96


In [None]:
# Promotion rate (in percent) by previous-year rating.

# Head-count for every (rating, promotion outcome) pair, counted on the
# non-null `department` values.
rating_probs = (
    df.groupby(['previous_year_rating', 'is_promoted'])['department']
    .count()
    .reset_index(name="employees")
)

# One row per rating, one column per promotion outcome.
rating_probs = rating_probs.pivot_table(
    index='previous_year_rating',
    columns='is_promoted',
    values="employees",
).reset_index()
rating_probs.columns = ['previous_year_rating', 'not_promoted', 'promoted']

# Derive the totals and the rate, then rank from highest to lowest rate.
rating_probs['total_employees'] = rating_probs['not_promoted'] + rating_probs['promoted']
rating_probs['promotion_probs'] = (
    rating_probs['promoted'] / rating_probs['total_employees']
) * 100
rating_probs = (
    rating_probs
    .sort_values('promotion_probs', ascending=False)
    .reset_index(drop=True)
)
rating_probs

Unnamed: 0,previous_year_rating,not_promoted,promoted,total_employees,promotion_probs
0,5.0,9820,1921,11741,16.361468
1,4.0,9093,784,9877,7.937633
2,3.0,17263,1355,18618,7.277903
3,2.0,4044,181,4225,4.284024
4,1.0,6135,88,6223,1.414109


In [None]:
# Re-check the (rows, columns) dimensions of the frame at this stage.
df.shape

(54808, 9)

In [None]:
# Preview the first rows of the frame that is used for modelling below.
df.head()

Unnamed: 0,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,sourcing,1,35,5,8,1,0,49,0
1,other,1,30,5,4,0,0,60,0
2,sourcing,1,34,3,7,0,0,50,0
3,other,2,39,1,10,0,0,50,0
4,other,1,45,3,2,0,0,73,0


In [None]:
# Train a decision tree that predicts `is_promoted`.

# Predictors: everything except the target and the employee identifier (an ID
# carries no generalisable signal and only invites overfitting).
# errors='ignore' tolerates the ID column having been dropped already.
features = df.drop(columns=['is_promoted', 'employee_id'], errors='ignore')

# scikit-learn trees need a purely numeric matrix, so one-hot encode the
# remaining text columns and hand over a homogeneous float frame. Fitting on
# raw string columns fails with "could not convert string to float".
features = pd.get_dummies(features).astype('float64')
target = df['is_promoted']

# Fix the seed so the fitted tree is reproducible across runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(features, target)

print(model)


DecisionTreeClassifier()
