In [28]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from matplotlib import pyplot
from catboost import Pool, CatBoostClassifier, cv
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score, classification_report, roc_auc_score

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.head().transpose()

Unnamed: 0,0,1,2,3,4
EmployeeNo,YAK/S/00001,YAK/S/00002,YAK/S/00003,YAK/S/00004,YAK/S/00006
Division,Commercial Sales and Marketing,Customer Support and Field Operations,Commercial Sales and Marketing,Commercial Sales and Marketing,Information and Strategy
Qualification,"MSc, MBA and PhD",First Degree or HND,First Degree or HND,First Degree or HND,First Degree or HND
Gender,Female,Male,Male,Male,Male
Channel_of_Recruitment,Direct Internal process,Agency and others,Direct Internal process,Agency and others,Direct Internal process
Trainings_Attended,2,2,2,3,3
Year_of_birth,1986,1991,1987,1982,1990
Last_performance_score,12.5,12.5,7.5,2.5,7.5
Year_of_recruitment,2011,2015,2012,2009,2012
Targets_met,1,0,0,0,0


In [4]:
train['seperate'] = "train"
test['seperate'] = "test"
df = pd.concat([train, test], sort=False)

In [6]:
df.describe()

Unnamed: 0,Trainings_Attended,Year_of_birth,Last_performance_score,Year_of_recruitment,Targets_met,Previous_Award,Training_score_average,Promoted_or_Not
count,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,38312.0
mean,2.253011,1986.194826,7.696869,2013.134488,0.351974,0.023172,55.38675,0.084595
std,0.609264,7.664765,3.741146,4.265094,0.47759,0.15045,13.371559,0.278282
min,2.0,1950.0,0.0,1982.0,0.0,0.0,31.0,0.0
25%,2.0,1982.0,5.0,2012.0,0.0,0.0,43.0,0.0
50%,2.0,1988.0,7.5,2014.0,0.0,0.0,52.0,0.0
75%,2.0,1992.0,10.0,2016.0,1.0,0.0,68.0,0.0
max,11.0,2001.0,12.5,2018.0,1.0,1.0,91.0,1.0


In [8]:
np.random.seed(0)
df['Year_of_birth'] = np.log(1 + df.Year_of_birth)
df['Last_performance_score'] = np.log(1 + df.Last_performance_score)
df['Year_of_recruitment'] = np.log(1 + df.Year_of_recruitment)
df['Training_score_average'] = np.log(1 + df.Training_score_average)

In [10]:
df.isna().any()

EmployeeNo                             False
Division                               False
Qualification                           True
Gender                                 False
Channel_of_Recruitment                 False
Trainings_Attended                     False
Year_of_birth                          False
Last_performance_score                 False
Year_of_recruitment                    False
Targets_met                            False
Previous_Award                         False
Training_score_average                 False
State_Of_Origin                        False
Foreign_schooled                       False
Marital_Status                         False
Past_Disciplinary_Action               False
Previous_IntraDepartmental_Movement    False
No_of_previous_employers               False
Promoted_or_Not                         True
seperate                               False
dtype: bool

In [11]:
df.Qualification.value_counts()

First Degree or HND         36669
MSc, MBA and PhD            14925
Non-University Education      805
Name: Qualification, dtype: int64

In [12]:
df['Qualification'].fillna(df['Qualification'].mode()[0], inplace=True)

In [13]:
df.select_dtypes([np.number]).columns

Index(['Trainings_Attended', 'Year_of_birth', 'Last_performance_score',
       'Year_of_recruitment', 'Targets_met', 'Previous_Award',
       'Training_score_average', 'Promoted_or_Not'],
      dtype='object')

In [17]:
cat = [i for i in df.columns if df[i].dtype == 'object' and i not in ['EmployeeNo', 'seperate']]
print(cat)

['Division', 'Qualification', 'Gender', 'Channel_of_Recruitment', 'State_Of_Origin', 'Foreign_schooled', 'Marital_Status', 'Past_Disciplinary_Action', 'Previous_IntraDepartmental_Movement', 'No_of_previous_employers']


In [19]:
df.head()

Unnamed: 0,EmployeeNo,Division,Qualification,Gender,Channel_of_Recruitment,Trainings_Attended,Year_of_birth,Last_performance_score,Year_of_recruitment,Targets_met,Previous_Award,Training_score_average,State_Of_Origin,Foreign_schooled,Marital_Status,Past_Disciplinary_Action,Previous_IntraDepartmental_Movement,No_of_previous_employers,Promoted_or_Not,seperate
0,YAK/S/00001,Commercial Sales and Marketing,"MSc, MBA and PhD",Female,Direct Internal process,2,2.151109,1.281681,2.152562,1,0,1.555545,ANAMBRA,No,Married,No,No,0,0.0,train
1,YAK/S/00002,Customer Support and Field Operations,First Degree or HND,Male,Agency and others,2,2.151401,1.281681,2.152793,0,0,1.603479,ANAMBRA,Yes,Married,No,No,0,0.0,train
2,YAK/S/00003,Commercial Sales and Marketing,First Degree or HND,Male,Direct Internal process,2,2.151167,1.144244,2.15262,0,0,1.5605,KATSINA,Yes,Married,No,No,0,0.0,train
3,YAK/S/00004,Commercial Sales and Marketing,First Degree or HND,Male,Agency and others,3,2.150874,0.812157,2.152447,0,0,1.5605,NIGER,Yes,Single,No,No,1,0.0,train
4,YAK/S/00006,Information and Strategy,First Degree or HND,Male,Direct Internal process,3,2.151343,1.144244,2.15262,0,0,1.67835,AKWA IBOM,Yes,Married,No,No,1,0.0,train


In [23]:
train = df[df.seperate == 'train'].reset_index(drop=True)
test = df[df.seperate == 'test'].reset_index(drop=True)
train.drop('seperate', axis=1, inplace=True)
test.drop('seperate', axis=1, inplace=True)

In [24]:
train['Promoted_or_Not'] = train['Promoted_or_Not'].astype('int64')

In [26]:
features = train.columns.tolist()
features = [i for i in features if i not in ["Promoted_or_Not", "EmployeeNo"]]

target = "Promoted_or_Not"

X = train[features]
y = train[target]

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)

In [112]:
X_train.dtypes

Division                                object
Qualification                           object
Gender                                  object
Channel_of_Recruitment                  object
Trainings_Attended                       int64
Year_of_birth                          float64
Last_performance_score                 float64
Year_of_recruitment                    float64
Targets_met                              int64
Previous_Award                           int64
Training_score_average                 float64
State_Of_Origin                         object
Foreign_schooled                        object
Marital_Status                          object
Past_Disciplinary_Action                object
Previous_IntraDepartmental_Movement     object
No_of_previous_employers                object
dtype: object

In [30]:
cat_features_index = np.where(X_train.dtypes == object)[0]

In [31]:
clf8 = CatBoostClassifier(eval_metric='Accuracy',use_best_model=True,random_seed=42)
clf8.fit(X_train,y_train,cat_features=cat_features_index,eval_set=(X_test,y_test))

Learning rate set to 0.110214
0:	learn: 0.9228876	test: 0.9242213	best: 0.9242213 (0)	total: 201ms	remaining: 3m 20s
1:	learn: 0.9228876	test: 0.9243083	best: 0.9243083 (1)	total: 296ms	remaining: 2m 27s
2:	learn: 0.9229995	test: 0.9248303	best: 0.9248303 (2)	total: 331ms	remaining: 1m 49s
3:	learn: 0.9229622	test: 0.9243083	best: 0.9248303 (2)	total: 370ms	remaining: 1m 32s
4:	learn: 0.9229995	test: 0.9248303	best: 0.9248303 (2)	total: 384ms	remaining: 1m 16s
5:	learn: 0.9229622	test: 0.9243083	best: 0.9248303 (2)	total: 453ms	remaining: 1m 14s
6:	learn: 0.9232605	test: 0.9248303	best: 0.9248303 (2)	total: 540ms	remaining: 1m 16s
7:	learn: 0.9232232	test: 0.9249173	best: 0.9249173 (7)	total: 614ms	remaining: 1m 16s
8:	learn: 0.9229995	test: 0.9242213	best: 0.9249173 (7)	total: 668ms	remaining: 1m 13s
9:	learn: 0.9241181	test: 0.9253524	best: 0.9253524 (9)	total: 752ms	remaining: 1m 14s
10:	learn: 0.9248639	test: 0.9265704	best: 0.9265704 (10)	total: 850ms	remaining: 1m 16s
11:	learn: 

92:	learn: 0.9432471	test: 0.9424047	best: 0.9424047 (80)	total: 7.77s	remaining: 1m 15s
93:	learn: 0.9430979	test: 0.9422307	best: 0.9424047 (80)	total: 7.85s	remaining: 1m 15s
94:	learn: 0.9431725	test: 0.9424047	best: 0.9424047 (80)	total: 7.94s	remaining: 1m 15s
95:	learn: 0.9431725	test: 0.9424917	best: 0.9424917 (95)	total: 8.02s	remaining: 1m 15s
96:	learn: 0.9432098	test: 0.9424917	best: 0.9424917 (95)	total: 8.09s	remaining: 1m 15s
97:	learn: 0.9432098	test: 0.9424917	best: 0.9424917 (95)	total: 8.16s	remaining: 1m 15s
98:	learn: 0.9431725	test: 0.9424047	best: 0.9424917 (95)	total: 8.22s	remaining: 1m 14s
99:	learn: 0.9431725	test: 0.9424047	best: 0.9424917 (95)	total: 8.29s	remaining: 1m 14s
100:	learn: 0.9431352	test: 0.9424047	best: 0.9424917 (95)	total: 8.38s	remaining: 1m 14s
101:	learn: 0.9432098	test: 0.9424047	best: 0.9424917 (95)	total: 8.47s	remaining: 1m 14s
102:	learn: 0.9432471	test: 0.9424917	best: 0.9424917 (95)	total: 8.55s	remaining: 1m 14s
103:	learn: 0.9432

184:	learn: 0.9454844	test: 0.9424917	best: 0.9426657 (142)	total: 15.3s	remaining: 1m 7s
185:	learn: 0.9454098	test: 0.9424047	best: 0.9426657 (142)	total: 15.4s	remaining: 1m 7s
186:	learn: 0.9454471	test: 0.9424047	best: 0.9426657 (142)	total: 15.5s	remaining: 1m 7s
187:	learn: 0.9456335	test: 0.9424047	best: 0.9426657 (142)	total: 15.6s	remaining: 1m 7s
188:	learn: 0.9454471	test: 0.9424047	best: 0.9426657 (142)	total: 15.7s	remaining: 1m 7s
189:	learn: 0.9454844	test: 0.9424917	best: 0.9426657 (142)	total: 15.7s	remaining: 1m 7s
190:	learn: 0.9455962	test: 0.9424917	best: 0.9426657 (142)	total: 15.8s	remaining: 1m 6s
191:	learn: 0.9455962	test: 0.9424917	best: 0.9426657 (142)	total: 15.9s	remaining: 1m 6s
192:	learn: 0.9455962	test: 0.9424917	best: 0.9426657 (142)	total: 16s	remaining: 1m 6s
193:	learn: 0.9456708	test: 0.9424917	best: 0.9426657 (142)	total: 16.1s	remaining: 1m 6s
194:	learn: 0.9456335	test: 0.9424917	best: 0.9426657 (142)	total: 16.2s	remaining: 1m 6s
195:	learn: 

277:	learn: 0.9467522	test: 0.9422307	best: 0.9427527 (208)	total: 23.1s	remaining: 1m
278:	learn: 0.9467522	test: 0.9422307	best: 0.9427527 (208)	total: 23.2s	remaining: 1m
279:	learn: 0.9467522	test: 0.9422307	best: 0.9427527 (208)	total: 23.3s	remaining: 60s
280:	learn: 0.9467895	test: 0.9422307	best: 0.9427527 (208)	total: 23.4s	remaining: 59.9s
281:	learn: 0.9468640	test: 0.9422307	best: 0.9427527 (208)	total: 23.5s	remaining: 59.8s
282:	learn: 0.9468268	test: 0.9422307	best: 0.9427527 (208)	total: 23.6s	remaining: 59.8s
283:	learn: 0.9468268	test: 0.9422307	best: 0.9427527 (208)	total: 23.7s	remaining: 59.7s
284:	learn: 0.9468640	test: 0.9423177	best: 0.9427527 (208)	total: 23.7s	remaining: 59.6s
285:	learn: 0.9468640	test: 0.9423177	best: 0.9427527 (208)	total: 23.8s	remaining: 59.5s
286:	learn: 0.9468640	test: 0.9423177	best: 0.9427527 (208)	total: 24s	remaining: 59.5s
287:	learn: 0.9469386	test: 0.9423177	best: 0.9427527 (208)	total: 24s	remaining: 59.4s
288:	learn: 0.9469386	

370:	learn: 0.9483556	test: 0.9424917	best: 0.9427527 (208)	total: 30.9s	remaining: 52.4s
371:	learn: 0.9483556	test: 0.9424917	best: 0.9427527 (208)	total: 31s	remaining: 52.3s
372:	learn: 0.9483556	test: 0.9424917	best: 0.9427527 (208)	total: 31.1s	remaining: 52.2s
373:	learn: 0.9483929	test: 0.9424917	best: 0.9427527 (208)	total: 31.1s	remaining: 52.1s
374:	learn: 0.9483929	test: 0.9424047	best: 0.9427527 (208)	total: 31.2s	remaining: 52s
375:	learn: 0.9484674	test: 0.9424047	best: 0.9427527 (208)	total: 31.3s	remaining: 51.9s
376:	learn: 0.9485047	test: 0.9423177	best: 0.9427527 (208)	total: 31.4s	remaining: 51.8s
377:	learn: 0.9484674	test: 0.9424047	best: 0.9427527 (208)	total: 31.5s	remaining: 51.8s
378:	learn: 0.9486539	test: 0.9424047	best: 0.9427527 (208)	total: 31.5s	remaining: 51.7s
379:	learn: 0.9485420	test: 0.9423177	best: 0.9427527 (208)	total: 31.6s	remaining: 51.6s
380:	learn: 0.9485793	test: 0.9423177	best: 0.9427527 (208)	total: 31.7s	remaining: 51.5s
381:	learn: 0.

463:	learn: 0.9499963	test: 0.9417087	best: 0.9427527 (208)	total: 38.5s	remaining: 44.5s
464:	learn: 0.9499217	test: 0.9416217	best: 0.9427527 (208)	total: 38.6s	remaining: 44.4s
465:	learn: 0.9499217	test: 0.9416217	best: 0.9427527 (208)	total: 38.7s	remaining: 44.3s
466:	learn: 0.9499590	test: 0.9416217	best: 0.9427527 (208)	total: 38.8s	remaining: 44.2s
467:	learn: 0.9499590	test: 0.9416217	best: 0.9427527 (208)	total: 38.9s	remaining: 44.2s
468:	learn: 0.9499590	test: 0.9416217	best: 0.9427527 (208)	total: 38.9s	remaining: 44.1s
469:	learn: 0.9499217	test: 0.9416217	best: 0.9427527 (208)	total: 39s	remaining: 44s
470:	learn: 0.9500336	test: 0.9416217	best: 0.9427527 (208)	total: 39.1s	remaining: 43.9s
471:	learn: 0.9501081	test: 0.9416217	best: 0.9427527 (208)	total: 39.2s	remaining: 43.8s
472:	learn: 0.9501081	test: 0.9417087	best: 0.9427527 (208)	total: 39.2s	remaining: 43.7s
473:	learn: 0.9501454	test: 0.9417087	best: 0.9427527 (208)	total: 39.3s	remaining: 43.6s
474:	learn: 0.

555:	learn: 0.9519353	test: 0.9413607	best: 0.9427527 (208)	total: 46.1s	remaining: 36.8s
556:	learn: 0.9520098	test: 0.9413607	best: 0.9427527 (208)	total: 46.2s	remaining: 36.7s
557:	learn: 0.9520471	test: 0.9413607	best: 0.9427527 (208)	total: 46.3s	remaining: 36.6s
558:	learn: 0.9521217	test: 0.9413607	best: 0.9427527 (208)	total: 46.3s	remaining: 36.6s
559:	learn: 0.9521217	test: 0.9413607	best: 0.9427527 (208)	total: 46.4s	remaining: 36.5s
560:	learn: 0.9522336	test: 0.9413607	best: 0.9427527 (208)	total: 46.5s	remaining: 36.4s
561:	learn: 0.9521963	test: 0.9413607	best: 0.9427527 (208)	total: 46.5s	remaining: 36.3s
562:	learn: 0.9522336	test: 0.9413607	best: 0.9427527 (208)	total: 46.6s	remaining: 36.2s
563:	learn: 0.9522709	test: 0.9413607	best: 0.9427527 (208)	total: 46.7s	remaining: 36.1s
564:	learn: 0.9522336	test: 0.9413607	best: 0.9427527 (208)	total: 46.8s	remaining: 36s
565:	learn: 0.9521963	test: 0.9413607	best: 0.9427527 (208)	total: 46.9s	remaining: 35.9s
566:	learn: 

648:	learn: 0.9534641	test: 0.9411867	best: 0.9427527 (208)	total: 53.7s	remaining: 29.1s
649:	learn: 0.9535387	test: 0.9412737	best: 0.9427527 (208)	total: 53.8s	remaining: 29s
650:	learn: 0.9535387	test: 0.9411867	best: 0.9427527 (208)	total: 53.9s	remaining: 28.9s
651:	learn: 0.9535760	test: 0.9411867	best: 0.9427527 (208)	total: 54s	remaining: 28.8s
652:	learn: 0.9536505	test: 0.9412737	best: 0.9427527 (208)	total: 54.1s	remaining: 28.7s
653:	learn: 0.9537251	test: 0.9412737	best: 0.9427527 (208)	total: 54.2s	remaining: 28.6s
654:	learn: 0.9537997	test: 0.9411867	best: 0.9427527 (208)	total: 54.2s	remaining: 28.6s
655:	learn: 0.9537624	test: 0.9412737	best: 0.9427527 (208)	total: 54.3s	remaining: 28.5s
656:	learn: 0.9538743	test: 0.9413607	best: 0.9427527 (208)	total: 54.4s	remaining: 28.4s
657:	learn: 0.9539488	test: 0.9413607	best: 0.9427527 (208)	total: 54.5s	remaining: 28.3s
658:	learn: 0.9539861	test: 0.9412737	best: 0.9427527 (208)	total: 54.6s	remaining: 28.2s
659:	learn: 0.

740:	learn: 0.9552166	test: 0.9413607	best: 0.9427527 (208)	total: 1m 1s	remaining: 21.5s
741:	learn: 0.9552539	test: 0.9413607	best: 0.9427527 (208)	total: 1m 1s	remaining: 21.4s
742:	learn: 0.9552539	test: 0.9413607	best: 0.9427527 (208)	total: 1m 1s	remaining: 21.4s
743:	learn: 0.9552539	test: 0.9413607	best: 0.9427527 (208)	total: 1m 1s	remaining: 21.3s
744:	learn: 0.9552539	test: 0.9413607	best: 0.9427527 (208)	total: 1m 1s	remaining: 21.2s
745:	learn: 0.9552539	test: 0.9413607	best: 0.9427527 (208)	total: 1m 2s	remaining: 21.1s
746:	learn: 0.9553658	test: 0.9413607	best: 0.9427527 (208)	total: 1m 2s	remaining: 21s
747:	learn: 0.9552912	test: 0.9414477	best: 0.9427527 (208)	total: 1m 2s	remaining: 21s
748:	learn: 0.9553658	test: 0.9413607	best: 0.9427527 (208)	total: 1m 2s	remaining: 20.9s
749:	learn: 0.9553658	test: 0.9413607	best: 0.9427527 (208)	total: 1m 2s	remaining: 20.8s
750:	learn: 0.9553658	test: 0.9413607	best: 0.9427527 (208)	total: 1m 2s	remaining: 20.7s
751:	learn: 0.

833:	learn: 0.9571556	test: 0.9411867	best: 0.9427527 (208)	total: 1m 9s	remaining: 13.8s
834:	learn: 0.9571184	test: 0.9410997	best: 0.9427527 (208)	total: 1m 9s	remaining: 13.7s
835:	learn: 0.9571929	test: 0.9410997	best: 0.9427527 (208)	total: 1m 9s	remaining: 13.7s
836:	learn: 0.9572675	test: 0.9410997	best: 0.9427527 (208)	total: 1m 9s	remaining: 13.6s
837:	learn: 0.9572302	test: 0.9412737	best: 0.9427527 (208)	total: 1m 9s	remaining: 13.5s
838:	learn: 0.9573794	test: 0.9412737	best: 0.9427527 (208)	total: 1m 9s	remaining: 13.4s
839:	learn: 0.9573794	test: 0.9412737	best: 0.9427527 (208)	total: 1m 10s	remaining: 13.3s
840:	learn: 0.9573794	test: 0.9412737	best: 0.9427527 (208)	total: 1m 10s	remaining: 13.3s
841:	learn: 0.9573794	test: 0.9412737	best: 0.9427527 (208)	total: 1m 10s	remaining: 13.2s
842:	learn: 0.9574167	test: 0.9412737	best: 0.9427527 (208)	total: 1m 10s	remaining: 13.1s
843:	learn: 0.9575285	test: 0.9412737	best: 0.9427527 (208)	total: 1m 10s	remaining: 13s
844:	le

924:	learn: 0.9589455	test: 0.9409257	best: 0.9427527 (208)	total: 1m 18s	remaining: 6.39s
925:	learn: 0.9589455	test: 0.9409257	best: 0.9427527 (208)	total: 1m 19s	remaining: 6.31s
926:	learn: 0.9589455	test: 0.9409257	best: 0.9427527 (208)	total: 1m 19s	remaining: 6.23s
927:	learn: 0.9590201	test: 0.9408387	best: 0.9427527 (208)	total: 1m 19s	remaining: 6.15s
928:	learn: 0.9590201	test: 0.9407517	best: 0.9427527 (208)	total: 1m 19s	remaining: 6.06s
929:	learn: 0.9590573	test: 0.9407517	best: 0.9427527 (208)	total: 1m 19s	remaining: 5.98s
930:	learn: 0.9590946	test: 0.9407517	best: 0.9427527 (208)	total: 1m 19s	remaining: 5.9s
931:	learn: 0.9591692	test: 0.9408387	best: 0.9427527 (208)	total: 1m 19s	remaining: 5.82s
932:	learn: 0.9591319	test: 0.9408387	best: 0.9427527 (208)	total: 1m 19s	remaining: 5.73s
933:	learn: 0.9591319	test: 0.9408387	best: 0.9427527 (208)	total: 1m 19s	remaining: 5.65s
934:	learn: 0.9591319	test: 0.9408387	best: 0.9427527 (208)	total: 1m 20s	remaining: 5.57s


<catboost.core.CatBoostClassifier at 0x18a632f9b00>

In [32]:
print('the test accuracy is :{:.6f}'.format(accuracy_score(y_test,clf8.predict(X_test))))

the test accuracy is :0.942753


In [33]:
y_pred = clf8.predict(X_test)

In [34]:
print('accuracy')
print(accuracy_score(y_test, y_pred))
print('f1 score')
print(f1_score(y_test, y_pred, average='weighted'))
print(classification_report(y_test, y_pred))

accuracy
0.9427527405602923
f1 score
0.9296030879959152
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     10537
           1       0.95      0.33      0.49       957

    accuracy                           0.94     11494
   macro avg       0.95      0.66      0.73     11494
weighted avg       0.94      0.94      0.93     11494



In [35]:
pd.DataFrame(confusion_matrix(y_test, y_pred))

Unnamed: 0,0,1
0,10522,15
1,643,314


In [38]:
test_features = test[features]

In [40]:
pred = clf8.predict(sub_test)

In [41]:
test["Promoted_or_Not"] = pred
test["Promoted_or_Not"] = test["Promoted_or_Not"].astype('int64')

In [44]:
submission = test[["EmployeeNo", "Promoted_or_Not"]]

In [46]:
submission.to_csv('Femi4.csv',index=False)