In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate,cross_val_score,KFold,StratifiedKFold,GridSearchCV
from sklearn.preprocessing import StandardScaler,minmax_scale

### Datasets

In [3]:
HR = pd.read_csv('C:/Users/Nithin/Downloads/ML_DT/HR_comma_sep.csv')

### Data Prep

In [4]:
HR.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   department             14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [5]:
HR.isna().sum()

satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
left                     0
promotion_last_5years    0
department               0
salary                   0
dtype: int64

In [6]:
#Ordinal Label Encoding
HR.salary = np.where(HR.salary == 'low',1,
            np.where(HR.salary == 'medium',2,3))

In [7]:
#One hot encoding
HR.department.value_counts()

dummy_df = pd.get_dummies(HR.department,drop_first=True,prefix='Dept')

In [8]:
HR = pd.concat([HR,dummy_df],axis=1)
del HR['department']

### Train & Test Model

In [9]:
feat = ['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'salary', 'Dept_RandD', 'Dept_accounting',
       'Dept_hr', 'Dept_management', 'Dept_marketing', 'Dept_product_mng',
       'Dept_sales', 'Dept_support', 'Dept_technical']

In [10]:
train_x,test_x,train_y,test_y = train_test_split(HR[feat],HR['left'],test_size=0.3,random_state=12345)

### Standardisation of Train & Test Data

In [13]:
sc = StandardScaler()

sc_model = sc.fit(train_x)

In [20]:
train_x_t = sc_model.transform(train_x)
train_x_t = pd.DataFrame(train_x_t)
train_x_t.columns = train_x.columns
train_x_t

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,Dept_RandD,Dept_accounting,Dept_hr,Dept_management,Dept_marketing,Dept_product_mng,Dept_sales,Dept_support,Dept_technical
0,1.076172,0.950348,-0.650406,-0.957957,-0.342879,-0.408588,-0.146975,-0.923000,-0.234895,-0.233538,-0.228736,-0.209889,-0.248364,-0.250947,-0.621883,-0.415074,2.153029
1,-1.085246,0.774938,0.155806,-1.555766,0.341120,-0.408588,-0.146975,0.641763,-0.234895,-0.233538,4.371845,-0.209889,-0.248364,-0.250947,-0.621883,-0.415074,-0.464462
2,0.996120,-1.213036,0.155806,-0.977884,-0.342879,-0.408588,-0.146975,-0.923000,-0.234895,-0.233538,-0.228736,-0.209889,-0.248364,-0.250947,-0.621883,-0.415074,2.153029
3,0.275647,-0.160579,0.155806,0.476786,-1.026878,-0.408588,-0.146975,-0.923000,-0.234895,-0.233538,-0.228736,-0.209889,-0.248364,-0.250947,-0.621883,-0.415074,2.153029
4,0.795988,1.184227,-0.650406,-1.296716,-0.342879,-0.408588,-0.146975,0.641763,-0.234895,-0.233538,-0.228736,-0.209889,4.026348,-0.250947,-0.621883,-0.415074,-0.464462
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10494,-0.404799,0.248710,0.155806,-1.217008,-0.342879,-0.408588,-0.146975,-0.923000,-0.234895,-0.233538,-0.228736,-0.209889,-0.248364,-0.250947,-0.621883,2.409207,-0.464462
10495,-0.364773,-1.329976,0.155806,0.955034,0.341120,2.447450,-0.146975,0.641763,-0.234895,-0.233538,-0.228736,-0.209889,-0.248364,-0.250947,-0.621883,-0.415074,2.153029
10496,-1.765692,1.184227,0.155806,-0.240585,-1.026878,-0.408588,-0.146975,0.641763,-0.234895,-0.233538,-0.228736,-0.209889,-0.248364,-0.250947,-0.621883,-0.415074,2.153029
10497,-0.484852,0.950348,0.155806,0.775691,3.077116,-0.408588,-0.146975,0.641763,-0.234895,-0.233538,-0.228736,-0.209889,-0.248364,3.984899,-0.621883,-0.415074,-0.464462


In [23]:
test_x_t = sc_model.transform(test_x)
test_x_t = pd.DataFrame(test_x_t)
test_x_t.columns = test_x_t.columns
test_x_t

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,1.156225,-0.920687,-0.650406,-0.997811,-0.342879,-0.408588,-0.146975,0.641763,-0.234895,-0.233538,-0.228736,-0.209889,-0.248364,-0.250947,1.608019,-0.415074,-0.464462
1,-0.684983,-1.388445,-1.456617,-1.097446,-0.342879,-0.408588,-0.146975,-0.923000,-0.234895,-0.233538,-0.228736,-0.209889,-0.248364,-0.250947,-0.621883,2.409207,-0.464462
2,0.355700,-0.511398,-0.650406,-0.918103,0.341120,2.447450,-0.146975,-0.923000,4.257213,-0.233538,-0.228736,-0.209889,-0.248364,-0.250947,-0.621883,-0.415074,-0.464462
3,-0.164642,-0.745277,-1.456617,-1.794890,-1.026878,-0.408588,-0.146975,2.206526,-0.234895,-0.233538,-0.228736,4.764428,-0.248364,-0.250947,-0.621883,-0.415074,-0.464462
4,1.036146,1.067287,0.962017,1.134377,1.025119,-0.408588,-0.146975,-0.923000,-0.234895,-0.233538,-0.228736,-0.209889,-0.248364,-0.250947,-0.621883,2.409207,-0.464462
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4495,-0.364773,0.365649,0.155806,0.098173,-1.026878,-0.408588,-0.146975,0.641763,-0.234895,-0.233538,-0.228736,-0.209889,4.026348,-0.250947,-0.621883,-0.415074,-0.464462
4496,0.435752,-1.096096,-0.650406,-0.997811,0.341120,-0.408588,-0.146975,0.641763,-0.234895,-0.233538,-0.228736,-0.209889,-0.248364,-0.250947,-0.621883,-0.415074,-0.464462
4497,0.916067,-0.920687,0.962017,0.456859,-1.026878,-0.408588,-0.146975,0.641763,-0.234895,-0.233538,-0.228736,-0.209889,-0.248364,-0.250947,-0.621883,2.409207,-0.464462
4498,-0.725009,-1.505385,-1.456617,-0.898176,-0.342879,-0.408588,-0.146975,-0.923000,-0.234895,-0.233538,-0.228736,-0.209889,-0.248364,-0.250947,-0.621883,-0.415074,2.153029


### Without Hyperparameter Tuning

In [25]:
KNN = KNeighborsClassifier(n_neighbors=3,weights='uniform',algorithm='auto')

KNN.fit(train_x,train_y)

KNeighborsClassifier(n_neighbors=3)

In [29]:
train_pred = pd.DataFrame(KNN.predict_proba(train_x_t))[1]
roc_auc_score(train_y,train_pred)

0.5

In [30]:
test_pred = pd.DataFrame(KNN.predict_proba(test_x_t))[1]
roc_auc_score(test_y,test_pred)

0.5

### With Hyperparameter Tuning

In [28]:
params = {'n_neighbors':[3,5,7,9],
          'weights':['uniform','distance'],
          'algorithm': ['ball_tree','kd_tree']}

In [31]:
KNN_H = GridSearchCV(KNeighborsClassifier(),param_grid=params,cv=5,scoring='roc_auc')

KNN_H.fit(train_x_t,train_y)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'algorithm': ['ball_tree', 'kd_tree'],
                         'n_neighbors': [3, 5, 7, 9],
                         'weights': ['uniform', 'distance']},
             scoring='roc_auc')

In [32]:
KNN_H.best_params_

{'algorithm': 'ball_tree', 'n_neighbors': 9, 'weights': 'distance'}

### Using Best Params

In [33]:
KNN = KNeighborsClassifier(n_neighbors=9,weights='distance',algorithm='ball_tree')

KNN.fit(train_x_t,train_y)

KNeighborsClassifier(algorithm='ball_tree', n_neighbors=9, weights='distance')

In [34]:
train_pred = pd.DataFrame(KNN.predict_proba(train_x_t))[1]
roc_auc_score(train_y,train_pred)

1.0

In [35]:
test_pred = pd.DataFrame(KNN.predict_proba(test_x_t))[1]
roc_auc_score(test_y,test_pred)

0.9834435883336463