In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate,cross_val_score,KFold,StratifiedKFold,GridSearchCV
from sklearn.preprocessing import StandardScaler,minmax_scale

### Datasets

In [3]:
# Path of the HR CSV. Set the HR_DATA_PATH environment variable to load the
# file from another location; when it is unset the original local path is used.
HR_DATA_PATH = os.environ.get(
    'HR_DATA_PATH',
    'C:/Users/Nithin/Downloads/ML_DT/HR_comma_sep.csv',
)
HR = pd.read_csv(HR_DATA_PATH)

### Data Prep

In [4]:
# Print the column names, dtypes and non-null counts of the loaded frame.
HR.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   department             14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [5]:
# Count missing values per column (isnull is the pandas alias of isna).
HR.isnull().sum()

satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
left                     0
promotion_last_5years    0
department               0
salary                   0
dtype: int64

In [6]:
# Ordinal label encoding of salary: 'low' -> 1, 'medium' -> 2, anything else -> 3.
# The dtype guard makes the cell safe to re-run: once the column is numeric,
# both string comparisons are False for every row and a second pass would
# overwrite the whole column with 3.
if not pd.api.types.is_numeric_dtype(HR['salary']):
    HR['salary'] = np.where(HR['salary'] == 'low', 1,
                   np.where(HR['salary'] == 'medium', 2, 3))

In [7]:
# One-hot encoding of department. drop_first=True omits the indicator column of
# the first category, which then acts as the implicit baseline level.
dummy_df = pd.get_dummies(HR['department'], prefix='Dept', drop_first=True)

In [8]:
# Swap the raw department column for its indicator columns.
# Written to be re-runnable: only indicator columns not already in HR are
# appended, and a missing 'department' column is ignored, so executing the cell
# twice neither duplicates columns nor raises KeyError.
missing_dummy_cols = [col for col in dummy_df.columns if col not in HR.columns]
HR = pd.concat(
    [HR.drop(columns='department', errors='ignore'), dummy_df[missing_dummy_cols]],
    axis=1,
)

### Train & Test Model

In [9]:
# Columns fed to the model; the target column 'left' is not among them.
feat = [
    # employee-level columns
    'satisfaction_level',
    'last_evaluation',
    'number_project',
    'average_montly_hours',
    'time_spend_company',
    'Work_accident',
    'promotion_last_5years',
    'salary',
] + [
    # department indicator columns (Dept_ prefix)
    'Dept_RandD',
    'Dept_accounting',
    'Dept_hr',
    'Dept_management',
    'Dept_marketing',
    'Dept_product_mng',
    'Dept_sales',
    'Dept_support',
    'Dept_technical',
]

In [10]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    HR[feat],
    HR['left'],
    test_size=0.3,
    random_state=12345,
)

### Without Hyperparameter Tuning

In [11]:
# Baseline SVM with no kernel or gamma specified; probability=True is required
# for the predict_proba calls used in the ROC AUC cells.
svc = SVC(probability=True, C=1)

svc.fit(X=train_x, y=train_y)

SVC(C=1, probability=True)

In [12]:
# Score the training rows: take the second predict_proba column
# (presumably P(left == 1) — confirm against svc.classes_) and compute ROC AUC.
train_proba = svc.predict_proba(train_x)
train_pred = pd.Series(train_proba[:, 1], name=1)
roc_auc_score(train_y, train_pred)

0.8047502182040569

In [13]:
# Score the held-out rows: take the second predict_proba column
# (presumably P(left == 1) — confirm against svc.classes_) and compute ROC AUC.
test_proba = svc.predict_proba(test_x)
test_pred = pd.Series(test_proba[:, 1], name=1)
roc_auc_score(test_y, test_pred)

0.8125346467816492

### With Hyperparameter Tuning

In [14]:
# Hyperparameter grid for the SVC search.
# Every C value is distinct: a repeated entry only refits identical candidates
# and lengthens the search without changing the outcome.
# NOTE(review): 100 used to appear twice in C; if 1000 was the intended fifth
# value, add it here — confirm.
# gamma is only used by the 'rbf', 'poly' and 'sigmoid' kernels, so the 'linear'
# candidates are refit once per gamma value with identical results.
params = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.0001, 0.001, 0.01, 1],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}

In [None]:
%time svc_H = GridSearchCV(SVC(), param_grid=params, cv=5, scoring = 'roc_auc')

In [None]:
%time svc_H.fit(train_x,train_y)

In [None]:
# Show the parameter combination that achieved the best cross-validated score.
svc_H.best_params_

In [15]:
# Refit with fixed hyperparameters, replacing the baseline model held in svc.
# NOTE(review): these values are hard-coded, presumably copied from
# svc_H.best_params_; the grid-search cells show no recorded run — confirm.
svc = SVC(kernel='sigmoid', C=10, gamma=0.001, probability=True)

svc.fit(X=train_x, y=train_y)

SVC(C=10, gamma=0.001, kernel='sigmoid', probability=True)

In [None]:
# Training ROC AUC of the current svc, using the second predict_proba column
# (presumably P(left == 1) — confirm against svc.classes_).
train_proba = svc.predict_proba(train_x)
train_pred = pd.Series(train_proba[:, 1], name=1)
roc_auc_score(train_y, train_pred)

In [None]:
# Held-out ROC AUC of the current svc, using the second predict_proba column
# (presumably P(left == 1) — confirm against svc.classes_).
test_proba = svc.predict_proba(test_x)
test_pred = pd.Series(test_proba[:, 1], name=1)
roc_auc_score(test_y, test_pred)