In [44]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, OneHotEncoder, StandardScaler

In [20]:
df = pd.read_csv("data/general_data.csv", index_col="EmployeeID")
df.drop(['EmployeeCount','StandardHours'],axis=1, inplace = True) # Remove boring columns 
cat_cols = list(df.dtypes[df.dtypes == 'object'].index.values)
cat_cols.remove('Attrition') # remove target column 
num_cols = list(df.dtypes[df.dtypes != 'object'].index.values) + ["Attrition"]
num_cols.remove('Attrition')

for col in cat_cols:
    df[col] = df[col].astype('category')

X = df[df.columns.difference(['Attrition'])]
y = df['Attrition']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [40]:
numerical = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(2))])


categorical = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder())])

column_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical, num_cols),
        ('cat', categorical, cat_cols)])

clf = Pipeline(steps=[('preprocessor', column_preprocessor),
                      ('classifier', KNeighborsClassifier())])

In [41]:
clf.fit(X_train, y_train) 
y_pred = clf.predict(X_test)

print(f1_score(y_test, y_pred, average='micro'))
print(precision_score(y_test, y_pred, average="macro"))
print(recall_score(y_test, y_pred, average="macro"))
print(classification_report(y_test,y_pred))

0.826530612244898
0.6650904605263158
0.6383409423722973
              precision    recall  f1-score   support

          No       0.88      0.91      0.90       741
         Yes       0.45      0.36      0.40       141

    accuracy                           0.83       882
   macro avg       0.67      0.64      0.65       882
weighted avg       0.81      0.83      0.82       882



In [42]:
from sklearn import set_config
set_config(display='diagram')
clf

# Gridsearch upon the pipeline

In [43]:
numerical = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2))])
categorical = categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder())])

column_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical, num_cols),
        ('cat', categorical, cat_cols)])

clf = Pipeline(steps=[('preprocessor', column_preprocessor),
                      ('classifier', KNeighborsClassifier())])

In [46]:
param_dict = { 
    "classifier__leaf_size": list(range(1,20,5)),
    "classifier__n_neighbors": list(range(1,20,5)),
    "classifier__p": [1,2,3]
}

grid = GridSearchCV(clf, param_dict, cv=3, verbose=1, n_jobs=-1)
best_model = grid.fit(X_train, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:  3.2min finished


In [48]:
neigh = best_model.best_estimator_
neigh.fit(X_train, y_train) 
y_pred = neigh.predict(X_test)

print(f1_score(y_test, y_pred, average='micro'))
print(precision_score(y_test, y_pred, average="macro"))
print(recall_score(y_test, y_pred, average="macro"))
print(classification_report(y_test,y_pred))

0.9897959183673469
0.9850981767180926
0.9766991127573434
              precision    recall  f1-score   support

          No       0.99      1.00      0.99       741
         Yes       0.98      0.96      0.97       141

    accuracy                           0.99       882
   macro avg       0.99      0.98      0.98       882
weighted avg       0.99      0.99      0.99       882



In [None]:
param_dict = { 
    "preprocessor__num__poly__degree": [1,2,3],
    "classifier__leaf_size": list(range(1,20,5)),
    "classifier__n_neighbors": list(range(1,20,5)),
    "classifier__p": [1,2,3]
}

grid = GridSearchCV(clf, param_dict, cv=3, verbose=1, n_jobs=-1)
best_model = grid.fit(X_train, y_train)