In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

In [None]:
# Read the training and test CSVs from the sibling data directory.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Preview the first rows of the training frame.
train.head()

In [None]:
# Show the (rows, columns) shape of the training and test frames side by side.
train.shape, test.shape

N.B.: As there are relatively few training examples (1677) compared to the number of variables, we'll need to guard against overfitting.


In [None]:
# Check for nulls: count the missing values in each training column.
train.isna().sum()

Training a baseline model (LGBM Classifier)

In [None]:
from lightgbm.sklearn import LGBMClassifier
import lightgbm as lgbm
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

In [None]:
# The label column the models are trained to predict.
target = 'Attrition'

# Model inputs: every training column except the row id and the label.
features = train.columns.tolist()
for excluded in ('id', 'Attrition'):
    features.remove(excluded)

In [None]:
#  encoding string fields to integers
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns, one ``LabelEncoder`` per column.

    ``fit`` learns the category -> integer mapping of each column and
    ``transform`` re-applies that same mapping, so a frame encoded after
    fitting on another frame (e.g. test after train) gets consistent codes.

    Parameters
    ----------
    columns : list of str, optional
        Names of the columns to encode. If ``None``, every column is encoded.

    Attributes
    ----------
    encoders_ : dict
        Maps column name -> fitted ``LabelEncoder``. Only set once ``fit``
        (or ``fit_transform``) has been called.
    """

    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        """Fit one ``LabelEncoder`` per target column of ``X``.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self`` so calls can be chained.
        """
        columns = self.columns if self.columns is not None else list(X.columns)
        self.encoders_ = {col: LabelEncoder().fit(X[col]) for col in columns}
        return self

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using the
        encoders learned in fit(). If no columns specified, transforms all
        columns in X.

        Returns an encoded copy; X itself is not modified.

        If fit() has never been called, each column is encoded with a
        throw-away LabelEncoder fitted on X itself (the legacy behaviour),
        so codes are then NOT guaranteed to be consistent across frames.

        Raises KeyError if a column to encode was not seen during fit().
        A fitted LabelEncoder rejects values it did not see during fit().
        '''
        output = X.copy()
        encoders = getattr(self, 'encoders_', None)
        columns = self.columns if self.columns is not None else list(output.columns)
        for col in columns:
            if encoders is None:
                # Unfitted instance: keep the original fit-on-the-fly behaviour.
                output[col] = LabelEncoder().fit_transform(output[col])
            elif col in encoders:
                # Reuse the mapping learned in fit() so codes match across frames.
                output[col] = encoders[col].transform(output[col])
            else:
                raise KeyError(f'column {col!r} was not seen during fit')
        return output

    def fit_transform(self,X,y=None):
        """Fit the per-column encoders on ``X`` and return ``X`` encoded."""
        return self.fit(X,y).transform(X)

In [None]:
# Names of the training columns whose dtype is 'object' (the string-valued fields).
string_cols = [name for name, dtype in train.dtypes.items() if dtype == 'object']
string_cols

In [None]:
# Build the encoder, restricted to the columns listed in string_cols.
label_encoder = MultiColumnLabelEncoder(columns=string_cols)

In [None]:
# Encode the string columns of both frames as integers.
# NOTE: `train` and `test` are rebound here, so the raw (string-valued)
# frames are no longer available to later cells.
train = label_encoder.fit_transform(train)
test = label_encoder.transform(test)

In [None]:
clfs = []
scores = []

# NOTE(review): presumably the positions (within `features`) of the
# label-encoded string columns -- TODO confirm against `string_cols`.
categorical_idx = [1, 3, 6, 9, 13, 15, 19, 20, 33]

# As our target classes are imbalanced, StratifiedKFold ensures each fold has a
# good representation of all classes. This helps to avoid over-representing one
# class in training and under-representing another in testing.
kf = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)
for train_index, val_index in kf.split(train, y=train[target]):
    # kf.split yields positional row indices, so select with .iloc; label-based
    # lookup only matches by accident when the index is the default 0..n-1 range.
    X_train, X_val = train[features].iloc[train_index], train[features].iloc[val_index]
    y_train, y_val = train[target].iloc[train_index], train[target].iloc[val_index]

    # The scikit-learn wrapper takes the DataFrame/Series directly; wrapping
    # them in lgbm.Dataset (as before) makes fit() fail.
    clf = LGBMClassifier(n_estimators=150, categorical_feature=categorical_idx, metric='auc')
    # period=0 disables per-iteration evaluation logging. This replaces the
    # `verbose` keyword of fit(), which LightGBM 4 removed.
    clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgbm.log_evaluation(period=0)])
    preds = clf.predict_proba(X_val)

    clfs.append(clf)
    # Fold AUC, computed from the probability column at index 1.
    scores.append(roc_auc_score(y_val, preds[:, 1]))
print(f'mean score across all folds: {np.mean(scores)}')

In [None]:
# Rank the features by their share of total importance, largest first.
# `clf` is whichever classifier that name currently refers to (the last fold's
# model when the cells are run in order).
importances = clf.feature_importances_
total_importance = importances.sum()
for idx in np.argsort(importances)[::-1]:
    print(features[idx], importances[idx] / total_importance)

Clearly employee pay has the greatest impact on our model, along with age and commute.

Below we add CatBoostClassifier, another gradient boosting algorithm for classification, before ensembling our models.

In [None]:
from catboost import CatBoostClassifier

# Drop CatBoost models left over from an earlier run of this cell, so that
# re-running it does not add duplicates and double-weight them in the ensemble.
clfs = [model for model in clfs if not isinstance(model, CatBoostClassifier)]

scores = []
kf = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
for train_index, val_index in kf.split(train, y=train[target]):
    # kf.split yields positional row indices, so select with .iloc rather than by label.
    X_train, X_val = train[features].iloc[train_index], train[features].iloc[val_index]
    y_train, y_val = train[target].iloc[train_index], train[target].iloc[val_index]

    clf = CatBoostClassifier(iterations=200)
    clf.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=False)

    # Keep the probability column at index 1 for the AUC computation.
    preds = clf.predict_proba(X_val.values)[:, 1]
    clfs.append(clf)
    scores.append(roc_auc_score(y_val, preds))
print(f'mean auc across all folds: {np.mean(scores)}')

In [None]:
# Score the test set with every model collected in `clfs`, keeping the
# probability column at index 1 from each.
test_matrix = test[features].values
test_preds = []

for clf in clfs:
    test_preds.append(clf.predict_proba(test_matrix)[:, 1])

In [None]:
# Average the per-model probabilities element-wise (across models, axis 0).
test_preds = np.mean(np.stack(test_preds), axis=0)
test_preds

In [None]:
# Pair each test id with its averaged probability and preview the result.
prediction = pd.DataFrame({'id': test['id'], 'Attrition': test_preds})
prediction.head()

In [None]:
# Write the predictions next to the input CSVs. The previous path '..data/...'
# was missing the '/' after '..' and so pointed at a directory named '..data'.
# NOTE(review): 'predicition.csv' looks like a typo for 'prediction.csv'; the
# file name is kept unchanged in case something downstream expects it.
prediction.to_csv('../data/predicition.csv', index=False)