In [1]:
import matplotlib.pyplot as plt
import os
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB, CategoricalNB
from sklearn.model_selection import StratifiedShuffleSplit

In [27]:
train = pd.read_csv("01. Data/train.csv")
test = pd.read_csv("01. Data/test.csv")

In [28]:
oe_label = OrdinalEncoder()
train["Churn_risk"] = oe_label.fit_transform(np.array(train["Churn_risk"]).reshape(-1, 1))

oe = OrdinalEncoder()
train["Department"] = oe.fit_transform(np.array(train["Department"]).reshape(-1, 1))
test["Department"] = oe.transform(np.array(test["Department"]).reshape(-1, 1))
train["Gender"] = oe.fit_transform(np.array(train["Gender"]).reshape(-1, 1))
test["Gender"] = oe.transform(np.array(test["Gender"]).reshape(-1, 1))

marital_columns = list(train["Marital_status"].unique())
train = pd.concat([train.drop("Marital_status", axis=1), pd.get_dummies(train["Marital_status"])], axis=1)
test = pd.concat([test.drop("Marital_status", axis=1), pd.get_dummies(test["Marital_status"])], axis=1)

In [29]:
#Train data

X = train.drop(["Employee_ID", "Churn_risk"], axis=1)
y = train["Churn_risk"]

In [30]:
#Categorical variables

categorical_columns = ["Department", "Gender"]
categorical_columns.extend(marital_columns)

other_columns = ['Age', 'Days_off', 'Rotations', 'Satis_leader', 'Satis_team', 
                 'Emails', 'Tenure', 'Bonus', 'Distance', 'Kids', 'Overtime']

all_columns = other_columns
all_columns.extend(categorical_columns)

In [31]:
# Missing Values

imputer = KNNImputer()
X = imputer.fit_transform(X)
X = pd.DataFrame(X, columns=all_columns)

test_ids = test["Employee_ID"]
test = pd.DataFrame(imputer.transform(test.drop("Employee_ID", axis=1)), columns=all_columns)
test += test.min().min()

In [32]:
#Scaling

X = pd.DataFrame(X, columns=all_columns)
X += abs(X.min().min())

In [33]:
# Naive Bayes Model

clf1 = MultinomialNB()
clf2 = CategoricalNB()

In [41]:
def kfold(clf1, clf2):
    
    nfolds = 600
    print("# of Folds:", nfolds)
        
    strat = StratifiedShuffleSplit(n_splits=nfolds, test_size=0.25, random_state=1421)
    outcomes = []

    for train_indices, test_indices in strat.split(X, y):
        X_train1, X_test1 = X[other_columns].values[train_indices], X[other_columns].values[test_indices]
        X_train2, X_test2 = X[categorical_columns].values[train_indices], X[categorical_columns].values[test_indices]
        y_train, y_test = y.values[train_indices], y.values[test_indices]

        clf1.fit(X_train1, y_train)
        clf2.fit(X_train2, y_train)        

        predictions1 = clf1.predict_proba(X_test1)
        predictions2 = clf2.predict_proba(X_test2)
        predictions = ((predictions1 + predictions2)/2).argmax(axis=1)

        f1 = f1_score(y_test, predictions, average="micro")
        outcomes.append(f1)

    mean_outcome = np.mean(outcomes)
    return mean_outcome

mean_outcome = kfold(clf1, clf2)
print("mean_outcome:", mean_outcome)

# of Folds: 600
mean_outcome: 0.668775641025641


In [43]:
clf1.fit(X[other_columns], y);
clf2.fit(X[categorical_columns], y);

In [58]:
#predictions

preds1 = clf1.predict_proba(pd.DataFrame(test, columns=all_columns)[other_columns])
preds2 = clf2.predict_proba(pd.DataFrame(test, columns=all_columns)[categorical_columns])
preds = ((preds1 + preds2)/2).argmax(axis=1)
preds = oe_label.inverse_transform(preds.reshape(-1,1)).reshape(-1,)

In [63]:
predictions = pd.DataFrame({"Employee_ID": test_ids, "Churn_risk": preds})
predictions.to_csv('02. Predictions/m20190450_version2.csv', index = False)
predictions.head()

Unnamed: 0,Employee_ID,Churn_risk
0,1005201,low
1,1005202,medium
2,1005203,medium
3,1005204,medium
4,1005205,medium
