In [1]:
# # Utils

# def run_kfold(clf, X, y):
#     nfolds = 1000    
#     sss = StratifiedShuffleSplit(n_splits=nfolds, test_size=0.25, random_state=2020)
#     mean_outcome = 0
    
#     for train_indices, test_indices in sss.split(X, y):
#         X_train, X_test = X.values[train_indices], X.values[test_indices]
#         y_train, y_test = y.values[train_indices], y.values[test_indices]
#         clf.fit(X_train, y_train)
#         predictions = clf.predict(X_test)
#         f1 = f1_score(y_test, predictions, average="micro")
#         mean_outcome += f1

#     mean_outcome /= nfolds
#     print(f"\t...mean F1-Score: {mean_outcome}")
    
#     return mean_outcome

# 0. Imports

In [2]:
import os
from datetime import datetime
import itertools
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

from sklearn.impute import KNNImputer
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedShuffleSplit
from sklearn.feature_selection import RFECV

# 1. Loading Data

In [3]:
# Project folders: raw inputs, generated submissions, and profiling reports.
input_folder = "./data"
output_folder = "./results"
profiles_folder = "./profiles"

# Read both CSV splits from the input folder.
train, test = (
    pd.read_csv(f"{input_folder}/{split_name}.csv")
    for split_name in ("train", "test")
)

# 2. Preprocessing data

In [4]:
target = "Churn_risk"

# Keep the labels aside before the target column is removed from the features.
y = train[target]

# Flag each row with its origin so the two splits can be separated again
# after they have been encoded together.
train["is_train"] = 1
test["is_train"] = 0

# Stack train (without the target) on top of test. Row labels are left as they
# are (no ignore_index), so test rows keep the labels they had in test.csv.
alldata = pd.concat([train.drop(columns=target), test])

In [5]:
# One-hot encode each categorical column on the combined frame so train and
# test end up with exactly the same dummy columns. The dummy prefix is the part
# of the column name before the first underscore.
categorical_columns = ["Department", "Gender", "Marital_status"]
for column in categorical_columns:
    dummies = pd.get_dummies(alldata[column], prefix=column.split("_")[0])
    remaining = alldata.drop(columns=column)
    alldata = pd.concat([remaining, dummies], axis=1)

# Split the combined frame back into its two parts and discard the helper flag.
train = alldata.loc[alldata["is_train"] == 1].drop(columns="is_train")
test = alldata.loc[alldata["is_train"] == 0].drop(columns="is_train")

In [6]:
# Ordinal codes for the target classes, plus the inverse lookup used to turn
# predicted codes back into labels.
churn_risk_dict = {"low": 0, "medium": 1, "high": 2}
reverse_churn_risk_dict = {code: label for label, code in churn_risk_dict.items()}

# Employee_ID is an identifier, so it is excluded from the feature matrix.
X = train.drop(columns=["Employee_ID"])

# NOTE: this reassigns `y`, so the cell is meant to be executed once per kernel.
y = y.map(churn_risk_dict).astype(np.int8)

# Column order of the training features, reused to lay out the test frame.
train_cols = X.columns

Dealing with missing values:

In [7]:
# Learn the fill values from the training features only and reuse them for the
# test set, so both splits are preprocessed with the same statistics (the test
# set must not contribute its own means to the values the model is scored on).
train_means = X.mean()

X = pd.DataFrame(X.fillna(train_means), columns=train_cols)

# Keep the identifiers for the submission before they are dropped from `test`.
test_ids = test["Employee_ID"]

# `columns=train_cols` restricts `test` to the training feature columns, in the
# same order, which also removes Employee_ID.
test = pd.DataFrame(test.fillna(train_means), columns=train_cols)

# 2. Train

Create the DT model:

In [8]:
# clf = DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
#                            max_depth=13, max_features=0.9, max_leaf_nodes=None,
#                            min_impurity_decrease=0.0, min_impurity_split=None,
#                            min_samples_leaf=1, min_samples_split=2,
#                            min_weight_fraction_leaf=0.0, presort='deprecated',
#                            random_state=2020, splitter='best')

# Decision tree with the tuned settings (max_depth=15, max_features=0.7) and a
# fixed seed for reproducibility.
# `presort` and `min_impurity_split` are intentionally not passed: they were
# only ever set to their default values here and have since been removed from
# scikit-learn, where passing them raises a TypeError.
clf = DecisionTreeClassifier(
    criterion="gini",
    splitter="best",
    max_depth=15,
    max_features=0.7,
    max_leaf_nodes=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    min_impurity_decrease=0.0,
    class_weight=None,
    ccp_alpha=0.0,
    random_state=2020,
)

Feature Selection:

In [9]:
# %%time

# nfolds = 1000
# sss = StratifiedShuffleSplit(n_splits=nfolds, test_size=0.25, random_state=2020)

# selector = RFECV(clf, step=1, cv=sss, scoring="f1_micro")
# selector = selector.fit(X, y)
# print("selector", selector)
# print("selector.support_", selector.support_)

In [10]:
# best_cols = X.columns[selector.support_]
# best_cols

In [11]:
# Feature subset used for training and prediction (hard-coded so the
# commented-out RFECV search above does not have to be re-run).
best_cols = [
    'Age',
    'Days_off',
    'Satis_leader',
    'Satis_team',
    'Emails',
    'Tenure',
    'Bonus',
    'Distance',
    'Kids',
    'Overtime',
]

Evaluate the model:

In [12]:
# score = run_kfold(clf, X[best_cols], y)

Tune the model:

In [13]:
# %%time

# clf = DecisionTreeClassifier(random_state=2020)

# # Choose some parameter combinations to try
# parameters = {
#     "criterion": ["gini", "entropy"],
#     "class_weight": [None, "balanced"],
#     "max_features": ["sqrt", "log2", 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
#     "splitter": ["best", "random"],
#     "max_depth": [None, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
# }

# # Run the grid search
# grid_obj = GridSearchCV(clf, parameters, scoring="f1_micro", cv=5, verbose=2, refit=True, n_jobs=-1)
# grid_obj.fit(X[best_cols], y)

# # Set the clf to the best combination of parameters
# clf = grid_obj.best_estimator_
# clf_params = grid_obj.best_params_
# clf_score = grid_obj.best_score_

In [14]:
# clf_params

Evaluate the model:

In [15]:
# %%time

# run_kfold(clf, X[best_cols], y)

Train the model:

In [16]:
# Fit on the full training set, restricted to the selected features.
clf.fit(X=X[best_cols], y=y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=15, max_features=0.7, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=2020, splitter='best')

# 3. Predictions

Making predictions:

In [17]:
# Predict the encoded churn-risk class for every test row, using the same
# feature subset the model was fitted on.
preds = clf.predict(X=test[best_cols])

# 4. Writing Submission

Reversing `Churn_risk` encoding:

In [18]:
# Translate the integer class codes produced by the model back into their text
# labels. A code that is missing from reverse_churn_risk_dict would become NaN.
# NOTE: this reassigns `preds`, so the cell is meant to be executed once per
# prediction run.
preds = pd.Series(preds).map(reverse_churn_risk_dict)

Automatically determine the next submission version:

In [19]:
def find_version(folder="results"):
    """
    Return the version number to use for the next submission file.

    Scans `folder` for entries named like ``<prefix>_version<N>...`` (for
    example ``m20180428_version3.csv``) and returns the highest ``N`` found
    plus one. Entries that do not follow this pattern are ignored.

    Parameters
    ----------
    folder : str, optional
        Directory that holds the previous submissions. Defaults to "results".

    Returns
    -------
    int
        1 when `folder` does not exist or contains no versioned submission,
        otherwise the largest existing version number plus one.
    """
    if not os.path.isdir(folder):
        # Nothing has been written yet, so the first submission is version 1.
        return 1

    latest = 0
    for file in os.listdir(folder):
        parts = file.split("_")
        # Skip unrelated entries: no "_" separator, or the second token is not
        # of the form "version<N>".
        if len(parts) < 2 or not parts[1].startswith("version"):
            continue
        # Text between "version" and the first "." (the file extension).
        number = parts[1][len("version"):].split(".")[0]
        # Ignore names without a numeric version, e.g. "x_version.csv".
        if number.isdecimal():
            latest = max(latest, int(number))

    return latest + 1

Writing submission:

In [20]:
# Make sure the destination exists so the write does not fail on a fresh checkout.
os.makedirs(output_folder, exist_ok=True)

# Pair IDs and predictions by position: both come from the same test rows in
# the same order. Passing plain arrays stops pandas from aligning the two
# Series on their index labels, which would misplace or blank out rows if the
# labels ever differed.
output = pd.DataFrame({
    "Employee_ID": np.asarray(test_ids),
    "Churn_risk": np.asarray(preds),
})
output.to_csv(f'{output_folder}/m20180428_version{find_version()}.csv', index = False)
output.head()

Unnamed: 0,Employee_ID,Churn_risk
0,1005201,low
1,1005202,medium
2,1005203,medium
3,1005204,low
4,1005205,medium
