In [None]:
# Utils

def run_kfold(clf, X, y, nfolds=1000, test_size=0.25, random_state=2020):
    """Estimate the micro-averaged F1-score of ``clf`` over repeated stratified splits.

    The classifier is re-fitted on the training part of every split produced by
    ``StratifiedShuffleSplit`` and scored on the held-out part. The per-split
    scores are averaged, printed and returned.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place,
        so after the call it is left trained on the last split only.
    X : pandas.DataFrame or array-like
        Feature matrix, one row per sample.
    y : pandas.Series or array-like
        Class label of every row of ``X``.
    nfolds : int, default 1000
        Number of random stratified splits to evaluate.
    test_size : float, default 0.25
        Share of the samples held out for scoring in each split.
    random_state : int, default 2020
        Seed handed to the splitter.

    Returns
    -------
    float
        Mean of the micro-averaged F1-scores over all splits.

    Raises
    ------
    ValueError
        If ``nfolds`` is smaller than 1 (the mean would be undefined).
    """
    if nfolds < 1:
        raise ValueError(f"nfolds must be a positive integer, got {nfolds!r}")

    # Convert once instead of on every fold; for pandas inputs this yields the
    # same arrays as `.values`, and plain lists / ndarrays are accepted too.
    X_values, y_values = np.asarray(X), np.asarray(y)

    sss = StratifiedShuffleSplit(n_splits=nfolds, test_size=test_size, random_state=random_state)
    mean_outcome = 0

    for train_indices, test_indices in sss.split(X_values, y_values):
        clf.fit(X_values[train_indices], y_values[train_indices])
        predictions = clf.predict(X_values[test_indices])
        mean_outcome += f1_score(y_values[test_indices], predictions, average="micro")

    mean_outcome /= nfolds
    print(f"\t...mean F1-Score: {mean_outcome}")

    return mean_outcome

# 0. Imports

In [1]:
import os
from datetime import datetime
import itertools
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

from sklearn.impute import KNNImputer
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedShuffleSplit

# 1. Loading Data

In [2]:
# Folder locations, relative to the notebook's working directory.
input_folder = "./data"
output_folder = "./results"
# NOTE(review): `profiles_folder` is not used in the visible cells; confirm it is still needed.
profiles_folder = "./profiles"
# Read the training and test CSV files into DataFrames.
train = pd.read_csv(f"{input_folder}/train.csv")
test = pd.read_csv(f"{input_folder}/test.csv")

# 2. Preprocessing data

In [3]:
# Name of the label column; set the labels aside before the frames are merged.
target = "Churn_risk"
y = train[target]

# Flag the origin of every row so the combined frame can be split again later.
train["is_train"], test["is_train"] = 1, 0

# Stack train (without the label) on top of test so both receive the same encoding.
alldata = pd.concat([train.drop(columns=target), test])

In [4]:
# One-hot encode each categorical column; the dummy columns are prefixed with
# the part of the column name before the first underscore.
for column in ("Department", "Gender", "Marital_status"):
    dummy_prefix = column.split("_")[0]
    dummies = pd.get_dummies(alldata[column], prefix=dummy_prefix)
    alldata = pd.concat([alldata.drop(columns=column), dummies], axis=1)

# Split the combined frame back into its two parts and discard the helper flag.
train = alldata[alldata["is_train"].eq(1)].drop(columns="is_train")
test = alldata[alldata["is_train"].eq(0)].drop(columns="is_train")

In [5]:
# Ordinal encoding of the target labels, and its inverse for decoding predictions.
churn_risk_dict = {"low": 0, "medium": 1, "high": 2}
reverse_churn_risk_dict = {value: key for key, value in churn_risk_dict.items()}

# The identifier carries no predictive information, so it is not a feature.
X = train.drop(["Employee_ID"], axis=1)

# Encode only while `y` still holds the raw labels: mapping an already encoded
# series would turn every value into NaN and make `astype` fail, so this guard
# keeps the cell safe to re-run.
if not pd.api.types.is_integer_dtype(y):
    y = y.map(churn_risk_dict).astype(np.int8)

train_cols = X.columns

Dealing with missing values:

In [6]:
# Imputation strategy: "KNN" uses scikit-learn's KNNImputer, anything else
# falls back to per-column mean imputation.
impute = "mean"

# Keep the identifiers for the submission file before `test` is reduced to
# the feature columns.
test_ids = test["Employee_ID"]

if impute == "KNN":
    # Fit on the training features only, then apply the same imputer to test.
    imputer = KNNImputer()
    X = pd.DataFrame(imputer.fit_transform(X), columns=train_cols)
    test = pd.DataFrame(imputer.transform(test.drop("Employee_ID", axis=1)), columns=train_cols)
else:
    # Use the training means for BOTH frames so that train and test are
    # preprocessed identically (as in the KNN branch) and no statistic is
    # derived from the test set.
    train_means = X.mean()
    X = pd.DataFrame(X.fillna(train_means), columns=train_cols)
    test = pd.DataFrame(test.fillna(train_means), columns=train_cols)

# 2. Train

Create the DT model:

In [7]:
# Baseline decision tree: default hyper-parameters with a fixed seed.
# NOTE(review): the following cell re-assigns `clf` before this instance is used.
clf = DecisionTreeClassifier(random_state=2020)

In [8]:
# Decision tree with hand-picked hyper-parameters and a fixed seed.
clf = DecisionTreeClassifier(
    criterion="gini",
    splitter="best",
    max_depth=10,
    max_features=0.7,
    class_weight=None,
    random_state=2020,
)

Evaluate the model on every feature subset that drops at most two columns, and keep the best-scoring subset:

In [10]:
%%time

# Exhaustive search over the feature subsets that drop zero, one or two
# columns; each subset is scored by its mean micro-F1 over `nfolds` splits.
nfolds = 1000
sss = StratifiedShuffleSplit(n_splits=nfolds, test_size=0.25, random_state=2020)
total_combs = 0
best_score, best_combo = -1, []

Xraw = X.copy()

# The splitter is seeded with an integer, so every call to `split` yields the
# same folds: draw them once instead of once per feature subset.
splits = list(sss.split(X, y))
y_values = y.values

for length in [len(train_cols)-2, len(train_cols)-1, len(train_cols)]:
    for combo in itertools.combinations(train_cols, length):
        total_combs += 1
        if total_combs % 10 == 0: print(f"{total_combs}...")

        # Select the candidate columns once per subset rather than twice per fold.
        X_combo = X[list(combo)].values
        mean_outcome = 0

        for train_indices, test_indices in splits:
            clf.fit(X_combo[train_indices], y_values[train_indices])
            predictions = clf.predict(X_combo[test_indices])
            mean_outcome += f1_score(y_values[test_indices], predictions, average="micro")

        mean_outcome /= nfolds

        # Strict comparison: on ties the first subset encountered is kept.
        if mean_outcome > best_score:
            best_score = mean_outcome
            best_combo = combo

print(f"{total_combs} total combinations")
print(f"best combination: {best_combo}")

10...
20...
30...
40...
50...
60...
70...
80...
90...
100...
110...
120...
130...
140...
150...
160...
170...
180...
190...
200...
210...
220...
230...
240...
250...
260...
270...
277 total combinations
best combination: ('Age', 'Days_off', 'Rotations', 'Satis_leader', 'Satis_team', 'Emails', 'Tenure', 'Bonus', 'Distance', 'Kids', 'Overtime', 'Department_accounting', 'Department_finances', 'Department_human resources', 'Department_sales', 'Gender_female', 'Gender_male', 'Marital_married', 'Marital_single', 'Marital_together', 'Marital_widow')
Wall time: 1h 22min 27s


In [17]:
# `itertools.combinations` yields tuples; convert to a list so the selection
# can be passed directly as a DataFrame column indexer.
best_combo = list(best_combo)
len(best_combo)

21

In [14]:
# Mean micro-F1 of the current classifier restricted to the selected features.
score = run_kfold(clf, X[list(best_combo)], y)

# of Folds: 1000
	...mean F1-Score: 0.7058861538461554


Tune the model:

In [18]:
%%time

# Start the search from an untuned tree with the usual fixed seed.
clf = DecisionTreeClassifier(random_state=2020)

# Hyper-parameter grid that is explored exhaustively.
parameters = {
    "criterion": ["gini", "entropy"],
    "class_weight": [None, "balanced"],
    "max_features": ["sqrt", "log2", 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "splitter": ["best", "random"],
    # Unlimited depth plus every depth from 2 to 15.
    "max_depth": [None, *range(2, 16)],
}

# Cross-validated grid search on the selected features, scored by micro-F1;
# the best configuration is refitted on all the data passed to `fit`.
grid_obj = GridSearchCV(
    estimator=clf,
    param_grid=parameters,
    scoring="f1_micro",
    cv=5,
    refit=True,
    verbose=1,
)
grid_obj.fit(X[best_combo], y)

# Keep the winning estimator together with its parameters and CV score.
clf, clf_params, clf_score = (
    grid_obj.best_estimator_,
    grid_obj.best_params_,
    grid_obj.best_score_,
)

Fitting 5 folds for each of 1080 candidates, totalling 5400 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Wall time: 1min 6s


[Parallel(n_jobs=1)]: Done 5400 out of 5400 | elapsed:  1.1min finished


In [19]:
# Best hyper-parameter combination found by the grid search.
clf_params

{'class_weight': None,
 'criterion': 'gini',
 'max_depth': 14,
 'max_features': 0.7,
 'splitter': 'best'}

In [21]:
%%time

# Re-score the tuned tree with the repeated stratified splits of `run_kfold`.
run_kfold(clf, X[best_combo], y)

# of Folds: 1000
	...mean F1-Score: 0.714363846153846
Wall time: 19.3 s


0.714363846153846

Train the final model on the full training set:

In [22]:
# `run_kfold` left `clf` fitted on its last split only; refit on every training row.
clf.fit(X[best_combo], y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=14, max_features=0.7, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=2020, splitter='best')

# 3. Predictions

Making predictions:

In [23]:
# Predict with the same feature subset the model was trained on.
preds = clf.predict(test[best_combo])

# 4. Writing Submission

Reversing `Churn_risk` encoding:

In [24]:
# Translate the integer class codes back into their original string labels.
preds = pd.Series(preds).map(reverse_churn_risk_dict)

Automatically determine the next submission version:

In [25]:
def find_version(folder="results"):
    """
    Finds automatically the version of the next submission. :)

    Looks in ``folder`` for files named ``<prefix>_version<N>...`` (for example
    ``m20180428_version3.csv``) and returns one more than the highest ``N``.
    Files that do not follow this pattern are ignored, so stray files in the
    folder neither crash the lookup nor bump the version.

    Parameters
    ----------
    folder : str, default "results"
        Directory that holds the previous submissions.

    Returns
    -------
    int
        ``1`` when no versioned submission exists yet, otherwise the highest
        existing version plus one.

    Raises
    ------
    FileNotFoundError
        If ``folder`` does not exist (raised by ``os.listdir``).
    """
    import re  # local import: only this helper needs it

    # The version tag must be the second underscore-separated token of the name.
    version_pattern = re.compile(r"[^_]*_version(\d+)")
    versions = [
        int(match.group(1))
        for match in map(version_pattern.match, os.listdir(folder))
        if match
    ]
    return max(versions, default=0) + 1

Writing submission:

In [26]:
# Pair every employee identifier with its decoded risk prediction.
output = pd.DataFrame({"Employee_ID": test_ids, "Churn_risk": preds})

# Save under the next free version number, leaving out the index column.
submission_path = f'{output_folder}/m20180428_version{find_version()}.csv'
output.to_csv(submission_path, index=False)
output.head()

Unnamed: 0,Employee_ID,Churn_risk
0,1005201,low
1,1005202,medium
2,1005203,medium
3,1005204,medium
4,1005205,medium
