Uses the [churn](https://epistasislab.github.io/pmlb/profile/churn.html) [dataset](https://www.openml.org/d/41283).

In [None]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from pmlb import fetch_data
from sklearn.model_selection import train_test_split

In [None]:
# Fetch the PMLB 'churn' dataset; dropna=True asks pmlb to drop rows that
# contain missing values.
df = fetch_data('churn', dropna=True)

In [None]:
# Call the target column "label" and discard the "phone number" column.
df = (
    df
    .rename(columns={"target": "label"})
    .drop(columns=["phone number"])
)

In [None]:
# Hold out 25% of the rows for testing. The fixed random_state makes the
# split -- and therefore every downstream score and the exported CSV --
# reproducible from run to run.
df_train_split, df_test_split = train_test_split(
    df, test_size=0.25, random_state=42
)
# Work on explicit copies so later column assignments (e.g. the prediction
# column added to df_test) act on independent frames.
df_train = df_train_split.copy()
df_test = df_test_split.copy()

In [None]:
# Show the (rows, columns) of the training split.
df_train.shape

In [None]:
# Show the (rows, columns) of the test split.
df_test.shape

In [None]:
# Number of training rows; used below to separate train from test rows once
# the two frames have been concatenated.
num_train = len(df_train)

In [None]:
# Preview the first rows of the training split.
df_train.head()

In [None]:
# Preview the first rows of the test split.
df_test.head()

In [None]:
# Stack the training rows on top of the test rows so both can be one-hot
# encoded together.
df = pd.concat((df_train, df_test), axis=0)

In [None]:
# Encode the categorical columns on the combined frame so that the train and
# test parts end up with the same dummy columns, then cut the frame back
# apart by position: the first num_train rows are the training rows.
categorical_columns = ['state', 'area code']
df_one_hot = pd.get_dummies(df, columns=categorical_columns)
df_train_one_hot = df_one_hot.iloc[:num_train]
df_test_one_hot = df_one_hot.iloc[num_train:]

In [None]:
# Sanity check: the encoded training frame kept every training row.
len(df_train_one_hot) == num_train

In [None]:
# Separate the encoded training frame into a feature matrix and label vector.
train_features = df_train_one_hot.drop(columns=['label'])
train_labels = df_train_one_hot['label']
X_train, y_train = train_features.values, train_labels.values

In [None]:
# Separate the encoded test frame into a feature matrix and label vector.
test_features = df_test_one_hot.drop(columns=['label'])
test_labels = df_test_one_hot['label']
X_test, y_test = test_features.values, test_labels.values

In [None]:
# Sanity check: the feature matrix has one row per training example.
len(X_train) == num_train

In [None]:
# Hyper-parameter grid searched exhaustively by GridSearchCV.
parameters = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'min_samples_split': [2, 4, 8, 16, 32],
}
# Seed the tree so the search is reproducible: without a random_state the
# fitted trees (most visibly with splitter='random') differ between runs, and
# so do best_score_ / best_params_.
clf = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    parameters,
    scoring='f1_weighted',
)
clf.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score and the parameters that achieved it,
# one per line.
print(clf.best_score_, clf.best_params_, sep='\n')

In [None]:
# Predict on the training data and map the outputs to 0/1 integers
# (anything above 0.5 becomes 1).
train_is_positive = clf.predict(X_train) > 0.5
train_preds = np.where(train_is_positive, 1, 0)
# Training accuracy: fraction of training rows predicted correctly.
train_hits = train_preds == y_train
train_hits.sum() / num_train

In [None]:
# NOTE(review): despite the name, this holds the output of clf.predict(),
# which for scikit-learn classifiers is the predicted class label rather than
# a probability (predict_proba would give probabilities). The name is kept
# because a later cell reads `probs`.
probs = clf.predict(X_test)

In [None]:
# Turn the model outputs into 0/1 integers: values above 0.5 become 1.
predictions = (probs > 0.5).astype(int)

In [None]:
# Test accuracy: fraction of test rows predicted correctly.
test_hits = predictions == y_test
test_hits.sum() / len(df_test)

In [None]:
# Attach the 0/1 predictions to the (un-encoded) test frame as a new column.
df_test['prediction'] = predictions

In [None]:
# Replace the numeric classes with readable names for the exported file.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df_test[...]: that form is chained assignment,
# which pandas warns about and which, under Copy-on-Write, modifies only a
# temporary Series and leaves df_test unchanged.
churn_names = {0: 'no churn', 1: 'churn'}
df_test['label'] = df_test['label'].replace(churn_names)
df_test['prediction'] = df_test['prediction'].replace(churn_names)

In [None]:
# Write the test frame (including the prediction column) to churn.csv;
# index=False leaves the row index out of the file.
df_test.to_csv('churn.csv', index=False)