Uses the [Bank Marketing Data Set](http://archive.ics.uci.edu/ml/datasets/Bank+Marketing).

In [None]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [None]:
# Raw GitHub gist URL of the input file `bank-additional-full.csv`.
# The path contains a revision hash, so it refers to one fixed version of the file.
url = 'https://gist.githubusercontent.com/DanielKerrigan/45a3fb3a2bd3d4863d56107f245b20d1/raw/ac994dd75e5a623fcb481f7396a718c9a1ea5900/bank-additional-full.csv'

In [None]:
# Load the data set straight from the URL; fields are separated by ';'
# rather than the default ','.
df = pd.read_csv(url, sep=';')

In [None]:
# Give the target column `y` the more descriptive name `label`.
df = df.rename(columns={"y": "label"})

In [None]:
# Hold out 25% of the rows for testing. The fixed seed keeps the split -- and
# with it every downstream metric and the exported CSV -- reproducible across
# notebook runs.
df_train_split, df_test_split = train_test_split(
    df, test_size=0.25, random_state=0
)
# Work on explicit copies so that the column assignments made later modify
# independent frames rather than objects derived from `df`.
df_train = df_train_split.copy()
df_test = df_test_split.copy()

In [None]:
# Display (rows, columns) of the training split.
df_train.shape

In [None]:
# Display (rows, columns) of the test split.
df_test.shape

In [None]:
# Number of training rows; used later to cut the combined, one-hot encoded
# frame back into its train and test parts.
num_train = len(df_train)

In [None]:
# Preview the first rows of the training split (before label encoding).
df_train.head()

In [None]:
# Preview the first rows of the test split (before label encoding).
df_test.head()

In [None]:
# Encode the target as integers: 'no' -> 0, 'yes' -> 1.
#
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `frame['label']`: that form operates on a
# temporary Series and, with pandas Copy-on-Write, leaves the frame unchanged.
# The explicit integer cast guarantees a numeric column (so the later
# `pd.get_dummies` call does not one-hot encode the label) and raises if a
# value other than 'no'/'yes' is encountered instead of silently keeping it.
label_codes = {'no': 0, 'yes': 1}
for frame in (df_test, df_train):
    frame['label'] = frame['label'].map(label_codes).astype('int64')

In [None]:
# Preview the training split again to check the encoded `label` column.
df_train.head()

In [None]:
# Preview the test split again to check the encoded `label` column.
df_test.head()

In [None]:
# Stack the two splits (training rows first, then test rows) so that they are
# one-hot encoded together and end up with the same set of dummy columns.
# Note that this rebinds `df`, replacing the originally loaded frame.
df = pd.concat([df_train, df_test])

In [None]:
# One-hot encode the combined frame, then cut it back into the two splits by
# position: the first `num_train` rows are the training rows, the rest are
# the test rows.
df_one_hot = pd.get_dummies(df)
df_train_one_hot, df_test_one_hot = (
    df_one_hot.iloc[:num_train],
    df_one_hot.iloc[num_train:],
)

In [None]:
# Sanity check: the encoded training frame still has `num_train` rows.
len(df_train_one_hot) == num_train

In [None]:
# Separate the encoded training frame into the target vector and the feature
# matrix (every column except `label`), both as NumPy arrays.
y_train = df_train_one_hot['label'].values
X_train = df_train_one_hot.drop('label', axis=1).values

In [None]:
# Separate the encoded test frame into the target vector and the feature
# matrix (every column except `label`), both as NumPy arrays.
y_test = df_test_one_hot['label'].values
X_test = df_test_one_hot.drop('label', axis=1).values

In [None]:
# Sanity check: the training feature matrix has one row per training example.
len(X_train) == num_train

In [None]:
# Hyper-parameter grid for the decision tree:
# 2 split criteria x 2 splitter strategies x 5 minimum split sizes.
parameters = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'min_samples_split': [2, 4, 8, 16, 32],
}
# Exhaustive search over the grid, ranking candidates by weighted F1 with
# GridSearchCV's default cross-validation. The tree is seeded because the grid
# includes `splitter='random'`; without a fixed `random_state` the selected
# parameters and scores could differ from run to run.
clf = GridSearchCV(
    DecisionTreeClassifier(random_state=0),
    parameters,
    scoring='f1_weighted',
)
clf.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score and the parameters that achieved it,
# one per line.
print(clf.best_score_, clf.best_params_, sep='\n')

In [None]:
# Training accuracy: the share of training rows whose thresholded prediction
# equals the true label.
train_preds = (clf.predict(X_train) > 0.5).astype(int)
np.equal(train_preds, y_train).sum() / num_train

In [None]:
# Predict on the held-out test features with the refitted best estimator.
# NOTE(review): despite the name `probs`, `predict` on a classifier returns
# class labels rather than probabilities (`predict_proba` would give those);
# the name is kept because the thresholding cell below reads it.
probs = clf.predict(X_test)

In [None]:
# Turn the model output into 0/1 integers: 1 where the value exceeds 0.5,
# otherwise 0.
predictions = (probs > 0.5).astype(int)

In [None]:
# Attach the predictions to the (un-encoded) test frame. This relies on
# `X_test` having been built from the test rows in the same order as `df_test`.
df_test['prediction'] = predictions

In [None]:
# Replace the 0/1 codes in the true label and the prediction with
# human-readable names for the exported file.
#
# The result is assigned back to each column instead of calling
# `replace(..., inplace=True)` on `df_test[column]`: that form operates on a
# temporary Series and, with pandas Copy-on-Write, leaves `df_test` unchanged.
display_names = {0: 'no term deposit', 1: 'term deposit'}
for column in ('label', 'prediction'):
    df_test[column] = df_test[column].replace(display_names)

In [None]:
# Export the test rows together with their predictions to `bank-marketing.csv`
# in the working directory: no index column, floats written with 3 decimals.
df_test.to_csv('bank-marketing.csv', index=False, float_format='%.3f')