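This notebook trains a decision-tree classifier on the [Bank Marketing Data Set](https://archive.ics.uci.edu/ml/datasets/bank+marketing) and exports the held-out test rows, together with the model's predictions, to `bank-marketing.csv`.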

In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# Raw gist URL of the bank-additional-full.csv file loaded by this notebook.
url = (
    'https://gist.githubusercontent.com/DanielKerrigan/'
    '45a3fb3a2bd3d4863d56107f245b20d1/raw/'
    'ac994dd75e5a623fcb481f7396a718c9a1ea5900/bank-additional-full.csv'
)

In [3]:
# The source file separates fields with semicolons rather than commas.
df = pd.read_csv(url, delimiter=';')

In [4]:
# Replace the terse source column names with more descriptive ones.
# "y" becomes "label", the target column used for training further down.
column_renames = {
    'y': 'label',
    'housing': 'housing_loan',
    'loan': 'personal_loan',
    'campaign': 'num_contacts_cur',
    'pdays': 'days_passed',
    'previous': 'num_contacts_prev',
    'poutcome': 'prev_outcome',
}

# Rebind instead of mutating in place, and tolerate an already-missing
# 'duration' column, so re-running this cell is a no-op instead of a KeyError.
# NOTE(review): 'duration' is presumably dropped because it is only known after
# the call has taken place -- confirm against the dataset description.
df = df.rename(columns=column_renames).drop(columns=['duration'], errors='ignore')

In [5]:
# Hold out 25% of the rows for testing. The seed is fixed so that
# Restart & Run All reproduces the same split and every result derived from it.
df_train_split, df_test_split = train_test_split(
    df, test_size=0.25, random_state=42
)

# Work on explicit copies so the column assignments made in later cells
# operate on independent frames rather than on slices of `df`.
df_train = df_train_split.copy()
df_test = df_test_split.copy()

In [6]:
# (rows, columns) of the training split.
df_train.shape

(30891, 20)

In [7]:
# (rows, columns) of the test split.
df_test.shape

(10297, 20)

In [8]:
# Number of training rows; used later to cut the combined one-hot frame
# back into its train and test parts.
num_train = len(df_train)

In [9]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,age,job,marital,education,default,housing_loan,personal_loan,contact,month,day_of_week,num_contacts_cur,days_passed,num_contacts_prev,prev_outcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,label
9347,45,admin.,married,high.school,no,no,no,telephone,jun,fri,5,999,0,nonexistent,1.4,94.465,-41.8,4.967,5228.1,no
29556,36,management,married,university.degree,no,yes,yes,cellular,apr,mon,1,999,1,failure,-1.8,93.075,-47.1,1.405,5099.1,no
5331,39,blue-collar,married,basic.9y,no,yes,no,telephone,may,fri,2,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
36786,64,retired,married,unknown,no,yes,no,cellular,jun,mon,3,999,0,nonexistent,-2.9,92.963,-40.8,1.26,5076.2,no
36717,31,management,married,university.degree,no,no,no,telephone,jun,fri,2,999,0,nonexistent,-2.9,92.963,-40.8,1.268,5076.2,no


In [10]:
# Preview the first rows of the test split.
df_test.head()

Unnamed: 0,age,job,marital,education,default,housing_loan,personal_loan,contact,month,day_of_week,num_contacts_cur,days_passed,num_contacts_prev,prev_outcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,label
14482,57,self-employed,married,professional.course,no,yes,no,cellular,jul,tue,2,999,0,nonexistent,1.4,93.918,-42.7,4.961,5228.1,yes
40775,25,student,single,high.school,no,no,no,telephone,sep,mon,1,999,0,nonexistent,-1.1,94.199,-37.5,0.876,4963.6,yes
26172,30,services,married,high.school,no,yes,no,cellular,nov,wed,4,999,1,failure,-0.1,93.2,-42.0,4.12,5195.8,no
13116,28,technician,divorced,university.degree,no,no,no,cellular,jul,wed,1,999,0,nonexistent,1.4,93.918,-42.7,4.962,5228.1,no
21993,30,technician,married,university.degree,no,no,yes,cellular,aug,wed,1,999,0,nonexistent,1.4,93.444,-36.1,4.964,5228.1,no


In [11]:
# Encode the target as integers: 'no' -> 0, 'yes' -> 1.
label_to_int = {'no': 0, 'yes': 1}

# Assign the result back instead of calling `.replace(..., inplace=True)` on the
# selected column: that chained in-place form mutates a temporary under pandas
# copy-on-write and would leave the frame unchanged. The explicit int cast keeps
# the column numeric so the one-hot step below does not treat it as categorical.
# Re-running the cell is safe because values that are already 0/1 are left as is.
df_test['label'] = df_test['label'].replace(label_to_int).astype(int)
df_train['label'] = df_train['label'].replace(label_to_int).astype(int)

In [12]:
# Check the training labels after encoding.
df_train.head()

Unnamed: 0,age,job,marital,education,default,housing_loan,personal_loan,contact,month,day_of_week,num_contacts_cur,days_passed,num_contacts_prev,prev_outcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,label
9347,45,admin.,married,high.school,no,no,no,telephone,jun,fri,5,999,0,nonexistent,1.4,94.465,-41.8,4.967,5228.1,0
29556,36,management,married,university.degree,no,yes,yes,cellular,apr,mon,1,999,1,failure,-1.8,93.075,-47.1,1.405,5099.1,0
5331,39,blue-collar,married,basic.9y,no,yes,no,telephone,may,fri,2,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
36786,64,retired,married,unknown,no,yes,no,cellular,jun,mon,3,999,0,nonexistent,-2.9,92.963,-40.8,1.26,5076.2,0
36717,31,management,married,university.degree,no,no,no,telephone,jun,fri,2,999,0,nonexistent,-2.9,92.963,-40.8,1.268,5076.2,0


In [13]:
# Check the test labels after encoding.
df_test.head()

Unnamed: 0,age,job,marital,education,default,housing_loan,personal_loan,contact,month,day_of_week,num_contacts_cur,days_passed,num_contacts_prev,prev_outcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,label
14482,57,self-employed,married,professional.course,no,yes,no,cellular,jul,tue,2,999,0,nonexistent,1.4,93.918,-42.7,4.961,5228.1,1
40775,25,student,single,high.school,no,no,no,telephone,sep,mon,1,999,0,nonexistent,-1.1,94.199,-37.5,0.876,4963.6,1
26172,30,services,married,high.school,no,yes,no,cellular,nov,wed,4,999,1,failure,-0.1,93.2,-42.0,4.12,5195.8,0
13116,28,technician,divorced,university.degree,no,no,no,cellular,jul,wed,1,999,0,nonexistent,1.4,93.918,-42.7,4.962,5228.1,0
21993,30,technician,married,university.degree,no,no,yes,cellular,aug,wed,1,999,0,nonexistent,1.4,93.444,-36.1,4.964,5228.1,0


In [14]:
# Stack train on top of test (train rows first) so both parts can be one-hot
# encoded together and end up with the same set of columns.
df = pd.concat((df_train, df_test), axis=0)

In [15]:
# One-hot encode the categorical columns of the combined frame. The dummy dtype
# is pinned to uint8 because newer pandas versions default to bool, which would
# turn the mixed-type `.values` matrix built below into an object array.
df_one_hot = pd.get_dummies(df, dtype=np.uint8)

# The combined frame holds the training rows first, so a positional cut at
# num_train recovers the two splits.
df_train_one_hot = df_one_hot.iloc[:num_train]
df_test_one_hot = df_one_hot.iloc[num_train:]

In [16]:
# Sanity check: the encoded training frame kept every training row.
len(df_train_one_hot) == num_train

True

In [17]:
# Training rows before one-hot encoding, for comparison with the encoded frame.
df_train.head()

Unnamed: 0,age,job,marital,education,default,housing_loan,personal_loan,contact,month,day_of_week,num_contacts_cur,days_passed,num_contacts_prev,prev_outcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,label
9347,45,admin.,married,high.school,no,no,no,telephone,jun,fri,5,999,0,nonexistent,1.4,94.465,-41.8,4.967,5228.1,0
29556,36,management,married,university.degree,no,yes,yes,cellular,apr,mon,1,999,1,failure,-1.8,93.075,-47.1,1.405,5099.1,0
5331,39,blue-collar,married,basic.9y,no,yes,no,telephone,may,fri,2,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
36786,64,retired,married,unknown,no,yes,no,cellular,jun,mon,3,999,0,nonexistent,-2.9,92.963,-40.8,1.26,5076.2,0
36717,31,management,married,university.degree,no,no,no,telephone,jun,fri,2,999,0,nonexistent,-2.9,92.963,-40.8,1.268,5076.2,0


In [18]:
# First rows of the one-hot encoded training frame.
df_train_one_hot.head()

Unnamed: 0,age,num_contacts_cur,days_passed,num_contacts_prev,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,label,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,prev_outcome_failure,prev_outcome_nonexistent,prev_outcome_success
9347,45,5,999,0,1.4,94.465,-41.8,4.967,5228.1,0,...,0,0,1,0,0,0,0,0,1,0
29556,36,1,999,1,-1.8,93.075,-47.1,1.405,5099.1,0,...,0,0,0,1,0,0,0,1,0,0
5331,39,2,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,0,0,1,0,0,0,0,0,1,0
36786,64,3,999,0,-2.9,92.963,-40.8,1.26,5076.2,0,...,0,0,0,1,0,0,0,0,1,0
36717,31,2,999,0,-2.9,92.963,-40.8,1.268,5076.2,0,...,0,0,1,0,0,0,0,0,1,0


In [19]:
# Separate the encoded training frame into target vector and feature matrix.
y_train = df_train_one_hot['label'].to_numpy()
X_train = df_train_one_hot.drop('label', axis=1).to_numpy()

In [20]:
# Separate the encoded test frame into target vector and feature matrix.
y_test = df_test_one_hot['label'].to_numpy()
X_test = df_test_one_hot.drop('label', axis=1).to_numpy()

In [21]:
# Sanity check: the feature matrix has one row per training example.
len(X_train) == num_train

True

In [22]:
# Hyperparameter grid searched exhaustively (2 x 2 x 5 = 20 combinations).
parameters = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'min_samples_split': [2, 4, 8, 16, 32]
}

# Seed the tree: splitter='random' draws split thresholds at random, so
# without a fixed random_state the cross-validation scores and the selected
# parameters would differ from run to run.
clf = GridSearchCV(
    DecisionTreeClassifier(random_state=0),
    parameters,
    scoring='f1_weighted',
)
clf.fit(X_train, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'min_samples_split': [2, 4, 8, 16, 32],
                         'splitter': ['best', 'random']},
             scoring='f1_weighted')

In [23]:
# Best mean cross-validated score, then the parameter set that achieved it.
print(clf.best_score_, clf.best_params_, sep='\n')

0.8766741401283837
{'criterion': 'entropy', 'min_samples_split': 32, 'splitter': 'random'}


In [24]:
# Training-set accuracy of the fitted search object. `predict` already returns
# 0/1 class labels, so thresholding its output at 0.5 was a no-op and is dropped.
train_preds = clf.predict(X_train)
# Mean of the element-wise matches = fraction of correctly classified rows.
(train_preds == y_train).mean()

0.9187465604868732

In [25]:
# Predicted probability of the positive class (label 1) for each test row.
# `predict` returns hard labels, not probabilities, so use `predict_proba` and
# take column 1 (classes are ordered [0, 1]); this is what the 0.5 threshold
# applied in the next cell expects.
probs = clf.predict_proba(X_test)[:, 1]

In [26]:
# Binarise the scores: 1 where the value is strictly above 0.5, otherwise 0.
predictions = (probs > 0.5).astype(int)

In [27]:
# Attach the model output to the (un-encoded) test rows as a new column.
df_test = df_test.assign(prediction=predictions)

In [28]:
# Turn the 0/1 codes back into human-readable names for the exported file.
display_names = {0: 'no subscription', 1: 'subscription'}

# Assign the result back rather than using `.replace(..., inplace=True)` on a
# selected column, which is chained in-place mutation and has no effect on the
# frame under pandas copy-on-write.
for column in ('label', 'prediction'):
    df_test[column] = df_test[column].replace(display_names)

In [29]:
# Export the test rows with their predictions; the row index is omitted and
# floats are written with three decimals.
df_test.to_csv(
    'bank-marketing.csv',
    float_format='%.3f',
    index=False,
)