In [None]:
import pandas as pd
import pickle
from sklearn import metrics 
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from numpy import mean, absolute

In [None]:
# load data
infile = open('../01_data/FINALsmallSampleSet_3months_without_duplicates.pkl','rb')
df = pickle.load(infile)
infile.close()

In [4]:
## DTE

# prepare features
feature_cols = list(df.columns)
feature_cols.remove('Erfolg')
X = df[feature_cols]
y = df.Erfolg
y= y.astype('int')

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
k = 5
cv = KFold(n_splits=k, random_state=None)

# classifier
clf = DecisionTreeClassifier(criterion="entropy", splitter='best', max_depth=10)
clf = clf.fit(X_train,y_train)

# calculate scores
scores_a = cross_val_score(clf, X_test, y_test, scoring='accuracy', cv=cv, n_jobs=-1)
a = mean(scores_a)

scores_p = cross_val_score(clf, X_test, y_test, scoring='precision', cv=cv, n_jobs=-1)
p = mean(scores_p)

scores_r = cross_val_score(clf, X_test, y_test, scoring='recall', cv=cv, n_jobs=-1)
r = mean(scores_r)

scores_f1 = cross_val_score(clf, X_test, y_test, scoring='f1', cv=cv, n_jobs=-1)
f1 = mean(scores_f1)

# predict
pred = clf.predict(X_test)
probs = clf.predict_proba(X_test)

In [5]:
# print scores
print("Accuracy: %.8f" %a)
print("Precision: %.8f" %p)
print("Recall: %.8f" %r)
print("F1: %.8f" %f1)

Accuracy: 0.96987386
Precision: 0.97598093
Recall: 0.96376906
F1: 0.96960684


In [6]:
# print confusion matrix
metrics.confusion_matrix(y_test, pred)

array([[29453,   547],
       [ 1114, 28834]], dtype=int64)

In [7]:
# calculate feature importance
importance = clf.feature_importances_
for p,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (p,v))

Feature: 0, Score: 0.00118
Feature: 1, Score: 0.01291
Feature: 2, Score: 0.00020
Feature: 3, Score: 0.00005
Feature: 4, Score: 0.63805
Feature: 5, Score: 0.00129
Feature: 6, Score: 0.04603
Feature: 7, Score: 0.00034
Feature: 8, Score: 0.00000
Feature: 9, Score: 0.00026
Feature: 10, Score: 0.18505
Feature: 11, Score: 0.06244
Feature: 12, Score: 0.01206
Feature: 13, Score: 0.01944
Feature: 14, Score: 0.00000
Feature: 15, Score: 0.00003
Feature: 16, Score: 0.00000
Feature: 17, Score: 0.00000
Feature: 18, Score: 0.00000
Feature: 19, Score: 0.00032
Feature: 20, Score: 0.00005
Feature: 21, Score: 0.00004
Feature: 22, Score: 0.00000
Feature: 23, Score: 0.00000
Feature: 24, Score: 0.00000
Feature: 25, Score: 0.00000
Feature: 26, Score: 0.00004
Feature: 27, Score: 0.00000
Feature: 28, Score: 0.00004
Feature: 29, Score: 0.00003
Feature: 30, Score: 0.00000
Feature: 31, Score: 0.00004
Feature: 32, Score: 0.00000
Feature: 33, Score: 0.00000
Feature: 34, Score: 0.00000
Feature: 35, Score: 0.00000
Fe

In [8]:
# save probabilities as list
t = probs[:,:1].tolist()
data_df = pd.DataFrame(t)

Unnamed: 0,0
0,1.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
59943,0.0
59944,0.0
59945,1.0
59946,1.0


In [9]:
#save model as pickle dump
pickle.dump(clf, open('DecisionTreemodel_3months.pkl', 'wb'))
pickle.dump(X_train, open('X_train_3months.pkl', 'wb'))
pickle.dump(X_test, open('X_test_3months.pkl', 'wb'))
pickle.dump(y_train, open('y_train_3months.pkl', 'wb'))
pickle.dump(y_test, open('y_test_3months.pkl', 'wb'))
pickle.dump(data_df, open('df_prob_3months.pkl', 'wb'))