In [21]:
import numpy as np
import os
os.chdir("..")
data = np.loadtxt(('data/preprocessed-dataset.csv'), delimiter=',', skiprows=1)
os.chdir("methods")
x = data[:,1:35]
y = data[:,35]


from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=0.2)

from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree = dtree.fit(x_train, y_train)

y_predict=dtree.predict(x_test)


In [22]:
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

def model_info(model_pred):
    print("Accuracy:", metrics.accuracy_score(y_test, model_pred))
    print("Precision:", metrics.precision_score(y_test, model_pred))
    print("Recall:", metrics.recall_score(y_test, model_pred), end="\n\n")
    print(classification_report(y_test, model_pred))

model_info(y_predict)

Accuracy: 0.8523725834797891
Precision: 0.8464285714285714
Recall: 0.8525179856115108

              precision    recall  f1-score   support

         0.0       0.86      0.85      0.86       291
         1.0       0.85      0.85      0.85       278

    accuracy                           0.85       569
   macro avg       0.85      0.85      0.85       569
weighted avg       0.85      0.85      0.85       569



In [66]:
from sklearn.model_selection import GridSearchCV

# Applying grid search for the linear model
# This block is reall resource heavy as we have a quite big grid to search
tree_parameters = {'criterion' : ['gini', 'entropy'],
                    'max_depth' : [i for i in range(1, 15, 2)],
                      'max_features' : ['auto','sqrt'],
                      'min_samples_split' : [2, 3, 5],
                      'min_samples_leaf' : [1, 2, 3]
                      }
model_grid = GridSearchCV(dtree, tree_parameters, refit = True, verbose = 3, n_jobs=2)
model_grid.fit(x_train, y_train)
grid_pred = model_grid.predict(x_test)

model_info(grid_pred)

Fitting 5 folds for each of 252 candidates, totalling 1260 fits
Accuracy: 0.8646748681898067
Precision: 0.8430034129692833
Recall: 0.8884892086330936

              precision    recall  f1-score   support

         0.0       0.89      0.84      0.86       291
         1.0       0.84      0.89      0.87       278

    accuracy                           0.86       569
   macro avg       0.87      0.87      0.86       569
weighted avg       0.87      0.86      0.86       569



In [67]:
model_grid.best_params_

{'criterion': 'gini',
 'max_depth': 9,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2}

In [68]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score
dtree_model = DecisionTreeClassifier(ccp_alpha=0.0, criterion='gini', max_depth=9, max_features='sqrt', min_samples_leaf=1, min_samples_split=2)
dtree_model.fit(x_train, y_train)
y_predict = dtree_model.predict(x_test)

In [72]:
model_info(y_predict)
print(confusion_matrix(y_test, y_predict))
print(roc_auc_score(y_test, y_predict))

Accuracy: 0.8523725834797891
Precision: 0.825503355704698
Recall: 0.8848920863309353

              precision    recall  f1-score   support

         0.0       0.88      0.82      0.85       291
         1.0       0.83      0.88      0.85       278

    accuracy                           0.85       569
   macro avg       0.85      0.85      0.85       569
weighted avg       0.85      0.85      0.85       569

[[239  52]
 [ 32 246]]
0.853098964127667


In [4]:
DecisionTreeClassifier().get_params().keys()

dict_keys(['ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'random_state', 'splitter'])