In [106]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import tree
from sklearn.metrics import accuracy_score, classification_report

%run util.ipynb

# Decision Tree (DT)

In [107]:
# https://scikit-learn.org/stable/modules/tree.html
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier

## Load Data

In [108]:
# Load the two objects returned by get_data() and unpack them as X and Y.
# NOTE(review): get_data is not imported in this notebook, so it presumably comes
# from util.ipynb (executed with %run in the first cell) — confirm.
# Downstream cells pass X and Y to train_test_split as the features and the labels.
X, Y = get_data()

In [109]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

# Learn the standardisation statistics from the training portion only, then apply
# that same fitted transformation to both portions so no test-set information
# leaks into preprocessing.
scalar = StandardScaler().fit(Xtrain)
Xtrain = scalar.transform(Xtrain)
Xtest = scalar.transform(Xtest)

In [110]:
# Baseline: a decision tree with library-default hyperparameters and a fixed seed.
clf = tree.DecisionTreeClassifier(random_state=42)

# Train on the scaled training split; fit() returns the estimator, which is shown
# as the cell output.
clf.fit(X=Xtrain, y=Ytrain)

In [111]:
# Predictions of the baseline tree on the held-out split.
Ypred = clf.predict(X=Xtest)

# Fraction of held-out samples classified correctly (displayed as the cell output).
accuracy_score(y_true=Ytest, y_pred=Ypred)

0.6979166666666666

In [112]:
# Per-class precision / recall / F1 for the baseline tree. The report is a
# multi-line string, so it is printed rather than left as a bare expression.
print(classification_report(y_true=Ytest, y_pred=Ypred))

              precision    recall  f1-score   support

           0       0.72      0.68      0.70        50
           1       0.67      0.72      0.69        46

    accuracy                           0.70        96
   macro avg       0.70      0.70      0.70        96
weighted avg       0.70      0.70      0.70        96



### Hyperparameter tuning

In [113]:
# Show the hyperparameters of the baseline tree to see what is available for tuning.
clf.get_params(deep=True)

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': 42,
 'splitter': 'best'}

In [114]:
# Search space: 5 depths x 3 leaf sizes x 5 split sizes = 75 combinations.
param_grid_dt = {
    'max_depth': [5, 10, 15, 20, 25],
    'min_samples_leaf': range(1, 4),
    'min_samples_split': range(3, 8),
}

# Exhaustive search over the grid, each combination scored by 10-fold
# cross-validated accuracy; n_jobs=-1 runs the fits in parallel.
grid_search_dt = GridSearchCV(
    estimator=tree.DecisionTreeClassifier(random_state=42),
    param_grid=param_grid_dt,
    cv=10,
    scoring='accuracy',
    n_jobs=-1,
)

In [115]:
# Run the cross-validated search on the training split only; the held-out split
# is not touched during tuning.
grid_search_dt.fit(X=Xtrain, y=Ytrain)

In [116]:
# Winning hyperparameter combination and its mean cross-validated accuracy.
(grid_search_dt.best_params_, grid_search_dt.best_score_)

({'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 3},
 np.float64(0.6768774703557312))

In [117]:
# GridSearchCV refits the best parameter combination on the full training split
# (refit=True is the default), so best_estimator_ is a ready-to-use fitted tree.
model = grid_search_dt.best_estimator_

# Predict through `model` so the handle taken above is actually used.
# GridSearchCV.predict delegates to best_estimator_, so the predictions are the
# same as calling grid_search_dt.predict(Xtest).
YpredGrid = model.predict(Xtest)

# Held-out accuracy of the tuned tree (displayed as the cell output).
accuracy_score(Ytest, YpredGrid)

0.7083333333333334

In [118]:
# Per-class precision / recall / F1 for the tuned tree on the held-out split.
# Printed because the report is a multi-line string.
print(classification_report(y_true=Ytest, y_pred=YpredGrid))

              precision    recall  f1-score   support

           0       0.70      0.78      0.74        50
           1       0.72      0.63      0.67        46

    accuracy                           0.71        96
   macro avg       0.71      0.71      0.71        96
weighted avg       0.71      0.71      0.71        96



### Check for overfitting

In [119]:
# Training accuracy: score the tuned tree on the same split it was fitted on.
# A value well above the held-out accuracy points to overfitting.

YtrainPred = grid_search_dt.best_estimator_.predict(X=Xtrain)

accuracy_score(y_true=Ytrain, y_pred=YtrainPred)

0.8834080717488789