In [1]:
from sklearn.model_selection import train_test_split

# D-Tree with 3-Fold CV

In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.metrics import precision_score, recall_score, f1_score

In [3]:
# Load the data
data = pd.read_csv('procesed_imgdata_32x32.csv')

  data = pd.read_csv('procesed_imgdata_32x32.csv')


In [4]:
# Split the data into features and target
X = data.iloc[:,:-3]
y = data["labelsEnc"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [5]:
X.head()

Unnamed: 0,pixel_0,pixel_1,pixel_2,pixel_3,pixel_4,pixel_5,pixel_6,pixel_7,pixel_8,pixel_9,...,pixel_1014,pixel_1015,pixel_1016,pixel_1017,pixel_1018,pixel_1019,pixel_1020,pixel_1021,pixel_1022,pixel_1023
0,0.478431,0.435294,0.462745,0.537255,0.631373,0.670588,0.654902,0.639216,0.619608,0.592157,...,0.439216,0.392157,0.435294,0.419608,0.4,0.407843,0.403922,0.403922,0.4,0.392157
1,0.47451,0.439216,0.447059,0.533333,0.627451,0.670588,0.654902,0.639216,0.627451,0.596078,...,0.411765,0.415686,0.411765,0.407843,0.415686,0.415686,0.411765,0.403922,0.403922,0.407843
2,0.47451,0.427451,0.396078,0.356863,0.321569,0.290196,0.262745,0.247059,0.231373,0.215686,...,0.529412,0.282353,0.227451,0.211765,0.180392,0.172549,0.168627,0.156863,0.27451,0.568627
3,0.466667,0.411765,0.384314,0.341176,0.298039,0.203922,0.152941,0.231373,0.223529,0.203922,...,0.537255,0.541176,0.556863,0.568627,0.576471,0.576471,0.576471,0.572549,0.580392,0.588235
4,0.454902,0.407843,0.388235,0.341176,0.309804,0.282353,0.254902,0.243137,0.219608,0.207843,...,0.133333,0.168627,0.52549,0.537255,0.552941,0.556863,0.54902,0.545098,0.54902,0.556863


In [8]:
tree_clf = DecisionTreeClassifier()

In [9]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [10]:
grid_search = GridSearchCV(tree_clf, param_grid, cv=3)
grid_search.fit(X_train, y_train)

In [12]:
best_tree_clf = grid_search.best_estimator_
best_tree_clf.fit(X_train, y_train)

In [13]:
print("Before Parameter Tuning:")
scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
cv_results = cross_validate(best_tree_clf, X_train, y_train, cv=3, scoring=scoring)
print('CV Accuracy:', cv_results['test_accuracy'].mean())
print('CV Precision (macro):', cv_results['test_precision_macro'].mean())
print('CV Recall (macro):', cv_results['test_recall_macro'].mean())
print('CV F1-score (macro):', cv_results['test_f1_macro'].mean())

Before Parameter Tuning:
CV Accuracy: 0.9586803854319778
CV Precision (macro): 0.9588339937165568
CV Recall (macro): 0.9585941324135713
CV F1-score (macro): 0.9585948303654078


In [14]:
print("After Parameter Tuning:")
y_pred = best_tree_clf.predict(X_test)
print('Test Accuracy:', best_tree_clf.score(X_test, y_test))
print('Test Precision (macro):', precision_score(y_test, y_pred, average='macro'))
print('Test Recall (macro):', recall_score(y_test, y_pred, average='macro'))
print('Test F1-score (macro):', f1_score(y_test, y_pred, average='macro'))

After Parameter Tuning:
Test Accuracy: 0.9678262289727257
Test Precision (macro): 0.9683031668963795
Test Recall (macro): 0.9681466635259878
Test F1-score (macro): 0.968089384570302


In [16]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       291
           1       0.99      0.99      0.99       292
           2       1.00      0.99      0.99       309
           3       1.00      0.99      1.00       323
           4       0.98      0.98      0.98       308
           5       1.00      0.98      0.99       298
           6       0.98      0.99      0.98       307
           7       0.99      0.99      0.99       361
           8       0.99      0.99      0.99       311
           9       0.99      1.00      0.99       340
          10       0.85      0.89      0.87       298
          11       0.84      0.82      0.83       322
          12       0.91      0.89      0.90       331
          13       0.93      0.85      0.89       326
          14       0.93      0.92      0.93       317
          15       0.82      0.91      0.86       315
          16       0.94      0.89      0.91       325
          17       0.94    

In [18]:
import pickle
with open('decision_tree_model.pkl', 'wb') as file:
    pickle.dump(best_tree_clf, file)