**Part 3. Supervised Learning: Generalisation & Overfitting; Decision trees**

In [1]:
# Mount Google Drive inside the Colab runtime; the dataset paths used by the
# cells below live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


**DecisionTreeClassifier**

In [2]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score


# Single seed used for both the train/test split and the tree, so that a
# fresh "Restart & Run All" reproduces the numbers printed by this cell.
RANDOM_STATE = 42

# Load the smiley dataset (features and labels) from the mounted Drive.
X = np.load('/content/drive/MyDrive/Data/smiley_dataset/smiley_X.npy')
y = np.load('/content/drive/MyDrive/Data/smiley_dataset/smiley_Y.npy')

# Collapse every axis after the first into one, giving the 2-D
# (n_samples, n_features) layout that scikit-learn estimators expect.
X = X.reshape((X.shape[0], -1))

# Hold out 20% of the samples as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Train decision tree on training set.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split; without it, ties between equally good splits can
# be broken differently on each run, changing the tree and its scores.
clf = DecisionTreeClassifier(random_state=RANDOM_STATE)
clf.fit(X_train, y_train)

# Evaluate performance on training set
y_train_pred = clf.predict(X_train)
train_acc = accuracy_score(y_train, y_train_pred)
print("Training accuracy:", train_acc*100)
print("Training confusion matrix:\n", confusion_matrix(y_train, y_train_pred))
print("Training classification report:\n", classification_report(y_train, y_train_pred))

# Evaluate performance on test set
y_test_pred = clf.predict(X_test)
test_acc = accuracy_score(y_test, y_test_pred)
print("Test accuracy:", test_acc*100)
print("Test confusion matrix:\n", confusion_matrix(y_test, y_test_pred))
print("Test classification report:\n", classification_report(y_test, y_test_pred))

# 10-fold cross-validation on the training split only, so the held-out test
# set plays no part in this estimate. cross_val_score fits clones of `clf`,
# which inherit the pinned random_state.
cv_scores = cross_val_score(clf, X_train, y_train, cv=10)

# Report in percent, the same unit as the accuracies printed above;
# `cv_scores` itself keeps the raw fractions.
cv_scores_percent = cv_scores * 100
print("Cross-validation scores:", cv_scores_percent)
print("Mean cross-validation score:", cv_scores_percent.mean())


Training accuracy: 100.0
Training confusion matrix:
 [[57  0  0]
 [ 0 51  0]
 [ 0  0 55]]
Training classification report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        57
         1.0       1.00      1.00      1.00        51
         2.0       1.00      1.00      1.00        55

    accuracy                           1.00       163
   macro avg       1.00      1.00      1.00       163
weighted avg       1.00      1.00      1.00       163

Test accuracy: 97.5609756097561
Test confusion matrix:
 [[15  0  0]
 [ 0  9  0]
 [ 0  1 16]]
Test classification report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        15
         1.0       0.90      1.00      0.95         9
         2.0       1.00      0.94      0.97        17

    accuracy                           0.98        41
   macro avg       0.97      0.98      0.97        41
weighted avg       0.98      0.98      0.98        41

**Random Forest**

In [3]:
# All imports for this cell, gathered in one place instead of being
# scattered between the statements that use them.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, train_test_split

# Load the smiley dataset and flatten every axis after the first, giving the
# 2-D (n_samples, n_features) layout scikit-learn expects. The data is loaded
# and split here again so this cell can be run on its own.
X = np.load('/content/drive/MyDrive/Data/smiley_dataset/smiley_X.npy')
y = np.load('/content/drive/MyDrive/Data/smiley_dataset/smiley_Y.npy')
X = X.reshape((X.shape[0], -1))

# Same 80/20 split and seed as the decision-tree experiment, so both models
# are scored on the same held-out samples.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random forest of 100 trees; depth and leaf-size limits constrain each tree,
# and random_state makes the bootstrap/feature sampling repeatable.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=2,
)
rf_clf.fit(X_train, y_train)

# Predict on both splits and report accuracy in percent.
y_train_pred = rf_clf.predict(X_train)
y_test_pred = rf_clf.predict(X_test)
score_1 = accuracy_score(y_train, y_train_pred)*100
score_2 = accuracy_score(y_test, y_test_pred)*100
print("Training set accuracy:", score_1)
print("Testing set accuracy:", score_2)

print("Training set confusion matrix:")
print(confusion_matrix(y_train, y_train_pred))
print("Testing set confusion matrix:")
print(confusion_matrix(y_test, y_test_pred))

print("Training set classification report:")
print(classification_report(y_train, y_train_pred))
print("Testing set classification report:")
print(classification_report(y_test, y_test_pred))

# 10-fold cross-validation on the training split only. The decision-tree
# experiment cross-validates on X_train/y_train as well, so using the same
# data here keeps the two CV figures comparable and leaves the held-out test
# samples out of the estimate.
scores = cross_val_score(rf_clf, X_train, y_train, cv=10)
scores_percent = scores * 100
print("Cross-validation scores:", scores_percent)
print("Average cross-validation score:", np.mean(scores_percent))


Training set accuracy: 100.0
Testing set accuracy: 97.5609756097561
Training set confusion matrix:
[[57  0  0]
 [ 0 51  0]
 [ 0  0 55]]
Testing set confusion matrix:
[[15  0  0]
 [ 0  9  0]
 [ 0  1 16]]
Training set classification report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        57
         1.0       1.00      1.00      1.00        51
         2.0       1.00      1.00      1.00        55

    accuracy                           1.00       163
   macro avg       1.00      1.00      1.00       163
weighted avg       1.00      1.00      1.00       163

Testing set classification report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        15
         1.0       0.90      1.00      0.95         9
         2.0       1.00      0.94      0.97        17

    accuracy                           0.98        41
   macro avg       0.97      0.98      0.97        41
weighted avg       