In [None]:
import os
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier

# from google.colab import drive
# drive.mount('/content/drive')

def preprocess(data_dir='/content/drive/My Drive/Project Data',
               train_data_file='sign_mnist_train.csv',
               test_data_file='sign_mnist_test.csv',
               test_size=0.2,
               random_state=42):
    """Load the sign_mnist CSV files and return scaled train/validation/test data.

    Each CSV is expected to hold a 'label' column plus 28 * 28 = 784 pixel
    columns (one flattened image per row). The training file is split into
    train and validation parts; the test file is kept whole.

    All three image matrices are divided by 255.0 so that they share the same
    scale. This matters for callers that fit a scaler/PCA on the training
    pixels and then apply it to the test pixels.

    Args:
        data_dir: Directory containing both CSV files. Defaults to the
            Google Drive location used on Colab.
        train_data_file: File name of the training CSV inside ``data_dir``.
        test_data_file: File name of the test CSV inside ``data_dir``.
        test_size: Fraction of the training rows held out for validation.
        random_state: Seed passed to ``train_test_split`` for a reproducible split.

    Returns:
        Tuple ``(X_train, X_valid, test_images, y_train, y_valid, test_labels)``.
        The image entries are 2-D arrays with 784 columns, scaled by 1/255
        (assumes 8-bit pixel intensities, i.e. values end up in [0, 1]).
        The label entries are the raw 'label' column values (not one-hot
        encoded).

    Raises:
        ValueError: If a CSV does not contain exactly 784 pixel columns.
    """
    image_side = 28
    n_pixels = image_side * image_side
    pixel_max = 255.0

    def _load(file_name):
        # Read one CSV and separate the flattened pixel matrix from its labels.
        frame = pd.read_csv(os.path.join(data_dir, file_name))
        labels = frame['label']
        pixels = frame.drop(['label'], axis=1).values
        # Each row must be a flattened 28x28 image; fail early otherwise.
        if pixels.ndim != 2 or pixels.shape[1] != n_pixels:
            raise ValueError(
                f"{file_name}: expected {n_pixels} pixel columns "
                f"({image_side}x{image_side} images), got shape {pixels.shape}"
            )
        return pixels, labels

    train_images, train_labels = _load(train_data_file)
    test_images, test_labels = _load(test_data_file)

    # Split into train and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(
        train_images, train_labels, test_size=test_size, random_state=random_state
    )

    # Scale pixel values by 1/255. The test images must be scaled exactly like
    # the training images, otherwise any transformer fitted on X_train would
    # see test inputs on a different scale.
    X_train = X_train / pixel_max
    X_valid = X_valid / pixel_max
    test_images = test_images / pixel_max

    return X_train, X_valid, test_images, y_train, y_valid, test_labels

In [None]:
# Load the data once; the unpacking order mirrors preprocess()'s return tuple.
(
    X_train,
    X_valid,
    test_images,
    y_train,
    y_valid,
    test_labels,
) = preprocess()

In [None]:
# Standardize the data.
# The scaler is fitted on the training pixels only and then re-used for the
# test pixels, so test_images must be on the same pixel scale as X_train.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(test_images)

# Apply PCA (fitted on the training data only, then applied to the test data).
# No n_components is given, so no explicit dimensionality cut-off is requested.
pca = PCA()
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Fixed seed so the tree (and therefore the selected hyper-parameters and the
# reported scores) are reproducible across runs; same seed as the data split.
dt = DecisionTreeClassifier(random_state=42)

dt_param_grid = {
    'max_depth': [30, None],
    'min_samples_split': [20, 30, 40],
    'min_samples_leaf': [20, 30, 40],
}
# Cross-validated search over the grid using GridSearchCV's default settings.
dt_grid_search = GridSearchCV(dt, dt_param_grid)
dt_grid_search.fit(X_train_pca, y_train)
best_dt = dt_grid_search.best_estimator_
print(dt_grid_search.best_params_)

# Evaluate the selected tree on the held-out test set.
y_pred = best_dt.predict(X_test_pca)
print("Decision Tree performance:")
print(classification_report(test_labels, y_pred))

# Report training and test accuracy separately: the training score is computed
# on the data the model was fitted on, the test score on unseen data.
y_train_pred = best_dt.predict(X_train_pca)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(test_labels, y_pred)
print("Accuracy on the training set:", train_accuracy)
print("Accuracy on the test set:", test_accuracy)