In [1]:
from PIL import Image
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt


In [2]:
data = pd.read_csv("../data/datasets/data.csv")
train = pd.read_csv("../data/datasets/train.csv")
test = pd.read_csv("../data/datasets/test.csv")

def transform(X):
    img = np.array(Image.open(f"../data/datasets/samples/{X['filename']}"))
    flattened_img = img.flatten()
    return flattened_img

# Apply to each DataFrame and store the flattened images
data['flattened_image'] = data.apply(transform, axis=1)
train['flattened_image'] = train.apply(transform, axis=1)
test['flattened_image'] = test.apply(transform, axis=1)

print("\nSample flattened image from 'data':", data['flattened_image'][0][:10])  # First 10 pixels
print("\nSample flattened image from 'train':", train['flattened_image'][0][:10])
print("\nSample flattened image from 'test':", test['flattened_image'][0][:10])


Sample flattened image from 'data': [248 248 248 248 248 248 247 247 247 247]

Sample flattened image from 'train': [252 252 252 255 255 255 248 248 248 250]

Sample flattened image from 'test': [245 245 245 245 245 245 244 244 244 243]


In [3]:
X_train = np.stack(train['flattened_image'].values)  # Convert list of arrays to a 2D numpy array
y_train = train['label'].values

X_test = np.stack(test['flattened_image'].values)
y_test = test['label'].values
print("\n--- Training Data ---")
print(f"X_train shape: {X_train.shape}")  # (num_samples, flattened_image_length)
print(f"Sample X_train[0] (first 10 pixels): {X_train[0][:10]}")  # First 10 pixels of first image
print(f"y_train shape: {y_train.shape}")  # (num_samples,)
print(f"Sample y_train values: {y_train[:5]}")  # First 5 labels

print("\n--- Test Data ---")
print(f"X_test shape: {X_test.shape}")  # (num_samples, flattened_image_length)
print(f"Sample X_test[0] (first 10 pixels): {X_test[0][:10]}")  # First 10 pixels of first test image
print(f"y_test shape: {y_test.shape}")  # (num_samples,)
print(f"Sample y_test values: {y_test[:5]}")  # First 5 test labels


--- Training Data ---
X_train shape: (160, 2352)
Sample X_train[0] (first 10 pixels): [252 252 252 255 255 255 248 248 248 250]
y_train shape: (160,)
Sample y_train values: [8 6 5 0 5]

--- Test Data ---
X_test shape: (40, 2352)
Sample X_test[0] (first 10 pixels): [245 245 245 245 245 245 244 244 244 243]
y_test shape: (40,)
Sample y_test values: [1 2 5 6 1]


In [4]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

clf = DummyClassifier(strategy="most_frequent")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("\n--- Dummy Classifier ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


--- Dummy Classifier ---
Accuracy: 0.1

Classification Report:
               precision    recall  f1-score   support

           0       0.10      1.00      0.18         4
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         4
           5       0.00      0.00      0.00         4
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00         4
           8       0.00      0.00      0.00         4
           9       0.00      0.00      0.00         4

    accuracy                           0.10        40
   macro avg       0.01      0.10      0.02        40
weighted avg       0.01      0.10      0.02        40



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [5]:
from xgboost import XGBClassifier

xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    random_state=0
)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

print(f"XGBoost Accuracy: {accuracy_score(y_test, y_pred_xgb):.2f}")
print(classification_report(y_test, y_pred_xgb))

XGBoost Accuracy: 0.65
              precision    recall  f1-score   support

           0       0.67      0.50      0.57         4
           1       0.40      0.50      0.44         4
           2       0.40      0.50      0.44         4
           3       0.50      0.50      0.50         4
           4       0.80      1.00      0.89         4
           5       1.00      0.50      0.67         4
           6       0.75      0.75      0.75         4
           7       0.67      0.50      0.57         4
           8       1.00      0.75      0.86         4
           9       0.67      1.00      0.80         4

    accuracy                           0.65        40
   macro avg       0.69      0.65      0.65        40
weighted avg       0.69      0.65      0.65        40



In [6]:
from sklearn.svm import SVC

svm = SVC(
    kernel='rbf',
    C=1.0,          # Regularization (try 0.1, 1, 10)
    gamma='scale',   # Kernel coefficient (try 'auto', 0.1, 1)
    random_state=0
)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

print(f"SVM Accuracy: {accuracy_score(y_test, y_pred_svm):.2f}")
print(classification_report(y_test, y_pred_svm))

SVM Accuracy: 0.80
              precision    recall  f1-score   support

           0       0.60      0.75      0.67         4
           1       0.75      0.75      0.75         4
           2       0.75      0.75      0.75         4
           3       0.75      0.75      0.75         4
           4       0.57      1.00      0.73         4
           5       1.00      0.50      0.67         4
           6       1.00      1.00      1.00         4
           7       1.00      0.50      0.67         4
           8       1.00      1.00      1.00         4
           9       1.00      1.00      1.00         4

    accuracy                           0.80        40
   macro avg       0.84      0.80      0.80        40
weighted avg       0.84      0.80      0.80        40



In [7]:
from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, classification_report

# Initialize and train the model
rf_clf = RandomForestClassifier(
    random_state=0     # For reproducibility
)
rf_clf.fit(X_train, y_train)

# Predict and evaluate
y_pred_rf = rf_clf.predict(X_test)

print("\n--- Random Forest Classifier ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.2f}")  # Prints 2 decimal places
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))


--- Random Forest Classifier ---
Accuracy: 0.78

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.75      0.60         4
           1       0.75      0.75      0.75         4
           2       0.60      0.75      0.67         4
           3       1.00      0.50      0.67         4
           4       0.75      0.75      0.75         4
           5       1.00      0.50      0.67         4
           6       1.00      0.75      0.86         4
           7       1.00      1.00      1.00         4
           8       1.00      1.00      1.00         4
           9       0.67      1.00      0.80         4

    accuracy                           0.78        40
   macro avg       0.83      0.78      0.78        40
weighted avg       0.83      0.78      0.78        40



In [8]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

estimators = [
    ('xgb', XGBClassifier(random_state=0)),
    ('rf', RandomForestClassifier(random_state=0)),
    ('svm', SVC(kernel='rbf', probability=True, random_state=0))
]

stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=5
)
stack.fit(X_train, y_train)
y_pred_stack = stack.predict(X_test)

print(f"Stacking Accuracy: {accuracy_score(y_test, y_pred_stack):.2f}")
print(classification_report(y_test, y_pred_stack))

Stacking Accuracy: 0.75
              precision    recall  f1-score   support

           0       0.60      0.75      0.67         4
           1       0.75      0.75      0.75         4
           2       0.60      0.75      0.67         4
           3       0.50      0.50      0.50         4
           4       0.80      1.00      0.89         4
           5       1.00      0.50      0.67         4
           6       1.00      0.75      0.86         4
           7       1.00      0.50      0.67         4
           8       1.00      1.00      1.00         4
           9       0.67      1.00      0.80         4

    accuracy                           0.75        40
   macro avg       0.79      0.75      0.75        40
weighted avg       0.79      0.75      0.75        40



In [9]:

# Initialize and train the model
dt_clf = DecisionTreeClassifier(
    random_state=0
)
dt_clf.fit(X_train, y_train)

# Predict and evaluate
y_pred_rf = dt_clf.predict(X_test)

print("\n--- Random Forest Classifier ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))


--- Random Forest Classifier ---
Accuracy: 0.45

Classification Report:
               precision    recall  f1-score   support

           0       0.25      0.50      0.33         4
           1       0.67      0.50      0.57         4
           2       0.50      0.50      0.50         4
           3       0.00      0.00      0.00         4
           4       0.60      0.75      0.67         4
           5       0.20      0.25      0.22         4
           6       1.00      0.75      0.86         4
           7       0.40      0.50      0.44         4
           8       0.50      0.25      0.33         4
           9       1.00      0.50      0.67         4

    accuracy                           0.45        40
   macro avg       0.51      0.45      0.46        40
weighted avg       0.51      0.45      0.46        40



# Hyperparameter Tuning & Cross Validation

In [10]:
from sklearn.model_selection import GridSearchCV

parameters = {'criterion':('gini', 'entropy', 'log_loss'), 'max_depth':[None, 5, 10, 20]}
clf = DecisionTreeClassifier()
clf = GridSearchCV(clf, parameters, cv=5, scoring='f1_weighted')
clf.fit(X_train, y_train)
clf.best_score_, clf.best_params_

(np.float64(0.4532440476190477), {'criterion': 'log_loss', 'max_depth': 5})

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10],                      # Regularization strength
    'gamma': ['scale', 'auto', 0.1, 1],     # Kernel coefficient
    'kernel': ['rbf']                       # Fixing to RBF kernel
}

# Create SVC model
svc = SVC(random_state=0)

# Grid search with 5-fold cross-validation
grid_svc = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1
)

# Fit to training data
grid_svc.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_svc.best_params_)
print(f"Best Cross-Val Accuracy: {grid_svc.best_score_:.2f}")

# Predict using the best model
y_pred_svm = grid_svc.predict(X_test)

# Evaluation
print(f"Test Accuracy: {accuracy_score(y_test, y_pred_svm):.2f}")
print(classification_report(y_test, y_pred_svm))


Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best Cross-Val Accuracy: 0.87
Test Accuracy: 0.90
              precision    recall  f1-score   support

           0       0.80      1.00      0.89         4
           1       1.00      1.00      1.00         4
           2       0.75      0.75      0.75         4
           3       0.80      1.00      0.89         4
           4       0.80      1.00      0.89         4
           5       1.00      0.75      0.86         4
           6       1.00      1.00      1.00         4
           7       1.00      0.50      0.67         4
           8       1.00      1.00      1.00         4
           9       1.00      1.00      1.00         4

    accuracy                           0.90        40
   macro avg       0.92      0.90      0.89        40
weighted avg       0.92      0.90      0.89        40



In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

class CustomGridSearchCV:
    def __init__(self, estimator, param_grid, scoring='accuracy', cv=5):
        self.estimator = estimator
        self.param_grid = param_grid
        self.scoring = scoring
        self.cv = cv
        self.best_score_ = None
        self.best_params_ = None
        self.best_estimator_ = None

    def _score(self, y_true, y_pred):
        if self.scoring == 'accuracy':
            return accuracy_score(y_true, y_pred)
        elif self.scoring == 'f1_weighted':
            return f1_score(y_true, y_pred, average='weighted')
        else:
            raise ValueError(f"Scoring method '{self.scoring}' is not implemented.")

    def _generate_param_combinations(self):
        from itertools import product
        keys = list(self.param_grid.keys())
        values = list(self.param_grid.values())
        for v in product(*values):
            yield dict(zip(keys, v))

    def fit(self, X, y):
        kf = KFold(n_splits=self.cv, shuffle=True, random_state=42)
        best_score = -np.inf
        best_params = None
        best_model = None

        for param_set in self._generate_param_combinations():
            scores = []

            for train_idx, val_idx in kf.split(X):
                X_train, X_val = X[train_idx], X[val_idx]
                y_train, y_val = y[train_idx], y[val_idx]

                model = self.estimator.__class__(**param_set)
                model.fit(X_train, y_train)
                y_pred = model.predict(X_val)
                score = self._score(y_val, y_pred)
                scores.append(score)

            avg_score = np.mean(scores)

            if avg_score > best_score:
                best_score = avg_score
                best_params = param_set
                best_model = self.estimator.__class__(**param_set)
                best_model.fit(X, y)

        self.best_score_ = best_score
        self.best_params_ = best_params
        self.best_estimator_ = best_model


Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best Cross-Val Accuracy: 0.87
Test Accuracy: 0.90
              precision    recall  f1-score   support

           0       0.80      1.00      0.89         4
           1       1.00      1.00      1.00         4
           2       0.75      0.75      0.75         4
           3       0.80      1.00      0.89         4
           4       0.80      1.00      0.89         4
           5       1.00      0.75      0.86         4
           6       1.00      1.00      1.00         4
           7       1.00      0.50      0.67         4
           8       1.00      1.00      1.00         4
           9       1.00      1.00      1.00         4

    accuracy                           0.90        40
   macro avg       0.92      0.90      0.89        40
weighted avg       0.92      0.90      0.89        40



In [39]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

class CustomGridSearchCV:
    def __init__(self, estimator, param_grid, scoring='accuracy', cv=5):
        self.estimator = estimator
        self.param_grid = param_grid
        self.scoring = scoring
        self.cv = cv
        self.best_score_ = None
        self.best_params_ = None
        self.best_estimator_ = None

    def _score(self, y_true, y_pred):
        if self.scoring == 'accuracy':
            return accuracy_score(y_true, y_pred)
        elif self.scoring == 'f1_weighted':
            return f1_score(y_true, y_pred, average='weighted')
        else:
            raise ValueError(f"Scoring method '{self.scoring}' is not implemented.")

    def _generate_param_combinations(self):
        from itertools import product
        keys = list(self.param_grid.keys())
        values = list(self.param_grid.values())
        for v in product(*values):
            yield dict(zip(keys, v))

    def fit(self, X, y):
        kf = KFold(n_splits=self.cv, shuffle=True, random_state=42)
        best_score = -np.inf
        best_params = None
        best_model = None

        for param_set in self._generate_param_combinations():
            scores = []

            for train_idx, val_idx in kf.split(X):
                X_train, X_val = X[train_idx], X[val_idx]
                y_train, y_val = y[train_idx], y[val_idx]

                model = self.estimator.__class__(**param_set)
                model.fit(X_train, y_train)
                y_pred = model.predict(X_val)
                score = self._score(y_val, y_pred)
                scores.append(score)

            avg_score = np.mean(scores)

            if avg_score > best_score:
                best_score = avg_score
                best_params = param_set
                best_model = self.estimator.__class__(**param_set)
                best_model.fit(X, y)

        self.best_score_ = best_score
        self.best_params_ = best_params
        self.best_estimator_ = best_model


In [43]:
from sklearn.svm import SVC
from sklearn.datasets import load_iris

param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}

grid = CustomGridSearchCV(SVC(), param_grid, scoring='f1_weighted', cv=5)
grid.fit(X_train, y_train )

print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)


Best params: {'C': 0.1, 'kernel': 'linear'}
Best score: 0.8991829004329004


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris

# params
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 4, 5],
    'min_samples_split': [2, 4]
}

grid = CustomGridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=param_grid,
    scoring='f1_weighted', 
    cv=5
)

# Fit the model
grid.fit(X_train, y_train)

# Print results
print("Best Parameters:", grid.best_params_)
print("Best F1 Weighted Score:", grid.best_score_)


Best Parameters: {'criterion': 'entropy', 'max_depth': 4, 'min_samples_split': 2}
Best F1 Weighted Score: 0.50625
