# **MNIST Classification**

---


In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Download MNIST from OpenML: one row per image, one column per pixel.
mnist = fetch_openml('mnist_784', version=1)
X, y = mnist["data"], mnist["target"]

# The targets arrive as string labels; cast them to integers once, here, so that
# every model and every metric call downstream sees a single numeric label type
# (mixing string and integer labels makes the sklearn metrics raise ValueError).
y = y.astype(int)

# Rescale pixel intensities from [0, 255] to [0, 1].
X = X / 255.0

# Drop pixel columns that are zero in every image. This must happen BEFORE the
# split: filtering X afterwards leaves X_train / X_test with the full column set,
# so the mask would have no effect on the data the models actually see.
non_zero_mask = (X != 0).any(axis=0)
X = X.loc[:, non_zero_mask]

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Standardize the features. The scaler is fitted on the training split only and
# the same fitted transform is then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reduce to 100 principal components, again fitted on the training split only.
pca = PCA(n_components=100, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# **Logistic Regression**

In [None]:
class LogisticRegression:
    """One-vs-rest logistic regression trained with batch gradient descent.

    One binary classifier is fitted per distinct label; prediction picks the
    label whose classifier produces the highest score.

    Parameters
    ----------
    alpha : float
        Gradient-descent step size.
    eps : float
        Training of a binary classifier stops early once the gradient norm
        drops below this threshold.
    max_iter : int
        Maximum number of gradient steps per binary classifier.

    Attributes set by ``fit``
    -------------------------
    classes : ndarray of the sorted distinct labels.
    weights : ndarray of shape (n_classes, n_features + 1); column 0 is the bias.
    """

    def __init__(self, alpha=0.01, eps=1e-4, max_iter=2000):
        self.alpha = alpha
        self.eps = eps
        self.max_iter = max_iter
        self.weights = None
        self.classes = None

    def sigmoid(self, z):
        """Return the logistic function of ``z`` element-wise.

        Uses the identity sigmoid(z) = 0.5 * (1 + tanh(z / 2)), which is
        algebraically equal to 1 / (1 + exp(-z)) but cannot overflow for
        large-magnitude inputs.
        """
        return 0.5 * (1.0 + np.tanh(0.5 * np.asarray(z, dtype=float)))

    def _add_bias(self, X):
        """Return ``X`` with a leading column of ones (the bias feature)."""
        X = np.asarray(X, dtype=float)
        return np.c_[np.ones(X.shape[0]), X]

    def logistic_regression(self, X, y):
        """Fit one binary classifier and return its weight vector.

        ``X`` is the design matrix (bias column already included) and ``y``
        holds the 0/1 targets. Weights start at zero.
        """
        # Plain ndarrays keep the matrix products free of pandas index alignment.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        m, n = X.shape
        w = np.zeros(n)

        for _ in range(self.max_iter):
            y_pred = self.sigmoid(X @ w)
            gradient = (X.T @ (y_pred - y)) / m
            if np.linalg.norm(gradient) < self.eps:
                break
            w -= self.alpha * gradient

        return w

    def fit(self, X, y):
        """Fit one binary classifier per distinct label in ``y``.

        Returns ``self`` so that calls can be chained.
        """
        X = self._add_bias(X)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        self.classes = np.unique(y)
        self.weights = np.array(
            [self.logistic_regression(X, (y == c).astype(float)) for c in self.classes]
        )
        return self

    def predict(self, X):
        """Return the predicted label for every row of ``X``."""
        X = self._add_bias(X)
        # The sigmoid is monotonic, so the arg-max over raw scores selects the
        # same class as the arg-max over probabilities, without the ties that
        # appear once several probabilities saturate at 1.0.
        scores = X @ self.weights.T
        return self.classes[np.argmax(scores, axis=1)]

# Train the one-vs-rest logistic regression on the PCA-reduced training set,
# then predict labels for the PCA-reduced test set.
model_logistic = LogisticRegression(alpha=0.01, max_iter=2000)
model_logistic.fit(X_train_pca, y_train)
y_pred_lr = model_logistic.predict(X_test_pca)


In [None]:
# Test-set metrics for logistic regression; the multi-class F1, precision and
# recall are computed with average='weighted'.
print(f"Accuracy of Logistic Regression with PCA: {accuracy_score(y_test, y_pred_lr):.4f}")
print(f"F1 Score of Logistic Regression with PCA: {f1_score(y_test, y_pred_lr, average='weighted'):.4f}")
print(f"Precision Score of Logistic Regression with PCA: {precision_score(y_test, y_pred_lr, average='weighted'):.4f}")
print(f"Recall Score of Logistic Regression with PCA: {recall_score(y_test, y_pred_lr, average='weighted'):.4f}")

Accuracy of Logistic Regression with PCA: 0.8729
F1 Score of Logistic Regression with PCA: 0.8716
Precision Score of Logistic Regression with PCA: 0.8734
Recall Score of Logistic Regression with PCA: 0.8729


# **Quadratic Discriminant Analysis (QDA)**

In [None]:
class QDA:
    """Quadratic discriminant analysis with one Gaussian per class.

    ``fit`` estimates, for every label, the class mean, the sample covariance
    matrix, its inverse, and the log prior. ``predict`` assigns each sample to
    the label with the largest quadratic discriminant score.

    Attributes set by ``fit``
    -------------------------
    labels : ndarray of the sorted distinct labels.
    means, covariance_matrices, covariance_matrix_inverses,
    label_log_probabilities : dicts keyed by label.
    """

    def __init__(self):
        self.labels = None
        self.means = {}
        self.covariance_matrices = {}
        self.covariance_matrix_inverses = {}
        self.label_log_probabilities = {}

    @staticmethod
    def get_stats(x):
        """Return the column-wise mean and the sample covariance (ddof=1) of ``x``."""
        mean = np.mean(x, axis=0)
        covariance_matrix = np.cov(x.T, ddof=1)
        return mean, covariance_matrix

    @staticmethod
    def get_label_log_probabilities(y):
        """Return the log prior of every distinct label, in sorted-label order.

        Counting via ``np.unique`` works for any label type and for label sets
        that are not the contiguous integers 0..K-1 (``np.bincount`` would
        reject strings and would emit zero-count entries for missing integers).
        """
        _, counts = np.unique(np.asarray(y), return_counts=True)
        return np.log(counts / counts.sum())

    def qdf(self, x, mean, covariance_matrix_inverse):
        """Return the quadratic discriminant score of every row of ``x``.

        Up to an additive constant shared by all classes this is the Gaussian
        log density:

            -1/2 log|S| - 1/2 (x - m)^T S^-1 (x - m)
              = +1/2 log|S^-1| - 1/2 m^T S^-1 m + x^T S^-1 m - 1/2 x^T S^-1 x
        """
        x = np.asarray(x, dtype=float)
        cov_mean = covariance_matrix_inverse @ mean
        # log|S^-1| = -log|S|, so this term enters with a PLUS sign. slogdet
        # avoids the under/overflow that det() hits in high dimensions.
        _, log_det_inverse = np.linalg.slogdet(covariance_matrix_inverse)
        # Row-wise x_i^T S^-1 x_i without materialising the n x n product.
        quadratic = np.sum((x @ covariance_matrix_inverse) * x, axis=1)
        return (0.5 * log_det_inverse
                - 0.5 * (mean @ cov_mean)
                + x @ cov_mean
                - 0.5 * quadratic)

    def fit(self, x_train, y_train):
        """Estimate per-class means, covariances, inverses and log priors."""
        x_train = np.asarray(x_train, dtype=float)
        y_train = np.asarray(y_train)

        # Start from a clean slate so a second fit cannot keep stale classes.
        self.means = {}
        self.covariance_matrices = {}
        self.covariance_matrix_inverses = {}

        self.labels = np.unique(y_train)
        label_log_probabilities = self.get_label_log_probabilities(y_train)
        self.label_log_probabilities = dict(zip(self.labels, label_log_probabilities))

        for label in self.labels:
            x_labeled = x_train[y_train == label]
            mean, covariance_matrix = self.get_stats(x_labeled)
            self.means[label] = mean
            self.covariance_matrices[label] = covariance_matrix
            self.covariance_matrix_inverses[label] = np.linalg.inv(covariance_matrix)

    def predict(self, x_test):
        """Return the label with the highest discriminant score for every row."""
        log_posteriors = self.predict_probabilities(x_test)
        return self.labels[np.argmax(log_posteriors, axis=0)]

    def predict_probabilities(self, x_test):
        """Return unnormalised log posteriors, shape (n_labels, n_samples).

        Despite the name, the values are log-scale scores (discriminant plus
        log prior), not probabilities.
        """
        x_test = np.asarray(x_test, dtype=float)
        scores = [
            # Reuse the inverse cached by fit() instead of inverting again.
            self.qdf(x_test, self.means[label], self.covariance_matrix_inverses[label])
            + self.label_log_probabilities[label]
            for label in self.labels
        ]
        return np.array(scores)

# Fit QDA on the PCA-reduced training set and predict the PCA-reduced test set.
model_qda = QDA()
model_qda.fit(X_train_pca, y_train)
y_pred_qda = model_qda.predict(X_test_pca)

In [None]:
# change PCA components to 30
print(f"Accuracy of QDA: {accuracy_score(y_test, y_pred_qda):.4f}")
print(f"F1 Score of QDA: {f1_score(y_test, y_pred_qda, average='weighted'):.4f}")
print(f"Precision Score of QDA: {precision_score(y_test, y_pred_qda, average='weighted', zero_division=0):.4f}")
print(f"Recall Score of QDA: {recall_score(y_test, y_pred_qda, average='weighted'):.4f}")

Accuracy of QDA: 0.7372
F1 Score of QDA: 0.7070
Precision Score of QDA: 0.7158
Recall Score of QDA: 0.7372


# **Support Vector Machine (SVM)**

In [None]:
class SVM:
    """One-vs-rest linear SVM trained with per-sample sub-gradient descent.

    For every label a separate (w, b) pair is learned on the objective
    0.5 * ||w||^2 + C * hinge loss, visiting the samples in their given order.

    Parameters
    ----------
    C : float
        Weight of the hinge-loss term.
    alpha : float
        Step size of each update.
    iter : int
        Number of full passes over the training data.

    Attributes set by ``fit``
    -------------------------
    classes : ndarray of the sorted distinct labels.
    models : dict mapping each label to its ``(w, b)`` pair.
    """

    def __init__(self, C=1.0, alpha=0.01, iter=100):
        self.C = C
        self.alpha = alpha
        self.iter = iter
        self.classes = None
        self.models = {}

    def fit(self, X, y):
        """Train one binary linear SVM per distinct label in ``y``."""
        # ndarrays guarantee that X[i] is row i (on a DataFrame it would be a
        # column lookup) and that label comparison is element-wise.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        self.classes = np.unique(y)
        # Reset so a refit on different labels cannot keep stale classifiers,
        # which would misalign the score columns with self.classes in predict().
        self.models = {}
        n_samples, n_features = X.shape

        for label in self.classes:
            w = np.zeros(n_features)
            b = 0
            y_binary = np.where(y == label, 1, -1)

            for _ in range(self.iter):
                for i in range(n_samples):
                    margin_satisfied = y_binary[i] * (X[i] @ w + b) >= 1
                    if margin_satisfied:
                        # Only the regulariser contributes to the gradient.
                        dw = w
                        db = 0
                    else:
                        dw = w - self.C * y_binary[i] * X[i]
                        db = -self.C * y_binary[i]
                    w -= self.alpha * dw
                    b -= self.alpha * db

            self.models[label] = (w, b)

    def predict(self, X):
        """Return, for every row of ``X``, the label with the highest score."""
        X = np.asarray(X, dtype=float)
        scores = np.zeros((X.shape[0], len(self.classes)))

        # Iterate in self.classes order so column i always belongs to classes[i].
        for i, label in enumerate(self.classes):
            w, b = self.models[label]
            scores[:, i] = X @ w + b

        return self.classes[np.argmax(scores, axis=1)]


# Train the one-vs-rest linear SVM on the PCA-reduced training set (100 passes
# over the data per class) and predict the PCA-reduced test set.
model_svm = SVM(C=1.0, alpha=0.01, iter=100)
model_svm.fit(X_train_pca, y_train)
y_pred_svm = model_svm.predict(X_test_pca)


# **Random Forest**

In [None]:
from sklearn.tree import DecisionTreeClassifier

class RandomForest:
    """Bagged decision trees, each trained on a random subset of the features.

    Every tree is fitted on a bootstrap sample of the rows and on one random
    subset of the columns chosen for that tree; prediction is a majority vote.
    Labels are cast to ``int`` for training and voting, so they must be
    non-negative integers or values convertible to them.

    Parameters
    ----------
    n_estimators : int
        Number of trees.
    max_depth : int or None
        Passed to every ``DecisionTreeClassifier``.
    n_features : int or None
        Number of columns given to each tree; ``None`` means
        ``int(sqrt(total number of columns))``, resolved at fit time.
    random_state : int or None
        Seed for the row/column sampling and for every tree.
    """

    def __init__(self, n_estimators=100, max_depth=None, n_features=None, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.n_features = n_features
        self.random_state = random_state
        self.trees = []
        self._rng = None

    def _random(self):
        """Return the generator created by fit(), or the global one before fit()."""
        rng = getattr(self, "_rng", None)
        return np.random if rng is None else rng

    def bootstrap_sample(self, X, y):
        """Return rows of ``X`` and ``y`` drawn with replacement, same size as the input."""
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]
        idx = self._random().choice(n_samples, size=n_samples, replace=True)
        return X[idx], y[idx]

    def get_rand_features(self, X):
        """Return the indices of a random column subset, drawn without replacement."""
        n_total = X.shape[1]
        # Resolve the default locally instead of overwriting self.n_features, so
        # a later fit on data with a different width recomputes it.
        n_selected = self.n_features if self.n_features is not None else int(np.sqrt(n_total))
        return self._random().choice(n_total, n_selected, replace=False)

    def fit(self, X, y):
        """Train ``n_estimators`` trees, replacing any previously fitted ones."""
        # Convert once up front instead of copying the data for every tree.
        X = np.asarray(X)
        y = np.asarray(y).astype(int)

        # A private RandomState yields the same draws as seeding the global
        # generator with the same seed, without clobbering global NumPy state.
        self._rng = np.random.RandomState(self.random_state)
        self.trees = []

        for _ in range(self.n_estimators):
            X_sample, y_sample = self.bootstrap_sample(X, y)
            features_idx = self.get_rand_features(X_sample)
            tree = DecisionTreeClassifier(max_depth=self.max_depth, random_state=self.random_state)
            tree.fit(X_sample[:, features_idx], y_sample)
            self.trees.append((tree, features_idx))

    def predict(self, X):
        """Return the majority-vote label (as int) for every row of ``X``."""
        if not self.trees:
            raise ValueError("RandomForest.predict called before fit")
        X = np.asarray(X)

        # Shape (n_trees, n_samples): one row of votes per tree.
        tree_preds = np.array(
            [tree.predict(X[:, features_idx]) for tree, features_idx in self.trees]
        ).astype(int)

        # bincount(...).argmax() resolves ties in favour of the smallest label.
        return np.array([np.bincount(votes).argmax() for votes in tree_preds.T])

# Train 100 depth-limited trees on the PCA-reduced training set and predict the
# PCA-reduced test set.
model_rf = RandomForest(n_estimators=100, max_depth=20, random_state=42)
model_rf.fit(X_train_pca, y_train)
y_pred_rf = model_rf.predict(X_test_pca)

In [None]:
# Cast to int so the reference labels have the same type as the integer votes
# returned by RandomForest.predict. Note that this rebinds y_test, so every
# later cell sees the integer version.
y_test = y_test.astype(int)
y_pred_rf = y_pred_rf.astype(int)

# NOTE(review): author's reminder to use 100-150 PCA components for this model;
# the PCA cell in this notebook uses n_components=100 — confirm.
print(f"Accuracy of Random Forest: {accuracy_score(y_test, y_pred_rf):.4f}")
print(f"F1 Score of Random Forest: {f1_score(y_test, y_pred_rf, average='weighted'):.4f}")
print(f"Precision Score of Random Forest: {precision_score(y_test, y_pred_rf, average='weighted'):.4f}")
print(f"Recall Score of Random Forest: {recall_score(y_test, y_pred_rf, average='weighted'):.4f}")


Accuracy of Random Forest: 0.9136
F1 Score of Random Forest: 0.9135
Precision Score of Random Forest: 0.9141
Recall Score of Random Forest: 0.9136


# **Report about MNIST dataset and Implementations**

The MNIST dataset is a large collection of handwritten digits commonly used for evaluating machine learning algorithms in the field of computer vision and image processing. The dataset consists of 70,000 data points, each representing a digit from 0 to 9. Each image is 28x28 pixels and belongs to one of the 10 possible classes (digits 0 to 9).
The pixel values of the images are normalized to the range [0, 1] by dividing by 255. The dataset is split into training and testing sets using train_test_split, with 80% of the data used for training and 20% for testing.
 A mask is created to keep only the features that are non-zero in at least one image, effectively removing features that are all zeros (columns that contain only background pixels). The data is scaled using StandardScaler to standardize the features by removing the mean and scaling to unit variance.
 PCA is applied to reduce the dimensionality of the data. It helps retain the most significant features while reducing the computational cost of subsequent machine learning models.

 **Logistic Regression** struggled with the high dimensionality of the original MNIST dataset (784 features). To address this, Principal Component Analysis (PCA) was applied to reduce the feature space to 150 components, improving both performance and training efficiency.

 **Quadratic Discriminant Analysis (QDA)** was more sensitive to high dimensionality, so the number of PCA components was further reduced to 30 to avoid issues like singular covariance matrices and to improve model stability.

 For the **SVM** implementation, an attempt was made to use the cvxopt solver, but due to RAM limitations, it was not feasible. Instead, a custom SVM was implemented using Stochastic Gradient Descent (SGD), with 100 PCA components and 100 training iterations.



In [None]:
# Normalise the reference labels and every prediction vector to int before
# scoring: the sklearn metrics raise "Mix of label input types (string and
# number)" when y_true and y_pred do not share one label type.
y_true = np.asarray(y_test).astype(int)
predictions = {
    'Logistic Regression (PCA)': y_pred_lr,
    'QDA': y_pred_qda,
    'SVM': y_pred_svm,
    'Random Forest': y_pred_rf,
}
predictions = {name: np.asarray(pred).astype(int) for name, pred in predictions.items()}

# One row per model, built from a single mapping so each metric is written once.
# zero_division=0 makes precision 0 (without a warning) for a class that is
# never predicted.
metrics = {
    'Model': list(predictions),
    'Accuracy': [accuracy_score(y_true, pred) for pred in predictions.values()],
    'F1 Score': [f1_score(y_true, pred, average='weighted') for pred in predictions.values()],
    'Precision': [
        precision_score(y_true, pred, average='weighted', zero_division=0)
        for pred in predictions.values()
    ],
    'Recall': [recall_score(y_true, pred, average='weighted') for pred in predictions.values()],
}

df_metrics = pd.DataFrame(metrics)
print(df_metrics.round(4))

ValueError: Mix of label input types (string and number)