## STAT3612: Statistical Machine Learning
### Assignment 2: Classification
### DUE: Nov 10, 2024, Sunday, 11:59 PM

In [1]:
! pip install numpy pandas matplotlib scikit-learn keras

Collecting numpy
  Downloading numpy-2.1.3-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting pandas
  Using cached pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting matplotlib
  Using cached matplotlib-3.9.2-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.5.2-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting keras
  Downloading keras-3.6.0-py3-none-any.whl.metadata (5.8 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.0-cp311-cp311-win_amd64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.54.1-cp311-cp311-win_amd64.whl.metadat

In [2]:
# Q7 (a)

# ------------------
from sklearn import datasets
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# Fetch MNIST from OpenML; features are divided by 255 (normalization), labels kept as-is.
mnist = datasets.fetch_openml('mnist_784', version=1)
X = mnist.data / 255.0
y = mnist.target

# Hold out 20% of the samples for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Supervised projection: 10 digit classes allow at most 9 discriminant components.
# The projection is fitted on the training split only and then applied to both splits.
lda = LDA(n_components=9)
lda.fit(X_train, y_train)
X_train_lda = lda.transform(X_train)
X_test_lda = lda.transform(X_test)

# Logistic regression on the LDA-projected features.
classifier = LogisticRegression(max_iter=1000).fit(X_train_lda, y_train)

y_pred = classifier.predict(X_test_lda)
accuracy = accuracy_score(y_test, y_pred)

print(f"Testing data accuracy: {accuracy * 100:.2f}%")
# ------------------

Testing data accuracy: 88.36%


In [3]:
# Q7 (b)

# ------------------
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression

# Keep only the digits '0' and '1'.
# X was already divided by 255 in the Q7 (a) cell, so it must NOT be rescaled here.
# The subset gets its own names so the full X, y stay intact and this cell gives
# the same result every time it is re-run.
mask = (y == '0') | (y == '1')
X_bin, y_bin = X[mask], y[mask]

X_train, X_test, y_train, y_test = train_test_split(X_bin, y_bin, test_size=0.2, random_state=42)

lda = LDA(n_components=1)  # two classes allow at most 1 discriminant component
X_train_lda = lda.fit_transform(X_train, y_train)
X_test_lda = lda.transform(X_test)

classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_lda, y_train)

y_pred = classifier.predict(X_test_lda)

# Labels are strings, so the positive class has to be named explicitly.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='1')
recall = recall_score(y_test, y_pred, pos_label='1')
f1 = f1_score(y_test, y_pred, pos_label='1')

print(f"Testing data accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
# ------------------

Testing data accuracy: 99.59%
Precision: 1.00
Recall: 1.00
F1-Score: 1.00


In [6]:
from sklearn import datasets
from keras.datasets import cifar10

# Load the CIFAR-10 train/test arrays and report the shape of each one.
train_split, test_split = cifar10.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

for caption, array in (
    ('X_train shape:', x_train),
    ('X_test shape:', x_test),
    ('y_train shape:', y_train),
    ('y_test shape:', y_test),
):
    print(caption, array.shape)

ModuleNotFoundError: No module named 'wrapt'

Consider flattening the images into 1D vectors so that X has shape (Batch, dim).

In [7]:
# Q8 (a)

# ------------------
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from keras.datasets import cifar10

SEED = 42  # SAGA shuffles the data, so fix the seed to make the curve reproducible

# Load CIFAR-10 dataset
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Flatten the images: one row per image, i.e. X has shape (Batch, dim)
x_train = x_train.reshape(x_train.shape[0], -1)
x_test = x_test.reshape(x_test.shape[0], -1)

# Normalize the data; the scaler is fitted on the training split only and the
# same statistics are then applied to the test split
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Flatten the labels
y_train = y_train.flatten()
y_test = y_test.flatten()

# Define the range of weights for L1 penalty
weights = np.logspace(-4, 4, 10)

# List to store accuracy for each weight
accuracies = []

# Train and evaluate the model for each weight
for weight in weights:
    # C is the INVERSE of the regularisation strength, hence 1 / weight.
    # multi_class is not passed: it is deprecated since scikit-learn 1.5, and the
    # saga solver already fits a multinomial model for multiclass targets.
    model = LogisticRegression(
        penalty='l1',
        solver='saga',
        C=1 / weight,
        max_iter=100,
        random_state=SEED,
    )
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    print(f'Weight: {weight}, Accuracy: {accuracy}')

# Plot the accuracy vs. weight curve
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(weights, accuracies, marker='o')
ax.set_xscale('log')
ax.set_xlabel('Weight of L1 penalty (log scale)')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy vs. Weight of L1 penalty')
ax.grid(True)
plt.show()
# ------------------

ModuleNotFoundError: No module named 'wrapt'

In [None]:
# Q8 (b)

def train_with_SGD(model, X_train, y_train, lr, epoch_num):
  """Q8 (b) template: train `model` with SGD. Not implemented yet.

  As written, the function performs no training and returns `model` unchanged;
  `X_train`, `y_train`, `lr` and `epoch_num` are not used.

  Args:
    model: object handed back to the caller as-is.
    X_train: presumably the training features - TODO confirm against the assignment.
    y_train: presumably the training labels - TODO confirm against the assignment.
    lr: presumably the SGD learning rate - TODO confirm.
    epoch_num: presumably the number of passes over the training data - TODO confirm.

  Returns:
    The same `model` object that was passed in.
  """

  # ------------------
  # Write your code here


  # ------------------

  return model


def test(model, X_test, y_test):

  # ------------------
  # Write your code here


  # ------------------


# ------------------
# Write your code here








# ------------------