In [45]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


In [48]:
# load data set
mnist_data = np.load("../data/mnist-data.npz")
spam_data = np.load("../data/spam-data.npz")
cifar10_data = np.load("../data/cifar10-data.npz")

data_files = [mnist_data, spam_data, cifar_data]

# generate numpy.Generator
rng = np.random.default_rng()

# Q1. Shuffle and Partition

In [55]:
def shuffle_and_partition(data_file, num_validation_samples):
    # Extract features and labels
    print(data_file)
    X = data_file['training_data']
    y = data_file['training_labels']

    # Shuffle the data and labels together in a random order
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    X_shuffled = X[indices]
    y_shuffled = y[indices]

    # Set aside validation samples
    X_train = X_shuffled[:-num_validation_samples]
    y_train = y_shuffled[:-num_validation_samples]
    X_val = X_shuffled[-num_validation_samples:]
    y_val = y_shuffled[-num_validation_samples:]

    return X_train, y_train, X_val, y_val


# Define the validation sample sizes
validation_sample_sizes = [10000, int(0.2 * 5172), 5000]
mnist_vset, spam_vset, cifar_vset = None, None, None

# Process each dataset using a list comprehension
validation_sets = [
    [shuffle_and_partition(data_file, val_samples) for data_file, val_samples in zip(data_files, validation_sample_sizes)] for sets in validation_sets]

0
<numpy.lib.npyio.NpzFile object at 0x16a29a190>


UnboundLocalError: cannot access local variable 'i' where it is not associated with a value

# Q2. SVM

In [44]:
def train_and_plot_svm(X_train, y_train, X_val, y_val, num_train_examples, title):
    # Create and train the SVM model
    svm_model = SVC(kernel='linear')
    svm_model.fit(X_train, y_train)

    # Predict on training and validation sets
    y_train_pred = svm_model.predict(X_train)
    y_val_pred = svm_model.predict(X_val)

    # Calculate accuracy on training and validation sets
    train_accuracy = accuracy_score(y_train, y_train_pred)
    val_accuracy = accuracy_score(y_val, y_val_pred)

    # Plot the accuracies
    plt.plot(num_train_examples, train_accuracy, 'o-', label='Training Accuracy')
    plt.plot(num_train_examples, val_accuracy, 'o-', label='Validation Accuracy')
    plt.xlabel('Number of Training Examples')
    plt.ylabel('Accuracy')
    plt.title(title + ': SVM Accuracy vs. Number of Training Examples')
    plt.legend()
    plt.grid(True)
    plt.show()

# (a) MNIST dataset
mnist_data = np.load('../data/mnist_data.npz')
X_mnist = mnist_data['training_data']
y_mnist = mnist_data['training_labels']
X_mnist_val = mnist_data['test_data'].astype(float) / 255.0  # Normalize test data as well

training_examples = [100, 200, 500, 1000, 2000, 5000, 10000]
train_and_plot_svm(X_mnist, y_mnist, X_mnist_val, y_mnist_val, training_examples, 'MNIST')

# (b) Spam dataset
spam_data = np.load('../data/spam_data.npz')
X_spam = spam_data['training_data']
y_spam = spam_data['training_labels']
X_spam_val = spam_data['test_data']

training_examples = [100, 200, 500, 1000, 2000, X_spam.shape[0]]
train_and_plot_svm(X_spam, y_spam, X_spam_val, y_spam_val, training_examples, 'Spam')

# (c) CIFAR-10 dataset
cifar10_data = np.load('data/cifar10_data.npz')
X_cifar10 = cifar10_data['training_data']
y_cifar10 = cifar10_data['training_labels']
X_cifar10_val = cifar10_data['test_data'].astype(float) / 255.0  # Normalize test data as well

training_examples = [100, 200, 500, 1000, 2000, 5000]
train_and_plot_svm(X_cifar10, y_cifar10, X_cifar10_val, y_cifar10_val, training_examples, 'CIFAR-10')


FileNotFoundError: [Errno 2] No such file or directory: '../data/mnist_data.npz'