Natasha Nicholas 

January 20, 2025

HW1 Question 3-4

Question 3

In [1]:
#all packages and libraries used
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score



Parsing and Normalization

In [2]:
#parse and normalize MNIST
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0

x_train_flat = x_train.reshape(x_train.shape[0], -1)
x_test_flat = x_test.reshape(x_test.shape[0], -1)

In [3]:
#parse and normalize 20 NewsGroups
newsgroups = fetch_20newsgroups(subset='all')

texts = newsgroups.data
labels = newsgroups.target
target_names = newsgroups.target_names

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(texts)

Euclidian Distance -- Library

In [4]:
#computes euclidean distance using a library
euclidean_distances = pairwise_distances(x_train_flat, metric='euclidean')


Euclidian Distance -- Function

In [5]:
#computes euclidean distance using a written function

def euclidean_distance_batch_optimized(X, batch_size=100):
    num_samples = X.shape[0]
    distance_matrix = np.zeros((num_samples, num_samples), dtype=np.float32)
    
    for i in range(0, num_samples, batch_size):
        batch_end = min(i + batch_size, num_samples)
        
        batch = X[i:batch_end]

        dist_batch = np.sqrt(np.sum(batch**2, axis=1)[:, np.newaxis] + np.sum(X**2, axis=1) - 2 * np.dot(batch, X.T))
        
        distance_matrix[i:batch_end, :] = dist_batch
    
    return distance_matrix

x_train_flat = x_train.reshape(x_train.shape[0], -1)

euclidean_distances_manual = euclidean_distance_batch_optimized(x_train_flat, batch_size=100)

  dist_batch = np.sqrt(np.sum(batch**2, axis=1)[:, np.newaxis] + np.sum(X**2, axis=1) - 2 * np.dot(batch, X.T))


Cosine Similarity

In [6]:
#computes cosine similarity using a library
cosine_sim_sklearn = cosine_similarity(X)

Question 4

In [5]:
def split_data(X, y, validation_size=0.1, test_size=0.1):
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=validation_size + test_size, random_state=42)
    validation_size_adjusted = validation_size / (validation_size + test_size)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=validation_size_adjusted, random_state=42)
    return X_train, X_val, X_test, y_train, y_val, y_test

#knn preidction
def knn_predict(X_train, y_train, X_test, k=5, distance_matrix=None, batch_size=10):
    predictions = []
    
    num_samples_test = X_test.shape[0]
    
    for i in range(0, num_samples_test, batch_size):
        batch_end = min(i + batch_size, num_samples_test)
        batch = X_test[i:batch_end]
        
        if distance_matrix is None:
            distances = np.linalg.norm(X_train[:, np.newaxis] - batch, axis=2)
        else:
            distances = distance_matrix[i:batch_end, :]
        
        for j in range(batch.shape[0]):
            neighbors_idx = np.argsort(distances[j])[:k]

            neighbor_labels = y_train[neighbors_idx]
            
            most_common = np.bincount(neighbor_labels).argmax()
            predictions.append(most_common)
    
    return np.array(predictions)

#evaluates knn
def evaluate_knn(X_train, y_train, X_test, y_test, k=5, distance_matrix=None, batch_size=10):
    predictions = knn_predict(X_train, y_train, X_test, k, distance_matrix, batch_size)
    
    accuracy = accuracy_score(y_test, predictions)
    
    return accuracy

#mnist
X_train, X_val, X_test, y_train, y_val, y_test = split_data(x_train_flat, y_train)

euclidean_distances_train = pairwise_distances(X_train, metric='euclidean')
euclidean_distances_test = pairwise_distances(X_test, X_train, metric='euclidean')

#knn on MNIST (euclidean distance)
print("Training KNN on MNIST (euclidean distance):")
train_accuracy_mnist = evaluate_knn(X_train, y_train, X_train, y_train, k=5, distance_matrix=euclidean_distances_train, batch_size=10)
test_accuracy_mnist = evaluate_knn(X_train, y_train, X_test, y_test, k=5, distance_matrix=euclidean_distances_test, batch_size=10)

print(f"MNIST Training Accuracy: {train_accuracy_mnist * 100:.2f}%")
print(f"MNIST Test Accuracy: {test_accuracy_mnist * 100:.2f}%")

#20ng
X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, labels)

cosine_sim_train = cosine_similarity(X_train)
cosine_sim_test = cosine_similarity(X_test, X_train)

#knn on 20ng (cosine)
print("\nTraining KNN on 20 Newsgroups (cosine similarity):")
train_accuracy_newsgroups = evaluate_knn(X_train, y_train, X_train, y_train, k=5, distance_matrix=cosine_sim_train, batch_size=10)
test_accuracy_newsgroups = evaluate_knn(X_train, y_train, X_test, y_test, k=5, distance_matrix=cosine_sim_test, batch_size=10)

print(f"20 Newsgroups Training Accuracy: {train_accuracy_newsgroups * 100:.2f}%")
print(f"20 Newsgroups Test Accuracy: {test_accuracy_newsgroups * 100:.2f}%")

Training KNN on MNIST (euclidean distance):
MNIST Training Accuracy: 97.97%
MNIST Test Accuracy: 97.33%

Training KNN on 20 Newsgroups (cosine similarity):
20 Newsgroups Training Accuracy: 5.00%
20 Newsgroups Test Accuracy: 4.83%
