## Homework 3 - Parth Doshi, UID - 805623259

In [None]:
import pandas as pd
import numpy as np

# Question 3

In [None]:
def sigmoid(z):
    """
    Compute the logistic sigmoid 1 / (1 + exp(-z)) element-wise.

    Parameters:
    z (scalar or array): The input value(s).

    Returns:
    (scalar or array): Values in [0, 1] with the same shape as z.
    """
    z = np.asarray(z)
    # 1 / (1 + exp(-z)) == exp(-log(exp(0) + exp(-z))). logaddexp evaluates the
    # inner log without ever forming exp(-z), so large negative z no longer
    # overflows (and no longer emits a RuntimeWarning); it simply yields 0.0.
    return np.exp(-np.logaddexp(0.0, -z))

def cross_entropy_loss(y, t, eps=1e-15):
    """
    Compute the mean binary cross-entropy between probabilities and targets.

    Parameters:
    y (array): Predicted probabilities.
    t (array): Target labels (0 or 1).
    eps (float): Probabilities are clipped to [eps, 1 - eps] before taking logs.

    Returns:
    loss (float): The mean cross-entropy loss.
    """
    # Without clipping, a probability of exactly 0 or 1 produces log(0) = -inf,
    # and 0 * -inf = nan poisons the mean even for a perfect prediction.
    y = np.clip(y, eps, 1 - eps)
    return -np.mean(t * np.log(y) + (1 - t) * np.log(1 - y))


In [None]:
class LogisticRegression:
    """
    Binary logistic regression trained with mini-batch gradient descent.

    The model has no bias term: predictions are sigmoid(X @ w). After `fit`,
    the learned weight vector is available as `self.w`.
    """

    def __init__(self, learning_rate, batch_size, max_iters):
        """
        Store the training hyperparameters.

        Parameters:
        learning_rate (float): Step size of each gradient update.
        batch_size (int): Number of rows per mini-batch.
        max_iters (int): Number of full passes over the training data.
        """
        # required hyperparameters
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.max_iters = max_iters

    def fit(self, X, t):
        """
        Learn the weights from training data.

        Parameters:
        X (array): The input features, shape (N, D).
        t (array): The N target labels (0 or 1).
        """
        N, D = X.shape
        # Initialize weights to 0s exactly once. The weights must persist
        # across passes; re-drawing them inside the loop would throw away all
        # previous updates and leave only the last pass's training.
        self.w = np.zeros(D)

        for _ in range(self.max_iters):
            # visit the rows in a fresh random order on every pass
            indices = np.arange(N)
            np.random.shuffle(indices)

            # walk over the shuffled rows one mini-batch at a time
            for start_idx in range(0, N, self.batch_size):
                # the last batch may be shorter than batch_size
                end_idx = min(start_idx + self.batch_size, N)
                batch_indices = indices[start_idx:end_idx]

                X_batch = X[batch_indices]
                t_batch = t[batch_indices]

                # predicted probabilities for the batch
                y_batch = sigmoid(np.dot(X_batch, self.w))

                # gradient of the cross-entropy loss, averaged over the batch
                gradient = np.dot(X_batch.T, (y_batch - t_batch)) / len(t_batch)

                # gradient-descent step
                self.w -= self.learning_rate * gradient

    def predict_proba(self, X):
        """
        Compute the predicted probability of class 1 for each row.

        Parameters:
        X (array): The input features.

        Returns:
        probabilities (array): sigmoid(X @ w) for each row of X.
        """
        return sigmoid(np.dot(X, self.w))

    def predict(self, X):
        """
        Predict the class label for each row.

        Parameters:
        X (array): The input features.

        Returns:
        labels (array): 1 where the predicted probability is >= 0.5, else 0.
        """
        return (self.predict_proba(X) >= 0.5).astype(int)

    def accuracy(self, X, t):
        """
        Compute the accuracy of the model on a given dataset.

        Parameters:
        X (array): The input features.
        t (array): The target labels.

        Returns:
        accuracy (float): The fraction of rows whose predicted label equals t.
        """
        predictions = self.predict(X)
        return np.mean(predictions == t)

    def confusion_mat(self, y_true, y_pred):
        """
        Compute the confusion matrix for binary classification.

        Parameters:
        y_true (array-like): The true binary labels (0 or 1).
        y_pred (array-like): The predicted binary labels (0 or 1).

        Returns:
        TP (int): True Positives
        FP (int): False Positives
        FN (int): False Negatives
        TN (int): True Negatives
        """
        TP = np.sum((y_true == 1) & (y_pred == 1))
        TN = np.sum((y_true == 0) & (y_pred == 0))
        FP = np.sum((y_true == 0) & (y_pred == 1))
        FN = np.sum((y_true == 1) & (y_pred == 0))
        return TP, FP, FN, TN

    def precision_recall_f1(self, y_true, y_pred):
        """
        Compute the precision, recall, and F1-score for binary classification.

        Each metric falls back to 0 when its denominator is 0.

        Parameters:
        y_true (array-like): The true binary labels (0 or 1).
        y_pred (array-like): The predicted binary labels (0 or 1).

        Returns:
        precision (float): The precision value.
        recall (float): The recall value.
        f1 (float): The F1-score value.
        """
        TP, FP, FN, TN = self.confusion_mat(y_true, y_pred)

        # Precision: share of predicted positives that are truly positive
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0

        # Recall: share of true positives that were predicted positive
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0

        # F1-Score: harmonic mean of precision and recall
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        return precision, recall, f1

    def evaluate(self, X, t):
        """
        Evaluate the model on a given dataset.

        Parameters:
        X (array): The input features.
        t (array): The target labels.

        Returns:
        loss (float): The cross-entropy loss
        accuracy (float): The accuracy of the model
        precision (float): The precision of the model
        recall (float): The recall of the model
        f1 (float): The F1-score of the model
        """
        y_pred = self.predict(X)
        y_prob = self.predict_proba(X)

        # the loss uses the probabilities; the other metrics use hard labels
        loss = cross_entropy_loss(y_prob, t)
        accuracy = self.accuracy(X, t)

        precision, recall, f1 = self.precision_recall_f1(t, y_pred)

        return loss, accuracy, precision, recall, f1

# Question 4

### a

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Load the breast cancer dataset that ships with scikit-learn; its `data`
# (features) and `target` (labels) attributes are read in the cells below.
cancer = load_breast_cancer()


### b

In [None]:
# Feature matrix and label vector
X, y = cancer.data, cancer.target

# Hold out 20% of the data as the test set, preserving the class ratio.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Split the remaining 80% into train/validation: 25% of it goes to validation,
# which gives a 60/20/20 train/validation/test split overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    random_state=42,
    stratify=y_train_val,
)

# Learn the standardization statistics from the training rows only, so no
# information from the validation or test rows leaks into the scaling.
scaler = StandardScaler()
scaler.fit(X_train)

# Apply that one fitted transformation to all three splits.
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

### c

In [None]:
# Labels of the training and validation splits taken together
y_train_combined = np.concatenate((y_train, y_val))

# Count how many samples carry each label
class_0_count = np.sum(y_train_combined == 0)
class_1_count = np.sum(y_train_combined == 1)

print(f"Class 0 count in training + validation set: {class_0_count}")
print(f"Class 1 count in training + validation set: {class_1_count}")


Class 0 count in training + validation set: 170
Class 1 count in training + validation set: 285


### d & e

In [None]:
# Model 1: learning rate 0.1, mini-batches of 10 rows, max_iters=1000
model1 = LogisticRegression(
    learning_rate=0.1,
    batch_size=10,
    max_iters=1000,
)
model1.fit(X_train, y_train)

# Each evaluate() call returns (loss, accuracy, precision, recall, f1)
train_metrics = model1.evaluate(X_train, y_train)
val_metrics = model1.evaluate(X_val, y_val)
test_metrics = model1.evaluate(X_test, y_test)

train_loss, train_acc, train_prec, train_rec, train_f1 = train_metrics
val_loss, val_acc, val_prec, val_rec, val_f1 = val_metrics
test_loss, test_acc, test_prec, test_rec, test_f1 = test_metrics

# Report the metrics, one line per split
print(f"Training set: Loss = {train_loss}, Accuracy = {train_acc}, Precision = {train_prec}, Recall = {train_rec}, F1-score = {train_f1}")
print(f"Validation set: Loss = {val_loss}, Accuracy = {val_acc}, Precision = {val_prec}, Recall = {val_rec}, F1-score = {val_f1}")
print(f"Test set: Loss = {test_loss}, Accuracy = {test_acc}, Precision = {test_prec}, Recall = {test_rec}, F1-score = {test_f1}")

Training set: Loss = 0.3114193793669439, Accuracy = 0.9032258064516129, Precision = 0.9593908629441624, Recall = 0.883177570093458, F1-score = 0.9197080291970803
Validation set: Loss = 0.33800057790800325, Accuracy = 0.8508771929824561, Precision = 0.8857142857142857, Recall = 0.8732394366197183, F1-score = 0.8794326241134751
Test set: Loss = 0.4038411571519004, Accuracy = 0.8596491228070176, Precision = 0.9516129032258065, Recall = 0.8194444444444444, F1-score = 0.8805970149253732


In [None]:
# Model 2: learning rate 0.001, mini-batches of 32 rows, max_iters=1000
model2 = LogisticRegression(
    learning_rate=0.001,
    batch_size=32,
    max_iters=1000,
)
model2.fit(X_train, y_train)

# Each evaluate() call returns (loss, accuracy, precision, recall, f1)
train_metrics = model2.evaluate(X_train, y_train)
val_metrics = model2.evaluate(X_val, y_val)
test_metrics = model2.evaluate(X_test, y_test)

train_loss, train_acc, train_prec, train_rec, train_f1 = train_metrics
val_loss, val_acc, val_prec, val_rec, val_f1 = val_metrics
test_loss, test_acc, test_prec, test_rec, test_f1 = test_metrics

# Report the metrics, one line per split
print(f"Training set: Loss = {train_loss}, Accuracy = {train_acc}, Precision = {train_prec}, Recall = {train_rec}, F1-score = {train_f1}")
print(f"Validation set: Loss = {val_loss}, Accuracy = {val_acc}, Precision = {val_prec}, Recall = {val_rec}, F1-score = {val_f1}")
print(f"Test set: Loss = {test_loss}, Accuracy = {test_acc}, Precision = {test_prec}, Recall = {test_rec}, F1-score = {test_f1}")

Training set: Loss = 0.8639039850716768, Accuracy = 0.7243401759530792, Precision = 0.8061224489795918, Recall = 0.7383177570093458, F1-score = 0.7707317073170732
Validation set: Loss = 1.0615579916626554, Accuracy = 0.6403508771929824, Precision = 0.7027027027027027, Recall = 0.7323943661971831, F1-score = 0.7172413793103449
Test set: Loss = 0.9386456117534226, Accuracy = 0.7105263157894737, Precision = 0.8095238095238095, Recall = 0.7083333333333334, F1-score = 0.7555555555555556


### f

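Model 1 achieved better overall performance than Model 2 on all three sets (training, validation, and test): it had lower loss and higher accuracy, precision, recall, and F1-score. This indicates that, in these runs, the higher learning rate (0.1) and smaller batch size (10) improved model performance compared with the lower learning rate (0.001) and larger batch size (32). In addition, in the context of diagnosing cancer, recall may be the most critical metric because it measures the model's ability to correctly identify positive cases (cancerous instances). A high recall means fewer missed cancer cases, which is essential in medical diagnostics, where failing to identify a cancerous instance can have serious consequences. (Note: in scikit-learn's `load_breast_cancer` dataset, label 1 denotes benign and label 0 denotes malignant, so the recall reported above is the recall for the benign class; the recall for cancerous cases would have to be computed with label 0 treated as the positive class.)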
