In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import os
import matplotlib.pyplot as plt

class GurmukhiDataset(Dataset):
    """Image-folder dataset whose class sub-directories are named by integer label.

    ``data_dir`` is scanned once at construction time. Every sub-directory
    whose name parses as an ``int`` is treated as a class, and every regular,
    non-hidden file inside it becomes one ``(path, label)`` entry of
    ``self.image_set``. Images are opened lazily in ``__getitem__``.

    Args:
        data_dir: Directory containing one sub-directory per class.
        transform: Optional callable applied to the grayscale PIL image
            before it is returned.
    """

    def __init__(self, data_dir, transform = None):
        print(f"Dataset init, data_dir: {data_dir}")
        self.data_dir = data_dir
        self.image_set = []
        self.transform = transform
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # index -> sample mapping reproducible between runs and machines.
        for class_name in sorted(os.listdir(data_dir)):
            class_path = os.path.join(self.data_dir, class_name)
            if not os.path.isdir(class_path):
                continue
            try:
                label = int(class_name)
            except ValueError:
                # Not a class folder (e.g. ".ipynb_checkpoints"); skip it
                # instead of aborting the whole scan.
                continue
            print(f"  Processing class: {class_name}")
            for img_file in sorted(os.listdir(class_path)):
                img_path = os.path.join(class_path, img_file)
                # Hidden files (e.g. ".DS_Store") and nested directories are
                # not images and would make Image.open fail later.
                if img_file.startswith('.') or not os.path.isfile(img_path):
                    continue
                self.image_set.append((img_path, label))
        print(f"  Total images loaded: {len(self.image_set)}")

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.image_set)

    def __getitem__(self, index):
        """Return ``(image, label)`` for the sample at ``index``.

        The image is converted to single-channel ('L') mode and, when a
        transform was supplied, passed through it.
        """
        image_path, label = self.image_set[index]
        # The context manager releases the file handle as soon as the pixel
        # data has been converted.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('L')
        if self.transform is not None:
            image = self.transform(image)
        return image, label

# Root folder of the GurNum data (must end with a path separator); point it at your local copy.
root_dir = '/Users/sundarasubramanian/yoyo/CIS-583/HW3/Q2_GurMukhi/GurNum/'

# Preprocessing pipeline: resize to 28x28, convert to a tensor, then
# normalise the single channel with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.Resize((28, 28)),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]
)

# The "train/" and "val/" sub-folders hold the training and evaluation splits.
train_dataset = GurmukhiDataset(data_dir=root_dir + "train/", transform=transform)
test_dataset = GurmukhiDataset(data_dir=root_dir + "val/", transform=transform)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# Only the training batches are shuffled; the evaluation order stays fixed
# so the reported test loss is consistent between epochs.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

def l1_regularization(model, lambda_l1=0.001):
    """Return the L1 penalty of ``model``.

    The penalty is ``lambda_l1`` times the sum of absolute values of every
    element of every tensor yielded by ``model.parameters()``.
    """
    penalty = 0
    for tensor in model.parameters():
        penalty = penalty + tensor.abs().sum()
    return lambda_l1 * penalty

def l2_regularization(model, lambda_l2=0.001):
    """Return the L2 penalty of ``model``.

    The penalty is ``lambda_l2`` times the sum of squares of every element
    of every tensor yielded by ``model.parameters()``.
    """
    penalty = 0
    for tensor in model.parameters():
        penalty = penalty + tensor.pow(2).sum()
    return lambda_l2 * penalty

class NN(nn.Module):
    """Two-convolution CNN with a hand-written (inverted) dropout layer.

    Layout: conv(1->32, 3x3) -> ReLU -> maxpool(2) -> conv(32->64, 3x3)
    -> ReLU -> maxpool(2) -> flatten -> fc(1600->128) -> ReLU -> dropout
    -> fc(128->10).

    The ``64 * 5 * 5`` input size of ``fc1`` corresponds to single-channel
    28x28 inputs (28 -> 26 -> 13 -> 11 -> 5 per spatial dimension).

    Args:
        dropout_prob: Probability of zeroing each ``fc1`` activation while
            the module is in training mode. Must lie in ``[0, 1]``.

    Raises:
        ValueError: If ``dropout_prob`` is outside ``[0, 1]``.
    """

    def __init__(self, dropout_prob=0.5):
        super().__init__()
        if not 0.0 <= dropout_prob <= 1.0:
            raise ValueError(f"dropout_prob must be in [0, 1], got {dropout_prob}")
        self.relu = nn.ReLU()
        self.max_pool = nn.MaxPool2d(kernel_size = 2)
        self.conv1 = nn.Conv2d(1, 32, kernel_size = 3)
        self.conv2 = nn.Conv2d(32, 64, kernel_size = 3)
        self.fc1 = nn.Linear(64 * 5 * 5, 128)
        self.fc2 = nn.Linear(128, 10)
        self.dropout_prob = dropout_prob

    def forward(self, x):
        """Map a batch of images to a ``(batch, 10)`` tensor of class scores."""
        x = self.max_pool(self.relu(self.conv1(x)))
        x = self.max_pool(self.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)  # flatten everything except the batch dimension
        x = self.relu(self.fc1(x))
        x = self.dropout_manual(x)
        x = self.fc2(x)
        return x

    def dropout_manual(self, x):
        """Apply inverted dropout to ``x`` in training mode; identity otherwise.

        Kept activations are divided by ``1 - dropout_prob`` so the expected
        value of each activation is unchanged and no rescaling is needed at
        evaluation time.
        """
        if not self.training or self.dropout_prob == 0:
            # Exact identity: no random mask is drawn at all.
            return x
        if self.dropout_prob == 1:
            # Everything is dropped; multiplying (rather than dividing by
            # 1 - p == 0) avoids NaNs and keeps x in the autograd graph.
            return x * 0.0
        keep_prob = 1 - self.dropout_prob
        mask = (torch.rand_like(x) > self.dropout_prob).to(x.dtype)
        return x * mask / keep_prob

def gradient_checking(model, criterion, X, y, epsilon=1e-5, plot=True, max_checks_per_param=None):
    """Compare autograd gradients with central finite differences.

    The check runs on a float64 deep copy of ``model`` put in eval mode, so
    the model passed in is left untouched (weights, gradients and
    train/eval mode) and the finite differences are not swamped by float32
    round-off. For each parameter tensor the relative difference

        ||g - g_num|| / (||g|| + ||g_num|| + 1e-7)

    is computed, where ``g`` is the autograd gradient and ``g_num`` the
    numerical estimate ``(L(w + eps) - L(w - eps)) / (2 * eps)``.

    Args:
        model: Module to check.
        criterion: Loss called as ``criterion(model(X), y)``.
        X: Input batch; cast to float64 for the check.
        y: Targets; cast to long.
        epsilon: Finite-difference step.
        plot: When true (default), show a horizontal bar chart of the
            per-parameter differences.
        max_checks_per_param: If given, check at most this many randomly
            chosen elements of each parameter tensor instead of all of
            them. Every checked element costs two forward passes.

    Returns:
        dict mapping parameter name to its relative gradient difference.
    """
    import copy

    check_model = copy.deepcopy(model).double()
    check_model.eval()  # deterministic forward passes (no dropout noise)

    X, y = X.to(torch.float64), y.to(torch.long)
    check_model.zero_grad()
    loss = criterion(check_model(X), y)
    loss.backward()

    grad_diffs = []
    param_names = []

    # The perturbed forward passes need no autograd graph.
    with torch.no_grad():
        for name, param in check_model.named_parameters():
            if not param.requires_grad or param.grad is None:
                continue

            flat_param = param.data.view(-1)  # view: writes land in the parameter
            flat_grad = param.grad.reshape(-1)

            total = flat_param.numel()
            if max_checks_per_param is None or total <= max_checks_per_param:
                indices = list(range(total))
            else:
                indices = torch.randperm(total)[:max_checks_per_param].tolist()

            numeric = flat_grad.new_zeros(len(indices))
            for slot, i in enumerate(indices):
                # Remember the scalar and write it back afterwards so every
                # element is restored exactly before the next perturbation.
                original_value = flat_param[i].item()

                flat_param[i] = original_value + epsilon
                loss_plus = criterion(check_model(X), y).item()

                flat_param[i] = original_value - epsilon
                loss_minus = criterion(check_model(X), y).item()

                flat_param[i] = original_value
                numeric[slot] = (loss_plus - loss_minus) / (2 * epsilon)

            index = torch.as_tensor(indices, dtype=torch.long, device=flat_grad.device)
            analytic = flat_grad[index]
            grad_diff = torch.norm(analytic - numeric) / (torch.norm(analytic) + torch.norm(numeric) + 1e-7)
            grad_diffs.append(grad_diff.item())
            param_names.append(name)

    if plot:
        plt.figure(figsize=(8, 5))
        plt.barh(param_names, grad_diffs, color='skyblue')
        plt.xlabel("Gradient Difference")
        plt.ylabel("Model Parameters")
        plt.title("Gradient Checking Differences for Each Parameter")
        plt.show()

    return dict(zip(param_names, grad_diffs))

def train_and_evaluate(model, optimizer, criterion, train_loader, test_loader, epochs, regularization='none', lambda_l1=0.001, lambda_l2=0.001):
    """Train ``model`` for ``epochs`` epochs, evaluating after each one.

    Args:
        model: Module to train; switched between train and eval mode here.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Loss called as ``criterion(outputs, labels)``.
        train_loader: Sized iterable of ``(images, labels)`` training batches.
        test_loader: Sized iterable of ``(images, labels)`` evaluation batches.
        epochs: Number of passes over ``train_loader``.
        regularization: ``'none'`` (or ``None``), ``'l1'``, ``'l2'`` or
            ``'l1_l2'``; selects the explicit penalty added to the
            training loss.
        lambda_l1: Strength of the L1 penalty.
        lambda_l2: Strength of the L2 penalty.

    Returns:
        ``(train_losses, test_losses, accuracy)``: the per-epoch mean batch
        losses and the test accuracy of the last epoch (``0.0`` when no
        epoch was run or the test loader is empty). The recorded training
        loss includes the penalty term; the test loss does not.

    Raises:
        ValueError: If ``regularization`` is not one of the supported values.
    """
    if regularization not in ('none', 'l1', 'l2', 'l1_l2', None):
        # A misspelt name would otherwise silently train without any penalty.
        raise ValueError(
            f"Unknown regularization {regularization!r}; "
            "expected 'none', 'l1', 'l2' or 'l1_l2'"
        )
    use_l1 = regularization in ('l1', 'l1_l2')
    use_l2 = regularization in ('l2', 'l1_l2')

    train_losses, test_losses = [], []
    accuracy = 0.0  # defined up front so epochs == 0 still returns a value
    for epoch in range(epochs):
        model.train()
        epoch_train_loss = 0.0
        for images, labels in train_loader:
            outputs = model(images)
            loss = criterion(outputs, labels)

            if use_l1:
                loss = loss + l1_regularization(model, lambda_l1)
            if use_l2:
                loss = loss + l2_regularization(model, lambda_l2)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_train_loss += loss.item()

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        avg_train_loss = epoch_train_loss / max(len(train_loader), 1)
        train_losses.append(avg_train_loss)
        print(f"Epoch: {epoch + 1} / {epochs}; Train Loss ({regularization}): {avg_train_loss:.4f}")

        model.eval()
        test_loss = 0.0
        correct_predictions = 0
        total_samples = 0
        with torch.no_grad():
            for images, labels in test_loader:
                outputs = model(images)
                loss = criterion(outputs, labels)
                test_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                correct_predictions += (predicted == labels).sum().item()
                total_samples += labels.size(0)

        avg_test_loss = test_loss / max(len(test_loader), 1)
        test_losses.append(avg_test_loss)
        accuracy = correct_predictions / total_samples if total_samples else 0.0
        print(f"Epoch: {epoch + 1} / {epochs}; Test Loss ({regularization}): {avg_test_loss:.4f}; Accuracy: {accuracy:.4f}")

    return train_losses, test_losses, accuracy

# Hyperparameters and settings
learning_rate = 0.001
weight_decay = 0.01  # only handed to Adam when a configuration named 'l2_pytorch' is run
epochs = 10
lambda_l1 = 0.0005  # strength of the explicit L1 penalty
lambda_l2 = 0.001  # strength of the explicit L2 penalty
dropout_prob = 0.5  # drop probability passed to NN

# The same loss object is shared by the gradient check and every training run.
criterion = nn.CrossEntropyLoss()

# Configurations to train and compare
regularization_types = ['none', 'l1', 'l2', 'dropout', 'l1_l2_dropout']
# Results of each run, keyed by the configuration name.
all_train_losses = {}
all_test_losses = {}
all_accuracies = {}

# --- Gradient Checking (Run once before training) ---
# NOTE: the check does two forward passes per checked parameter element and
# by default checks every element, so on this network it is slow.
print("Running Gradient Checking for initial model (no regularization)...")
initial_model = NN(dropout_prob=dropout_prob) # Built with dropout_prob, but the check runs in eval mode, where NN.dropout_manual is a no-op
X_sample, y_sample = next(iter(train_loader))  # one training batch serves as the probe input
gradient_checking(initial_model, criterion, X_sample, y_sample)
print("Gradient Checking Completed.\n")

# How each configuration name translates into an actual training setup:
# (penalty handed to train_and_evaluate, whether manual dropout is active).
# Without this table every run would use dropout and 'l1_l2_dropout' would
# receive no L1/L2 penalty, making several configurations identical.
experiment_settings = {
    'none': ('none', False),
    'l1': ('l1', False),
    'l2': ('l2', False),
    'l1_l2': ('l1_l2', False),
    'dropout': ('none', True),
    'l1_l2_dropout': ('l1_l2', True),
    'l2_pytorch': ('none', False),  # L2 via Adam's weight_decay instead of an explicit penalty
}

for regularization_type in regularization_types:
    if regularization_type not in experiment_settings:
        raise ValueError(f"Unknown regularization type: {regularization_type!r}")
    print(f"\n--- Training with Regularization: {regularization_type} ---")
    penalty, use_dropout = experiment_settings[regularization_type]

    # A drop probability of 0 switches the manual dropout layer off for the
    # configurations that are not supposed to use it.
    model = NN(dropout_prob=dropout_prob if use_dropout else 0.0)

    # Adam's built-in weight decay is used only for 'l2_pytorch'; all other
    # configurations rely on the explicit penalties (if any).
    optimizer = optim.Adam(
        model.parameters(),
        lr=learning_rate,
        weight_decay=weight_decay if regularization_type == 'l2_pytorch' else 0,
    )

    train_losses, test_losses, accuracy = train_and_evaluate(
        model=model,
        optimizer=optimizer,
        criterion=criterion,
        train_loader=train_loader,
        test_loader=test_loader,
        epochs=epochs,
        regularization=penalty,
        lambda_l1=lambda_l1,
        lambda_l2=lambda_l2
    )

    all_train_losses[regularization_type] = train_losses
    all_test_losses[regularization_type] = test_losses
    all_accuracies[regularization_type] = accuracy

# --- Plotting Loss Curves ---
# One figure per split: (legend prefix, figure title, per-configuration curves).
loss_figures = (
    ('Train Loss', "Training Loss Comparison", all_train_losses),
    ('Test Loss', "Test Loss Comparison", all_test_losses),
)
for legend_prefix, figure_title, curves_by_type in loss_figures:
    plt.figure(figsize=(12, 6))
    for regularization_type in regularization_types:
        epoch_axis = range(1, epochs + 1)
        plt.plot(epoch_axis, curves_by_type[regularization_type], label=f'{legend_prefix} ({regularization_type})')
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.title(figure_title)
    plt.legend()
    plt.grid(True)
    plt.show()

# --- Print Accuracies ---
# Last-epoch test accuracy of every configuration, in the order they were trained.
print("\n--- Final Accuracies ---")
for regularization_type in regularization_types:
    final_accuracy = all_accuracies[regularization_type]
    print(f"Accuracy ({regularization_type}): {final_accuracy:.4f}")

# --- Performance Comparison and Reasons ---
# Prints a template for the written analysis. The epoch count in the header
# is taken from `epochs` so the message stays correct when that setting changes.
print("\n--- Performance Comparison and Reasons ---")
print(f"Comparison based on Training Loss, Test Loss and Test Accuracy after {epochs} epochs:")
print("Observations and Reasons will be detailed here based on the plots and accuracies.")
print("""
**Observations and Reasons (Example - Replace with your actual analysis after running):**

* **No Regularization:** Serves as the baseline. May overfit to the training data, leading to lower test accuracy compared to regularized models.
* **L1 Regularization:**  Tends to drive less important feature weights to exactly zero, leading to feature selection and potentially simpler models. Might improve generalization and reduce overfitting compared to no regularization.
* **L2 Regularization:**  Penalizes large weights, encouraging weights to be small but not exactly zero.  Helps to reduce overfitting and improve generalization by making the model less sensitive to individual features of the training data.
* **Dropout:** Randomly drops out neurons during training, preventing neurons from co-adapting too much and forcing the network to learn more robust features. Effective in reducing overfitting.
* **L1+L2+Dropout:** Combines the benefits of all three regularization techniques. Might provide the best generalization but also might be harder to tune the regularization strengths (lambdas and dropout probability).

**Expected Outcomes (Hypothetical - Replace with your actual outcomes):**

* Models with L1, L2, and Dropout are expected to have lower test loss and higher test accuracy compared to the model with no regularization, indicating better generalization.
* The combination of L1, L2, and Dropout might offer the best performance, but it depends on the dataset and hyperparameters.
* L1 might lead to sparser weights, which can be an advantage in terms of model interpretability and potentially faster inference in some hardware.
* L2 might lead to smoother weight distributions and stable training.
* Dropout might be very effective in preventing overfitting, especially in larger networks.

**Further Analysis:**

* You can further analyze by varying the regularization strengths (lambda_l1, lambda_l2, dropout_prob) and observing their impact on performance.
* Comparing the magnitude of weights in models trained with L1 vs. L2 regularization can also provide insights.
* Observing the training and test loss curves can indicate if overfitting is occurring and how well each regularization technique mitigates it.

**Remember to replace the "Example" and "Hypothetical" sections with your actual observations and analysis after running the code.**
""")

Dataset init, data_dir: /Users/sundarasubramanian/yoyo/CIS-583/HW3/Q2_GurMukhi/GurNum/train/
  Processing class: 9
  Processing class: 0
  Processing class: 7
  Processing class: 6
  Processing class: 1
  Processing class: 8
  Processing class: 4
  Processing class: 3
  Processing class: 2
  Processing class: 5
  Total images loaded: 1000
Dataset init, data_dir: /Users/sundarasubramanian/yoyo/CIS-583/HW3/Q2_GurMukhi/GurNum/val/
  Processing class: 9
  Processing class: 0
  Processing class: 7
  Processing class: 6
  Processing class: 1
  Processing class: 8
  Processing class: 4
  Processing class: 3
  Processing class: 2
  Processing class: 5
  Total images loaded: 178
Train dataset size: 1000
Test dataset size: 178
Running Gradient Checking for initial model (no regularization)...
