In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import os
import matplotlib.pyplot as plt

class GurmukhiDataset(Dataset):
    """Image-folder dataset whose sub-directory names are the integer labels.

    ``data_dir`` is scanned once at construction time.  Every non-hidden
    sub-directory is treated as one class, its name is parsed with ``int()``
    to obtain the label, and every regular, non-hidden file inside it becomes
    one sample.

    Args:
        data_dir: Directory containing one sub-directory per class.
        transform: Optional callable applied to the grayscale PIL image
            before it is returned.

    Raises:
        ValueError: If a non-hidden class directory name is not an integer.
    """

    def __init__(self, data_dir, transform = None):
        self.data_dir = data_dir
        self.image_set = []  # list of (image path, integer label) pairs
        self.transform = transform
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # index -> sample mapping reproducible between runs.
        for class_name in sorted(os.listdir(data_dir)):
            class_path = os.path.join(self.data_dir, class_name)
            # Hidden entries (.DS_Store, .ipynb_checkpoints, ...) are not classes.
            if class_name.startswith('.') or not os.path.isdir(class_path):
                continue
            label = int(class_name)
            for img_file in sorted(os.listdir(class_path)):
                img_path = os.path.join(class_path, img_file)
                # Hidden files and nested directories are not images and
                # would only fail later when __getitem__ tries to open them.
                if img_file.startswith('.') or not os.path.isfile(img_path):
                    continue
                self.image_set.append((img_path, label))

    def __len__(self):
        """Return the number of indexed samples."""
        return len(self.image_set)

    def __getitem__(self, index):
        """Return ``(image, label)`` for the sample stored at ``index``."""
        image_name, label = self.image_set[index]
        # convert() returns a new image, so the source file can be closed
        # as soon as the conversion is done.
        with Image.open(image_name) as raw_image:
            image = raw_image.convert('L')
        if self.transform:
            image = self.transform(image)
        return image, label

# Dataset root.  It must end with a path separator because the split names
# below are appended by plain string concatenation.
root_dir = '/Users/sundarasubramanian/yoyo/CIS-583/HW3/Q2_GurMukhi/GurNum/'

# Preprocessing pipeline: resize to 28x28, convert to a tensor, then
# normalise the single channel with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.Resize((28, 28)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

train_dataset = GurmukhiDataset(root_dir + "train/", transform)
test_dataset = GurmukhiDataset(root_dir + "val/", transform)

# Both loaders yield shuffled mini-batches of 32 samples.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=True)

def l1_regularization(model, lambda_l1=0.001):
    """Return the L1 penalty ``lambda_l1 * sum(|p|)`` over all model parameters.

    Args:
        model: Object whose ``parameters()`` yields tensors.
        lambda_l1: Scale factor applied to the summed absolute values.

    Returns:
        The scaled penalty (a scalar tensor when the model has parameters).
    """
    penalty = 0
    for weights in model.parameters():
        penalty = penalty + weights.abs().sum()
    return lambda_l1 * penalty
def l2_regularization(model, lambda_l2=0.001):
    """Return the L2 penalty ``lambda_l2 * sum(p ** 2)`` over all model parameters.

    Args:
        model: Object whose ``parameters()`` yields tensors.
        lambda_l2: Scale factor applied to the summed squares.

    Returns:
        The scaled penalty (a scalar tensor when the model has parameters).
    """
    penalty = 0
    for weights in model.parameters():
        penalty = penalty + (weights ** 2).sum()
    return lambda_l2 * penalty

class NN(nn.Module):
    """Small CNN: two conv -> ReLU -> max-pool stages, then two linear layers.

    The first linear layer expects ``64 * 5 * 5`` flattened features and the
    last one produces 10 raw scores per sample.
    """

    def __init__(self):
        super().__init__()
        # Stateless layers shared by several stages of the forward pass.
        self.relu = nn.ReLU()
        self.max_pool = nn.MaxPool2d(kernel_size=2)
        # Feature extractor.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)
        # Classifier head.
        self.fc1 = nn.Linear(in_features=64 * 5 * 5, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Run the network on batch ``x`` and return the 10 scores per sample."""
        features = x
        for conv in (self.conv1, self.conv2):
            features = self.max_pool(self.relu(conv(features)))
        # Collapse everything except the batch dimension.
        flat = features.view(features.size(0), -1)
        hidden = self.relu(self.fc1(flat))
        # dropout_manual is not applied in the forward pass.
        return self.fc2(hidden)

    def dropout_manual(self, x, dropout_prob=0.5):
        """Randomly zero entries of ``x`` while the module is in training mode.

        Entries are kept where a uniform random draw exceeds ``dropout_prob``
        and the result is divided by ``1 - dropout_prob``.  Outside training
        mode ``x`` is returned unchanged.
        """
        if not self.training:
            return x
        keep_mask = (torch.rand(x.shape, device=x.device) > dropout_prob).float()
        return x * keep_mask / (1 - dropout_prob)

def gradient_checking(model, X, y, epsilon=1e-5):
    """Compare backprop gradients with central finite differences and plot them.

    For every trainable parameter tensor, each scalar entry is perturbed by
    ``+epsilon`` and ``-epsilon``, the loss is re-evaluated, and
    ``(loss_plus - loss_minus) / (2 * epsilon)`` is taken as the numerical
    gradient.  The relative difference
    ``||g - g_num|| / (||g|| + ||g_num|| + 1e-7)`` of each parameter tensor is
    shown as a horizontal bar chart.

    The loss is computed with the module-level ``criterion``.  The cost is two
    forward passes per scalar parameter, so this is only practical on a small
    batch and should not be called from an inner training loop.

    Args:
        model: ``nn.Module`` to check.  Parameter values are left unchanged
            and the train/eval mode is restored on exit; the ``.grad``
            fields are overwritten.
        X: Input batch; cast to ``float32``.
        y: Target batch; cast to ``long``.
        epsilon: Finite-difference step.

    Returns:
        None.  The result is displayed with matplotlib.
    """
    grad_diffs = []
    param_names = []

    # The check runs in eval mode, but a caller in the middle of training
    # must get its model back in the mode it had before the call.
    was_training = model.training
    model.eval()

    try:
        # Analytic gradients via backprop.
        X, y = X.to(torch.float32), y.to(torch.long)
        outputs = model(X)
        loss = criterion(outputs, y)
        model.zero_grad()
        loss.backward()

        for name, param in model.named_parameters():
            if not param.requires_grad or param.grad is None:
                continue

            grad_approx = torch.zeros_like(param)
            flat_param = param.data.view(-1)
            flat_approx = grad_approx.view(-1)

            # No autograd graph is needed for the finite-difference passes.
            with torch.no_grad():
                for i in range(param.numel()):
                    # Restore from the saved scalar rather than re-binding
                    # ``param.data`` to a backup tensor: re-binding aliases
                    # the backup, so later perturbations would corrupt it and
                    # the weights would drift by -epsilon.
                    original_value = flat_param[i].item()

                    flat_param[i] = original_value + epsilon
                    loss1 = criterion(model(X), y).item()

                    flat_param[i] = original_value - epsilon
                    loss2 = criterion(model(X), y).item()

                    flat_approx[i] = (loss1 - loss2) / (2 * epsilon)
                    flat_param[i] = original_value

            # Relative error; the sum of norms in the denominator stays
            # positive even when the two gradients point in opposite
            # directions.
            numerator = torch.norm(param.grad - grad_approx)
            denominator = torch.norm(param.grad) + torch.norm(grad_approx) + 1e-7
            grad_diffs.append((numerator / denominator).item())
            param_names.append(name)
    finally:
        model.train(was_training)

    # Plot Gradient Differences
    plt.figure(figsize=(8, 5))
    plt.barh(param_names, grad_diffs, color='skyblue')
    plt.xlabel("Gradient Difference")
    plt.ylabel("Model Parameters")
    plt.title("Gradient Checking Differences for Each Parameter")
    plt.show()

model = NN()

criterion = nn.CrossEntropyLoss()
# NOTE(review): weight_decay is Adam's own L2 penalty and is applied on top of
# the explicit l2_regularization term added to the loss below; confirm that
# the double penalty is intended.
optimizer = optim.Adam(model.parameters(), lr = 0.001, weight_decay=0.01)

epochs = 1
losses = []  # average training loss per epoch (same values as train_losses)
train_losses, test_losses = [], []

for epoch in range(epochs):
    model.train()
    epoch_train_loss = 0
    for images, labels in train_loader:
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss += l1_regularization(model) + l2_regularization(model)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_train_loss += loss.item()

    # Per-epoch average of the regularised training loss.
    avg_loss = epoch_train_loss / len(train_loader)
    train_losses.append(avg_loss)
    losses.append(avg_loss)
    print(f"Epoch: {epoch + 1} / {epochs}; Loss: {avg_loss:.4f}")

    # Evaluation pass.  The test loss is the plain criterion value and does
    # not include the L1/L2 penalty terms that the training loss contains.
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for images, labels in test_loader:
            outputs = model(images)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
    avg_test_loss = test_loss / len(test_loader)
    test_losses.append(avg_test_loss)
    print(f"Epoch: {epoch + 1} / {epochs}; Test Loss: {avg_test_loss:.4f}")

    # The gradient check needs two forward passes per scalar parameter, so it
    # runs once, on a single batch, after the first epoch only.
    if epoch == 0:
        X_sample, y_sample = next(iter(train_loader))
        gradient_checking(model, X_sample, y_sample)

# Plot the training loss curve
plt.plot(range(1, epochs+1), losses)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training Loss Curve")
plt.show()

# Plot Training vs Test Loss
plt.figure(figsize=(8, 5))
plt.plot(range(1, epochs + 1), train_losses, label="Train Loss")
plt.plot(range(1, epochs + 1), test_losses, label="Test Loss", linestyle="--")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Train vs Test Loss")
plt.legend()
plt.show()

In [None]:
class GurmukhiDataset(Dataset):
    """Image-folder dataset whose sub-directory names are the integer labels.

    ``data_dir`` is scanned once at construction time.  Every non-hidden
    sub-directory is treated as one class, its name is parsed with ``int()``
    to obtain the label, and every regular, non-hidden file inside it becomes
    one sample.

    Args:
        data_dir: Directory containing one sub-directory per class.
        transform: Optional callable applied to the grayscale PIL image
            before it is returned.

    Raises:
        ValueError: If a non-hidden class directory name is not an integer.
    """

    def __init__(self, data_dir, transform = None):
        self.data_dir = data_dir
        self.image_set = []  # list of (image path, integer label) pairs
        self.transform = transform
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # index -> sample mapping reproducible between runs.
        for class_name in sorted(os.listdir(data_dir)):
            class_path = os.path.join(self.data_dir, class_name)
            # Hidden entries (.DS_Store, .ipynb_checkpoints, ...) are not classes.
            if class_name.startswith('.') or not os.path.isdir(class_path):
                continue
            label = int(class_name)
            for img_file in sorted(os.listdir(class_path)):
                img_path = os.path.join(class_path, img_file)
                # Hidden files and nested directories are not images and
                # would only fail later when __getitem__ tries to open them.
                if img_file.startswith('.') or not os.path.isfile(img_path):
                    continue
                self.image_set.append((img_path, label))

    def __len__(self):
        """Return the number of indexed samples."""
        return len(self.image_set)

    def __getitem__(self, index):
        """Return ``(image, label)`` for the sample stored at ``index``."""
        image_name, label = self.image_set[index]
        # convert() returns a new image, so the source file can be closed
        # as soon as the conversion is done.
        with Image.open(image_name) as raw_image:
            image = raw_image.convert('L')
        if self.transform:
            image = self.transform(image)
        return image, label

In [None]:
# Dataset root.  It must end with a path separator because the split names
# below are appended by plain string concatenation.
root_dir = '/Users/sundarasubramanian/yoyo/CIS-583/HW3/Q2_GurMukhi/GurNum/'

# Preprocessing pipeline: resize to 28x28, convert to a tensor, then
# normalise the single channel with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.Resize((28, 28)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

train_dataset = GurmukhiDataset(root_dir + "train/", transform)
test_dataset = GurmukhiDataset(root_dir + "val/", transform)

# Both loaders yield shuffled mini-batches of 32 samples.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=True)

def l1_regularization(model, lambda_l1=0.001):
    """Return the L1 penalty ``lambda_l1 * sum(|p|)`` over all model parameters.

    Args:
        model: Object whose ``parameters()`` yields tensors.
        lambda_l1: Scale factor applied to the summed absolute values.

    Returns:
        The scaled penalty (a scalar tensor when the model has parameters).
    """
    penalty = 0
    for weights in model.parameters():
        penalty = penalty + weights.abs().sum()
    return lambda_l1 * penalty
def l2_regularization(model, lambda_l2=0.001):
    """Return the L2 penalty ``lambda_l2 * sum(p ** 2)`` over all model parameters.

    Args:
        model: Object whose ``parameters()`` yields tensors.
        lambda_l2: Scale factor applied to the summed squares.

    Returns:
        The scaled penalty (a scalar tensor when the model has parameters).
    """
    penalty = 0
    for weights in model.parameters():
        penalty = penalty + (weights ** 2).sum()
    return lambda_l2 * penalty

In [None]:
class NN(nn.Module):
    """Small CNN: two conv -> ReLU -> max-pool stages, then two linear layers.

    The first linear layer expects ``64 * 5 * 5`` flattened features and the
    last one produces 10 raw scores per sample.
    """

    def __init__(self):
        super().__init__()
        # Stateless layers shared by several stages of the forward pass.
        self.relu = nn.ReLU()
        self.max_pool = nn.MaxPool2d(kernel_size=2)
        # Feature extractor.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3)
        # Classifier head.
        self.fc1 = nn.Linear(in_features=64 * 5 * 5, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Run the network on batch ``x`` and return the 10 scores per sample."""
        features = x
        for conv in (self.conv1, self.conv2):
            features = self.max_pool(self.relu(conv(features)))
        # Collapse everything except the batch dimension.
        flat = features.view(features.size(0), -1)
        hidden = self.relu(self.fc1(flat))
        # dropout_manual is not applied in the forward pass.
        return self.fc2(hidden)

    def dropout_manual(self, x, dropout_prob=0.5):
        """Randomly zero entries of ``x`` while the module is in training mode.

        Entries are kept where a uniform random draw exceeds ``dropout_prob``
        and the result is divided by ``1 - dropout_prob``.  Outside training
        mode ``x`` is returned unchanged.
        """
        if not self.training:
            return x
        keep_mask = (torch.rand(x.shape, device=x.device) > dropout_prob).float()
        return x * keep_mask / (1 - dropout_prob)

In [None]:
def gradient_checking(model, X, y, epsilon=1e-5):
    """Compare backprop gradients with central finite differences and plot them.

    For every trainable parameter tensor, each scalar entry is perturbed by
    ``+epsilon`` and ``-epsilon``, the loss is re-evaluated, and
    ``(loss_plus - loss_minus) / (2 * epsilon)`` is taken as the numerical
    gradient.  The relative difference
    ``||g - g_num|| / (||g|| + ||g_num|| + 1e-7)`` of each parameter tensor is
    shown as a horizontal bar chart.

    The loss is computed with the module-level ``criterion``.  The cost is two
    forward passes per scalar parameter, so this is only practical on a small
    batch and should not be called from an inner training loop.

    Args:
        model: ``nn.Module`` to check.  Parameter values are left unchanged
            and the train/eval mode is restored on exit; the ``.grad``
            fields are overwritten.
        X: Input batch; cast to ``float32``.
        y: Target batch; cast to ``long``.
        epsilon: Finite-difference step.

    Returns:
        None.  The result is displayed with matplotlib.
    """
    grad_diffs = []
    param_names = []

    # The check runs in eval mode, but a caller in the middle of training
    # must get its model back in the mode it had before the call.
    was_training = model.training
    model.eval()

    try:
        # Analytic gradients via backprop.
        X, y = X.to(torch.float32), y.to(torch.long)
        outputs = model(X)
        loss = criterion(outputs, y)
        model.zero_grad()
        loss.backward()

        for name, param in model.named_parameters():
            if not param.requires_grad or param.grad is None:
                continue

            grad_approx = torch.zeros_like(param)
            flat_param = param.data.view(-1)
            flat_approx = grad_approx.view(-1)

            # No autograd graph is needed for the finite-difference passes.
            with torch.no_grad():
                for i in range(param.numel()):
                    # Restore from the saved scalar rather than re-binding
                    # ``param.data`` to a backup tensor: re-binding aliases
                    # the backup, so later perturbations would corrupt it and
                    # the weights would drift by -epsilon.
                    original_value = flat_param[i].item()

                    flat_param[i] = original_value + epsilon
                    loss1 = criterion(model(X), y).item()

                    flat_param[i] = original_value - epsilon
                    loss2 = criterion(model(X), y).item()

                    flat_approx[i] = (loss1 - loss2) / (2 * epsilon)
                    flat_param[i] = original_value

            # Relative error; the sum of norms in the denominator stays
            # positive even when the two gradients point in opposite
            # directions.
            numerator = torch.norm(param.grad - grad_approx)
            denominator = torch.norm(param.grad) + torch.norm(grad_approx) + 1e-7
            grad_diffs.append((numerator / denominator).item())
            param_names.append(name)
    finally:
        model.train(was_training)

    # Plot Gradient Differences
    plt.figure(figsize=(8, 5))
    plt.barh(param_names, grad_diffs, color='skyblue')
    plt.xlabel("Gradient Difference")
    plt.ylabel("Model Parameters")
    plt.title("Gradient Checking Differences for Each Parameter")
    plt.show()

In [None]:
model = NN()

criterion = nn.CrossEntropyLoss()
# NOTE(review): weight_decay is Adam's own L2 penalty and is applied on top of
# the explicit l2_regularization term added to the loss below; confirm that
# the double penalty is intended.
optimizer = optim.Adam(model.parameters(), lr = 0.001, weight_decay=0.01)

epochs = 1
losses = []  # average training loss per epoch (same values as train_losses)
train_losses, test_losses = [], []

for epoch in range(epochs):
    model.train()
    epoch_train_loss = 0
    for images, labels in train_loader:
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss += l1_regularization(model) + l2_regularization(model)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_train_loss += loss.item()

    # Per-epoch average of the regularised training loss.
    avg_loss = epoch_train_loss / len(train_loader)
    train_losses.append(avg_loss)
    losses.append(avg_loss)
    print(f"Epoch: {epoch + 1} / {epochs}; Loss: {avg_loss:.4f}")

    # Evaluation pass.  The test loss is the plain criterion value and does
    # not include the L1/L2 penalty terms that the training loss contains.
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for images, labels in test_loader:
            outputs = model(images)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
    avg_test_loss = test_loss / len(test_loader)
    test_losses.append(avg_test_loss)
    print(f"Epoch: {epoch + 1} / {epochs}; Test Loss: {avg_test_loss:.4f}")

    # The gradient check needs two forward passes per scalar parameter, so it
    # runs once, on a single batch, after the first epoch only.
    if epoch == 0:
        X_sample, y_sample = next(iter(train_loader))
        gradient_checking(model, X_sample, y_sample)

In [None]:
# Count top-1 hits over every batch of the test loader and report the
# percentage of correct predictions.
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        outputs = model(images)
        # Index of the largest score along dim 1 is the predicted class.
        _, predicted = outputs.data.max(dim=1)
        total = total + labels.size(0)
        correct = correct + (predicted == labels).sum().item()
print(f"Accuracy of prediction: {100 * correct / total: .2f}")

In [None]:
# Training-loss curve.  The per-epoch averages are read from ``train_losses``,
# which the training loop fills with one value per epoch, and the x-axis is
# sized from the data so x and y always have the same length.
plt.plot(range(1, len(train_losses) + 1), train_losses)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training Loss Curve")
plt.show()

In [None]:
# Overlay the per-epoch training and test losses on a single figure.
plt.figure(figsize=(8, 5))
epoch_numbers = range(1, epochs + 1)
plt.plot(epoch_numbers, train_losses, label="Train Loss")
plt.plot(epoch_numbers, test_losses, linestyle="--", label="Test Loss")
plt.title("Train vs Test Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.show()

In [None]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import Dataset, DataLoader
# from torchvision import transforms
# from PIL import Image
# import os
# import matplotlib.pyplot as plt

# class GurmukhiDataset(Dataset):
#     def __init__(self, data_dir, transform=None):
#         self.data_dir = data_dir
#         self.image_set = []
#         self.transform = transform
#         for class_name in os.listdir(data_dir):
#             class_path = os.path.join(self.data_dir, class_name)
#             if os.path.isdir(class_path):
#                 for img_file in os.listdir(class_path):
#                     self.image_set.append((os.path.join(class_path, img_file), int(class_name)))
    
#     def __len__(self):
#         return len(self.image_set)
    
#     def __getitem__(self, index):
#         image_name, label = self.image_set[index]
#         image = Image.open(image_name).convert('L')
#         if self.transform:
#             image = self.transform(image)
#         return image, label

# root_dir = '/Users/sundarasubramanian/yoyo/CIS-583/HW3/Q2_GurMukhi/GurNum/'
# transform = transforms.Compose([transforms.Resize((28, 28)), transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

# train_dataset = GurmukhiDataset(data_dir=f"{root_dir}train/", transform=transform)
# test_dataset = GurmukhiDataset(data_dir=f"{root_dir}val/", transform=transform)

# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# def l1_regularization(model, lambda_l1=0.001):
#     return lambda_l1 * sum(p.abs().sum() for p in model.parameters())

# def l2_regularization(model, lambda_l2=0.001):
#     return lambda_l2 * sum((p ** 2).sum() for p in model.parameters())

# class NN(nn.Module):
#     def __init__(self):
#         super(NN, self).__init__()
#         self.relu = nn.ReLU()
#         self.max_pool = nn.MaxPool2d(kernel_size=2)
#         self.conv1 = nn.Conv2d(1, 32, kernel_size=3)
#         self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
#         self.fc1 = nn.Linear(64 * 5 * 5, 128)
#         self.fc2 = nn.Linear(128, 10)
#         self.dropout = nn.Dropout(0.5)

#     def forward(self, x):
#         x = self.max_pool(self.relu(self.conv1(x)))
#         x = self.max_pool(self.relu(self.conv2(x)))
#         x = x.view(x.size(0), -1)
#         x = self.relu(self.fc1(x))
#         x = self.dropout(x)
#         x = self.fc2(x)
#         return x

# def gradient_checking(model, X, y, epsilon=1e-5):
#     grad_diffs = []
#     param_names = []

#     model.eval()
#     X, y = X.to(torch.float32), y.to(torch.long)
#     outputs = model(X)
#     loss = criterion(outputs, y)
#     model.zero_grad()
#     loss.backward()

#     for name, param in model.named_parameters():
#         if param.requires_grad:
#             param_data = param.data.clone()
#             grad_approx = torch.zeros_like(param)

#             for i in range(param.numel()):
#                 param.data.view(-1)[i] += epsilon
#                 loss1 = criterion(model(X), y).item()

#                 param.data.view(-1)[i] -= 2 * epsilon
#                 loss2 = criterion(model(X), y).item()

#                 grad_approx.view(-1)[i] = (loss1 - loss2) / (2 * epsilon)
#                 param.data = param_data

#             if param.grad is not None:
#                 grad_diff = torch.norm(param.grad - grad_approx) / (torch.norm(param.grad + grad_approx) + 1e-7)
#                 grad_diffs.append(grad_diff.item())
#                 param_names.append(name)

#     plt.figure(figsize=(8, 5))
#     plt.barh(param_names, grad_diffs, color='skyblue')
#     plt.xlabel("Gradient Difference")
#     plt.ylabel("Model Parameters")
#     plt.title("Gradient Checking Differences for Each Parameter")
#     plt.show()

# model = NN().to(device)

# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.01)

# epochs = 10
# losses = []
# train_losses, test_losses = [], []

# for epoch in range(epochs):
#     model.train()
#     epoch_train_loss = 0

#     for images, labels in train_loader:
#         images, labels = images.to(device), labels.to(device)

#         outputs = model(images)
#         loss = criterion(outputs, labels)
#         loss += l1_regularization(model) + l2_regularization(model)

#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
#         epoch_train_loss += loss.item()

#     avg_loss = epoch_train_loss / len(train_loader)
#     train_losses.append(avg_loss)
#     losses.append(avg_loss)

#     print(f"Epoch: {epoch + 1} / {epochs}; Loss: {avg_loss:.4f}")

#     model.eval()
#     test_loss = 0
#     with torch.no_grad():
#         for images, labels in test_loader:
#             images, labels = images.to(device), labels.to(device)
#             outputs = model(images)
#             loss = criterion(outputs, labels)
#             test_loss += loss.item()
    
#     avg_test_loss = test_loss / len(test_loader)
#     test_losses.append(avg_test_loss)

#     print(f"Epoch: {epoch + 1} / {epochs}; Test Loss: {avg_test_loss:.4f}")

#     if epoch == 0:
#         X_sample, y_sample = next(iter(train_loader))
#         X_sample, y_sample = X_sample.to(device), y_sample.to(device)
#         gradient_checking(model, X_sample, y_sample)

# plt.plot(range(1, epochs+1), losses)
# plt.xlabel("Epoch")
# plt.ylabel("Loss")
# plt.title("Training Loss Curve")
# plt.show()

# plt.figure(figsize=(8, 5))
# plt.plot(range(1, epochs + 1), train_losses, label="Train Loss")
# plt.plot(range(1, epochs + 1), test_losses, label="Test Loss", linestyle="--")
# plt.xlabel("Epochs")
# plt.ylabel("Loss")
# plt.title("Train vs Test Loss")
# plt.legend()
# plt.show()