## Download and preprocess data

In [1]:
!kaggle competitions download -c digit-recognizer -p .

digit-recognizer.zip: Skipping, found more recently modified local copy (use --force to force download)


- Use cuda to train

In [2]:
import pandas as pd
import numpy as np
import torch.optim as optim
import torch.nn.functional as F
import zipfile
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torchvision.transforms as transforms

from sklearn.model_selection import train_test_split

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [3]:
with zipfile.ZipFile("digit-recognizer.zip", "r") as zip_ref:
    zip_ref.extractall(".")  


In [4]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [5]:
train.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
test.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


- Split train dataset to train and validate dataset

In [7]:
X = train.drop("label", axis=1).values / 255.0  # Normalize
y = train["label"].values
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.05, random_state=42)

- Use torchvision transformation to augment and populate more data for more generalization and robustness

In [8]:
transform = transforms.Compose([
    transforms.RandomRotation(15),        # Rotate ±15 degrees
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),  # Shift by 10%
    transforms.RandomHorizontalFlip(),    # Horizontal Flip (not useful for digits, but added for variety)
    transforms.RandomErasing(p=0.3)       # Randomly erase part of the image
])

def apply_transform(X):
    X = X.view(-1, 1, 28, 28)  # Reshape to (N, C, H, W)
    X = transform(X)           # Apply transforms
    X = X.view(-1, 784)        # Flatten back
    return X


def generate_augmented_data(X, y, num_augments=2):
    X_aug, y_aug = [], []

    X_tensor = torch.tensor(X, dtype=torch.float32)
    y_tensor = torch.tensor(y, dtype=torch.long)

    for _ in range(num_augments):
        X_transformed = apply_transform(X_tensor)
        X_aug.append(X_transformed)
        y_aug.append(y_tensor)

    X_aug = torch.cat([X_tensor] + X_aug)  # Stack original + augmented
    y_aug = torch.cat([y_tensor] + y_aug)

    return X_aug.numpy(), y_aug.numpy()

X_train_aug, y_train_aug = generate_augmented_data(X_train, y_train, num_augments=2)  # Doubles dataset



### Prepare dataloader

In [9]:
X_train_tensor = torch.tensor(X_train_aug, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_aug, dtype=torch.long)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)
X_test_tensor = torch.tensor(test.values / 255.0, dtype=torch.float32)


batch_size = 128
train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val_tensor, y_val_tensor), batch_size=batch_size, shuffle=False)
test_loader = DataLoader(TensorDataset(X_test_tensor), batch_size=batch_size, shuffle=False)



### Defining the model:
- Fully connected NLP, 10 hidden layers, use dropout with skip connection
- Activation function is `swish`

In [10]:
def swish(x):
    return x * torch.sigmoid(x)

class MLP(nn.Module):
    def __init__(self, input_size=784, hidden_size=1024, output_size=10, dropout_rate=0.3):
        super(MLP, self).__init__()
        
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.dropout1 = nn.Dropout(dropout_rate)
        
        self.hidden_layers = nn.ModuleList()
        self.batch_norms = nn.ModuleList()
        self.dropouts = nn.ModuleList()
        
        for _ in range(9):  # 9 additional hidden layers
            self.hidden_layers.append(nn.Linear(hidden_size, hidden_size))
            self.batch_norms.append(nn.BatchNorm1d(hidden_size))
            self.dropouts.append(nn.Dropout(dropout_rate))
        
        self.fc_out = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x_res = swish(self.bn1(self.fc1(x)))
        x_res = self.dropout1(x_res)
        
        for i in range(9):
            x_new = swish(self.batch_norms[i](self.hidden_layers[i](x_res)))
            x_new = self.dropouts[i](x_new)
            x_res = x_res + x_new  # Skip connection
        
        out = self.fc_out(x_res)
        return out



- Use a scheduler for learning rate to help convergence

In [12]:
model = MLP(hidden_size=512).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)  

epochs=200
for epoch in range(epochs):
    model.train()
    total_loss, correct = 0, 0
    
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        optimizer.zero_grad()
        outputs = model(X_batch)
        
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        correct += (outputs.argmax(1) == y_batch).sum().item()

    train_acc = correct / len(X_train_aug)
    
    # Validation
    model.eval()
    correct = 0
    with torch.no_grad():
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
            outputs = model(X_val_batch)
            correct += (outputs.argmax(1) == y_val_batch).sum().item()
    
    val_acc = correct / len(X_val)
    
    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss:.4f} - Train Acc: {train_acc:.4f} - Val Acc: {val_acc:.4f}")

    # Adjust learning rate
    scheduler.step()



Epoch 1/200 - Loss: 2192.3208 - Train Acc: 0.8111 - Val Acc: 0.9324
Epoch 2/200 - Loss: 379.0217 - Train Acc: 0.8996 - Val Acc: 0.9467
Epoch 3/200 - Loss: 243.8747 - Train Acc: 0.9247 - Val Acc: 0.9576
Epoch 4/200 - Loss: 229.5039 - Train Acc: 0.9338 - Val Acc: 0.9524
Epoch 5/200 - Loss: 177.8361 - Train Acc: 0.9436 - Val Acc: 0.9595
Epoch 6/200 - Loss: 160.8040 - Train Acc: 0.9485 - Val Acc: 0.9643
Epoch 7/200 - Loss: 146.6121 - Train Acc: 0.9528 - Val Acc: 0.9662
Epoch 8/200 - Loss: 154.0072 - Train Acc: 0.9515 - Val Acc: 0.9690
Epoch 9/200 - Loss: 138.3157 - Train Acc: 0.9560 - Val Acc: 0.9710
Epoch 10/200 - Loss: 121.0048 - Train Acc: 0.9597 - Val Acc: 0.9714
Epoch 11/200 - Loss: 118.3342 - Train Acc: 0.9612 - Val Acc: 0.9700
Epoch 12/200 - Loss: 111.4227 - Train Acc: 0.9629 - Val Acc: 0.9757
Epoch 13/200 - Loss: 100.6233 - Train Acc: 0.9658 - Val Acc: 0.9714
Epoch 14/200 - Loss: 99.9965 - Train Acc: 0.9671 - Val Acc: 0.9757
Epoch 15/200 - Loss: 99.3849 - Train Acc: 0.9670 - Val Ac

### Generate .csv file for submission

In [None]:
model.eval()
predictions = []
with torch.no_grad():
    for X_test_batch in test_loader:
        X_test_batch = X_test_batch[0].to(device)
        outputs = model(X_test_batch)
        predictions.extend(outputs.argmax(1).cpu().numpy())

# Create submission file
submission = pd.DataFrame({"ImageId": np.arange(1, len(predictions) + 1), "Label": predictions})
submission.to_csv("submission.csv", index=False)


Screenshot: https://github.com/phuongwhuynh/NLP_Lab/blob/main/Lab8/hw/Screenshot.png