In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
import numpy as np

# Optional: For progress visualization
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
# --- Data Preparation ---
train_data = pd.read_csv('/Users/matthew/Downloads/archive/mnist_train.csv')
test_data = pd.read_csv('/Users/matthew/Downloads/archive/mnist_test.csv')

y_train = train_data.iloc[:, [0]]
X_train = train_data.iloc[:, 1:]
y_test = test_data.iloc[:, [0]]
X_test = test_data.iloc[:, 1:]

print(f"x_label shape: {y_train.shape}")
print(f"x_train shape: {X_train.shape}")
print(f"x_test shape: {y_test.shape}")
print(f"x_test shape: {X_test.shape}")

X_train_np = X_train.values.astype(np.float32)
y_train_np = y_train.values.astype(np.int64).squeeze()  # Shape: (60000,)
X_test_np = X_test.values.astype(np.float32)
y_test_np = y_test.values.astype(np.int64).squeeze()    # Shape: (10000,)

# Normalize the feature data (values between 0 and 1)
X_train_np /= 255.0
X_test_np /= 255.0

# Convert numpy arrays to PyTorch tensors
X_train_tensor = torch.from_numpy(X_train_np)
y_train_tensor = torch.from_numpy(y_train_np)
X_test_tensor = torch.from_numpy(X_test_np)
y_test_tensor = torch.from_numpy(y_test_np)

# Verify shapes
print(f"X_train_tensor shape: {X_train_tensor.shape}")
print(f"y_train_tensor shape: {y_train_tensor.shape}")
print(f"X_test_tensor shape: {X_test_tensor.shape}")
print(f"y_test_tensor shape: {y_test_tensor.shape}")

# --- Creating Datasets and DataLoaders ---

# Create TensorDatasets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Define hyperparameters
batch_size = 64

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f'Number of training batches: {len(train_loader)}')
print(f'Number of testing batches: {len(test_loader)}')

# --- Defining the MLP Model ---

class MLP(nn.Module):
    def __init__(self, input_size=784, hidden_sizes=[512, 256], num_classes=10, dropout=0.2):
        super(MLP, self).__init__()
        # Define the first hidden layer
        self.fc1 = nn.Linear(input_size, hidden_sizes[0])
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout)
        
        # Define the second hidden layer
        self.fc2 = nn.Linear(hidden_sizes[0], hidden_sizes[1])
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout)
        
        # Define the output layer
        self.fc3 = nn.Linear(hidden_sizes[1], num_classes)
        
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.dropout1(out)
        
        out = self.fc2(out)
        out = self.relu2(out)
        out = self.dropout2(out)
        
        out = self.fc3(out)
        return out

# Initialize the model
model = MLP().to(device)
print(model)

# --- Setting Up Loss Function and Optimizer ---

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# --- Training the Model ---

# Define number of epochs
num_epochs = 10

# Training loop
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    running_loss = 0.0
    
    # Iterate over batches with progress bar
    for batch_X, batch_y in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', unit='batch'):
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)
        
        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    # Calculate average loss over the epoch
    avg_loss = running_loss / len(train_loader)
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}')

# --- Evaluating the Model ---

# Evaluation function
def evaluate(model, data_loader):
    model.eval()  # Set model to evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():  # Disable gradient computation
        for batch_X, batch_y in data_loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)
            outputs = model(batch_X)
            _, predicted = torch.max(outputs.data, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
    accuracy = 100 * correct / total
    return accuracy

# Calculate accuracy on test data
test_accuracy = evaluate(model, test_loader)
print(f'Accuracy of the MLP on the 10000 test images: {test_accuracy:.2f}%')

# --- Saving the Model (Optional) ---

# Save the trained model
torch.save(model.state_dict(), 'mnist_mlp.pth')
print('Model saved to mnist_mlp.pth')


Using device: cpu
x_label shape: (60000, 1)
x_train shape: (60000, 784)
x_test shape: (10000, 1)
x_test shape: (10000, 784)
X_train_tensor shape: torch.Size([60000, 784])
y_train_tensor shape: torch.Size([60000])
X_test_tensor shape: torch.Size([10000, 784])
y_test_tensor shape: torch.Size([10000])
Number of training batches: 938
Number of testing batches: 157
MLP(
  (fc1): Linear(in_features=784, out_features=512, bias=True)
  (relu1): ReLU()
  (dropout1): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (relu2): ReLU()
  (dropout2): Dropout(p=0.2, inplace=False)
  (fc3): Linear(in_features=256, out_features=10, bias=True)
)


Epoch 1/10: 100%|██████████| 938/938 [00:02<00:00, 358.12batch/s]


Epoch [1/10], Loss: 0.2637


Epoch 2/10: 100%|██████████| 938/938 [00:02<00:00, 400.49batch/s]


Epoch [2/10], Loss: 0.1086


Epoch 3/10: 100%|██████████| 938/938 [00:02<00:00, 392.14batch/s]


Epoch [3/10], Loss: 0.0768


Epoch 4/10: 100%|██████████| 938/938 [00:02<00:00, 397.80batch/s]


Epoch [4/10], Loss: 0.0611


Epoch 5/10: 100%|██████████| 938/938 [00:02<00:00, 411.06batch/s]


Epoch [5/10], Loss: 0.0522


Epoch 6/10: 100%|██████████| 938/938 [00:02<00:00, 400.15batch/s]


Epoch [6/10], Loss: 0.0433


Epoch 7/10: 100%|██████████| 938/938 [00:02<00:00, 423.36batch/s]


Epoch [7/10], Loss: 0.0382


Epoch 8/10: 100%|██████████| 938/938 [00:02<00:00, 395.77batch/s]


Epoch [8/10], Loss: 0.0345


Epoch 9/10: 100%|██████████| 938/938 [00:02<00:00, 385.26batch/s]


Epoch [9/10], Loss: 0.0319


Epoch 10/10: 100%|██████████| 938/938 [00:02<00:00, 417.00batch/s]


Epoch [10/10], Loss: 0.0278
Accuracy of the MLP on the 10000 test images: 98.18%


RuntimeError: File mnist_mlp.pth cannot be opened.