# Imports

In [65]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from torchvision.datasets import MNIST
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt

# 1. Data preparation

### 1.1. Read data from file

In [61]:
# Fetch both pre-split MNIST partitions (train=True, then train=False);
# they are merged below so the notebook can make its own split.
data1, data2 = (
    MNIST(root='MNIST_dataset', train=is_train, download=True)
    for is_train in (True, False)
)

# Stack the images and the labels of the two partitions along the sample axis.
all_images = torch.cat([data1.data, data2.data])
all_labels = torch.cat([data1.targets, data2.targets])

Now we have 70,000 images in total.

In [62]:
# Convert the combined tensors to NumPy arrays; `x` (images) and `y` (labels)
# are what the later normalisation and split steps consume.
x, y = all_images.numpy(), all_labels.numpy()

print("Total images:", len(x))

Total images: 70000


### 1.2. Normalizing the data

In [63]:
# Derive the scaled pixels from the raw image tensor rather than from `x` itself,
# so re-running this cell cannot divide by 255 a second time.
x = all_images.numpy() / 255.0  # Normalize pixel values to [0, 1] range

print("min pixel value:", x.min(), ", max pixel value:", x.max())

min pixel value: 0.0 , max pixel value: 1.0


### 1.3. Splitting into train, validation and test sets

In [68]:
# Stage 1: keep 60% of the samples for training, stratified on the labels;
# the remaining 40% is held back for the second split.
x_train, x_rest, y_train, y_rest = train_test_split(
    x,
    y,
    train_size=0.6,
    random_state=42,
    stratify=y,
)

# Stage 2: halve the held-back 40% into validation and test (20% each).
x_val, x_test, y_val, y_test = train_test_split(
    x_rest,
    y_rest,
    train_size=0.5,
    random_state=42,
    stratify=y_rest,
)

print("Training set = ", len(x_train))
print("Validation set = ", len(x_val))
print("Test set = ", len(x_test))

Training set =  42000
Validation set =  14000
Test set =  14000


### 1.4. Pytorch DataLoaders

In [86]:
# Images: NumPy -> float32 tensor with a size-1 axis inserted at position 1.
x_train_tensor, x_val_tensor, x_test_tensor = (
    torch.from_numpy(images).float().unsqueeze(1)
    for images in (x_train, x_val, x_test)
)

# Labels: NumPy -> int64 tensor.
y_train_tensor, y_val_tensor, y_test_tensor = (
    torch.from_numpy(labels).long()
    for labels in (y_train, y_val, y_test)
)

# Pair each image tensor with its label tensor.
train_dataset, val_dataset, test_dataset = (
    torch.utils.data.TensorDataset(features, targets)
    for features, targets in (
        (x_train_tensor, y_train_tensor),
        (x_val_tensor, y_val_tensor),
        (x_test_tensor, y_test_tensor),
    )
)

# Mini-batches of 64; only the training loader reshuffles between epochs.
train_NN_loader, val_NN_loader, test_NN_loader = (
    DataLoader(dataset, batch_size=64, shuffle=reshuffle)
    for dataset, reshuffle in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)

# ?. Neural Network model
### ?.1. Feed Forward NN

In [89]:
class FeedForwardNN(nn.Module):
    """Fully connected feed-forward network with ReLU hidden activations.

    The model is a stack of ``nn.Linear`` layers: one input projection,
    ``number_hidden_layers - 1`` hidden-to-hidden layers, and one output
    layer. ReLU follows every layer except the last, so ``forward`` returns
    raw scores with no softmax applied.

    Args:
        input_size: Number of features per sample after flattening.
        number_hidden_layers: Number of hidden layers. The input projection
            is always created, so any value of 1 or less yields exactly one
            hidden layer.
        hidden_size: Width of every hidden layer.
        output_size: Number of output units.
    """

    def __init__(self, input_size, number_hidden_layers, hidden_size, output_size):
        super().__init__()
        self.layers = nn.ModuleList()
        # Input projection: this is the first hidden layer.
        self.layers.append(nn.Linear(input_size, hidden_size))
        # Remaining hidden layers (none when number_hidden_layers <= 1).
        for _ in range(number_hidden_layers - 1):
            self.layers.append(nn.Linear(hidden_size, hidden_size))
        # Output layer.
        self.layers.append(nn.Linear(hidden_size, output_size))

        self.weight_initialization()

    def weight_initialization(self):
        """Apply He (Kaiming) normal init to every Linear weight and zero the biases."""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.kaiming_normal_(module.weight, nonlinearity='relu')

                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Flatten each sample, run it through the layer stack, and return the raw scores.

        Args:
            x: Batch tensor whose first dimension is the batch size; all
                remaining dimensions are flattened into one feature axis
                (e.g. 28x28 images become 784 features).

        Returns:
            Tensor of shape ``(batch, output_size)``.
        """
        # reshape (unlike view) also accepts non-contiguous input, e.g. a batch
        # that was transposed or permuted before being passed in.
        x = x.reshape(x.size(0), -1)

        for layer in self.layers[:-1]:
            x = F.relu(layer(x))

        # No activation on the final layer.
        x = self.layers[-1](x)

        return x

### ?.2. Training the NN model

In [90]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # for gpu acceleration
print(f"Using {device}")

# Seed PyTorch's global RNG before the model is built so that weight
# initialisation, and the shuffle order of a DataLoader created without its
# own generator, repeat from run to run.
SEED = 42
torch.manual_seed(SEED)

INPUT_SIZE = 784            # 28 x 28 pixels, flattened inside the model
NUMBER_HIDDEN_LAYERS = 2
HIDDEN_SIZE = 64
OUTPUT_SIZE = 10            # one score per class label

LEARNING_RATE = 0.01
NUM_EPOCHS = 10

model = FeedForwardNN(INPUT_SIZE, NUMBER_HIDDEN_LAYERS, HIDDEN_SIZE, OUTPUT_SIZE).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
loss_function = nn.CrossEntropyLoss()

# Select training mode explicitly so the loop does not depend on whatever
# mode the model was last left in.
model.train()

num_total_steps = len(train_NN_loader)  # number of mini-batches per epoch
for epoch in range(NUM_EPOCHS):
    for i, (image, label) in enumerate(train_NN_loader):
        # Move the batch to the same device as the model.
        image = image.to(device)
        label = label.to(device)

        # forward
        outputs = model(image)
        loss = loss_function(outputs, label)

        # backward: clear stale gradients, backpropagate, then update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report the current mini-batch loss every 100 steps.
        if (i+1) % 100 == 0:
            print(f"Epoch: {epoch+1} / {NUM_EPOCHS}, Loss: {loss.item():.4f} ")


Using cpu
Epoch: 1 / 10, Loss: 1.6765 
Epoch: 1 / 10, Loss: 1.1634 
Epoch: 1 / 10, Loss: 0.9080 
Epoch: 1 / 10, Loss: 0.6503 
Epoch: 1 / 10, Loss: 0.6717 
Epoch: 1 / 10, Loss: 0.7028 
Epoch: 2 / 10, Loss: 0.4760 
Epoch: 2 / 10, Loss: 0.5503 
Epoch: 2 / 10, Loss: 0.2975 
Epoch: 2 / 10, Loss: 0.3296 
Epoch: 2 / 10, Loss: 0.4259 
Epoch: 2 / 10, Loss: 0.3305 
Epoch: 3 / 10, Loss: 0.3121 
Epoch: 3 / 10, Loss: 0.2383 
Epoch: 3 / 10, Loss: 0.3732 
Epoch: 3 / 10, Loss: 0.2533 
Epoch: 3 / 10, Loss: 0.2791 
Epoch: 3 / 10, Loss: 0.1457 
Epoch: 4 / 10, Loss: 0.2858 
Epoch: 4 / 10, Loss: 0.3250 
Epoch: 4 / 10, Loss: 0.2869 
Epoch: 4 / 10, Loss: 0.1698 
Epoch: 4 / 10, Loss: 0.3084 
Epoch: 4 / 10, Loss: 0.3650 
Epoch: 5 / 10, Loss: 0.1942 
Epoch: 5 / 10, Loss: 0.1659 
Epoch: 5 / 10, Loss: 0.1525 
Epoch: 5 / 10, Loss: 0.2794 
Epoch: 5 / 10, Loss: 0.3722 
Epoch: 5 / 10, Loss: 0.3825 
Epoch: 6 / 10, Loss: 0.3050 
Epoch: 6 / 10, Loss: 0.0831 
Epoch: 6 / 10, Loss: 0.1142 
Epoch: 6 / 10, Loss: 0.3113 
Epoc

In [92]:
# Print the CUDA version string exposed by the installed PyTorch build.
# NOTE(review): the recorded output is None, which presumably means a CPU-only build (confirm).
print(torch.version.cuda)

None
