# Feed-Forward Neural Network
We will implement our first multilayer neural network that can do digit classification based on the famous MNIST dataset.

We put all the things from the last tutorials together:
- Dataset: MNIST for digit classification
- DataLoader, Transformation: Use the DataLoader to load our dataset and apply a transform to the dataset
- Multilayer Neural Net: Implement a feed-forward neural net with input layer, hidden layer, and output layer
- Activation function: Apply activation functions.
- Loss and optimizer: Set up loss and optimizer
- Training loop: Implement a training loop that can use batch training.
- Model Evaluation: Evaluate our model and calculate the accuracy.
- GPU Support: Additionally, we will make sure that our whole code can also run on the GPU if we have GPU support.

### MNIST Dataset:

In [None]:
import torch
import torch.nn as nn
import torchvision # for datasets
import torchvision.transforms as transforms
import matplotlib.pyplot as plt

# Device configuration: run on the GPU when CUDA is available, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Hyper-parameters
input_size = 28 * 28   # 784: every 28x28 image is flattened into one vector
hidden_size = 100      # units in the hidden layer
num_classes = 10       # one class per digit, 0 through 9
num_epochs = 2         # full passes over the training data
batch_size = 100       # samples per mini-batch
learning_rate = 1e-3   # step size handed to the optimizer

# Step 1: get the MNIST train and test splits from `torchvision.datasets`
# Step 2: wrap both splits in DataLoaders that hand out mini-batches

# MNIST datasets; the transform converts every image to a tensor
train_dataset = torchvision.datasets.MNIST(
    root='./data',                     # dataset files live under ./data
    train=True,                        # training split
    download=True,                     # fetch the files if they are not available yet
    transform=transforms.ToTensor(),
)

test_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=False,                       # test split; no download is requested here
    transform=transforms.ToTensor(),
)

# Data loaders
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True)

# No shuffling for the test set: the sample order does not matter for evaluation
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False)

# Look at one batch of training data: print its shapes and plot the first six digits
examples = iter(train_loader)
# Advance the iterator with the built-in next(); `.next()` is a Python 2 idiom
# that current DataLoader iterators no longer provide.
samples, labels = next(examples)
print(samples.shape, labels.shape)
for i in range(6):
    plt.subplot(2, 3, i + 1)
    plt.imshow(samples[i][0], cmap='gray')  # [i][0]: i-th image, first channel
plt.show()


# Same preview for the test set: take the first batch, print its shapes
# and plot the first six digits
examples = iter(test_loader)
example_data, example_targets = next(examples)
print(example_data.shape, example_targets.shape)

for i, image in enumerate(example_data[:6]):
    plt.subplot(2, 3, i + 1)
    plt.imshow(image[0], cmap='gray')  # image[0]: first channel of the image
plt.show()

### Create a fully connected neural network

In [None]:
class NeuralNet(nn.Module):
    """Fully connected neural network with one hidden layer.

    Data flows through Linear -> ReLU -> Linear. The last layer returns raw
    scores: no activation and no softmax is applied inside the network.

    Args:
        input_size: number of input features per sample.
        hidden_size: number of units in the hidden layer.
        num_classes: number of output scores per sample.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.input_size = input_size
        # Input -> hidden
        self.l1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        # Non-linearity applied to the hidden layer
        self.relu = nn.ReLU()
        # Hidden -> one score per class
        self.l2 = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return the class scores for `x`, whose last dimension is `input_size`."""
        hidden = self.relu(self.l1(x))
        return self.l2(hidden)

# Build the network, then move it to the configured device
model = NeuralNet(input_size=input_size, hidden_size=hidden_size, num_classes=num_classes)
model = model.to(device)
print(model)

In [None]:
# Loss and optimizer
# The cross-entropy loss takes the raw network scores and applies softmax itself
criterion = nn.CrossEntropyLoss()
# Adam updates every parameter of the model using the configured learning rate
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
# Training loop: go over the training set `num_epochs` times, one mini-batch at a time
n_total_steps = len(train_loader)  # number of batches per epoch
for epoch in range(num_epochs):
    # enumerate yields the batch index together with the (images, labels) tuple
    for i, (images, labels) in enumerate(train_loader):
        # Flatten every image: [batch, 1, 28, 28] -> [batch, input_size].
        # -1 lets reshape infer the batch dimension; `input_size` is used instead
        # of a literal 28*28 so the reshape stays in sync with the model's input layer.
        images = images.reshape(-1, input_size).to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update
        optimizer.zero_grad()  # clear the gradients left over from the previous step
        loss.backward()
        optimizer.step()       # update the parameters

        # Report progress every 100 steps
        if (i + 1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

In [None]:
# Testing and evaluation of the trained model.
# Gradients are not needed in the test phase, so everything runs inside
# `torch.no_grad()` (for memory efficiency).
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    # Loop over all test batches, preparing the inputs the same way as in training
    for images, labels in test_loader:
        # `input_size` instead of a literal 28*28 keeps the reshape in sync with the model
        images = images.reshape(-1, input_size).to(device)
        labels = labels.to(device)
        outputs = model(images)

        # torch.max returns (values, indices); the index of the highest score
        # along dimension 1 is the predicted class. `.data` is not needed here
        # because no graph is recorded under no_grad.
        _, predicted = torch.max(outputs, 1)
        n_samples += labels.size(0)                      # samples in the current batch
        n_correct += (predicted == labels).sum().item()  # correct predictions in the batch

    # Total accuracy in percent; the report uses the number of samples actually
    # evaluated instead of a hard-coded dataset size.
    acc = 100.0 * n_correct / n_samples
    print(f'Accuracy of the network on the {n_samples} test images: {acc} %')