<a href="https://colab.research.google.com/github/mylethidiem/artificial-intelligence-projects/blob/main/Architecture%20Project%20Gradient%20Vanishing%20in%20MLP/6_TrainLayersSeparately.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **0. Import libraries**

In [None]:
import random
import matplotlib.pyplot as plt # truc quan hoa
import numpy as np

import torch
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

from torch import nn
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision.datasets import FashionMNIST #download fashion mnist data

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

def set_seed(seed):
  """Seed every random number generator this notebook relies on.

  Seeds Python's `random`, NumPy and PyTorch with the same value (plus the
  CUDA generators when CUDA is available), and puts cuDNN in deterministic
  mode with benchmark mode switched off.

  Args:
    seed: Seed value handed to each generator.
  """
  # cuDNN flags are set unconditionally, exactly as on CPU-only machines.
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)

  if not torch.cuda.is_available():
    return
  torch.cuda.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)


# Fixed seed applied once, before any data split or weight initialisation.
SEED = 42
set_seed(SEED)

## **1. Prepare dataset**


In [None]:
# Fashion-MNIST train and test splits, downloaded into ./data when missing.
# Every image goes through ToTensor before it is returned.
train_dataset = FashionMNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
test_dataset = FashionMNIST(
    root='./data',
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)

In [None]:
# Hold out part of the official training split for validation.
train_ratio = 0.9
train_size = int(train_ratio * len(train_dataset))  # 90% kept for training
val_size = len(train_dataset) - train_size  # remaining 10% for validation

train_subset, val_subset = random_split(
    dataset=train_dataset,
    lengths=[train_size, val_size],
)

# Only the training batches are reshuffled; validation/test order stays fixed.
batch_size = 64
train_loader = DataLoader(dataset=train_subset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_subset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

print(f"Train size: {len(train_subset)}")
print(f"Validation size: {len(val_subset)}")
print(f"Test size: {len(test_dataset)}")

## **2. Build MLP network**




Mô hình ban đầu gồm 7 layer, sau đó được chia thành 4 mô hình con với số lượng layer lần lượt là 2:2:2:1.

In [None]:
class MLP_1layer(nn.Module):
  """One fully connected layer followed by a sigmoid.

  The block has no classification/output layer of its own; it only maps
  `input_dims` features to `output_dims` activations.

  Args:
    input_dims: Number of features per sample after flattening.
    output_dims: Number of units produced by the layer.
  """

  def __init__(self, input_dims, output_dims):
    super().__init__()
    self.layer1 = nn.Linear(input_dims, output_dims)

    # Replace the default initialisation of every linear layer with
    # N(0, 0.05^2) weights and zero biases.
    linear_layers = [m for m in self.modules() if isinstance(m, nn.Linear)]
    for linear in linear_layers:
      nn.init.normal_(linear.weight, mean=0.0, std=0.05)
      nn.init.constant_(linear.bias, val=0.0)

  def forward(self, x):
    """Flatten everything after the batch dimension, then apply linear + sigmoid."""
    flat = torch.flatten(x, start_dim=1)
    return torch.sigmoid(self.layer1(flat))

In [None]:
class MLP_2layers(nn.Module):
  """Two fully connected layers, each followed by a sigmoid.

  Args:
    input_dims: Number of features per sample after flattening.
    output_dims: Number of units of both layers (the second layer maps
      `output_dims` to `output_dims`).
  """

  def __init__(self, input_dims, output_dims):
    super().__init__()
    self.layer1 = nn.Linear(input_dims, output_dims)
    self.layer2 = nn.Linear(output_dims, output_dims)

    # Replace the default initialisation of every linear layer with
    # N(0, 0.05^2) weights and zero biases.
    linear_layers = [m for m in self.modules() if isinstance(m, nn.Linear)]
    for linear in linear_layers:
      nn.init.normal_(linear.weight, mean=0.0, std=0.05)
      nn.init.constant_(linear.bias, val=0.0)

  def forward(self, x):
    """Flatten everything after the batch dimension, then run both layers."""
    hidden = torch.flatten(x, start_dim=1)
    for linear in (self.layer1, self.layer2):
      hidden = torch.sigmoid(linear(hidden))
    return hidden


In [None]:
input_dims = 28 * 28  # a 28x28-pixel image flattened to 784 values
output_dims = 128  # width shared by every component module

# Build the component modules. The network is split into 4 components with a
# 2:2:2:1 layer ratio.
first = MLP_2layers(input_dims, output_dims)
second = MLP_2layers(output_dims, output_dims)
third = MLP_2layers(output_dims, output_dims)
fourth = MLP_1layer(output_dims, output_dims)

# Learning rate and loss shared by all training stages.
lr = 0.01
criterion = nn.CrossEntropyLoss()

Training function

In [None]:
def _train_one_epoch(model, optimizer):
  """Run one optimisation pass over `train_loader`.

  Returns the mean of the per-batch losses and the fraction of correctly
  classified samples.
  """
  model.train()
  loss_sum = 0.0
  correct = 0.0
  seen = 0

  for inputs, targets in train_loader:
    inputs, targets = inputs.to(device), targets.to(device)

    optimizer.zero_grad()
    logits = model(inputs)
    batch_loss = criterion(logits, targets)
    batch_loss.backward()
    optimizer.step()

    loss_sum += batch_loss.item()
    correct += (logits.argmax(dim=1) == targets).sum().item()
    seen += len(targets)

  return loss_sum / len(train_loader), correct / seen


def _evaluate(model):
  """Score `model` on `val_loader` without tracking gradients.

  Returns the mean of the per-batch losses and the fraction of correctly
  classified samples.
  """
  model.eval()
  loss_sum = 0.0
  correct = 0.0
  seen = 0

  with torch.no_grad():
    for inputs, targets in val_loader:
      inputs, targets = inputs.to(device), targets.to(device)

      logits = model(inputs)
      batch_loss = criterion(logits, targets)

      loss_sum += batch_loss.item()
      correct += (logits.argmax(dim=1) == targets).sum().item()
      seen += len(targets)

  return loss_sum / len(val_loader), correct / seen


def train_model(model, optimizer, epochs=100):
  """Train `model` and validate it after every epoch.

  Uses the module-level `train_loader`, `val_loader`, `criterion` and `device`.

  Args:
    model: Network to optimise; batches are moved to `device` before the
      forward pass.
    optimizer: Optimizer that updates the parameters of `model`.
    epochs: Number of passes over the training data.

  Returns:
    Four lists with one entry per epoch, in this order: train loss,
    train accuracy, validation loss, validation accuracy.
  """
  train_loss_list, train_acc_list = [], []
  val_loss_list, val_acc_list = [], []

  for epoch in range(epochs):
    train_loss, train_acc = _train_one_epoch(model, optimizer)
    train_loss_list.append(train_loss)
    train_acc_list.append(train_acc)

    val_loss, val_acc = _evaluate(model)
    val_loss_list.append(val_loss)
    val_acc_list.append(val_acc)

    # The five-space gaps reproduce the spacing of the original log line.
    print(
      f"Epoch {epoch+1}/{epochs}:     "
      f"Train loss: {train_loss:.4f},     "
      f"Train accuracy: {train_acc:.4f}     "
      f"Validation loss: {val_loss:.4f}     "
      f"Validation accuracy: {val_acc:.4f}"
    )

  return train_loss_list, train_acc_list, val_loss_list, val_acc_list



Visualization function

In [None]:
def visualization_train(train_loss_list, train_acc_list, val_loss_list, val_acc_list):
  """Plot the training and validation curves on a 2x2 grid.

  Top row: train loss and train accuracy (green). Bottom row: validation
  loss and validation accuracy (orange). The figure is shown immediately.

  Args:
    train_loss_list: Per-epoch training loss values.
    train_acc_list: Per-epoch training accuracy values.
    val_loss_list: Per-epoch validation loss values.
    val_acc_list: Per-epoch validation accuracy values.
  """
  fig, ax = plt.subplots(2, 2, figsize=(12, 10))

  # (axes, values, colour, title, y-axis label) for each panel.
  panels = [
    (ax[0, 0], train_loss_list, 'green', 'Train Loss', 'Loss'),
    (ax[0, 1], train_acc_list, 'green', 'Train Accuracy', 'Accuracy'),
    (ax[1, 0], val_loss_list, 'orange', 'Validation Loss', 'Loss'),
    # This panel shows validation data, so it must not be titled 'Train Accuracy'.
    (ax[1, 1], val_acc_list, 'orange', 'Validation Accuracy', 'Accuracy'),
  ]
  for axis, values, color, title, ylabel in panels:
    axis.plot(values, color=color)
    axis.set(title=title, xlabel='Epoch', ylabel=ylabel)

  plt.show()


## **3. Training 1**

In [None]:
# initialize
# Stage 1 network: the first component followed by a fresh 10-way output layer.
model = nn.Sequential(
    first,
    nn.Linear(in_features=128, out_features=10),
).to(device)

optimizer = optim.SGD(model.parameters(), lr=lr)

In [None]:
# Training 1: run train_model for 100 epochs on the current `model` and keep
# the per-epoch train/validation loss and accuracy lists it returns.
train_loss_list, train_acc_list, val_loss_list, val_acc_list = train_model(model, optimizer, epochs=100)

In [None]:
# Plot the loss and accuracy curves recorded by the preceding training run.
visualization_train(train_loss_list, train_acc_list, val_loss_list, val_acc_list)

## **4. Training 2**

In [None]:
# initialize
# `first` is kept fixed: its weights are not updated in this stage.
first.requires_grad_(False)

model = nn.Sequential(
    first,
    second,
    nn.Linear(in_features=128, out_features=10),
).to(device)

optimizer = optim.SGD(model.parameters(), lr=lr)

In [None]:
# Training 2: run train_model for 100 epochs on the current `model` and keep
# the per-epoch train/validation loss and accuracy lists it returns.
train_loss_list, train_acc_list, val_loss_list, val_acc_list = train_model(model, optimizer, epochs=100)

In [None]:
# Plot the loss and accuracy curves recorded by the preceding training run.
visualization_train(train_loss_list, train_acc_list, val_loss_list, val_acc_list)

## **5. Training 3**

In [None]:
# initialize
# Retrain the whole network built so far, with no component frozen.
first.requires_grad_(True)

model = nn.Sequential(
    first,
    second,
    nn.Linear(in_features=128, out_features=10),
).to(device)

optimizer = optim.SGD(model.parameters(), lr=lr)

In [None]:
# Training 3: run train_model for 100 epochs on the current `model` and keep
# the per-epoch train/validation loss and accuracy lists it returns.
train_loss_list, train_acc_list, val_loss_list, val_acc_list = train_model(model, optimizer, epochs=100)

In [None]:
# Plot the loss and accuracy curves recorded by the preceding training run.
visualization_train(train_loss_list, train_acc_list, val_loss_list, val_acc_list)

## **6. Training 4**


In [None]:
# initialize
# Freeze the two components already in the network; `third` and the new
# output layer are the parts added in this stage.
first.requires_grad_(False)
second.requires_grad_(False)

model = nn.Sequential(
    first,
    second,
    third,
    nn.Linear(in_features=128, out_features=10),
).to(device)

optimizer = optim.SGD(model.parameters(), lr=lr)

In [None]:
# Training 4: run train_model for 100 epochs on the current `model` and keep
# the per-epoch train/validation loss and accuracy lists it returns.
train_loss_list, train_acc_list, val_loss_list, val_acc_list = train_model(model, optimizer, epochs=100)

In [None]:
# Plot the loss and accuracy curves recorded by the preceding training run.
visualization_train(train_loss_list, train_acc_list, val_loss_list, val_acc_list)

## **7. Training 5**


In [None]:
# Unfreeze `first` and `second` so the whole three-component network is
# trained together.
first.requires_grad_(True)
second.requires_grad_(True)

model = nn.Sequential(
    first,
    second,
    third,
    nn.Linear(in_features=128, out_features=10),
).to(device)

optimizer = optim.SGD(model.parameters(), lr=lr)

In [None]:
# Training 5: run train_model for 100 epochs on the current `model` and keep
# the per-epoch train/validation loss and accuracy lists it returns.
train_loss_list, train_acc_list, val_loss_list, val_acc_list = train_model(model, optimizer, epochs=100)

In [None]:
# Plot the loss and accuracy curves recorded by the preceding training run.
visualization_train(train_loss_list, train_acc_list, val_loss_list, val_acc_list)

## **8. Training 6**


In [None]:
# Freeze the three components already in the network; `fourth` and the new
# output layer are the parts added in this stage.
first.requires_grad_(False)
second.requires_grad_(False)
third.requires_grad_(False)

model = nn.Sequential(
    first,
    second,
    third,
    fourth,
    nn.Linear(in_features=128, out_features=10),
).to(device)

optimizer = optim.SGD(model.parameters(), lr=lr)

In [None]:
# Training 6: run train_model for 100 epochs on the current `model` and keep
# the per-epoch train/validation loss and accuracy lists it returns.
train_loss_list, train_acc_list, val_loss_list, val_acc_list = train_model(model, optimizer, epochs=100)

In [None]:
# Plot the loss and accuracy curves recorded by the preceding training run.
visualization_train(train_loss_list, train_acc_list, val_loss_list, val_acc_list)

## **9. Training 7**

In [None]:
# Unfreeze `first`, `second` and `third` so the complete network is trained
# together in this final stage.
first.requires_grad_(True)
second.requires_grad_(True)
third.requires_grad_(True)

model = nn.Sequential(
    first,
    second,
    third,
    fourth,
    nn.Linear(128, 10)
).to(device)

optimizer = optim.SGD(model.parameters(), lr=lr)
# model.parameters() returns an iterator, so formatting it directly only shows
# "<generator object ...>"; report the number of scalar parameters instead.
print(f"model parameters = {sum(p.numel() for p in model.parameters())}")

In [None]:
# Training 7: run train_model for 100 epochs on the current `model` and keep
# the per-epoch train/validation loss and accuracy lists it returns.
train_loss_list, train_acc_list, val_loss_list, val_acc_list = train_model(model, optimizer, epochs=100)

In [None]:
# Plot the loss and accuracy curves recorded by the preceding training run.
visualization_train(train_loss_list, train_acc_list, val_loss_list, val_acc_list)

## **10. Evaluation**

In [None]:
# Gather the labels and raw model outputs of every test batch.
test_label, test_predict = [], []

model.eval()
with torch.no_grad():  # inference only
  for X_test, y_test in test_loader:
    X_test = X_test.to(device)
    y_test = y_test.to(device)

    output = model(X_test)

    # Keep the per-batch tensors on the CPU.
    test_label.append(y_test.cpu())
    test_predict.append(output.cpu())

# Merge the batches, then compare the arg-max class with the label.
test_label = torch.cat(test_label, dim=0)
test_predict = torch.cat(test_predict, dim=0)
test_acc = test_predict.argmax(dim=1).eq(test_label).sum().item() / len(test_label)

print(f"Test accuracy: {test_acc}")

In [None]:
# Gather the labels and raw model outputs of every validation batch.
val_label, val_predict = [], []

model.eval()
with torch.no_grad():  # inference only
  for X_val, y_val in val_loader:
    X_val = X_val.to(device)
    y_val = y_val.to(device)

    output = model(X_val)

    # Keep the per-batch tensors on the CPU.
    val_label.append(y_val.cpu())
    val_predict.append(output.cpu())

# Merge the batches, then compare the arg-max class with the label.
val_label = torch.cat(val_label, dim=0)
val_predict = torch.cat(val_predict, dim=0)
val_acc = val_predict.argmax(dim=1).eq(val_label).sum().item() / len(val_label)

print(f"Validation accuracy: {val_acc:.4f}")