#  Saving & Loading Models

In [None]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms 

In [None]:
# --- Model / optimisation hyper-parameters ---
input_size    = 28 * 28   # images have size 28 x 28 -> 784 features once flattened
hidden_size   = 100
num_classes   = 10
learning_rate = 0.001
batch_size    = 100

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --- Data: MNIST, converted to tensors ---
training_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)
# download=False: the test split reuses whatever is already present under ./data
test_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=False,
    transform=transforms.ToTensor(),
    download=False,
)

# Shuffle only the training batches; keep the test order fixed
train_loader = torch.utils.data.DataLoader(
    dataset=training_dataset,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
class MLP(nn.Module):
    """Two-layer perceptron: flatten -> linear -> ReLU -> linear.

    Args:
        input_size: number of features per sample once the input is flattened.
        hidden_size: width of the hidden layer.
        num_classes: number of output scores per sample.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # Attribute names double as state_dict key prefixes ("fc1.*", "fc2.*"),
        # so they must stay stable for previously saved weights to load.
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the raw output-layer scores for the batch ``x``."""
        flat = self.flatten(x)
        hidden = self.relu(self.fc1(flat))   # hidden layer + activation
        return self.fc2(hidden)              # output layer, no activation applied


# Build the network from the notebook-level sizes and place it on `device`
mlp_model = MLP(
    input_size=input_size,
    hidden_size=hidden_size,
    num_classes=num_classes,
).to(device)

# Cross-entropy criterion and an Adam optimizer over all model parameters
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=mlp_model.parameters(), lr=learning_rate)

In [None]:
# Display the module's repr, which lists its registered sub-layers
mlp_model

## 1: Saving & Loading Entire Model
    A common PyTorch convention is to save models using either a .pt or .pth file extension.

- ### Save:
    
    torch.save(model, PATH)

>Saves a serialized object to disk. This function uses **Python’s pickle** utility for serialization. Models, tensors, and dictionaries of all kinds of objects can be saved using this function.
 
- ### Load:
    
    model = torch.load(PATH)
    model.eval()
 
> Uses pickle’s unpickling facilities to deserialize pickled object files to memory. This function also lets you choose the device the data is loaded onto (see the `map_location` argument).

> You must call `model.eval()` before running inference. It acts as a switch for specific layers (such as dropout and batch normalization) that behave differently during training and inference (evaluation). Failing to do this will yield inconsistent inference results.
>
> A common practice for evaluation/validation is to use `torch.no_grad()` together with `model.eval()` to turn off gradient computation.

In [None]:
# File that will hold the whole pickled model (saved and re-loaded in the next cells)
PATH = "mlp_model.pth"

In [None]:
# Serialise the entire module object (not just its parameters) to PATH
torch.save(mlp_model, PATH)

In [None]:
# From PyTorch 2.6 on, torch.load defaults to weights_only=True, which refuses to
# unpickle a whole nn.Module, so opt out explicitly (the keyword exists since 1.13).
# SECURITY: unpickling can execute arbitrary code; only do this for files you
# created yourself, never for files from an untrusted source.
# map_location keeps the load working even if the file was written on another device.
loaded_model = torch.load(PATH, map_location=device, weights_only=False)

# Switch to evaluation mode before using the model for inference
loaded_model.eval()

In [None]:
# Optional: uncomment to print every parameter tensor of the re-loaded model
# for param in loaded_model.parameters():
#     print(param)

# 2:  Saving & Loading Trained Model for Inference

>When saving a model for inference, it is only necessary to save the trained model’s learned parameters.



- ### What is a state_dict?
>In PyTorch, the learnable parameters (i.e. weights and biases) of a torch.nn.Module model are contained in the model’s parameters (accessed with model.parameters()). 

>A state_dict is simply a Python dictionary object that maps each layer to its parameter tensor. Note that only layers with learnable parameters (convolutional layers, linear layers, etc.) and registered buffers (batchnorm’s running_mean) have entries in the model’s state_dict. 

>Optimizer objects (torch.optim) also have a state_dict, which contains information about the optimizer’s state, as well as the hyperparameters used.

## torch.nn.Module.load_state_dict: 
>Loads a model’s parameter dictionary using a deserialized **state_dict**.
>
- ### Save:
    torch.save(model.state_dict(), PATH)

- ### Load:
    model = TheModelClass(*args, **kwargs)
    model.load_state_dict(torch.load(PATH))
    model.eval()

In [None]:
class MLP(nn.Module):
    """Two-layer perceptron: flatten -> linear -> ReLU -> linear.

    Args:
        input_size: number of features per sample once the input is flattened.
        hidden_size: width of the hidden layer.
        num_classes: number of output scores per sample.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # Attribute names double as state_dict key prefixes ("fc1.*", "fc2.*"),
        # so they must stay stable for previously saved weights to load.
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the raw output-layer scores for the batch ``x``."""
        flat = self.flatten(x)
        hidden = self.relu(self.fc1(flat))   # hidden layer + activation
        return self.fc2(hidden)              # output layer, no activation applied


# Start again from a freshly initialised network on `device`
mlp_model = MLP(
    input_size=input_size,
    hidden_size=hidden_size,
    num_classes=num_classes,
).to(device)

# Cross-entropy criterion and an Adam optimizer over all model parameters
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=mlp_model.parameters(), lr=learning_rate)



In [None]:
num_epochs    = 5

for epoch in range(num_epochs):
    for i, (images,labels) in enumerate(train_loader):
        # The model was moved to `device`; the batch must live on the same device,
        # otherwise the forward pass raises a device-mismatch error on a GPU machine.
        images = images.to(device)
        labels = labels.to(device)

        # forward
        pred_labels = mlp_model(images)
        l = loss(pred_labels, labels)

        # backward: clear old gradients, back-propagate, update the weights
        optimizer.zero_grad()
        l.backward()
        optimizer.step()

        # Report progress every 100 batches
        if (i+1)% 100 ==0:
            print(f'epoch: {epoch+1}/{num_epochs}, step {i+1}/{len(train_loader)}, loss = {l.item():.4f}')

In [None]:
# Save only the model's state_dict (its parameter dictionary), not the module object
PATH = "trained_model.pth"
torch.save(mlp_model.state_dict(), PATH)

In [None]:
# Show the dictionary that was just written to disk
print(mlp_model.state_dict())

>Notice that the load_state_dict() function takes a dictionary object, NOT a path to a saved object.

In [None]:
# A state_dict carries no architecture, so first rebuild a model of the same shape
trained_loaded_model = MLP(
    input_size=input_size,
    hidden_size=hidden_size,
    num_classes=num_classes,
).to(device)

# torch.load returns the saved dictionary; load_state_dict copies it into the model
trained_loaded_model.load_state_dict(torch.load(PATH))

# Evaluation mode for inference
trained_loaded_model.eval()

In [None]:
# Optional: uncomment to print every parameter tensor of the re-loaded model
# for param in trained_loaded_model.parameters():
#     print(param)

#  3: Saving & Loading a General Checkpoint for Inference and/or Resuming Training
    When saving a general checkpoint, to be used for either inference or resuming training, you must save more than just the model’s state_dict. It is important to also save the optimizer’s state_dict, as this contains buffers and parameters that are updated as the model trains. Other items that you may want to save are the epoch you left off on, the latest recorded training loss, external torch.nn.Embedding layers, etc. As a result, such a checkpoint is often 2~3 times larger than the model alone.
loss = 0.0851
### Save:
```python
torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            ...
            }, PATH)
```

### Load:
```python
model = TheModelClass(*args, **kwargs)
optimizer = TheOptimizerClass(*args, **kwargs)

checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

model.eval()  # - or - model.train()
```

In [None]:
class MLP(nn.Module):
    """Two-layer perceptron: flatten -> linear -> ReLU -> linear.

    Args:
        input_size: number of features per sample once the input is flattened.
        hidden_size: width of the hidden layer.
        num_classes: number of output scores per sample.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # Attribute names double as state_dict key prefixes ("fc1.*", "fc2.*"),
        # so they must stay stable for previously saved weights to load.
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the raw output-layer scores for the batch ``x``."""
        flat = self.flatten(x)
        hidden = self.relu(self.fc1(flat))   # hidden layer + activation
        return self.fc2(hidden)              # output layer, no activation applied

# Start again from a freshly initialised network on `device`
mlp_model = MLP(
    input_size=input_size,
    hidden_size=hidden_size,
    num_classes=num_classes,
).to(device)

# Cross-entropy criterion and an Adam optimizer over all model parameters
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=mlp_model.parameters(), lr=learning_rate)




In [None]:
# Optional: uncomment to print the model's parameter dictionary
# print(mlp_model.state_dict())

In [None]:
# Show the optimizer's state_dict (its state plus the hyper-parameters per param group)
print(optimizer.state_dict())

In [None]:
num_epochs    = 5

for epoch in range(num_epochs):
    for i, (images,labels) in enumerate(train_loader):
        # The model was moved to `device`; the batch must live on the same device,
        # otherwise the forward pass raises a device-mismatch error on a GPU machine.
        images = images.to(device)
        labels = labels.to(device)

        # forward
        pred_labels = mlp_model(images)
        l = loss(pred_labels, labels)

        # backward: clear old gradients, back-propagate, update the weights
        optimizer.zero_grad()
        l.backward()
        optimizer.step()

        # Report progress every 100 batches
        if (i+1)% 100 ==0:
            print(f'epoch: {epoch+1}/{num_epochs}, step {i+1}/{len(train_loader)}, loss = {l.item():.4f}')

In [None]:
# File that will hold the training checkpoint
PATH = 'checkpoint.pth'

# Bundle the epoch count together with the model and optimizer state_dicts
checkpoint = dict(
    epoch=num_epochs,
    model_state_dict=mlp_model.state_dict(),
    optimizer_state_dict=optimizer.state_dict(),
)

In [None]:
# Write the checkpoint dictionary to disk
torch.save( checkpoint, PATH )

In [None]:
# Read the checkpoint dictionary back from disk
loaded_checkpoint = torch.load(PATH)

In [None]:
# Epoch counter stored in the checkpoint that was read back from disk
loaded_checkpoint['epoch']

In [None]:
# Fresh model that will receive the weights stored in the checkpoint
model = MLP(
    input_size=input_size,
    hidden_size=hidden_size,
    num_classes=num_classes,
).to(device)

# print(model.state_dict())

In [None]:
# The checkpoint's optimizer state was produced by Adam, so the optimizer that
# receives it must be Adam as well; an SGD instance would end up holding Adam's
# param-group settings and moment buffers, which it cannot use.
# lr=0 is kept as a placeholder, presumably so the printouts show that
# load_state_dict() later restores the saved learning rate.
optimizer = torch.optim.Adam(model.parameters(),lr=0)
print(optimizer.state_dict())

In [None]:
# Restore the weights from the checkpoint that was read back from disk
# (`loaded_checkpoint`), not from the in-memory dict that was originally saved,
# so the file round-trip is what actually gets exercised.
model.load_state_dict(loaded_checkpoint['model_state_dict'])

# Evaluation mode for inference; call model.train() instead to resume training
model.eval()

In [None]:
# Restore the optimizer from the checkpoint that was read back from disk
# (`loaded_checkpoint`), not from the in-memory dict that was originally saved.
optimizer.load_state_dict(loaded_checkpoint['optimizer_state_dict'])

# The printed state now reflects the saved hyper-parameters and buffers
print(optimizer.state_dict())

# 4: Saving & Loading Model Across Devices

### 4.1- Save on GPU, Load on CPU


In [None]:
# Prefer the GPU for this demo, but fall back to the CPU so that "Run All" still
# completes on a machine without CUDA (the save is then an ordinary CPU save).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Creating an instance of the model (sizes come from the notebook-level settings)
# and moving it to the chosen device
model = MLP(input_size, hidden_size, num_classes).to(device)

# Saving the model's state dictionary to the specified path.
# NOTE: PATH is whatever was last assigned above, so that file is overwritten here.
torch.save(model.state_dict(), PATH)


In [None]:
# Setting the device to 'cpu'
device = torch.device('cpu')

# Creating a new instance of the model (sizes come from the notebook-level settings)
model = MLP(input_size, hidden_size, num_classes).to(device)


# Loading the saved state dictionary into the model;
# map_location remaps the stored tensors onto the CPU while loading
model.load_state_dict(torch.load(PATH, map_location=device))

# The model is already on `device`; this call leaves it there and displays it
model.to(device)

### 4.2- Save on GPU, Load on GPU


In [None]:
# Prefer the GPU for this demo, but fall back to the CPU so that "Run All" still
# completes on a machine without CUDA (the save is then an ordinary CPU save).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Creating an instance of the model (sizes come from the notebook-level settings)
# and moving it to the chosen device
model = MLP(input_size, hidden_size, num_classes).to(device)

# Saving the model's state dictionary to the specified path.
# NOTE: PATH is whatever was last assigned above, so that file is overwritten here.
torch.save(model.state_dict(), PATH)


In [None]:
# Prefer the GPU for this demo, but fall back to the CPU so that "Run All" still
# completes on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Creating a new instance of the model (sizes come from the notebook-level settings)
# directly on the target device
model = MLP(input_size, hidden_size, num_classes).to(device)

# Loading the saved state dictionary into the model. No map_location is needed
# when the file is loaded on the same kind of device it was saved from.
model.load_state_dict(torch.load(PATH))

# The model is already on `device`; this call leaves it there and displays it
model.to(device)


### 4.3- Save on CPU, Load on GPU


In [None]:
# Setting the device to 'cpu'
device = torch.device('cpu')

# Creating an instance of the model (sizes come from the notebook-level settings)
# and moving it to the CPU
model = MLP(input_size, hidden_size, num_classes).to(device)


# Saving the model's state dictionary to a specified path
# NOTE: PATH is whatever was last assigned above, so that file is overwritten here.
torch.save(model.state_dict(), PATH)


In [None]:
# Prefer the GPU for this demo, but fall back to the CPU so that "Run All" still
# completes on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Creating a new instance of the model (sizes come from the notebook-level settings)
# directly on the target device
model = MLP(input_size, hidden_size, num_classes).to(device)

# Loading the saved state dictionary into the model; map_location remaps the
# CPU-saved tensors onto the target device while loading.
model.load_state_dict(torch.load(PATH, map_location=device))

# The model is already on `device`; this call leaves it there and displays it
model.to(device)
