# Saving and Loading Models
PyTorch provides several methods for saving and loading models.

This Demo will cover several methods using an example Model. 

#### Functions for Saving and Loading 
`torch.save()`: Save PyTorch objects (models, tensors, dictionaries, etc...) using Pythons pickle module.

`torch.load()`: Loads PyTorch objects into memory.

`load_state_dict()`: Loads saved parameters from objects. 

In [180]:
# Example Fake Model
import torch.nn as nn
import torch.nn.functional as F

class FakeNet(nn.Module):
    def __init__(self):
        super(FakeNet, self).__init__()
        self.fc1 = nn.Linear(10, 50)
        self.batch_norm = nn.BatchNorm1d(50) 
        self.fc2 = nn.Linear(50, 1)        

    def forward(self, x):
        x = F.relu(self.fc1(x))              
        x = self.batch_norm(x)               
        x = self.fc2(x)                      
        return x


In [181]:
# Create our model
model = FakeNet()
print(model)

FakeNet(
  (fc1): Linear(in_features=10, out_features=50, bias=True)
  (batch_norm): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=50, out_features=1, bias=True)
)


In [182]:
# Create a fake dataset
import torch
from torch.utils.data import Dataset, DataLoader

class FakeDataset(Dataset):
    def __init__(self, num_samples=1000):
        self.num_samples = num_samples

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        # Generate random input data with 10 features
        x = torch.randn(10)
        # Generate a random target value
        y = torch.randn(1)
        return x, y



# Create a dataset and data loader
dataset = FakeDataset(num_samples=1000)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [183]:
# create loss and optimizer
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [184]:
# Train a fake model
N_EPOCHS = 5

for epoch in range(N_EPOCHS):
    running_loss = 0.0
    for i, (inputs, targets) in enumerate(data_loader):
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        
        # Accumulate loss
        running_loss += loss.item()


# Saving and Loading using `state_dict`
`state_dict` is a dictionary that stores all the learnable parameters of a model, like weights and biases as well as hyperparameters of an Optimizer. This makes it easy to save, load, and transfer the model’s parameters, allowing flexible model saving and reloading across different environments.

In [185]:
# Print the state_dict of the model
print(model.state_dict())

OrderedDict({'fc1.weight': tensor([[-0.2281, -0.0470,  0.2462, -0.2436, -0.0946, -0.3025,  0.1570,  0.0669,
          0.1495,  0.3131],
        [ 0.1679, -0.3009,  0.3013,  0.1753,  0.0325, -0.1105, -0.2211, -0.2457,
         -0.1122,  0.2106],
        [-0.1523, -0.2123, -0.1113, -0.2280, -0.3060, -0.0035, -0.0418,  0.1317,
         -0.2353, -0.0075],
        [ 0.0966, -0.2417, -0.1267, -0.1652,  0.3085, -0.2116,  0.2211,  0.0841,
         -0.1746, -0.1794],
        [ 0.2723,  0.1501,  0.1777, -0.0608,  0.0209, -0.0566, -0.1187,  0.1044,
          0.0284, -0.1925],
        [ 0.2183, -0.0458, -0.0868,  0.2422,  0.2899,  0.0156,  0.2748,  0.1606,
          0.1072,  0.2375],
        [ 0.0581, -0.2766, -0.0214, -0.0603, -0.0570,  0.2813, -0.0172, -0.0519,
          0.0022,  0.0754],
        [ 0.0725,  0.0626,  0.1116,  0.0689, -0.0994, -0.1217, -0.2950, -0.0031,
          0.2694,  0.2645],
        [ 0.2746, -0.0389,  0.2735,  0.2441, -0.0763, -0.3111, -0.0373,  0.0636,
          0.0860, -0

In [186]:
# Print the paramters of each layer
for k, v in model.state_dict().items():
    print(f"Layer Name: {k} Parameters:{v.size()}")

Layer Name: fc1.weight Parameters:torch.Size([50, 10])
Layer Name: fc1.bias Parameters:torch.Size([50])
Layer Name: batch_norm.weight Parameters:torch.Size([50])
Layer Name: batch_norm.bias Parameters:torch.Size([50])
Layer Name: batch_norm.running_mean Parameters:torch.Size([50])
Layer Name: batch_norm.running_var Parameters:torch.Size([50])
Layer Name: batch_norm.num_batches_tracked Parameters:torch.Size([])
Layer Name: fc2.weight Parameters:torch.Size([1, 50])
Layer Name: fc2.bias Parameters:torch.Size([1])


In [187]:
# Print the hyperparameters of the Optimizer
print(optimizer.state_dict())

{'state': {}, 'param_groups': [{'lr': 0.01, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'maximize': False, 'foreach': None, 'differentiable': False, 'fused': None, 'params': [0, 1, 2, 3, 4, 5]}]}


In [188]:
# Save the state_dict for each (recommended approach)
import torch

torch.save(model.state_dict(), "model_state_dict.pt") # .pt or pth extension for models

In [189]:
# Save the state_dict for the Optimizer
torch.save(optimizer.state_dict(), "optimizer")

In [190]:
# NOTE: state_dict is ONLY saving the parameters!!!

### Model Inference
REVIEW: Inference is the process of using a trained model to make predictions.

Let's load a model using using its state_dict and prepare it for inference. 

In [191]:
# Create a new model
new_model = FakeNet()
print(new_model)

FakeNet(
  (fc1): Linear(in_features=10, out_features=50, bias=True)
  (batch_norm): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=50, out_features=1, bias=True)
)


In [192]:
# Show the current state_dict
for k, v in new_model.state_dict().items():
    print(f"Layer Name: {k} Parameters:{v}")

Layer Name: fc1.weight Parameters:tensor([[ 0.1562, -0.0331, -0.2115, -0.2645, -0.1664, -0.0214,  0.3064, -0.1800,
          0.2725,  0.0491],
        [-0.0186, -0.0935,  0.1419, -0.0618, -0.1698,  0.0124, -0.0262,  0.2673,
         -0.1691, -0.0201],
        [-0.1629,  0.0486,  0.1208, -0.0841,  0.0433,  0.1216, -0.3088, -0.0044,
         -0.1457,  0.2384],
        [-0.2759, -0.1889, -0.0628,  0.1217, -0.0447,  0.1648, -0.1095, -0.2860,
          0.2170,  0.0545],
        [-0.2098, -0.0973,  0.1577, -0.0325,  0.0589, -0.1601, -0.2941,  0.0223,
          0.1271,  0.2688],
        [-0.1858,  0.1936,  0.2691,  0.1475,  0.2413,  0.0023, -0.2949, -0.1399,
         -0.2384,  0.2677],
        [-0.0810,  0.2775,  0.1493, -0.2103, -0.0083, -0.1030, -0.2814,  0.0369,
          0.1512, -0.1343],
        [ 0.0107, -0.0862, -0.1850, -0.0260,  0.1453, -0.1932,  0.2639, -0.1293,
         -0.1102,  0.0199],
        [-0.2460,  0.1502, -0.1030,  0.0102, -0.1603, -0.2061,  0.1597,  0.1906,
          0.1

In [193]:
# Load the parameters into our model
new_model.load_state_dict(torch.load("model_state_dict.pt", weights_only=True)) # ONLY the parameters!

<All keys matched successfully>

In [194]:
# Print it again to show the difference
for k, v in new_model.state_dict().items():
    print(f"Layer Name: {k} Parameters:{v}")

Layer Name: fc1.weight Parameters:tensor([[-0.2281, -0.0470,  0.2462, -0.2436, -0.0946, -0.3025,  0.1570,  0.0669,
          0.1495,  0.3131],
        [ 0.1679, -0.3009,  0.3013,  0.1753,  0.0325, -0.1105, -0.2211, -0.2457,
         -0.1122,  0.2106],
        [-0.1523, -0.2123, -0.1113, -0.2280, -0.3060, -0.0035, -0.0418,  0.1317,
         -0.2353, -0.0075],
        [ 0.0966, -0.2417, -0.1267, -0.1652,  0.3085, -0.2116,  0.2211,  0.0841,
         -0.1746, -0.1794],
        [ 0.2723,  0.1501,  0.1777, -0.0608,  0.0209, -0.0566, -0.1187,  0.1044,
          0.0284, -0.1925],
        [ 0.2183, -0.0458, -0.0868,  0.2422,  0.2899,  0.0156,  0.2748,  0.1606,
          0.1072,  0.2375],
        [ 0.0581, -0.2766, -0.0214, -0.0603, -0.0570,  0.2813, -0.0172, -0.0519,
          0.0022,  0.0754],
        [ 0.0725,  0.0626,  0.1116,  0.0689, -0.0994, -0.1217, -0.2950, -0.0031,
          0.2694,  0.2645],
        [ 0.2746, -0.0389,  0.2735,  0.2441, -0.0763, -0.3111, -0.0373,  0.0636,
          0.0

In [195]:
# The parameters have been updated after loaded!

In [196]:
# Create example input
import torch
# Random batch size of 1-10 features
sample_input = torch.randn(1, 10)
print(sample_input)

tensor([[ 0.1106,  1.7144,  0.1781,  2.1832,  0.6793, -0.4792,  0.3403, -1.2182,
         -1.1033, -0.5374]])


In [197]:
# Lets do an example infernce on our model
new_model.eval()

# Call the model with input to get a prediction
output = new_model(sample_input)
print(output)

tensor([[-0.1387]], grad_fn=<AddmmBackward0>)


# Saving and Loading entire Model
PyTorch provides the option to save a full model to the filesystem as well.

full model = full python pickle version of model

This can potentially cause issues because it relies on the exact class definitions and file structure from when the model was saved, so loading may fail if used in a different project or after code changes.

In [198]:
# To save a full model
import torch

torch.save(model, "model_full.pt")

In [199]:
# Import the model class from file
from fake_net import FakeNet

# Initialize and use the model
model = FakeNet()
print(model)

FakeNet(
  (fc1): Linear(in_features=10, out_features=50, bias=True)
  (batch_norm): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=50, out_features=1, bias=True)
)


In [200]:
# Try again
torch.save(model, "model_full.pt")

In [201]:
# Look at the size difference
!ls -lh model*

-rw-r--r--@ 1 JABERI  staff   7.9K Nov 12 22:27 model_full.pt
-rw-r--r--@ 1 JABERI  staff   7.0K Nov 12 22:27 model_state_dict.pt


In [202]:
# Load a full model
from fake_net import FakeNet

# Initialize and use the model
new_model = FakeNet()
print(new_model)

FakeNet(
  (fc1): Linear(in_features=10, out_features=50, bias=True)
  (batch_norm): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=50, out_features=1, bias=True)
)


In [203]:
# Load it from the full model
new_model = torch.load("model_full.pt", weights_only=False) # More than just the parameters
print(new_model)

#new_model = FakeNet()            # recreate your architecture
#new_model.load_state_dict(torch.load("model_full.pt", weights_only=False))

FakeNet(
  (fc1): Linear(in_features=10, out_features=50, bias=True)
  (batch_norm): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=50, out_features=1, bias=True)
)


In [204]:
# Check inference
new_model.eval()

# Call the model with input to get a prediction
output = new_model(sample_input)
print(output)

tensor([[0.0808]], grad_fn=<AddmmBackward0>)


# Saving and Loading a Checkpoint
A model checkpoint is a way to save parameters as a snapshot in a point in time. 

This is helpful to continue a long training job that may have failed at some point or to give multiple models as options to use from a training run.

In [205]:
# Save a checkpoint
import torch

# dummy epoch and loss
epoch = 5
loss = 0.05

In [206]:

# Define checkpoint path
checkpoint_path = f"{epoch}_checkpoint.tar"
# Save a checkpoint
torch.save({'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss}, 
            checkpoint_path) # .tar file 

In [207]:
# Print confirmation
print(f"✅ Checkpoint saved successfully: {checkpoint_path}")

✅ Checkpoint saved successfully: 5_checkpoint.tar


In [208]:
import os
import torch

# Go up 3 levels from the current folder to reach the project root
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../.."))

# Define checkpoint path in the project root
checkpoint_path = os.path.join(project_root, f"{epoch}_checkpoint.tar")

# Save checkpoint
torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss
}, checkpoint_path)

# Print confirmation
print(f"✅ Checkpoint saved successfully at: {checkpoint_path}")

✅ Checkpoint saved successfully at: /Users/JABERI/Downloads/PyTorch-main/5_checkpoint.tar


In [209]:
# Load the checkpoint
# Initialize the Model as we have before. NOTE: also optimizer in our case
from fake_net import FakeNet

# Initialize and use the model
model = FakeNet()
print(model)

FakeNet(
  (fc1): Linear(in_features=10, out_features=50, bias=True)
  (batch_norm): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=50, out_features=1, bias=True)
)


In [210]:
# Load the model as a checkpoint
import torch

# Load the tar file
checkpoint = torch.load(f"{epoch}_checkpoint.tar", weights_only='true')

In [211]:
# Show the checkpoint info
print(checkpoint)

{'epoch': 5, 'model_state_dict': OrderedDict({'fc1.weight': tensor([[ 0.1939, -0.0690,  0.1930,  0.3073, -0.0985,  0.2643,  0.0314,  0.2465,
         -0.0899, -0.1004],
        [ 0.2517,  0.1523, -0.1693, -0.0171, -0.2050,  0.2094,  0.0029, -0.1450,
          0.0632,  0.3087],
        [-0.1711, -0.1093, -0.0354, -0.2856,  0.0973, -0.2452, -0.0832,  0.1497,
         -0.2379,  0.2537],
        [-0.2583,  0.1652,  0.0819,  0.2358,  0.1571, -0.0864,  0.0800,  0.1662,
          0.1451, -0.1171],
        [ 0.1074, -0.2723, -0.0270, -0.3016, -0.1467,  0.2596,  0.3072,  0.1203,
         -0.2644, -0.0127],
        [-0.0597,  0.1511,  0.0701, -0.2397,  0.1183, -0.2840, -0.1997, -0.1482,
         -0.2438,  0.2162],
        [ 0.0460, -0.2512,  0.3027, -0.2167, -0.1438, -0.2287,  0.2517, -0.1988,
         -0.1735,  0.1271],
        [ 0.0013, -0.0672,  0.1027, -0.1696, -0.1584,  0.1351,  0.0048,  0.1264,
         -0.2722,  0.0732],
        [-0.0551, -0.0351, -0.2239,  0.0305, -0.1977, -0.2706,  0.25

In [212]:
# Load the parameters to our model
model.load_state_dict(checkpoint['model_state_dict']) 

<All keys matched successfully>

In [213]:
# Load the optimizer
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [214]:
# Load the loss and the epoch. NOTE that we could have save other information here as well
loss = checkpoint['loss']
epoch = checkpoint['epoch']
print(loss, epoch)

0.05 5


In [215]:
# Test inference
model.eval()
output = model(sample_input)
print(output)

tensor([[0.0808]], grad_fn=<AddmmBackward0>)


# Adding Checkpoints to Training
Its good practice to include checkpoints as part of your training loop.

How you save checkpoints is up to you. ie: every so often, every epoch, every epoch which improves on loss.

In [216]:
# Lets include a checkpoint in our training loop that saves a checkpoint every 2 epochs
# Train a fake model
N_EPOCHS = 10

for epoch in range(N_EPOCHS):
    running_loss = 0.0
    for i, (inputs, targets) in enumerate(data_loader):
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        
        # Accumulate loss
        running_loss += loss.item()

    ######### Save a checkpoint every 2 epochs
    if epoch % 2 == 0:
        torch.save({'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss}, 
                f'training_checkpoint_{epoch}.tar')

# Save the final checkpoint after the last epoch
torch.save({
    'epoch': N_EPOCHS,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss
}, 'training_checkpoint_final.tar')

In [217]:
# List all the checkpoints
!ls -l training_checkpoint*

-rw-r--r--@ 1 JABERI  staff  7717 Nov 12 22:27 training_checkpoint_0.tar
-rw-r--r--@ 1 JABERI  staff  7717 Nov 12 22:27 training_checkpoint_2.tar
-rw-r--r--@ 1 JABERI  staff  7717 Nov 12 22:27 training_checkpoint_4.tar
-rw-r--r--@ 1 JABERI  staff  7717 Nov 12 22:27 training_checkpoint_6.tar
-rw-r--r--@ 1 JABERI  staff  7717 Nov 12 22:27 training_checkpoint_8.tar
-rw-r--r--@ 1 JABERI  staff  7845 Nov 12 22:27 training_checkpoint_final.tar


NOTE: We can now load any of these checkpoints to either continue training from that point in time or run inference. 

# Warmstarting
Warmstarting is where we initialize a new model to train from trained parameters of a previously trained model.

This is helpful in Transfer Learning which is covered in more detail later.

With warmstarting we can also initialize only certain layers of a previously trained model.

In [218]:
# Example Fake Model
import torch.nn as nn
import torch.nn.functional as F

class FakeNet(nn.Module):
    def __init__(self):
        super(FakeNet, self).__init__()
        self.fc1 = nn.Linear(10, 50)
        self.batch_norm = nn.BatchNorm1d(50) 
        self.fc2 = nn.Linear(50, 1)        

    def forward(self, x):
        x = F.relu(self.fc1(x))              
        x = self.batch_norm(x)               
        x = self.fc2(x)                      
        return x

In [219]:
# create new model
new_model = FakeNet()
new_model

FakeNet(
  (fc1): Linear(in_features=10, out_features=50, bias=True)
  (batch_norm): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=50, out_features=1, bias=True)
)

In [220]:
# Show the parameters
new_model.state_dict()

OrderedDict([('fc1.weight',
              tensor([[-0.1815, -0.2359,  0.1409,  0.2466,  0.3120,  0.0500,  0.2572,  0.2403,
                       -0.1810, -0.1478],
                      [-0.0199,  0.0262,  0.2856, -0.1054, -0.2220, -0.2707,  0.3002, -0.3129,
                        0.0289, -0.1051],
                      [ 0.0826,  0.2328, -0.1066,  0.0225, -0.2014, -0.1936,  0.1761,  0.1122,
                        0.2779, -0.1517],
                      [-0.0328,  0.1457,  0.2343, -0.3087, -0.2595,  0.0876,  0.0765,  0.1837,
                        0.2121, -0.0769],
                      [-0.1719,  0.2916,  0.1315,  0.2228, -0.0696,  0.2572, -0.2160, -0.0908,
                        0.1333,  0.1842],
                      [ 0.0148,  0.2191,  0.2171,  0.2507,  0.1419, -0.1592,  0.3144, -0.0986,
                        0.0805, -0.1562],
                      [ 0.3069,  0.2559, -0.0986,  0.0869, -0.2676, -0.1157, -0.0418, -0.2473,
                       -0.3012, -0.1968],
             

In [221]:
# Load our very first trained model parameters into the new one
new_model.load_state_dict(torch.load('model_state_dict.pt'), strict=False)

<All keys matched successfully>

In [222]:
# Show the new parameters
print(new_model.state_dict())

OrderedDict({'fc1.weight': tensor([[-0.2281, -0.0470,  0.2462, -0.2436, -0.0946, -0.3025,  0.1570,  0.0669,
          0.1495,  0.3131],
        [ 0.1679, -0.3009,  0.3013,  0.1753,  0.0325, -0.1105, -0.2211, -0.2457,
         -0.1122,  0.2106],
        [-0.1523, -0.2123, -0.1113, -0.2280, -0.3060, -0.0035, -0.0418,  0.1317,
         -0.2353, -0.0075],
        [ 0.0966, -0.2417, -0.1267, -0.1652,  0.3085, -0.2116,  0.2211,  0.0841,
         -0.1746, -0.1794],
        [ 0.2723,  0.1501,  0.1777, -0.0608,  0.0209, -0.0566, -0.1187,  0.1044,
          0.0284, -0.1925],
        [ 0.2183, -0.0458, -0.0868,  0.2422,  0.2899,  0.0156,  0.2748,  0.1606,
          0.1072,  0.2375],
        [ 0.0581, -0.2766, -0.0214, -0.0603, -0.0570,  0.2813, -0.0172, -0.0519,
          0.0022,  0.0754],
        [ 0.0725,  0.0626,  0.1116,  0.0689, -0.0994, -0.1217, -0.2950, -0.0031,
          0.2694,  0.2645],
        [ 0.2746, -0.0389,  0.2735,  0.2441, -0.0763, -0.3111, -0.0373,  0.0636,
          0.0860, -0

We would now take the parameters we just added into this model and train it!

# Saving and Loading Across Devices
PyTorch supports multiple different devices such as CPU and GPUs.

Its common practice to train on a GPU for speed but do inference on a CPU for cost for example.

In [223]:
# Load a model on CPU that was saved on GPU
import torch

model = torch.load('model_state_dict.pt', map_location='cpu', weights_only=True) # Using map_location

In [224]:
import torch

# Detect the best available device
if torch.cuda.is_available():
    device = torch.device("cuda")   # NVIDIA GPU
    device_name = torch.cuda.get_device_name(0)
elif torch.backends.mps.is_available():
    device = torch.device("mps")    # Apple Silicon GPU
    device_name = "Apple Metal (MPS)"
else:
    device = torch.device("cpu")    # CPU fallback
    device_name = "CPU"

print(f"✅ Using device: {device} — {device_name}")

✅ Using device: mps — Apple Metal (MPS)


In [225]:
state_dict = torch.load('model_state_dict.pt', map_location=device, weights_only=True )

In addition to above we must also put the model on the GPU:

```py
model.to('cuda')
```

As well as the inputs for inference.
```py
model.eval()
outputs = model(sample_input.to('cuda'))
```

In [226]:
import torch

# Auto-select the best available device (CUDA → MPS → CPU)
if torch.cuda.is_available():
    device = 'cuda'          # NVIDIA GPU
elif torch.backends.mps.is_available():
    device = 'mps'           # Apple GPU (Metal)
else:
    device = 'cpu'           # CPU fallback

print(device)

mps
