In [1]:
import torch
import numpy
import torchvision

# MODELS AND PRE-TRAINED WEIGHTS

## Initializing pre-trained models

In [2]:
# As of v0.13, TorchVision offers a new Multi-weight support API for loading different weights
# to the existing model builder methods.
#
# Every call below builds an independent ResNet-50. None of the results is bound to a name,
# so only the value of the final expression is displayed as the cell output.

from torchvision.models import resnet50, ResNet50_Weights

# Old weights with accuracy 76.130%
resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)

# New weights with accuracy 80.858%
resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)

# Best available weights (currently alias for IMAGENET1K_V2)
# Note that these weights may change across versions
resnet50(weights=ResNet50_Weights.DEFAULT)

# Strings are also supported (the string is the name of the enum member)
resnet50(weights="IMAGENET1K_V2")

# No weights - random initialization
resnet50(weights=None)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [3]:
# Migrating to the new API is very straightforward. Within each group below, the calls to the
# two APIs are all equivalent.
# `resnet50` and `ResNet50_Weights` are the names imported in the previous cell.

# Using pretrained weights:
resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
resnet50(weights="IMAGENET1K_V1")
resnet50(pretrained=True)  # deprecated (legacy keyword)
resnet50(True)  # deprecated (legacy flag passed positionally)

# Using no weights:
resnet50(weights=None)
resnet50()
resnet50(pretrained=False)  # deprecated (legacy keyword)
resnet50(False)  # deprecated (legacy flag passed positionally)

# Note that the `pretrained` parameter is now deprecated: using it emits a warning,
# and the parameter is scheduled for removal in v0.15.



ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

## Using the pre-trained models

In [4]:
# Before using the pre-trained models, one must preprocess the image (resize with right resolution/interpolation,
# apply inference transforms, rescale the values etc). There is no standard way to do this as it depends on how a given model
# was trained. It can vary across model families, variants or even weight versions. Using the correct preprocessing method
# is critical and failing to do so may lead to decreased accuracy or incorrect outputs.

# All the necessary information for the inference transforms of each pre-trained model is provided on its weights documentation.
# To simplify inference, TorchVision bundles the necessary preprocessing transforms into each model weight.
# They are obtained by calling the weights' transforms() method:

# Initialize the Weight Transforms
weights = ResNet50_Weights.DEFAULT
preprocess = weights.transforms()

from PIL import Image
# Open the example image with PIL and convert it to RGB mode.
img = Image.open("img/example1.jpg").convert('RGB')

# Apply it to the input image
img_transformed = preprocess(img)

In [5]:
# Some models use modules which have different training and evaluation behavior, such as batch normalization.
# To switch between these modes, use model.train() or model.eval() as appropriate. See train() or eval() for details.

# Initialize model
weights = ResNet50_Weights.DEFAULT
model = resnet50(weights=weights)

# Set model to eval mode.
# This call is the last expression of the cell, so its return value (the model's repr)
# is what the notebook displays as the cell output.
model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

## Listing and retrieving available models

In [6]:
# Import only the names the next cell needs instead of star-importing both packages.
# `torchvision.models.quantization` exposes builders with the same names as the float ones
# (e.g. `resnet50`, `mobilenet_v3_large`), so a second `import *` would silently rebind those
# names to the quantized builders for every later cell.
from torchvision.models import get_model, get_model_weights, get_weight, list_models
from torchvision.models.quantization import MobileNet_V3_Large_QuantizedWeights

In [7]:
# As of v0.14, TorchVision offers a new mechanism which allows listing and retrieving models and weights by their names.
# Here are a few examples on how to use them.
# The helpers and the weights enum used here are the names imported in the previous cell.

# List available models
all_models = list_models()
# Passing `module=` restricts the listing to the given module.
classification_models = list_models(module=torchvision.models)

# Initialize models
m1 = get_model("mobilenet_v3_large", weights=None)
m2 = get_model("quantized_mobilenet_v3_large", weights="DEFAULT")

# Fetch weights
# A single enum member is looked up by its qualified "<EnumClass>.<MEMBER>" name.
weights = get_weight("MobileNet_V3_Large_QuantizedWeights.DEFAULT")
assert weights == MobileNet_V3_Large_QuantizedWeights.DEFAULT

# The whole weights enum of a builder can be looked up by the builder's registered name ...
weights_enum = get_model_weights("quantized_mobilenet_v3_large")
assert weights_enum == MobileNet_V3_Large_QuantizedWeights

# ... or by the builder function itself; both lookups must give the same enum.
weights_enum2 = get_model_weights(torchvision.models.quantization.mobilenet_v3_large)
assert weights_enum == weights_enum2

In [8]:
all_models

['alexnet',
 'convnext_base',
 'convnext_large',
 'convnext_small',
 'convnext_tiny',
 'deeplabv3_mobilenet_v3_large',
 'deeplabv3_resnet101',
 'deeplabv3_resnet50',
 'densenet121',
 'densenet161',
 'densenet169',
 'densenet201',
 'efficientnet_b0',
 'efficientnet_b1',
 'efficientnet_b2',
 'efficientnet_b3',
 'efficientnet_b4',
 'efficientnet_b5',
 'efficientnet_b6',
 'efficientnet_b7',
 'efficientnet_v2_l',
 'efficientnet_v2_m',
 'efficientnet_v2_s',
 'fasterrcnn_mobilenet_v3_large_320_fpn',
 'fasterrcnn_mobilenet_v3_large_fpn',
 'fasterrcnn_resnet50_fpn',
 'fasterrcnn_resnet50_fpn_v2',
 'fcn_resnet101',
 'fcn_resnet50',
 'fcos_resnet50_fpn',
 'googlenet',
 'inception_v3',
 'keypointrcnn_resnet50_fpn',
 'lraspp_mobilenet_v3_large',
 'maskrcnn_resnet50_fpn',
 'maskrcnn_resnet50_fpn_v2',
 'maxvit_t',
 'mc3_18',
 'mnasnet0_5',
 'mnasnet0_75',
 'mnasnet1_0',
 'mnasnet1_3',
 'mobilenet_v2',
 'mobilenet_v3_large',
 'mobilenet_v3_small',
 'mvit_v1_b',
 'mvit_v2_s',
 'quantized_googlenet',
 '

In [9]:
classification_models

['alexnet',
 'convnext_base',
 'convnext_large',
 'convnext_small',
 'convnext_tiny',
 'densenet121',
 'densenet161',
 'densenet169',
 'densenet201',
 'efficientnet_b0',
 'efficientnet_b1',
 'efficientnet_b2',
 'efficientnet_b3',
 'efficientnet_b4',
 'efficientnet_b5',
 'efficientnet_b6',
 'efficientnet_b7',
 'efficientnet_v2_l',
 'efficientnet_v2_m',
 'efficientnet_v2_s',
 'googlenet',
 'inception_v3',
 'maxvit_t',
 'mnasnet0_5',
 'mnasnet0_75',
 'mnasnet1_0',
 'mnasnet1_3',
 'mobilenet_v2',
 'mobilenet_v3_large',
 'mobilenet_v3_small',
 'regnet_x_16gf',
 'regnet_x_1_6gf',
 'regnet_x_32gf',
 'regnet_x_3_2gf',
 'regnet_x_400mf',
 'regnet_x_800mf',
 'regnet_x_8gf',
 'regnet_y_128gf',
 'regnet_y_16gf',
 'regnet_y_1_6gf',
 'regnet_y_32gf',
 'regnet_y_3_2gf',
 'regnet_y_400mf',
 'regnet_y_800mf',
 'regnet_y_8gf',
 'resnet101',
 'resnet152',
 'resnet18',
 'resnet34',
 'resnet50',
 'resnext101_32x8d',
 'resnext101_64x4d',
 'resnext50_32x4d',
 'shufflenet_v2_x0_5',
 'shufflenet_v2_x1_0',
 'sh

## Using models from Hub

In [10]:
# Most pre-trained models can be accessed directly via PyTorch Hub without having TorchVision installed:

#import torch

# Option 1: passing weights param as string
model = torch.hub.load("pytorch/vision", "resnet50", weights="IMAGENET1K_V2")

# Option 2: passing weights param as enum
# The `get_weight` entry point takes the qualified member name through its `name` keyword.
# Passing it as `weights=` fails with
# "TypeError: get_weight() got an unexpected keyword argument 'weights'".
# A dedicated variable name is used so the notebook-wide `weights` binding is not clobbered.
hub_weights = torch.hub.load("pytorch/vision", "get_weight", name="ResNet50_Weights.IMAGENET1K_V2")
model = torch.hub.load("pytorch/vision", "resnet50", weights=hub_weights)

Using cache found in C:\Users\gribo/.cache\torch\hub\pytorch_vision_main


In [11]:
# You can also retrieve all the available weights of a specific model via PyTorch Hub by doing:

# `torch` is already imported in the first cell, hence the commented-out import.
#import torch

weight_enum = torch.hub.load("pytorch/vision", "get_model_weights", name="resnet50")
# Iterating over the returned enum yields its members, one per available set of weights.
print([weight for weight in weight_enum])

# The only exception to the above are the detection models included on torchvision.models.detection.
# These models require TorchVision to be installed because they depend on custom C++ operators.

[ResNet50_Weights.IMAGENET1K_V1, ResNet50_Weights.IMAGENET1K_V2]


Using cache found in C:\Users\gribo/.cache\torch\hub\pytorch_vision_main


## Classification

In [12]:
# Here is an example of how to use the pre-trained image classification models:

from torchvision.io import read_image
# Re-importing from torchvision.models makes `resnet50` refer to the float (non-quantized) builder in this cell.
from torchvision.models import resnet50, ResNet50_Weights

img = read_image("img/example1.jpg")

# Step 1: Initialize model with the best available weights
weights = ResNet50_Weights.DEFAULT
model = resnet50(weights=weights)
model.eval()

# Step 2: Initialize the inference transforms
preprocess = weights.transforms()

# Step 3: Apply inference preprocessing transforms
# unsqueeze(0) adds a leading dimension so the single image forms a batch of one.
batch = preprocess(img).unsqueeze(0)

# Step 4: Use the model and print the predicted category
# squeeze(0) drops the batch dimension again; softmax(0) turns the per-class scores into probabilities.
prediction = model(batch).squeeze(0).softmax(0)
class_id = prediction.argmax().item()
score = prediction[class_id].item()
# The predicted index is mapped to a readable name through the weights' metadata.
category_name = weights.meta["categories"][class_id]
print(f"{category_name}: {100 * score:.1f}%")

African grey: 37.6%




In [13]:
# The classes of the pre-trained model outputs can be found at weights.meta["categories"].

weights.meta["categories"]

['tench',
 'goldfish',
 'great white shark',
 'tiger shark',
 'hammerhead',
 'electric ray',
 'stingray',
 'cock',
 'hen',
 'ostrich',
 'brambling',
 'goldfinch',
 'house finch',
 'junco',
 'indigo bunting',
 'robin',
 'bulbul',
 'jay',
 'magpie',
 'chickadee',
 'water ouzel',
 'kite',
 'bald eagle',
 'vulture',
 'great grey owl',
 'European fire salamander',
 'common newt',
 'eft',
 'spotted salamander',
 'axolotl',
 'bullfrog',
 'tree frog',
 'tailed frog',
 'loggerhead',
 'leatherback turtle',
 'mud turtle',
 'terrapin',
 'box turtle',
 'banded gecko',
 'common iguana',
 'American chameleon',
 'whiptail',
 'agama',
 'frilled lizard',
 'alligator lizard',
 'Gila monster',
 'green lizard',
 'African chameleon',
 'Komodo dragon',
 'African crocodile',
 'American alligator',
 'triceratops',
 'thunder snake',
 'ringneck snake',
 'hognose snake',
 'green snake',
 'king snake',
 'garter snake',
 'water snake',
 'vine snake',
 'night snake',
 'boa constrictor',
 'rock python',
 'Indian cobr

## Quantized models

In [14]:
# Here is an example of how to use the pre-trained quantized image classification models:

from torchvision.io import read_image
# NOTE: this import rebinds the notebook-wide name `resnet50` to the quantized builder.
# Later cells that need the float builder must import it from torchvision.models again.
from torchvision.models.quantization import resnet50, ResNet50_QuantizedWeights

img = read_image("img/example1.jpg")

# Step 1: Initialize model with the best available weights
weights = ResNet50_QuantizedWeights.DEFAULT
model = resnet50(weights=weights, quantize=True)
model.eval()

# Step 2: Initialize the inference transforms
preprocess = weights.transforms()

# Step 3: Apply inference preprocessing transforms
# unsqueeze(0) adds a leading dimension so the single image forms a batch of one.
batch = preprocess(img).unsqueeze(0)

# Step 4: Use the model and print the predicted category
# squeeze(0) drops the batch dimension again; softmax(0) turns the per-class scores into probabilities.
prediction = model(batch).squeeze(0).softmax(0)
class_id = prediction.argmax().item()
score = prediction[class_id].item()
category_name = weights.meta["categories"][class_id]
# The percentage is printed without rounding.
print(f"{category_name}: {100 * score}%")

  device=storage.device,


African grey: 41.742271184921265%


In [15]:
# The classes of the pre-trained model outputs can be found at weights.meta["categories"].

weights.meta["categories"]

['tench',
 'goldfish',
 'great white shark',
 'tiger shark',
 'hammerhead',
 'electric ray',
 'stingray',
 'cock',
 'hen',
 'ostrich',
 'brambling',
 'goldfinch',
 'house finch',
 'junco',
 'indigo bunting',
 'robin',
 'bulbul',
 'jay',
 'magpie',
 'chickadee',
 'water ouzel',
 'kite',
 'bald eagle',
 'vulture',
 'great grey owl',
 'European fire salamander',
 'common newt',
 'eft',
 'spotted salamander',
 'axolotl',
 'bullfrog',
 'tree frog',
 'tailed frog',
 'loggerhead',
 'leatherback turtle',
 'mud turtle',
 'terrapin',
 'box turtle',
 'banded gecko',
 'common iguana',
 'American chameleon',
 'whiptail',
 'agama',
 'frilled lizard',
 'alligator lizard',
 'Gila monster',
 'green lizard',
 'African chameleon',
 'Komodo dragon',
 'African crocodile',
 'American alligator',
 'triceratops',
 'thunder snake',
 'ringneck snake',
 'hognose snake',
 'green snake',
 'king snake',
 'garter snake',
 'water snake',
 'vine snake',
 'night snake',
 'boa constrictor',
 'rock python',
 'Indian cobr

## Semantic Segmentation

In [16]:
# Here is an example of how to use the pre-trained semantic segmentation models:

from torchvision.io.image import read_image
from torchvision.models.segmentation import fcn_resnet50, FCN_ResNet50_Weights
from torchvision.transforms.functional import to_pil_image

img = read_image("img/example2.jpg")

# Step 1: Initialize model with the best available weights
weights = FCN_ResNet50_Weights.DEFAULT
model = fcn_resnet50(weights=weights)
model.eval()

# Step 2: Initialize the inference transforms
preprocess = weights.transforms()

# Step 3: Apply inference preprocessing transforms
# unsqueeze(0) adds a leading dimension so the single image forms a batch of one.
batch = preprocess(img).unsqueeze(0)

# Step 4: Use the model and visualize the prediction
# The model output is indexed by key; "out" holds the per-class score maps.
prediction = model(batch)["out"]
# Softmax over dim 1, the dimension that is indexed by class id below.
normalized_masks = prediction.softmax(dim=1)
# Reverse lookup: category name -> index in the weights' category list.
class_to_idx = {cls: idx for (idx, cls) in enumerate(weights.meta["categories"])}
# Mask of the first (only) batch element for the "dog" category.
mask = normalized_masks[0, class_to_idx["dog"]]
# The mask is converted to a PIL image twice: once to display it, once to save it to disk.
to_pil_image(mask).show()
to_pil_image(mask).save('SemanticSegmentationExample.png')

In [17]:
# The classes of the pre-trained model outputs can be found at weights.meta["categories"]. 
# The output format of the models is illustrated in Semantic segmentation models.

weights.meta["categories"]

['__background__',
 'aeroplane',
 'bicycle',
 'bird',
 'boat',
 'bottle',
 'bus',
 'car',
 'cat',
 'chair',
 'cow',
 'diningtable',
 'dog',
 'horse',
 'motorbike',
 'person',
 'pottedplant',
 'sheep',
 'sofa',
 'train',
 'tvmonitor']

## Object Detection, Instance Segmentation and Person Keypoint Detection

In [18]:
# The pre-trained models for detection, instance segmentation and keypoint detection are initialized
# with the classification models in torchvision. The models expect a list of Tensor[C, H, W].
# Check the constructor of the models for more information.

# Here is an example of how to use the pre-trained object detection models:

from torchvision.io.image import read_image
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2, FasterRCNN_ResNet50_FPN_V2_Weights
from torchvision.utils import draw_bounding_boxes
from torchvision.transforms.functional import to_pil_image

img = read_image("img/example1.jpg")

# Step 1: Initialize model with the best available weights
weights = FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT
# box_score_thresh is forwarded to the builder; presumably it filters out detections scoring below 0.9 (confirm in the API docs).
model = fasterrcnn_resnet50_fpn_v2(weights=weights, box_score_thresh=0.9)
model.eval()

# Step 2: Initialize the inference transforms
preprocess = weights.transforms()

# Step 3: Apply inference preprocessing transforms
# Detection models take a list of image tensors rather than a stacked batch tensor.
batch = [preprocess(img)]

# Step 4: Use the model and visualize the prediction
# One result per input image; [0] selects the result for the only image.
prediction = model(batch)[0]
# Translate predicted label indices into category names.
labels = [weights.meta["categories"][i] for i in prediction["labels"]]
# The boxes are drawn on the original (un-preprocessed) image tensor.
box = draw_bounding_boxes(img, boxes=prediction["boxes"],
                          labels=labels,
                          colors="red",
                          width=4, font_size=30)
im = to_pil_image(box.detach())
im.show()
im.save('ObjectDetectionExample.png')



In [19]:
# The classes of the pre-trained model outputs can be found at weights.meta["categories"]. 
# For details on how to plot the bounding boxes of the models, you may refer to Instance segmentation models.

weights.meta["categories"]

['__background__',
 'person',
 'bicycle',
 'car',
 'motorcycle',
 'airplane',
 'bus',
 'train',
 'truck',
 'boat',
 'traffic light',
 'fire hydrant',
 'N/A',
 'stop sign',
 'parking meter',
 'bench',
 'bird',
 'cat',
 'dog',
 'horse',
 'sheep',
 'cow',
 'elephant',
 'bear',
 'zebra',
 'giraffe',
 'N/A',
 'backpack',
 'umbrella',
 'N/A',
 'N/A',
 'handbag',
 'tie',
 'suitcase',
 'frisbee',
 'skis',
 'snowboard',
 'sports ball',
 'kite',
 'baseball bat',
 'baseball glove',
 'skateboard',
 'surfboard',
 'tennis racket',
 'bottle',
 'N/A',
 'wine glass',
 'cup',
 'fork',
 'knife',
 'spoon',
 'bowl',
 'banana',
 'apple',
 'sandwich',
 'orange',
 'broccoli',
 'carrot',
 'hot dog',
 'pizza',
 'donut',
 'cake',
 'chair',
 'couch',
 'potted plant',
 'bed',
 'N/A',
 'dining table',
 'N/A',
 'N/A',
 'toilet',
 'N/A',
 'tv',
 'laptop',
 'mouse',
 'remote',
 'keyboard',
 'cell phone',
 'microwave',
 'oven',
 'toaster',
 'sink',
 'refrigerator',
 'N/A',
 'book',
 'clock',
 'vase',
 'scissors',
 'ted

## Video Classification

In [20]:
# Here is an example of how to use the pre-trained video classification models:

from torchvision.io.video import read_video
from torchvision.models.video import r3d_18, R3D_18_Weights

# read_video returns three values; only the first (the frames) is used here.
# The requested "TCHW" output format puts the time dimension first.
vid, _, _ = read_video("video.mov", output_format="TCHW")
vid = vid[:32]  # optionally shorten duration (keeps at most the first 32 frames)

# Step 1: Initialize model with the best available weights
weights = R3D_18_Weights.DEFAULT
model = r3d_18(weights=weights)
model.eval()

# Step 2: Initialize the inference transforms
preprocess = weights.transforms()

# Step 3: Apply inference preprocessing transforms
# unsqueeze(0) adds a leading dimension so the single clip forms a batch of one.
batch = preprocess(vid).unsqueeze(0)

# Step 4: Use the model and print the predicted category
# squeeze(0) drops the batch dimension again; softmax(0) turns the per-class scores into probabilities.
prediction = model(batch).squeeze(0).softmax(0)
label = prediction.argmax().item()
score = prediction[label].item()
category_name = weights.meta["categories"][label]
# The percentage is printed without rounding.
print(f"{category_name}: {100 * score}%")



tai chi: 23.7325057387352%


In [21]:
# The classes of the pre-trained model outputs can be found at weights.meta["categories"].

weights.meta["categories"]

['abseiling',
 'air drumming',
 'answering questions',
 'applauding',
 'applying cream',
 'archery',
 'arm wrestling',
 'arranging flowers',
 'assembling computer',
 'auctioning',
 'baby waking up',
 'baking cookies',
 'balloon blowing',
 'bandaging',
 'barbequing',
 'bartending',
 'beatboxing',
 'bee keeping',
 'belly dancing',
 'bench pressing',
 'bending back',
 'bending metal',
 'biking through snow',
 'blasting sand',
 'blowing glass',
 'blowing leaves',
 'blowing nose',
 'blowing out candles',
 'bobsledding',
 'bookbinding',
 'bouncing on trampoline',
 'bowling',
 'braiding hair',
 'breading or breadcrumbing',
 'breakdancing',
 'brush painting',
 'brushing hair',
 'brushing teeth',
 'building cabinet',
 'building shed',
 'bungee jumping',
 'busking',
 'canoeing or kayaking',
 'capoeira',
 'carrying baby',
 'cartwheeling',
 'carving pumpkin',
 'catching fish',
 'catching or throwing baseball',
 'catching or throwing frisbee',
 'catching or throwing softball',
 'celebrating',
 'cha

# TORCH.JIT.TRACE

In [22]:
# Example (tracing a function):

#import torch

def foo(x, y):
    """Return ``2 * x + y``.

    Deliberately tiny: it exists only so that ``torch.jit.trace`` has a couple of
    operations (one multiplication, one addition) to record.
    """
    doubled = 2 * x
    return doubled + y

# Run `foo` with the provided inputs and record the tensor operations.
# The example inputs are two random 1-D tensors of length 3, passed together as a tuple.
traced_foo = torch.jit.trace(foo, (torch.rand(3), torch.rand(3)))

# `traced_foo` can now be run with the TorchScript interpreter or saved
# and loaded in a Python-free environment

In [23]:
# Example (tracing an existing module):

import torch
import torch.nn as nn

class Net(nn.Module):
    """Minimal module wrapping a single 3x3 convolution (1 input channel, 1 output channel)."""

    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=3)

    def forward(self, x):
        """Apply the convolution to ``x`` and return the result."""
        convolved = self.conv(x)
        return convolved

n = Net()
# NOTE(review): `example_weight` is not used by any code in this cell.
example_weight = torch.rand(1, 1, 3, 3)
example_forward_input = torch.rand(1, 1, 3, 3)

# Trace a specific method and construct `ScriptModule` with
# a single `forward` method
module = torch.jit.trace(n.forward, example_forward_input)

# Trace a module (implicitly traces `forward`) and construct a
# `ScriptModule` with a single `forward` method.
# This rebinds `module`, so the result of the first trace above is discarded.
module = torch.jit.trace(n, example_forward_input)

In [24]:
module

Net(
  original_name=Net
  (conv): Conv2d(original_name=Conv2d)
)

# INTRODUCTION TO TORCHSCRIPT

In [25]:
import torch  # This is all you need to use both PyTorch and TorchScript!
print(torch.__version__)
# manual_seed returns the generator object, which is what the notebook shows as the cell output.
torch.manual_seed(191009)  # set the seed for reproducibility

2.0.1+cu117


<torch._C.Generator at 0x2020f564890>

## Basics of PyTorch Model Authoring

In [26]:
# Let’s start out by defining a simple Module. A Module is the basic unit of composition in PyTorch. It contains:
# 1. A constructor, which prepares the module for invocation
# 2. A set of Parameters and sub-Modules. These are initialized by the constructor and can be used by the module 
# during invocation.
# 3. A forward function. This is the code that is run when the module is invoked.

# Let’s examine a small example:

class MyCell(torch.nn.Module):
    """Parameter-free toy recurrent cell: the new state is ``tanh(x + h)``."""

    def __init__(self):
        super().__init__()

    def forward(self, x, h):
        """Combine input ``x`` with state ``h`` and return the new state twice (output, state)."""
        combined = x + h
        updated = torch.tanh(combined)
        return updated, updated

my_cell = MyCell()
# x is the input and h the initial hidden state; both are 3x4 tensors of random values.
x = torch.rand(3, 4)
h = torch.rand(3, 4)
print(my_cell(x, h))

# So we’ve:
# 1. Created a class that subclasses torch.nn.Module.
# 2. Defined a constructor. The constructor doesn’t do much; it just calls the constructor of the superclass.
# 3. Defined a forward function, which takes two inputs and returns two outputs. The actual contents of the forward function
# are not really important, but it’s sort of a fake RNN cell – that is, a function that is applied in a loop.
# We instantiated the module, and made x and h, which are just 3x4 matrices of random values.
# Then we invoked the cell with my_cell(x, h). This in turn calls our forward function.

(tensor([[0.8219, 0.8990, 0.6670, 0.8277],
        [0.5176, 0.4017, 0.8545, 0.7336],
        [0.6013, 0.6992, 0.2618, 0.6668]]), tensor([[0.8219, 0.8990, 0.6670, 0.8277],
        [0.5176, 0.4017, 0.8545, 0.7336],
        [0.6013, 0.6992, 0.2618, 0.6668]]))


In [27]:
# Let’s do something a little more interesting:

class MyCell(torch.nn.Module):
    """Toy recurrent cell with one learnable layer: new state is ``tanh(linear(x) + h)``."""

    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(in_features=4, out_features=4)

    def forward(self, x, h):
        """Project ``x``, add the state ``h``, squash with tanh; return the result twice."""
        projected = self.linear(x)
        state = torch.tanh(projected + h)
        return state, state

my_cell = MyCell()
print(my_cell)
# x and h are the tensors created in the previous cell.
print(my_cell(x, h))

# We’ve redefined our module MyCell, but this time we’ve added a self.linear attribute, and we invoke self.linear
# in the forward function.
# What exactly is happening here? torch.nn.Linear is a Module from the PyTorch standard library.
# Just like MyCell, it can be invoked using the call syntax. We are building a hierarchy of Modules.
# print on a Module will give a visual representation of the Module’s submodule hierarchy.
# In our example, we can see our Linear submodule and its parameters.
# By composing Modules in this way, we can succinctly and readably author models with reusable components.

MyCell(
  (linear): Linear(in_features=4, out_features=4, bias=True)
)
(tensor([[ 0.8573,  0.6190,  0.5774,  0.7869],
        [ 0.3326,  0.0530,  0.0702,  0.8114],
        [ 0.7818, -0.0506,  0.4039,  0.7967]], grad_fn=<TanhBackward0>), tensor([[ 0.8573,  0.6190,  0.5774,  0.7869],
        [ 0.3326,  0.0530,  0.0702,  0.8114],
        [ 0.7818, -0.0506,  0.4039,  0.7967]], grad_fn=<TanhBackward0>))


In [28]:
# Now let’s examine said flexibility:

class MyDecisionGate(torch.nn.Module):
    """Data-dependent gate: passes ``x`` through when its sum is positive, otherwise negates it."""

    def forward(self, x):
        """Return ``x`` itself if ``x.sum() > 0``, else ``-x``."""
        return x if x.sum() > 0 else -x

class MyCell(torch.nn.Module):
    """Toy recurrent cell that routes the linear projection through a decision gate."""

    def __init__(self):
        super().__init__()
        # Registration order (gate first, then linear) is the order shown when the module is printed.
        self.dg = MyDecisionGate()
        self.linear = torch.nn.Linear(in_features=4, out_features=4)

    def forward(self, x, h):
        """Return ``tanh(dg(linear(x)) + h)`` twice, as an (output, state) pair."""
        gated = self.dg(self.linear(x))
        state = torch.tanh(gated + h)
        return state, state

my_cell = MyCell()
print(my_cell)
# x and h are the tensors created in an earlier cell.
print(my_cell(x, h))

# We’ve once again redefined our MyCell class, but here we’ve also defined MyDecisionGate.
# This module utilizes control flow. Control flow consists of things like loops and if-statements.

MyCell(
  (dg): MyDecisionGate()
  (linear): Linear(in_features=4, out_features=4, bias=True)
)
(tensor([[ 0.8346,  0.5931,  0.2097,  0.8232],
        [ 0.2340, -0.1254,  0.2679,  0.8064],
        [ 0.6231,  0.1494, -0.3110,  0.7865]], grad_fn=<TanhBackward0>), tensor([[ 0.8346,  0.5931,  0.2097,  0.8232],
        [ 0.2340, -0.1254,  0.2679,  0.8064],
        [ 0.6231,  0.1494, -0.3110,  0.7865]], grad_fn=<TanhBackward0>))


## Basics of TorchScript

In [29]:
# Now let’s take our running example and see how we can apply TorchScript.

# In short, TorchScript provides tools to capture the definition of your model, even in light of the flexible 
# and dynamic nature of PyTorch. Let’s begin by examining what we call tracing.

# Tracing Modules

class MyCell(torch.nn.Module):
    """Gate-free recurrent cell used for the tracing example: ``tanh(linear(x) + h)``."""

    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(4, 4)

    def forward(self, x, h):
        """Compute the new hidden state and return it twice (output, state)."""
        pre_activation = self.linear(x) + h
        next_state = torch.tanh(pre_activation)
        return next_state, next_state

my_cell = MyCell()
# Fresh random 3x4 example inputs; they rebind the notebook-wide x and h.
x, h = torch.rand(3, 4), torch.rand(3, 4)
traced_cell = torch.jit.trace(my_cell, (x, h))
print(traced_cell)
traced_cell(x, h)

# We’ve rewound a bit and taken the second version of our MyCell class. As before, we’ve instantiated it, but this time,
# we’ve called torch.jit.trace, passed in the Module, and passed in example inputs the network might see.
# What exactly has this done? It has invoked the Module, recorded the operations that occurred when the Module was run,
# and created an instance of torch.jit.ScriptModule (of which TracedModule is an instance).

MyCell(
  original_name=MyCell
  (linear): Linear(original_name=Linear)
)


(tensor([[-0.2541,  0.2460,  0.2297,  0.1014],
         [-0.2329, -0.2911,  0.5641,  0.5015],
         [ 0.1688,  0.2252,  0.7251,  0.2530]], grad_fn=<TanhBackward0>),
 tensor([[-0.2541,  0.2460,  0.2297,  0.1014],
         [-0.2329, -0.2911,  0.5641,  0.5015],
         [ 0.1688,  0.2252,  0.7251,  0.2530]], grad_fn=<TanhBackward0>))

In [30]:
# TorchScript records its definitions in an Intermediate Representation (or IR), commonly referred to in Deep learning 
# as a graph. We can examine the graph with the .graph property:

print(traced_cell.graph)

graph(%self.1 : __torch__.MyCell,
      %x : Float(3, 4, strides=[4, 1], requires_grad=0, device=cpu),
      %h : Float(3, 4, strides=[4, 1], requires_grad=0, device=cpu)):
  %linear : __torch__.torch.nn.modules.linear.Linear = prim::GetAttr[name="linear"](%self.1)
  %20 : Tensor = prim::CallMethod[name="forward"](%linear, %x)
  %11 : int = prim::Constant[value=1]() # C:\Users\gribo\AppData\Local\Temp\ipykernel_11600\19045061.py:14:0
  %12 : Float(3, 4, strides=[4, 1], requires_grad=1, device=cpu) = aten::add(%20, %h, %11) # C:\Users\gribo\AppData\Local\Temp\ipykernel_11600\19045061.py:14:0
  %13 : Float(3, 4, strides=[4, 1], requires_grad=1, device=cpu) = aten::tanh(%12) # C:\Users\gribo\AppData\Local\Temp\ipykernel_11600\19045061.py:14:0
  %14 : (Float(3, 4, strides=[4, 1], requires_grad=1, device=cpu), Float(3, 4, strides=[4, 1], requires_grad=1, device=cpu)) = prim::TupleConstruct(%13, %13)
  return (%14)



In [31]:
# However, this is a very low-level representation and most of the information contained in the graph is not useful 
# for end users. Instead, we can use the .code property to give a Python-syntax interpretation of the code:

print(traced_cell.code)

def forward(self,
    x: Tensor,
    h: Tensor) -> Tuple[Tensor, Tensor]:
  linear = self.linear
  _0 = torch.tanh(torch.add((linear).forward(x, ), h))
  return (_0, _0)



In [32]:
# So why did we do all this? There are several reasons:
# 1. TorchScript code can be invoked in its own interpreter, which is basically a restricted Python interpreter.
# This interpreter does not acquire the Global Interpreter Lock, and so many requests can be processed
# on the same instance simultaneously.
# 2. This format allows us to save the whole model to disk and load it into another environment,
# such as in a server written in a language other than Python.
# 3. TorchScript gives us a representation in which we can do compiler optimizations on the code to provide
# more efficient execution.
# 4. TorchScript allows us to interface with many backend/device runtimes that require a broader view of the program
# than individual operators.

# We can see that invoking traced_cell produces the same results as the Python module:

# Same inputs to the eager module and to its traced counterpart.
print(my_cell(x, h))
print(traced_cell(x, h))

(tensor([[-0.2541,  0.2460,  0.2297,  0.1014],
        [-0.2329, -0.2911,  0.5641,  0.5015],
        [ 0.1688,  0.2252,  0.7251,  0.2530]], grad_fn=<TanhBackward0>), tensor([[-0.2541,  0.2460,  0.2297,  0.1014],
        [-0.2329, -0.2911,  0.5641,  0.5015],
        [ 0.1688,  0.2252,  0.7251,  0.2530]], grad_fn=<TanhBackward0>))
(tensor([[-0.2541,  0.2460,  0.2297,  0.1014],
        [-0.2329, -0.2911,  0.5641,  0.5015],
        [ 0.1688,  0.2252,  0.7251,  0.2530]], grad_fn=<TanhBackward0>), tensor([[-0.2541,  0.2460,  0.2297,  0.1014],
        [-0.2329, -0.2911,  0.5641,  0.5015],
        [ 0.1688,  0.2252,  0.7251,  0.2530]], grad_fn=<TanhBackward0>))


## Using Scripting to Convert Modules

In [33]:
# There’s a reason we used version two of our module, and not the one with the control-flow-laden submodule. 
# Let’s examine that now:

class MyDecisionGate(torch.nn.Module):
    # Gate with data-dependent control flow: which branch runs depends on the values in x.
    def forward(self, x):
        # Pass x through unchanged when its elements sum to a positive value ...
        if x.sum() > 0:
            return x
        # ... otherwise return its negation.
        else:
            return -x

class MyCell(torch.nn.Module):
    # Toy recurrent cell whose gate submodule is supplied by the caller instead of being
    # built here, so the same class can wrap either a plain or an already-compiled gate.
    def __init__(self, dg):
        super(MyCell, self).__init__()
        # Gate module applied to the linear projection of x.
        self.dg = dg
        self.linear = torch.nn.Linear(4, 4)

    def forward(self, x, h):
        # new_h = tanh(dg(linear(x)) + h); returned twice as an (output, state) pair.
        new_h = torch.tanh(self.dg(self.linear(x)) + h)
        return new_h, new_h

# Build the cell around a plain (eager) gate and trace it with the x and h from an earlier cell.
my_cell = MyCell(MyDecisionGate())
traced_cell = torch.jit.trace(my_cell, (x, h))

# Show what the trace recorded for the gate submodule, then for the whole cell.
print(traced_cell.dg.code)
print(traced_cell.code)

# Looking at the .code output, we can see that the if-else branch is nowhere to be found! Why?
# Tracing does exactly what we said it would: run the code, record the operations that happen and construct a ScriptModule
# that does exactly that. Unfortunately, things like control flow are erased.

def forward(self,
    argument_1: Tensor) -> NoneType:
  return None

def forward(self,
    x: Tensor,
    h: Tensor) -> Tuple[Tensor, Tensor]:
  dg = self.dg
  linear = self.linear
  _0 = (linear).forward(x, )
  _1 = (dg).forward(_0, )
  _2 = torch.tanh(torch.add(_0, h))
  return (_2, _2)



  if x.sum() > 0:


In [34]:
# How can we faithfully represent this module in TorchScript? We provide a script compiler, which does direct analysis
# of your Python source code to transform it into TorchScript. Let’s convert MyDecisionGate using the script compiler:

# Compile the gate from its source, so the if/else branch is kept.
scripted_gate = torch.jit.script(MyDecisionGate())

# Wrap the scripted gate in a new cell (rebinding my_cell), then script the cell as well.
my_cell = MyCell(scripted_gate)
scripted_cell = torch.jit.script(my_cell)

print(scripted_gate.code)
print(scripted_cell.code)

def forward(self,
    x: Tensor) -> Tensor:
  if bool(torch.gt(torch.sum(x), 0)):
    _0 = x
  else:
    _0 = torch.neg(x)
  return _0

def forward(self,
    x: Tensor,
    h: Tensor) -> Tuple[Tensor, Tensor]:
  dg = self.dg
  linear = self.linear
  _0 = torch.add((dg).forward((linear).forward(x, ), ), h)
  new_h = torch.tanh(_0)
  return (new_h, new_h)



In [35]:
# Hooray! We’ve now faithfully captured the behavior of our program in TorchScript. Let’s now try running the program:

# Draw a fresh pair of (3, 4) random inputs; `x` is sampled before `h`.
x = torch.rand(3, 4)
h = torch.rand(3, 4)
print(scripted_cell(x, h))

(tensor([[ 0.5679,  0.5762,  0.2506, -0.0734],
        [ 0.5228,  0.7122,  0.6985, -0.0656],
        [ 0.6187,  0.4487,  0.7456, -0.0238]], grad_fn=<TanhBackward0>), tensor([[ 0.5679,  0.5762,  0.2506, -0.0734],
        [ 0.5228,  0.7122,  0.6985, -0.0656],
        [ 0.6187,  0.4487,  0.7456, -0.0238]], grad_fn=<TanhBackward0>))


## Mixing Scripting and Tracing

In [36]:
# Some situations call for using tracing rather than scripting (e.g. a module has many architectural decisions that are made 
# based on constant Python values that we would like to not appear in TorchScript). In this case, scripting can be composed with
# tracing: torch.jit.script will inline the code for a traced module, and tracing will inline the code for a scripted module.

# An example of the first case:

class MyRNNLoop(torch.nn.Module):
    """Applies a traced ``MyCell`` once per slice of the input along dimension 0."""

    def __init__(self):
        super().__init__()
        # The cell wraps the already-scripted gate and is traced using the
        # notebook-level `x` and `h` as example inputs.
        self.cell = torch.jit.trace(MyCell(scripted_gate), example_inputs=(x, h))

    def forward(self, xs):
        """Feed ``xs[0]``, ``xs[1]``, ... through the cell and return the last ``(y, h)``."""
        # Both the running state and the output start as (3, 4) zero tensors.
        h = torch.zeros(3, 4)
        y = torch.zeros(3, 4)
        for step in range(xs.size(0)):
            y, h = self.cell(xs[step], h)
        return y, h

# Script the looping module (its cell is traced inside __init__) and print the
# compiled forward, which keeps the Python `for` loop.
rnn_loop = torch.jit.script(MyRNNLoop())
print(rnn_loop.code)

def forward(self,
    xs: Tensor) -> Tuple[Tensor, Tensor]:
  h = torch.zeros([3, 4])
  y = torch.zeros([3, 4])
  y0 = y
  h0 = h
  for i in range(torch.size(xs, 0)):
    cell = self.cell
    _0 = (cell).forward(torch.select(xs, 0, i), h0, )
    y1, h1, = _0
    y0, h0 = y1, h1
  return (y0, h0)



In [37]:
# And an example of the second case:

class WrapRNN(torch.nn.Module):
    """Applies ReLU to the first output of a scripted ``MyRNNLoop``."""

    def __init__(self):
        super().__init__()
        self.loop = torch.jit.script(MyRNNLoop())

    def forward(self, xs):
        """Run the scripted loop on ``xs`` and return ``relu`` of its first output."""
        # The loop returns a pair; only the first element is used here.
        y, _h = self.loop(xs)
        return torch.relu(y)

# Trace the wrapper on a random (10, 3, 4) example input. The example inputs are
# passed as an explicit one-element tuple (note the trailing comma), so the
# argument is unambiguously "a tuple of inputs" rather than a parenthesized tensor.
traced = torch.jit.trace(WrapRNN(), (torch.rand(10, 3, 4),))
print(traced.code)

# This way, scripting and tracing can be used when the situation calls for each of them and used together.

def forward(self,
    xs: Tensor) -> Tensor:
  loop = self.loop
  _0, y, = (loop).forward(xs, )
  return torch.relu(y)



## Saving and Loading models

In [38]:
# We provide APIs to save and load TorchScript modules to/from disk in an archive format. This format includes code, parameters,
# attributes, and debug information, meaning that the archive is a freestanding representation of the model that can be loaded 
# in an entirely separate process. Let’s save and load our wrapped RNN module:

# Write the traced wrapper to an archive on disk, then read it back.
torch.jit.save(traced, 'wrapped_rnn.pt')
loaded = torch.jit.load('wrapped_rnn.pt')

# Show the reloaded module hierarchy followed by its top-level code.
print(loaded, loaded.code, sep="\n")

# As you can see, serialization preserves the module hierarchy and the code we’ve been examining throughout. 
# The model can also be loaded, for example, into C++ for python-free execution.

RecursiveScriptModule(
  original_name=WrapRNN
  (loop): RecursiveScriptModule(
    original_name=MyRNNLoop
    (cell): RecursiveScriptModule(
      original_name=MyCell
      (dg): RecursiveScriptModule(original_name=MyDecisionGate)
      (linear): RecursiveScriptModule(original_name=Linear)
    )
  )
)
def forward(self,
    xs: Tensor) -> Tensor:
  loop = self.loop
  _0, y, = (loop).forward(xs, )
  return torch.relu(y)



# Tracing model

In [39]:
# Import required libraries
import torch
#import numpy as np
#import cv2
import torchvision.transforms as transforms
from torchvision.io import read_image
from torchvision.models.quantization import resnet50, ResNet50_QuantizedWeights

# Read the image
#image = cv2.imread("example.jpeg")
#image1 = cv2.imread("example1.jpeg")
#image2 = cv2.imread("example2.jpeg")

# Convert BGR image to RGB image
#image1 = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

#image = read_image("img/example.jpg")
# Load the sample images from disk (`image1` is read but not used further in this cell).
image1 = read_image("img/example1.jpg")
image2 = read_image("img/example2.jpg")

# Step 1: build the quantized ResNet-50 with its default weights and switch it to eval mode
weights = ResNet50_QuantizedWeights.DEFAULT
model = resnet50(weights=weights, quantize=True).eval()

# Step 2: fetch the inference transforms that belong to these weights
preprocess = weights.transforms()

# Step 3: preprocess the second image and add a leading batch dimension
batch = preprocess(image2).unsqueeze(0)

# Step 4: run the model, convert its output to probabilities and report the top class
prediction = torch.softmax(model(batch).squeeze(0), dim=0)
class_id = int(prediction.argmax())
score = float(prediction[class_id])
category_name = weights.meta["categories"][class_id]
print(f"{category_name}: {100 * score}%")

Border collie: 8.586472272872925%


In [40]:
# Trace the quantized model on a random example input of shape (1, 3, 1280, 720).
# NOTE(review): this example shape is chosen independently of `batch`, which comes
# from `preprocess`; confirm the traced graph is meant to be used with other input sizes.
module = torch.jit.trace(model, torch.rand(1, 3, 1280, 720))

In [41]:
# Write the traced model to disk as a TorchScript archive.
torch.jit.save(module, "module.pt")

In [42]:
# Show the top-level forward recorded by the tracer for the quantized model.
print(module.code)

def forward(self,
    x: Tensor) -> Tensor:
  dequant = self.dequant
  fc = self.fc
  avgpool = self.avgpool
  layer4 = self.layer4
  layer3 = self.layer3
  layer2 = self.layer2
  layer1 = self.layer1
  maxpool = self.maxpool
  relu = self.relu
  bn1 = self.bn1
  conv1 = self.conv1
  quant = self.quant
  _0 = (conv1).forward((quant).forward(x, ), )
  _1 = (bn1).forward()
  _2 = (relu).forward()
  _3 = (layer1).forward((maxpool).forward(_0, ), )
  _4 = (layer3).forward((layer2).forward(_3, ), )
  _5 = (avgpool).forward((layer4).forward(_4, ), )
  _6 = (dequant).forward((fc).forward(torch.flatten(_5, 1), ), )
  return _6



In [43]:
# Read the archive back from disk. This rebinds `loaded`, which earlier in the
# notebook held the reloaded 'wrapped_rnn.pt' module.
loaded = torch.jit.load('module.pt')

In [44]:
# Show the reloaded module's forward so it can be compared with the traced one.
print(loaded.code)

def forward(self,
    x: Tensor) -> Tensor:
  dequant = self.dequant
  fc = self.fc
  avgpool = self.avgpool
  layer4 = self.layer4
  layer3 = self.layer3
  layer2 = self.layer2
  layer1 = self.layer1
  maxpool = self.maxpool
  relu = self.relu
  bn1 = self.bn1
  conv1 = self.conv1
  quant = self.quant
  _0 = (conv1).forward((quant).forward(x, ), )
  _1 = (bn1).forward()
  _2 = (relu).forward()
  _3 = (layer1).forward((maxpool).forward(_0, ), )
  _4 = (layer3).forward((layer2).forward(_3, ), )
  _5 = (avgpool).forward((layer4).forward(_4, ), )
  _6 = (dequant).forward((fc).forward(torch.flatten(_5, 1), ), )
  return _6



In [45]:
# Repeat the prediction on the same `batch`, this time with the module reloaded
# from disk, and print the top category and its score.
prediction = torch.softmax(loaded(batch).squeeze(0), dim=0)
class_id = int(prediction.argmax())
score = float(prediction[class_id])
category_name = weights.meta["categories"][class_id]
print(f"{category_name}: {100 * score}%")

Border collie: 8.586472272872925%
