# PyTorch
The following code fragments illustrate the typical structure of a PyTorch program, with further details and various options for each component.

# Typical Structure of a PyTorch Program

# Defining a Custom Model

In [None]:
import torch

class MyModel(torch.nn.Module):

    def __init__(self):
        super(MyModel, self).__init__()
        # define structure of the network here

    def forward(self, input):
        # apply network and return output


# Defining a Custom Model
This code defines a module for computing a function of the form $(x,y) \mapsto Ax \log(y) + B y^{2}$

In [None]:
import torch.nn as nn

class MyModel(nn.Module):

    def __init__(self):
        super(MyModel, self).__init__()
        self.A = nn.Parameter(torch.randn((1),requires_grad=True))
        self.B = nn.Parameter(torch.randn((1),requires_grad=True))

    def forward(self, input):
        output = self.A * input[:,0] * torch.log(input[:,1]) \
               + self.B * input[:,1] * input[:,1]
        return output

In [None]:
# create neural network according to model specification
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = MyModel().to(device) # CPU or GPU

# prepare to load the training and test data
train_loader = torch.utils.data.DataLoader(...)
test_loader = torch.utils.data.DataLoader(...)

# choose between SGD, Adam or other optimizer
optimizer = torch.optim.SGD(net.parameters,...)

# enter the training loop
for epoch in range(1, epochs):
    train(params, net, device, train_loader, optimizer)
    # periodically evaluate the network on the test data
    if epoch % 10 == 0:
        test(params, net, device, test_loader)

# Building a Net from Individual Components

In [None]:
class MyModel(torch.nn.Module):

    def __init__(self):
        super(MyModel, self).__init__()
        self.in_to_hid = torch.nn.Linear(2,2)
        self.hid_to_out = torch.nn.Linear(2,1)

    def forward(self, input):
        hid_sum = self.in_to_hid(input)
        hidden = torch.tanh(hid_sum)
        out_sum = self.hid_to_out(hidden)
        output = torch.sigmoid(out_sum)
        return output

# Defining a Sequential Network

In [None]:
class MyModel(torch.nn.Module):

    def __init__(self, num_input, num_hid, num_out):
        super(MyModel, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(num_input, num_hid),
            nn.Tanh(),
            nn.Linear(num_hid, num_out),
            nn.Sigmoid()
        )
    def forward(self, input):
        output = self.main(input)
        return output

# Sequential Components
## Network Layers:
- nn.Linear()
- nn.Conv2d()

## Intermediate Operators:
- nn.Dropout()
- nn.BatchNorm()

## Activation Functions:
- nn.Tanh()
- nn.Sigmoid()
- nn.ReLU()

# Declaring Data Explicitly

In [None]:
import torch.utils.data

# input and target values for the XOR task
input = torch.Tensor([[0,0],[0,1],[1,0],[1,1]])
target = torch.Tensor([[0],[1],[1],[0]])

xdata = torch.utils.data.TensorDataset(input, target)
train_loader = torch.utils.data.DataLoader(xdata, batch_size=4)

# Loading Data from a .csv File

In [None]:
import pandas as pd

df = pd.read_csv("sonar.all-data.csv")
df = df.replace('R', 0)
df = df.replace('M', 1)
data = torch.tensor(df.values,dtype=torch.float32)
num_input = data.shape[1] - 1
input = data[:, 0:num_input]
target = data[:, num_input:num_input + 1]
dataset = torch.utils.data.TensorDataset(input,target)

# Custom Datasets

In [None]:
from data import ImageFolder

# load images from a specified directory
dataset = ImageFolder(folder, transform)

import torchvision.datasets as dsets
# download popular image datasets remotely
mnistset = dsets.MNIST(...)
cifarset = dsets.CIFAR10(...)
celebset = dsets.CelebA(...)

# Choosing an Optimizer

In [None]:
# SGD stands for “Stochastic Gradient Descent”
optimizer = torch.optim.SGD(
    net.parameters(),
    lr=0.01,
    momentum=0.9,
    weight_decay=0.0001
)

# Adam = Adaptive Moment Estimation (good for deep networks)
optimizer = torch.optim.Adam(
    net.parameters(),
    eps=0.000001,
    lr=0.01,
    betas=(0.5,0.999),
    weight_decay=0.0001
)

# Training

In [None]:
def train(args, net, device, train_loader, optimizer):
    for batch_idx, (data,target) in enumerate(train_loader):
        optimizer.zero_grad() # zero the gradients
        output = net(data) # apply network
        loss = ... # compute loss function
        loss.backward() # update gradients
        optimizer.step() # update weights

# Loss Functions

In [None]:
import torch.nn.functional as F
loss = torch.sum((output - target) * (output - target))
loss = F.nll_loss(output, target)
loss = F.binary_cross_entropy(output, target)
loss = F.softmax(output, dim=1)
loss = F.log_softmax(output, dim=1)

# Testing

In [None]:
def test(args, model, device, test_loader):
with torch.no_grad(): # suppress updating of gradients
    net.eval() # toggle batch norm, dropout
    test_loss = 0
    for data, target in test_loader:
        output = model(data)
        test_loss += ...
    print(test_loss)
    net.train() # toggle batch norm, dropout back again

## Computational Graphs
PyTorch automatically builds a computational graph, enabling it to backpropagate derivatives.

Every parameter includes `.data` and `.grad` components, for example:
`A.data`
`A.grad`

`optimizer.zero_grad()` sets all `.grad` components to zero.

`loss.backward()` updates the `.grad` component of all Parameters by backpropagating gradients through the computational graph.

`optimizer.step()` updates the `.data` components.

## Controlling the Computational Graph
If we need to stop the gradients from being backpropagated through a certain variable (or expression) A, we can exclude it from the computational graph by using:

`A.detach()`

By default, `loss.backward()` discards the computational graph after computing the gradients.

If needed, we can force it to keep the computational graph by calling it this way:

`loss.backward(retain_graph=True)`

# Exercise: Running pytorch

The following program solves the simplest possible machine learning task:

solve $f(x)=Axf(x)$ such that $f(1)=1$

In [3]:
import torch
import torch.utils.data
import numpy as np

lr = 1.9  # learning rate
mom = 0.9  # momentum

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'

class MyModel(torch.nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.A = torch.nn.Parameter(torch.zeros(1, requires_grad=True))
    def forward(self, input):
        output  = self.A * input
        return(output)

input  = torch.Tensor([[1]])
target = torch.Tensor([[1]])

slope_dataset = torch.utils.data.TensorDataset(input, target)
train_loader  = torch.utils.data.DataLoader(slope_dataset, batch_size=1)

# create neural network according to model specification
net = MyModel().to(device) # CPU or GPU

# choose between SGD, Adam or other optimizer
optimizer = torch.optim.SGD(net.parameters(), lr=lr, momentum=mom)

epochs = 1000

for epoch in range(1, epochs):
    for batch_id, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()  # zero the gradients
        output = net(data)  # apply network
        loss = 0.5*torch.mean((output-target) * (output-target))
        if type(net.A.grad) == type(None):
            print('Ep%3d: zero_grad(): A.grad=  None  A.data=%7.4f loss=%7.4f' \
                      % (epoch, net.A.data, loss))
        else:
            print('Ep%3d: zero_grad(): A.grad=%7.4f A.data=%7.4f loss=%7.4f' \
                      % (epoch, net.A.grad, net.A.data, loss))
        loss.backward()  # compute gradients
        optimizer.step()  # update weights
        print('            step(): A.grad=%7.4f A.data=%7.4f' \
                      % (net.A.grad, net.A.data))
        if loss < 0.000000001 or np.isnan(loss.data):
            exit(0)

Ep  1: zero_grad(): A.grad=  None  A.data= 0.0000 loss= 0.5000
            step(): A.grad=-1.0000 A.data= 1.9000
Ep  2: zero_grad(): A.grad= 0.0000 A.data= 1.9000 loss= 0.4050
            step(): A.grad= 0.9000 A.data= 1.9000
Ep  3: zero_grad(): A.grad= 0.0000 A.data= 1.9000 loss= 0.4050
            step(): A.grad= 0.9000 A.data= 0.1900
Ep  4: zero_grad(): A.grad= 0.0000 A.data= 0.1900 loss= 0.3280
            step(): A.grad=-0.8100 A.data= 0.1900
Ep  5: zero_grad(): A.grad= 0.0000 A.data= 0.1900 loss= 0.3280
            step(): A.grad=-0.8100 A.data= 1.7290
Ep  6: zero_grad(): A.grad= 0.0000 A.data= 1.7290 loss= 0.2657
            step(): A.grad= 0.7290 A.data= 1.7290
Ep  7: zero_grad(): A.grad= 0.0000 A.data= 1.7290 loss= 0.2657
            step(): A.grad= 0.7290 A.data= 0.3439
Ep  8: zero_grad(): A.grad= 0.0000 A.data= 0.3439 loss= 0.2152
            step(): A.grad=-0.6561 A.data= 0.3439
Ep  9: zero_grad(): A.grad= 0.0000 A.data= 0.3439 loss= 0.2152
            step(): A.grad=-0.656

# Exercise: XOR with Pytorch

In [1]:
import torch
import torch.utils.data
import torch.nn.functional as F

lr = 0.1
mom = 0.0
init = 1.0

class MyModel(torch.nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        # define structure of the network here
        self.in_hid  = torch.nn.Linear(2,2)
        self.hid_out = torch.nn.Linear(2,1)
    def forward(self, input):
        # apply network and return output
        hid_sum = self.in_hid(input)
        hidden  = torch.tanh(hid_sum)
        out_sum = self.hid_out(hidden)
        output  = torch.sigmoid(out_sum)
        return(output)

device = 'cpu'

input  = torch.Tensor([[0,0],[0,1],[1,0],[1,1]])
target = torch.Tensor([[0],[1],[1],[0]])

xor_dataset  = torch.utils.data.TensorDataset(input,target)
train_loader = torch.utils.data.DataLoader(xor_dataset,batch_size=4)

# create neural network according to model specification
net = MyModel().to(device) # CPU or GPU

# initialize weight values
net.in_hid.weight.data.normal_(0,init)
net.hid_out.weight.data.normal_(0,init)

# choose between SGD, Adam or other optimizer
optimizer = torch.optim.SGD(net.parameters(),lr=lr,momentum=mom)

epochs = 10000

for epoch in range(1, epochs):
    #train(net, device, train_loader, optimizer)
    for batch_id, (data,target) in enumerate(train_loader):
        optimizer.zero_grad() # zero the gradients
        output = net(data)    # apply network
        loss = F.binary_cross_entropy(output,target)
        loss.backward()       # compute gradients
        optimizer.step()      # update weights
        if epoch % 100 == 0:
            print('ep%3d: loss = %7.4f' % (epoch, loss.item()))
        if loss < 0.01:
            print("Global Mininum")
            exit(0)
print("Local Minimum")

ep100: loss =  0.6725
ep200: loss =  0.6182
ep300: loss =  0.5233
ep400: loss =  0.3734
ep500: loss =  0.2390
ep600: loss =  0.1604
ep700: loss =  0.1163
ep800: loss =  0.0896
ep900: loss =  0.0722
ep1000: loss =  0.0602
ep1100: loss =  0.0514
ep1200: loss =  0.0447
ep1300: loss =  0.0396
ep1400: loss =  0.0354
ep1500: loss =  0.0320
ep1600: loss =  0.0292
ep1700: loss =  0.0268
ep1800: loss =  0.0248
ep1900: loss =  0.0230
ep2000: loss =  0.0215
ep2100: loss =  0.0202
ep2200: loss =  0.0190
ep2300: loss =  0.0179
ep2400: loss =  0.0170
ep2500: loss =  0.0161
ep2600: loss =  0.0153
ep2700: loss =  0.0146
ep2800: loss =  0.0140
ep2900: loss =  0.0134
ep3000: loss =  0.0129
ep3100: loss =  0.0124
ep3200: loss =  0.0119
ep3300: loss =  0.0115
ep3400: loss =  0.0111
ep3500: loss =  0.0107
ep3600: loss =  0.0103
ep3700: loss =  0.0100
Global Mininum
Global Mininum
Global Mininum
Global Mininum
Global Mininum
Global Mininum
Global Mininum
Global Mininum
Global Mininum
Global Mininum
Global M