In [None]:
import numpy as np
import pandas as pd

# PyTorch stuff we'll need
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim

## Learning Rate Scheduler

In [None]:
# Toy network (2 inputs -> 5 hidden units -> 1 output) and the Adam optimizer that updates it
model = nn.Sequential(
    nn.Linear(in_features=2, out_features=5),
    nn.ReLU(),
    nn.Linear(in_features=5, out_features=1),
)
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

We can group the parameters of our model into different groups (will be used for transfer learning later)
- here we only have one group
- note the learning rate

In [None]:
# Display the optimizer's parameter groups; each group carries its own
# hyper-parameters, including the 'lr' entry read in the cells below
optimizer.param_groups

The learning rate scheduler lets us adjust the learning rate according to different schemes
- For example the following is [Cosine Annealing](https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.CosineAnnealingLR.html#torch.optim.lr_scheduler.CosineAnnealingLR) set for 100 iterations

In [None]:
# Cosine annealing schedule attached to the optimizer, with T_max = 100 scheduler steps
lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)

In [None]:
# why do you think we are getting a warning here?
# NOTE(review): the original wording said "error"; PyTorch is expected to report
# this as a UserWarning and keep running -- confirm on the installed torch version
# Print the learning rate of the first parameter group before and after one scheduler step
print(optimizer.param_groups[0]['lr'])
lr_scheduler.step()
print(optimizer.param_groups[0]['lr'])

In [None]:
# Advance the schedule ten more times, showing the learning rate after every step
for _ in range(10):
    lr_scheduler.step()
    print(optimizer.param_groups[0]['lr'])

Try implementing a [One Cycle](https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.OneCycleLR.html#torch.optim.lr_scheduler.OneCycleLR) learning rate schedule and plotting the resulting learning rate at each step

## Dropout

The layer ```nn.Dropout(p)``` randomly zeros out elements of the input tensor with probability ```p```. The resulting tensor is then scaled by $\frac{1}{1-p}$.
- Keeps output same scale as during test time (no dropout)
- You can think of Dropout as adding zero-mean noise to the activations: thanks to the rescaling, the expected value of each element is unchanged

In [None]:
# Push a tensor of ones through dropout (p = 0.2) to see which entries get
# zeroed and how the surviving entries are rescaled
drop = nn.Dropout(p=0.2)
x = torch.ones(100, 100)
print(x)
y = drop(x)
y

In [None]:
# model with Dropout
class TwoLayerNN_drop(nn.Module):
    """Two-layer fully connected network with dropout on the hidden activations.

    Args:
        input_dim: number of input features per sample.
        hidden_dim: width of the hidden layer.
        output_dim: number of outputs per sample.
        p: probability of zeroing each hidden activation while the module is in
            training mode. Defaults to 0.25, the value that used to be hard-coded.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, p=0.25):
        super().__init__()
        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=p)

    def forward(self, x):
        """Run linear -> ReLU -> dropout -> linear.

        Returns:
            The second layer's output with a trailing axis of size 1 removed,
            e.g. shape (batch,) for a (batch, input_dim) input when
            output_dim == 1.
        """
        x = self.linear1(x)
        x = self.relu(x)

        # Add some dropout after first layer
        x = self.dropout(x)

        x = self.linear2(x)
        # Squeeze only the trailing feature axis: a bare torch.squeeze(x) would
        # also drop the batch axis for a batch of one and return a 0-d tensor,
        # which no longer lines up with a (1,)-shaped target.
        return x.squeeze(-1)

## Weight Decay
- Let's use [Stochastic Gradient Descent](https://pytorch.org/docs/stable/generated/torch.optim.SGD.html#torch.optim.SGD) with weight decay

In [None]:
# Deliberately huge weight decay so that its effect is visible after a single step
model = nn.Sequential(
    nn.Linear(in_features=2, out_features=5),
    nn.ReLU(),
    nn.Linear(in_features=5, out_features=1),
)
optimizer = optim.SGD(params=model.parameters(), lr=1e-3, weight_decay=1000.0)

In [None]:
# Show every parameter tensor of the model, one after another (values before the update step)
print(*model.parameters(), sep='\n')

In [None]:
model.train()
y = model(torch.ones(10, 2))

# start from clean gradients so that re-running this cell does not accumulate
# the gradients of an earlier run into this step
optimizer.zero_grad()

# train w.r.t a loss function that wants to maximize output
# (Tensor.sum() reduces the whole batch in one call instead of looping over
# rows with Python's builtin sum)
(1 / y.sum()).backward()
optimizer.step()

In [None]:
# weights have decreased -- print every parameter tensor again to compare with the values above
print(*model.parameters(), sep='\n')

## Batch Normalization

In [None]:
# model with batch normalization
class TwoLayerNN_BN(nn.Module):
    """Two-layer fully connected network with batch normalization on the hidden layer.

    Args:
        input_dim: number of input features per sample.
        hidden_dim: width of the hidden layer (also the number of features
            that the batch-norm layer normalizes).
        output_dim: number of outputs per sample.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

        # we input the number of features to be normalizing across a batch
        self.bn = nn.BatchNorm1d(hidden_dim)

    def forward(self, x):
        """Run linear -> batch norm -> ReLU -> linear.

        Returns:
            The second layer's output with a trailing axis of size 1 removed,
            e.g. shape (batch,) for a (batch, input_dim) input when
            output_dim == 1.
        """
        x = self.linear1(x)

        # add batch normalization before activation
        x = self.bn(x)
        x = self.relu(x)

        x = self.linear2(x)
        # no batch norm for final output!

        # Squeeze only the trailing feature axis: a bare torch.squeeze(x) would
        # also drop the batch axis when a single sample is evaluated and return
        # a 0-d tensor instead of shape (1,).
        return x.squeeze(-1)

In [None]:
model = TwoLayerNN_BN(input_dim=2, hidden_dim=5, output_dim=1)
bn_layer = model.bn

# note that batch normalization initializes as pure mini-batch normalization;
# the learnable parameters printed here will change during training
print(*bn_layer.parameters(), sep='\n')

## Early Stopping
- ideas
 * stop training once the validation loss has not improved for a set number of epochs
 * save the model parameters after an epoch whenever they achieve a new minimum validation loss

In [None]:
import seaborn as sns
# Load seaborn's example 'mpg' dataset and preview its first rows
# NOTE(review): load_dataset presumably fetches the data on first use, so this
# cell may need network access -- confirm for offline environments
mpg = sns.load_dataset('mpg')
mpg.head()

In [None]:
class MPGDataset(Dataset):
    """Dataset view over a DataFrame of cars.

    Each item is a pair ``(x, y)``: ``x`` is a float tensor holding the row's
    'displacement' and 'weight' values, ``y`` is a float scalar tensor holding
    the row's 'mpg' value.
    """

    def __init__(self, df):
        # keep a reference to the frame; rows are read lazily in __getitem__
        self.df = df

    def __len__(self):
        # one sample per DataFrame row
        return len(self.df)

    def __getitem__(self, idx):
        # positional row lookup
        record = self.df.iloc[idx]

        feature_values = [record[column] for column in ('displacement', 'weight')]
        features = torch.tensor(feature_values).float()

        target = torch.tensor(record['mpg']).float()

        return features, target

# train/val split: the first 100 rows are held out for validation, the rest is training data
# NOTE(review): the split is positional, not random -- if the rows are ordered
# in some meaningful way the two sets may differ systematically; confirm this is intended
mpg_val = mpg.iloc[:100].reset_index(drop=True)
mpg_train = mpg.iloc[100:].reset_index(drop=True)
mpg_train_ds, mpg_val_ds = MPGDataset(mpg_train), MPGDataset(mpg_val)

# load into dataloaders: shuffled mini-batches of 50 for training,
# fixed-order batches of up to 100 for validation
mpg_train_dl = DataLoader(mpg_train_ds, shuffle=True, batch_size=50)
mpg_val_dl = DataLoader(mpg_val_ds, shuffle=False, batch_size=100)

In [None]:
# vanilla two-layer
class TwoLayerNN(nn.Module):
    """Plain two-layer fully connected network (linear -> ReLU -> linear).

    Args:
        input_dim: number of input features per sample.
        hidden_dim: width of the hidden layer.
        output_dim: number of outputs per sample.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the network to ``x``.

        Returns:
            The second layer's output with a trailing axis of size 1 removed,
            e.g. shape (batch,) for a (batch, input_dim) input when
            output_dim == 1.
        """
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        # Squeeze only the trailing feature axis: a bare torch.squeeze(x) would
        # also drop the batch axis for a batch of one and return a 0-d tensor,
        # which no longer lines up with a (1,)-shaped target.
        return x.squeeze(-1)

In [None]:
# large network to induce overfitting, trained with an L1 loss and Adam
model = TwoLayerNN(input_dim=2, hidden_dim=10, output_dim=1)
lossFun = nn.L1Loss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
from tqdm.notebook import tqdm

# collect losses: one mean training loss and one mean validation loss per epoch
avg_train = []
avg_val = []

for epoch in tqdm(range(500)):
    # ---- training pass ----
    # switch to training mode once per epoch (the validation pass below leaves
    # the model in eval mode)
    model.train()
    train_losses = []
    for x, y in mpg_train_dl:
        y_pred = model(x)
        loss = lossFun(y_pred, y)
        train_losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_train.append(sum(train_losses) / len(train_losses))

    # ---- validation pass ----
    model.eval()
    val_losses = []
    # evaluation needs no gradients, so do not build the autograd graph
    with torch.no_grad():
        for x, y in mpg_val_dl:
            y_pred = model(x)
            val_losses.append(lossFun(y_pred, y).item())

    # exactly one entry per epoch, even if the loader yields several batches
    avg_val.append(sum(val_losses) / len(val_losses))

In [None]:
import matplotlib.pyplot as plt

# plot losses: label both curves and the axes so the figure can be read on its own
fig, ax = plt.subplots()
ax.plot(avg_train, label='train')
ax.plot(avg_val, label='validation')
ax.set_xlabel('epoch')
ax.set_ylabel('L1 loss')
ax.set_title('Two-layer network: training vs. validation loss')
ax.legend()
plt.show()

What about with batch normalization?

In [None]:
model = TwoLayerNN_BN(2, 10, 1)
lossFun = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr = 0.001)

# collect losses: one mean training loss and one mean validation loss per epoch
avg_train = []
avg_val = []

for epoch in tqdm(range(500)):
    # ---- training pass ----
    # switch to training mode once per epoch so batch norm uses batch
    # statistics (the validation pass below leaves the model in eval mode)
    model.train()
    train_losses = []
    for x, y in mpg_train_dl:
        y_pred = model(x)
        loss = lossFun(y_pred, y)
        train_losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_train.append(sum(train_losses) / len(train_losses))

    # ---- validation pass ----
    model.eval()
    val_losses = []
    # evaluation needs no gradients, so do not build the autograd graph
    with torch.no_grad():
        for x, y in mpg_val_dl:
            y_pred = model(x)
            val_losses.append(lossFun(y_pred, y).item())

    # exactly one entry per epoch, even if the loader yields several batches
    avg_val.append(sum(val_losses) / len(val_losses))

# label both curves and the axes so the figure can be read on its own
fig, ax = plt.subplots()
ax.plot(avg_train, label='train')
ax.plot(avg_val, label='validation')
ax.set_xlabel('epoch')
ax.set_ylabel('L1 loss')
ax.set_title('Two-layer network with batch norm: training vs. validation loss')
ax.legend()
plt.show()

## Categorical Embeddings
- let's include the make of the car in our model

In [None]:
# Preview the frame again; the 'name' column is what the make is parsed from in the next cell
mpg.head()

In [None]:
# the make is the first space-separated token of each car's name
makes = [car_name.split(' ')[0] for car_name in mpg['name']]
mpg['make'] = makes
mpg.head()

In [None]:
# create an index for possible values of make
# this is an integer (label) encoding rather than a one-hot vector: each
# distinct make gets one integer index
# sorted() keeps the make -> index assignment stable across kernel restarts;
# iterating a bare set of strings can yield a different order in each Python process
make_dict = {make: i for i, make in enumerate(sorted(set(makes)))}
make_dict

In [None]:
class MPGDataset(Dataset):
    """Dataset view over a DataFrame of cars that also exposes the car's make.

    Each item is a triple ``(x1, x2, y)``:
      * ``x1`` -- float tensor holding the row's 'displacement' and 'weight',
      * ``x2`` -- integer scalar tensor holding the index of the row's 'make',
      * ``y``  -- float scalar tensor holding the row's 'mpg'.

    Args:
        df: DataFrame with 'displacement', 'weight', 'mpg' and 'make' columns.
        make_to_idx: optional mapping from make name to integer index. When
            omitted, the notebook-level ``make_dict`` is looked up each time an
            item is requested (the previous behaviour).
    """

    def __init__(self, df, make_to_idx=None):
        self.df = df
        # None means "fall back to the global make_dict in __getitem__"
        self.make_to_idx = make_to_idx

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        mapping = make_dict if self.make_to_idx is None else self.make_to_idx
        # raises KeyError for a make that is missing from the mapping
        make_idx = mapping[row['make']]

        x1 = torch.tensor([row['displacement'],
                           row['weight']]).float()

        # left as an integer tensor: it is an index, not a numeric feature
        x2 = torch.tensor(make_idx)

        y = torch.tensor(row['mpg']).float()

        return x1, x2, y
    
mpg_ds = MPGDataset(mpg)

# note the second tensor (the make index) in the first sample
mpg_ds[0]

In [None]:
# let's add an embedding layer
class TwoLayerNN_Emb(nn.Module):
    """Two-layer network that combines numeric features with a learned make embedding.

    Args:
        input_dim: size of the vector fed to the first linear layer, i.e. the
            number of numeric features plus ``emb_dim``.
        hidden_dim: width of the hidden layer.
        output_dim: number of outputs per sample.
        num_makes: number of distinct category values the embedding can hold.
            When omitted, ``len(make_dict)`` of the notebook-level mapping is
            used (the previous behaviour).
        emb_dim: length of each embedding vector. Defaults to 2, the value
            that used to be hard-coded.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_makes=None, emb_dim=2):
        super().__init__()
        if num_makes is None:
            num_makes = len(make_dict)

        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, output_dim)

        # first argument is number of values, next is size of embedding
        self.emb = nn.Embedding(num_makes, emb_dim)

        # let's keep in batch normalization
        self.bn = nn.BatchNorm1d(hidden_dim)
        self.relu = nn.ReLU()

    def forward(self, x1, x2):
        """Embed the indices ``x2``, append them to ``x1`` and run the network.

        Args:
            x1: numeric features, one row per sample.
            x2: integer category indices, one per sample.

        Returns:
            The second layer's output with a trailing axis of size 1 removed,
            e.g. shape (batch,) when output_dim == 1.
        """
        x2 = self.emb(x2)

        # concatenate the vectors along dim=1, skipping batch dim
        x = torch.cat((x1, x2), dim=1)

        x = self.linear1(x)
        x = self.bn(x)
        x = self.relu(x)

        x = self.linear2(x)

        # Squeeze only the trailing feature axis: a bare torch.squeeze(x) would
        # also drop the batch axis when a single sample is evaluated and return
        # a 0-d tensor instead of shape (1,).
        return x.squeeze(-1)

In [None]:
mpg_dl = DataLoader(mpg_ds, shuffle=True, batch_size=50)

# what is the correct dimension here?
model = TwoLayerNN_Emb(input_dim=4, hidden_dim=5, output_dim=1)

# push a single batch through the untrained model as a shape check
x1, x2, y = next(iter(mpg_dl))
model(x1, x2)

Let's train it the same way we did above

In [None]:
# train/val split: the first 100 rows are held out for validation, the rest is training data
mpg_val = mpg.iloc[:100].reset_index(drop=True)
mpg_train = mpg.iloc[100:].reset_index(drop=True)
mpg_train_ds, mpg_val_ds = MPGDataset(mpg_train), MPGDataset(mpg_val)

# load into dataloaders: shuffled mini-batches of 50 for training,
# fixed-order batches of up to 100 for validation
mpg_train_dl = DataLoader(mpg_train_ds, shuffle=True, batch_size=50)
mpg_val_dl = DataLoader(mpg_val_ds, shuffle=False, batch_size=100)

In [None]:
lossFun = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr = 0.001)

# collect losses: one mean training loss and one mean validation loss per epoch
avg_train = []
avg_val = []

for epoch in tqdm(range(500)):
    # ---- training pass ----
    # switch to training mode once per epoch (the validation pass below leaves
    # the model in eval mode)
    model.train()
    train_losses = []
    for x1, x2, y in mpg_train_dl:
        y_pred = model(x1, x2)
        loss = lossFun(y_pred, y)
        train_losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_train.append(sum(train_losses) / len(train_losses))

    # ---- validation pass ----
    model.eval()
    val_losses = []
    # evaluation needs no gradients, so do not build the autograd graph
    with torch.no_grad():
        for x1, x2, y in mpg_val_dl:
            y_pred = model(x1, x2)
            val_losses.append(lossFun(y_pred, y).item())

    # exactly one entry per epoch, even if the loader yields several batches
    avg_val.append(sum(val_losses) / len(val_losses))

In [None]:
# label both curves and the axes so the figure can be read on its own
fig, ax = plt.subplots()
ax.plot(avg_train, label='train')
ax.plot(avg_val, label='validation')
ax.set_xlabel('epoch')
ax.set_ylabel('L1 loss')
ax.set_title('Network with make embedding: training vs. validation loss')
ax.legend()
plt.show()

In [None]:
# let's look at the embedding matrix (the embedding layer's single parameter:
# one row per make index, one column per embedding dimension)
print(model.emb.weight)

In [None]:
# compare embeddings for vw and volkswagen
# TODO (exercise): replace each ? with the embedding row index of the
# corresponding make (make_dict maps make names to these indices); this cell
# is not valid Python and will not run until both placeholders are filled in
# NOTE(review): check make_dict for the exact spellings present in the data
for param in model.emb.parameters():
    print(param[?], param[?])