<a href="https://colab.research.google.com/github/narendra974/insidedeeplearning/blob/main/AutoRegressieveModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from tqdm.autonotebook import tqdm
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda

In [18]:
# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def moveTo(obj, device):
    """Recursively move ``obj`` (or everything inside it) to ``device``.

    obj -- anything exposing a ``.to(device)`` method, or a list / tuple /
           set / dict (arbitrarily nested) of such objects
    device -- the target passed straight through to ``.to``

    Returns a new container of the same kind with every movable element
    moved. Objects that are neither movable nor a supported container are
    returned unchanged.
    """
    if hasattr(obj, "to"):
        return obj.to(device)
    if isinstance(obj, list):
        return [moveTo(x, device) for x in obj]
    if isinstance(obj, tuple):
        return tuple(moveTo(x, device) for x in obj)
    if isinstance(obj, set):
        return {moveTo(x, device) for x in obj}
    if isinstance(obj, dict):
        # Keys are moved as well as values, mirroring the other containers.
        return {moveTo(key, device): moveTo(value, device) for key, value in obj.items()}
    # Nothing to move (ints, strings, None, ...): hand the object back as-is.
    return obj


def run_epoch(model, optimizer, data_loader, loss_func, device, results, score_funcs, prefix="", desc=None):
    """
    Run the model over every batch of data_loader once and record the results.

    model -- the PyTorch model / "Module" to run for one epoch
    optimizer -- the object that will update the weights of the network (only used when model.training is True)
    data_loader -- DataLoader object that returns tuples of (input, label) pairs.
    loss_func -- the loss function that takes in two arguments, the model outputs and the labels, and returns a score
    device -- the compute location to perform training
    results -- a dictionary of lists; the mean loss is appended under prefix + " loss" and each score under prefix + " " + name
    score_funcs -- a dictionary of scoring functions to use to evaluate the performance of the model (None is treated as empty)
    prefix -- a string to pre-fix to any scores placed into the _results_ dictionary.
    desc -- a description to use for the progress bar.

    Returns the wall-clock time, in seconds, spent iterating over the batches.
    """
    if score_funcs is None:
        score_funcs = {}

    running_loss = []
    y_true = []
    y_pred = []
    start = time.time()
    for inputs, labels in tqdm(data_loader, desc=desc, leave=False):
        #Move the batch to the device we are using.
        inputs = moveTo(inputs, device)
        labels = moveTo(labels, device)
        y_hat = model(inputs) #this just computed f_Θ(x(i))

        # Compute loss.
        loss = loss_func(y_hat, labels)

        #Only update the weights when the model is in training mode
        if model.training:
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        #Now we are just grabbing some information we would like to have
        running_loss.append(loss.item())

        if len(score_funcs) > 0 and isinstance(labels, torch.Tensor):
            #moving labels & predictions back to CPU for computing / storing predictions
            labels = labels.detach().cpu().numpy()
            y_hat = y_hat.detach().cpu().numpy()
            #add to predictions so far
            y_true.extend(labels.tolist())
            y_pred.extend(y_hat.tolist())
    #end training epoch
    end = time.time()

    y_pred = np.asarray(y_pred)
    if len(y_pred.shape) == 2 and y_pred.shape[1] > 1: #We have a classification problem, convert to labels
        y_pred = np.argmax(y_pred, axis=1)
    #Else, we assume we are working on a regression problem

    results[prefix + " loss"].append( np.mean(running_loss) )
    for name, score_func in score_funcs.items():
        #A metric that cannot be computed is recorded as NaN rather than aborting the run.
        #Catch Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        try:
            score = score_func(y_true, y_pred)
        except Exception:
            score = float("NaN")
        results[prefix + " " + name].append(score)
    return end-start #time spent on epoch


def train_simple_network(model, loss_func, train_loader, test_loader=None, score_funcs=None, 
                         epochs=50, device="cpu", checkpoint_file=None, lr=0.001):
    """Train simple neural networks with plain SGD

    Keyword arguments:
    model -- the PyTorch model / "Module" to train
    loss_func -- the loss function that takes in batch in two arguments, the model outputs and the labels, and returns a score
    train_loader -- PyTorch DataLoader object that returns tuples of (input, label) pairs.
    test_loader -- Optional PyTorch DataLoader to evaluate on after every epoch
    score_funcs -- A dictionary of scoring functions to use to evaluate the performance of the model
    epochs -- the number of training epochs to perform
    device -- the compute location to perform training
    checkpoint_file -- Optional path; when given, model / optimizer state and the results are saved there after the last epoch
    lr -- the learning rate handed to the SGD optimizer

    Returns a pandas DataFrame with one row per epoch and one column per tracked quantity.
    """
    #run_epoch iterates over the score functions, so normalise "no scores" to an empty dict
    if score_funcs is None:
        score_funcs = {}

    to_track = ["epoch", "total time", "train loss"]
    if test_loader is not None:
        to_track.append("test loss")
    for eval_score in score_funcs:
        to_track.append("train " + eval_score )
        if test_loader is not None:
            to_track.append("test " + eval_score )

    total_train_time = 0 #How long have we spent in the training loop?
    #Initialize every item with an empty list
    results = {item: [] for item in to_track}

    #SGD is Stochastic Gradient Descent. Use the caller's learning rate instead of a hard-coded one.
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    #Place the model on the correct compute resource (CPU or GPU)
    model.to(device)
    for epoch in tqdm(range(epochs), desc="Epoch"):
        model = model.train()#Put our model in training mode

        total_train_time += run_epoch(model, optimizer, train_loader, loss_func, device, results, score_funcs, prefix="train", desc="Training")

        results["total time"].append( total_train_time )
        results["epoch"].append( epoch )

        if test_loader is not None:
            model = model.eval()
            with torch.no_grad():
                run_epoch(model, optimizer, test_loader, loss_func, device, results, score_funcs, prefix="test", desc="Testing")

    #The checkpoint is written once, after the final epoch has finished
    if checkpoint_file is not None:
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'results' : results
            }, checkpoint_file)

    return pd.DataFrame.from_dict(results)


def train_network(model, loss_func, train_loader, val_loader=None, test_loader=None,score_funcs=None, 
                         epochs=50, device="cpu", checkpoint_file=None, 
                         lr_schedule=None, optimizer=None, disable_tqdm=False
                        ):
    """Train a neural network, optionally with validation / test evaluation and a learning rate schedule

    Keyword arguments:
    model -- the PyTorch model / "Module" to train
    loss_func -- the loss function that takes in batch in two arguments, the model outputs and the labels, and returns a score
    train_loader -- PyTorch DataLoader object that returns tuples of (input, label) pairs.
    val_loader -- Optional PyTorch DataLoader to evaluate on after every epoch
    test_loader -- Optional PyTorch DataLoader to evaluate on after every epoch
    score_funcs -- A dictionary of scoring functions to use to evaluate the performance of the model
    epochs -- the number of training epochs to perform
    device -- the compute location to perform training
    checkpoint_file -- Optional path; when given, model / optimizer state and the results are saved there after every epoch
    lr_schedule -- the learning rate schedule used to alter eta as the model trains. If this is not None then the user must also provide the optimizer to use. A ReduceLROnPlateau schedule is stepped with the latest "val loss", so it needs val_loader.
    optimizer -- the method used to alter the gradients for learning. Defaults to AdamW over model.parameters().
    disable_tqdm -- if True, the per-epoch progress bar is disabled

    Returns a pandas DataFrame with one row per epoch and one column per tracked quantity.
    """
    if score_funcs is None:
        score_funcs = {}#Empty set

    to_track = ["epoch", "total time", "train loss"]
    if val_loader is not None:
        to_track.append("val loss")
    if test_loader is not None:
        to_track.append("test loss")
    for eval_score in score_funcs:
        to_track.append("train " + eval_score )
        if val_loader is not None:
            to_track.append("val " + eval_score )
        if test_loader is not None:
            to_track.append("test "+ eval_score )

    total_train_time = 0 #How long have we spent in the training loop?
    #Initialize every item with an empty list
    results = {item: [] for item in to_track}

    if optimizer is None:
        #The AdamW optimizer is a good default optimizer
        optimizer = torch.optim.AdamW(model.parameters())

    #Place the model on the correct compute resource (CPU or GPU)
    model.to(device)
    for epoch in tqdm(range(epochs), desc="Epoch", disable=disable_tqdm):
        model = model.train()#Put our model in training mode

        total_train_time += run_epoch(model, optimizer, train_loader, loss_func, device, results, score_funcs, prefix="train", desc="Training")

        results["epoch"].append( epoch )
        results["total time"].append( total_train_time )

        if val_loader is not None:
            model = model.eval() #Set the model to "evaluation" mode, b/c we don't want to make any updates!
            with torch.no_grad():
                run_epoch(model, optimizer, val_loader, loss_func, device, results, score_funcs, prefix="val", desc="Validating")

        #In PyTorch, the convention is to update the learning rate after every epoch
        if lr_schedule is not None:
            if isinstance(lr_schedule, torch.optim.lr_scheduler.ReduceLROnPlateau):
                #Plateau detection is driven by the most recent validation loss
                lr_schedule.step(results["val loss"][-1])
            else:
                lr_schedule.step()

        if test_loader is not None:
            model = model.eval() #Set the model to "evaluation" mode, b/c we don't want to make any updates!
            with torch.no_grad():
                run_epoch(model, optimizer, test_loader, loss_func, device, results, score_funcs, prefix="test", desc="Testing")

        #Overwrite the checkpoint at the end of every epoch
        if checkpoint_file is not None:
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'results' : results
                }, checkpoint_file)

    return pd.DataFrame.from_dict(results)


def prediction(model, img):
    """Return the softmax of the model's output for a single 2-D image.

    model -- a callable that receives the image reshaped to (1, -1, w, h)
             and returns logits with the classes along dim 1
    img -- a 2-D array-like or torch.Tensor of shape (w, h)

    Returns a flat NumPy array holding the softmax probabilities.
    """
    with torch.no_grad():
        w, h = img.shape
        if not isinstance(img, torch.Tensor):
            img = torch.tensor(img)
        x = img.reshape(1, -1, w, h)
        logits = model(x)
        y_hat = F.softmax(logits, dim=1)
        # .numpy() only works on CPU tensors, so bring the result back first
        # (a no-op when it already lives on the CPU).
        return y_hat.cpu().numpy().flatten()

In [19]:
# One-shot debug flag: crossEntLossTime prints the tensor shapes while this is True.
printonce = True
all_data = []
# Download the corpus; the context manager closes the HTTP response once it has been read.
with urlopen("https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt") as resp:
    shakespear_100k = resp.read()
# Lower-case the text so upper- and lower-case letters share one vocabulary entry.
shakespear_100k = shakespear_100k.decode('utf-8').lower()
# Fall back to the CPU instead of failing later when no GPU is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [20]:
# Character -> integer id, numbered in order of first appearance in the corpus.
vocab2index = {}
for symbol in shakespear_100k:
    vocab2index.setdefault(symbol, len(vocab2index))

# Reverse lookup: integer id -> character.
index2Vocab = {idx: symbol for symbol, idx in vocab2index.items()}

print("Vocab Size ", len(vocab2index))
print("Total text size ", len(shakespear_100k))



Vocab Size  39
Total text size  1115394


In [21]:
class AutoRegressiveDataset(Dataset):
    """Next-character prediction dataset built from one long string.

    The string is cut into consecutive, non-overlapping chunks of
    ``max_chunk`` characters. Item ``idx`` is the pair ``(x, y)`` where ``x``
    holds the token ids of chunk ``idx`` and ``y`` holds the ids of the same
    chunk shifted one character to the right.
    """

    def __init__(self, large_string, max_chunk=500, vocab=None):
        """
        large_string -- the text to draw chunks from
        max_chunk -- the number of characters in every chunk
        vocab -- optional character -> id mapping; when omitted the
                 module-level ``vocab2index`` is looked up on each access
        """
        self.doc = large_string
        self.max_chunk = max_chunk
        self.vocab = vocab

    def __len__(self):
        # One character is held back so the shifted target of the last
        # chunk is always full length.
        return (len(self.doc) - 1) // self.max_chunk

    def __getitem__(self, idx):
        vocab = self.vocab if self.vocab is not None else vocab2index
        start = idx * self.max_chunk
        stop = start + self.max_chunk
        # The mapping is indexed, not called, to turn characters into ids.
        x = [vocab[c] for c in self.doc[start:stop]]
        y = [vocab[c] for c in self.doc[start + 1:stop + 1]]
        # int64 is the index dtype expected by nn.Embedding and CrossEntropyLoss.
        return torch.tensor(x, dtype=torch.int64), torch.tensor(y, dtype=torch.int64)
    

In [22]:
class AutoRegressiveModule(nn.Module):
    """Stack of GRU cells that emits a next-token prediction at every time step.

    Token ids are embedded, passed through ``layers`` GRU cells (each
    followed by a LayerNorm) and finally through a small fully connected
    head that produces ``num_embeddings`` logits.
    """

    def __init__(self, num_embeddings, embd_size, hidden_size, layers=1):
        """
        num_embeddings -- vocabulary size; also the number of output logits
        embd_size -- dimension of the token embeddings
        hidden_size -- dimension of every GRU hidden state
        layers -- number of stacked GRU cells
        """
        super(AutoRegressiveModule, self).__init__()
        self.hidden_size = hidden_size
        self.embd = nn.Embedding(num_embeddings, embd_size)
        # The first cell consumes embeddings, every later cell consumes hidden states.
        self.layers = nn.ModuleList(
            [nn.GRUCell(embd_size, hidden_size)]
            + [nn.GRUCell(hidden_size, hidden_size) for _ in range(layers - 1)]
        )
        self.norms = nn.ModuleList([nn.LayerNorm(hidden_size) for _ in range(layers)])
        self.pred_class = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.LeakyReLU(),
            nn.LayerNorm(hidden_size),
            nn.Linear(hidden_size, num_embeddings),
        )

    def initHiddenStates(self, B):
        """Return one all-zero (B, hidden_size) state per GRU layer."""
        # Allocate next to the module's own parameters rather than on a
        # notebook-level global, so the model works wherever it was moved.
        ref = self.embd.weight
        return [
            torch.zeros(B, self.hidden_size, device=ref.device, dtype=ref.dtype)
            for _ in range(len(self.layers))
        ]

    def step(self, x_in, h_prevs):
        """Advance every layer by one time step.

        x_in -- either a 1-D tensor of token ids (embedded here) or an
                already embedded batch
        h_prevs -- list with one hidden state per layer, updated in place;
                   None starts from fresh zero states

        Returns the logits produced for this time step.
        """
        if len(x_in.shape) == 1:
            x_in = self.embd(x_in)

        if h_prevs is None:
            h_prevs = self.initHiddenStates(x_in.shape[0])

        # A GRUCell has a single output: it is both the hidden state carried
        # to the next time step and the input of the next layer.
        for l, cell in enumerate(self.layers):
            h = self.norms[l](cell(x_in, h_prevs[l]))
            h_prevs[l] = h
            x_in = h

        return self.pred_class(x_in)

    def forward(self, input):
        """Map a (B, T) batch of token ids to logits stacked along dim 1."""
        B = input.size(0)
        T = input.size(1)
        x = self.embd(input)
        h_prevs = self.initHiddenStates(B)

        # h_prevs is mutated by step(), which carries the state across time.
        last_activations = [self.step(x[:, t, :], h_prevs) for t in range(T)]
        return torch.stack(last_activations, dim=1)

In [23]:
def crossEntLossTime(x, y):
    """Sum the cross-entropy loss over every time step of a sequence batch.

    x -- model outputs indexed as x[:, t, :], i.e. one row of class scores
         per batch item at each time step t
    y -- target class indices indexed as y[:, t]

    Returns the per-step cross-entropy (averaged over the batch by
    nn.CrossEntropyLoss) summed across all time steps.
    """
    # The flag lives at module level. Without this declaration the assignment
    # below would make it a local name and the read would raise
    # UnboundLocalError.
    global printonce

    cel = nn.CrossEntropyLoss()
    T = x.size(1)
    loss = 0

    # One-shot debug print of the tensor shapes. A missing flag means "do not print".
    if globals().get("printonce") is True:
        print(x.shape)
        print(y.shape)
        printonce = False

    for t in range(T):
        loss += cel(x[:, t, :], y[:, t])

    return loss


In [None]:
# Cut the corpus into 250-character chunks and serve them in shuffled batches of 128.
autoRegData = AutoRegressiveDataset(shakespear_100k, max_chunk=250)
autoRegLoader = DataLoader(autoRegData, batch_size=128, shuffle=True)

# Two GRU layers with 128 hidden units on top of 32-dimensional character embeddings.
autoReg_model = AutoRegressiveModule(len(vocab2index), 32, 128, layers=2)
autoReg_model = autoReg_model.to(device)

# Clip every gradient element to [-2, 2] as it is computed.
for param in autoReg_model.parameters():
    param.register_hook(lambda grad: grad.clamp(-2, 2))

In [None]:
from torch.nn.modules.loss import CrossEntropyLoss

# Train for 100 epochs; the loss is the cross-entropy summed over time steps.
train_network(
    autoReg_model,
    crossEntLossTime,
    autoRegLoader,
    epochs=100,
    device=device,
)