In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision 
from torchvision import transforms

from torch.utils.data import Dataset, DataLoader

from tqdm.autonotebook import tqdm

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow

import pandas as pd

from sklearn.metrics import accuracy_score

import time
import os

In [2]:
class View(nn.Module):
    """Layer that reshapes any tensor it receives to a fixed, pre-declared shape."""

    def __init__(self, *shape):
        """Store the target dimensions, supplied as positional integers."""
        super().__init__()
        self.shape = shape

    def forward(self, input):
        """Return ``input`` viewed with the dimensions given at construction."""
        target_dims = self.shape
        return input.view(*target_dims)

In [3]:
class Flatten(nn.Module):
    """Layer that keeps the first dimension and collapses all the others into one."""

    def forward(self, input):
        """Return ``input`` viewed as ``(input.size(0), -1)``."""
        leading_dim = input.shape[0]
        return input.view(leading_dim, -1)

In [4]:
def weight_reset(m):
    """
    Reset the weights of a single module `m` to a fresh initial state, if it knows how.

    Only `m` itself is handled: when it exposes a callable ``reset_parameters``
    that method is invoked, otherwise nothing happens. No recursion into child
    modules is done here (``nn.Module.apply`` can be used to visit them).

    Arguments:
    m -- any object; typically a PyTorch module

    Returns None.
    """
    # getattr + callable avoids building the full dir() listing for every module
    # and skips objects whose ``reset_parameters`` attribute is not a method.
    reset = getattr(m, "reset_parameters", None)
    if callable(reset):
        reset()
    return

In [24]:
# Absolute folder that every file helper below prefixes onto the names it is given.
# NOTE(review): this path is specific to one machine; adjust it before running elsewhere.
lesson_folder = '/Users/msarica/Desktop/DS606/DS_Capstone/Deliverable3/'


def get_filename(file_name):
    """Return `file_name` prefixed with `lesson_folder` (plain string concatenation)."""
    return lesson_folder + file_name


def delete_file(file_name):
    """Remove `file_name` from `lesson_folder`, or print a notice when it is not there."""
    target = lesson_folder + file_name
    if os.path.exists(target):
        os.remove(target)
    else:
        print ("file doesn't exist")

# delete_file('model.pt')

In [7]:
def _run_epoch(model, loss_func, loader, device, optimizer=None,
               collect_predictions=True, desc=None, disable_tqdm=False):
    """Make one pass over `loader` and return ``(summed_loss, y_true, y_pred)``.

    When `optimizer` is given, the model is updated after every batch and a
    per-batch progress bar is shown. Without it the pass only evaluates and
    runs with gradient tracking switched off.

    `summed_loss` adds up ``loss * batch_size`` over all batches. `y_true` and
    `y_pred` stay empty unless `collect_predictions` is true.
    """
    training = optimizer is not None
    running_loss = 0.0
    y_true = []
    y_pred = []

    if training:
        batches = tqdm(loader, desc=desc, leave=False, disable=disable_tqdm)
    else:
        batches = loader

    with torch.set_grad_enabled(training):
        for inputs, labels in batches:
            # Move the batch to the device we are using.
            inputs = inputs.to(device)
            labels = labels.to(device)

            batch_size = labels.shape[0]

            if training:
                # Gradients accumulate between calls, so clear the previous step's values first.
                optimizer.zero_grad()

            y_hat = model(inputs)
            loss = loss_func(y_hat, labels)

            if training:
                loss.backward()
                optimizer.step()

            running_loss += loss.item() * batch_size

            if collect_predictions:
                # Keep CPU copies so the score functions can work on NumPy data.
                y_true.extend(labels.detach().cpu().numpy())
                y_pred.extend(y_hat.detach().cpu().numpy())

    if collect_predictions:
        y_pred = np.asarray(y_pred)
        # More than one column is treated as per-class scores and reduced to labels.
        # NOTE(review): for outputs with more than two dimensions axis 1 is still the
        # one reduced -- confirm that is what the score functions expect.
        if y_pred.ndim > 1 and y_pred.shape[1] > 1:
            y_pred = np.argmax(y_pred, axis=1)

    return running_loss, y_true, y_pred


def train_network(
    model, 
    loss_func, 
    train_loader, 
    val_loader=None, 
    score_funcs=None,
    epochs=50, 
    device="cpu", 
    checkpoint_file=None,
    lr_schedule=None, 
    optimizer=None, 
    disable_tqdm=False,
    should_stop_early=None,
    override_early_stop_on_start=False
):
    """Train simple neural networks
    
    Keyword arguments:
    model -- the PyTorch model / "Module" to train
    loss_func -- the loss function that takes two arguments, the model outputs and the labels, and returns a score
    train_loader -- iterable (e.g. a PyTorch DataLoader) that yields (input, label) batches
    val_loader -- optional iterable of (input, label) batches to evaluate on after every epoch
    score_funcs -- a dictionary {name: func(y_true, y_pred)} of scoring functions used to evaluate the model
    epochs -- the number of training epochs to perform
    device -- the compute location to perform training
    checkpoint_file -- optional path; when the file exists training resumes from it, and it is rewritten after every epoch
    lr_schedule -- optional learning-rate scheduler, stepped once per epoch
    optimizer -- optimizer to use; defaults to AdamW over model.parameters()
    disable_tqdm -- set to True to hide the progress bars
    should_stop_early -- optional callable(results_dataframe, epoch); a truthy return value ends training after that epoch
    override_early_stop_on_start -- when False, a checkpoint that was saved as early-stopped is returned as-is without further training

    Returns a pandas DataFrame with one row per completed epoch. The loss columns
    hold the per-batch loss multiplied by the batch size and summed over the epoch
    (they are not averaged).
    """
    if score_funcs is None:
        score_funcs = {}#Empty set 
    
    to_track = ["epoch", "total time", "train loss"]
    if val_loader is not None:
        to_track.append("val loss")
    for eval_score in score_funcs:
        to_track.append("train " + eval_score )
        if val_loader is not None:
            to_track.append("val " + eval_score )
        
    total_train_time = 0 #How long have we spent in the training loop? 
    #Initialize every item with an empty list
    results = {item: [] for item in to_track}

    if optimizer is None:
        #The AdamW optimizer is a good default optimizer
        optimizer = torch.optim.AdamW(model.parameters())

    #Place the model on the correct compute resource (CPU or GPU)
    model.to(device)

    last_epoch = None
    if checkpoint_file is not None and os.path.exists(checkpoint_file):
        # NOTE(review): torch.load unpickles the file -- only resume from checkpoints you trust.
        checkpoint = torch.load(checkpoint_file, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        results = checkpoint['results']
        early_stop = checkpoint.get('early_stop', False)
        last_epoch = checkpoint['epoch']
        # Continue the cumulative clock from the saved run instead of restarting at zero.
        if results.get("total time"):
            total_train_time = results["total time"][-1]
        print ("last saved epoch is " , last_epoch+1)

        if not override_early_stop_on_start and early_stop:
            print ('early stopped')
            return pd.DataFrame.from_dict(results)

        if last_epoch == epochs-1:
            print ('model fully loaded - no further training needed')

    # Predictions are only needed for the score functions; skipping them otherwise
    # avoids copying every model output back to the CPU.
    need_predictions = bool(score_funcs)

    for epoch in tqdm(range(epochs), desc="Epoch", disable=disable_tqdm):
        # Epochs already covered by the checkpoint are skipped.
        if last_epoch is not None and epoch <= last_epoch:
            continue

        model = model.train()#Put our model in training mode

        start = time.time()
        running_loss, y_true, y_pred = _run_epoch(
            model, loss_func, train_loader, device,
            optimizer=optimizer,
            collect_predictions=need_predictions,
            desc="Train Batch",
            disable_tqdm=disable_tqdm,
        )
        total_train_time += (time.time() - start)
        
        results["epoch"].append( epoch )
        results["total time"].append( total_train_time )
        results["train loss"].append( running_loss )
        
        for name, score_func in score_funcs.items():
            results["train " + name].append( score_func(y_true, y_pred) )
      
        val_running_loss = None
        if val_loader is not None:#Lets find out validation performance as we go!
            model = model.eval() #Set the model to "evaluation" mode, b/c we don't want to make any updates!

            val_running_loss, y_true, y_pred = _run_epoch(
                model, loss_func, val_loader, device,
                collect_predictions=need_predictions,
            )

            # Record the validation loss itself, not the training loss.
            results["val loss"].append( val_running_loss )

            for name, score_func in score_funcs.items():
                score_text = "val " + name
                score_result = score_func(y_true, y_pred)
                print (score_text, score_result)
                results[score_text].append( score_result )
        
        #In PyTorch, the convention is to update the learning rate after every epoch
        if lr_schedule is not None:
            if isinstance(lr_schedule, torch.optim.lr_scheduler.ReduceLROnPlateau):
                # Without a validation set the training loss is the only metric available.
                plateau_metric = running_loss if val_running_loss is None else val_running_loss
                lr_schedule.step(plateau_metric)
            else:
                lr_schedule.step()
        
        # early stopping (experiment)
        early_stop_flag = bool(
            should_stop_early is not None
            and should_stop_early(pd.DataFrame.from_dict(results), epoch)
        )

        if checkpoint_file is not None:
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'results' : results,
                'early_stop': early_stop_flag
                }, checkpoint_file)

        # Stop regardless of whether a checkpoint file is in use.
        if early_stop_flag:
            break
            
    return pd.DataFrame.from_dict(results)

In [15]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [16]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
import re

all_data = []
# resp = urlopen("https://cs.stanford.edu/people/karpathy/char-rnn/shakespear.txt")
# Despite the variable name, the text comes from norvig.com/big.txt and is cut to
# its first 3,000,000 bytes. The response is used as a context manager so the
# HTTP connection is closed as soon as the body has been read.
with urlopen("https://norvig.com/big.txt") as resp:
    shakespear_100k = resp.read()[0:3000000]
# The cut above is made on bytes, so it could split a multi-byte UTF-8 character.
# errors='ignore' drops undecodable bytes (anywhere in the data) instead of raising.
shakespear_100k = shakespear_100k.decode('utf-8', errors='ignore').lower()

In [17]:
# Give every distinct character an integer id, numbered in order of first appearance.
vocab2indx = {}
for char in shakespear_100k:
    vocab2indx.setdefault(char, len(vocab2indx))

# Reverse lookup: integer id -> character.
indx2vocab = {indx: symbol for symbol, indx in vocab2indx.items()}

print("Vocab Size: ", len(vocab2indx))
print("Total Characters:", len(shakespear_100k))

Vocab Size:  67
Total Characters: 3000000


In [18]:
class AutoRegressiveDataset(Dataset):
    """
    Next-character prediction dataset cut from one long string.

    The string is split into consecutive, non-overlapping windows of MAX_CHUNK
    characters. Item `idx` is a pair (x, y) of int64 tensors of length MAX_CHUNK:
    x holds the vocabulary ids of the window's characters and y holds the ids of
    the same window shifted one character to the right, so y[i] is the character
    that follows x[i].
    """

    def __init__(self, vocab2indx, large_string, MAX_CHUNK=500):
        """
        vocab2indx: mapping from character to integer id
        large_string: the text to cut into windows
        MAX_CHUNK: number of characters per window
        """
        self.vocab2indx = vocab2indx
        self.doc = large_string
        self.MAX_CHUNK = MAX_CHUNK

    def __len__(self):
        #One character is held back so the last window still has a full-length
        #shifted target. Clamped at 0 so an empty document gives a valid length.
        return max(0, (len(self.doc)-1) // self.MAX_CHUNK)

    def __getitem__(self, idx):
        """Return the (input, target) tensors for window `idx`.

        Negative indices count from the end. Raises IndexError outside the valid
        range, which is also what lets plain iteration over the dataset terminate.
        """
        n_items = len(self)
        if idx < 0:
            idx = idx + n_items
        if not 0 <= idx < n_items:
            raise IndexError("index out of range for dataset with %d items" % n_items)

        start = idx*self.MAX_CHUNK
        stop = start + self.MAX_CHUNK
        
        #First we build our input values x, which is just each character that 
        #occurred in order. We use our vocabulary to map the characters to unique
        #IDs for the embedding layer our model will use
        x = [self.vocab2indx[c] for c in self.doc[start:stop]]
        
        #Now we build the target values, in the exact same way as we built our
        #inputs. The difference is that we shift the sub-string over by 1! 
        #This is because we are predicting the _next_ character! 
        y = [self.vocab2indx[c] for c in self.doc[start+1:stop+1]]
        
        return torch.tensor(x, dtype=torch.int64), torch.tensor(y, dtype=torch.int64)

In [19]:
class AutoRegressive(nn.Module):
    """Embedding -> stacked GRU cells (each followed by LayerNorm) -> two-layer output head.

    The recurrence is unrolled one time step at a time in `forward`, and a score
    for every vocabulary entry is produced at every step.
    """

    def __init__(self, num_embeddings, embd_size, hidden_size, padding_idx=None, layers=1):
        """
        num_embeddings: this is the size of our vocabulary, as each item will require its own embedding. 
        embd_size: how many dimensions to use for the embedding layer
        hidden_size: how many units to use in the RNN layers
        padding_idx: the token used to indicate padding
        layers:  the number of GRU layers to use in the auto-regressive model. 
        """
        super().__init__()
        self.padding_idx = padding_idx
        self.hidden_size = hidden_size
        self.embd = nn.Embedding(num_embeddings, embd_size, padding_idx=padding_idx)

        #The first cell reads embeddings, every further cell reads the hidden state below it
        cell_input_sizes = [embd_size] + [hidden_size] * (layers - 1)
        self.layers = nn.ModuleList([nn.GRUCell(in_size, hidden_size) for in_size in cell_input_sizes])
        self.norms = nn.ModuleList([nn.LayerNorm(hidden_size) for _ in range(layers)])

        #Output head applied to the top layer's activation at every step
        self.linear1 = nn.Linear(hidden_size, hidden_size)
        self.layernorm = nn.LayerNorm(hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_embeddings)

    def forward(self, input):
        """Map token ids of shape (B, T) to scores of shape (B, T, num_embeddings)."""
        batch_size = input.size(0)
        n_steps = input.size(1)

        embedded = self.embd(input) #(B, T, D)

        #Work on whatever device the model's weights live on
        device = self.embd.weight.device

        #(B, T) boolean mask, True wherever the token is not padding
        if self.padding_idx is None:
            mask = torch.ones_like(input, dtype=torch.bool)
        else:
            mask = input != self.padding_idx
        mask = mask.to(device)

        #One zero-initialised hidden state per layer
        hidden = [torch.zeros(batch_size, self.hidden_size, device=device) for _ in self.layers]

        top_activations = []
        for t in range(n_steps):
            #(B, 1) so that it broadcasts against the (B, hidden) states
            valid = mask[:, t].unsqueeze(1)

            layer_input = embedded[:, t, :] #(B, D)

            for depth, cell in enumerate(self.layers):
                previous = hidden[depth]
                updated = self.norms[depth](cell(layer_input, previous))

                #Padded positions keep their previous state, valid ones take the new one
                hidden[depth] = updated*valid + (~valid)*previous
                layer_input = updated
            top_activations.append(layer_input)

        stacked = torch.stack(top_activations, dim=1) #(B, T, hidden)

        #Apply the output head to the results over time
        head = self.layernorm(F.leaky_relu(self.linear1(stacked)))
        return self.linear2(head) #(B, T, hidden) -> (B, T, VocabSize)

In [21]:
# Cut the corpus into 250-character windows and serve them in shuffled batches of 128.
autoRegData = AutoRegressiveDataset(
    vocab2indx,
    shakespear_100k,
    MAX_CHUNK=250,
)
autoReg_loader = DataLoader(
    autoRegData,
    batch_size=128,
    shuffle=True,
)

In [22]:
# Build the model (vocabulary-sized output, 32-dim embeddings, two GRU layers of
# 128 units) and move it to the selected device in one go.
autoReg_model = AutoRegressive(len(vocab2indx), 32, 128, layers=2).to(device)

# Clamp every gradient element to the range [-2, 2] as it is computed.
for p in autoReg_model.parameters():
    p.register_hook(lambda grad: grad.clamp(-2, 2))

In [23]:
def CrossEntLossTime(x, y):
    """
    Cross-entropy loss added up over every time step.

    x: output predictions with shape (B, T, V)
    y: target labels with shape (B, T)

    Each step contributes the default (batch-averaged) CrossEntropyLoss of
    x[:, t, :] against y[:, t]; the T step losses are then summed.
    """
    criterion = nn.CrossEntropyLoss()
    step_losses = (criterion(x[:, step, :], y[:, step]) for step in range(x.size(1)))
    return sum(step_losses)
    

In [25]:
# Train for up to 20 epochs; progress is checkpointed to 'autoreg_model.pt' in the
# lesson folder, so re-running this cell resumes from that file when it exists.
train_network(
    autoReg_model,
    CrossEntLossTime,
    autoReg_loader,
    epochs=20,
    device=device,
    checkpoint_file=get_filename('autoreg_model.pt'),
)

HBox(children=(IntProgress(value=0, description='Epoch', max=20, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='Train Batch', max=94, style=ProgressStyle(description_width='…

KeyboardInterrupt: 