In [1]:
from google.colab import files, auth, drive
from urllib.request import urlopen
from typing import List, Dict, Callable
from collections import Counter
from os import path
import numpy as np
import torch
from torch import nn
from torch.optim import SGD, Adam
from matplotlib import pyplot as plt
import glob
import json
import unicodedata
import string

############################## CONSTANTS #######################################
def findFiles(path): return glob.glob(path)

def readfile(filepath: str) -> str:
    """
    Reads file and returns its content as a string.
    """
    #response = urlopen(url)
    #body = response.read().decode('utf-8')
    with open(filepath, "r") as f:
        body = f.read()
    
    return body.encode('ascii', 'ignore').decode("utf-8")

ROOT_COLAB_FOLDER = "/content"
filename = "train-data.json" #modify filename here
filepath = path.join(ROOT_COLAB_FOLDER, filename)

with open(filepath, "r") as f:
  trainData = json.load(f)


filename_gt = "code-train-gt.json"
filepath_gt = path.join(ROOT_COLAB_FOLDER, filename_gt)

with open(filepath_gt, "r") as f:
  trainData_gt = json.load(f)

filename_test = "test-data.json" #modify filename here
filepath_test = path.join(ROOT_COLAB_FOLDER, filename_test)

with open(filepath_test, "r") as f:
  _testData = json.load(f)


filename_test_gt = "code-test-gt.json"
filepath_test_gt = path.join(ROOT_COLAB_FOLDER, filename_test_gt)

with open(filepath_test_gt, "r") as f:
  testData_gt = json.load(f)

languages2Labels = {
    "JavaScript" : 0,
    "Java" : 1,
    "SQL" : 2,
    "Python" : 3,
    "c++" : 4,
    "c#" : 5
}
nrLanguages = len(languages2Labels.keys())

trainData_gt_labels = {}
for key, value in trainData_gt.items():
  trainData_gt_labels[key] = languages2Labels[value]
  
print(trainData_gt_labels["00025cd483baf04e1862328f61b99f8c40c62da4V_PO_DETAILS_0"])

for key, value in trainData.items():
  try:
    label = trainData_gt_labels[key] # check for no KeyErrors
  except:
    print(key)
    
testData_gt_labels = {}
for key, value in testData_gt.items():
  testData_gt_labels[key] = languages2Labels[value]

2


In [2]:
class Vocabulary:
    """
    Helper class that maps characters to unique indices and the other way around
    """
    def __init__(self, text: str):
        #special character for padding shorter sequences in a mini-batch
        characters_set = set("<PAD>") 
        characters_set.update(text)
        
        self.char_to_idx = {char:idx for (idx, char) 
                            in enumerate(characters_set)}
        self.idx_to_char = {idx:char for (idx, char) 
                            in enumerate(characters_set)}
   
    def size(self):
        return len(self.char_to_idx)
      
    def __str__(self):
        return str(self.char_to_idx)

text = ""
for key, value in trainData.items():
  text = text + value

vocab = Vocabulary(text)
print("Vocabulary size: ", vocab.size())
print("Vocabulary: \n", vocab)

print(len(trainData.keys()))
print(len(_testData.keys()))

Vocabulary size:  98
Vocabulary: 
 {'A': 0, 'u': 1, 'a': 2, 'J': 3, 'B': 4, '?': 5, 'N': 6, 'e': 7, 'r': 8, '/': 9, '`': 10, '2': 11, 'R': 12, 'h': 13, '1': 14, '_': 15, 'f': 16, ']': 17, '3': 18, 'I': 19, 'n': 20, '=': 21, 'q': 22, 'K': 23, '9': 24, 'F': 25, 'd': 26, 'P': 27, '}': 28, 'E': 29, '#': 30, '*': 31, 'i': 32, 'x': 33, 'p': 34, '8': 35, 't': 36, '^': 37, '[': 38, '\\': 39, 'o': 40, '{': 41, 'G': 42, 'l': 43, '(': 44, '|': 45, ' ': 46, 'w': 47, '4': 48, ':': 49, 'X': 50, 'b': 51, '-': 52, '\x01': 53, 'y': 54, '.': 55, '>': 56, 'C': 57, 'Z': 58, 'v': 59, '@': 60, 'j': 61, 'Y': 62, '\t': 63, '~': 64, ';': 65, 'W': 66, 'k': 67, 'g': 68, '$': 69, 'c': 70, 'z': 71, 'M': 72, '<': 73, 'O': 74, 'T': 75, '6': 76, '%': 77, 'H': 78, '0': 79, 'Q': 80, '\n': 81, 'U': 82, '5': 83, '+': 84, 's': 85, '7': 86, 'V': 87, 'L': 88, "'": 89, '!': 90, ',': 91, '"': 92, '&': 93, ')': 94, 'D': 95, 'm': 96, 'S': 97}
11100
2995


In [3]:
def text_to_tensor(text: str, vocab: Vocabulary) -> torch.LongTensor:
    """
    Convert a string to a Tensor with corresponding character indices
    e.g. "We have" -> [12, 6, 20, 13, 1, 25, 6] 
    """
    text_indices = [vocab.char_to_idx[c] for c in text]
    
    return torch.tensor(text_indices)
  
def tensor_to_text(x: torch.LongTensor, vocab: Vocabulary) -> str:
    """
    Convert a Tensor of character indices to its string representation
    e.g. [12, 6, 20, 13, 1, 25, 6] -> "We have"
    """
    return "".join(vocab.idx_to_char[idx.item()] for idx in x)

def batch_tensor_to_text(x: torch.LongTensor, vocab: Vocabulary) -> List[str]:
    """
    The batched version of tensor_to_text
    E.g. [[2, 1, 3, 0, 0], [3, 1, 20]] -> [bac, cat]
    :param x: Tensor of size (batch_size x time_steps)
    :return: a list of corresponding strings 
    """
    assert len(x.size()) == 2, "wrong number of dimensions (should be 2)"
    outputs = []
    for batch_idx in range(len(x)):
        outputs.append(tensor_to_text(x[batch_idx], vocab))
            
    return outputs 
  
#check that a random text is correctly converted to numbers and back to text
random_text = "I have apples"
encoded_text = text_to_tensor(random_text, vocab)
decoded_text = tensor_to_text(encoded_text, vocab)
assert random_text == decoded_text, "Not the same text as the original"
print(encoded_text)
print(decoded_text)

#convert input text to numbers
data = {}
for key, value in trainData.items():
  data[key] = text_to_tensor(value, vocab)
  #print("Size of input tensor is: ", data[key].size(0))
  
#convert input text to numbers
testData = {}
for key, value in _testData.items():
  testData[key] = text_to_tensor(value, vocab)

#setup device (CPU/GPU)
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

print(device)
#move tensor to CUDA if available
for key, value in data.items(): # move data
  data[key] = data[key].to(device)

for key, value in testData.items(): # move data
  testData[key] = testData[key].to(device)

for key, value in trainData_gt_labels.items(): # also move labels and convert to tensor
  trainData_gt_labels[key] = torch.ones(1).to(device) * trainData_gt_labels[key]

for key, value in testData_gt_labels.items(): # also move labels and convert to tensor
  testData_gt_labels[key] = torch.ones(1).to(device) * testData_gt_labels[key]

tensor([19, 46, 13,  2, 59,  7, 46,  2, 34, 34, 43,  7, 85])
I have apples
cuda:0


In [4]:
# CREATE TENSORS OF THE BATCHES - FOR TRAIN
# Will have 2 dicts : batches_x and batches_y, mapping from batchNumber to a tensor

keys = list(data.keys())
nrKeys = len(keys)

#from random import shuffle

#shuffle(keys)

batch_size = 32
max_len = 250

nrBatches = nrKeys // batch_size
batches_x = {}
batches_y = {}
for i in range(nrBatches):
  currentKeys = keys[i*batch_size : (i+1)*batch_size]
  batch_x = torch.Tensor(())
  batch_x = batch_x.to(device)
  batch_y = torch.Tensor(())
  batch_y = batch_y.to(device)
  for j in range(batch_size):
    current_x = data[currentKeys[j]][:max_len]
    current_x = current_x.view((1, max_len))
    
    current_y = trainData_gt_labels[currentKeys[j]]
    ##current_y = current_y.view((1, 1))
    #current_y = torch.zeros((1, nrLanguages))
    #current_y[0, trainData_gt_labels[currentKeys[j]].long()] = 1
    #current_y = current_y.to(device)
    
    batch_x = torch.cat((batch_x, current_x.float()), 0)
    batch_y = torch.cat((batch_y, current_y.float()), 0)
  
  batches_x[i] = batch_x
  batches_y[i] = batch_y
  print(batch_x.shape)
  print(batch_y.shape)
  print(i, 'out of', nrBatches)

torch.Size([32, 250])
torch.Size([32])
0 out of 346
torch.Size([32, 250])
torch.Size([32])
1 out of 346
torch.Size([32, 250])
torch.Size([32])
2 out of 346
torch.Size([32, 250])
torch.Size([32])
3 out of 346
torch.Size([32, 250])
torch.Size([32])
4 out of 346
torch.Size([32, 250])
torch.Size([32])
5 out of 346
torch.Size([32, 250])
torch.Size([32])
6 out of 346
torch.Size([32, 250])
torch.Size([32])
7 out of 346
torch.Size([32, 250])
torch.Size([32])
8 out of 346
torch.Size([32, 250])
torch.Size([32])
9 out of 346
torch.Size([32, 250])
torch.Size([32])
10 out of 346
torch.Size([32, 250])
torch.Size([32])
11 out of 346
torch.Size([32, 250])
torch.Size([32])
12 out of 346
torch.Size([32, 250])
torch.Size([32])
13 out of 346
torch.Size([32, 250])
torch.Size([32])
14 out of 346
torch.Size([32, 250])
torch.Size([32])
15 out of 346
torch.Size([32, 250])
torch.Size([32])
16 out of 346
torch.Size([32, 250])
torch.Size([32])
17 out of 346
torch.Size([32, 250])
torch.Size([32])
18 out of 346
tor

In [5]:
# CREATE TENSORS OF THE BATCHES - FOR TRAIN
# Will have 2 dicts : batches_x and batches_y, mapping from batchNumber to a tensor

keys = list(data.keys())
nrKeys = len(keys)

#from random import shuffle

#shuffle(keys)

batch_size = 32
max_len = 250

nrBatches = nrKeys // batch_size
batches_x = {}
batches_y = {}
for i in range(nrBatches):
  currentKeys = keys[i*batch_size : (i+1)*batch_size]
  batch_x = torch.Tensor(())
  batch_x = batch_x.to(device)
  batch_y = torch.Tensor(())
  batch_y = batch_y.to(device)
  for j in range(batch_size):
    current_x = data[currentKeys[j]][:max_len]
    current_x = current_x.view((1, max_len))
    
    current_y = trainData_gt_labels[currentKeys[j]]
    ##current_y = current_y.view((1, 1))
    #current_y = torch.zeros((1, nrLanguages))
    #current_y[0, trainData_gt_labels[currentKeys[j]].long()] = 1
    #current_y = current_y.to(device)
    
    batch_x = torch.cat((batch_x, current_x.float()), 0)
    batch_y = torch.cat((batch_y, current_y.float()), 0)
  
  batches_x[i] = batch_x
  batches_y[i] = batch_y
  print(batch_x.shape)
  print(batch_y.shape)
  print(i, 'out of', nrBatches)

torch.Size([32, 250])
torch.Size([32])
0 out of 346
torch.Size([32, 250])
torch.Size([32])
1 out of 346
torch.Size([32, 250])
torch.Size([32])
2 out of 346
torch.Size([32, 250])
torch.Size([32])
3 out of 346
torch.Size([32, 250])
torch.Size([32])
4 out of 346
torch.Size([32, 250])
torch.Size([32])
5 out of 346
torch.Size([32, 250])
torch.Size([32])
6 out of 346
torch.Size([32, 250])
torch.Size([32])
7 out of 346
torch.Size([32, 250])
torch.Size([32])
8 out of 346
torch.Size([32, 250])
torch.Size([32])
9 out of 346
torch.Size([32, 250])
torch.Size([32])
10 out of 346
torch.Size([32, 250])
torch.Size([32])
11 out of 346
torch.Size([32, 250])
torch.Size([32])
12 out of 346
torch.Size([32, 250])
torch.Size([32])
13 out of 346
torch.Size([32, 250])
torch.Size([32])
14 out of 346
torch.Size([32, 250])
torch.Size([32])
15 out of 346
torch.Size([32, 250])
torch.Size([32])
16 out of 346
torch.Size([32, 250])
torch.Size([32])
17 out of 346
torch.Size([32, 250])
torch.Size([32])
18 out of 346
tor

In [6]:
print('Total nr. of batches :', nrBatches)

#split corpus into training/eval set
train_dev_cutoff = int(nrBatches * 0.8)

train_data_x = {}
train_data_y = {}
for i in range(train_dev_cutoff):
  train_data_x[i] = batches_x[i].long()
  train_data_y[i] = batches_y[i].long()
  
dev_data_x = {}
dev_data_y = {}
for i in range(train_dev_cutoff, nrBatches):
  dev_data_x[i] = batches_x[i].long()
  dev_data_y[i] = batches_y[i].long()

nrBatchesTrain = len(train_data_x.keys())
print(train_dev_cutoff)
print(nrBatchesTrain)
nrBatchesDev = len(dev_data_x.keys())
print(nrBatchesDev)
print(nrBatches - nrBatchesTrain)

Total nr. of batches : 346
276
276
70
70


In [7]:
print(train_data_x[0].device)
print(train_data_x[0].type())

cuda:0
torch.cuda.LongTensor


In [8]:
char_idx = torch.LongTensor([7])

#create one-hot representation for 7
#this is a vector of zeroes, except for the 7th position, where it has a one
char_one_hot = torch.zeros(vocab.size())
char_one_hot[char_idx] = 1
print("One-hot representation for 7th caracter: ", char_one_hot)
print("Lungimea: ", len(char_one_hot))
#create an Embedding layer that outputs vectors of size 100; 
#behind the scenes, this is just a weight matrix of size vocab_size x 100
emb_layer = nn.Embedding(vocab.size(), 100)
W = emb_layer.weight #this is the actual weight matrix 
print("Embedding matrix size: ", W.size())

#multiplying the one-hot vector for 7 by the weight matrix results in a vector 
#equal to the 7th row of the matrix
emb_one_hot = char_one_hot @ W
print("emb_one_hot size: ", emb_one_hot.size())

#the emb_layer receives an index and returns the corresponding row at that index
#in the weight matrix
emb_idx = emb_layer(char_idx).squeeze()
print("emb_idx size: ", emb_idx.size())

if emb_one_hot.equal(emb_idx):
    print("Same embedding")
print("Linia de la embeding: ",emb_one_hot)

One-hot representation for 7th caracter:  tensor([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.])
Lungimea:  98
Embedding matrix size:  torch.Size([98, 100])
emb_one_hot size:  torch.Size([100])
emb_idx size:  torch.Size([100])
Same embedding
Linia de la embeding:  tensor([-1.6235,  0.9393,  0.8173, -2.0041, -1.4562, -2.5784, -0.5761, -0.0266,
         0.7691, -0.0843,  1.7956, -0.3173,  1.7894, -0.7176,  0.7972,  1.0033,
        -0.0489,  1.4673, -0.4491,  1.2312, -0.3215, -1.2506,  1.9888,  0.9312,
         0.3012, -1.3068,  1.1241, -1.5193,  0.8069, -0.3610, -1.2656,  1.0549,
         0.5975,  0.0557,  0.

In [0]:
############################## PARAMETERS ######################################
_hyperparameters_dict = {
    "batch_size": 32,
    "num_epochs": 10,
    "max_len": 250,
    "embedding_size": 100, 
    "rnn_size": 512,
    "learning_algo": "adam",
    "learning_rate": 0.001,
    "max_grad_norm": 5.0
}

In [0]:
class RNNLM(nn.Module):
    def __init__(self, vocab_size: int, char_embedding_size: int, 
                 rnn_size: int):
        super().__init__()
        #TODO
        self.vocab_size = vocab_size
        self.char_embedding_size = char_embedding_size
        self.rnn_size = rnn_size
        
        self.embedding = nn.Embedding(num_embeddings = vocab_size, 
                                      embedding_dim = char_embedding_size)

        self.rnn_cell = nn.GRUCell(input_size = char_embedding_size,
                                   hidden_size = rnn_size)
        self.logits = nn.Linear(in_features=rnn_size, out_features=6)
        self.softmax = nn.Softmax(dim = 2)
        
        self.loss = nn.CrossEntropyLoss()
    
    def get_loss(self, logits: torch.FloatTensor, y: torch.FloatTensor):
        """
        Computes loss for a batch of sequences. The sequence loss is the 
        average of the individual losses at each timestep. The batch loss is
        the average of sequence losses across all batches.

        :param logits: unnormalized probabilities for T timesteps, size
                       batch_size x max_timesteps x vocab_size
        :param y: ground truth values (index of correct characters), size
                  batch_size x max_timesteps
        :returns: loss as a scalar
        """
        #print('Logits shape :', logits.shape)
        #print('Target shape :', y.shape)
        return self.loss(logits, y)
        
    
    def get_logits(self, hidden_states: torch.FloatTensor, 
                   temperature: float = 1.0):
        """
        Computes the unnormalized probabilities from hidden states. Optionally
        divide logits by a temperature, in order to influence predictions at 
        test time (https://www.quora.com/What-is-Temperature-in-LSTM)
        
        :param hidden_states: tensor of size batch_size x timesteps x rnn_size
        :param temperature: coefficient that scales outputs before turning them 
        to probabilities. A low temperature (0.1) results in more conservative 
        predictions, while a higher temperature (0.9) results in more diverse
        predictions
        
        :return: tensor of size batch_size x timesteps x vocab_size
        """
        #print('Hidden states shape :', hidden_states.shape)
        return self.logits(hidden_states) / temperature
        
    def forward(self, x: torch.LongTensor, 
                hidden_start: torch.FloatTensor = None) -> torch.FloatTensor:
        """
        Computes the hidden states
        for the current batch (x, y). 
        :param x: input of size batch_size x max_len
        :param hidden_start: hidden state at time step t = 0, 
                             size batch_size x rnn_size
        :return: hidden states at all timesteps, 
                 size batch_size x timesteps x rnn_size
        """
        max_len = x.size(1)
        
        #batch_size x max_len x embedding_dim
        x_embedded = self.embedding(x)
        
        #compute hidden states and logits for each time step
        prev_hidden = hidden_start
        for t in range(max_len):
            hidden_state = self.rnn_cell(x_embedded[:,t,:], prev_hidden)
            prev_hidden = hidden_state
        
        return hidden_state

In [0]:
#instantiate the RNNLM module
network = RNNLM(vocab.size(), 
            _hyperparameters_dict['embedding_size'], 
            _hyperparameters_dict['rnn_size'])

# move network to GPU if available
network = network.to(device)

optimizer = Adam(params = network.parameters(), lr=0.001)

In [12]:
# CHECKPOINT: make sure you understand each parameter size
print("Neural network parameters: ")
for param_name, param in network.named_parameters():
    print("\t" + param_name, " size: ", param.size())

Neural network parameters: 
	embedding.weight  size:  torch.Size([98, 100])
	rnn_cell.weight_ih  size:  torch.Size([1536, 100])
	rnn_cell.weight_hh  size:  torch.Size([1536, 512])
	rnn_cell.bias_ih  size:  torch.Size([1536])
	rnn_cell.bias_hh  size:  torch.Size([1536])
	logits.weight  size:  torch.Size([6, 512])
	logits.bias  size:  torch.Size([6])


In [0]:
# CHECKPOINT: make sure you can feedforward and backpropagate through network
xb, yb = batches_x[0].long(), batches_y[0].long()

hidden_start = torch.zeros(_hyperparameters_dict["batch_size"],
                            _hyperparameters_dict["rnn_size"]).to(device)
hidden_states = network(xb, hidden_start)
logits = network.get_logits(hidden_states)
loss = network.get_loss(logits, yb)
loss.backward()

In [14]:
#train the network for 60 iterations and save save hidden states (one slice of 
#the batch) every 30 iterations

prev_hidden = torch.zeros(_hyperparameters_dict["batch_size"],
                           _hyperparameters_dict["rnn_size"]).to(device)

for i in range(nrBatches):
    #print(i, 'out of', nrBatches)
    xb, yb = batches_x[i], batches_y[i]
    
    #gradients are set to zero every new batch
    optimizer.zero_grad()
    
    #feedforward
    hidden_states = network(xb.long(), prev_hidden)
    logits = network.get_logits(hidden_states)
    loss = network.get_loss(logits, yb.long())
    
    #backpropagation -> compute gradient of loss with respect to all weights
    loss.backward()
    
    #clip gradients if they get have norm > 5.0
    torch.nn.utils.clip_grad_norm_(list(network.parameters()), 5.0)
    
    #update weights 
    optimizer.step()

    #the hidden states from iteration it should no longer be linked to 
    #the hidden states from iteration (it+1)
    hidden_states.detach_()
    
    if i % 60 == 0:
        print("Iteration %d, loss = %f" %(i, loss))
    
    prev_hidden = hidden_states

Iteration 0, loss = 1.811685
Iteration 60, loss = 1.438825
Iteration 120, loss = 1.430934
Iteration 180, loss = 1.311178
Iteration 240, loss = 1.565592
Iteration 300, loss = 0.517341


In [0]:
import torch.nn.functional as F

class Trainer:
    def __init__(self, model: nn.Module, 
                 train_data_x: torch.LongTensor,
                 train_data_y: torch.LongTensor,
                 dev_data_x: torch.LongTensor,
                 dev_data_y: torch.LongTensor,
                 test_data_x: torch.LongTensor,
                 test_data_y: torch.LongTensor,
                 vocab: Vocabulary, 
                 hyperparams: Dict):
        self.model = model
        self.train_data_x = train_data_x
        self.train_data_y = train_data_y
        self.dev_data_x = dev_data_x
        self.dev_data_y = dev_data_y
        self.test_data_x = test_data_x
        self.test_data_y = test_data_y
        self.vocab = vocab
        if hyperparams['learning_algo'] == 'adam':
            self.optimizer = Adam(params = self.model.parameters(),
                                  lr = hyperparams['learning_rate'])
        else:
            self.optimizer = SGD(params = self.model.parameters(), 
                                 lr = hyperparams['learning_rate'])
        self.num_epochs = hyperparams['num_epochs']
        #self.max_len = hyperparams['max_len'] # batch-urile sunt deja construite cu 
        self.batch_size = hyperparams['batch_size']
        self.rnn_size = hyperparams['rnn_size']
        self.max_grad_norm = hyperparams['max_grad_norm']
        
        #number of characters in training/dev data
        self.nrTrainBatches = len(train_data_x.keys())
        self.nrDevBatches = len(dev_data_x.keys())
        self.nrTestBatches = len(test_data_x.keys())
        
        self.temperature = hyperparams['temperature']
        
    def train_epoch(self, epoch_num: int) -> float:
        """
        Compute the loss on the training set
        :param epoch_num: number of current epoch
        """
        self.model.train()
        epoch_loss = 0.0
        hidden_start = torch.zeros(self.batch_size, self.rnn_size).to(device)
        
        for i in range(train_dev_cutoff):
            #print(i, 'out of', nrBatches)
            x, y = self.train_data_x[i], self.train_data_y[i]

            # reset gradients
            self.optimizer.zero_grad()
          
            # compute hidden states
            # batch x timesteps x hidden_size
            hidden_states = self.model(x, hidden_start)
            
            # compute unnormalized probabilities
            # batch x timesteps x vocab_size
            logits = self.model.get_logits(hidden_states, self.temperature)
            
            # compute loss
            # scalar
            batch_loss = self.model.get_loss(logits, y)
            epoch_loss += batch_loss.item()
                       
            # backpropagate loss
            batch_loss.backward()
            
            # clip gradients if they get too large
            torch.nn.utils.clip_grad_norm_(list(self.model.parameters()), 
                                           self.max_grad_norm)
            
            # update parameters
            self.optimizer.step()
            
            # we use a stateful RNN, which means the first hidden state for the
            # next batch is the last hidden state of the current batch
            hidden_states.detach_()
            hidden_start = hidden_states
            
            if i % 100 == 0:
                print("epoch %d, %d/%d batches, batch loss = %f"
                      % (epoch_num, (i + 1), 
                         self.nrTrainBatches, batch_loss.item()))

        epoch_loss /= self.nrTrainBatches
        
        return epoch_loss

    def eval_epoch(self, epoch_num: int) -> float:
        """
        Compute the loss on the validation set
        :param epoch_num: number of current epoch
        """
        epoch_loss = 0.0
        hidden_start = torch.zeros(self.batch_size, self.rnn_size).to(device)
        with torch.no_grad():
            for i in range(train_dev_cutoff, nrBatches):
                #print(i, 'out of', nrBatches)
                x, y = self.dev_data_x[i], self.dev_data_y[i]
                
                #batch x timesteps x hidden_size
                hidden_states = self.model(x, hidden_start)
            
                #batch x timesteps x vocab_size
                logits = self.model.get_logits(hidden_states, self.temperature)
            
                batch_loss = self.model.get_loss(logits, y)
                epoch_loss += batch_loss.item()
                
                # we use a stateful RNN, which means the first hidden state for 
                # the next batch is the last hidden state of the current batch
                hidden_states.detach_()
                hidden_start = hidden_states
                
            epoch_loss /= self.nrDevBatches
        
        return epoch_loss    
            
    def train(self) -> Dict:
        train_losses, dev_losses = [], []
        for epoch in range(self.num_epochs):
            epoch_train_loss = self.train_epoch(epoch)
            epoch_dev_loss = self.eval_epoch(epoch)
            train_losses.append(epoch_train_loss)
            dev_losses.append(epoch_dev_loss)
        return {"train_losses": train_losses,
                "dev_losses": dev_losses}
      
    def test(self):
        hidden_start = torch.zeros(self.batch_size, self.rnn_size).to(device)
        with torch.no_grad():
            cnt = 0
            meanAcc = 0
            conf_mat = torch.zeros((nrLanguages, nrLanguages))
            #for i in range(train_dev_cutoff, nrBatches):
            for i in range(nrBatchesTest):
                cnt = cnt + 1
                #print(i, 'out of', nrBatches)
                #x, y = self.dev_data_x[i], self.dev_data_y[i]
                x, y = self.test_data_x[i], self.test_data_y[i]
                
                #batch x timesteps x hidden_size
                hidden_states = self.model(x, hidden_start)
            
                #batch x timesteps x vocab_size
                logits = self.model.get_logits(hidden_states, self.temperature)
            
                sft = F.softmax(logits)
                pred = torch.argmax(sft, dim=1)

                nrPredictedOk = (pred == y).sum()

                acc = float(nrPredictedOk) / batch_size
                meanAcc = meanAcc + acc
                
                #print(acc)
                # update confusion matrix
                for it in range(batch_size):
                    p = pred[it]
                    t = y[it].long()
                    
                    conf_mat[t][p] = conf_mat[t][p] + 1
                
                # we use a stateful RNN, which means the first hidden state for 
                # the next batch is the last hidden state of the current batch
                hidden_states.detach_()
                hidden_start = hidden_states
                        
        meanAcc = meanAcc / cnt
        return meanAcc, conf_mat

def plot_losses(metrics: Dict):
    """
    Plots training/validation losses.
    :param metrics: dictionar
    """
    plt.figure()
    plt.plot(metrics['train_losses'], c='b', label='Train')
    plt.plot(metrics['dev_losses'], c='g', label='Valid')
    plt.ylabel('Loss')
    plt.xlabel('Iteration')
    plt.legend()
    plt.show()

In [18]:
#train network for some epoch
trainer = Trainer(network, train_data_x, train_data_y, dev_data_x, dev_data_y, test_data_x, test_data_y, vocab, _hyperparameters_dict)

metrics = trainer.train()

NameError: ignored

In [0]:
#plot training and validations losses each epoch
plot_losses(metrics)