# Part 1 for Assignment 2, LT2326, Autumn 2021

Note that part 1 and part 2 have been done in reverse order.  

**Name**: Max Boholm (gusbohom)

In [1]:
# MB collected all libraries at one place
import sys
import os
import numpy as np
import random
import pandas as pd

import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn as nn
import torch.optim as optim
from torch.distributions.uniform import Uniform

import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD

In [2]:
# MB. New cell
gpu_device = "cuda:1"

## 1. Loading the data, padding (based on 2.0)

Start-of-sentence (sos) and end-of-sentence (eos) tokens have been added to the data.

In [3]:
# MB added code for adding start-of-sentence (sos) and end-of-sentence (eos) tokens

sos = "#"
eos = "!"
# MB. Neither "#" nor "!" seems to be in the original data

def read_chinese_data(inputfilename):
    with open(inputfilename, "r") as inputfile:
        sentences = []
        collection_words = []
        collection_labels = []
        for line in inputfile:
            if line[0] == '#':
                continue
            columns = line.split()
            if columns == []:
                collection_words = [sos] + collection_words + [eos] # MB modified the iteration here
                collection_labels = [1] + collection_labels + [1]   # ... and here

                sentences.append((''.join(collection_words), collection_labels))
                collection_words = []
                collection_labels = []
                continue
            
            collection_words.append(columns[1])
            collection_labels += [1] + ([0] * (len(columns[1]) - 1))
            
    return sentences

In [4]:
train_sentences = read_chinese_data('/scratch/lt2316-h20-resources/zh_gsd-ud-train.conllu')
# train_sentences[0] # MB added this line

In [5]:
test_sentences = read_chinese_data('/scratch/lt2316-h20-resources/zh_gsd-ud-test.conllu')
# test_sentences[0] # MB added this line

In [6]:
def index_chars(sentences):
    megasentence = ''.join(sentences)
    char_list = set()
    for c in megasentence:
        char_list.add(c)
    char_list = [0] + list(char_list)
    return char_list, {char_list[x]:x for x in range(len(char_list))}

In [7]:
int_index, char_index = index_chars([x[0] for x in train_sentences + test_sentences])

In [8]:
def convert_sentence(sentence, index):
    return [index[x] for x in sentence]

In [9]:
def pad_lengths(sentences, max_length, padding=0):
    return [x + ([padding] * (max_length - len(x))) for x in sentences]

In [10]:
def create_dataset(x, device="cpu"):
    converted = [(convert_sentence(x1[0], char_index), x1[1]) for x1 in x]
    X, y = zip(*converted)
    lengths = [len(x2) for x2 in X]
    padded_X = pad_lengths(X, max(lengths))
    Xt = torch.LongTensor(padded_X).to(device)
    padded_y = pad_lengths(y, max(lengths), padding=-1)
    yt = torch.LongTensor(padded_y).to(device)
    lengths_t = torch.LongTensor(lengths).to(device)
    return Xt, lengths_t, yt

In [11]:
train_X_tensor, train_lengths_tensor, train_y_tensor = create_dataset(train_sentences, gpu_device)
test_X_tensor, test_lengths_tensor, test_y_tensor = create_dataset(test_sentences, gpu_device)

## 2. Packing the sequences for RNN

Cells for illstrating structure of data and performance of `pack_padded_sequence` and `pad_packed_sequence` have been removed.

## 3. Batching (based on 1.0, 1.1, 1.2)

Cells for illustrating `Batcher` have been removed. I have not done anything with `Batcher`.

In [12]:
class Batcher:
    def __init__(self, X, lengths, y, device, batch_size=50, max_iter=None):
        self.X = X
        self.lengths = lengths # We need the lengths to efficiently use the padding.
        self.y = y
        self.device = device
        self.batch_size=batch_size
        self.max_iter = max_iter
        self.curr_iter = 0
        
    def __iter__(self):
        return self
    
    def __next__(self):
        if self.curr_iter == self.max_iter:
            raise StopIteration
        permutation = torch.randperm(self.X.size()[0], device=self.device)
        permX = self.X[permutation]
        permlengths = self.lengths[permutation]
        permy = self.y[permutation]
        splitX = torch.split(permX, self.batch_size)
        splitlengths = torch.split(permlengths, self.batch_size)
        splity = torch.split(permy, self.batch_size)
        
        self.curr_iter += 1
        return zip(splitX, splitlengths, splity)

## 4. Modeling

Cells for illustrating layers have been removed. The `Segmenter` model is removed. I have added the model `PredictNext`, which is a text generating model.  

A note on language model and the objective to "given a start symbol, produce a variety of sentences that terminate with a stop symbol" (part 1 of assignment 2): as designed here, a trained model, would given the same first-token (e.g. start-of-sentence token) always generate the *same* sequence *if we not added some randomness to the text generation*. The solution for doing this here is to define the initial hidden state and cell state of the LSTM by random numbers in text generation (while by zeros in training). Thus, the `PredictNext` model has a method (`initHidden`) which outputs an inital hidden state and an intial cell state by random number or zeros. 

In [13]:
# MB. New cell defining a text generator (using code from previous model definition as basis)

class PredictNext(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_dim):
        super(PredictNext, self).__init__()
        
        self.vocab_size = vocab_size
        self.emb_size = emb_size
        self.hidden = hidden_dim
        
        self.emb = nn.Embedding(self.vocab_size, self.emb_size, 0)
        self.lstm = nn.LSTM(self.emb_size, self.hidden, batch_first=True)
        self.classifier = nn.Linear(self.hidden, self.vocab_size)
        self.softmax = nn.LogSoftmax(1) # MB correct dimension?

    def forward(self, previous, h_c_states): # M.B. removed lengths
        
        bsz = previous.shape[0]
        
        emb_previous = self.emb(previous)
        output, (hidden, cell) = self.lstm(emb_previous, h_c_states)
        classification_over_vocabulary = self.classifier(hidden.reshape(bsz, self.hidden)) # MB length of input and output is 1
        classification_over_vocabulary = self.softmax(classification_over_vocabulary)
        next_one = classification_over_vocabulary.argmax(1).unsqueeze(1)
        
        return next_one, classification_over_vocabulary, (hidden, cell)
    
    def initHidden(self, batchsize, zero = True, distrib_low = -2, distrib_high = 2):
        """ MB. For initialization of hidden state and cell state of LSTMs. There are 
            two options here: zero initialization and random initialization. 
            Random initialization ranges from `distrib_low` to `distrib_high`. Here, 
            the `Uniform` function from `torch.distributions.uniform` is used for this.
            Given some experimenting `torch.rand` (which is based on a uniform 
            distribution from 0 to 1) gave only minimal variation in sentence generation
            (see below); thus, the approach chosen here. However, the default values 
            selected here are arbitrary and have little variation other than 
            `text_generator`(below) should yield (substantially) different sequences 
            when called.
        """
        
        if zero:
            init_hidden = torch.zeros(1, batchsize, self.hidden, device = gpu_device) # for unstacked lstms; see https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
            init_cell = torch.zeros(1, batchsize, self.hidden, device = gpu_device)
#         else:
#             init_hidden = torch.rand(1, batchsize, self.hidden, device = gpu_device) # for unstacked lstms; see https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
#             init_cell = torch.rand(1, batchsize, self.hidden, device = gpu_device)
        else:
            init_hidden = Uniform(distrib_low, distrib_high).sample([1, batchsize, self.hidden]).to(gpu_device)
            init_cell = Uniform(distrib_low, distrib_high).sample([1, batchsize, self.hidden]).to(gpu_device)
        
        return init_hidden, init_cell  
        

## 5. Training

The training loop is in large parts the same as in the original. However, modifications have been made to fit the new model. 

In [14]:
# MB modified: 
#    one variable per line; 
#    variable for LSTM hidden dimension; 

def train(X, 
          lengths, 
          y, 
          vocab_size, 
          emb_size, 
          lstm_hidden_dim, 
          batch_size, 
          epochs, 
          device,
          model=None): 
    
    b = Batcher(X, lengths, y, device, batch_size=batch_size, max_iter=epochs)
    
    if not model:
        m = PredictNext(vocab_size, emb_size, lstm_hidden_dim).to(device) 
    else:
        m = model
        
    loss = nn.NLLLoss(ignore_index=-1) # MB note-to-self: ignore index=-1
    optimizer = optim.Adam(m.parameters(), lr=0.005)
    epoch = 0
    
    
    for split in b:
        tot_loss = 0
        for batch in split:
            
            optimizer.zero_grad()
            
            sentence = batch[0]
            lengths = batch[1]
            bsz = sentence.shape[0]
            seq_len = sentence.shape[1]

            total_batch_loss = 0
            
            init_hidden, init_cell = m.initHidden(bsz)
            h_c_states = (init_hidden, init_cell)

            loss_o2 = 0
            the_who = sentence[:, 0].unsqueeze(1)

            for i in range(1, seq_len-1):

                the_who, my_generation, h_c_states = m(the_who, h_c_states)

                target = sentence[:, i]

                # MB a little detour for padding's sake... 
                pad_idx = char_index[0] # MB. this the index for padding in sentences
                target = torch.where(target == pad_idx, -1, target) # MB. https://pytorch.org/docs/stable/generated/torch.where.html
                # MB. -1 is the padding of targets and the ignored index in loss 

                # MB. PS. I now realize that I, like the segmentation model, could have used
                # the `train_lengths_tensor` for "ignoring" the padding in loss calculation, 
                # but I am already past deadline and do not implement that solution here, now. 

                loss_for_this_prediction = loss(my_generation, target)

                loss_o2 += loss_for_this_prediction

            total_batch_loss += loss_o2
            
            tot_loss += total_batch_loss
            total_batch_loss.backward()
            optimizer.step()
        
        print("Total loss in epoch {} is {}.".format(epoch+1, tot_loss)) # MB added +1
        epoch += 1
    return m
      

In [16]:
# MB modification: one parameter per line (easier to read)
model = train(X = train_X_tensor, 
                  lengths = train_lengths_tensor, 
                  y = train_y_tensor, 
                  vocab_size = len(int_index), 
                  emb_size = 200, 
                  lstm_hidden_dim = 150, 
                  batch_size = 50, 
                  epochs = 10, 
                  device = gpu_device)

Total loss in epoch 1 is 57142.16796875.
Total loss in epoch 2 is 54735.671875.
Total loss in epoch 3 is 54541.796875.
Total loss in epoch 4 is 54247.13671875.
Total loss in epoch 5 is 54507.0625.
Total loss in epoch 6 is 54604.96875.
Total loss in epoch 7 is 53613.05859375.
Total loss in epoch 8 is 53991.3671875.
Total loss in epoch 9 is 53163.62890625.
Total loss in epoch 10 is 53773.68359375.


## 6. Generation

A function `text_generator` has been defined to generate text from a trained model. Note here (as mentioned above) that initialization of hidden and cell state are set by random numbers. One each call, `text_generator` generate different sequences.

In [19]:
# MB. New cell.
def text_generator(model, prime_token = sos, max_length = 180):
    model.eval()

    rolling_stone = torch.tensor([[char_index[prime_token]]]).to(gpu_device)
    
    hidden_cell_states = model.initHidden(1, zero = False)
    # MB. We need some variation to the loop in order to produce variable sentences. 
    # To use random initialization for the initial hidden and cell state of the LSTM  
    # is my suggestion for solving that. 
    
    length_of_generation = 0
    ex_nihilo = []
    end_of_sentence = False
    length_of_generation = 0
    
    while end_of_sentence == False and length_of_generation < max_length:
        rolling_stone, just_ignore_this, hidden_cell_states = model(rolling_stone, hidden_cell_states)
        rs_as_string = str(int_index[rolling_stone.flatten()])
        if rs_as_string == eos:
            end_of_sentence = True
        else:
            ex_nihilo.append(rs_as_string)
            length_of_generation += 1

    return "".join(ex_nihilo)

In [24]:
# MB. New cell. 
text_generator(model)

'根10，，，，，，，，，，，，。。'

In [25]:
# ... being different from:
text_generator(model)

'這9，，，，，，，，，，。。'