# LINK TO STREAMLIT APP

https://es335-assignment3-sabarmatisigmoid-boz6q2bg66xgu3rc4dcwmz.streamlit.app/

# IMPORTING RELEVANT LIBRARIES AND DATA

In [None]:
import torch
import torch.nn.functional as F
from torch import nn
import pandas as pd
import numpy as np
import json
import pickle
import time
import re # for regular expressions
from sklearn.manifold import TSNE # for t-SNE
import matplotlib.pyplot as plt # for making figures
%matplotlib inline
%config InlineBackend.figure_format = 'retina' # to make graphs sharper
from pprint import pprint # pretty print

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # choosing device to run with

In [None]:
device # checking which device is being used

device(type='cuda')

In [None]:
from google.colab import drive # mounting google drive
drive.mount('/content/drive')

folder_path = '/content/drive/MyDrive/War_and_Peace/'

Mounted at /content/drive


In [None]:
# Load the preprocessed corpus and the two vocabulary lookup tables from Drive.
# The corpus file has a .txt extension but is parsed as JSON; later cells call
# .split() on the result, so it is expected to decode to one string.
with open(folder_path + 'processed_warandpeace.txt', 'r', encoding='utf-8') as f: # explicit encoding: decoding must not depend on the platform locale
    pr_text = json.load(f)

# NOTE: pickle.load can execute arbitrary code contained in the file - only load pickles you created yourself.
with open(folder_path + 'word2int.pkl', 'rb') as f: # loading word2int dictionary (word -> integer id)
    word2int = pickle.load(f)

with open(folder_path + 'int2word.pkl', 'rb') as f: # loading int2word dictionary (integer id -> word)
    int2word = pickle.load(f)

In [None]:
words = list(word2int) # vocabulary entries, in the dictionary's own key order
vocab_size = len(words) # number of distinct vocabulary entries
num_words = len(pr_text.split()) # number of whitespace-separated tokens in the corpus

In [None]:
# Preview the first ten and the last ten vocabulary entries with their ids.
# Slicing from the end keeps the preview correct for any vocabulary size.
for word in words[:10] + words[-10:]:
  print(word2int[word], ':', word)

0 :  
1 : .
2 : .2
3 : .ar
4 : .ate
5 : .ly
6 : .n
7 : .not
8 : .o
9 : .of
18326 : znamenka
18327 : zone
18328 : zoology
18329 : zu
18330 : zubov
18331 : zubova
18332 : zubovski
18333 : zum
18334 : zweck
18335 : <UNK>


In [None]:
print(num_words)
print(vocab_size)

593963
18336


In [None]:
# Scan the vocabulary once, keeping the greatest entry length seen so far.
max_word_len = 0
for word in words:
  max_word_len = max(max_word_len, len(word))
print(max_word_len) # length of longest word

27


In [None]:
line_len = len("slave,' as you call yourself! But how do you do? I see I have frightened")
print(line_len) # length of longest line

72


# SETTING PARAMETERS

emb_dim = 32 or 128


---


block_size = 4 or 8


---

activation_function = "ReLU" or "Sin"


---

random_seed = 96 or 42

In [None]:
block_size = 8 # context length: how many previous words are used to predict the next one
num_layers = 3 # number of halving hidden layers NextWord stacks after its first Linear layer (NextWord reads this global)
hidden_size = 1024 # width of the first hidden layer; each later hidden layer is half as wide as the one before it
emb_dim = 128 # dimension of each word-embedding vector
batch_size = 2048 # number of (context, next word) pairs per mini-batch
num_epochs = 150 # number of full passes over the training pairs
learning_rate = 0.01 # learning rate passed to the SGD optimizer
random_seed = 96 # seed passed to torch.manual_seed for reproducibility
activation_function = 'ReLU' # 'ReLU' or 'Sin'; selects the activation used inside NextWord

# CREATING X AND Y

In [None]:
torch.manual_seed(random_seed) # setting random seed for reproducibility

<torch._C.Generator at 0x7b6838cb0c30>

In [None]:
# Build the training pairs: each X row is the block_size word ids that precede
# the target, and the matching Y entry is the id of the target word itself.
X, Y = [], []
pr_text_split = pr_text.split() # the corpus is one big string; split it into separate words

context = [0]*block_size # the window starts out filled with id 0 (padding)
for word in pr_text_split:
  X.append(context) # window seen before this word
  Y.append(word2int[word]) # this word is the prediction target
  context = [*context[1:], Y[-1]] # slide the window: drop the oldest id, add the newest (new list, so rows in X are never mutated)

In [None]:
for i in range(20): # show the first 20 (context ---> next word) pairs as a sanity check
  context = X[i]
  word = Y[i]
  print(''.join(int2word[con] + ' ' for con in context), end = '') # every context word followed by one space
  print('--->', int2word[word])

                ---> well
              well ---> prince
            well prince ---> so
          well prince so ---> genoa
        well prince so genoa ---> and
      well prince so genoa and ---> lucca
    well prince so genoa and lucca ---> are
  well prince so genoa and lucca are ---> now
well prince so genoa and lucca are now ---> just
prince so genoa and lucca are now just ---> family
so genoa and lucca are now just family ---> estates
genoa and lucca are now just family estates ---> of
and lucca are now just family estates of ---> the
lucca are now just family estates of the ---> buonapartes
are now just family estates of the buonapartes ---> .
now just family estates of the buonapartes . ---> but
just family estates of the buonapartes . but ---> i
family estates of the buonapartes . but i ---> warn
estates of the buonapartes . but i warn ---> you
of the buonapartes . but i warn you ---> if


In [None]:
X = torch.tensor(X).to(device) # moving data to GPU
Y = torch.tensor(Y).to(device)

In [None]:
print('Shape of X:', X.shape)
print('Datatype of X:', X.dtype)
print('Shape of Y:', Y.shape)
print('Datatype of Y:', Y.dtype)

Shape of X: torch.Size([593963, 8])
Datatype of X: torch.int64
Shape of Y: torch.Size([593963])
Datatype of Y: torch.int64


# CREATING EMBEDDING LAYER

Creating an embedding layer means assigning a real-valued vector with emb_dim dimensions to each unique word in the vocabulary. With one-hot encoding, every word would need a vector with one dimension per vocabulary word (vocab_size dimensions), so the embedding greatly reduces the number of dimensions used to represent each word. It also transforms the discrete vocabulary data into continuous data.

A dense vector is a type of vector where most (or all) of its elements are non-zero. It is the opposite of a sparse vector, where most elements are zero.

In [None]:
emb = torch.nn.Embedding(len(word2int), emb_dim) # embedding layer for words

In [None]:
emb.weight # randomly generated

Parameter containing:
tensor([[ 2.4603,  1.2647,  0.2663,  ...,  1.1148, -0.1940,  0.0418],
        [-1.1276,  0.0920,  1.1525,  ..., -0.5578, -0.4559, -0.4152],
        [-1.5283, -2.0629, -0.3219,  ..., -0.9540,  1.5494, -1.5271],
        ...,
        [ 0.5906,  0.0234,  0.7349,  ..., -0.5650, -0.7422,  1.2607],
        [-1.4616, -0.5968,  0.9015,  ..., -1.3212, -0.1003, -0.1985],
        [-0.3401,  1.8790, -0.5124,  ..., -0.8150, -1.0105,  0.2686]],
       requires_grad=True)

In [None]:
emb.weight.shape # each unique word in the vocabulary has a particular vector

torch.Size([18336, 128])

# CREATING NEURAL NETWORK MODEL

The code below creates a neural network model with an embedding layer, a first linear layer that maps the flattened context of (block_size * emb_dim) values to (hidden_size) neurons, (num_layers) further hidden layers that each halve the width of the previous one (hidden_size → hidden_size/2 → hidden_size/4 → ...), and an output layer with (vocab_size) neurons, each corresponding to a unique word in the vocabulary.

In [None]:
def sine_init(m): # initializer used for Linear layers when the sin activation is selected
    """Re-initialize a linear layer in place for use with a sine activation.

    Weights are drawn uniformly from [-1/in_features, 1/in_features] and the
    bias, when the layer has one, is set to zero. Any module that is not an
    ``nn.Linear`` is left untouched.
    """
    if not isinstance(m, nn.Linear):
        return
    fan_in = m.weight.shape[1]  # number of input features of the layer
    bound = 1 / fan_in  # half-width of the uniform range
    nn.init.uniform_(m.weight, a=-bound, b=bound)
    if m.bias is None:
        return
    nn.init.zeros_(m.bias)

class NextWord(nn.Module):
  """MLP that predicts the next word id from a fixed-length context of word ids.

  Architecture: an embedding layer, a first linear layer mapping the flattened
  context (block_size * emb_dim values) to hidden_size units, then a stack of
  linear layers that each halve the width, and a final linear layer producing
  one score per vocabulary entry. The activation is applied after every layer
  except the last, so the model returns raw logits (what CrossEntropyLoss and
  Categorical(logits=...) expect).
  """

  def __init__(self, block_size, vocab_size, emb_dim, hidden_size, activation_function, num_hidden_layers=None):
    """Build the layers.

    Args:
      block_size: number of context word ids per example.
      vocab_size: number of vocabulary entries (embedding rows and output units).
      emb_dim: dimension of each embedding vector.
      hidden_size: width of the first hidden layer.
      activation_function: 'ReLU' or 'Sin'.
      num_hidden_layers: number of halving layers after the first hidden layer.
        When omitted, the notebook-level ``num_layers`` value is used.

    Raises:
      ValueError: if ``activation_function`` is neither 'ReLU' nor 'Sin'.
    """
    super().__init__() # calls the superclass and its constructor
    if activation_function == 'ReLU':
      activation = nn.ReLU()
    elif activation_function == 'Sin':
      activation = torch.sin
    else: # fail at construction instead of with an AttributeError on the first forward pass
      raise ValueError(f"activation_function must be 'ReLU' or 'Sin', got {activation_function!r}")
    if num_hidden_layers is None:
      num_hidden_layers = num_layers # fall back to the notebook-level setting
    use_sine_init = activation_function == 'Sin'

    self.emb = nn.Embedding(vocab_size, emb_dim) # embedding layer
    self.layers = nn.ModuleList() # list of layers
    # widths of successive activations: flattened context, hidden_size, hidden_size/2, hidden_size/4, ...
    widths = [block_size * emb_dim] + [hidden_size // 2**i for i in range(num_hidden_layers + 1)]
    for fan_in, fan_out in zip(widths, widths[1:]):
      layer = nn.Linear(fan_in, fan_out)
      if use_sine_init:
        sine_init(layer)
      self.layers.append(layer)
    self.layers.append(nn.Linear(widths[-1], vocab_size)) # output layer, keeps PyTorch's default initialization
    self.activation = activation

  def forward(self, x):
    """Return logits of shape (batch, vocab_size) for a (batch, block_size) tensor of word ids."""
    x = self.emb(x) # embedding layer
    x = x.view(x.shape[0], -1) # flatten the per-word embeddings into one vector per example
    for layer in self.layers[:-1]: # hidden layers: linear followed by the activation
      x = self.activation(layer(x))
    return self.layers[-1](x) # output layer: no activation, so the logits are left unconstrained

In [None]:
# Instantiate the network on the selected device, together with its loss and optimizer.
model = NextWord(
    block_size=block_size,
    vocab_size=vocab_size,
    emb_dim=emb_dim,
    hidden_size=hidden_size,
    activation_function=activation_function,
)
model = model.to(device)
# model = torch.compile(model) # optional: compiling the model
loss_fn = nn.CrossEntropyLoss() # loss over the vocabulary scores
opt = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.65) # SGD with momentum

In [None]:
for param_name, param in model.named_parameters():
    print(param_name, param.shape)

emb.weight torch.Size([18336, 128])
layers.0.weight torch.Size([1024, 1024])
layers.0.bias torch.Size([1024])
layers.1.weight torch.Size([512, 1024])
layers.1.bias torch.Size([512])
layers.2.weight torch.Size([256, 512])
layers.2.bias torch.Size([256])
layers.3.weight torch.Size([128, 256])
layers.3.bias torch.Size([128])
layers.4.weight torch.Size([18336, 128])
layers.4.bias torch.Size([18336])


# GENERATING TEXT FROM UNTRAINED MODEL

In [None]:
def print_in_lines(text, line_len, number_lines):
    """Print ``text`` as ``number_lines`` lines of greedily wrapped words.

    Words are taken in order; a word is added to the current line while the
    characters already on the line (each word plus one trailing space) plus the
    word's own length do not exceed ``line_len``. Every printed word is followed
    by one space. Exactly ``number_lines`` lines are printed: once the words run
    out the remaining lines are empty, and words left over after the last line
    are dropped.

    A word longer than ``line_len`` is printed alone on its own line rather
    than blocking all output after it.
    """
    text_words = text.split()  # splitting the text into words
    on_word = 0  # index of the next word to print
    for _ in range(number_lines):  # for specified number of lines
        line_words = []  # words placed on the current line
        curr_len = 0  # characters used on the current line
        while on_word < len(text_words):
            word = text_words[on_word]
            # an empty line always accepts the next word, so an oversized word cannot stall the cursor
            if line_words and curr_len + len(word) > line_len:
                break
            line_words.append(word)
            curr_len += len(word) + 1  # +1 for the space after the word
            on_word += 1
        print(''.join(f'{word} ' for word in line_words))  # onto the next line

In [None]:
def set_context(input_text):
  """Turn free text into a list of exactly ``block_size`` word ids.

  The text is split on whitespace and only its last ``block_size`` words are
  kept. Words missing from ``word2int`` map to the id of '<UNK>'. When fewer
  than ``block_size`` words are given, the list is left-padded with id 0, the
  same filler the training contexts start with.

  Reads the notebook-level ``block_size`` and ``word2int``.
  """
  unk_id = word2int['<UNK>'] # id used for out-of-vocabulary words
  input_words = input_text.split()
  words_for_context = input_words[max(len(input_words) - block_size, 0):] # the last (block_size) words, or all of them if fewer
  context = [word2int.get(word, unk_id) for word in words_for_context] # mapping the words to ints
  # pad with ids (not words) so the padding is never looked up in word2int
  return [0] * (block_size - len(context)) + context

In [None]:
base_context = [0]*block_size # defining a base case for context in case of empty input

In [None]:
def generate_text(model, int2word, word2int, block_size, k_lines = 10, context = None):
    """Sample text from ``model`` one word at a time.

    Generation continues until the output reaches a character budget of
    ``line_len * k_lines + max_word_len``, where each word counts as its length
    plus one trailing space. Each step feeds the current context to the model,
    samples the next word id from a categorical distribution over the model's
    output scores, and slides the context forward by one word.

    Args:
      model: callable returning one row of vocabulary scores for a (1, len(context)) id tensor.
      int2word: mapping from word id to word.
      word2int: unused; kept so existing calls keep working.
      block_size: context length, used to build the default context.
      k_lines: number of lines the text is meant to fill.
      context: starting word ids; defaults to ``block_size`` zeros.

    Returns:
      The generated words, each followed by a single space.

    Reads the notebook-level ``line_len``, ``max_word_len`` and ``device``.
    """
    if context is None: # built per call, so it always matches the block_size argument
        context = [0] * block_size
    context = list(context)
    max_len = line_len*k_lines + max_word_len # character budget: k lines plus room for one more word
    pieces = [] # generated words, each with its trailing space
    text_len = 0 # characters generated so far
    with torch.no_grad(): # sampling only: no gradients are needed, so skip building the autograd graph
        while text_len < max_len: # predicting words one by one
            x = torch.tensor(context).view(1, -1).to(device) # context as a (1, len(context)) tensor on the model's device
            y_pred = model(x) # scores for the next word
            ix = torch.distributions.categorical.Categorical(logits=y_pred).sample().item() # sampled id of the next word
            wor = int2word[ix] # mapping int to word
            pieces.append(wor + ' ')
            context = context[1:] + [ix] # updating context
            text_len += 1 + len(wor) # word plus its trailing space

    return ''.join(pieces)

In [None]:
print_in_lines(generate_text(model, int2word, word2int, block_size, 10), line_len, 10)

flatly pimples eggshell torch expedient 1809 brisk ask relevant liberty 
tremor barefooted assistance oaks repletion trammeled yellow spangled 
fardistant desperately efficacy lelorgne platons mistook mesdames 
roadside galloped summit bestknown hating phenomena strikes tearfully 
artificially boss 178 tavern stream fir pranced refutation mysterious 
accusations acid sour contributions disinclined group dressmakers aloof 
treated predilection batard 62 bullet helplessness report brows chucked 
stirring easy relentless coral castanets agent truthful caldrons 
starshaped wicket woodwork things doubling confined charms remotest 
obolenski majordomo correctly recently elusive clement duets echkino 


# TRAINING MODEL

In [None]:
# mini-batch training
# Batches are consecutive slices of X and Y taken in corpus order. The word ids
# in X and Y all come from word2int, so they are valid embedding indices as is.

print_every = 10 # printing loss every 10 epochs
elapsed_time = [] # wall-clock seconds taken by each epoch
for epoch in range(num_epochs):
    start_time = time.time()
    for i in range(0, X.shape[0], batch_size):
        x = X[i:i+batch_size]
        y = Y[i:i+batch_size]

        opt.zero_grad() # clear gradients before computing new ones, so nothing stale is ever accumulated
        y_pred = model(x) # forward pass
        loss = loss_fn(y_pred, y) # calculating loss
        loss.backward() # backpropagation
        opt.step() # updating weights
    end_time = time.time()
    elapsed_time.append(end_time - start_time)
    if epoch % print_every == 0 or epoch==num_epochs-1:
        print(epoch, loss.item()) # loss of the epoch's last mini-batch, to keep track of the process

0 7.639541149139404
10 6.6822638511657715
20 6.267232418060303
30 5.8050336837768555
40 5.424478054046631
50 4.8165130615234375
60 4.700401782989502
70 3.9168150424957275
80 3.4314124584198
90 2.7705395221710205
100 2.150843858718872
110 2.1912953853607178
120 2.0493619441986084
130 1.5014652013778687
140 1.3942604064941406
149 1.4576506614685059


In [None]:
model_name = f'model_{emb_dim}_{block_size}_{activation_function}_{random_seed}.pth' # naming using model parameters
torch.save(model.state_dict(), folder_path + model_name)

In [None]:
print(model_name)

model_128_8_ReLU_96.pth


# GENERATING TEXT FROM TRAINED MODEL

In [None]:
print_in_lines(generate_text(model, int2word, word2int, block_size, 10), line_len, 10)

player if the day in cruel . . . he went to him two of any days . in 
moscow that only historians is kissed this eighth initiated plumes songs 
a 26 a order and fresh of thousands of folds dokhturovs of demanded of 
tree courtyards anton when putting the kings from time and had scruple 
the blissful in an rein in the expresses of their muddy position is the 
fighting movement reason the complete of zu of news officers by began by 
militiaman became the cavalry tall greatest yet langleterre in an lofty 
expression is taking the hours of these grass and daughters hand . in 
the letter the staff of inches . . . . that impeding another akharovs to 
the form . so will is that that needs . lads then shall he escape a 
