In [1]:
import torch
import torch.nn as nn
import re

In [8]:
class TextReader:
    def __init__(self,filename):
        self.filename = filename
        
    def get_all_words(self,regex_params=None):
        '''
        regex_params = {
            "uppercase": True,
            "digits":False,
            "punctuation_list":None
        }
        '''
        if regex_params:
            regex_string = "a-z"
            if regex_params["uppercase"]: regex_string += "A-Z"
            if regex_params["digits"]: regex_string += "0-9"
            if regex_params["punctuation_list"]: regex_string += "".join(regex_params["punctuation_list"])
            regex_string = "[" + regex_string +"]"
            with open(self.filename,"r") as f: text = f.read()
            words = re.findall(regex_string,text)
            return words
        else:
            with open(self.filename,"r") as f: words = f.read().split(" ")
            return words + [" "]
        
    def get_unique_words(self,regex_params=None, distinguish_casing=False):
        '''
        regex_params = {
            "uppercase": True,
            "digits":False,
            "punctuation_list":None
        }
        '''
        all_words = self.get_all_words(regex_params)
        if not distinguish_casing: return list(set([word.lower() for word in all_words]))
        else: return list(set(all_words))
    
    
    def get_X_and_Y(self,window_size=10):
        with open(self.filename,"r",errors="ignore") as f:
            text  = f.read()
            X,Y = [],[]
            for i in range(len(text)-window_size-1):
                X.append(text[i:i+window_size])
                Y.append(text[i+1:window_size+i+1])
        return X,Y

In [9]:
class VocabBuilder:
    def __init__(self, words = None, sentences=None, unknown_token="unk"):
        self.unknown_token = unknown_token
        if words:
            self.words = words
            self.char_to_index, self.index_to_char = self.get_char_vocab()
        if sentences:
            self.sentences = sentences
            self.word_to_index, self.index_to_word = self.get_word_vocab()
        if not words and not sentences:
            print("At least 1 argument is required")
        
    def get_char_vocab(self):
        char_to_index, index_to_char = {},{}
        all_chars = list(set("".join(self.words)))
        for i,char in enumerate(all_chars):
            char_to_index[char] = i
            index_to_char[i] = char
        vocab_length = len(char_to_index)
        char_to_index[self.unknown_token] = vocab_length
        index_to_char[vocab_length] = self.unknown_token
        return char_to_index, index_to_char
    
    def get_word_vocab(self):
        word_to_index, index_to_word = {},{}
        all_words = list(set(" ".join(self.sentences.split(" "))))
        for i,word in enumerate(all_words):
            word_to_index[word] = i
            index_to_word[i] = word
        return word_to_index, index_to_word

In [10]:
class GenerateEncoding:
    def __init__(self,data_x,vocab_x,data_y,vocab_y,unknown_token):
        self.data_x = data_x
        self.vocab_x = vocab_x
        self.data_y = data_y
        self.vocab_y = vocab_y
        self.unknown_token = unknown_token
        self.pure_vocab_x = self.remove_unknown_token_from_voab(vocab_x)
        self.pure_vocab_y = self.remove_unknown_token_from_voab(vocab_y)
    
    def remove_unknown_token_from_voab(self,vocab):
        return {k:v for k,v in vocab.items() if k != self.unknown_token}
        
    def get_encoding_X(self,raw_text=None):
        if raw_text: data_to_encode = raw_text
        else: data_to_encode = self.data_x
        encoded_X = []
        for word in data_to_encode:
            characters = list(word)
            word_encoding = []
            for c in characters: 
                if c not in self.pure_vocab_x: word_encoding.append(self.vocab_x[self.unknown_token])
                else: word_encoding.append(self.vocab_x[c])
            encoded_X.append(word_encoding)
        return encoded_X
    
    def get_encoding_Y(self,raw_text=None):
        if raw_text: data_to_encode = raw_text
        else: data_to_encode = self.data_y
        encoded_Y = []
        for word in data_to_encode:
            characters = list(word)
            word_encoding = []
            for c in characters: 
                if c not in self.pure_vocab_y: word_encoding.append(self.vocab_y[self.unknown_token])
                else: word_encoding.append(self.vocab_y[c])
            encoded_Y.append(word_encoding)
        return encoded_Y

In [11]:
class BatchGenerator:
    def __init__(self,X,Y,batch_size):
        self.X = X
        self.Y = Y
        self.batch_size = batch_size
    
    def get_batch(self,batch_index,make_tensor=False):
        Xb = self.X[batch_index*self.batch_size:(batch_index+1)*self.batch_size]
        Yb = self.Y[batch_index*self.batch_size:(batch_index+1)*self.batch_size]
        if make_tensor: return torch.tensor(Xb),torch.tensor(Yb)
        return Xb,Yb

In [12]:
class MyCharLevelRNNModel(nn.Module):
    def __init__(self,vocab_size, embedding_dim, lstm_neurons, num_lstm_layers, num_classes,
                 make_birectional=False, debug_mode=False):
        super().__init__()
        self.debug_mode = debug_mode
        self.bidirectional = make_birectional
        self.lstm_neurons = lstm_neurons
        self.num_lstm_layers = num_lstm_layers
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=lstm_neurons, 
                            num_layers=num_lstm_layers, bidirectional=make_birectional, batch_first=True)
        
        in_features = lstm_neurons
        if self.bidirectional: in_features = 2*lstm_neurons
        self.linear1 = nn.Linear(in_features=in_features, out_features=100)
        self.relu = nn.LeakyReLU()
        self.linear2 = nn.Linear(in_features=100, out_features=num_classes)
        self.log_softmax = nn.LogSoftmax(dim=1)
    
    def forward(self,x,ht,ct):
        if self.debug_mode: print("Before embedding layer:",x.shape)
        x = self.embedding(x)
        if self.debug_mode: print("After embedding layer:",x.shape)
        x, (ht, ct) = self.lstm(x,(ht,ct))
        if self.debug_mode: print("After lstm layer:",x.shape,ht.shape,ct.shape)
        x = x.reshape(-1, x.shape[2])
        if self.debug_mode: print("After reshaping:",x.shape)
        x = self.linear1(x)
        x = self.relu(x)
        if self.debug_mode: print("After 1st linear layer:",x.shape)
        x = self.linear2(x)
        x = self.log_softmax(x)
        if self.debug_mode: print("After 2nd linear layer:",x.shape)
        return x, ht,ct
    
    def init_state_of_lstm(self,batch_size):
        if self.bidirectional: first_param = 2*self.num_lstm_layers
        else: first_param = self.num_lstm_layers
        return (
            torch.randn(first_param, batch_size, self.lstm_neurons),
            torch.randn(first_param, batch_size, self.lstm_neurons),
        )

In [129]:
text_reader = TextReader("data2.txt")
# all_words = text_reader.get_all_words(regex_params={
#     "uppercase":True,"digits":True,"punctuation_list":[" ",".",",","'",";","\-","?","!","\[","\]","{","}","(",")"]
# })
all_words = text_reader.get_all_words(regex_params=None)
window_size = 12
X,Y = text_reader.get_X_and_Y(window_size=window_size)
vocab_builder = VocabBuilder(words=all_words,unknown_token="unk")
char_to_index, index_to_char = vocab_builder.char_to_index, vocab_builder.index_to_char
print(len(char_to_index))

encoding_generator = GenerateEncoding(X,char_to_index,Y,char_to_index,unknown_token="unk")
X_enc = encoding_generator.get_encoding_X()
Y_enc = encoding_generator.get_encoding_Y()
print(len(X),len(Y),X[:4],Y[:4],X[-4:],Y[-4:])
print(len(X_enc),len(Y_enc),X_enc[:4],Y_enc[:4],X_enc[-4:],Y_enc[-4:])

batch_size = 5
batch_generator = BatchGenerator(X_enc,Y_enc,batch_size)
Xb,Yb = batch_generator.get_batch(batch_index=1)
print(len(Xb),len(Xb[0]),len(Yb),len(Yb[0]))

77
12854 12854 ['At Reading R', 't Reading Ro', ' Reading Roc', 'Reading Rock'] ['t Reading Ro', ' Reading Roc', 'Reading Rock', 'eading Rocke'] [' relevant to', 'relevant tod', 'elevant toda', 'levant today'] ['relevant tod', 'elevant toda', 'levant today', 'evant today.']
12854 12854 [[42, 75, 49, 40, 3, 23, 9, 63, 54, 35, 49, 40], [75, 49, 40, 3, 23, 9, 63, 54, 35, 49, 40, 39], [49, 40, 3, 23, 9, 63, 54, 35, 49, 40, 39, 32], [40, 3, 23, 9, 63, 54, 35, 49, 40, 39, 32, 26]] [[75, 49, 40, 3, 23, 9, 63, 54, 35, 49, 40, 39], [49, 40, 3, 23, 9, 63, 54, 35, 49, 40, 39, 32], [40, 3, 23, 9, 63, 54, 35, 49, 40, 39, 32, 26], [3, 23, 9, 63, 54, 35, 49, 40, 39, 32, 26, 3]] [[49, 65, 3, 2, 3, 0, 23, 54, 75, 49, 75, 39], [65, 3, 2, 3, 0, 23, 54, 75, 49, 75, 39, 9], [3, 2, 3, 0, 23, 54, 75, 49, 75, 39, 9, 23], [2, 3, 0, 23, 54, 75, 49, 75, 39, 9, 23, 25]] [[65, 3, 2, 3, 0, 23, 54, 75, 49, 75, 39, 9], [3, 2, 3, 0, 23, 54, 75, 49, 75, 39, 9, 23], [2, 3, 0, 23, 54, 75, 49, 75, 39, 9, 23, 25], [3, 0, 2

In [130]:
epochs = 50
batch_size = 32
batch_generator = BatchGenerator(X_enc,Y_enc,batch_size)
num_batches = len(X_enc)//batch_size
embedding_dim = 3
vocab_size = len(index_to_char)
num_classes = len(index_to_char)
num_lstm_layers = 2
lstm_neurons = 128
make_bidirectional = False

In [131]:
model = MyCharLevelRNNModel(vocab_size=vocab_size, embedding_dim=embedding_dim, lstm_neurons=lstm_neurons, 
                   num_lstm_layers=num_lstm_layers, num_classes = num_classes,
                   make_birectional=make_bidirectional, debug_mode=True)
optimizer = torch.optim.Adam(model.parameters(),lr=0.1)
loss_function = nn.NLLLoss()
(ht,ct) = model.init_state_of_lstm(batch_size)
Y_actual, Y_pred = [], []

optimizer.zero_grad()
Xb, Yb = batch_generator.get_batch(2,make_tensor=True)

op, ht,ct = model(Xb,ht,ct)
print(op.shape)
print(op[0])
Yb = Yb.reshape(-1)
print(op.shape, Yb.shape)
loss = loss_function(op, Yb)
print(loss)
ht = ht.detach()
ct = ct.detach()
loss.backward()
optimizer.step()
Y_pred += [int(el) for el in torch.argmax(op,axis=1)]
Y_actual += [int(el) for el in Yb]

optimizer.zero_grad()
Xb, Yb = batch_generator.get_batch(3,make_tensor=True)
op, ht,ct = model(Xb,ht,ct)
print(op.shape)
print(op[0])
Yb = Yb.reshape(-1)
print(op.shape, Yb.shape)
loss = loss_function(op, Yb)
print(loss)
ht = ht.detach()
ct = ct.detach()
loss.backward()
optimizer.step()
Y_pred += [int(el) for el in torch.argmax(op,axis=1)]
Y_actual += [int(el) for el in Yb]

Before embedding layer: torch.Size([32, 12])
After embedding layer: torch.Size([32, 12, 3])
After lstm layer: torch.Size([32, 12, 128]) torch.Size([2, 32, 128]) torch.Size([2, 32, 128])
After reshaping: torch.Size([384, 128])
After 1st linear layer: torch.Size([384, 100])
After 2nd linear layer: torch.Size([384, 77])
torch.Size([384, 77])
tensor([-4.4361, -4.3294, -4.2114, -4.2545, -4.3792, -4.3892, -4.4505, -4.3554,
        -4.3625, -4.3226, -4.3423, -4.3603, -4.3385, -4.4103, -4.3496, -4.2818,
        -4.4174, -4.3350, -4.3268, -4.2208, -4.2867, -4.2928, -4.4521, -4.2276,
        -4.2879, -4.2412, -4.4175, -4.4062, -4.3712, -4.4885, -4.3801, -4.2676,
        -4.4052, -4.4136, -4.3674, -4.4011, -4.3821, -4.3810, -4.3118, -4.4981,
        -4.1939, -4.4214, -4.3237, -4.3946, -4.4105, -4.3398, -4.4227, -4.2411,
        -4.2748, -4.4929, -4.2987, -4.3210, -4.2568, -4.3841, -4.3083, -4.4307,
        -4.3511, -4.2583, -4.2456, -4.3684, -4.3079, -4.1747, -4.4052, -4.3565,
        -4.3175, -4

In [132]:
model = MyCharLevelRNNModel(vocab_size=vocab_size, embedding_dim=embedding_dim, lstm_neurons=lstm_neurons, 
                   num_lstm_layers=num_lstm_layers, num_classes = num_classes,
                   make_birectional=make_bidirectional, debug_mode=False)
optimizer = torch.optim.Adam(model.parameters(),lr=0.01)
loss_function = nn.NLLLoss()

In [141]:
for e in range(epochs):
    model.train()
    (ht,ct) = model.init_state_of_lstm(batch_size)
    epoch_loss = 0
    Y_actual, Y_pred = [], []
    for i in range(num_batches):
        if i%20 == 0: print(i, end=' ')
        optimizer.zero_grad()
        Xb, Yb = batch_generator.get_batch(i,make_tensor=True)
        op, ht,ct = model(Xb,ht,ct)
        Yb = Yb.reshape(-1)
        loss = loss_function(op, Yb)
        epoch_loss += loss.item()
        ht = ht.detach()
        ct = ct.detach()
        loss.backward()
        optimizer.step()
    print("\nEpoch: {}, Loss: {}".format(e+1,epoch_loss))

0 20 40 60 80 100 120 140 160 180 200 220 240 260 280 300 320 340 360 380 400 
Epoch: 1, Loss: 469.7949805855751
0 20 40 60 80 100 120 140 160 180 200 220 240 260 280 300 320 340 360 380 400 
Epoch: 2, Loss: 475.5837924480438
0 20 40 60 80 100 120 140 160 180 200 220 240 260 280 300 320 340 360 380 400 
Epoch: 3, Loss: 471.34292113780975
0 20 40 60 80 100 120 140 160 180 200 220 240 260 280 300 320 340 360 380 400 
Epoch: 4, Loss: 464.0449299812317
0 20 40 60 80 100 120 140 160 180 200 220 240 260 280 300 320 340 360 380 400 
Epoch: 5, Loss: 462.09237909317017
0 20 40 60 80 100 120 140 160 180 200 220 240 260 280 300 320 340 360 380 400 
Epoch: 6, Loss: 458.13329952955246
0 20 40 60 80 100 120 140 160 180 200 220 240 260 280 300 320 340 360 380 400 
Epoch: 7, Loss: 459.25620913505554
0 20 40 60 80 100 120 140 160 180 200 220 240 260 280 300 320 340 360 380 400 
Epoch: 8, Loss: 455.44268375635147
0 20 40 60 80 100 120 140 160 180 200 220 240 260 280 300 320 340 360 380 400 
Epoch: 9, Lo

In [142]:
from copy import deepcopy

In [153]:
test_string = ["purchase a sk"]
test_string_enc = encoding_generator.get_encoding_X(raw_text=test_string)
pred_op = deepcopy(test_string_enc)
print(pred_op)

model.eval()
if make_bidirectional: first_param = 2*num_lstm_layers
else: first_param = num_lstm_layers
ht_pred = torch.randn(first_param, 1, lstm_neurons)
ct_pred = torch.randn(first_param, 1, lstm_neurons)

unigram = True # unigram will work, becuase it is a statefulRNN (ht and ct is getting updated for every character)
window_size = 5
num_chars = len(test_string[0])+100

with torch.no_grad():
    for i in range(num_chars):
        input_vec = torch.tensor([pred_op[0][i:i+1]])
        op,ht_pred,ct_pred = model(input_vec,ht_pred,ct_pred)
        op = torch.argmax(op,axis=1).tolist()
        if i >= len(test_string_enc[0])-1: pred_op[0].append(op[0])
    pred_word = "".join([index_to_char[el] for el in pred_op[0]])
    print(pred_word)

[[47, 17, 65, 32, 11, 23, 21, 3, 49, 23, 49, 21, 26]]
purchase a skateboard, he and the program was also designed to accept new axioms in the program was also designed 
