In [1]:
import torch.nn as nn
import torch
import random
import os
from sklearn.model_selection import train_test_split

In [2]:
def generate_data_files():
    '''
    Builds the jumbled / unjumbled sentence files from "source.txt".

    Every line of "source.txt" (in the current working directory) is lower-cased,
    stripped and split on whitespace; the first token of the line is dropped.
    The remaining words are written, space separated, to "unjumbled.txt", and a
    randomly shuffled copy of the same words is written to the matching line of
    "jumbled.txt".

    Raises whatever open() raises (e.g. FileNotFoundError when "source.txt" is missing).
    '''
    # The source is opened first so that a missing/unreadable source does not
    # wipe previously generated output files. Mode "w" truncates existing
    # outputs, so no explicit removal is needed, and the context managers close
    # every handle even if an exception is raised part-way through.
    with open("source.txt","r") as source:
        with open("jumbled.txt","w") as jumbled_file, open("unjumbled.txt","w") as unjumbled_file:
            for line in source:
                split_line = line.lower().strip().split()
                # The first token of each line is not part of the sentence.
                sentence = split_line[1:]
                unjumbled_file.write(" ".join(sentence)+"\n")
                # Shuffle in place only after the ordered sentence has been written.
                random.shuffle(sentence)
                jumbled_file.write(" ".join(sentence)+"\n")

In [3]:
# Regenerate "jumbled.txt" and "unjumbled.txt" from "source.txt" (all relative to the current working directory).
generate_data_files()

In [4]:
def get_count_of_words(filename="../Datasets/Jumble_Unjumble/jumbled.txt"):
    '''
    Counts how often every whitespace-separated word occurs in a text file.

    filename = path of the file to scan; defaults to the jumbled sentences file
               so that existing zero-argument calls keep working.

    Returns a plain dict {word: number_of_occurrences}.
    '''
    word_count_dict = {}
    with open(filename,"r") as f:
        for line in f:
            for word in line.split():
                # A word seen for the first time starts from an implicit count of 0.
                word_count_dict[word] = word_count_dict.get(word,0) + 1
    return word_count_dict

In [5]:
# Word frequencies over the jumbled sentences file; the printed value is the number of distinct words.
word_count_dict = get_count_of_words()
print(len(word_count_dict))

8918


In [6]:
# Frequency threshold: only words occurring strictly more often than this are counted here.
min_word_count = 1
# Number of distinct words whose frequency is above the threshold.
count = sum(1 for frequency in word_count_dict.values() if frequency > min_word_count)
print(count)

5237


In [12]:
def replace_less_frequent_words(filename,word_count_dict,min_word_count,replace_token="<unk>",output_dir="../Datasets/Jumble_Unjumble/"):
    '''
    Writes a copy of a text file in which infrequent words are replaced by a token.

    filename        = path of the input file; its last "/"-separated component is
                      reused for the output name
    word_count_dict = {word: count}; words missing from it are treated as count 0
    min_word_count  = a word is kept only if its count is strictly greater than this
    replace_token   = token written in place of every word that is not kept
    output_dir      = directory receiving "processed_<input file name>"

    Each output line holds the words of the matching input line joined by single
    spaces and terminated by a newline (no trailing space before the newline).
    '''
    output_path = os.path.join(output_dir,"processed_"+filename.split("/")[-1])
    with open(filename,"r") as f:
        with open(output_path,"w") as f1:
            for line in f:
                sentence_to_write = [
                    # .get() keeps a word absent from the counts from raising KeyError.
                    word if word_count_dict.get(word,0) > min_word_count else replace_token
                    for word in line.strip().split()
                ]
                f1.write(" ".join(sentence_to_write)+"\n")

In [13]:
# Write the processed copy of the jumbled file, replacing words whose count is not above min_word_count with "<unk>".
replace_less_frequent_words("../Datasets/Jumble_Unjumble/jumbled.txt",word_count_dict,min_word_count=min_word_count,replace_token="<unk>")

In [14]:
# Same replacement for the unjumbled file, driven by the same word_count_dict so both files use one threshold.
replace_less_frequent_words("../Datasets/Jumble_Unjumble/unjumbled.txt",word_count_dict,min_word_count=min_word_count,replace_token="<unk>")

## Classes

In [10]:
class VocabBuilder:
    '''
    Builds word <-> index lookup tables from a list of sentences.

    After construction the tables are available as `word_to_index` and
    `index_to_word`.
    '''
    def __init__(self,text_corpus,unknown_token=None,pad_token=None,sos_token=None,eos_token=None):
        '''
        text_corpus = [
            sentence_1,  # sentence_1 = "a yellow car ..."
            sentence_2
            ...
        ]
        Special tokens that are not supplied (or are falsy) fall back to
        "<unk>", "<pad>", "<sos>" and "<eos>".
        '''
        self.text_corpus = text_corpus
        self.unknown_token = unknown_token if unknown_token else "<unk>"
        self.pad_token = pad_token if pad_token else "<pad>"
        self.sos_token = sos_token if sos_token else "<sos>"
        self.eos_token = eos_token if eos_token else "<eos>"
        self.word_to_index, self.index_to_word = self.get_vocabs()

    def get_vocabs(self):
        '''
        Returns (word_to_index, index_to_word).

        Corpus words receive consecutive indices in order of first appearance;
        the unknown, pad, sos and eos tokens are then appended, in that order,
        unless they already occur in the corpus.
        '''
        vocab = {}
        for sentence in self.text_corpus:
            for token in sentence.split():
                # len(vocab) is the next free index; setdefault leaves known words untouched.
                vocab.setdefault(token, len(vocab))
        for special in (self.unknown_token, self.pad_token, self.sos_token, self.eos_token):
            vocab.setdefault(special, len(vocab))
        reverse_vocab = dict((index, token) for token, index in vocab.items())
        return vocab, reverse_vocab

In [11]:
class EncodeDecode:
    '''
    Converts sentences to lists of vocabulary indices and back.

    word_to_index / index_to_word = lookup tables (e.g. from VocabBuilder)
    pad_token     = stored on the instance; not used by the methods below
    unknown_token = token whose index replaces out-of-vocabulary words
    smallcase     = when True, words are lower-cased before lookup
    '''
    def __init__(self,word_to_index,index_to_word,pad_token,unknown_token,smallcase=True):
        self.smallcase = smallcase
        self.word_to_index = word_to_index
        self.index_to_word = index_to_word
        self.pad_token = pad_token
        self.unknown_token = unknown_token
    
    def get_encoding(self,sentence):
        '''
        sentence can be a string, or a list of words.
        Returns the list of vocab indices, using the index of unknown_token
        for every word that is not in word_to_index.
        '''
        # split() (rather than split(" ")) tokenizes the same way VocabBuilder does:
        # repeated, leading or trailing whitespace no longer yields empty "words"
        # that would each be encoded as the unknown token.
        if isinstance(sentence,str): sentence = sentence.split()
        if self.smallcase: sentence =  [word.lower() for word in sentence]
        encoded_sentence = []
        for word in sentence:
            if word in self.word_to_index: encoded_sentence.append(self.word_to_index[word])
            # The unknown token is looked up lazily, only when it is actually needed.
            else: encoded_sentence.append(self.word_to_index[self.unknown_token])
        return encoded_sentence
    
    def get_decoding(self,encoded_sentence):
        '''
        encoded_sentence must be a list of vocab indices.
        Ex: encoded_sentence = [24,21,4,1,..] 
        Returns the words joined by single spaces; raises KeyError for an
        index that is not in index_to_word.
        '''
        sentence = [self.index_to_word[index] for index in encoded_sentence]
        return " ".join(sentence)

In [31]:
class UnjumbleEncoderModel(nn.Module):
    '''
    Encoder: embedding -> ReLU -> (optionally bidirectional) multi-layer GRU.

    forward() returns the full GRU hidden state together with a
    (1, batch, features) tensor meant to initialise the decoder.
    When debug is truthy, intermediate tensor shapes are printed.
    '''
    def __init__(self,vocab_size,embedding_dim,num_lstm_layers,hidden_size,make_bidirectional,debug):
        super().__init__()
        self.debug = debug
        self.bidirectional = make_bidirectional
        self.num_lstm_layers = num_lstm_layers
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.relu = nn.ReLU()
        self.gru = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_lstm_layers,
            dropout=0.5,
            bidirectional=make_bidirectional,
            batch_first=True,
        )

    def forward(self,x,h):
        '''
        x = tensor of vocab indices, h = initial GRU hidden state.
        Returns (final GRU hidden state, hidden state for the decoder).
        '''
        if self.debug:
            print("Before starting: x Shape:",x.shape,"Prev State Shape",h.shape)

        embedded = self.relu(self.embedding(x))
        if self.debug:
            print("Embedding, x Shape:",embedded.shape)

        outputs, final_hidden = self.gru(embedded, h)
        if self.debug:
            print("GRU, op Shape:",outputs.shape,"ht shape",final_hidden.shape)

        if self.bidirectional:
            # Join the last two slices of the hidden state along the feature axis.
            top_state = torch.cat([final_hidden[-1], final_hidden[-2]], dim=1)
        else:
            top_state = final_hidden[-1]
        # Restore a leading axis of size 1 so the result can seed a single-layer GRU.
        decoder_hidden = top_state.unsqueeze(0)
        if self.debug:
            print("ht for decoder shape",decoder_hidden.shape)

        return final_hidden, decoder_hidden

    def init_hidden(self):
        '''
        Returns an all-zero initial hidden state for a batch of size 1,
        shaped (num_layers * num_directions, 1, hidden_size).
        '''
        num_directions = 2 if self.bidirectional else 1
        return torch.zeros(self.num_lstm_layers * num_directions, 1, self.hidden_size)

class UnjumbleDecoderModel(nn.Module):
    '''
    Decoder: embedding -> ReLU -> GRU -> linear -> log-softmax over the vocabulary.

    forward() returns one row of log-probabilities per (batch, time-step)
    position together with the final GRU hidden state.
    When debug is truthy, intermediate tensor shapes are printed.
    '''
    def __init__(self,vocab_size,embedding_dim,num_lstm_layers,hidden_size,make_bidirectional,debug):
        super().__init__()
        self.debug = debug
        self.bidirectional = make_bidirectional
        self.num_lstm_layers = num_lstm_layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.relu = nn.ReLU()
        self.gru = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_lstm_layers,
            bidirectional=make_bidirectional,
            batch_first=True,
        )
        # A bidirectional GRU emits twice as many features per time-step.
        if make_bidirectional:
            self.in_features = hidden_size*2
        else:
            self.in_features = hidden_size
        self.linear = nn.Linear(in_features=self.in_features, out_features=vocab_size)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self,x,prev_state):
        '''
        x = tensor of vocab indices, prev_state = initial GRU hidden state.
        Returns (log-probabilities of shape (positions, vocab_size), final GRU hidden state).
        '''
        if self.debug:
            print("Before starting: x Shape:",x.shape,"Prev State Shape",prev_state.shape)

        embedded = self.relu(self.embedding(x))
        if self.debug:
            print("Embedding, x Shape:",embedded.shape)

        gru_out, ht = self.gru(embedded, prev_state)
        if self.debug:
            print("GRU, x Shape:",gru_out.shape,"ht shape",ht.shape)

        # Collapse every leading axis so the linear layer sees one row per position.
        flat = gru_out.reshape(-1, gru_out.shape[2])
        if self.debug:
            print("Reshaping x Shape:",flat.shape)

        logits = self.linear(flat)
        if self.debug:
            print("Linear x Shape:",logits.shape)

        op = self.log_softmax(logits)
        if self.debug:
            print("log_softmax op Shape:",op.shape)

        return op, ht
        

In [32]:
# Special tokens shared by the vocabularies and the encoders/decoders below.
unknown_token, pad_token, sos_token, eos_token = "<unk>", "<pad>", "<sos>", "<eos>"

# Encoder inputs: every jumbled sentence followed by the end-of-sentence token.
with open("processed_jumbled.txt","r") as f:
    Xe = [line.strip()+" " +eos_token for line in f]

# Decoder inputs are the ordered sentences prefixed with <sos>;
# the targets are the same sentences suffixed with <eos>.
with open("processed_unjumbled.txt","r") as f:
    unjumbled_sentences = [line.strip() for line in f]
Xd = [sos_token+" "+sentence for sentence in unjumbled_sentences]
Y = [sentence+" " +eos_token for sentence in unjumbled_sentences]
print(len(Xe),len(Xd),len(Y))

40460 40460 40460


### Putting it all together

In [33]:
# Hold out 10% of the aligned (encoder input, decoder input, target) triples for validation; random_state fixes the split.
Xtr_e, Xval_e, Xtr_d, Xval_d, Ytr, Yval = train_test_split(Xe,Xd,Y,test_size=0.1,random_state=20)
print(len(Xtr_e), len(Xval_e), len(Xtr_d), len(Xval_d), len(Ytr), len(Yval))
# Separate vocabularies are built from the training split only: one from the encoder inputs, one from the decoder inputs.
encoder_vocab_builder = VocabBuilder(Xtr_e,unknown_token=unknown_token,pad_token=pad_token,sos_token=sos_token,eos_token=eos_token)
decoder_vocab_builder = VocabBuilder(Xtr_d,unknown_token=unknown_token,pad_token=pad_token,sos_token=sos_token,eos_token=eos_token)
encoder_wtoi,encoder_itow = encoder_vocab_builder.word_to_index, encoder_vocab_builder.index_to_word
decoder_wtoi,decoder_itow = decoder_vocab_builder.word_to_index, decoder_vocab_builder.index_to_word
# Displayed as the cell output: the sizes of the four lookup tables.
len(encoder_itow),len(encoder_wtoi),len(decoder_wtoi),len(decoder_itow)

36414 4046 36414 4046 36414 4046


(3005, 3005, 3005, 3005)

In [34]:
# Debug-sized setup: both models print intermediate tensor shapes (debug=True).
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
hidden_size_encoder = 400
hidden_size_decoder = hidden_size_encoder
model_encoder = UnjumbleEncoderModel(
    vocab_size=len(encoder_wtoi),embedding_dim=1000,num_lstm_layers=2,
    hidden_size=hidden_size_encoder,make_bidirectional=True,debug=True
).to(device)
# A bidirectional encoder hands the decoder a state twice as wide as its own hidden size.
if model_encoder.bidirectional: hidden_size_decoder = 2*hidden_size_encoder
# The decoder's inputs and targets are indexed with the decoder vocabulary, so its
# embedding table and output layer must be sized from decoder_wtoi (not encoder_wtoi).
model_decoder = UnjumbleDecoderModel(
    vocab_size=len(decoder_wtoi),embedding_dim=1000,num_lstm_layers=1,
    hidden_size=hidden_size_decoder,make_bidirectional=False,debug=True
).to(device)
# The decoder emits log-probabilities, which is the input NLLLoss expects.
loss_fn = nn.NLLLoss()
optimizer_encoder = torch.optim.Adam(model_encoder.parameters(),lr=0.003)
optimizer_decoder = torch.optim.Adam(model_decoder.parameters(),lr=0.003)

In [35]:
# Smoke test: run two consecutive single-sentence training steps and print shapes.
data_index = 6
encoder_encode_decode = EncodeDecode(encoder_wtoi,encoder_itow,pad_token,unknown_token)
decoder_encode_decode = EncodeDecode(decoder_wtoi,decoder_itow,pad_token,unknown_token)
print(Xtr_e[data_index],encoder_encode_decode.get_encoding(Xtr_e[data_index]))
print(Xtr_d[data_index],decoder_encode_decode.get_encoding(Xtr_d[data_index]))
print(Ytr[data_index],decoder_encode_decode.get_encoding(Ytr[data_index]))

init_ht_for_encoder = model_encoder.init_hidden().to(device)
model_encoder.train()
model_decoder.train()

# The first step starts from the zero state; the second step is fed the
# (detached) encoder state left behind by the first one.
ht = init_ht_for_encoder
for offset in (0, 1):
    if offset: print("---------------------------------------------")
    sample_index = data_index + offset
    optimizer_encoder.zero_grad()
    optimizer_decoder.zero_grad()
    Xe_b = torch.tensor([encoder_encode_decode.get_encoding(Xtr_e[sample_index])]).to(device)
    Xd_b = torch.tensor([decoder_encode_decode.get_encoding(Xtr_d[sample_index])]).to(device)
    Y_b = torch.tensor([decoder_encode_decode.get_encoding(Ytr[sample_index])]).to(device)
    print(Xe_b.shape,Xd_b.shape,Y_b.shape)
    ht,ht_for_decoder = model_encoder(Xe_b,ht)
    op,_ = model_decoder(Xd_b,ht_for_decoder)
    # Cut the carried state out of the autograd graph before it is reused.
    ht = ht.detach()
    loss = loss_fn(op,Y_b.reshape(-1))
    loss.backward()
    optimizer_encoder.step()
    optimizer_decoder.step()

a snowy mountain . man descends a <eos> [11, 47, 48, 2, 49, 50, 11, 9]
<sos> a man descends a snowy mountain . [0, 10, 47, 48, 10, 49, 50, 9]
a man descends a snowy mountain . <eos> [10, 47, 48, 10, 49, 50, 9, 3004]
torch.Size([1, 8]) torch.Size([1, 8]) torch.Size([1, 8])
Before starting: x Shape: torch.Size([1, 8]) Prev State Shape torch.Size([4, 1, 400])
Embedding, x Shape: torch.Size([1, 8, 1000])
GRU, op Shape: torch.Size([1, 8, 800]) ht shape torch.Size([4, 1, 400])
ht for decoder shape torch.Size([1, 1, 800])
Before starting: x Shape: torch.Size([1, 8]) Prev State Shape torch.Size([1, 1, 800])
Embedding, x Shape: torch.Size([1, 8, 1000])
GRU, x Shape: torch.Size([1, 8, 800]) ht shape torch.Size([1, 1, 800])
Reshaping x Shape: torch.Size([8, 800])
Linear x Shape: torch.Size([8, 3005])
log_softmax op Shape: torch.Size([8, 3005])
---------------------------------------------
torch.Size([1, 9]) torch.Size([1, 9]) torch.Size([1, 9])
Before starting: x Shape: torch.Size([1, 9]) Prev St

In [36]:
def predict(model_encoder,model_decoder,encoder_encode_decode,decoder_itow,decoder_wtoi,max_words=25):
    '''
    Greedily unjumbles one randomly chosen validation sentence and prints the result.

    model_encoder / model_decoder = the encoder and decoder networks (switched to eval mode here)
    encoder_encode_decode = EncodeDecode used to turn the jumbled sentence into indices
    decoder_itow / decoder_wtoi = decoder vocabulary lookups (must contain "<sos>" and "<eos>")
    max_words = upper bound on the number of generated words

    Reads the notebook globals Xval_e, Xval_d and device. Prints the jumbled
    sentence with its encoding, the reference sentence and the predicted word
    list; returns nothing.
    '''
    # Sample among the first 101 validation sentences, but never beyond the end
    # of the list, so a small validation set cannot raise IndexError.
    data_index = random.randint(0,min(100,len(Xval_e)-1))
    Xe_b = torch.tensor([encoder_encode_decode.get_encoding(Xval_e[data_index])]).to(device)
    print(Xval_e[data_index],Xe_b)
    print(Xval_d[data_index])
    
    model_encoder.eval()
    model_decoder.eval()
    with torch.no_grad():
        init_ht_for_encoder = model_encoder.init_hidden().to(device)
        ht,ht_for_decoder = model_encoder(Xe_b,init_ht_for_encoder)
        # Decoding starts from the <sos> token and the encoder's summary state.
        sos_word = torch.tensor([[decoder_wtoi["<sos>"]]]).to(device)
        op,ht = model_decoder(sos_word,ht_for_decoder)
        unjumbled_sentence = []
        for _ in range(max_words):
            # Greedy choice: the most probable word becomes the next decoder input.
            predicted_word = torch.argmax(op,axis=1).tolist()
            if predicted_word[0] == decoder_wtoi["<eos>"]: break
            unjumbled_sentence.append(decoder_itow[predicted_word[0]])
            op,ht = model_decoder(torch.tensor([predicted_word]).to(device),ht)
        print("_______________________________________")
        print(unjumbled_sentence)

In [37]:
# Try greedy decoding on a random validation sentence with the current models.
predict(model_encoder,model_decoder,encoder_encode_decode,decoder_itow,decoder_wtoi)

pool spider-man boy a the wearing <unk> at of . edge young the <eos> tensor([[ 193, 1453,   12,   11,    3,  106,    5,    8,    4,    2,  740,   27,
            3,    9]], device='cuda:0')
<sos> a young boy wearing spider-man <unk> at the edge of the pool .
Before starting: x Shape: torch.Size([1, 14]) Prev State Shape torch.Size([4, 1, 400])
Embedding, x Shape: torch.Size([1, 14, 1000])
GRU, op Shape: torch.Size([1, 14, 800]) ht shape torch.Size([4, 1, 400])
ht for decoder shape torch.Size([1, 1, 800])
Before starting: x Shape: torch.Size([1, 1]) Prev State Shape torch.Size([1, 1, 800])
Embedding, x Shape: torch.Size([1, 1, 1000])
GRU, x Shape: torch.Size([1, 1, 800]) ht shape torch.Size([1, 1, 800])
Reshaping x Shape: torch.Size([1, 800])
Linear x Shape: torch.Size([1, 3005])
log_softmax op Shape: torch.Size([1, 3005])
Before starting: x Shape: torch.Size([1, 1]) Prev State Shape torch.Size([1, 1, 800])
Embedding, x Shape: torch.Size([1, 1, 1000])
GRU, x Shape: torch.Size([1, 1, 800

## For Actual Training

In [38]:
# Fresh models for the real training run (shape printing disabled).
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
hidden_size_encoder = 300
hidden_size_decoder = hidden_size_encoder
model_encoder = UnjumbleEncoderModel(
    vocab_size=len(encoder_wtoi),embedding_dim=1000,num_lstm_layers=2,
    hidden_size=hidden_size_encoder,make_bidirectional=True,debug=False
).to(device)
# A bidirectional encoder hands the decoder a state twice as wide as its own hidden size.
if model_encoder.bidirectional: hidden_size_decoder = 2*hidden_size_encoder
# The decoder's inputs and targets are indexed with the decoder vocabulary, so its
# embedding table and output layer must be sized from decoder_wtoi (not encoder_wtoi).
model_decoder = UnjumbleDecoderModel(
    vocab_size=len(decoder_wtoi),embedding_dim=1000,num_lstm_layers=1,
    hidden_size=hidden_size_decoder,make_bidirectional=False,debug=False
).to(device)
# The decoder emits log-probabilities, which is the input NLLLoss expects.
loss_fn = nn.NLLLoss()
optimizer_encoder = torch.optim.Adam(model_encoder.parameters(),lr=0.001)
optimizer_decoder = torch.optim.Adam(model_decoder.parameters(),lr=0.001)
epochs = 10

In [40]:
# Train on one sentence at a time; after each epoch print the summed loss and a sample prediction.
for epoch in range(epochs):
    init_ht_for_encoder = model_encoder.init_hidden().to(device)
    # predict() leaves both models in eval mode, so training mode is restored every epoch.
    model_encoder.train()
    model_decoder.train()
    epoch_loss = 0
    for step, (jumbled, decoder_input, target) in enumerate(zip(Xtr_e, Xtr_d, Ytr)):
        for optimizer in (optimizer_encoder, optimizer_decoder):
            optimizer.zero_grad()
        Xe_b = torch.tensor([encoder_encode_decode.get_encoding(jumbled)]).to(device)
        Xd_b = torch.tensor([decoder_encode_decode.get_encoding(decoder_input)]).to(device)
        Y_b = torch.tensor([decoder_encode_decode.get_encoding(target)]).to(device)
        # Every sentence starts from the same zero state; the encoder state is not carried over.
        ht,ht_for_decoder = model_encoder(Xe_b,init_ht_for_encoder)
        op,_ = model_decoder(Xd_b,ht_for_decoder)
        loss = loss_fn(op,Y_b.reshape(-1))
        loss.backward()
        for optimizer in (optimizer_encoder, optimizer_decoder):
            optimizer.step()
        batch_loss = loss.item()
        epoch_loss += batch_loss
        if step%2000 == 0:
            print("Epoch:",epoch,"Batch:",step,"Loss:",batch_loss)
    print("______________________________________")
    print("Epoch Loss:",epoch_loss)
    predict(model_encoder,model_decoder,encoder_encode_decode,decoder_itow,decoder_wtoi)
    print("_______________________________________")

Epoch: 0 Batch: 0 Loss: 2.549741268157959
Epoch: 0 Batch: 2000 Loss: 2.512336254119873
Epoch: 0 Batch: 4000 Loss: 0.63258957862854
Epoch: 0 Batch: 6000 Loss: 2.558344841003418
Epoch: 0 Batch: 8000 Loss: 2.824500799179077
Epoch: 0 Batch: 10000 Loss: 2.8001487255096436
Epoch: 0 Batch: 12000 Loss: 4.142967700958252
Epoch: 0 Batch: 14000 Loss: 2.927480459213257
Epoch: 0 Batch: 16000 Loss: 3.238931894302368
Epoch: 0 Batch: 18000 Loss: 1.7174347639083862
Epoch: 0 Batch: 20000 Loss: 2.857111930847168
Epoch: 0 Batch: 22000 Loss: 3.4794559478759766
Epoch: 0 Batch: 24000 Loss: 2.9830617904663086
Epoch: 0 Batch: 26000 Loss: 0.9317401051521301
Epoch: 0 Batch: 28000 Loss: 1.9379465579986572
Epoch: 0 Batch: 30000 Loss: 1.0846409797668457
Epoch: 0 Batch: 32000 Loss: 3.3337976932525635
Epoch: 0 Batch: 34000 Loss: 3.112779378890991
Epoch: 0 Batch: 36000 Loss: 3.6252198219299316
______________________________________
Epoch Loss: 91561.87906358391
through of very a <unk> ears with brown floppy . grass a 

KeyboardInterrupt: 