In [None]:
import gensim
import nltk
import numpy as np
import pandas as pd
import gzip
import torch
from nltk.corpus import brown
from torchtext.datasets import SQuAD1
import string
import torch.nn as nn
import random 

# English Snowball stemmer shared by the whole notebook; prepare_text() uses it
# to stem every whitespace-separated word.
stemmer = nltk.stem.snowball.SnowballStemmer('english')

# Fetch the NLTK resources: the Brown corpus (read through `brown.sents()` below)
# and the 'punkt' tokenizer data (not referenced directly in the visible cells).
nltk.download('brown')
nltk.download('punkt')

# Train Word2Vec embeddings on the Brown corpus sentences with gensim's default
# settings, save them to 'brown.embedding', then load them back from that file.

model = gensim.models.Word2Vec(brown.sents())
model.save('brown.embedding')

# `w2v` is the reloaded copy of the model saved just above.
w2v = gensim.models.Word2Vec.load('brown.embedding')

In [None]:
# Reserved indices for the sequence boundary markers; they match the keys that
# Vocab pre-populates in index2word (0 -> "SOS", 1 -> "EOS").
SOS_token = 0
EOS_token = 1

class Vocab:
    '''
    Word <-> index vocabulary with per-word occurrence counts.

    Indices 0 and 1 are reserved for the "SOS" and "EOS" markers; every other
    word receives the next free index, starting at 2.

    Attributes:
        word2index: maps a word to its integer index (markers included)
        word2count: maps a word to the number of times add_words has seen it
        index2word: inverse of word2index
        words_count: total number of entries, markers included
    '''

    def __init__(self):
        # Both directions are seeded with the markers so that a literal
        # "SOS"/"EOS" token in a sentence resolves to its reserved index
        # instead of being registered a second time.
        self.word2index = {"SOS": 0, "EOS": 1}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        # Numbering continues after the reserved entries; starting at 0 would
        # overwrite the markers in index2word.
        self.words_count = len(self.index2word)

    def add_words(self, sentence):
        '''
        Register every word of a sentence in the vocabulary.

        Input: sentence, either a string (split on single spaces) or an
               iterable of already tokenized words
        Output: None; word2index, index2word, word2count and words_count are
                updated in place
        '''
        words = sentence.split(" ") if isinstance(sentence, str) else sentence
        for word in words:
            # Splitting on " " yields empty strings for repeated, leading or
            # trailing spaces; those are not words.
            if not word:
                continue
            if word not in self.word2index:
                self.word2index[word] = self.words_count
                self.index2word[self.words_count] = word
                self.words_count += 1
            self.word2count[word] = self.word2count.get(word, 0) + 1

In [None]:
def loadDF(path):
    '''
    Load the SQuAD1 'train' and 'dev' splits and return them as Pandas DataFrames.

    Input: path, passed to SQuAD1 as the dataset root
    Output: (df_train, df_dev), one DataFrame per split
    '''
    train_split, dev_split = SQuAD1(root=path, split=('train', 'dev'))

    # Each split is materialised into its own frame, train first and dev second.
    return pd.DataFrame.from_dict(train_split), pd.DataFrame.from_dict(dev_split)


def prepare_text(sentence):
    '''
    Clean a sentence and return it as a list of word tokens.

    Steps: drop every character found in string.punctuation, lower-case the
    remaining characters, stem each whitespace-separated word with the
    module-level Snowball stemmer, then tokenize on runs of word characters.
    https://www.nltk.org/api/nltk.tokenize.html
    '''
    # Filter and lower-case character by character.
    kept_chars = []
    for char in sentence:
        if char not in string.punctuation:
            kept_chars.append(char.lower())
    cleaned = ''.join(kept_chars)

    # Stem word by word, then rebuild a single space-separated string.
    stemmed_words = [stemmer.stem(word) for word in cleaned.split()]
    stemmed = ' '.join(stemmed_words)

    word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    return word_tokenizer.tokenize(stemmed)



def train_test_split(SRC, TRG, test_ratio=0.2, shuffle=True, seed=None):
    '''
    Split paired source/target examples into training and test subsets.

    Input: SRC, our list of questions from the dataset
            TRG, our list of responses from the dataset (same length as SRC)
            test_ratio, fraction of the pairs held out for testing, in [0, 1]
            shuffle, whether to shuffle the pairs before splitting
            seed, optional seed for a reproducible shuffle; when None the
                  module-level `random` generator is used, so a prior
                  random.seed(...) call still controls the result
    Output: Training and test datasets for SRC & TRG, as four lists in the
            order (SRC_train, SRC_test, TRG_train, TRG_test); the i-th entry
            of a SRC list is always paired with the i-th entry of the
            matching TRG list
    Raises: ValueError if SRC and TRG differ in length or test_ratio lies
            outside [0, 1]
    '''
    # Materialise once so any iterable (list, pandas Series, ...) is accepted.
    src_items = list(SRC)
    trg_items = list(TRG)

    if len(src_items) != len(trg_items):
        raise ValueError(
            'SRC and TRG must have the same length, got %d and %d'
            % (len(src_items), len(trg_items))
        )
    if not 0 <= test_ratio <= 1:
        raise ValueError('test_ratio must be between 0 and 1, got %r' % (test_ratio,))

    # Shuffle indices rather than the data so SRC and TRG stay aligned.
    indices = list(range(len(src_items)))
    if shuffle:
        rng = random if seed is None else random.Random(seed)
        rng.shuffle(indices)

    test_count = int(round(len(indices) * test_ratio))
    split_at = len(indices) - test_count
    train_indices, test_indices = indices[:split_at], indices[split_at:]

    SRC_train_dataset = [src_items[i] for i in train_indices]
    SRC_test_dataset = [src_items[i] for i in test_indices]
    TRG_train_dataset = [trg_items[i] for i in train_indices]
    TRG_test_dataset = [trg_items[i] for i in test_indices]

    return SRC_train_dataset, SRC_test_dataset, TRG_train_dataset, TRG_test_dataset


In [None]:
class Encoder(nn.Module):
    '''
    Embeds one source token and advances a single-layer LSTM by one time step.
    '''

    def __init__(self, input_size, hidden_size):
        '''
        Inputs: input_size, number of rows of the embedding table
                hidden_size, width of both the embedding and the LSTM state
        '''
        super(Encoder, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size

        # self.embedding provides a vector representation of the inputs to our model
        self.embedding = nn.Embedding(self.input_size, self.hidden_size)
        # self.lstm, accepts the vectorized input and passes a hidden state
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size)

    def forward(self, inputs, hidden=None, cell_state=None):
        '''
        Inputs: inputs, the index of a single src token (tensor with one element)
                hidden, optional previous hidden state, shape (1, 1, hidden_size)
                cell_state, optional previous cell state, shape (1, 1, hidden_size)
        Outputs: outputs, the encoder output for this step
                hidden, the new hidden state
                cell_state, the new cell state

        When neither state is given the LSTM starts from its default zero
        state; when only one is given the other is filled with zeros.
        '''
        # (seq_len=1, batch=1, hidden_size) is the layout nn.LSTM expects here.
        embedded = self.embedding(inputs).view(1, 1, -1)

        if hidden is None and cell_state is None:
            state = None
        else:
            if hidden is None:
                hidden = torch.zeros_like(cell_state)
            if cell_state is None:
                cell_state = torch.zeros_like(hidden)
            state = (hidden, cell_state)

        outputs, (hidden, cell_state) = self.lstm(embedded, state)

        return outputs, hidden, cell_state


class Decoder(nn.Module):
    '''
    Embeds one target token, advances a single-layer LSTM by one step and
    predicts log-probabilities over the output vocabulary.
    '''

    def __init__(self, hidden_size, output_size):
        '''
        Inputs: hidden_size, width of both the embedding and the LSTM state
                output_size, number of classes predicted at each step
        '''
        super(Decoder, self).__init__()

        self.hidden_size = hidden_size
        self.output_size = output_size

        # self.embedding provides a vector representation of the target to our model
        self.embedding = nn.Embedding(output_size, self.hidden_size)
        # self.lstm, accepts the embeddings and outputs a hidden state
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size)
        # self.fc + self.softmax, predict on the LSTM output via a linear output layer
        self.fc = nn.Linear(self.hidden_size, self.output_size)
        self.softmax = nn.LogSoftmax(dim= 1)

    def forward(self, inputs, hidden):
        '''
        Inputs: inputs, the index of a single target token (tensor with one element)
                hidden, the LSTM state as a (hidden_state, cell_state) tuple, or None
        Outputs: outputs, log-probabilities of shape (1, output_size)
                hidden, the updated (hidden_state, cell_state) tuple
        '''
        embedded = self.embedding(inputs).view(1, 1, -1)
        outputs, hidden = self.lstm(embedded, hidden)
        # outputs[0] drops the length-1 sequence axis so LogSoftmax(dim=1)
        # normalises over the vocabulary.
        outputs = self.softmax(self.fc(outputs[0]))

        return outputs, hidden


class Seq2Seq(nn.Module):
    '''
    Encoder-decoder model: the encoder reads the source one token at a time
    and its final LSTM state initialises the decoder, which then emits one
    prediction per target position.
    '''

    def __init__(self, encoder_input_size, encoder_hidden_size, decoder_hidden_size, decoder_output_size):
        '''
        Inputs: encoder_input_size, size of the source embedding table
                encoder_hidden_size, encoder LSTM width
                decoder_hidden_size, decoder LSTM width (must equal encoder_hidden_size)
                decoder_output_size, size of the target vocabulary
        Raises: ValueError if the two hidden sizes differ
        '''
        super(Seq2Seq, self).__init__()

        # The encoder state is handed to the decoder unchanged, so the widths
        # have to agree; failing here is clearer than a shape error in forward().
        if encoder_hidden_size != decoder_hidden_size:
            raise ValueError(
                'encoder_hidden_size (%r) and decoder_hidden_size (%r) must match'
                % (encoder_hidden_size, decoder_hidden_size)
            )

        self.input_size = encoder_input_size
        self.en_hidden_size = encoder_hidden_size
        self.de_hidden_size = decoder_hidden_size
        self.output_size = decoder_output_size

        self.encoder = Encoder(self.input_size, self.en_hidden_size)
        self.decoder = Decoder(self.de_hidden_size, self.output_size)

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        '''
        Inputs: src, sequence of source token indices (1-D tensor or list)
                trg, sequence of target token indices (1-D tensor or list);
                     its length fixes the number of decoding steps
                teacher_forcing_ratio, probability, in training mode, of
                     feeding the ground-truth token instead of the prediction
        Outputs: dict whose 'decoder_output' entry is a list with one
                 (1, output_size) log-probability tensor per target position
        '''
        outputs = {
            'decoder_output':[]
        }

        # Run on whatever device the model parameters live on.
        device = self.encoder.embedding.weight.device

        encoder_hidden = torch.zeros(1, 1, self.en_hidden_size, device=device) # 1 = number of LSTM layers
        cell_state = torch.zeros(1, 1, self.en_hidden_size, device=device)

        # Thread the LSTM state through every source token.
        for i in range(len(src)):
            token = torch.as_tensor(src[i], dtype=torch.long, device=device)
            _, encoder_hidden, cell_state = self.encoder(token, encoder_hidden, cell_state)

        decoder_input = torch.tensor([[0]], dtype=torch.long, device=device) # 0 = SOS_token
        decoder_state = (encoder_hidden, cell_state)

        for i in range(len(trg)):
            decoder_output, decoder_state = self.decoder(decoder_input, decoder_state)
            outputs['decoder_output'].append(decoder_output)

            # Teacher forcing applies only while training, with the given probability.
            if self.training and random.random() < teacher_forcing_ratio:
                decoder_input = torch.as_tensor(trg[i], dtype=torch.long, device=device)
            else:
                # Greedy decoding: feed back the most likely token.
                decoder_input = decoder_output.argmax(1).detach()

        return outputs

In [None]:
# Load the SQuAD1 train/dev splits into DataFrames; '.data' is handed to loadDF
# as the dataset root.
df_train, df_dev = loadDF('.data')
df_train.head()

In [None]:
# Give the four columns of both splits readable names.
# NOTE(review): the fourth column is left as "?" because its meaning is not
# established in this notebook; presumably the answer start offsets, TODO confirm.
feature = ["Sentence", "Question", "Answer", "?"]

df_train.columns = feature
df_dev.columns = feature

df_train.head()

In [None]:
# Keep only the question/answer pair. The explicit .copy() makes each result an
# independent frame, so the column assignments in the following cells modify it
# directly instead of acting on a slice (pandas' SettingWithCopyWarning).
df_train = df_train[["Question", "Answer"]].copy()
df_dev = df_dev[["Question", "Answer"]].copy()

df_train.head()

In [None]:
# df_train_clean
df_train["Question"] = df_train["Question"].astype(str)
df_train["Answer"] = df_train["Answer"].astype(str)

df_train.head()

In [None]:
# Strip the bracket and quote characters from the stringified answers.
# regex=False forces literal matching: "[" and "]" are regex metacharacters and
# the default value of `regex` differs between pandas versions.
df_train["Answer"] = df_train["Answer"].str.replace("[", "", regex=False)
df_train["Answer"] = df_train["Answer"].str.replace("]", "", regex=False)
df_train["Answer"] = df_train["Answer"].str.replace("'", "", regex=False)

df_train.head()

In [None]:
# df_dev_clean
df_dev["Question"] = df_dev["Question"].astype(str)
df_dev["Answer"] = df_dev["Answer"].astype(str)

df_dev.head()

In [None]:
# Strip the bracket and quote characters from the stringified answers.
# regex=False forces literal matching: "[" and "]" are regex metacharacters and
# the default value of `regex` differs between pandas versions.
df_dev["Answer"] = df_dev["Answer"].str.replace("[", "", regex=False)
df_dev["Answer"] = df_dev["Answer"].str.replace("]", "", regex=False)
df_dev["Answer"] = df_dev["Answer"].str.replace("'", "", regex=False)

df_dev.head()

In [None]:
# Rename the two remaining columns to their seq2seq roles: the question column
# becomes SRC (model input) and the answer column becomes TRG (model target).
feature = ["SRC", "TRG"]
df_train.columns = feature
df_dev.columns = feature

df_train.head()

In [None]:
# Build a copy of each question wrapped in the textual boundary markers
# "SOS" ... "EOS" and store it in a new column, leaving SRC untouched.
# NOTE(review): the markers are plain text at this point; if this column is
# later run through prepare_text (which lower-cases and stems every word) they
# will not survive as "SOS"/"EOS". Confirm that is intended.
df_train["new_SRC"] = "SOS " + df_train["SRC"] + " EOS"
df_dev["new_SRC"] = "SOS " + df_dev["SRC"] + " EOS"

df_train.head()

In [None]:
# Tokenise the raw question first and attach the boundary markers afterwards.
# Passing text that already contains "SOS"/"EOS" through prepare_text would
# lower-case and stem the markers, so they would no longer match the
# "SOS"/"EOS" entries reserved in the vocabulary.
df_train['new_SRC'] = df_train['SRC'].apply(lambda question: ['SOS'] + prepare_text(question) + ['EOS'])
df_dev['new_SRC'] = df_dev['SRC'].apply(lambda question: ['SOS'] + prepare_text(question) + ['EOS'])

df_train.head()

In [None]:
# Replace each answer string with its cleaned token list (see prepare_text).
# No SOS/EOS markers are added to TRG in this cell.
df_train['TRG'] = df_train['TRG'].apply(prepare_text)
df_dev['TRG'] = df_dev['TRG'].apply(prepare_text)

df_train.head()

In [None]:
# Expose the model inputs (new_SRC) and targets (TRG) of each split under
# dedicated names, one split per line.
SRC_train_dataset, TRG_train_dataset = df_train["new_SRC"], df_train["TRG"]
SRC_dev_dataset, TRG_dev_dataset = df_dev["new_SRC"], df_dev["TRG"]

SRC_train_dataset