### DATA CLEANING/PREPROCESSING

In [None]:
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

#Function that removes stop-words, punctuations, number and lower-casing
def extract_sentence(sent):
    stop_words = set(stopwords.words('english'))
    
    sent = sent.lower() #Lower casing
    sent = re.sub(re.compile(r'<.*?>'),'', sent)
    tokens = [w for w in sent.split() if not w in stop_words] #remove stop-words
    sent = (" ".join(tokens)).strip()
    
    sent = re.sub(r'\d','', sent) #remove number
    sent = re.sub(r'[^\w\s]','',sent) #remove punctuation
    return sent

In [None]:
import pandas as pd

df = pd.read_csv('newdataset1.csv')

newreview = df['review']
newsummary = df['summary']

clean_review = []
clean_summary = []
for i in range(len(newreview)):
    clean_review.append(extract_sentence(newreview[i]))
    clean_summary.append(extract_sentence(newsummary[i]))

In [None]:
newdata = pd.DataFrame(clean_review, columns=['review'])
newdata['summary'] = clean_summary
newdata

### SPLIT TRAIN AND TEST

In [None]:
from sklearn.model_selection import train_test_split

#split train and test
train, test = train_test_split(newdata, test_size = 0.2)
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

### WORDS TO NUMBERS

In [1]:
from torchtext.data import Field, TabularDataset, BucketIterator, BPTTIterator, Iterator
import spacy
import torch

spacy = spacy.load('en')

def tokenize(text):
    return [tok.text for tok in spacy.tokenizer(text)]

review = Field(
    sequential=True, use_vocab = True,tokenize=tokenize,lower=True
)
summary = Field(
    sequential=True, use_vocab = True,tokenize=tokenize,
    lower=True,init_token = '<sos>',eos_token = '<eos>'
)

field = [('review',review),('summary', summary)]

train_x,test_x = TabularDataset.splits(
    path='', skip_header=True, train = 'train.csv', test = 'test.csv',
    format = 'csv', fields= field)

In [2]:
review.build_vocab(train_x, max_size=10000, min_freq=2, vectors='glove.6B.100d')
summary.build_vocab(train_x, max_size=10000, min_freq=2, vectors='glove.6B.100d')

train_iter, test_iter = BucketIterator.splits((train_x, test_x), batch_size = 10, 
                                              sort_key=lambda x:len(x.review), sort_within_batch = True )

### MODEL

In [4]:
from torch import nn
from torch import optim
import random

class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, n_layers, dropout):
        super(Encoder, self).__init__()
        
        self.hid_size = hidden_size #hidden_size should be the same as decor
        self.n_layers = n_layers
        
        self.dropout = nn.Dropout(dropout)
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.rnn = nn.LSTM(emb_size, hidden_size, n_layers, dropout = dropout)

    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

In [5]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, output_size, n_layers, dropout):
        super(Decoder, self).__init__()
        
        self.output_size = output_size
        self.hid_size = hidden_size #hidden_size should be the same as encoder
        self.n_layers = n_layers
        
        self.dropout = nn.Dropout(dropout)
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.rnn = nn.LSTM(emb_size, hidden_size, n_layers, dropout = dropout)
        self.fc_out = nn.Linear(hidden_size, output_size)
        
    def forward(self, x, hidden, cell):
        
        x = x.unsqueeze(0) #making x: (1,N)
    
        embedded = self.dropout(self.embedding(x))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output)
        
        prediction = prediction.squeeze(0)

        return prediction, hidden, cell

In [6]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        
        self.encoder = encoder
        self.decoder = decoder

        
    def forward(self, review, summary, teacher_forcing_ratio = 0.5):

        batch_size = summary.shape[1]
        summary_len = summary.shape[0]
        summary_vocab_size = self.decoder.output_size #len(summary.vocab)

        outputs = torch.zeros(summary_len, batch_size, summary_vocab_size)
        
        hidden, cell = self.encoder(review)
        
        #start token <sos>
        x = summary[0,:]
        
        for t in range(1, summary_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            
            outputs[t] = output #store outputs of decoder

            pred = output.argmax(1) #best guess
            
            if random.random() < teacher_forcing_ratio:
                x = summary[t] 
            else:
                x = pred
        
        return outputs

### TRAINING

In [7]:
#training function
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        review = batch.review
        summary = batch.summary
        
        optimizer.zero_grad()
        
        output = model(review, summary)
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        summary = summary[1:].view(-1)

        
        loss = criterion(output, summary)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [8]:
#evaluation function
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            review = batch.review
            summary = batch.summary

            output = model(review, summary, 0) 

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            summary = summary[1:].view(-1)

            loss = criterion(output, summary)
            
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [9]:
#function to output summary
def summarize(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            review = batch.review
            summary = batch.summary

            output = model(review, summary, 0) 

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            summary = summary[1:].view(-1)

            loss = criterion(output, summary)
            
            epoch_loss += loss.item()
        
    return output

In [10]:
num_epochs = 20
learning_rate = 0.001
vocab_size_encoder = len(review.vocab)
vocab_size_decoder = len(review.vocab)
output_size = len(summary.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2 #2 or 4 or more
dropout = 0.5

encoder = Encoder(vocab_size_encoder, encoder_embedding_size, hidden_size, num_layers, dropout)
decoder = Decoder(vocab_size_decoder, decoder_embedding_size, hidden_size, output_size,
                      num_layers, dropout)

model = Seq2Seq(encoder, decoder)

In [11]:
pad_idx = review.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)

In [12]:
import time

#timer function to keep track of duration of each epoch
def timer(tic, toc):
    time = toc - tic
    mins = int(time / 60)
    sec = int(time - (mins * 60))
    return mins, sec

### RUNNING

In [14]:
#initialize the weight
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(6008, 300)
    (rnn): LSTM(300, 1024, num_layers=2, dropout=0.5)
  )
  (decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(6008, 300)
    (rnn): LSTM(300, 1024, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=1024, out_features=704, bias=True)
  )
)

In [15]:
#training using train.csv

num_epochs = 20
CLIP = 1

for epoch in range(num_epochs):
    
    tic = time.time()
    
    train_loss = train(model, train_iter, optimizer, criterion, CLIP)
    
    toc = time.time()
    
    mins, sec = timer(tic, toc)
    

    torch.save(model.state_dict(), 'modelseq2seq.pt')
    
    print(f'Epoch: {epoch+1} | Time: {mins}m {sec}s')
    print(f'\tTrain Loss: {train_loss:.3f}')

Epoch: 1 | Time: 3m 38s
	Train Loss: 7.051
Epoch: 2 | Time: 2m 33s
	Train Loss: 5.218
Epoch: 3 | Time: 2m 34s
	Train Loss: 4.578
Epoch: 4 | Time: 2m 34s
	Train Loss: 4.333
Epoch: 5 | Time: 2m 33s
	Train Loss: 4.260
Epoch: 6 | Time: 2m 40s
	Train Loss: 4.226
Epoch: 7 | Time: 2m 40s
	Train Loss: 4.185
Epoch: 8 | Time: 2m 33s
	Train Loss: 4.163
Epoch: 9 | Time: 2m 34s
	Train Loss: 4.152
Epoch: 10 | Time: 2m 35s
	Train Loss: 4.130
Epoch: 11 | Time: 2m 41s
	Train Loss: 4.112
Epoch: 12 | Time: 2m 43s
	Train Loss: 4.108
Epoch: 13 | Time: 2m 44s
	Train Loss: 4.087
Epoch: 14 | Time: 2m 48s
	Train Loss: 4.068
Epoch: 15 | Time: 2m 43s
	Train Loss: 4.057
Epoch: 16 | Time: 2m 45s
	Train Loss: 4.042
Epoch: 17 | Time: 2m 52s
	Train Loss: 4.032
Epoch: 18 | Time: 2m 46s
	Train Loss: 4.018
Epoch: 19 | Time: 2m 52s
	Train Loss: 4.006
Epoch: 20 | Time: 2m 51s
	Train Loss: 3.990


### EVALUATING

In [24]:
#evaluation using test.csv
model.load_state_dict(torch.load('modelseq2seq.pt'))
test_loss = evaluate(model, test_iter, criterion)

print(f' Test Loss: {test_loss:.3f}')

 Test Loss: 3.447


In [21]:
#function to convert output of model into words
def tensor_to_sentence(output):
    output = torch.round(output)
    trans_sent = []
    xsumm = []

    for i in range(len(output)):
        xsumm.append(int(output[i]))

    for i in xsumm:
        convert = review.vocab.itos[i]
        trans_sent.append(convert)
        
    trans_sent = " ".join(trans_sent)
    return trans_sent

In [23]:
#summary test
model.load_state_dict(torch.load('modelseq2seq.pt'))
output = summarize(model, test_iter, criterion)

output_sentence = tensor_to_sentence(output[0])
print(output_sentence)

movie youthful yj zombi film one one film film zombi film zombi zombi zombi zeffirelli zhukov zeffirelli zeffirelli zhukov zhukov film zhukov film zhukov film zhukov zhukov zealots zhukov zealots zhukov zhukov zeffirelli zeffirelli zhukov zealots film zeffirelli zeffirelli   zeffirelli zeffirelli zealots zealots zeffirelli zhukov   zealots zhukov zealots zeffirelli zealots zeffirelli zeffirelli   zealots zealots zhukov zeffirelli zeffirelli zealots zeffirelli zeffirelli zeffirelli zeffirelli zeffirelli zealots zeffirelli zeffirelli zeffirelli zhukov zealots zealots zeffirelli zeffirelli zeffirelli zeffirelli film zeffirelli zealots zhukov zeffirelli zeffirelli zeffirelli zeffirelli zeffirelli zeffirelli zealots zealots zeffirelli zealots zeffirelli zeffirelli zeffirelli zealots zeffirelli zealots zaljko zombi zealots zeffirelli zealots zeffirelli zaljko zealots zealots film zeffirelli film zealots zeffirelli zealots zeffirelli zealots zealots zealots zealots zealots zaljko zeffirelli z