# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import json
import fileinput
import os, glob

from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import tensorflow as tf
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Test is this out Check

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords   #This is just a test to see the features of liveshare

from rouge_score import rouge_scorer

# Extract News Data from the Dataset

## Download and extract the dataset

The first step is to download and extract the dataset. The dataset can be downloaded from here, and then extracted by unzipping the file. After extracting the data, you should have a directory named 16119_db21c91a1ab47385bb13773ed8238c31. The directory contains 31 JSONL files, which you will iterate over to extract the text and title from each of the dictionaries. Finally, you will put these values in a list called dataset and target respectively. The length of the list dataset and target will be 94403. So essentially our dataset size is about 100K.

In [4]:
# Set path of folder with json files
path = '/home/nelly88/Projects/Summarize COVID-19 News Using NLP and PyTorch/16119_db21c91a1ab47385bb13773ed8238c31/'

# Create dataset list to store features and target list to store target values
dataset = []
target = []


# Iterate through the first two json files in the folder
for filename in sorted(os.listdir(path))[:2]: 
     if filename.endswith('.json'):
         
         # Open the files and read through each line
         with open(os.path.join(path, filename), 'r') as f: # This returns f as a list of two strings/lines i.e. the full path of the first two json files
             lines = f.readlines() # Reads throug each line of strings in the list and stores the contents in lines
        # Iterate through each line, load the json files, extract the value of the text key and title and append to dataset and target respectively
         for line in lines:
             dataset.append(json.loads(line)['text']) 
             target.append(json.loads(line)['title'])

print(len(dataset))
print(len(target))

94403
94403


## Text Clean Up

Text cleanup is the process of cleaning up text data so that it is ready for use in a natural language processing application. The goal is to make the data as clean and consistent as possible so that the model can accurately learn from it. The steps include but not limited toe: converting text to lowercase, splitting sentences into individual words, applying the contraction hashmap on all the words of the text, removing the stopwords that are in English, removing apostrophes, using regular expression to remove parentheses outside a word, and using regular expression to remove punctuations. Finally, space characters are added before and after full stops. These steps are included in the `preprocess` function, which is then applied to the dataset and target. The length of most text is around 600, and the length of most target summary sentences are about 30. Any input beyond these lengths is disregarded.

In [5]:
# Define a contraction hashmap
contraction_map = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",

                           "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",

                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",

                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",

                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",

                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",

                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",

                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",

                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",

                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",

                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",

                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",

                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",

                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",

                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",

                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",

                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",

                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",

                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",

                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",

                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",

                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",

                           "you're": "you are", "you've": "you have"}


### Preprocessing

In [6]:
stop_words = stopwords.words('english')

def preprocess(text):
    text = text.lower() # lowercase
    text = text.split() # convert have'nt -> have not
    # Apply the contraction hashmap on all the words of the text
    for i in range(len(text)):
        word = text[i]
        if word in contraction_map:
            text[i] = contraction_map[word]
    text = " ".join(text)
    text = text.split()
    newtext = []
    # Remove the stopwords that are in English
    for word in text:
        if word not in stop_words:
            newtext.append(word)
    text = " ".join(newtext)
    # Remove 's. For example your's becomes your
    text = text.replace("'s",'') # convert your's -> your
    # Remove parentheses outside a word. For example (word) becomes word
    text = re.sub(r'\(.*\)','',text) # remove (words)
    # Remove punctuations
    text = re.sub(r'[^a-zA-Z0-9. ]','',text) # remove punctuations
    # Add a space character before and after the full stop. For example . becomes .
    text = re.sub(r'\.',' . ',text)
    return (text)

# Call the preprocess(text) function for all the items of dataset and target. Store the results in X and Y respectively.
X = [preprocess(text) for text in dataset]
Y = [preprocess(text) for text in target]

In [None]:
max_len_text = 600 
max_len_target = 30

short_text=[]
short_summary=[]

# For each text in dataset
for i in range(len(dataset)): 
    # If the length of the text is less than the max length
    if(len(target[i].split())<=max_len_target and len(dataset[i].split())<=max_len_text):
        # Append the text to the short_text list and the summary to the short_summary list
        short_text.append(dataset[i])
        short_summary.append(target[i])
        
# Create a dataframe with the short_text and short_summary
temp_df=pd.DataFrame({'text':short_text,'summary':short_summary})

temp_df.head()

In [None]:
# Remove empty strings from summary and the text column
newdf = temp_df[temp_df['summary'].str.strip().astype(bool)] 
df = newdf[newdf['text'].str.strip().astype(bool)]
df.head()

### Text feature generation

After the text cleanup, we need to convert the text into numerical representations through feature generation. 
One way to generate features is to use a technique called one-hot vector. This involves converting each word into a vector with a single element. Because there are many words, the vector will be huge. However, we can use a trick to trim the data down to a few thousand words.



We need to create a hashmap that keeps track of when each word first appeared in the text. We will also need a hashmap that maps word indices to words themselves, as well as a hashmap that counts the number of occurrences of each word. We will use these hashmaps later to replace rare words.

Finally, we need to mark the start and end of each sentence in the target list. We can do this using special tokens called SOS_token and EOS_token.

The `Lang` class keeps word -> index (word2index) and index -> word (index2word) dictionaries, as well as a count of each word word2count to use to later replace rare words.

The `addSentence` method tokenizes the sentence into words using split, and adds each word to the dictionaries.

The `addWord` method adds a word to both dictionaries, if it is not already there.

In [9]:
SOS_token = 0
EOS_token = 1

# Define a class Lang
class Lang:

    def __init__(self, name):
        self.name = name
        # Create a hashmap word2index that keeps track of when each word first appeared in the text. This is for both the X and Y.
        self.word2index = {}
        # Create a hashmap index2word to keep track of which index is which word. 
        self.word2count = {}
        # Create a separate hashmap word2count to count the number of occurrences of each word. We will need this later to replace rare words.
        self.index2word = {0: "SOS", 1: "EOS"} 
        self.n_words = 2 # Count SOS and EOS

    # Add a sentence to the hashmap word2index and word2count
    def addSentence(self, sentence): 
        for word in sentence.split(' '):
            self.addWord(word)
    
    # Add a word to the hashmap word2index and word2count 
    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

### Make the features ready for the model

We need to apply the `Lang` class to the data and features to get them ready for the model. A function, `readData`, is defined to take text and summary as input. This function creates a tuple from the text and summary, and then creates an input and output object by passing the text and summary to the Lang class. The function then returns the input, output, and pairs. Another function, `prepareData`, is defined to take the list of df['text'] and list of df['summary'] as input. This function calls readData(X,Y) and gets back the input, output, and pairs. For each item in the pairs list, the input.addSentence(pair0) and output.addSentence(pair1) functions are called. Finally, the `prepareData` function returns the input, output, and pairs.

In [10]:
# Create lists of text and summary strings from the dataframe
text_X = list(df['text'])
summary_Y = list(df['summary'])

# Define a function readData that takes text and summary as input
def readData(text, summary):
    print("Reading lines...")
    # Split every line into pairs and normalize
    pairs = [[text[i], summary[i]] for i in range(len(text))] 
    input_lang = Lang(text)
    output_lang = Lang(summary)
    return input_lang, output_lang, pairs

# Define prepareData function that calls readData and returns the input, output, and pairs
def prepareData(text, summary): 
    input_lang, output_lang, pairs = readData(text, summary) # Note that readData takes in the same parameters as prepareData
    print("Read %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        # Add the sentences to the input and output vocabularies
        input_lang.addSentence(pair[0]) 
        output_lang.addSentence(pair[1])
    return input_lang, output_lang, pairs

# Call the prepareData function and store the results in input, output, and pairs
input_lang, output_lang, pairs = prepareData(text_X, summary_Y)

Reading lines...
Read 62358 sentence pairs
Counting words...


In [None]:
print(random.choice(pairs)) # What does this do?
len(pairs)

# Build an Attention Based Deep Learn. Model for Abstractive Text Summarization

## Define a Sequence-to-Sequence Model

The goal of this section is to build a sequence-to-sequence model with Attention for summarizing the news data. This is done by first converting the input sequence to a fixed-length vector using an encoder. The decoder then uses this vector to generate a shorter summary of the text. The attention network figures out which part of the sequence to pay attention to in the decoder network. This allows the model to handle long sequences of text effectively.

### Define the encoder class

In the cell below,  we defined the EncoderRNN class. We also initialized the hidden_size and input_size parameters, and we defined the forward method that takes in an input and a hidden state and returns an output and a new hidden state. Finally, we also created a initHidden method that initializes the hidden state to zeros.



The encoder class is a recurrent neural network that takes in input data and hidden layer initialization parameters. The input_size and hidden_size parameters are set when the encoder is initialized, as well as the embedding layer. The forward propagation of the neural network is executed using the reshaped embedding and the hidden layer input. The initHidden method initializes the initial hidden input.

In [12]:
MAX_LENGTH = max_len_text # Maximum number of words in a sequence


class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)


### Define the (Attention) decoder class

Here's what the class is doing:

The initializations are pretty straightforward. We're deriving from nn.Module, so we have to call ```super().__init__()```
to properly initialize it. We also define the various layers we'll be using.

The forward function is where the magic happens. We take in an input word, the last hidden state, and all of the
encoder's hidden states. The input word is passed through the embedding layer, then dropped out. Then, it's
concatenated with the previous hidden state, and passed through a linear layer (called attn in the code). This
layer will return an attention weight for each encoder hidden state.

We then take the weighted sum of the encoder hidden states, using the attention weights as the weights. This is
called the context vector, as it represents the "context" of the sentence we're translating.

The context vector and the last hidden state are then passed through the second linear layer, which will make the
final hidden state. This is used to get a score for each word in the output vocabulary.

The output of the decoder is the result of passing the hidden state through the linear layer, then taking the
log softmax of that result.


In [13]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

### Convert the training data to tensors

The next step is to convert the the list of pairs of sentences into tensors so that they can be used for training. torch.tensor is used for this purpose. This will help ensure that the training data is in a format that can be used by the model.

The `indexesFromSentence` function takes a sentence and a Lang object as input. It then returns a list of indexes corresponding to the words in the sentence. The `tensorFromSentence` function takes a sentence and a Lang object as input. It then returns a tensor of indexes corresponding to the words in the sentence. The `tensorsFromPair` function takes a pair and two Lang objects as input. It then returns a tuple of tensors corresponding to the input and output sentences. The `tensorsFromPairs` function takes a list of pairs and two Lang objects as input. It then returns a list of tuples of tensors corresponding to the input and output sentences.

In [14]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')] 


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

## Train a Sequence-to-Sequence Model

The `train` method takes the input and output tensors, the encoder, decoder, encoder optimizer, decoder optimizer, criterion, and the maximum length of the output as input. It then loops through the input and output tensors, and for each pair, it initializes the encoder and decoder hidden states. It then loops through the input tensor and passes the input tensor and hidden state to the encoder. The encoder outputs the output and hidden state. The decoder input is initialized to SOS_token. The decoder hidden state is initialized to the encoder hidden state. The decoder then loops through the output tensor and passes the decoder input, decoder hidden state, and encoder output to the decoder. The decoder outputs the output, decoder hidden state, and decoder attention weights. The decoder input is set to the index of the word with the highest output value. The loss is calculated by comparing the decoder output to the target output. The encoder and decoder gradients are zeroed out. The encoder and decoder gradients are backpropagated. The encoder and decoder optimizers are stepped. The loss is returned.

### The train method

The `train` method takes the input and output tensors, the encoder, decoder, encoder optimizer, decoder optimizer, criterion, and the maximum length of the output as input. The input_tensor and target_tensor are the Tensors that contain the input and target sentences. The encoder and decoder are the models. The encoder_optimizer and decoder_optimizer are the optimizers for their respective models. First, we set the encoder’s initial hidden state to be all zeros. Then we set up the optimizers, making sure to zero out their gradients first. Next, we get the length of the input and target sentences. We also create a Tensor of zeros that will hold the encoder’s output vectors. This will help us visualize what the attention of the decoder is doing while it is training. Then we loop through each time step in the input sentence. At each time step, we get an output vector (hidden state) from the encoder. We store the encoder’s output vectors. Then we set the initial input to the SOS token for the decoder. We also set the initial hidden state to be the encoder’s final hidden state. Then we decide if we are going to use teacher forcing or not. Teacher forcing is the technique where the target word is passed as the next input to the decoder. See the next section for more details about teacher forcing. If we are using teacher forcing, we use the target sentence as the next input to the decoder. If we are not using teacher forcing, we use the decoder’s prediction as the next input to the decoder. We then calculate the loss, and call loss.backward() to do backpropagation. Finally, we call the optimizer’s step function to update the parameters.




In [22]:
teacher_forcing_ratio = 0.5
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0
    print(input_length)

    for i in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[i], encoder_hidden)
        #print(encoder_output.size())
        encoder_outputs[i] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length


In [23]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

### The trainIters method

The `trainIters` method trains a sequence-to-sequence model using gradient descent. The number of iterations to train, the learning rate, and the optimizers for the encoder and decoder objects must be defined. The input and target tensors are converted to arrays and the loss is calculated. The "train" method is called num_iters times. Log messages can be added to track the start and end of training. The loss can be saved to monitor the model's performance over time.

 



In [28]:
def trainIters(encoder, decoder, n_iters, learning_rate=0.01):
    print("Training....")
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        if iter% 1000 == 0:
            print(iter,"/",n_iters + 1)
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss
    

### Inference

The `infer` functions takes four arguments: the encoder, decoder, sentence, and max_length. The max_length argument sets the maximum number of words in the output sentence. For every pair in the input data, the infer method outputs two values: output words and attentions. The output sentence is simply a string consisting of the output words. The evaluation_input.txt file contains a list of target summary sentences and their corresponding output sentences. This file will be used in model evaluation.

In [25]:
def infer(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [26]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        output_words, attentions = infer(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        with open('evaluation_input.txt','a') as out:
            out.write('{}, {}\n'.format(pair[1],output_sentence))

### Train the model

 First, create an encoder object from the Encoder class and a decoder object from the AttnDecoder class. Then, pass those two objects along with 1000 as the parameter to the `trainIters` function and see the training happening.

In [None]:
hidden_size = 300
encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

trainIters(encoder, attn_decoder, 10)

### Save the model

The model can be saved as a state_dict, which is a Python dictionary that stores the current state of the model. The model can also be saved as a tar file, which stores the entire model including the model parameters, the optimizer parameters, and the state_dict. The model can be loaded using the `load_state_dict` method.

In [None]:
# Save the model as state_dict for Inference

ENCODER_MODEL_PATH_10 = 'encoder_10.pth'
DECODER_MODEL_PATH_10 = 'decoder_10.pth'
torch.save(encoder.state_dict(), ENCODER_MODEL_PATH_10)
torch.save(attn_decoder.state_dict(), DECODER_MODEL_PATH_10)

# Save the model as state_dict for Inference
ENCODER_ENITRE = 'encoder_entire.pth'
DECODER_ENITRE = 'decoder_entire.pth'
torch.save(encoder, ENCODER_ENITRE)
torch.save(attn_decoder, DECODER_ENITRE)


ENCODER_PARAMS = 'encoder_params.tar'
learning_rate=0.01
encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)

torch.save({
            'epoch': 10,
            'model_state_dict': encoder.state_dict(),
            'optimizer_state_dict': encoder_optimizer.state_dict(),
            'loss': loss,
            }, ENCODER_PARAMS)

### Apply the Inference

In [None]:
evaluateRandomly(encoder, attn_decoder)

# Model Evaluation

## Evaluate summarization results with ROUGE score

We will use Rouge score to evaluate the summarization results of the models. ROUGE assigns a score to your model based on the similarity of words between a human-generated summary and a machine-generated summary. That means, ROUGE simply counts the number of n-grams that are similar in your model-generated summary with respect to a human-generated summary. For example, ROUGE-1 means the overlap of unigrams between the model-generated summary and the human-generated summary. ROUGE-2 means the overlap of bigrams and so on.

In [None]:
# Read each line from input text and create a list of tuple
def read_input(filename = 'evaluation_input.txt'):
    input_pair = []
    with open(filename) as fp:
        for cnt, line in enumerate(fp):
            temp = []
            #print(line)
            res = line.split(',')
            #print(res[0])
            #print(res[1])
            temp.append(res[0].rstrip('\n'))
            temp.append(res[1].rstrip('\n'))
            input_pair.append(temp)
    return input_pair

# Write scores to a file
def write_score(scores):
    # write scores to a file. This file is the out of this milestone
     with open('score.txt','a') as out:
        out.write(json.dumps(scores))
     
    
# Define a scoring function to instantiate the rogue_scorer object
def scoring(input_pair):

    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    for pair in input_pair:
        scores = scorer.score(pair[0],pair[1])
        print(scores)
        write_score(scores)


input_pair = read_input()
scoring(input_pair)

# Inspecting Your Models with TensorBoard

## Setting up Tensorboard

Setting up Tensorboard is simple – all you need is the TensorFlow installation and the pip install of Tensorboard. Once you have these, you can create a SummaryWriter object that will store the data to be visualized by Tensorboard. Make sure to direct this output to a directory that Tensorboard can access. You can start the Tensorboard service by typing 'tensorboard --logdir=runs' in your terminal, which will give you a message telling you the URL to access Tensorboard. In your browser, go to this URL and you will see the Tensorboard interface. Right now, there is no data to visualize because we have not written anything to Tensorboard yet. We will do this next.

In [None]:
# Import SummaryWriter from torch.utils.tensorboard
from torch.utils.tensorboard import SummaryWriter

# Create a SummaryWriter object writer by specifying a directory from which the Tensorboard will get the data to visualize
writer = SummaryWriter('runs/summary')

## Working with TensorBoard

You can use TensorBoard to observe training loss of each iteration, total training loss over each epoch, encoder parameters and decoder parameters. To do this, you first need to define a function to write the data to TensorBoard. You can then call this function in the code for your program. Finally, you can use the TensorBoard tool to view the data.

TensorBoard takes different types of input and creates different representations of them, such as scalars, image, text, histogram, graph, embedding, and so on. In this project, we focused on scalars and histograms. Scalars are for the metrics we generally think of such as loss, accuracy, and so on. which we can visualize using `.add_scalars`. We can also visualize different model parameters with `.add_histogram`. For example, we can observe training loss of each iteration, total training loss over each epoch, encoder parameters and decoder parameters. TensorBoard is a useful tool for visualizing training data and observing model behavior.

In [None]:
global_step = 1000

# For tensorboard visualization       
def variable2numpy(var):  
    return var.data.cpu().numpy()

# Define a function write_to_tensorboard(writer, loss, total_loss, encoder, decoder). Here writer is the SummaryWriter object
def write_to_tensorboard(writer, loss, total_loss, encoder, decoder):
    
    # Write the scalars to tensorboard as
    writer.add_scalars('loss in each iteration', {'loss':loss})
    writer.add_scalars('total loss', {'total loss':total_loss})
    
    # Write the encoder parameter using .add_histogram method
    for name, param in encoder.named_parameters():
        name = name.replace('.', '/')
        writer.add_histogram('encoder/{}'.format(name), variable2numpy(param), global_step, bins='auto')
        if param.grad is not None:
            writer.add_histogram('encoder/{}/grad'.format(name), variable2numpy(param.grad), global_step, bins='auto')
            
    # Write the decoder parameter using .add_histogram method
    for name, param in decoder.named_parameters():
        name = name.replace('.', '/')
        writer.add_histogram('decoder/{}'.format(name), variable2numpy(param), global_step, bins='auto')
        if param.grad is not None:
            writer.add_histogram('decoder/{}/grad'.format(name), variable2numpy(param.grad), global_step, bins='auto')

In [None]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss
        write_to_tensorboard(writer, loss, plot_loss_total, encoder, decoder)

In [None]:
# Import SummaryWriter from torch.utils.tensorboard
from torch.utils.tensorboard import SummaryWriter

# Create a SummaryWriter object writer by specifying a directory from which the Tensorboard will get the data to visualize
writer = SummaryWriter('runs/summary')

# Iterate on Hyperparameters to Improve your Model

Many hyperparameters can impact the performance of a deep learning model, but not all have the same impact. For this project, we will focus on two - the learning rate and the size of hidden layers. We will experiment with different values for each to see how they affect the model's accuracy and efficiency. By training the model with different values for these parameters, we can find the optimal values that produce the best results.