# Summarizing Covid-19 News Using NLP and Pytorch

In [63]:
import pandas as pd
import numpy as np
import json
import os, glob

from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
webhose_2019_12 = 'datasubset/16119_webhose_2019_12_db21c91a1ab47385bb13773ed8238c31_0000001.json'
webhose_2020_01 = 'datasubset/16119_webhose_2020_01_db21c91a1ab47385bb13773ed8238c31_0000001.json'

## Download and extract the dataset

Read each of those files, extract the value of the text key and title key from those objects.

In [4]:
dataset = []
target = []
for filename in [webhose_2019_12, webhose_2020_01]:
    with open(filename, 'r') as json_file:
        json_list = list(json_file)

    for json_str in json_list:
        result = json.loads(json_str)
        dataset.append(result['text'])
        target.append(result['title'])

The length of the list dataset and target will be 94403. So essentially our dataset size is about 100K.

In [5]:
len(dataset), len(target)

(94403, 94403)

In [6]:
target[:10]

['Global Swine Healthcare Market by Products, Diseases & Geography – Forecast to 2024',
 'FDA launches app for health care professionals to report novel uses of existing medicines for patients with difficult-to-treat infectious diseases',
 'C-Suite Awards: Regina Yan',
 'FDA Launches Infectious Disease Crowdsourcing App for Clinicians FDA Launches Infectious Disease Crowdsourcing App for Clinicians',
 'Drug Safety Oversight Board',
 'How Prepared Are We For The Next Pandemic? Not Very, Experts Show',
 'Suspected MERS case reported',
 'Factors associated with and barriers to disclosure of a sexual assault to formal on-campus resources among college students - Mennicke A, Bowling J, Gromer J, Ryan C.',
 "The effect of university students' violence tendency on their attitude towards domestic violence and the factors affecting domestic violence attitudes - Yagiz R, Sevil U, Guner Ö.",
 'YoYo Discusses Career, Teaching and Female Empowerment']

## Text cleanup

In [7]:
from contraction_hashmap import contraction_map

In [8]:
import re
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

def preprocess(text):
    text = text.lower() # lowercase
    text = text.split() # convert have'nt -> have not
    for i in range(len(text)):
        word = text[i]
        if word in contraction_map:
            text[i] = contraction_map[word]
    text = " ".join(text)
    text = text.split()
    newtext = []
    for word in text:
        if word not in stop_words:
            newtext.append(word)
    text = " ".join(newtext)
    text = text.replace("'s",'') # convert your's -> your
    text = re.sub(r'\(.*\)','',text) # remove (words)
    text = re.sub(r'[^a-zA-Z0-9. ]','',text) # remove punctuations
    text = re.sub(r'\.',' . ',text)
    return text

In [9]:
X = [preprocess(text) for text in dataset]

In [10]:
len(X)

94403

In [11]:
Y = [preprocess(text) for text in target]

In [12]:
len(Y)

94403

In [13]:
max_len_text = 600
max_len_target = 30

In [14]:
short_text=[]
short_summary=[]

for i in range(len(dataset)):
    if(len(target[i].split())<=max_len_target and len(dataset[i].split())<=max_len_text):
        short_text.append(dataset[i])
        short_summary.append(target[i])

temp_df=pd.DataFrame({'text':short_text,'summary':short_summary})

In [15]:
temp_df.head()

Unnamed: 0,text,summary
0,FDA launches app for health care professionals...,FDA launches app for health care professionals...
1,"Of all of Regina Yan ’s many traits, an open m...",C-Suite Awards: Regina Yan
2,The CURE ID app allows clinicians to share and...,FDA Launches Infectious Disease Crowdsourcing ...
3,The DSB is composed of representatives from tw...,Drug Safety Oversight Board
4,The Centre for Health Protection (CHP) of the ...,Suspected MERS case reported


In [16]:
temp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64893 entries, 0 to 64892
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     64893 non-null  object
 1   summary  64893 non-null  object
dtypes: object(2)
memory usage: 1014.1+ KB


In [16]:
newdf = temp_df[temp_df['summary'].str.strip().astype(bool)]
df = newdf[newdf['text'].str.strip().astype(bool)]

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62358 entries, 0 to 64892
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     62358 non-null  object
 1   summary  62358 non-null  object
dtypes: object(2)
memory usage: 1.4+ MB


## Text feature generation

Now that we have done the text cleanup, we need to convert the text into numerical representations to be used by the model. This process is called feature generation. There are different ways to generate features out of text data. Here we will use one-hot vector[3] technique with some tweaks.

### Define a class Lang

In [18]:
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

## Make the features ready for the model

### Define a function readData(text, summary) 

This takes text and summary as input. Here text and summary are two lists of strings. When we call this readData function, we will call it with our cleaned data X and Y respectively. This function does the following operations:
Creates a tuple from text and summary as in pairs = [[text[i],summary[i]] for i in range(len(text))]
Creates input and output object by passing text and summary to the Lang class Note that we are only creating objects here. Not executing any other functions from the Lang class.
Return input, output, pairs

In [19]:
def readData(text, summary):
    print("Reading lines...")
    
    # Split every line into pairs and normalize
    pairs = [[text[i],summary[i]] for i in range(len(text))]
    
    input_lang = Lang(text)
    output_lang = Lang(summary)

    return input_lang, output_lang, pairs

### Define a function prepareData that takes list(df['text']) and list(df['summary']) as input.

This prepareData function calls readData(X,Y) and gets back input, output, and pairs
For each item in the pairs list, we will do the following:

In [20]:
def prepareData(lang1, lang2):
    input_lang, output_lang, pairs = readData(lang1, lang2)
    print("Read %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    return input_lang, output_lang, pairs

In [21]:
input_lang, output_lang, pairs = prepareData(list(df['text']), list(df['summary']))
print(random.choice(pairs))

Reading lines...
Read 62358 sentence pairs
Counting words...
['REGINA - A Saskatchewan university has cancelled all China trips it has organized for the next three months due to the spread of the coronavirus.\nUniversity of Regina spokesman Paul Dederick says anyone who would have been involved in the travel has been advised.\nHe says, effective immediately, all other travel to China by students, staff, faculty as part of student exchanges or research partnerships will require the dean\'s approval and must include a plan on how to decrease health risks.\nDederick says the measures are precautionary as the federal government has issued a travel advisory for China.\nHealth officials in Ontario have confirmed Canada\'s first case of the coronavirus and believe the patient\'s wife to be the second case.\nThey say the risk to Canadians is low.\n"As a precautionary measure the university is taking a pro-active approach to ensure faculty, staff and students travelling to and from China are aw

## Deliverable

The deliverable is a Jupyter Notebook documenting your workflow. The end result of this notebook is a list of pairs of sentences. The 1st column in each row of this list is the text sentence and the 2nd column is the target/summary sentence. A sample output below:



In [22]:
print(pairs[0][0])
print()
print(pairs[0][1])

FDA launches app for health care professionals to report novel uses of existing medicines for patients with difficult-to-treat infectious diseases FDA launches app for health care professionals to report novel uses of existing medicines for patients with difficult-to-treat infectious diseases Authors: FDA Pinworms of the red howler monkey (Alouatta seniculus) in Colombia: gathering the pieces of the pinworm-primate puzzle Publication date: Available online 4 December 2019Source: International Journal for Parasitology: Parasites and WildlifeAuthor(s): Brenda Solórzano-García, Andrés Link Ospina, Silvia Rondón, Gerardo Pérez-Ponce de LeónAbstractPinworms of primates are believed to be highly host specific parasites, forming co-evolutionary associations with their hosts. In order to assess the strength and reach of such evolutionary links, we need to have a broad understanding of the pinworm diversity associated with primates. Here, we employed an integrative taxonomic approach to assess 

## Build an Attention Based Deep Learning Model for Abstractive Text Summarization

### Define a Sequence-to-Sequence Model

In [23]:
MAX_LENGTH = max_len_text

#### Define the encoder class

In [24]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

 #### Define the class AttnDecoder

In [25]:
class AttnDecoder(nn.Module):
    def __init__(self, hidden_size, output_size, dropout=0.1, max_length=MAX_LENGTH):
        super(AttnDecoder, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

#### Convert the training data to tensors

In [26]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

### Train a Sequence-to-Sequence Model

#### The train method

In [27]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for i in range(input_length):
        try:
            encoder_output, encoder_hidden = encoder(input_tensor[i], encoder_hidden)
            encoder_outputs[i] = encoder_output[0, 0]
        except IndexError:
            print('Index Error in train')
            print('index=',i)
            print('input_length', input_length)
            print('target_length', target_length)

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

#### The trainIters method

The trainIters method has 4 important parameters. You can have more parameters for debugging/logging purposes.

- encoder: The encoder object of the Encoder class
- decoder: The decoder object of the AttnDecoder class
- num_iters: Number of iterations you want to train using the train method. This is an integer number.
- learning_rate: Learning rate hyperparameter for your neural network. Feel free to use a default value for this parameter.

We need to define the optimizers for encoder and decoder objects. These optimizers are gradient descent optimizers.

Convert the training pairs to tensors

Define the loss function. We are solving a classification problem, so we have to use a loss function that is commonly used in classification problems: log loss. In particular, here we will use negative log loss as criterion = nn.NLLLoss(). Refer to Understanding how LSTM works.

Finally, we will call the train method num_iters times.

We can save this loss output to look at the training loss over time. This helps us to debug the model and makes sure our model is improving over time.

Add some log messages in the beginning and end of this method to keep track of the start and end of training such as starting training ... and end training ... as an example.

In [28]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [29]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    print('Starting training ...')
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('Iteration', iter)
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    print('Stopping training ...')
    return plot_losses

In [30]:
torch.cuda.is_available()

True

#### Now actually train the model

In [31]:
hidden_size = 300
encoder = Encoder(input_lang.n_words, hidden_size).to(device)
decoder = AttnDecoder(hidden_size, output_lang.n_words, dropout=0.1).to(device)

plot_losses = trainIters(encoder, decoder, 500, print_every=50)

Starting training ...
Iteration 50
6m 11s (- 55m 41s) (50 10%) 7.0218
Iteration 100
12m 36s (- 50m 25s) (100 20%) 5.1913
Iteration 150
18m 1s (- 42m 3s) (150 30%) 6.5851
Iteration 200
24m 17s (- 36m 26s) (200 40%) 6.1937
Iteration 250
30m 53s (- 30m 53s) (250 50%) 6.3365
Iteration 300
36m 24s (- 24m 16s) (300 60%) 6.3908
Iteration 350
42m 58s (- 18m 25s) (350 70%) 6.4690
Iteration 400
48m 59s (- 12m 14s) (400 80%) 6.9316
Iteration 450
55m 0s (- 6m 6s) (450 90%) 6.8060
Iteration 500
61m 51s (- 0m 0s) (500 100%) 6.4888
Stopping training ...


#### Save the model

Save the model both as state_dict and the entire model.

Not sure about this as there is no "the model". I am assuming that this refers to the encoder and the decoder.

In [40]:
torch.save(encoder.state_dict(), 'encoder_weights.pth')
torch.save(decoder.state_dict(), 'decoder_weights.pth')

In [34]:
torch.save(encoder, 'encoder.pth')
torch.save(encoder, 'decoder.pth')

#### Inference

Logically it is similar to the train method. But there is no target. So, we feed the decoder’s output as decoder’s input of the next time step. Let’s define a method infer for this purpose.

In [64]:
def infer(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [65]:
def inferRandomly(encoder, decoder, n=10):
    f = open("evaluation_input.txt", "w")
    
    for i in range(n):
        pair = random.choice(pairs)
        output_words, attentions = infer(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        f.write(pair[1]+','+output_sentence+'\n')
    
    f.close()

In [68]:
inferRandomly(encoder, decoder)