# Summarizing Covid-19 News Using NLP and Pytorch

In [35]:
import pandas as pd
import numpy as np
import json
import os, glob

from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
webhose_2019_12 = '16119_db21c91a1ab47385bb13773ed8238c31/16119_webhose_2019_12_db21c91a1ab47385bb13773ed8238c31_0000001.json'
webhose_2020_01 = '16119_db21c91a1ab47385bb13773ed8238c31/16119_webhose_2020_01_db21c91a1ab47385bb13773ed8238c31_0000001.json'

## Download and extract the dataset

Read each of those files, extract the value of the text key and title key from those objects.

In [44]:
dataset = []
target = []
for filename in [webhose_2019_12, webhose_2020_01]:
    with open(filename, 'r') as json_file:
        json_list = list(json_file)

    for json_str in json_list:
        result = json.loads(json_str)
        dataset.append(result['text'])
        target.append(result['title'])

The length of the list dataset and target will be 94403. So essentially our dataset size is about 100K.

In [45]:
len(dataset), len(target)

(94403, 94403)

## Text cleanup

In [47]:
from contraction_hashmap import contraction_map

In [54]:
import re
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

def preprocess(text):
    text = text.lower() # lowercase
    text = text.split() # convert have'nt -> have not
    for i in range(len(text)):
        word = text[i]
        if word in contraction_map:
            text[i] = contraction_map[word]
    text = " ".join(text)
    text = text.split()
    newtext = []
    for word in text:
        if word not in stop_words:
            newtext.append(word)
    text = " ".join(newtext)
    text = text.replace("'s",'') # convert your's -> your
    text = re.sub(r'\(.*\)','',text) # remove (words)
    text = re.sub(r'[^a-zA-Z0-9. ]','',text) # remove punctuations
    text = re.sub(r'\.',' . ',text)
    return text

In [55]:
X = [preprocess(text) for text in dataset]

In [56]:
len(X)

94403

In [61]:
Y = [preprocess(text) for text in target]

In [62]:
len(Y)

94403

In [63]:
max_len_text = 600
max_len_target = 30

In [64]:
short_text=[]
short_summary=[]

for i in range(len(dataset)):
    if(len(target[i].split())<=max_len_target and len(dataset[i].split())<=max_len_text):
        short_text.append(dataset[i])
        short_summary.append(target[i])

temp_df=pd.DataFrame({'text':short_text,'summary':short_summary})

In [65]:
temp_df.head()

Unnamed: 0,text,summary
0,FDA launches app for health care professionals...,FDA launches app for health care professionals...
1,"Of all of Regina Yan ’s many traits, an open m...",C-Suite Awards: Regina Yan
2,The CURE ID app allows clinicians to share and...,FDA Launches Infectious Disease Crowdsourcing ...
3,The DSB is composed of representatives from tw...,Drug Safety Oversight Board
4,The Centre for Health Protection (CHP) of the ...,Suspected MERS case reported


In [68]:
temp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64893 entries, 0 to 64892
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     64893 non-null  object
 1   summary  64893 non-null  object
dtypes: object(2)
memory usage: 1014.1+ KB


In [66]:
newdf = temp_df[temp_df['summary'].str.strip().astype(bool)]
df = newdf[newdf['text'].str.strip().astype(bool)]

In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62358 entries, 0 to 64892
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     62358 non-null  object
 1   summary  62358 non-null  object
dtypes: object(2)
memory usage: 1.4+ MB


## Text feature generation

Now that we have done the text cleanup, we need to convert the text into numerical representations to be used by the model. This process is called feature generation. There are different ways to generate features out of text data. Here we will use one-hot vector[3] technique with some tweaks.

### Define a class Lang

In [69]:
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

### Create a hashmap word2index

This keeps track of when each word first appeared in the text. This is for both the X and Y.

In [70]:
word2index = {}

### Create a hashmap index2word to keep track of which index is which word.

In [71]:
index2word = {}

### Create a separate hashmap word2count

This is to count the number of occurrences of each word. We will need this later to replace rare words.

In [73]:
word2count = {}

### Define SOS_token = 0 and EOS_token = 1

We will need to mark the start of the sentence and end of the sentence for all of the sentences in the target list. So, we need to use some special token.

In [72]:
SOS_token = 0
EOS_token = 1

## Make the features ready for the model

### Define a function readData(text, summary) 

This takes text and summary as input. Here text and summary are two lists of strings. When we call this readData function, we will call it with our cleaned data X and Y respectively. This function does the following operations:
Creates a tuple from text and summary as in pairs = [[text[i],summary[i]] for i in range(len(text))]
Creates input and output object by passing text and summary to the Lang class Note that we are only creating objects here. Not executing any other functions from the Lang class.
Return input, output, pairs

In [80]:
def readData(text, summary):
    print("Reading lines...")
    
    # Split every line into pairs and normalize
    pairs = [[text[i],summary[i]] for i in range(len(text))]

    
    input_lang = Lang(text)
    output_lang = Lang(summary)

    return input_lang, output_lang, pairs

### Define a function prepareData that takes list(df['text']) and list(df['summary']) as input.

This prepareData function calls readData(X,Y) and gets back input, output, and pairs
For each item in the pairs list, we will do the following:

In [77]:
pairs = []
for pair in pairs:
    input.addSentence(pair[0])
    output.addSentence(pair[1])
# return input, output, pairs

In [81]:
def prepareData(lang1, lang2):
    input_lang, output_lang, pairs = readData(lang1, lang2)
    print("Read %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    return input_lang, output_lang, pairs

In [104]:
input_lang, output_lang, pairs = prepareData(X, Y)

Reading lines...
Read 94403 sentence pairs
Counting words...


## Deliverable

The deliverable is a Jupyter Notebook documenting your workflow. The end result of this notebook is a list of pairs of sentences. The 1st column in each row of this list is the text sentence and the 2nd column is the target/summary sentence. A sample output below:



In [98]:
print(pairs[0][0])
print()
print(pairs[0][1])

dublin swine healthcare market  growth trends forecast  5 . 2 . 2 coccidiosis 5 . 2 . 3 respiratory diseases 5 . 2 . 4 swine dysentery 5 . 2 . 5 porcine parvovirus 5 . 2 . 6 others 5 . 3 geography 5 . 3 . 1 north america 5 . 3 . 2 europe 5 . 3 . 3 asiapacific 5 . 3 . 4 middle east  africa 5 . 3 . 5 south america 6 competitive landscape 6 . 1 company profiles 6 . 1 . 1 abaxis 6 . 1 . 2 bayer animal health 6 . 1 . 3 boehringer ingelheim 6 . 1 . 4 ceva animal health inc .  6 . 1 . 5 elanco 6 . 1 . 6 idvet 6 . 1 . 7 merck animal health 6 . 1 . 8 merial 6 . 1 . 9 vetoquinol s . a .  6 . 1 . 10 virbac 6 . 1 . 11 zoetis animal healthcare 7 market opportunities future trends information report visit httpswww . researchandmarkets . comrshhuje research markets also offers custom research services providing focused comprehensive tailored research .  contact researchandmarkets . com laura wood senior press manager pressresearchandmarkets . com e . s . t office hours call 19173000470 u . s . can to

## Build an Attention Based Deep Learning Model for Abstractive Text Summarization

### Define a Sequence-to-Sequence Model

In [99]:
MAX_LENGTH = max_len_text

#### Define the encoder class

In [100]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

 #### Define the class AttnDecoder

In [101]:
class AttnDecoder(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

#### Convert the training data to tensors

In [105]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [106]:
tensors = [tensorsFromPair(pair) for pair in pairs]

In [109]:
type(tensors)

list