# NEWS SUMMARIZATION USING SEQ2SEQ
THE PURPOSE OF THIS PROJECT IS TO BUILD A MODEL THAT SUMMARIZES NEWS ARTICLES USING DEEP LEARNING. THE SEQ2SEQ MODEL
INCLUDES AN ENCODER AND A DECODER MODEL

In [None]:
import math
import os
import random
import re
import string
import time

import nltk
import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn.functional as F
from nltk.corpus import stopwords
# NOTE(review): NLTK documents this class as nltk.stem.snowball.SnowballStemmer;
# the module path and class name below look wrong - confirm before relying on it.
from nltk.snowball import SnowBallStemmer
from nltk.stem.porter import PorterStemmer
from torch import nn
from torchtext.data import Field,Iterator,BucketIterator,Example,Dataset

In [None]:
# first we do some text data processing
class NewsDataset(Dataset):
    """Torchtext dataset of (article, summary) pairs read from a directory tree.

    Every file found under ``path`` is read as text. A file whose full path
    contains ``'Summaries'`` is treated as a summary, every other file as an
    article. Articles and summaries are paired by position, so the two
    sub-trees are assumed to mirror each other (same folder and file names)
    - TODO confirm against the actual data layout.
    """

    # Translation table that deletes every ASCII punctuation character.
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, path, fields, **kwargs):
        """Read the files under ``path`` and build one example per pair.

        Args:
            path: Root directory that is walked recursively.
            fields: Either ``[src_field, trg_field]`` or a list of
                ``(name, field)`` tuples; bare fields are named ``'src'``
                and ``'trg'``.
            **kwargs: Forwarded unchanged to the parent ``Dataset``.
        """
        if not isinstance(fields[0], (tuple, list)):
            fields = [('src', fields[0]), ('trg', fields[1])]
        # read_data() walks self.path, so it has to be set first.
        self.path = path
        self.new_list = self.read_data()
        # Turn every (article, summary) pair into a torchtext Example.
        examples = [Example.fromlist(list(item), fields) for item in self.new_list]
        super().__init__(examples, fields, **kwargs)

    def __len__(self):
        """Return the number of examples (``2**32`` if they have no length)."""
        try:
            return len(self.examples)
        except TypeError:
            return 2**32

    def __getitem__(self, index):
        """Return the example stored at ``index``."""
        return self.examples[index]

    def __iter__(self):
        """Iterate over the stored examples in order."""
        for x in self.examples:
            yield x

    def __getattr__(self, attr):
        """Return a generator over one field's value across all examples.

        Raises:
            AttributeError: If ``attr`` is not the name of a field.
        """
        # Read __dict__ directly: going through self.fields before it has
        # been assigned would re-enter __getattr__ and recurse forever.
        fields = self.__dict__.get('fields', ())
        if attr in fields:
            return (getattr(example, attr) for example in self.examples)
        raise AttributeError(
            f"{type(self).__name__!r} object has no attribute {attr!r}")

    def read_data(self):
        """Collect the article and summary texts found under ``self.path``.

        Directories and files are visited in sorted order so that the
        positional pairing is deterministic. Within a file, lines are
        stripped, blank lines are dropped and the rest are joined with a
        single space so that words on neighbouring lines are not fused.

        Returns:
            A list of ``(article, summary)`` string tuples.

        Raises:
            ValueError: If the number of articles and summaries differs,
                because positional pairing would then be wrong.
        """
        articles = []
        summaries = []
        for dirpath, dirnames, filenames in os.walk(self.path):
            # Sorting in place steers os.walk's traversal order.
            dirnames.sort()
            for filename in sorted(filenames):
                file_path = os.path.join(dirpath, filename)
                if not os.path.isfile(file_path):
                    continue
                with open(file_path, 'r', errors='ignore') as f:
                    text = ' '.join(line.strip() for line in f if line.strip())
                if 'Summaries' in file_path:
                    summaries.append(text)
                else:
                    articles.append(text)
        if len(articles) != len(summaries):
            raise ValueError(
                f"found {len(articles)} articles but {len(summaries)} "
                f"summaries under {self.path!r}; cannot pair them")
        return list(zip(articles, summaries))

    # Text cleaning helpers
    def clean_data(self, text):
        """Strip links, digits and punctuation from ``text`` and lowercase it."""
        text = self._remove_links(text)
        text = self._remove_numbers(text)
        text = self._remove_punct(text)
        return text.lower()

    def _remove_punct(self, text):
        """Return ``text`` without any ``string.punctuation`` characters."""
        return text.translate(self._PUNCT_TABLE)

    def _remove_numbers(self, text):
        """Return ``text`` with every ASCII digit removed."""
        return re.sub(r'[0-9]', '', text)

    def _remove_links(self, text):
        """Return ``text`` without tokens that start with ``http``."""
        # \S+ consumes the rest of the URL up to the next whitespace.
        return re.sub(r'http\S+', '', text)

    def _get_root(self, word_list):
        """Return the Porter stem of every word in ``word_list``."""
        ps = PorterStemmer()
        return [ps.stem(word) for word in word_list]
   
    

In [None]:
 def tokenize_en(self,text):
        # Using spacy tokenizer
        #first instantiating the tokenizer
        spacy_eng = spacy.load('en')
        return [tok.text for tok in spacy_en.tokenize(text)]
SRC = Field(tokenizer = tokenizer_eng,
           init_token ='<sos>',
           eos_token = '<eos>',
           fix_length = 500
           lower =True)
TRG = Field(tokenizer = tokenize_eng,
           init_token ='<sos>',
            eos_token = '<eos>',
            fix_lenght = 200
            lower = True)

In [None]:
# Build the dataset from the files under /kaggle/input; SRC is passed as the
# first field and TRG as the second.
news_data = NewsDataset(
    path='/kaggle/input',
    fields=[SRC, TRG],
)

In [None]:
# Split our data into train, valid and test sets.
# Dataset.split expects `random_state` to be a `random.getstate()` tuple rather
# than an integer seed, so seed the stdlib RNG and pass its state along.
random.seed(42)
train_data, valid_data, test_data = news_data.split(
    split_ratio=[0.8, 0.1, 0.1],
    random_state=random.getstate(),
)


In [None]:
# Build the source and target vocabularies from the training split;
# both calls pass min_freq=2.
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)
# I am going to create two lists, one for the src lengths and another for the trg lengths.
# The function below fills them with the number of tokens in each example and returns both.

def get_item_lenght(data):
    """Return the source and target lengths of every example in ``data``.

    Args:
        data: Object with an ``examples`` iterable; each example must expose
            ``src`` and ``trg`` attributes that support ``len()``.

    Returns:
        A ``(src_len, trg_len)`` tuple of two lists, in example order.
    """
    src_len = []
    trg_len = []
    for item in data.examples:
        attrs = vars(item)
        # Measure the field values themselves, not the attribute dict.
        src_len.append(len(attrs['src']))
        trg_len.append(len(attrs['trg']))
    return src_len, trg_len

In [None]:
# Build one batch iterator per split (batch size 64); the sort key is the
# number of source tokens in an example.
train_iterator, valid_iterator, test_iterator = Iterator.splits(
    (train_data, valid_data, test_data),
    batch_size=64,
    sort_key=lambda x: len(x.src),
)
# The third iterator used to be bound to `text_iterator`; keep that name as an
# alias so existing references continue to work.
text_iterator = test_iterator

# BUILDING OF THE ENCODER,DECODER AND THE SEQ2SEQ MODEL

In [None]:
# Encoder model
class Encoder(nn.Module):
    """Embeds a batch of source token ids and runs it through an LSTM.

    Args:
        input_dim: Size of the source vocabulary.
        emd_dim: Embedding dimension.
        hid_dim: LSTM hidden size (per direction).
        n_layer: Number of stacked LSTM layers.
        dropout: Dropout applied to the embeddings and between LSTM layers.
        bidirection: Whether the LSTM is bidirectional.
    """

    def __init__(self, input_dim, emd_dim, hid_dim, n_layer, dropout=0.2, bidirection=True):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layer
        self.embedding = nn.Embedding(input_dim, emd_dim)
        self.lstm = nn.LSTM(
            emd_dim,
            hid_dim,
            n_layer,
            bidirectional=bidirection,
            # nn.LSTM only applies dropout between layers and warns when it is
            # set for a single-layer network.
            dropout=dropout if n_layer > 1 else 0.0,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode ``src`` and return ``(outputs, hidden, cell)``.

        ``src`` is a tensor of token ids laid out as (src_len, batch), since
        the LSTM is created without ``batch_first``. ``hidden`` and ``cell``
        are the LSTM's final states with shape
        (n_layers * num_directions, batch, hid_dim).
        """
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.lstm(embedded)
        return outputs, hidden, cell
class Decoder(nn.Module):
    """Predicts the next target token from the previous token and LSTM state.

    Args:
        output_dim: Size of the target vocabulary.
        emb_dim: Embedding dimension.
        hid_dim: LSTM hidden size (per direction).
        n_layers: Number of stacked LSTM layers.
        bidirection: Whether the LSTM is bidirectional. With the default
            ``True`` the state shape is (n_layers * 2, batch, hid_dim).
        dropout: Dropout applied to the embeddings and between LSTM layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, bidirection=True, dropout=0.25):
        super().__init__()
        # Seq2Seq reads decoder.output_dim to size its output tensor.
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.lstm = nn.LSTM(
            emb_dim,
            hid_dim,
            n_layers,
            # nn.LSTM only applies dropout between layers and warns when it is
            # set for a single-layer network.
            dropout=dropout if n_layers > 1 else 0.0,
            bidirectional=bidirection,
        )
        self.dropout = nn.Dropout(dropout)
        # A linear layer maps the LSTM output to one score per vocabulary
        # word; a bidirectional LSTM emits 2 * hid_dim features per step.
        num_directions = 2 if bidirection else 1
        self.fc = nn.Linear(hid_dim * num_directions, output_dim)

    def forward(self, trg, hidden=None, cell=None):
        """Run one decoding step.

        Args:
            trg: 1-D tensor with one token id per batch element.
            hidden: Previous hidden state, or ``None`` to start from zeros.
            cell: Previous cell state, or ``None`` to start from zeros.

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` has shape
            (batch, output_dim).

        Raises:
            ValueError: If only one of ``hidden`` / ``cell`` is given.
        """
        if (hidden is None) != (cell is None):
            raise ValueError("hidden and cell must be passed together")
        state = None if hidden is None else (hidden, cell)
        # Add a sequence dimension of length 1: (batch,) -> (1, batch).
        trg = trg.unsqueeze(0)
        embedded = self.dropout(self.embedding(trg))
        output, (hidden, cell) = self.lstm(embedded, state)
        prediction = self.fc(output).squeeze(0)
        return prediction, hidden, cell

# SEQ2SEQ MODEL

In [None]:
class Seq2Seq(nn.Module):
    """Ties an encoder and a decoder together for sequence-to-sequence training.

    Args:
        encoder: Module whose forward returns ``(outputs, hidden, cell)``.
        decoder: Module exposing ``output_dim`` and whose forward takes
            ``(token_ids, hidden, cell)`` and returns
            ``(prediction, hidden, cell)``.
        device: Device on which the output tensor is allocated.
    """

    def __init__(self, encoder, decoder, device):
        # nn.Module must be initialised before sub-modules can be assigned.
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg`` step by step, conditioned on the encoded ``src``.

        Args:
            src: Source token ids, passed to the encoder unchanged.
            trg: Target token ids indexed as (trg_len, batch); row 0 is used
                as the first decoder input (the <sos> token).
            teacher_forcing_ratio: Probability of feeding the ground-truth
                token as the next input instead of the model's own
                prediction. ``0`` never uses the ground truth, ``1`` always
                does.

        Returns:
            Tensor of shape (trg_len, batch, decoder.output_dim). Row 0 is
            left as zeros because decoding starts at position 1.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        # Tensor that stores the decoder output of every step.
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device=self.device)
        # Only the encoder's final states are used to seed the decoder.
        _, hidden, cell = self.encoder(src)
        dec_input = trg[0, :]  # the <sos> tokens
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(dec_input, hidden, cell)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            # Highest-scoring token of this step.
            top1 = output.argmax(1)
            dec_input = trg[t] if teacher_force else top1
        return outputs
        

In [None]:
# Model hyper-parameters; vocabulary sizes come from the built vocabularies.
input_dim = len(SRC.vocab)
output_dim = len(TRG.vocab)
enc_emb_dim = 128
dec_emb_dim = 128
hid_dim = 256
n_layers = 2
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
enc = Encoder(input_dim, enc_emb_dim, hid_dim, n_layers)
dec = Decoder(output_dim, dec_emb_dim, hid_dim, n_layers)
model = Seq2Seq(enc, dec, device).to(device)

# MODEL TRAINING

In [1]:
class Seq_Trainer(object):
    """Training / evaluation / prediction loop for a seq2seq model.

    The model is called as ``model(src, trg)`` or
    ``model(src, trg, teacher_forcing_ratio)`` and must return a tensor
    indexed as (trg_len, batch, vocab). Batches must expose ``src`` and
    ``trg`` tensors.

    Args:
        model: The ``nn.Module`` to train.
        train_iterator: Sized iterable of training batches.
        valid_iterator: Sized iterable of validation batches.
        pad_idx: Target index ignored by the loss.
        device: Device the model and batches are moved to (``None`` leaves
            them where they are).
        clip: Maximum gradient norm used for clipping.
        learning_rate: Learning rate of the Adam optimizer.
    """

    def __init__(self, model, train_iterator, valid_iterator, pad_idx, device, clip, learning_rate):
        self.device = device
        self.model = model if device is None else model.to(device)
        self.train_iterator = train_iterator
        self.valid_iterator = valid_iterator
        self.pad_idx = pad_idx
        self.clip = clip
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)
        self.criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
        self.best_valid_loss = float('inf')
        # Re-initialise every parameter of the model.
        self.model.apply(self.init_weights)

    def init_weights(self, m):
        """Fill every parameter of ``m`` with values drawn from U(-0.08, 0.08)."""
        for name, param in m.named_parameters():
            nn.init.uniform_(param.data, -0.08, 0.08)

    def count_parameters(self):
        """Return the number of trainable parameters of the model."""
        return sum(p.numel() for p in self.model.parameters() if p.requires_grad)

    def _to_device(self, tensor):
        """Move ``tensor`` to the trainer's device, if one was given."""
        return tensor if self.device is None else tensor.to(self.device)

    def _batch_loss(self, batch, teacher_forcing_ratio=None):
        """Run the model on one batch and return its loss tensor."""
        src = self._to_device(batch.src)
        trg = self._to_device(batch.trg)
        if teacher_forcing_ratio is None:
            output = self.model(src, trg)
        else:
            output = self.model(src, trg, teacher_forcing_ratio)
        vocab_size = output.shape[-1]
        # Position 0 is skipped on both sides before flattening.
        logits = output[1:].reshape(-1, vocab_size)
        targets = trg[1:].reshape(-1)
        return self.criterion(logits, targets)

    @staticmethod
    def _mean(total, count):
        """Return ``total / count``, or NaN when there were no batches."""
        return total / count if count else float('nan')

    def train(self):
        """Run one optimisation pass over the training iterator.

        Returns:
            The mean batch loss as a float.
        """
        self.model.train()
        epoch_loss = 0.0
        for batch in self.train_iterator:
            self.optimizer.zero_grad()
            loss = self._batch_loss(batch)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)
            self.optimizer.step()
            # .item() detaches the value so the graph is not kept alive.
            epoch_loss += loss.item()
        return self._mean(epoch_loss, len(self.train_iterator))

    def evaluate(self):
        """Compute the mean loss over the validation iterator.

        No gradients are computed and the parameters are not updated.
        """
        self.model.eval()
        epoch_loss = 0.0
        with torch.no_grad():
            for batch in self.valid_iterator:
                # 0 is passed as the teacher-forcing ratio.
                val_loss = self._batch_loss(batch, 0)
                epoch_loss += val_loss.item()
        return self._mean(epoch_loss, len(self.valid_iterator))

    def fit(self, nepochs):
        """Train for ``nepochs`` epochs, printing timing and losses per epoch.

        The lowest validation loss seen is stored in ``self.best_valid_loss``.
        """
        best_valid_loss = float('inf')
        for epoch in range(nepochs):
            start_time = time.time()
            train_loss = self.train()
            valid_loss = self.evaluate()
            end_time = time.time()
            epoch_mins, epoch_secs = divmod(int(end_time - start_time), 60)
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
            print(f"Epoch {epoch+1:02}|Time {epoch_mins}m {epoch_secs}s")
            print(f'Train Loss{train_loss} |Valid Loss{valid_loss}')
        self.best_valid_loss = best_valid_loss

    def predict(self, iterator):
        """Return the predicted token ids for every example in ``iterator``.

        Returns:
            Long tensor with one row per example and one column per target
            position. Batches are concatenated along the batch dimension, so
            all batches must share the same target length.
        """
        self.model.eval()
        predictions = []
        with torch.no_grad():
            for batch in iterator:
                src = self._to_device(batch.src)
                trg = self._to_device(batch.trg)
                # 0 is passed as the teacher-forcing ratio, intended to
                # disable teacher forcing.
                output = self.model(src, trg, 0)
                predictions.append(torch.argmax(output, -1))
        if not predictions:
            return torch.empty(0, 0, dtype=torch.long)
        return torch.transpose(torch.cat(predictions, dim=-1), 0, 1)

0.001

In [None]:
# Configure the trainer.
# `stoi` is a mapping, so the padding index is looked up with [] using the
# field's pad token.
pad_idx = TRG.vocab.stoi[TRG.pad_token]
trainer = Seq_Trainer(
    model,
    train_iterator,
    valid_iterator,
    pad_idx=pad_idx,
    # Train on whatever device the model's parameters already live on.
    device=next(model.parameters()).device,
    clip=1,
    learning_rate=1e-3,
)

THANKS TO :: 
https://github.com/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb
https://github.com/Mjkim88/Pytorch-Torchtext-Seq2Seq
https://torchtext.readthedocs.io/en/latest/
https://www.kaggle.com/mallaavinash/text-summarization

# THE END