In [1]:
!pip install rouge



In [2]:
import math
import numpy as np
import pandas as pd
import os
import random
import re
import spacy
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
from torch.autograd import Variable

from glob import glob
from sklearn.model_selection import train_test_split
from tqdm import notebook, tqdm_notebook
from torchtext.data import Field, BucketIterator, TabularDataset

from rouge import Rouge

## Load data

In [3]:
glob('../../data/pl_articles/*')

['../../data/pl_articles/val.csv',
 '../../data/pl_articles/test.csv',
 '../../data/pl_articles/train.csv']

In [108]:
nlp = spacy.load('pl_spacy_model', disable=['ner', 'parser'])

In [201]:
def tokenize_pl(text):
    text = re.sub(r"(\„|\”|\–|\\)", '', str(text))
    text = re.sub(r"(\/)", ' ', str(text))
    text = re.sub("[0-9]+", " NUM ", str(text)) # hide numbers
    text = re.sub("\s+", ' ', str(text))
    return [tok.text for tok in nlp.tokenizer(text)]

## Get dataset

In [194]:
def load_dataset(batch_size):
    TEXT = Field(tokenize=tokenize_pl, include_lengths=True, tokenizer_language='pl',
                 init_token='<sos>', eos_token='<eos>')
    SUMMARY = Field(tokenize=tokenize_pl, include_lengths=True, tokenizer_language='pl',
                    init_token='<sos>', eos_token='<eos>')
    train, val, test = TabularDataset.splits(
        skip_header=True, 
        path='../../data/pl_articles/', format='csv', 
        fields=[('index', None), ('lead', SUMMARY), ('text', TEXT)],
        train='train.csv', validation='val.csv', test='test.csv'
    )
    TEXT.build_vocab(train, min_freq=2)
#     SUMMARY.build_vocab(train, min_freq=2)
    SUMMARY.vocab = TEXT.vocab
    train_iter, val_iter, test_iter = BucketIterator.splits(
        (train, val, test), batch_size=batch_size, repeat=False, sort_key=lambda x: len(x.text), sort_within_batch=False
    )
    return train_iter, val_iter, test_iter, TEXT, SUMMARY

In [195]:
batch_size = 32

In [198]:
train_iter, val_iter, test_iter, TEXT, SUMMARY = load_dataset(batch_size)

In [199]:
len(TEXT.vocab.stoi)

193831

In [200]:
TEXT.vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x1ad747690>>,
            {'<unk>': 0,
             '<pad>': 1,
             '<sos>': 2,
             '<eos>': 3,
             '.': 4,
             '<': 5,
             '>': 6,
             'num': 7,
             'w': 8,
             'z': 9,
             'na': 10,
             'i': 11,
             'być': 12,
             'się': 13,
             'nie': 14,
             'to': 15,
             'do': 16,
             'on': 17,
             'że': 18,
             'który': 19,
             'ten': 20,
             'mieć': 21,
             'o': 22,
             'a': 23,
             'po': 24,
             'ale': 25,
             'rok': 26,
             '–': 27,
             'od': 28,
             'móc': 29,
             'jak': 30,
             'mecz': 31,
             'co': 32,
             'za': 33,
             'dla': 34,
             'już': 35,
             'bardzo': 36,
             'będzie': 37,
     

In [125]:
def indices_from_text(text, lang=TEXT):
    indices = []
    for word in text.strip().split(' '):
        indices.append(lang.vocab.stoi[word])
    return Variable(torch.LongTensor(indices)).cuda()

In [126]:
def text_from_indices(indices, lang=TEXT):
    text = ""
    for element in indices:
        if type(element) is torch.Tensor:
            text += lang.vocab.itos[element.item()] + " "
        else:
            text += lang.vocab.itos[element] + " "
    return text

#### Random text

In [166]:
batch = next(iter(train_iter))

In [171]:
text_from_indices(batch.text[0].transpose(0, 1)[-1])

'<sos> nowy linia autobusowy mieć numer 57 . autobus będzie ruszać z dworzec lokalny w Rzeszów a kurs zakończyć w Łańcut podzwierzyniec . kurs z Rzeszów dworzec lokalny wykonywać będą w dzień roboczy o godz. kurs z Łańcut podzwierzyniec wykonywać będą w dzień roboczy o godz. pełny rozkład przystanek dostępny na strona ztm rzeszów . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> 

#### Random summary

In [172]:
text_from_indices(batch.lead[0].transpose(0, 1)[-1])

'<sos> mieszkaniec Łańcut będą móc od dzisiaj wybrać się do Rzeszów autobus linia mpk . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> '

## Seq2Seq model

In [173]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, n_layers=1, dropout=0.1):
        super(EncoderRNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.gru = nn.GRU(embedding_size, hidden_size, n_layers,
                          dropout=dropout, bidirectional=True)

    def forward(self, sequence, hidden=None):
        embedding_output = self.embedding(sequence).cuda() # max_text_len x batch_size x embedding_size
        encoder_outputs, hidden = self.gru(embedding_output, hidden)
        # hidden: bidirectional x batch_size x hidden_size
        # output: max_text_len x batch_size x bidirectional * hidden_size
        encoder_outputs = encoder_outputs[:, :, :self.hidden_size] + encoder_outputs[:, :, self.hidden_size:]
        # output: max_text_len x batch_size x hidden_size
        encoder_outputs = encoder_outputs.cuda()
        return encoder_outputs, hidden

In [174]:
class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.hidden_size = hidden_size
        self.attention = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        stdv = 1. / math.sqrt(self.v.size(0))
        self.v.data.uniform_(-stdv, stdv)

    def forward(self, hidden, encoder_outputs):
        timestep = encoder_outputs.size(0)
        h = hidden.repeat(timestep, 1, 1).transpose(0, 1)
        encoder_outputs = encoder_outputs.transpose(0, 1) 
        attn_energies = self.score(h, encoder_outputs) # batch_size x t x hidden
        return F.softmax(attn_energies, dim=1).unsqueeze(1) # batch_size x t

    def score(self, hidden, encoder_outputs):
        # batch_size x t x 2*hidden -> batch_size x t x hidden
        energy = torch.tanh(self.attention(torch.cat([hidden, encoder_outputs], 2)))
        energy = energy.transpose(1, 2) # batch_size x t x 2*hidden -> batch_size x t x hidden
        v = self.v.repeat(encoder_outputs.size(0), 1).unsqueeze(1) # batch_size x 1 x hidden
        energy = torch.bmm(v, energy) # batch_size x 1 x t
        return energy.squeeze(1) # batch_size x t

In [175]:
class DecoderRNN(nn.Module):
    def __init__(self, embedding_size, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(DecoderRNN, self).__init__()
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(output_size, embedding_size)
        self.dropout = nn.Dropout(dropout, inplace=True)
        self.attention = BahdanauAttention(hidden_size)
        self.gru = nn.GRU(hidden_size + embedding_size, hidden_size, n_layers, dropout=dropout)
        self.output = nn.Linear(hidden_size * 2, output_size)

    def forward(self, sequence, hidden, encoder_outputs):
        # Get the embedding of the current input word (last output word)
        embedding_output = self.embedding(sequence).unsqueeze(0).cuda()  # 1 x batch_size x n
        embedding_output = self.dropout(embedding_output)
        # Calculate attention weights and apply to encoder outputs
        attention_weights = self.attention(hidden[-1], encoder_outputs).cuda()
        context = attention_weights.bmm(encoder_outputs.transpose(0, 1)) # batch_size x 1 x n
        context = context.transpose(0, 1)  # (1,B,N)
        # Combine embedded input word and attended context, run through RNN
        decoder_input = torch.cat([embedding_output, context], 2)
        decoder_output, hidden = self.gru(decoder_input, hidden)
        decoder_output = decoder_output.squeeze(0)  # (1,B,N) -> (B,N)
        context = context.squeeze(0)
        decoder_output = self.output(torch.cat([decoder_output, context], 1))
        decoder_output = F.log_softmax(decoder_output, dim=1)
        return decoder_output, hidden, attention_weights

In [176]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, text, summary, teacher_forcing_ratio=0.5):
        batch_size = text.size(1)
        max_len = summary.size(0)
        vocab_size = self.decoder.output_size

        encoder_output, hidden = self.encoder(text)
        hidden = hidden[:self.decoder.n_layers]
        output = Variable(summary.data[0, :]).cuda()  # sos
        
        outputs = Variable(torch.zeros(max_len, batch_size, vocab_size)).cuda()
        for t in range(1, max_len):
            output, hidden, attention_weights = self.decoder(
                    output, hidden, encoder_output)
            outputs[t] = output
            is_teacher = random.random() < teacher_forcing_ratio
            top_first = output.data.max(1)[1]
            output = Variable(summary.data[t] if is_teacher else top_first).cuda()
        return outputs

In [177]:
rouge = Rouge()

## Train

In [178]:
def train(e, model, optimizer, scheduler, train_iter, vocab_size, grad_clip, lang=TEXT):
    model.train()
    total_loss = 0
    pad = lang.vocab.stoi['<pad>']
    for b, batch in notebook.tqdm(enumerate(train_iter), total=len(train_iter)):
        text, len_text = batch.text
        summary, len_summary = batch.summary
        text, summary = text.cuda(), summary.cuda()
        optimizer.zero_grad()
        output = model(text, summary)
        loss = F.nll_loss(
            output[1:].view(-1, vocab_size),
            summary[1:].contiguous().view(-1),
            ignore_index=pad,
        )
        loss.backward()
        clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()
        scheduler.step()
        total_loss += loss.data

        if b % 10 == 0 and b != 0:
            total_loss = total_loss / 100
            print(f'[{b}] [loss: {total_loss}] [loss_exp: {math.exp(total_loss)}]')
            total_loss = 0
        if b % 50 == 0 and b != 0:
            target_summary = text_from_indices(summary.transpose(0, 1)[0])
            output_summary = summarize(text_from_indices(text.transpose(0, 1)[0]))[0]
            print('Original :', text_from_indices(text.transpose(0, 1)[0]))
            print(['-' * 80])
            print('Target :', target_summary)
            print(['-' * 80])
            print('Summary :', output_summary)
            print(['=' * 80])
            scores = calculate_rouge(hypothesis=output_summary, reference=target_summary)
            for key, value in scores[0].items():
                print(f'{key.upper()} [precision] : {np.round(value["p"] * 100, 2)}'
                      '| [recall] : {np.round(value["r"] * 100, 2)}'
                      '| [f-score] : {np.round(value["f"] * 100, 2)}')

In [179]:
def evaluate(model, val_iter, vocab_size, lang=TEXT):
    with torch.no_grad():
        pad = lang.vocab.stoi['<pad>']
        total_loss = 0
        for b, batch in enumerate(val_iter):
            text, len_text = batch.text
            summary, len_summary = batch.summary
#             text = Variable(text.data, volatile=True)
#             summary = Variable(summary.data, volatile=True)
            text = Variable(text.data.cuda(), volatile=True)
            summary = Variable(summary.data.cuda(), volatile=True)
            output = model(text, summary, teacher_forcing_ratio=0.0)
            loss = F.nll_loss(
                output[1:].view(-1, vocab_size),
                summary[1:].contiguous().view(-1),
                ignore_index=pad,
            )
            total_loss += loss.data
        return total_loss / len(val_iter)

In [180]:
def calculate_rouge(hypothesis, reference):
    hypothesis = hypothesis.split('<sos>')[1].split('<eos>')[0].strip()
    reference = reference.split('<sos>')[1].split('<eos>')[0].strip()
    scores = rouge.get_scores(hypothesis, reference)
    return scores

In [181]:
def summarize(text):
    with torch.no_grad():
        sequence = indices_from_text(text).unsqueeze(0)
        sequence_length = sequence.size(1)
        encoder_outputs, encoder_hidden = encoder(sequence.transpose(0, 1))
        
        decoder_input = Variable(torch.LongTensor([indices_from_text(TEXT.init_token)])).cuda()
        hidden = encoder_hidden[:decoder.n_layers]
        summary_words = ['<sos>']
        max_summary_length = int(sequence_length * 0.25)
        decoder_attentions = torch.zeros(max_summary_length, sequence_length)
        
        for idx in range(max_summary_length):
            output, hidden, decoder_attention = decoder(
                decoder_input, 
                hidden, 
                encoder_outputs,
            )
            decoder_attentions[idx, :decoder_attention.size(2)] += decoder_attention.squeeze(0).squeeze(0).cpu().data
            top_v, top_i = output.data.topk(1)
            ni = top_i[0]
            if ni == indices_from_text(TEXT.eos_token):
                break
            else:
                summary_words.append(text_from_indices(ni))
            
            decoder_input = Variable(torch.LongTensor([ni])).cuda()
        summary_words.append(TEXT.eos_token)
        summary = " ".join(summary_words).lstrip()
        return summary, decoder_attentions

In [182]:
epochs = 10
lr = 0.01
grad_clip = 10.0
scheduler_step_size = 50
scheduler_gamma = 0.75

In [183]:
hidden_size = 256
embed_size = 128

In [184]:
print(f'[!] preparing dataset...')
text_size, summary_size = len(TEXT.vocab), len(SUMMARY.vocab)
print(f'[TRAIN]: {len(train_iter)} | {len(train_iter.dataset)}\t [TEST]: {len(test_iter)} | {len(test_iter.dataset)}')
print(f'[TEXT_vocab] & [SUMMARY_vocab] (same) {text_size}')

[!] preparing dataset...
[TRAIN]: 5014 | 160423	 [TEST]: 627 | 20054
[TEXT_vocab] & [SUMMARY_vocab] (same) 209439


In [202]:
print("[!] Instantiating models...")
encoder = EncoderRNN(text_size, embed_size, hidden_size,
                  n_layers=2, dropout=0.5)
decoder = DecoderRNN(embed_size, hidden_size, text_size,
                  n_layers=1, dropout=0.5)
seq2seq = Seq2Seq(encoder, decoder).cuda()
optimizer = optim.Adam(seq2seq.parameters(), lr=lr)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=scheduler_step_size, gamma=scheduler_gamma)
print(seq2seq)

[!] Instantiating models...


  "num_layers={}".format(dropout, num_layers))


AssertionError: Torch not compiled with CUDA enabled

In [None]:
best_val_loss = None
for e in notebook.tqdm(range(1, epochs+1)):
    train(e, seq2seq, optimizer, scheduler, train_iter, text_size, grad_clip, TEXT)
    val_loss = evaluate(seq2seq, val_iter, text_size, TEXT)
    print(f'[Epoch: {e}] val_loss: {val_loss} | val_pp: {math.exp(val_loss)}')

    # Save the model if the validation loss is the best we've seen so far.
    if not best_val_loss or val_loss < best_val_loss:
        print("[!] saving model...")
        if not os.path.isdir(".save"):
            os.makedirs(".save")
        torch.save(seq2seq.state_dict(), './.save/seq2seq_%d.pt' % (e))
        best_val_loss = val_loss
test_loss = evaluate(seq2seq, test_iter, text_size, TEXT)
print(f'[TEST] loss: {test_loss}')

In [None]:
glob('../working/*')

In [None]:
import os
os.chdir(r'../working')

In [None]:
torch.save(seq2seq.state_dict(), 'seq2seq_10.pt')