In [169]:
import collections

In [170]:
import json
import spacy
import numpy as np
import random
import pickle
from collections import defaultdict
import pandas as pd
from nltk.tokenize import RegexpTokenizer
import time
import os

In [171]:
import torch
import torch.nn as nn
import torch.nn.functional as Func
import torch.optim as optim
from torch.utils.data.dataset import Dataset
from torch.nn.utils.rnn import pad_sequence
from torch.autograd import Variable

In [172]:
# Single seed shared by every random number generator used in this notebook.
SEED = 0
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and disable its autotuner, which may
# otherwise pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(SEED)
# NOTE: the hash seed of the running interpreter is fixed at start-up; setting
# the variable here only affects processes spawned after this point.
os.environ['PYTHONHASHSEED'] = str(SEED)
# Use SEED (not a separate literal) so changing SEED reseeds `random` as well.
random.seed(SEED)

In [173]:
# Read the three dataset splits; every file is JSON Lines (one record per line).
train, valid, test = (
    pd.read_json(split_path, lines=True)
    for split_path in ('./data/train.jsonl', './data/valid.jsonl', './data/test.jsonl')
)

In [389]:
# Phrase looked up in the `text` column of each split in the following cells.
data = 'A man and a child have been killed'

In [394]:
# (rows, columns) of the training split.
train.shape


(71604, 5)

In [396]:
# (rows, columns) of the validation split.
valid.shape

(20000, 5)

In [390]:
# Training rows whose text contains the phrase. The phrase is matched literally
# (regex=False) and rows with missing text count as non-matching (na=False).
train[train.text.str.contains(data, regex=False, na=False)]


Unnamed: 0,id,summary,text,sent_bounds,extractive_summary


In [391]:
# Validation rows whose text contains the phrase. The phrase is matched literally
# (regex=False) and rows with missing text count as non-matching (na=False).
valid[valid.text.str.contains(data, regex=False, na=False)]

Unnamed: 0,id,summary,text,sent_bounds,extractive_summary


In [392]:
# Test rows whose text contains the phrase. The phrase is matched literally
# (regex=False) and rows with missing text count as non-matching (na=False).
test[test.text.str.contains(data, regex=False, na=False)]

Unnamed: 0,id,text,sent_bounds


In [393]:
# Show the first training row to inspect the column layout.
train.head(1)

Unnamed: 0,id,summary,text,sent_bounds,extractive_summary
0,1000000,A seven-hundred-year old oak gate at Salisbury...,The Grade I listed Harnham Gate was hit by a w...,"[[0, 107], [107, 255], [255, 362]]",1


In [175]:
# (rows, columns) of the training and validation splits, side by side.
train.shape, valid.shape

((71604, 5), (20000, 5))

In [303]:
class words_dict():
    """Vocabulary mapping lower-cased word tokens to integer ids and back.

    Ids 0, 1 and 2 are reserved for the '_sos_', '_eos_' and '_unk_' markers.
    Intended flow: call `add_word` on every text, call `sort_dict` to drop the
    rarest words, then call `predict` to encode sentences.

    Attributes:
        word_count: number of occurrences of each word seen by `add_word`.
        word_to_id: word -> id.
        id_to_word: id -> word.
        n_words: number of vocabulary entries, markers included.
        tokenizer: object exposing `tokenize(str) -> list of str`.
        remain_id: not used by any method of this class; kept for compatibility.
        max_len: length of every sequence returned by `predict`.
    """

    # Marker tokens own fixed ids; they are never counted and never pruned.
    _RESERVED = ('_sos_', '_eos_', '_unk_')

    def __init__(self, max_len=200, tokenizer=None):
        """Create a vocabulary that only contains the three marker tokens.

        Args:
            max_len: length of the sequences produced by `predict`.
            tokenizer: optional object with a `tokenize(str)` method; when
                omitted, a `RegexpTokenizer(r'\\w+')` is used.
        """
        self.word_count = collections.defaultdict(int)
        self.id_to_word = {0: '_sos_', 1: '_eos_', 2: '_unk_'}
        self.word_to_id = {'_sos_': 0, '_eos_': 1, '_unk_': 2}
        self.n_words = 3
        if tokenizer is None:
            tokenizer = RegexpTokenizer(r'\w+')
        self.tokenizer = tokenizer
        self.remain_id = []
        self.max_len = max_len

    def add_word(self, sentence):
        """Tokenize `sentence` and register/count every lower-cased token."""
        for token in self.tokenizer.tokenize(sentence):
            token = token.lower()
            # r'\w+' also matches underscores, so a text can literally contain
            # '_sos_' etc.; such tokens must not disturb the reserved entries.
            if token in self._RESERVED:
                continue
            # Membership test instead of truthiness of .get(): id 0 is falsy.
            if token not in self.word_to_id:
                self.word_to_id[token] = self.n_words
                self.id_to_word[self.n_words] = token
                self.n_words += 1
            self.word_count[token] += 1

    def predict(self, sentence):
        """Encode `sentence` as a list of exactly `max_len` ids.

        Layout: id 0 ('_sos_'), then up to `max_len - 2` word ids, then id 1
        ('_eos_') repeated up to the end, so the last position is always 1
        when `max_len >= 2`. Words outside the vocabulary, and literal marker
        tokens in the text, are encoded as id 2 ('_unk_').
        """
        room = max(self.max_len - 2, 0)
        ids = []
        for token in self.tokenizer.tokenize(sentence)[:room]:
            token = token.lower()
            if token in self._RESERVED:
                ids.append(2)
            else:
                ids.append(self.word_to_id.get(token, 2))
        return [0] + ids + [1] * (self.max_len - 1 - len(ids))

    def sort_dict(self, drop_ratio=0.8):
        """Drop the least frequent words and renumber the rest contiguously.

        The `int(n_words * drop_ratio)` least frequent words are removed (ties
        keep first-seen order). The surviving words keep their relative order
        and get ids 0..n_words-1, the markers keeping ids 0, 1 and 2.
        `word_count` is left untouched. Every call prunes again.

        Args:
            drop_ratio: fraction of the current vocabulary size to remove.
        """
        n_drop = int(self.n_words * drop_ratio)
        # Only words still in the vocabulary can be removed, which keeps a
        # repeated call from failing on words that are already gone.
        candidates = [
            item for item in self.word_count.items()
            if item[0] in self.word_to_id and item[0] not in self._RESERVED
        ]
        candidates.sort(key=lambda item: item[1])
        for word, _ in candidates[:n_drop]:
            del self.word_to_id[word]
        kept = list(self.word_to_id)
        print(f'Word count after reduce: {len(kept)}')
        # Rebuild both directions from scratch so that no id from before the
        # pruning survives in id_to_word.
        self.word_to_id = {word: idx for idx, word in enumerate(kept)}
        self.id_to_word = dict(enumerate(kept))
        self.n_words = len(kept)
        # Older code stored the size under the misspelled name `n_word`;
        # keep it in sync for anything that still reads it.
        self.n_word = self.n_words
        return


In [304]:
def _encode_and_save(df, name, dic):
    """Encode the 'text' and 'summary' columns with `dic` and save both tensors."""
    # Iterate over the column values instead of df.loc[i, ...] so the frame
    # does not need a default 0..n-1 index.
    t_sen = torch.tensor([dic.predict(text) for text in df['text']])
    summary = torch.tensor([dic.predict(ans) for ans in df['summary']])
    torch.save(t_sen, f'./data/{name}.trc')
    torch.save(summary, f'./data/{name}_summary.trc')
    return t_sen, summary


def transform(df, name, dic= None):
    """Encode a split into id tensors and save them under ./data/.

    Args:
        df: DataFrame with 'text' and 'summary' columns; an 'id' column is
            also required when `dic` is given.
        name: prefix of the written files ('./data/{name}.trc' and
            './data/{name}_summary.trc').
        dic: vocabulary exposing `predict` and `word_to_id`. When None, a new
            `words_dict` is built from the 'text' column and pruned with
            `sort_dict` before encoding.

    Returns:
        `(dictionary, t_sen, summary)` when `dic` is None, otherwise
        `(t_sen, summary)`. In the latter case the 'id' column is additionally
        saved to './data/{name}_idlist'.
    """
    if dic is None:
        dictionary = words_dict()
        for text in df['text']:
            dictionary.add_word(text)
        print(f'Total {len(dictionary.word_to_id)} words')
        dictionary.sort_dict()
        t_sen, summary = _encode_and_save(df, name, dictionary)
        print(f'Total {len(dictionary.word_to_id)} words')
        return  dictionary, t_sen, summary

    t_sen, summary = _encode_and_save(df, name, dic)
    # Stored as a plain Python list, in the same row order as the tensors.
    torch.save(df['id'].tolist(), f'./data/{name}_idlist')
    print(f'Total {len(dic.word_to_id)} words')
    return  t_sen, summary

In [305]:
# Build the vocabulary from the training texts and encode the training split;
# transform also writes ./data/train.trc and ./data/train_summary.trc.
train_dict, t_sen, t_sum = transform(train, 'train')

Total 127369 words
Word count after reduce: 25474
Total 25474 words


In [306]:
# Store the pruned training vocabulary on disk so it can be reloaded later.
with open('data/train_dict_cut.pkl', 'wb') as f:
    pickle.dump(train_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

In [307]:
# Encode the validation split with the training vocabulary; transform also
# writes ./data/valid.trc, ./data/valid_summary.trc and ./data/valid_idlist.
v_sen, v_sum = transform(valid, 'valid', train_dict)

Total 25474 words


In [327]:
# Spot check: which word is stored under id 18249?
train_dict.id_to_word[18249]

'fonte'

## Save the GloVe embedding mapping (word id → embedding vector)

In [311]:
# word -> 300-dimensional float32 GloVe vector
embeddings_dict = {}
# Read the file as UTF-8 explicitly instead of relying on the platform default.
with open("./data/glove.6B.300d.txt", 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector
# Add random vectors for the SOS, EOS and UNK markers, which GloVe does not
# provide; cast to float32 so every vector in the dict has the same dtype.
embeddings_dict['_sos_'] =  np.random.rand(300, ).astype("float32")
embeddings_dict['_eos_'] =  np.random.rand(300, ).astype("float32")
embeddings_dict['_unk_'] =  np.random.rand(300, ).astype("float32")

In [312]:
# Reload the pruned training vocabulary written earlier in this notebook.
# The `words_dict` class must already be defined in this session for the
# unpickling to succeed; no throw-away instance needs to be created first.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('./data/train_dict_cut.pkl', 'rb') as f:
    dictionary = pickle.load(f)

In [313]:
# id -> embedding vector; words without a GloVe vector share the '_unk_' vector.
# Iterate word_to_id (one entry per vocabulary word) so the mapping holds
# exactly one vector per valid id, without relying on id_to_word being free
# of ids left over from before the vocabulary was pruned.
unk_vector = embeddings_dict['_unk_']
mapping_dict = {
    idx: embeddings_dict.get(word, unk_vector)
    for word, idx in dictionary.word_to_id.items()
}

In [314]:
# Store the id -> embedding mapping on disk.
with open('data/glove_id_to_emb.pkl', 'wb') as f:
    pickle.dump(mapping_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

In [336]:
# Scratch check: right-pad a (batch, length) tensor with ones up to max_len columns.
max_len = 100
a = torch.rand(8,31)
batch, length = a.size()
# number of padding columns still missing (assumes length <= max_len)
lack = max_len - length
end = torch.ones(batch, lack)
b = torch.cat((a, end), dim=1)
b.shape


torch.Size([8, 100])