In [9]:
import json
import spacy
import en_core_web_sm
import numpy as np

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as Func
import torch.optim as optim
from torch.utils.data.dataset import Dataset
from torch.nn.utils.rnn import pad_sequence

In [2]:
# Load the training set: one JSON object per line (JSON Lines format).
train = []
# JSON text is UTF-8; be explicit so the result does not depend on the
# platform's default encoding.
with open('./data/train.jsonl', 'r', encoding='utf-8') as f:
    for row in f:
        # Tolerate blank lines (e.g. a trailing newline at the end of the
        # file), which json.loads would otherwise reject.
        if row.strip():
            train.append(json.loads(row))

In [25]:
# dictionary = {}
# dictionary_rev = {}
# for row in train:
#     tokens = nlp(row['text'])
#     tokens = [token for token in tokens if token.is_alpha]
#     for tk in tokens:
#         if not dictionary.get(tk):
#             dictionary[tk] = len(dictionary)
#             dictionary_rev[len(dictionary)] = tk

In [39]:
# Display the first training record to inspect the fields each row carries.
train[0]

{'id': '1000000',
 'summary': 'A seven-hundred-year old oak gate at Salisbury Cathedral has been demolished by a drink driver.\n',
 'text': 'The Grade I listed Harnham Gate was hit by a white van that smashed into the structure at about 02:00 BST.\nA 51-year-old man, from West Dean, has been arrested on suspicion of failing to stop, criminal damage and driving with excess alcohol, police said.\nWiltshire Police said the man remains in police custody and they have asked for witnesses to contact them.\n',
 'sent_bounds': [[0, 107], [107, 255], [255, 362]],
 'extractive_summary': 1}

In [None]:
# Build a word -> vector lookup from the pre-trained GloVe text file.
# Each line holds a token followed by its whitespace-separated coefficients.
embeddings_dict = {}
# The GloVe vocabulary contains non-ASCII tokens, so read it as UTF-8 rather
# than relying on the platform's default encoding.
with open("./glove.6B/glove.6B.300d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip empty lines instead of failing on values[0].
            continue
        word = values[0]
        embeddings_dict[word] = np.asarray(values[1:], dtype="float32")

In [169]:
class SummaryDataset(Dataset):
    """Map-style dataset that turns each record's text into word embeddings.

    Every record is a dict with a ``'text'`` entry and, unless ``test`` is
    true, a ``'summary'`` entry.

    Args:
        data: Indexable collection of record dicts.
        tokenizer: Callable mapping a string to an iterable of tokens that
            expose ``is_alpha`` and ``lower_``.
        w2v: Mapping from lower-cased word to its embedding vector.
        test: When true, ``__getitem__`` returns only the embedding tensor
            and never reads the ``'summary'`` entry.
    """

    def __init__(self, data, tokenizer, w2v, test = False):
        self.data = data
        self.tokenizer = tokenizer
        self.w2v = w2v
        self.test = test
        # Width of one embedding vector; used to give empty texts a
        # (0, dim) tensor that can still be padded next to non-empty ones.
        first_word = next(iter(w2v), None)
        self.embedding_dim = 0 if first_word is None else len(w2v[first_word])

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample at ``idx``.

        Returns:
            In test mode, the ``(n_words, dim)`` embedding tensor of the
            text. Otherwise a tuple ``(embedding, words, y_words)`` where
            ``words`` are the kept text words and ``y_words`` the kept
            summary words.
        """
        record = self.data[idx]
        vectors, words = self.tokenize(record)
        embedding = self._to_tensor(vectors)

        if self.test:
            # Test records may have no 'summary', so do not touch it here.
            return embedding

        _, y_words = self.tokenize(record, ans = True)
        return embedding, words, y_words

    def _to_tensor(self, vectors):
        """Stack per-word vectors into one 2-D tensor (dtype follows the vectors)."""
        if not vectors:
            return torch.zeros((0, self.embedding_dim))
        # Stacking in NumPy first avoids the slow element-wise conversion of
        # a Python list of arrays.
        return torch.as_tensor(np.stack(vectors))

    def tokenize(self, sentence, ans = False):
        """Tokenize one record and look up the embedding of each kept word.

        Args:
            sentence: Record dict; ``'summary'`` is read when ``ans`` is
                true, ``'text'`` otherwise.
            ans: Select the summary instead of the text.

        Returns:
            ``(embedding, words)``: the list of vectors and the parallel list
            of lower-cased words. Only alphabetic tokens whose lower-cased
            form is present in ``w2v`` are kept.
        """
        field = 'summary' if ans else 'text'
        embedding = []
        words = []
        for token in self.tokenizer(sentence[field]):
            # Use the same lower-cased form for the membership test and the
            # lookup so the two can never disagree.
            word = token.lower_
            if token.is_alpha and word in self.w2v:
                embedding.append(self.w2v[word])
                words.append(word)
        return embedding, words

In [170]:
def _match_empty_sequences(tensors):
    """Give empty tensors the trailing shape of their multi-dimensional siblings."""
    reference = max(tensors, key=lambda t: t.dim())
    if reference.dim() < 2:
        # Everything is (at most) 1-D; pad_sequence can handle it as is.
        return tensors
    feature_shape = tuple(reference.shape[1:])
    return tuple(
        reference.new_zeros((0,) + feature_shape)
        if t.numel() == 0 and tuple(t.shape[1:]) != feature_shape
        else t
        for t in tensors
    )


def create_mini_batch(samples):
    """Collate ``(embedding, words, y_words)`` samples into one padded batch.

    Args:
        samples: Sequence of 3-tuples as produced by the dataset in training
            mode (the samples carry labels).

    Returns:
        ``(tokens_tensors, words, y_words)`` where ``tokens_tensors`` is the
        embeddings zero-padded to a common sequence length with the batch
        dimension first, and ``words`` / ``y_words`` are tuples with one
        entry per sample. If the batch cannot be padded, a warning is issued
        and ``(None, None, None)`` is returned.
    """
    import warnings

    tokens_tensors, words, y_words = zip(*samples)
    try:
        # A sample without any known word arrives as an empty 1-D tensor;
        # reshape it to (0, dim) so one such sample does not discard the
        # whole batch, then zero-pad to the same sequence length.
        tokens_tensors = pad_sequence(
            _match_empty_sequences(tokens_tensors), batch_first=True
        )
    except Exception as err:
        # Keep the best-effort contract, but no longer swallow
        # KeyboardInterrupt/SystemExit and make the failure visible.
        warnings.warn(
            f"create_mini_batch: could not pad batch ({err!r}); returning None placeholders"
        )
        return None, None, None
    return tokens_tensors, words, y_words

In [175]:
# Hyper-parameters: embedding width fed to the model and samples per batch.
input_size = 300
batch_size = 2

# spaCy pipeline used as the tokenizer.
nlp = en_core_web_sm.load()

# Wrap the training records and batch them with the custom collate function.
train_dataset = SummaryDataset(data=train, tokenizer=nlp, w2v=embeddings_dict)
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    collate_fn=create_mini_batch,
)


In [176]:
# NOTE: LSTM_model is defined in a later cell of this notebook, so that cell
# has to be executed before this one.
model = LSTM_model(input_size)

# Peek at the first batch only and show its reference-summary words.
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    x, words, y_words = first_batch
    # Forward pass is currently disabled:
    # pred = model(x)
    print(y_words)

(['a', 'seven', 'hundred', 'year', 'old', 'oak', 'gate', 'at', 'salisbury', 'cathedral', 'has', 'been', 'demolished', 'by', 'a', 'drink', 'driver'], ['the', 'parents', 'and', 'sister', 'of', 'the', 'late', 'mp', 'jo', 'cox', 'have', 'opened', 'a', 'new', 'birth', 'centre', 'at', 'a', 'hospital', 'in', 'her', 'constituency'])


In [140]:
class LSTM_model(nn.Module):
    """LSTM followed by a ReLU and a linear layer with two outputs per step.

    Args:
        input_size: Number of features of each input step.
        hidden_size: Number of features in the LSTM hidden state.
        n_layers: Number of stacked LSTM layers.
        drop_prob: Dropout probability between stacked LSTM layers; it has
            no effect when ``n_layers`` is 1.
        bidirectional: Whether the LSTM runs in both directions.
    """

    def __init__(self, input_size, hidden_size =100, n_layers = 1, drop_prob=0.2, bidirectional = False):
        super().__init__()
        self.bidirectional = bidirectional
        self.model = nn.LSTM(
            input_size,
            hidden_size,
            n_layers,
            batch_first=True,
            # nn.LSTM applies dropout only between layers and warns when it
            # is non-zero for a single layer, so pass it only when stacked.
            dropout=drop_prob if n_layers > 1 else 0.0,
            bidirectional=bidirectional,
        )
        self.relu = nn.ReLU()
        # A bidirectional LSTM concatenates both directions, doubling the
        # feature width seen by the linear layer.
        num_directions = 2 if bidirectional else 1
        self.linear = nn.Linear(hidden_size * num_directions, 2)
        self.init_hidden()

    def forward(self, x, h= None):
        """Run the LSTM, apply ReLU, then project every step to two values.

        Args:
            x: Input passed to the LSTM (constructed with
                ``batch_first=True``).
            h: Optional initial ``(h_0, c_0)`` state forwarded to the LSTM.

        Returns:
            The first entry along dimension 0 of the projected output.
        """
        hidden_seq, _ = self.model(x, h)
        output = self.relu(hidden_seq)
        # Feed the activated features to the linear layer; previously the
        # ReLU result was computed and then discarded.
        output = self.linear(output)
        # NOTE(review): with batched input this keeps only the first sample
        # of the batch; kept as is for compatibility - confirm it is intended.
        return output[0]

    def init_hidden(self):
        """Initialise the LSTM parameters (not the hidden state).

        Weights get an orthogonal initialisation and biases are set to zero.
        """
        for name, p in self.model.named_parameters():
            if 'weight' in name:
                nn.init.orthogonal_(p)
            elif 'bias' in name:
                nn.init.constant_(p, 0)