In [1]:
import json
import spacy
import en_core_web_sm
import numpy as np
import random
from collections import defaultdict

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as Func
import torch.optim as optim
from torch.utils.data.dataset import Dataset
from torch.nn.utils.rnn import pad_sequence

In [526]:
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path of a text file holding one JSON document per line.

    Returns:
        list: The decoded JSON value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8; decoding explicitly avoids depending on the
    # platform's default encoding. Blank lines (e.g. a trailing newline at
    # the end of the file) are skipped instead of crashing json.loads.
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


train = load_jsonl('./data/train.jsonl')
valid = load_jsonl('./data/valid.jsonl')
test = load_jsonl('./data/test.jsonl')

In [9]:
# dictionary = {}
# dictionary_rev = {}
# for row in train:
#     tokens = nlp(row['text'])
#     tokens = [token for token in tokens if token.is_alpha]
#     for tk in tokens:
#         if not dictionary.get(tk):
#             dictionary[tk] = len(dictionary)
#             dictionary_rev[len(dictionary)] = tk

In [414]:
# Sanity check: number of records loaded into `train`.
len(train)

71604

In [14]:
# Load the pre-trained GloVe vectors into a {word: float32 vector} dict.
# Each line of the file is "<word> <v1> <v2> ...", separated by whitespace.
embeddings_dict = {}
# The GloVe vocabulary contains non-ASCII tokens, so the file has to be
# decoded as UTF-8 explicitly; relying on the platform default encoding
# raises UnicodeDecodeError on systems whose default is not UTF-8.
with open("./glove.6B/glove.6B.300d.txt", 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        embeddings_dict[word] = np.asarray(values[1:], dtype="float32")

In [585]:
class SummaryDataset(Dataset):
    """Dataset that turns raw records into per-word embedding tensors.

    Every record is a dict. In training mode the keys ``'text'``,
    ``'sent_bounds'`` and ``'extractive_summary'`` are read; in test mode
    ``'text'``, ``'sent_bounds'`` and ``'id'`` are read. Each entry of
    ``'sent_bounds'`` is indexed as ``[start, end]`` and used to slice
    ``'text'`` into one sentence.

    Args:
        data: Indexable collection of records.
        tokenizer: Callable mapping a string to an iterable of tokens that
            expose ``is_alpha`` and ``lower_`` (e.g. a spaCy pipeline).
        w2v: Dict mapping a lower-cased word to its embedding vector.
        test: When True, ``__getitem__`` returns the test-mode tuple.
    """

    def __init__(self, data, tokenizer, w2v, test = False):
        self.data = data
        self.tokenizer = tokenizer
        self.w2v = w2v
        self.test = test

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(embeddings, labels)`` in training mode, or
        ``(embeddings, id, word_pos, sentences)`` in test mode."""
        if self.test:
            return self.tokenize_test(self.data[idx])
        return self.tokenize(self.data[idx])

    def _stack(self, embeddings):
        """Convert a list of word vectors into a float32 tensor ``(n_words, dim)``.

        An empty list yields a ``(0, dim)`` tensor (``dim`` taken from the
        first vector stored in ``w2v``) rather than a shape ``(0,)`` tensor,
        so that a record without any known word can still be padded together
        with other records instead of breaking the whole batch.
        """
        if embeddings:
            # One numpy conversion is far cheaper than letting torch.tensor
            # walk a Python list of arrays element by element.
            return torch.from_numpy(np.asarray(embeddings, dtype=np.float32))
        first_vector = next(iter(self.w2v.values()), None)
        dim = 0 if first_vector is None else len(first_vector)
        return torch.zeros((0, dim), dtype=torch.float32)

    def tokenize(self, inputs):
        """Build one training sample from a record.

        When the record has more than one sentence, the answer sentence
        (index ``inputs['extractive_summary']``) is paired with one randomly
        drawn other sentence, kept in their original relative order.
        Otherwise all sentences of the record are used as they are.

        Returns:
            tuple: ``(embeddings, labels)`` where ``embeddings`` is a float32
            tensor ``(n_words, dim)`` and ``labels`` is a long tensor
            ``(n_words,)`` holding 1 for words of the answer sentence and 0
            for every other word.
        """
        data = inputs['text']
        sent_b = inputs['sent_bounds']
        ans_idx = inputs['extractive_summary']

        if len(sent_b) > 1:
            # Rejection-sample a negative sentence different from the answer.
            n_sam_idx = random.randint(0, len(sent_b) - 1)
            while n_sam_idx == ans_idx:
                n_sam_idx = random.randint(0, len(sent_b) - 1)

            # Keep the two sentences in document order and remember at which
            # of the two positions the answer ended up.
            if n_sam_idx > ans_idx:
                sent_b = [sent_b[ans_idx], sent_b[n_sam_idx]]
                ans_idx = 0
            else:
                sent_b = [sent_b[n_sam_idx], sent_b[ans_idx]]
                ans_idx = 1

        all_embeddings = []
        labels = []
        for idx, pos in enumerate(sent_b):
            embeddings, words = self.clean(self.tokenizer(data[pos[0]: pos[1]]))
            all_embeddings += embeddings
            labels += [1 if idx == ans_idx else 0] * len(words)

        # dtype is fixed so that an empty label list does not silently become
        # a float tensor.
        return self._stack(all_embeddings), torch.tensor(labels, dtype=torch.long)

    def tokenize_test(self, inputs):
        """Build one inference sample from a record.

        Returns:
            tuple: ``(embeddings, id, word_pos, sentences)`` where
            ``embeddings`` is a float32 tensor ``(n_words, dim)``, ``id`` is
            ``inputs['id']``, ``word_pos[k]`` is the index of the sentence
            that word ``k`` came from, and ``sentences`` lists every sentence
            string with its last two characters removed.
        """
        data = inputs['text']
        sent_b = inputs['sent_bounds']
        sample_id = inputs['id']
        all_embeddings = []
        word_pos = []
        all_sentences = []
        for idx, pos in enumerate(sent_b):
            sentence = data[pos[0]: pos[1]]
            # NOTE(review): the last two characters are dropped, presumably
            # trailing whitespace/newline -- confirm against the data format.
            all_sentences.append(sentence[:-2])
            embeddings, words = self.clean(self.tokenizer(sentence))
            all_embeddings += embeddings
            word_pos += [idx] * len(words)
        return self._stack(all_embeddings), sample_id, word_pos, all_sentences

    def clean(self, tokens):
        """Keep alphabetic tokens whose lower-cased form is a key of ``w2v``.

        Returns:
            tuple: ``(embeddings, words)`` -- the embedding vector and the
            lower-cased text of every kept token, in input order.
        """
        embeddings = []
        words = []
        for token in tokens:
            word = token.lower_
            if token.is_alpha and word in self.w2v:
                embeddings.append(self.w2v[word])
                words.append(word)
        return embeddings, words
  

In [577]:
def create_mini_batch(samples):
    """Collate training samples into zero-padded batch tensors.

    Args:
        samples: Sequence of ``(tokens_tensor, labels)`` pairs.

    Returns:
        tuple: ``(tokens_tensors, labels)``, both zero-padded along the
        sequence axis with the batch axis first. If the samples cannot be
        padded together, ``([], [])`` is returned so the caller can skip the
        batch.
    """
    # Training samples carry labels next to the token embeddings.
    tokens_tensors, labels = zip(*samples)
    # Zero-pad every sequence to the length of the longest one in the batch.
    try:
        tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
        labels = pad_sequence(labels, batch_first=True)
    except Exception:
        # Best effort: an unpaddable batch is skipped. `Exception` (instead of
        # a bare `except`) lets KeyboardInterrupt / SystemExit propagate so
        # that interrupting the kernel is not mistaken for a bad batch.
        return [], []
    return tokens_tensors, labels

def create_mini_batch_test(samples):
    """Collate inference samples, zero-padding only the embedding tensors.

    Args:
        samples: Sequence of ``(tokens_tensor, words, words_pos, sentence)``
            tuples.

    Returns:
        tuple: ``(tokens_tensors, words, words_pos, sentence)`` where
        ``tokens_tensors`` is the zero-padded batch tensor (batch axis first)
        and the other three are tuples with one entry per sample. If the
        tensors cannot be padded together, ``([], [], [], [])`` is returned.
    """
    tokens_tensors, words, words_pos, sentence = zip(*samples)
    try:
        tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
    except Exception:
        # Best effort: an unpaddable batch is reported as empty. `Exception`
        # (instead of a bare `except`) lets KeyboardInterrupt / SystemExit
        # propagate.
        return [], [], [], []
    return tokens_tensors, words, words_pos, sentence

In [520]:
class LSTM_model(nn.Module):
    """LSTM tagger that emits two logits for every input time step.

    Args:
        input_size: Size of each input feature vector.
        hidden_size: Hidden size of the LSTM.
        n_layers: Number of stacked LSTM layers.
        drop_prob: Accepted for interface compatibility; currently not used
            by any layer.
        bidirectional: Whether the LSTM reads the sequence in both directions.
    """

    def __init__(self, input_size, hidden_size = 200, n_layers = 1, drop_prob=0.2, bidirectional = False):
        super(LSTM_model, self).__init__()
        self.bidirectional = bidirectional
        self.model = nn.LSTM(input_size, hidden_size, n_layers, batch_first=True, bidirectional=bidirectional)
        self.relu = nn.ReLU()
        # A bidirectional LSTM concatenates both directions, doubling the
        # width of its per-step output.
        lstm_out_size = hidden_size * 2 if bidirectional else hidden_size
        self.l1 = nn.Linear(lstm_out_size, 128)
        self.l2 = nn.Linear(128, 32)
        self.l3 = nn.Linear(32, 2)
        self.init_hidden()

    def forward(self, x, h= None):
        """Compute per-step logits.

        Args:
            x: Batch-first input of shape ``(batch, seq_len, input_size)``.
            h: Optional initial ``(h_0, c_0)`` state passed to the LSTM.

        Returns:
            Tensor of shape ``(batch * seq_len, 2)`` holding raw logits
            (no softmax / sigmoid applied).
        """
        x, _ = self.model(x, h)
        # Without a non-linearity between them the three Linear layers would
        # collapse into a single affine map, so ReLU is applied after the
        # first two.
        x = self.relu(self.l1(x))
        x = self.relu(self.l2(x))
        logits = self.l3(x)
        # Flatten batch and time so each row is one word's pair of logits.
        return logits.reshape(-1, logits.size(2))

    def init_hidden(self):
        """Initialise the LSTM parameters: orthogonal weights, zero biases."""
        for name, p in self.model.named_parameters():
            if 'weight' in name:
                nn.init.orthogonal_(p)
            elif 'bias' in name:
                nn.init.constant_(p, 0)

In [443]:
# Training configuration: embedding width fed to the model and batch size.
input_size = 300
batch_size = 64

# spaCy English pipeline, used by the dataset as its tokenizer.
nlp = en_core_web_sm.load()

# Training dataset and loader; incomplete final batches are dropped.
train_dataset = SummaryDataset(data=train, tokenizer=nlp, w2v=embeddings_dict)
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    collate_fn=create_mini_batch,
    drop_last=True,
)

In [444]:
# Build the tagger and place it on the GPU when one is available; fall back
# to the CPU instead of crashing on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTM_model(input_size).to(device)
# Running iteration counter, advanced by the training loop's progress print.
index = 0

In [446]:
# One pass over `train_loader`, training `model` to tag each word as
# belonging (1) or not belonging (0) to the answer sentence.

# Follow whatever device the model already lives on.
device = next(model.parameters()).device

# The optimizer is created once: re-creating Adam inside the loop would reset
# its running moment estimates at every step.
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
model.train()

# Running totals so the printed value is the average loss seen so far.
total = 0
total_loss = 0.0

for x, y in train_loader:
    # The collate function returns empty lists for batches it could not pad.
    if len(x) == 0:
        continue
    x = x.to(device)                   # (batch, seq_len, input_size)
    y = y.to(device).long().view(-1)   # (batch * seq_len,)
    pred = model(x)                    # (batch * seq_len, 2) raw logits

    # Class balancing for BCEWithLogitsLoss: `pos_weight` is the ratio
    # negatives / positives of each output column. Column 0 is "label == 0"
    # and column 1 is "label == 1" (see the one-hot target below). Counts are
    # clamped to 1 to avoid dividing by zero.
    # NOTE(review): positions added by zero-padding carry label 0 and are
    # therefore counted as negatives here -- confirm this is intended.
    n_total = y.numel()
    n_pos = max(int(y.sum().item()), 1)
    n_neg = max(n_total - int(y.sum().item()), 1)
    pos_weight = torch.tensor([n_pos / n_neg, n_neg / n_pos], dtype=torch.float, device=device)
    loss_f = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    # One-hot targets: label 0 -> [1, 0], label 1 -> [0, 1].
    true = Func.one_hot(y, num_classes=2).float()

    loss = loss_f(pred, true)
    opt.zero_grad()
    loss.backward()
    opt.step()

    total += n_total
    total_loss += loss.item() * n_total

    print(f'Iteration : {index+1} , Loss : {total_loss/total } ', end = '\r')
    index += 1

In [680]:
# Evaluation configuration: documents are scored one at a time.
input_size = 300
batch_size = 1

# Test-mode dataset / loader over the test split.
test_dataset = SummaryDataset(data=test, tokenizer=nlp, w2v=embeddings_dict, test=True)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    collate_fn=create_mini_batch_test,
)

# Test-mode dataset / loader over the validation split.
valid_dataset = SummaryDataset(data=valid, tokenizer=nlp, w2v=embeddings_dict, test=True)
valid_loader = torch.utils.data.DataLoader(
    dataset=valid_dataset,
    batch_size=batch_size,
    collate_fn=create_mini_batch_test,
)

In [681]:
# Predict, for every document served by `valid_loader`, the (up to two)
# sentences that received the most words tagged as class 1, and collect one
# JSON line per document in `prediction`.

# Follow whatever device the model already lives on.
device = next(model.parameters()).device
model.eval()

prediction_lines = []
# No gradients are needed for inference; this avoids building autograd graphs.
with torch.no_grad():
    for x, doc_id, words_pos, sentence in valid_loader:
        # Document without any known word: fall back to the first sentence.
        if len(x[0]) == 0:
            prediction_lines.append(json.dumps({"id": doc_id[0], "predict_sentence_index": [0]}) + '\n')
            continue

        x = x.to(device)        # (1, n_words, input_size)
        pred = model(x)         # (n_words, 2) raw logits (batch size is 1)
        # The arg-max of the logits is the predicted class of each word.
        ans = pred.argmax(dim=-1).tolist()

        # Count, per sentence, how many of its words were tagged as class 1.
        votes = defaultdict(int)
        for sent_idx, label in zip(words_pos[0], ans):
            if label == 1:
                votes[sent_idx] += 1
        ranked = sorted(votes.items(), key=lambda item: item[1], reverse=True)

        # Keep the two best sentences; when no word was tagged as class 1 use
        # the same first-sentence fallback as for empty documents instead of
        # emitting an empty list.
        extractive_pred = [sent_idx for sent_idx, _ in ranked][:2] or [0]
        prediction_lines.append(json.dumps({"id": doc_id[0], "predict_sentence_index": extractive_pred}) + '\n')

        print(doc_id[0], end ='\r')

prediction = ''.join(prediction_lines)

2019999

In [682]:
# Persist the collected prediction lines to disk.
with open('prediction.json', 'w') as out_file:
    out_file.write(prediction)