In [1]:
import os
from tqdm import tqdm_notebook as tqdm
import numpy as np
import torch
import torch.nn as nn
from torch.utils import data
import torch.optim as optim
from pytorch_pretrained_bert import BertTokenizer, BertModel

In [2]:
def read_documents(file_path: str):
    documents = []
    sentence = []
    with open(file_path, 'r') as f:
        for line in f.readlines():
            if '\t' in line:
                line = line[:-1]
                word, label = line.split('\t')
                # We ignore case
                word = word.lower()
                sentence.append((word, label))
            else:
                documents.append(sentence)
                sentence = []
    if len(sentence) > 0:
        documents.append(sentence)
    return documents

def convert_to_array(documents):
    X = []
    y = []
    for sentence in documents:
        X_sentence = []
        y_sentence = []
        for word, label in sentence:
            X_sentence.append(word)
            y_sentence.append(label)
        X.append(X_sentence)
        y.append(y_sentence)

    return X, y


In [3]:
train_documents = read_documents('dataset/nergrit_ner-grit/train_preprocess.txt')
validation_documents = read_documents('dataset/nergrit_ner-grit/valid_preprocess.txt')
test_documents = read_documents('dataset/nergrit_ner-grit/test_preprocess_masked_label.txt')

In [4]:
tagged_sents = train_documents + validation_documents

In [5]:
tags = list(set(word_pos[1] for sent in tagged_sents for word_pos in sent))
",".join(tags)

'B-ORGANISATION,B-PLACE,I-PERSON,I-PLACE,O,I-ORGANISATION,B-PERSON'

In [6]:
# By convention, the 0'th slot is reserved for padding.
tags = ["<pad>"] + tags
tag2idx = {tag:idx for idx, tag in enumerate(tags)}
idx2tag = {idx:tag for idx, tag in enumerate(tags)}

In [7]:
# Let's split the data into train and test (or eval)
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(tagged_sents, test_size=.1)
len(train_data), len(test_data)

(1692, 189)

In [8]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [9]:
# PRETRAINED = 'indobenchmark/indobert-base-p1'
PRETRAINED = 'bert-base-uncased'

In [10]:
tokenizer = BertTokenizer.from_pretrained(PRETRAINED)

In [11]:
class PosDataset(data.Dataset):
    def __init__(self, tagged_sents):
        sents, tags_li = [], [] # list of lists
        for sent in tagged_sents:
            words = [word_pos[0] for word_pos in sent]
            tags = [word_pos[1] for word_pos in sent]
            sents.append(["[CLS]"] + words + ["[SEP]"])
            tags_li.append(["<pad>"] + tags + ["<pad>"])
        self.sents, self.tags_li = sents, tags_li

    def __len__(self):
        return len(self.sents)

    def __getitem__(self, idx):
        words, tags = self.sents[idx], self.tags_li[idx] # words, tags: string list

        # We give credits only to the first piece.
        x, y = [], [] # list of ids
        is_heads = [] # list. 1: the token is the first piece of a word
        for w, t in zip(words, tags):
            tokens = tokenizer.tokenize(w) if w not in ("[CLS]", "[SEP]") else [w]
            xx = tokenizer.convert_tokens_to_ids(tokens)

            is_head = [1] + [0]*(len(tokens) - 1)

            t = [t] + ["<pad>"] * (len(tokens) - 1)  # <PAD>: no decision
            yy = [tag2idx[each] for each in t]  # (T,)

            x.extend(xx)
            is_heads.extend(is_head)
            y.extend(yy)

        assert len(x)==len(y)==len(is_heads), "len(x)={}, len(y)={}, len(is_heads)={}".format(len(x), len(y), len(is_heads))

        # seqlen
        seqlen = len(y)

        # to string
        words = " ".join(words)
        tags = " ".join(tags)
        return words, x, is_heads, tags, y, seqlen

In [12]:
def pad(batch):
    '''Pads to the longest sample'''
    f = lambda x: [sample[x] for sample in batch]
    words = f(0)
    is_heads = f(2)
    tags = f(3)
    seqlens = f(-1)
    maxlen = np.array(seqlens).max()

    f = lambda x, seqlen: [sample[x] + [0] * (seqlen - len(sample[x])) for sample in batch] # 0: <pad>
    x = f(1, maxlen)
    y = f(-2, maxlen)


    f = torch.LongTensor

    return words, f(x), is_heads, tags, f(y), seqlens

In [13]:
class Net(nn.Module):
    def __init__(self, vocab_size=None):
        super().__init__()
        self.bert = BertModel.from_pretrained(PRETRAINED)

        self.fc = nn.Linear(768, vocab_size)
        self.device = device

    def forward(self, x, y):
        '''
        x: (N, T). int64
        y: (N, T). int64
        '''
        x = x.to(device)
        y = y.to(device)

        if self.training:
            self.bert.train()
            encoded_layers, _ = self.bert(x)
            enc = encoded_layers[-1]
        else:
            self.bert.eval()
            with torch.no_grad():
                encoded_layers, _ = self.bert(x)
                enc = encoded_layers[-1]

        logits = self.fc(enc)
        y_hat = logits.argmax(-1)
        return logits, y, y_hat

In [14]:
def train(model, iterator, optimizer, criterion):
    model.train()
    for i, batch in enumerate(iterator):
        words, x, is_heads, tags, y, seqlens = batch
        _y = y # for monitoring
        optimizer.zero_grad()
        logits, y, _ = model(x, y) # logits: (N, T, VOCAB), y: (N, T)

        logits = logits.view(-1, logits.shape[-1]) # (N*T, VOCAB)
        y = y.view(-1)  # (N*T,)

        loss = criterion(logits, y)
        loss.backward()

        optimizer.step()

        if i%10==0: # monitoring
            print("step: {}, loss: {}".format(i, loss.item()))

In [15]:
def eval(model, iterator):
    model.eval()

    Words, Is_heads, Tags, Y, Y_hat = [], [], [], [], []
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            words, x, is_heads, tags, y, seqlens = batch

            _, _, y_hat = model(x, y)  # y_hat: (N, T)

            Words.extend(words)
            Is_heads.extend(is_heads)
            Tags.extend(tags)
            Y.extend(y.numpy().tolist())
            Y_hat.extend(y_hat.cpu().numpy().tolist())

    ## gets results and save
    with open("result", 'w') as fout:
        for words, is_heads, tags, y_hat in zip(Words, Is_heads, Tags, Y_hat):
            y_hat = [hat for head, hat in zip(is_heads, y_hat) if head == 1]
            preds = [idx2tag[hat] for hat in y_hat]
            assert len(preds)==len(words.split())==len(tags.split())
            for w, t, p in zip(words.split()[1:-1], tags.split()[1:-1], preds[1:-1]):
                fout.write("{} {} {}\n".format(w, t, p))
            fout.write("\n")

    ## calc metric
    y_true =  np.array([tag2idx[line.split()[1]] for line in open('result', 'r').read().splitlines() if len(line) > 0])
    y_pred =  np.array([tag2idx[line.split()[2]] for line in open('result', 'r').read().splitlines() if len(line) > 0])

    acc = (y_true==y_pred).astype(np.int32).sum() / len(y_true)

    print("acc=%.2f"%acc)

In [16]:
model = Net(vocab_size=len(tag2idx))
model.to(device)
model = nn.DataParallel(model)

100%|██████████| 407873900/407873900 [25:36<00:00, 265377.49B/s] 


In [17]:
train_dataset = PosDataset(train_data)
eval_dataset = PosDataset(test_data)

train_iter = data.DataLoader(dataset=train_dataset,
                             batch_size=8,
                             shuffle=True,
                             num_workers=8,
                             collate_fn=pad)
test_iter = data.DataLoader(dataset=eval_dataset,
                             batch_size=8,
                             shuffle=False,
                             num_workers=8,
                             collate_fn=pad)

optimizer = optim.Adam(model.parameters(), lr = 0.0001)

criterion = nn.CrossEntropyLoss(ignore_index=0)

In [18]:
train(model, train_iter, optimizer, criterion)
eval(model, test_iter)

step: 0, loss: 1.9981698989868164
step: 10, loss: 0.7055408954620361
step: 20, loss: 0.7445908188819885
step: 30, loss: 0.45730268955230713
step: 40, loss: 0.48175108432769775
step: 50, loss: 0.7455345988273621
step: 60, loss: 0.6264729499816895
step: 70, loss: 0.6301479935646057
step: 80, loss: 0.44026342034339905
step: 90, loss: 0.41381824016571045
step: 100, loss: 0.38573962450027466
step: 110, loss: 0.4535985589027405
step: 120, loss: 0.5745534896850586
step: 130, loss: 0.37223002314567566
step: 140, loss: 0.4836810231208801
step: 150, loss: 0.19094346463680267
step: 160, loss: 0.43730229139328003
step: 170, loss: 0.3795826733112335
step: 180, loss: 0.5305519104003906
step: 190, loss: 0.28053680062294006
step: 200, loss: 0.3540181517601013
step: 210, loss: 0.29329848289489746
acc=0.91


In [19]:
test_data = test_documents
eval_dataset = PosDataset(test_data)
test_iter = data.DataLoader(dataset=eval_dataset,
                             batch_size=8,
                             shuffle=False,
                             num_workers=8,
                             collate_fn=pad)

train(model, train_iter, optimizer, criterion)
eval(model, test_iter)

step: 0, loss: 0.3212142288684845
step: 10, loss: 0.3044813275337219
step: 20, loss: 0.46440231800079346
step: 30, loss: 0.17653970420360565
step: 40, loss: 0.19384726881980896
step: 50, loss: 0.05320872366428375
step: 60, loss: 0.18305547535419464
step: 70, loss: 0.24279142916202545
step: 80, loss: 0.13261672854423523
step: 90, loss: 0.2682424485683441
step: 100, loss: 0.30040839314460754
step: 110, loss: 0.214363694190979
step: 120, loss: 0.10350719094276428
step: 130, loss: 0.3611779510974884
step: 140, loss: 0.3190595209598541
step: 150, loss: 0.10926070064306259
step: 160, loss: 0.1060626283288002
step: 170, loss: 0.14066891372203827
step: 180, loss: 0.18303076922893524
step: 190, loss: 0.24988791346549988
step: 200, loss: 0.3848412334918976
step: 210, loss: 0.14026060700416565
acc=0.82


In [21]:
with open("result", 'r') as f:
    lines = f.readlines()

test_pred = []
sent = []
for e in lines:
    if e != "\n":
        split_sent = e.split(" ")
        label = split_sent[2].split("\n")[0]
        sent.append((split_sent[0], label))
    else:
        test_pred.append(sent)
        sent = []

In [22]:
labels = []
for sentence in test_pred:
    sent_label = []
    for w in sentence:
        sent_label.append(w[1])
    labels.append(sent_label)

In [23]:
import pandas as pd
import numpy as np
np.set_string_function(lambda x: repr(list(x)), repr=False)
np.set_printoptions(linewidth=np.inf)

result_df = pd.DataFrame({'label': labels}).reset_index()
result_df.to_csv('pred.txt', index=False)