In [None]:
!pip install pytorch-pretrained-bert



In [None]:
import os
from tqdm import tqdm_notebook as tqdm
import numpy as np
import torch
import torch.nn as nn
from torch.utils import data
import torch.optim as optim
from pytorch_pretrained_bert import BertTokenizer

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
with open("/content/drive/MyDrive/smt 2/NLP/sequential labelling/dataset/train_preprocess.txt", 'r') as f:
    lines = f.readlines()

doc = []
sent = []
for e in lines:
    if e != "\n":
        split_sent = e.split("\t")
        label = split_sent[1].split("\n")[0]
        sent.append((split_sent[0], label))
    else:
        doc.append(sent)
        sent = []
doc.append(sent)

with open("/content/drive/MyDrive/smt 2/NLP/sequential labelling/dataset/valid_preprocess.txt", 'r') as f:
    lines = f.readlines()

validation_doc = []
sent = []
for e in lines:
    if e != "\n":
        split_sent = e.split("\t")
        label = split_sent[1].split("\n")[0]
        sent.append((split_sent[0], label))
    else:
        validation_doc.append(sent)
        sent = []
validation_doc.append(sent)

print(validation_doc[1])

with open("/content/drive/MyDrive/smt 2/NLP/sequential labelling/dataset/test_preprocess_masked_label.txt", 'r') as f:
    lines = f.readlines()

test_doc = []
sent = []
for e in lines:
    if e != "\n":
        split_sent = e.split("\t")
        label = split_sent[1].split("\n")[0]
        sent.append((split_sent[0], label))
    else:
        test_doc.append(sent)
        sent = []
test_doc.append(sent)


[('admin', 'O'), ('@halobca', 'B'), ('kok', 'O'), ('susah', 'B'), ('dihubungi', 'B'), ('ya', 'O'), ('apa', 'O'), ('sedang', 'O'), ('gangguan', 'B')]


In [None]:
tagged_sents = doc + validation_doc

In [None]:
tagged_sents[0]

[('Setelah', 'O'),
 ('melalui', 'B'),
 ('proses', 'B'),
 ('telepon', 'I'),
 ('yang', 'O'),
 ('panjang', 'O'),
 ('tutup', 'B'),
 ('sudah', 'O'),
 ('kartu', 'B'),
 ('kredit', 'I'),
 ('bca', 'I'),
 ('Ribet', 'B')]

In [None]:
tags = list(set(word_pos[1] for sent in tagged_sents for word_pos in sent))
",".join(tags)

'I,B,O'

In [None]:
# By convention, the 0'th slot is reserved for padding.
tags = ["<pad>"] + tags
tag2idx = {tag:idx for idx, tag in enumerate(tags)}
idx2tag = {idx:tag for idx, tag in enumerate(tags)}

In [None]:
# Let's split the data into train and test (or eval)
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(tagged_sents, test_size=.1)
len(train_data), len(test_data)

(900, 100)

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
class PosDataset(data.Dataset):
    def __init__(self, tagged_sents):
        sents, tags_li = [], [] # list of lists
        for sent in tagged_sents:
            words = [word_pos[0] for word_pos in sent]
            tags = [word_pos[1] for word_pos in sent]
            sents.append(["[CLS]"] + words + ["[SEP]"])
            tags_li.append(["<pad>"] + tags + ["<pad>"])
        self.sents, self.tags_li = sents, tags_li

    def __len__(self):
        return len(self.sents)

    def __getitem__(self, idx):
        words, tags = self.sents[idx], self.tags_li[idx] # words, tags: string list

        # We give credits only to the first piece.
        x, y = [], [] # list of ids
        is_heads = [] # list. 1: the token is the first piece of a word
        for w, t in zip(words, tags):
            tokens = tokenizer.tokenize(w) if w not in ("[CLS]", "[SEP]") else [w]
            xx = tokenizer.convert_tokens_to_ids(tokens)

            is_head = [1] + [0]*(len(tokens) - 1)

            t = [t] + ["<pad>"] * (len(tokens) - 1)  # <PAD>: no decision
            yy = [tag2idx[each] for each in t]  # (T,)

            x.extend(xx)
            is_heads.extend(is_head)
            y.extend(yy)

        assert len(x)==len(y)==len(is_heads), "len(x)={}, len(y)={}, len(is_heads)={}".format(len(x), len(y), len(is_heads))

        # seqlen
        seqlen = len(y)

        # to string
        words = " ".join(words)
        tags = " ".join(tags)
        return words, x, is_heads, tags, y, seqlen

In [None]:
def pad(batch):
    '''Pads to the longest sample'''
    f = lambda x: [sample[x] for sample in batch]
    words = f(0)
    is_heads = f(2)
    tags = f(3)
    seqlens = f(-1)
    maxlen = np.array(seqlens).max()

    f = lambda x, seqlen: [sample[x] + [0] * (seqlen - len(sample[x])) for sample in batch] # 0: <pad>
    x = f(1, maxlen)
    y = f(-2, maxlen)


    f = torch.LongTensor

    return words, f(x), is_heads, tags, f(y), seqlens

In [None]:
from pytorch_pretrained_bert import BertModel

class Net(nn.Module):
    def __init__(self, vocab_size=None):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.fc = nn.Linear(768, vocab_size)
        self.device = device

    def forward(self, x, y):
        '''
        x: (N, T). int64
        y: (N, T). int64
        '''
        x = x.to(device)
        y = y.to(device)
        
        if self.training:
            self.bert.train()
            encoded_layers, _ = self.bert(x)
            enc = encoded_layers[-1]
        else:
            self.bert.eval()
            with torch.no_grad():
                encoded_layers, _ = self.bert(x)
                enc = encoded_layers[-1]
        
        logits = self.fc(enc)
        y_hat = logits.argmax(-1)
        return logits, y, y_hat

In [None]:
def train(model, iterator, optimizer, criterion):
    model.train()
    for i, batch in enumerate(iterator):
        words, x, is_heads, tags, y, seqlens = batch
        _y = y # for monitoring
        optimizer.zero_grad()
        logits, y, _ = model(x, y) # logits: (N, T, VOCAB), y: (N, T)

        logits = logits.view(-1, logits.shape[-1]) # (N*T, VOCAB)
        y = y.view(-1)  # (N*T,)

        loss = criterion(logits, y)
        loss.backward()

        optimizer.step()

        if i%10==0: # monitoring
            print("step: {}, loss: {}".format(i, loss.item()))

In [None]:
def eval(model, iterator):
    model.eval()

    Words, Is_heads, Tags, Y, Y_hat = [], [], [], [], []
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            words, x, is_heads, tags, y, seqlens = batch

            _, _, y_hat = model(x, y)  # y_hat: (N, T)

            Words.extend(words)
            Is_heads.extend(is_heads)
            Tags.extend(tags)
            Y.extend(y.numpy().tolist())
            Y_hat.extend(y_hat.cpu().numpy().tolist())

    ## gets results and save
    with open("result", 'w') as fout:
        for words, is_heads, tags, y_hat in zip(Words, Is_heads, Tags, Y_hat):
            y_hat = [hat for head, hat in zip(is_heads, y_hat) if head == 1]
            preds = [idx2tag[hat] for hat in y_hat]
            assert len(preds)==len(words.split())==len(tags.split())
            for w, t, p in zip(words.split()[1:-1], tags.split()[1:-1], preds[1:-1]):
                fout.write("{} {} {}\n".format(w, t, p))
            fout.write("\n")
            
    ## calc metric
    y_true =  np.array([tag2idx[line.split()[1]] for line in open('result', 'r').read().splitlines() if len(line) > 0])
    y_pred =  np.array([tag2idx[line.split()[2]] for line in open('result', 'r').read().splitlines() if len(line) > 0])

    acc = (y_true==y_pred).astype(np.int32).sum() / len(y_true)

    print("acc=%.2f"%acc)

In [None]:
model = Net(vocab_size=len(tag2idx))
model.to(device)
model = nn.DataParallel(model)

100%|██████████| 407873900/407873900 [00:14<00:00, 27313125.54B/s]


In [None]:
train_dataset = PosDataset(train_data)
eval_dataset = PosDataset(test_data)

train_iter = data.DataLoader(dataset=train_dataset,
                             batch_size=8,
                             shuffle=True,
                             num_workers=1,
                             collate_fn=pad)
test_iter = data.DataLoader(dataset=eval_dataset,
                             batch_size=8,
                             shuffle=False,
                             num_workers=1,
                             collate_fn=pad)

optimizer = optim.Adam(model.parameters(), lr = 0.0001)

criterion = nn.CrossEntropyLoss(ignore_index=0)

In [None]:
train(model, train_iter, optimizer, criterion)
eval(model, test_iter)

step: 0, loss: 1.3086016178131104
step: 10, loss: 0.9437406063079834
step: 20, loss: 0.6064664125442505
step: 30, loss: 0.6228988766670227
step: 40, loss: 0.6202276349067688
step: 50, loss: 0.6766679286956787
step: 60, loss: 0.54524827003479
step: 70, loss: 0.5278115272521973
step: 80, loss: 0.5517987608909607
step: 90, loss: 0.4807947278022766
step: 100, loss: 0.5193215608596802
step: 110, loss: 0.5195705890655518
acc=0.80


In [None]:
test_data = test_doc
eval_dataset = PosDataset(test_data)
test_iter = data.DataLoader(dataset=eval_dataset,
                             batch_size=8,
                             shuffle=False,
                             num_workers=1,
                             collate_fn=pad)

train(model, train_iter, optimizer, criterion)
eval(model, test_iter)

step: 0, loss: 0.33176663517951965
step: 10, loss: 0.3484683036804199
step: 20, loss: 0.27326667308807373
step: 30, loss: 0.18611134588718414
step: 40, loss: 0.24255941808223724
step: 50, loss: 0.2763305902481079
step: 60, loss: 0.3419741690158844
step: 70, loss: 0.32503655552864075
step: 80, loss: 0.371150940656662
step: 90, loss: 0.23849371075630188
step: 100, loss: 0.18994107842445374
step: 110, loss: 0.21894149482250214
acc=0.58


In [None]:
with open("result.txt", 'r') as f:
    lines = f.readlines()

test_pred = []
sent = []
for e in lines:
    if e != "\n":
        split_sent = e.split(" ")
        label = split_sent[2].split("\n")[0]
        sent.append((split_sent[0], label))
    else:
        test_pred.append(sent)
        sent = []

In [None]:
i = 1
while True:
    print(i)
    print(test_doc[i])
    print(test_pred[i])
    if test_pred[i][0][0] != test_doc[i][0][0]:
        break
    i += 1
print(test_pred[i])
print(test_doc[i])



1
[('2', 'O'), ('minggu', 'O'), ('terakhir', 'O'), ('bolak', 'O'), ('balik', 'O'), ('kcp', 'O'), ('bca', 'O'), ('gegara', 'O'), ('isi', 'O'), ('flazz', 'O'), ('pakai', 'O'), ('atm', 'O'), ('sudah', 'O'), ('terdebit', 'O'), ('tetapi', 'O'), ('saldo', 'O'), ('flazz', 'O'), ('enggak', 'O'), ('menambah', 'O'), ('sampai', 'O'), ('sekarang', 'O'), ('belum', 'O'), ('kelar', 'O')]
[('2', 'O'), ('minggu', 'O'), ('terakhir', 'O'), ('bolak', 'O'), ('balik', 'O'), ('kcp', 'I'), ('bca', 'I'), ('gegara', 'O'), ('isi', 'B'), ('flazz', 'B'), ('pakai', 'O'), ('atm', 'B'), ('sudah', 'O'), ('terdebit', 'O'), ('tetapi', 'O'), ('saldo', 'B'), ('flazz', 'B'), ('enggak', 'O'), ('menambah', 'B'), ('sampai', 'O'), ('sekarang', 'O'), ('belum', 'B'), ('kelar', 'I')]
2
[('kok', 'O'), ('bisa-bisanya', 'O'), ('atm', 'O'), ('bca', 'O'), ('error', 'O'), ('sih', 'O'), ('saya', 'O'), ('sudah', 'O'), ('masukkan', 'O'), ('kartu', 'O'), ('terus', 'O'), ('atm-nya', 'O'), ('enggak', 'O'), ('jalan-jalan', 'O')]
[('kok', 'O')

IndexError: ignored

In [None]:
len(test_doc)

247

In [None]:
test_pred[-1]

[('Bagus-bagus', 'O'), ('teller', 'B'), ('bca', 'I'), ('pelayanannya', 'B')]

In [None]:
labels = []
for sentence in test_pred:
    sent_label = []
    for w in sentence:
        sent_label.append(w[1])
    labels.append(sent_label)

In [None]:
import csv 
    
# field names 
fields = ['index', 'label'] 
    
# data rows of csv file 
rows = [[i, str(e)] for i, e in enumerate(labels)] 
print(rows)
    
# name of csv file 
filename = "pred.txt"
    
# writing to csv file 
with open(filename, 'w') as csvfile: 
    # creating a csv writer object 
    csvwriter = csv.writer(csvfile) 
        
    # writing the fields 
    csvwriter.writerow(fields) 

    # writing the data rows 
    csvwriter.writerows(rows)



[[0, "['O', 'B', 'I', 'O', 'O', 'B', 'O', 'B', 'I', 'O', 'O', 'O', 'O', 'B', 'I', 'O']"], [1, "['O', 'O', 'O', 'O', 'O', 'I', 'I', 'O', 'B', 'B', 'O', 'B', 'O', 'O', 'O', 'B', 'B', 'O', 'B', 'O', 'O', 'B', 'I']"], [2, "['O', 'O', 'B', 'I', 'I', 'O', 'O', 'O', 'B', 'B', 'O', 'O', 'B', 'I']"], [3, "['O', 'B', 'O', 'B', 'O', 'B', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'B', 'I']"], [4, "['O', 'O', 'B', 'B', 'B', 'O', 'B', 'I', 'O', 'B', 'B', 'B', 'I', 'O', 'O', 'O', 'O', 'B', 'I']"], [5, "['B', 'B', 'I', 'O', 'B', 'O', 'B', 'O', 'O', 'B', 'I', 'O', 'O', 'O', 'B', 'O', 'B', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'B', 'O', 'B', 'B']"], [6, "['O', 'O', 'O', 'O', 'B', 'B', 'O', 'B', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'B', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'B', 'B', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'B', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B']"], [7, "['O', 