In [None]:
# Shell escape: print the GPU (if any) attached to this runtime before doing any work.
!nvidia-smi

Wed May 27 09:06:44 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8    31W / 149W |      0MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [None]:
# Attach Google Drive to the Colab file system; every path used by this
# notebook (corpus, cached data, checkpoints) lives below the mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers



In [None]:
import math
import os
import pickle
import random

import numpy as np
from sklearn.metrics import classification_report
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader, Sampler
from torch.nn.utils.rnn import pad_sequence
import transformers
from transformers import XLMRobertaModel, XLMRobertaTokenizer


# --- Paths (Google Drive locations used by this notebook) ---------------------
# Tab-separated corpus: one `word<TAB>pos_tag<TAB>ezafe_tag` line per word,
# blank line between sentences (format consumed by `data_reader`).
train_data_dir = '/content/drive/My Drive/Colab Notebooks/ezafe/data/bijankhan_corpus.tsv'
# Pickle cache of the tokenised corpus.
pickled_data_dir = '/content/drive/My Drive/Colab Notebooks/ezafe/data/roberta_data.pickle'
# Prefix for checkpoints: the step number is appended directly to this string.
model_dir = '/content/drive/My Drive/Colab Notebooks/ezafe/bert_model_pos/model.pickle'
# Text file holding the last checkpointed step, read when resuming.
last_i_dir = '/content/drive/My Drive/Colab Notebooks/ezafe/bert_model_pos/last_i'

# --- Hyper-parameters ---------------------------------------------------------
HIDDEN_SIZE = 768      # presumably the encoder's hidden width (xlm-roberta-base) — confirm
NUM_EPOCHS = 100
BATCH_SIZE = 8
LEARNING_RATE = 2e-5

# --- Special ids --------------------------------------------------------------
BEGIN_TOKEN = 0        # id prepended to every sentence by `data_reader`
END_TOKEN = 2          # id appended to every sentence by `data_reader`
PAD_TOKEN = 1          # presumably the tokenizer's padding id — confirm
# Label put on special tokens and on every non-initial sub-word piece; also used
# as the value to ignore when scoring.
TAG_PAD_OR_MASK_TOKEN = 15

# Seed every RNG the notebook relies on, not only Python's `random`, so weight
# initialisation and dropout are repeatable as well as the data shuffling.
SEED = 17
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device:', device)

device: cuda


In [None]:
def data_reader(directory):
    """Read the tab-separated corpus and turn it into sub-word ids with aligned tags.

    Each non-blank line must hold ``word<TAB>pos_tag<TAB>ezafe_tag``; a blank
    line ends a sentence. Words are normalised (Arabic yeh/kaf/teh-marbuta are
    mapped to their Persian forms) and encoded with the module-level
    ``tokenizer``. Only the first sub-word piece of a word carries the word's
    tags; the remaining pieces and the begin/end markers are labelled
    ``TAG_PAD_OR_MASK_TOKEN``.

    Args:
        directory: path of the corpus file (despite the name, a file path).

    Returns:
        ``(sents, all_ezafe_tags, all_pos_tags)`` — three parallel lists with
        one entry per sentence; the three inner lists of a sentence have equal
        length. POS tags are integer ids assigned in order of first appearance;
        the string-to-id mapping is local to this function and is not returned.

    Raises:
        AssertionError: if tokens and tags of a sentence end up misaligned.
        ValueError: if a non-blank line does not have exactly three fields or
            its ezafe tag is not an integer.
    """
    sents, all_ezafe_tags, all_pos_tags = [], [], []
    pos_to_id = {}

    def start_sentence():
        # Fresh (tokens, ezafe tags, pos tags) buffers holding only the begin marker.
        return [BEGIN_TOKEN], [TAG_PAD_OR_MASK_TOKEN], [TAG_PAD_OR_MASK_TOKEN]

    def finish_sentence(tokens, ezafes, poses):
        # Append the end marker and store the sentence in the output lists.
        assert len(tokens) == len(poses) == len(ezafes), 'tokens and tags are misaligned'
        sents.append(tokens + [END_TOKEN])
        all_ezafe_tags.append(ezafes + [TAG_PAD_OR_MASK_TOKEN])
        all_pos_tags.append(poses + [TAG_PAD_OR_MASK_TOKEN])

    sent, ezafe_tags, pos_tags = start_sentence()

    # Explicit encoding: the corpus is Persian text and the platform default
    # encoding is not guaranteed to be UTF-8.
    with open(directory, encoding='utf-8') as bijankhan_corpus:
        for line in bijankhan_corpus:
            line = line.strip()

            if not line:
                # Sentence boundary. Repeated blank lines must not emit empty sentences.
                if len(sent) > 1:
                    finish_sentence(sent, ezafe_tags, pos_tags)
                # Reset ALL three buffers; leaving the ezafe buffer untouched would
                # make its tags pile up across sentences.
                sent, ezafe_tags, pos_tags = start_sentence()
                continue

            word, pos_tag, ezafe_tag = line.split('\t')

            word = word.replace('ي', 'ی').replace('ك', 'ک').replace('ة', 'ه')
            word = tokenizer.encode(word, add_special_tokens=False)
            sent.extend(word)

            if pos_tag not in pos_to_id:
                pos_to_id[pos_tag] = len(pos_to_id)

            # First piece gets the real tag, continuation pieces get the mask label.
            continuation = [TAG_PAD_OR_MASK_TOKEN] * (len(word) - 1)
            ezafe_tags.extend([int(ezafe_tag)] + continuation)
            pos_tags.extend([pos_to_id[pos_tag]] + continuation)

    # The file may end without a trailing blank line; keep the last sentence.
    if len(sent) > 1:
        finish_sentence(sent, ezafe_tags, pos_tags)

    return sents, all_ezafe_tags, all_pos_tags


class MySampler(Sampler):
    """Sampler that yields a shuffled index order and can resume mid-epoch.

    The order is drawn from Python's global ``random`` generator, so it is
    repeatable when that generator is seeded identically before construction.

    Args:
        data: any sized collection (only ``len(data)`` is used; it is not modified).
        i: number of batches that were already consumed; the first
            ``i * BATCH_SIZE`` indices of the shuffled order are skipped.
    """

    def __init__(self, data, i=0):
        # Shuffle the indices rather than `data` itself: `random.shuffle` needs
        # item assignment, which a Dataset exposing only __getitem__ lacks.
        order = list(range(len(data)))
        random.shuffle(order)
        self.seq = order[i * BATCH_SIZE:]

    def __iter__(self):
        return iter(self.seq)

    def __len__(self):
        return len(self.seq)


class EzafeDataset(Dataset):
    """Wraps a list of ``(token_ids, ezafe_tags, pos_tags)`` samples as tensors.

    Args:
        data: sequence of samples; each sample is indexable with the token ids
            at position 0, the ezafe tags at 1 and the POS tags at 2.
    """

    def __init__(self, data):
        self.samples = data

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, ezafe_tags, pos_tags)`` for one sample."""
        sample = self.samples[idx]
        x = torch.tensor(sample[0])
        # Every position of an unpadded sentence is a real token. Deriving the
        # mask from the sign of the ids would zero out any token whose id is 0,
        # which is the id used for the sentence-begin marker in this notebook.
        # Padding positions are masked later, when the batch is padded.
        mask = torch.ones_like(x)
        ezafe_tags = torch.tensor(sample[1])
        pos_tags = torch.tensor(sample[2])

        return x, mask, ezafe_tags, pos_tags


def collate_fn(batch):
    """Pad a list of dataset items to a common length and stack them.

    Args:
        batch: list of ``(input_ids, attention_mask, ezafe_tags, pos_tags)``
            tuples of 1-D tensors, as produced by the dataset.

    Returns:
        ``(inputs, (ezafe_tags, pos_tags))`` where ``inputs`` is a dict with the
        keys ``'input_ids'`` and ``'attention_masks'``. All tensors are
        batch-first and padded to the longest item of the batch: ids with
        ``PAD_TOKEN``, masks with 0 and tags with ``TAG_PAD_OR_MASK_TOKEN``.
    """
    xs, masks, ezafe_tags, pos_tags = zip(*batch)

    # Pad ids with the padding id (the default fill value 0 is BEGIN_TOKEN here).
    padded_xs = pad_sequence(xs, batch_first=True, padding_value=PAD_TOKEN)
    # Default fill value 0 marks padded positions as "do not attend".
    padded_masks = pad_sequence(masks, batch_first=True)
    padded_ezafe_tags = pad_sequence(ezafe_tags, batch_first=True,
                                     padding_value=TAG_PAD_OR_MASK_TOKEN)
    padded_pos_tags = pad_sequence(pos_tags, batch_first=True,
                                   padding_value=TAG_PAD_OR_MASK_TOKEN)

    return {'input_ids': padded_xs, 'attention_masks': padded_masks}, \
           (padded_ezafe_tags, padded_pos_tags)

In [None]:
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Load the tokenised corpus from the pickle cache, building the cache on first use.
# NOTE: the cache is never invalidated — delete the file whenever the corpus or
# `data_reader` changes. `pickle.load` executes arbitrary code from the file, so
# only load caches this notebook wrote itself.
try:
    with open(pickled_data_dir, 'rb') as in_file:
        sents, ezafe_tags, pos_tags = pickle.load(in_file)
except FileNotFoundError:
    sents, ezafe_tags, pos_tags = data_reader(train_data_dir)
    with open(pickled_data_dir, 'wb') as out_file:
        pickle.dump([sents, ezafe_tags, pos_tags], out_file)

# Deterministic shuffle of the sentence order (fixed seed => fixed split).
index_shuf = list(range(len(sents)))
random.seed(17)
random.shuffle(index_shuf)

# Keep only sentences of at most 512 sub-word ids, in shuffled order.
examples = [(sents[i], ezafe_tags[i], pos_tags[i])
            for i in index_shuf
            if len(sents[i]) <= 512]

# First 10% -> test, next 10% -> validation, remaining 80% -> train.
data_split_1 = int(len(examples) * .1)
data_split_2 = int(len(examples) * .2)

test_data = examples[:data_split_1]
valid_data = examples[data_split_1:data_split_2]
train_data = examples[data_split_2:]

# Resume bookkeeping: the stored step is folded into (finished epochs, step
# inside the current epoch).
prev_epochs = 0
try:
    with open(last_i_dir) as in_file:
        last_i = int(in_file.read().strip())
    all_steps = math.ceil(len(train_data) / BATCH_SIZE)
    # `all_steps > 0` guards against looping forever on an empty training set.
    while all_steps > 0 and last_i > all_steps:
        last_i -= all_steps
        prev_epochs += 1

except FileNotFoundError:
    last_i = 0

print(last_i)

train_dataset = EzafeDataset(train_data)
train_sampler = MySampler(train_dataset, last_i)

test_dataset = EzafeDataset(test_data)

print(len(train_dataset))
print(len(test_dataset))

In [None]:
class Net(nn.Module):
    """Shared encoder with two token-level heads: ezafe (2 classes) and POS.

    Args:
        bert_model: encoder module called as ``bert_model(input_ids,
            attention_mask=...)`` whose first output is the per-token hidden
            states. Defaults to the module-level ``xlmroberta_model``.
        hidden_size: width of the encoder's hidden states. Defaults to
            ``HIDDEN_SIZE``.
        dropout_rate: dropout applied to the encoder output before both heads.
            NOTE(review): no dropout constant is defined in the notebook's
            configuration; 0.1 is assumed here — confirm the intended value.
        num_pos_tags: number of POS classes.
    """

    def __init__(self, bert_model=None, hidden_size=None, dropout_rate=0.1, num_pos_tags=15):
        super(Net, self).__init__()

        # Fall back to the notebook-level objects so that `Net()` keeps working.
        if bert_model is None:
            bert_model = xlmroberta_model
        if hidden_size is None:
            hidden_size = HIDDEN_SIZE

        self.bert_model = bert_model
        self.dropout = nn.Dropout(dropout_rate)
        self.ezafe_classifier = nn.Linear(hidden_size, 2)
        self.pos_classifier = nn.Linear(hidden_size, num_pos_tags)

    def forward(self, x, mask):
        """Return ``(ezafe_logits, pos_logits)`` for token ids ``x`` and attention mask ``mask``.

        Each result has the shape of the encoder output with the last dimension
        replaced by the number of classes of the respective head.
        """
        # Index instead of tuple-unpacking: the first element is the sequence
        # output whether the encoder returns a plain tuple or an output object.
        sequence_output = self.bert_model(x, attention_mask=mask)[0]
        sequence_output = self.dropout(sequence_output)
        ezafe_pred = self.ezafe_classifier(sequence_output)
        pos_pred = self.pos_classifier(sequence_output)
        return ezafe_pred, pos_pred

# Resume from the checkpoint written at step `last_i` when it exists; otherwise
# start from the pretrained encoder. Only a missing file means "start fresh":
# any other failure (corrupt checkpoint, incompatible library version, user
# interrupt) must surface instead of silently discarding the trained weights.
# NOTE: the checkpoint is a pickled module, so loading it runs arbitrary code —
# only load files this notebook wrote itself.
try:
    model = torch.load(model_dir + str(last_i), map_location=device)
except FileNotFoundError:
    xlmroberta_model = XLMRobertaModel.from_pretrained('xlm-roberta-base')
    model = Net().to(device)

In [None]:
def evaluate_tagger(model, data_loader):
    """Score `model` on `data_loader` and return ``(ezafe_report, pos_report)``.

    Both reports are `classification_report` strings computed only over the
    positions whose true ezafe label is not ``TAG_PAD_OR_MASK_TOKEN`` (i.e. the
    same positions are kept for both tasks). The model is left in eval mode.
    """
    model.eval()
    ezafe_true_labels, ezafe_pred_labels = [], []
    pos_true_labels, pos_pred_labels = [], []

    with torch.no_grad():
        for inputs, (ezafe_true, pos_true) in data_loader:
            ezafe_pred, pos_pred = model(inputs['input_ids'].to(device),
                                         inputs['attention_masks'].to(device))

            ezafe_true_labels.extend(ezafe_true.reshape(-1).tolist())
            ezafe_pred_labels.extend(torch.argmax(ezafe_pred, -1).reshape(-1).cpu().tolist())

            pos_true_labels.extend(pos_true.reshape(-1).tolist())
            pos_pred_labels.extend(torch.argmax(pos_pred, -1).reshape(-1).cpu().tolist())

    ezafe_true_arr, ezafe_pred_arr = np.array(ezafe_true_labels), np.array(ezafe_pred_labels)
    pos_true_arr, pos_pred_arr = np.array(pos_true_labels), np.array(pos_pred_labels)

    # Drop padding, special tokens and non-initial sub-word pieces.
    keep = ezafe_true_arr != TAG_PAD_OR_MASK_TOKEN

    ezafe_report = classification_report(ezafe_true_arr[keep], ezafe_pred_arr[keep], digits=4)
    pos_report = classification_report(pos_true_arr[keep], pos_pred_arr[keep], digits=4)
    return ezafe_report, pos_report


train_data_loader = DataLoader(dataset=train_dataset,
                               batch_size=BATCH_SIZE,
                               collate_fn=collate_fn,
                               sampler=train_sampler,
                               shuffle=False,
                               num_workers=4)

test_data_loader = DataLoader(dataset=test_dataset,
                              batch_size=BATCH_SIZE,
                              collate_fn=collate_fn,
                              shuffle=False,
                              num_workers=4)

# Positions labelled TAG_PAD_OR_MASK_TOKEN contribute nothing to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=TAG_PAD_OR_MASK_TOKEN).to(device)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

model.to(device)

# Make sure the checkpoint/summary directory exists before the first save.
os.makedirs(os.path.dirname(model_dir), exist_ok=True)

for epoch in range(NUM_EPOCHS):
    epoch += prev_epochs
    for i, (x, (ezafe_true, pos_true)) in enumerate(train_data_loader):
        # Evaluation below switches to eval mode, so re-enable training every step.
        model.train()
        # Step number within the epoch, offset by the resume position.
        # NOTE(review): the offset is applied in every epoch, and the step
        # restarts each epoch, so checkpoints of later epochs reuse file names.
        i += last_i

        optimizer.zero_grad()

        ezafe_pred, pos_pred = model(x['input_ids'].to(device), x['attention_masks'].to(device))

        # Move the (small) label tensors to the device instead of the logits to the CPU.
        ezafe_loss = criterion(ezafe_pred.view(-1, ezafe_pred.shape[-1]),
                               ezafe_true.view(-1).to(device))
        pos_loss = criterion(pos_pred.view(-1, pos_pred.shape[-1]),
                             pos_true.view(-1).to(device))

        # POS loss is weighted twice as much as the ezafe loss.
        loss = 1.0 * ezafe_loss + 2.0 * pos_loss

        loss.backward()
        optimizer.step()

        # Evaluate and checkpoint every 2500 steps; in the first epoch only from step 10,000 on.
        if i % 2500 == 0 and (epoch != 0 or i >= 10_000):
            print('testing...')
            ezafe_f1, pos_f1 = evaluate_tagger(model, test_data_loader)

            summary = f'epoch: {epoch + 1} | step: {i}:\nEzafe_F1:\n{ezafe_f1}\n\nPOS_F1:\n{pos_f1}\n\n'
            print(summary)

            # save everything
            with open(model_dir + 'summary.txt', 'a+') as out_file:
                out_file.write(summary + '\n')

            # Same file that is read when resuming.
            with open(last_i_dir, 'w') as out_file:
                out_file.write(str(i))

            # The checkpoint is a single file; creating a directory at this path
            # first would make the save fail.
            torch.save(model, model_dir + str(i))

In [None]:
# testing
# Score one saved checkpoint on the held-out test split, for both tasks.
test_dataset = EzafeDataset(test_data)

test_data_loader = DataLoader(dataset=test_dataset,
                              batch_size=BATCH_SIZE,
                              collate_fn=collate_fn,
                              shuffle=False,
                              num_workers=4)

# Step whose checkpoint is evaluated.
checkpoint_step = 17500
# Checkpoints are whole pickled modules written with `torch.save(model, ...)`,
# so they are restored with `torch.load`, not `from_pretrained`.
# NOTE(review): recent PyTorch releases may refuse to unpickle full modules by
# default — check the `torch.load` options of the installed version.
model = torch.load(model_dir + str(checkpoint_step), map_location=device)

model.to(device)
model.eval()  # disable dropout while scoring

ezafe_true_labels, ezafe_pred_labels = [], []
pos_true_labels, pos_pred_labels = [], []

with torch.no_grad():
    for x, (ezafe_true, pos_true) in test_data_loader:
        ezafe_pred, pos_pred = model(x['input_ids'].to(device), x['attention_masks'].to(device))

        ezafe_true_labels.extend(ezafe_true.reshape(-1).tolist())
        ezafe_pred_labels.extend(torch.argmax(ezafe_pred, -1).reshape(-1).cpu().tolist())

        pos_true_labels.extend(pos_true.reshape(-1).tolist())
        pos_pred_labels.extend(torch.argmax(pos_pred, -1).reshape(-1).cpu().tolist())

ezafe_true_labels_, ezafe_pred_labels_ = np.array(ezafe_true_labels), np.array(ezafe_pred_labels)
pos_true_labels_, pos_pred_labels_ = np.array(pos_true_labels), np.array(pos_pred_labels)

# Keep only positions that carry a real label (no padding, special tokens or
# non-initial sub-word pieces).
test_keep = ezafe_true_labels_ != TAG_PAD_OR_MASK_TOKEN

ezafe_f1 = classification_report(ezafe_true_labels_[test_keep], ezafe_pred_labels_[test_keep], digits=4)
pos_f1 = classification_report(pos_true_labels_[test_keep], pos_pred_labels_[test_keep], digits=4)

summary = f'step: {checkpoint_step}:\nEzafe_F1:\n{ezafe_f1}\n\nPOS_F1:\n{pos_f1}\n\n'
print(summary)