# Sheet 1

In [None]:
!pip install pyconll

In [None]:
from transformers import BertModel, BertConfig,BertTokenizer
import nltk
import torch

import numpy as np
import pyconll
import urllib.request
import transformers
import torch.nn
from tqdm.notebook import tqdm
import torch.optim as optim
from torchtext import data
import random
from torchtext.legacy import data
import torch.nn as nn
import os

In [None]:
# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch) from a single constant.
SEED = 1234

torch.backends.cudnn.deterministic = True  # cuDNN determinism flag
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Load Dataset

In [None]:
class NERDataset(torch.utils.data.Dataset):
    """Map-style dataset that turns (word, tag) sentences into word-piece ids.

    The base class is spelled out as ``torch.utils.data.Dataset`` so the class
    does not depend on whichever module the name ``data`` happens to be bound
    to when this cell runs (the notebook binds it more than once).

    Every sentence is wrapped in "[CLS]" / "[SEP]", both tagged "<pad>".
    Tokenization happens lazily in ``__getitem__`` and uses the notebook-level
    ``tokenizer`` and ``tag2idx``; both must exist before the first item is
    requested.

    Args:
        tagged_sents: iterable of sentences; each sentence is a sequence of
            items holding the word at index 0 and its tag at index 1.
    """

    def __init__(self, tagged_sents):
        sents, tags_li = [], []  # parallel lists of lists
        for sent in tagged_sents:
            words = [word_tag[0] for word_tag in sent]
            tags = [word_tag[1] for word_tag in sent]
            sents.append(["[CLS]"] + words + ["[SEP]"])
            tags_li.append(["<pad>"] + tags + ["<pad>"])
        self.sents, self.tags_li = sents, tags_li

    def __len__(self):
        """Number of sentences."""
        return len(self.sents)

    def __getitem__(self, idx):
        """Encode sentence ``idx``.

        Returns:
            (words, x, is_heads, tags, y, seqlen) where ``words`` / ``tags``
            are the space-joined original strings, ``x`` the word-piece ids,
            ``is_heads`` a 0/1 list marking the first piece of each word,
            ``y`` the tag ids (non-head pieces get "<pad>") and ``seqlen``
            the number of word pieces.
        """
        words, tags = self.sents[idx], self.tags_li[idx]  # string lists

        x, y = [], []   # word-piece ids / tag ids
        is_heads = []   # 1: the piece is the first piece of a word
        for w, t in zip(words, tags):
            tokens = tokenizer.tokenize(w) if w not in ("[CLS]", "[SEP]") else [w]
            if not tokens:
                # A word can tokenize to nothing (e.g. only characters the
                # tokenizer drops). Keep one placeholder piece so that the
                # word still owns exactly one head position and one label.
                tokens = [tokenizer.unk_token]
            xx = tokenizer.convert_tokens_to_ids(tokens)

            # Only the first piece of a word carries the word's tag.
            is_head = [1] + [0] * (len(tokens) - 1)
            piece_tags = [t] + ["<pad>"] * (len(tokens) - 1)  # <pad>: no decision
            yy = [tag2idx[each] for each in piece_tags]

            x.extend(xx)
            is_heads.extend(is_head)
            y.extend(yy)

        assert len(x)==len(y)==len(is_heads), "len(x)={}, len(y)={}, len(is_heads)={}".format(len(x), len(y), len(is_heads))

        seqlen = len(y)

        # Hand the original sentence back as plain strings.
        words = " ".join(words)
        tags = " ".join(tags)
        return words, x, is_heads, tags, y, seqlen


In [None]:
def read_examples_from_file(data_dir, mode):
    """Read ``<data_dir>/<mode>.txt`` into a list of tagged sentences.

    The file holds one token per line, written as space-separated fields:
    the first field is the word and the last field is its label. A line with
    a single field gets the label "O" (test files may be unlabelled).

    Args:
        data_dir: directory containing the split files.
        mode: split name, used as the file stem (e.g. "train").

    Returns:
        A list of sentences, each a list of ``(word, label)`` tuples.
    """
    file_path = os.path.join(data_dir, "{}.txt".format(mode))
    examples = []
    sent = []
    with open(file_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # NOTE(review): any stripped line shorter than two characters is
            # treated as a sentence separator, so a one-character unlabelled
            # token line would also end the sentence and be dropped.
            if len(line) < 2:
                if sent:
                    examples.append(sent)
                    sent = []
                continue
            splits = line.split(" ")
            label = splits[-1] if len(splits) > 1 else "O"
            sent.append((splits[0], label))
    # Flush the last sentence when the file does not end with a separator.
    if sent:
        examples.append(sent)
    return examples

In [None]:
NER_Wolof_path ='../input/nerafrican/wol'

In [None]:
# Pool the three published splits (train, test, dev - in that order) into one
# list of sentences; the notebook makes its own train/test split further down.
Wolof_data = [
    sentence
    for split in ("train", "test", "dev")
    for sentence in read_examples_from_file(NER_Wolof_path, mode=split)
]

In [None]:
len(Wolof_data)

In [None]:
Wolof_data = Wolof_data[:500]

In [None]:
ner_labels = ["O", "B-DATE", "I-DATE", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

In [None]:
ner_labels_en = ['O', 'geo-nam', 'org-nam', 'per-nam', 'gpe-nam', 'tim-dat', 'tim-dow', 'per-tit', 'per-fam', 'tim-yoc', 'tim-moy', 'per-giv', 'tim-clo', 'art-nam', 'eve-nam', 'nat-nam', 'tim-nam', 'eve-ord', 'per-ini', 'org-leg', 'per-ord', 'tim-dom', 'per-mid', 'art-add']

In [None]:
# "<pad>" must be the first label so that it maps to index 0. Filtering it out
# before prepending keeps this cell safe to re-run: a second "<pad>" would
# shift every tag id and move tag2idx["<pad>"] away from 0.
ner_labels = ["<pad>"] + [tag for tag in ner_labels if tag != "<pad>"]

In [None]:
# Lookup tables between tag strings and their positions in ner_labels.
idx2tag = dict(enumerate(ner_labels))
tag2idx = {tag: idx for idx, tag in idx2tag.items()}

In [None]:
from sklearn.model_selection import train_test_split

# Pin the shuffle to SEED so that re-running this cell always yields the same
# train/test partition; otherwise a re-run reshuffles and sentences can move
# between the training and the evaluation set from one experiment to the next.
wo_train_data, wo_test_data = train_test_split(Wolof_data, test_size=.25, random_state=SEED)

In [None]:
len(wo_train_data), len(wo_test_data)

In [None]:
def pad(batch):
    '''Collate dataset items into a batch, right-padding ids with 0.

    Each item is the 6-tuple (words, x, is_heads, tags, y, seqlen). ``x`` and
    ``y`` are padded with 0 (the <pad> id) up to the largest ``seqlen`` in the
    batch and returned as LongTensors; the other fields are returned as plain
    per-sample lists, unpadded.
    '''
    def column(pos):
        # Field ``pos`` of every sample, in batch order.
        return [sample[pos] for sample in batch]

    seqlens = column(-1)
    longest = max(seqlens)

    def padded(pos):
        # Field ``pos`` of every sample, extended with zeros to ``longest``.
        return [sample[pos] + [0] * (longest - len(sample[pos])) for sample in batch]

    return (
        column(0),
        torch.LongTensor(padded(1)),
        column(2),
        column(3),
        torch.LongTensor(padded(-2)),
        seqlens,
    )

In [None]:
from torch.utils import data

# NOTE: the import above rebinds the name `data`, which the import cell at the
# top bound to torchtext; from here on `data` means torch.utils.data.
Wolof_train, Wolof_test = NERDataset(wo_train_data), NERDataset(wo_test_data)

# batch_size=1: every batch holds a single sentence. Only the training loader
# shuffles.
wo_train_iter = data.DataLoader(Wolof_train, batch_size=1, shuffle=True, num_workers=1, collate_fn=pad)
wo_test_iter = data.DataLoader(Wolof_test, batch_size=1, shuffle=False, num_workers=1, collate_fn=pad)


In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

In [None]:
class Net(nn.Module):
    """BERT encoder followed by a linear per-token tag classifier.

    Args:
        model_type: checkpoint name or path given to
            ``BertModel.from_pretrained``.
        vocab_size: number of output classes, i.e. the size of the tag set.
            Required, even though it defaults to None for signature
            compatibility.

    Raises:
        ValueError: if ``vocab_size`` is not provided.
    """

    def __init__(self, model_type='bert-base-multilingual-cased',vocab_size=None):
        super().__init__()
        if vocab_size is None:
            raise ValueError("vocab_size (number of tags) must be provided")
        self.bert = BertModel.from_pretrained(model_type,return_dict=False)
        # Size the classifier from the loaded encoder instead of assuming 768,
        # so checkpoints with another hidden width also work.
        self.fc = nn.Linear(self.bert.config.hidden_size, vocab_size)
        self.device = device

    def _encode(self, x):
        """Run the encoder on ``x`` and return the features fed to ``fc``."""
        encoded_layers, _ = self.bert(x)
        # NOTE(review): with return_dict=False the Hugging Face BertModel is
        # documented to return (sequence_output, pooled_output). If so, `[-1]`
        # selects the last *sample* of the batch rather than the last layer,
        # which only lines up with the labels while a batch holds a single
        # sentence (the loaders in this notebook use batch_size=1). Confirm
        # before raising the batch size.
        return encoded_layers[-1]

    def forward(self, x, y):
        '''
        x: (N, T). int64
        y: (N, T). int64

        Returns (logits, y, y_hat): the classifier scores, ``y`` moved to the
        compute device, and the argmax of the scores over the tag dimension.
        '''
        x = x.to(device)
        y = y.to(device)

        # Keep the encoder's mode in step with this module's mode.
        self.bert.train(self.training)
        if self.training:
            enc = self._encode(x)
        else:
            # No gradients through the encoder outside of training.
            with torch.no_grad():
                enc = self._encode(x)

        logits = self.fc(enc)
        y_hat = logits.argmax(-1)
        return logits, y, y_hat

In [None]:
def train(model, iterator, optimizer=None, pad_idx=0):
    """Run one pass over ``iterator`` and return the sampled training losses.

    Args:
        model: callable as ``model(x, y)`` returning ``(logits, y, y_hat)``.
        iterator: yields batches ``(words, x, is_heads, tags, y, seqlens)``.
        optimizer: optimizer to step. When omitted, a fresh Adam (lr=1e-4) is
            created for this call, so its running statistics start from
            scratch each time ``train`` is called; pass one in to keep them
            across epochs.
        pad_idx: label id excluded from the loss. 0 is the "<pad>" id used for
            [CLS]/[SEP], non-head word pieces and padding, i.e. positions that
            carry no tagging decision.

    Returns:
        dict mapping every 10th step index to the loss value at that step.
    """
    model.train()
    # Positions labelled <pad> must not contribute to the loss; without
    # ignore_index the model is also trained to predict "<pad>" for them.
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
    if optimizer is None:
        optimizer = optim.Adam(model.parameters(), lr = 0.0001)
    loss_list = {}
    for i, batch in enumerate(iterator):
        _, x, _, _, y, _ = batch
        optimizer.zero_grad()
        logits, y, _ = model(x, y)
        # Flatten to one row of scores and one target per token.
        logits = logits.view(-1, logits.shape[-1])
        y = y.view(-1)

        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()

        if i % 10 == 0:  # monitoring
            loss_list[i] = loss.item()
    return loss_list

In [None]:
def eval(model, iterator):
    """Evaluate ``model`` on ``iterator`` and return word-level tag accuracy.

    Note: this function shadows the built-in ``eval`` for the rest of the
    notebook; the name is kept because later cells call it.

    Only head word pieces are scored, and the leading [CLS] / trailing [SEP]
    positions are dropped. One "word gold predicted" line per scored word is
    written to the file "result" in the working directory, with a blank line
    after every sentence. The accuracy is printed and returned.

    Args:
        model: callable as ``model(x, y)`` returning ``(logits, y, y_hat)``.
        iterator: yields batches ``(words, x, is_heads, tags, y, seqlens)``.
    """
    model.eval()

    Words, Is_heads, Tags, Y_hat = [], [], [], []
    with torch.no_grad():
        for batch in iterator:
            words, x, is_heads, tags, y, seqlens = batch

            _, _, y_hat = model(x, y)

            Words.extend(words)
            Is_heads.extend(is_heads)
            Tags.extend(tags)
            # Store one prediction row per sentence. Reshaping to y's (N, T)
            # shape accepts both a flat (T,) prediction for a single-sentence
            # batch and an (N, T) prediction, and fails loudly otherwise.
            Y_hat.extend(y_hat.cpu().reshape(y.shape).tolist())

    # Write the result file and collect the scored ids in the same pass, so
    # the file does not have to be reopened and parsed again.
    y_true, y_pred = [], []
    with open("result", 'w') as fout:
        for words, is_heads, tags, y_hat in zip(Words, Is_heads, Tags, Y_hat):
            y_hat = [hat for head, hat in zip(is_heads, y_hat) if head == 1]
            preds = [idx2tag[hat] for hat in y_hat]
            for w, t, p in zip(words.split()[1:-1], tags.split()[1:-1], preds[1:-1]):
                fout.write("{} {} {}\n".format(w, t, p))
                y_true.append(tag2idx[t])
                y_pred.append(tag2idx[p])
            fout.write("\n")

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    acc = (y_true==y_pred).astype(np.int32).sum() / len(y_true)
    print(acc)
    return acc

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Baseline run, reported as "original" / "Original" in the plot and CSV cells:
# the plain multilingual checkpoint. It previously loaded the Yoruba
# checkpoint, which made this run a duplicate of model_yowo.
model_wo = Net(vocab_size=len(tag2idx), model_type='bert-base-multilingual-cased')
model_wo.to(device)
model_wo = nn.DataParallel(model_wo)

In [None]:
Epoch=10

In [None]:
# Baseline model: one training pass plus one evaluation per epoch.
wo_acc, wo_train = [], []
for i in range(Epoch):
    print("epoch " + str(i+1))
    wo_train.append(train(model_wo, wo_train_iter))
    wo_acc.append(eval(model_wo, wo_test_iter))
print(wo_acc)

In [None]:
# Yoruba checkpoint, moved to the compute device and wrapped in DataParallel.
model_yowo = nn.DataParallel(
    Net(
        model_type='Davlan/bert-base-multilingual-cased-finetuned-yoruba',
        vocab_size=len(tag2idx),
    ).to(device)
)

In [None]:
# Yoruba model: one training pass plus one evaluation per epoch.
yowo_acc, yowo_train = [], []
for i in range(Epoch):
    print("epoch " + str(i+1))
    yowo_train.append(train(model_yowo, wo_train_iter))
    yowo_acc.append(eval(model_yowo, wo_test_iter))

print(yowo_acc)

In [None]:
# Swahili checkpoint, moved to the compute device and wrapped in DataParallel.
model_swwo = nn.DataParallel(
    Net(
        model_type='Davlan/bert-base-multilingual-cased-finetuned-swahili',
        vocab_size=len(tag2idx),
    ).to(device)
)

In [None]:
# Swahili model: one training pass plus one evaluation per epoch.
swwo_acc, swwo_train = [], []
for i in range(Epoch):
    print("epoch " + str(i+1))
    swwo_train.append(train(model_swwo, wo_train_iter))
    swwo_acc.append(eval(model_swwo, wo_test_iter))
print(swwo_acc)

In [None]:
# Amharic checkpoint, moved to the compute device and wrapped in DataParallel.
model_amwo = nn.DataParallel(
    Net(
        model_type='Davlan/bert-base-multilingual-cased-finetuned-amharic',
        vocab_size=len(tag2idx),
    ).to(device)
)

In [None]:
# Amharic model: one training pass plus one evaluation per epoch.
amwo_acc, amwo_train = [], []
for i in range(Epoch):
    print("epoch " + str(i+1))
    amwo_train.append(train(model_amwo, wo_train_iter))
    amwo_acc.append(eval(model_amwo, wo_test_iter))
print(amwo_acc)

In [None]:
# Per-epoch scores of the four runs, one list per line.
print(wo_acc, swwo_acc, yowo_acc, amwo_acc, sep="\n")

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# One curve per run. The plotted values are what eval() returns, which is the
# share of correctly tagged words (accuracy), so the axis is labelled that way
# rather than "F1".
fig, ax = plt.subplots()
for history, name in ((wo_acc, "original"), (swwo_acc, "sw"), (yowo_acc, "yo"), (amwo_acc, "am")):
    ax.plot(history, label=name)
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend(loc ="lower right")

plt.show()

In [None]:
# Average over the epochs actually recorded instead of a hard-coded 10.
mean_yo = sum(yowo_acc) / len(yowo_acc)
mean_yo

In [None]:
# Average over the epochs actually recorded instead of a hard-coded 10.
mean_og = sum(wo_acc) / len(wo_acc)
mean_og

In [None]:
# Average over the epochs actually recorded instead of a hard-coded 10.
mean_sw = sum(swwo_acc) / len(swwo_acc)
mean_sw

In [None]:
# Average over the epochs actually recorded instead of a hard-coded 10.
mean_am = sum(amwo_acc) / len(amwo_acc)
mean_am

In [None]:
import pandas as pd

# Per-epoch scores, one column per run, saved with the epoch index.
# NOTE(review): the file is named 'F1.csv', but the values are the accuracies
# returned by eval(); the name is kept so existing consumers still find it.
df = pd.DataFrame(
    dict(Original=wo_acc, Swahili=swwo_acc, Amharic=amwo_acc, Yoruba=yowo_acc)
)
df.to_csv('F1.csv',index=True)

In [None]:
# One averaged score per run, indexed by run name.
# NOTE(review): the column is called "Average F1", but the averaged values are
# the accuracies returned by eval(); the header is kept unchanged.
df2 = pd.DataFrame(
    {"Average F1": [mean_og, mean_sw, mean_am, mean_yo]},
    index=['Original', 'Swahili', 'Amharic', 'Yoruba'],
)
df2.to_csv('Mean.csv',index=True)