<a href="https://colab.research.google.com/github/ninja197/BAexperiments/blob/main/baseline_NER_MiniLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets tokenizers seqeval sentencepiece
!pip install git+https://github.com/huggingface/transformers.git

Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 5.2 MB/s 
[?25hCollecting tokenizers
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 44.8 MB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 2.0 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 35.7 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 40.6 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |███████████████

In [2]:
import tensorflow as tf
from tokenizers import SentencePieceUnigramTokenizer
from transformers import AutoTokenizer, XLMRobertaTokenizer
from transformers import AutoModelForTokenClassification, AutoModelForMaskedLM
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification, DataCollatorForLanguageModeling
import logging
import sentencepiece

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
language = 'arz'

# load standard tokenizer
xlmr_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

# Fine-Tuning

In [6]:
import torch 

class NERDataset(torch.utils.data.Dataset):
    def __init__(self, file, lang, max_len, tokenizer, assignment):

        self.tokenizer = tokenizer

        self.max_len = max_len
        self.assignment = assignment
        self.lang = lang

        self.create_label2id()

        self.examples = self.read_file(file)

        print(self.examples[0])
        print('----------------------------------------')

    def __getitem__(self, idx):
        return self.encode(idx)

    def __len__(self):
        return len(self.examples)

    def create_label2id(self):

        ner_tags = [
            'B-ORG',
            'I-ORG',
            'B-PER',
            'I-PER',
            'B-MISC',
            'I-MISC',
            'B-LOC',
            'I-LOC',
            'O'
        ]

        iter = 0
        self.label2id = {}
        for tag in ner_tags:
            self.label2id[tag] = iter
            iter += 1

    def read_file(self, file, convert_labels=True):

        inps = []

        with open(file, 'r') as f:
            temp_tokens = []
            temp_labels = []
            for line in f:
                if line.strip():

                    token = line.strip().split('\t')
                    assert len(token) == 2

                    if convert_labels:
                        temp_tokens.append(token[0].replace(self.lang + ':', ''))
                        temp_labels.append(self.label2id[token[1]])

                    else:
                        temp_tokens.append(token[0].replace(self.lang + ':', ''))
                        temp_labels.append(token[1])

                else:
                    inps.append((temp_tokens,temp_labels))
                    temp_tokens = []
                    temp_labels = []
        return inps

    def encode(self, id):
        instance = self.examples[id]


        forms = instance[0]
        labels = instance[1]

        expanded_labels = []
        label_mask = []

        for i in range(0, len(forms)):

            subwords = self.tokenizer.tokenize(forms[i])

            if self.assignment == 'first':
                expanded_labels.append(labels[i])
                for j in range(1, len(subwords)):
                    expanded_labels.append(-100)
            elif self.assignment == 'all':
                for j in range(0,len(subwords)):
                    expanded_labels.append(labels[i])
                    if j < len(subwords) - 1:
                        label_mask.append(0)
                    else:
                        label_mask.append(1)

            elif self.assignment == 'last':
                for j in range(0,len(subwords)-1):
                    expanded_labels.append(-100)
                expanded_labels.append(labels[i])


        s1 = ' '.join(forms)

        enc = self.tokenizer(
            s1,
            max_length=self.max_len,
            truncation=True,
            return_token_type_ids=True,
        )



        if len(expanded_labels) > self.max_len:
            expanded_labels = expanded_labels[:self.max_len]

        enc['labels'] = expanded_labels

        return enc


if __name__ == '__main__':

    # x = NERDataset(
    #     file='data/ner/rahimi_output/eng/train',
    #     max_len=256,
    #     tokenizer=None,
    #     assignment='last'
    # )

    inps = []
    labels_found = []
    lang='en'
    with open('/content/drive/MyDrive/en/train') as f:
        temp_tokens = []
        for line in f:
            if line.strip():
                token = line.strip().split('\t')
                assert len(token) == 2
                temp_tokens.append(
                    (token[0].replace(lang + ':', ''), token[1])
                )
            else:
                inps.append(temp_tokens)
                temp_tokens = []

    print(inps[5])
    print(len(inps))

[('St.', 'B-ORG'), ('Mary', 'I-ORG'), ("'s", 'I-ORG'), ('Catholic', 'I-ORG'), ('Church', 'I-ORG'), ('(', 'I-ORG'), ('Sandusky', 'I-ORG'), (',', 'I-ORG'), ('Ohio', 'I-ORG'), (')', 'I-ORG')]
20000


In [7]:
# filefinder
def biblelang2nerlang(bible_lang):
    language_mapping = '/content/drive/MyDrive/NER/bible_ner_xlmr_split.txt'
    with open(language_mapping, 'r') as f:
        for line in f:
            data = line.strip().split(',')
            if data[1] == bible_lang:
                return data[2]

def lang_to_ner(lang, split):

    ner_dir = '/content/drive/MyDrive/{lang}/{split}'.format(lang=biblelang2nerlang(lang), split=split)

    return ner_dir

In [8]:
# define training (english) and evaluation dataset
ner_train_dataset = NERDataset(file=lang_to_ner('eng', 'train'),
                                       lang='en', max_len=256, tokenizer=xlmr_tokenizer,
                                       assignment='last')


ner_eval_dataset = NERDataset(file=lang_to_ner(language, 'dev'),
                          lang=biblelang2nerlang(language), max_len=256, tokenizer=xlmr_tokenizer,
                          assignment='last')

(['R.H.', 'Saunders', '(', 'St.', 'Lawrence', 'River', ')', '(', '968', 'MW', ')'], [0, 1, 8, 0, 1, 1, 8, 8, 8, 8, 8])
----------------------------------------
(['انتاجه', 'فى', 'امريكا', 'كندا', 'وبيتسعر', 'غالبن', 'فى', 'الصين', '.'], [8, 8, 6, 6, 8, 8, 8, 6, 8])
----------------------------------------


In [9]:
ner_test_dataset = NERDataset(file=lang_to_ner(language, 'test'), lang=biblelang2nerlang(language), max_len=256, tokenizer=xlmr_tokenizer, assignment='last')


ner_warmup_steps = int((5 * (ner_train_dataset.__len__() // (32 * 4 * 1))) * .01)

!mkdir finetuned_ner_model
ner_training_args = TrainingArguments(
    output_dir= 'finetuned_ner_model',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    logging_steps=25,
    save_total_limit=3,
    save_steps=3000,
    evaluation_strategy='epoch',
    eval_steps=50,
    learning_rate=2e-5,
    warmup_steps=ner_warmup_steps,
    disable_tqdm=False,
    gradient_accumulation_steps=4)

(['تحويل', 'احمد', 'بن', 'طولون'], [8, 2, 3, 3])
----------------------------------------


In [10]:
from transformers import AutoModelForTokenClassification
from transformers import AutoModel

ner_model = AutoModelForTokenClassification.from_pretrained('microsoft/Multilingual-MiniLM-L12-H384',num_labels=len(ner_train_dataset.label2id))

ner_model.resize_token_embeddings(len(xlmr_tokenizer))

Downloading:   0%|          | 0.00/430 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/449M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at microsoft/Multilingual-MiniLM-L12-H384 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(250002, 384)

In [11]:
from sklearn.metrics import f1_score, accuracy_score
from seqeval.metrics import f1_score as seqeval_f1
from seqeval.metrics import accuracy_score as seqeval_accuracy


def create_id2label_ner():

    ner_tags = [
        'B-ORG',
        'I-ORG',
        'B-PER',
        'I-PER',
        'B-MISC',
        'I-MISC',
        'B-LOC',
        'I-LOC',
        'O'
    ]

    iter = 0
    id2label = {}
    for tag in ner_tags:
        id2label[iter] = tag
        iter += 1

    return id2label



def ner_metrics(eval_pred):

    labels = eval_pred.label_ids
    preds = eval_pred.predictions.argmax(-1)

    corrected_preds = []
    corrected_labels = []

    id2label = create_id2label_ner()

    for i in range(0, len(labels)):
        temp_pred = []
        temp_label = []
        for j in range(0, len(labels[i])):
            if labels[i][j] != -100:
                temp_label.append(id2label[labels[i][j]])
                temp_pred.append(id2label[preds[i][j]])

        corrected_labels.append(temp_label)
        corrected_preds.append(temp_pred)

    acc = seqeval_accuracy(corrected_labels, corrected_preds)
    f1 = seqeval_f1(corrected_labels, corrected_preds)

    f1 = f1 * 100
    acc = acc * 100

    logging.info('F1 during training: {}'.format(f1))
    logging.info('Accuracy during training: {}'.format(acc))
    logging.info('---------------------------------------------')

    return {
        'accuracy': acc,
        'f1': f1
    }

In [12]:
ner_collator = DataCollatorForTokenClassification(
    tokenizer=xlmr_tokenizer,
    padding='longest'
)

trainer = Trainer(
    model=ner_model,
    data_collator=ner_collator,
    args=ner_training_args,
    train_dataset=ner_train_dataset,
    eval_dataset=ner_eval_dataset,
    compute_metrics=ner_metrics,
)

trainer.train()

results = trainer.predict(ner_test_dataset)
results = results.metrics

***** Running training *****
  Num examples = 20000
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 3125


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7145,1.007837,73.061761,39.862543
2,0.5213,0.948934,73.455979,41.958042
3,0.4401,0.805651,77.529566,47.619048
4,0.3616,0.878474,74.244415,44.366197
5,0.3604,0.84919,75.295664,45.487365


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
Saving model checkpoint to finetuned_ner_model/checkpoint-3000
Configuration saved in finetuned_ner_model/checkpoint-3000/config.json
Model weights saved in finetuned_ner_model/checkpoint-3000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 100
  Batch size = 8


In [13]:
results

{'test_accuracy': 75.61307901907357,
 'test_f1': 50.0,
 'test_loss': 0.8321982026100159,
 'test_runtime': 0.4325,
 'test_samples_per_second': 231.19,
 'test_steps_per_second': 30.055}