<a href="https://colab.research.google.com/github/ninja197/BAexperiments/blob/main/MiniLM_extend_ner_diffFinetuneLang.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets tokenizers seqeval sentencepiece
!pip install git+https://github.com/huggingface/transformers.git

Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 5.3 MB/s 
[?25hCollecting tokenizers
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 37.6 MB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 2.0 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 26.2 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 43.4 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-many

In [2]:
import tensorflow as tf
from tokenizers import SentencePieceUnigramTokenizer
from transformers import AutoTokenizer, XLMRobertaTokenizer
from transformers import AutoModelForTokenClassification, AutoModelForMaskedLM
import copy
import os
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification, DataCollatorForLanguageModeling
import sentencepiece

In [11]:
from google.colab import drive
drive.mount('/content/drive')

import sys
sys.path.insert(0, '/content/drive/MyDrive/sp_model')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
import sentencepiece_model_pb2 as sp_model

In [13]:
language = 'arz'
finetune_lang = 'ar'

# Load extended tokenizer

In [15]:
#Load extended SPM as tokenizer
xlmr_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
xlmr_tokenizer.vocab_file = '/content/drive/MyDrive/{lang}_model/extended_spm.model'.format(lang=language)     
xlmr_tokenizer.sp_model.load(xlmr_tokenizer.vocab_file)

#Re-align mask token
xlmr_tokenizer.fairseq_tokens_to_ids['<mask>'] = xlmr_tokenizer._convert_token_to_id('DUMMY_MASK')

# Finetuning

In [17]:
import torch 

class NERDataset(torch.utils.data.Dataset):
    def __init__(self, file, lang, max_len, tokenizer, assignment):

        self.tokenizer = tokenizer

        self.max_len = max_len
        self.assignment = assignment
        self.lang = lang

        self.create_label2id()

        self.examples = self.read_file(file)

        print(self.examples[0])
        print('----------------------------------------')

    def __getitem__(self, idx):
        return self.encode(idx)

    def __len__(self):
        return len(self.examples)

    def create_label2id(self):

        ner_tags = [
            'B-ORG',
            'I-ORG',
            'B-PER',
            'I-PER',
            'B-MISC',
            'I-MISC',
            'B-LOC',
            'I-LOC',
            'O'
        ]

        iter = 0
        self.label2id = {}
        for tag in ner_tags:
            self.label2id[tag] = iter
            iter += 1

    def read_file(self, file, convert_labels=True):

        inps = []

        with open(file, 'r') as f:
            temp_tokens = []
            temp_labels = []
            for line in f:
                if line.strip():

                    token = line.strip().split('\t')
                    assert len(token) == 2

                    if convert_labels:
                        temp_tokens.append(token[0].replace(self.lang + ':', ''))
                        temp_labels.append(self.label2id[token[1]])

                    else:
                        temp_tokens.append(token[0].replace(self.lang + ':', ''))
                        temp_labels.append(token[1])

                else:
                    inps.append((temp_tokens,temp_labels))
                    temp_tokens = []
                    temp_labels = []
        return inps

    def encode(self, id):
        instance = self.examples[id]


        forms = instance[0]
        labels = instance[1]

        expanded_labels = []
        label_mask = []

        for i in range(0, len(forms)):

            subwords = self.tokenizer.tokenize(forms[i])

            if self.assignment == 'first':
                expanded_labels.append(labels[i])
                for j in range(1, len(subwords)):
                    expanded_labels.append(-100)
            elif self.assignment == 'all':
                for j in range(0,len(subwords)):
                    expanded_labels.append(labels[i])
                    if j < len(subwords) - 1:
                        label_mask.append(0)
                    else:
                        label_mask.append(1)

            elif self.assignment == 'last':
                for j in range(0,len(subwords)-1):
                    expanded_labels.append(-100)
                expanded_labels.append(labels[i])


        s1 = ' '.join(forms)

        enc = self.tokenizer(
            s1,
            max_length=self.max_len,
            truncation=True,
            return_token_type_ids=True,
        )



        if len(expanded_labels) > self.max_len:
            expanded_labels = expanded_labels[:self.max_len]

        enc['labels'] = expanded_labels

        return enc


if __name__ == '__main__':

    # x = NERDataset(
    #     file='data/ner/rahimi_output/eng/train',
    #     max_len=256,
    #     tokenizer=None,
    #     assignment='last'
    # )

    inps = []
    labels_found = []

    with open('/content/drive/MyDrive/{lang}/train'.format(lang=finetune_lang)) as f:
        temp_tokens = []
        for line in f:
            if line.strip():
                token = line.strip().split('\t')
                assert len(token) == 2
                temp_tokens.append(
                    (token[0].replace(finetune_lang + ':', ''), token[1])
                )
            else:
                inps.append(temp_tokens)
                temp_tokens = []

    print(inps[5])
    print(len(inps))

[('(', 'O'), ('أقاليم', 'B-LOC'), ('ما', 'I-LOC'), ('وراء', 'I-LOC'), ('البحار', 'I-LOC'), (')', 'O')]
20000


In [18]:
# filefinder
def biblelang2nerlang(bible_lang):
    language_mapping = '/content/drive/MyDrive/NER/bible_ner_xlmr_split.txt'
    with open(language_mapping, 'r') as f:
        for line in f:
            data = line.strip().split(',')
            if data[1] == bible_lang:
                return data[2]

def lang_to_ner(lang, split):

    ner_dir = '/content/drive/MyDrive/{lang}/{split}'.format(lang=lang, split=split)

    return ner_dir

In [19]:
# define training and evaluation dataset
ner_train_dataset = NERDataset(file=lang_to_ner(finetune_lang, 'train'),
                                       lang=finetune_lang, max_len=256, tokenizer=xlmr_tokenizer,
                                       assignment='last')


ner_eval_dataset = NERDataset(file=lang_to_ner(language, 'dev'),
                          lang=biblelang2nerlang(language), max_len=256, tokenizer=xlmr_tokenizer,
                          assignment='last')

(['دايو', '(', 'شركة', ')'], [0, 1, 1, 1])
----------------------------------------
(['انتاجه', 'فى', 'امريكا', 'كندا', 'وبيتسعر', 'غالبن', 'فى', 'الصين', '.'], [8, 8, 6, 6, 8, 8, 8, 6, 8])
----------------------------------------


In [24]:
ner_test_dataset = NERDataset(file=lang_to_ner(language, 'test'), lang=biblelang2nerlang(language), max_len=256, tokenizer=xlmr_tokenizer, assignment='last')


ner_warmup_steps = int((5 * (ner_train_dataset.__len__() // (32 * 4 * 1))) * .01)
!mkdir {finetune_lang}_finetuned_ner_model
ner_training_args = TrainingArguments(
    output_dir= '{finetune_lang}_finetuned_ner_model'.format(finetune_lang=finetune_lang),
    num_train_epochs=5,
    per_device_train_batch_size=8,
    logging_steps=25,
    save_total_limit=3,
    save_steps=3000,
    evaluation_strategy='epoch',
    eval_steps=50,
    learning_rate=2e-5,
    warmup_steps=ner_warmup_steps,
    disable_tqdm=False,
    gradient_accumulation_steps=4)

(['تحويل', 'احمد', 'بن', 'طولون'], [8, 2, 3, 3])
----------------------------------------


In [25]:
from transformers import AutoModelForTokenClassification

ner_model = AutoModelForTokenClassification.from_pretrained('/content/drive/MyDrive/{lang}_model/final_model'.format(lang=language),num_labels=len(ner_train_dataset.label2id))

ner_model.resize_token_embeddings(len(xlmr_tokenizer))


Some weights of the model checkpoint at /content/drive/MyDrive/arz_model/final_model were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at /content/dri

Embedding(281922, 384, padding_idx=0)

In [26]:
from sklearn.metrics import f1_score, accuracy_score
from seqeval.metrics import f1_score as seqeval_f1
from seqeval.metrics import accuracy_score as seqeval_accuracy


def create_id2label_ner():

    ner_tags = [
        'B-ORG',
        'I-ORG',
        'B-PER',
        'I-PER',
        'B-MISC',
        'I-MISC',
        'B-LOC',
        'I-LOC',
        'O'
    ]

    iter = 0
    id2label = {}
    for tag in ner_tags:
        id2label[iter] = tag
        iter += 1

    return id2label



def ner_metrics(eval_pred):

    labels = eval_pred.label_ids
    preds = eval_pred.predictions.argmax(-1)

    corrected_preds = []
    corrected_labels = []

    id2label = create_id2label_ner()

    for i in range(0, len(labels)):
        temp_pred = []
        temp_label = []
        for j in range(0, len(labels[i])):
            if labels[i][j] != -100:
                temp_label.append(id2label[labels[i][j]])
                temp_pred.append(id2label[preds[i][j]])

        corrected_labels.append(temp_label)
        corrected_preds.append(temp_pred)

    acc = seqeval_accuracy(corrected_labels, corrected_preds)
    f1 = seqeval_f1(corrected_labels, corrected_preds)

    f1 = f1 * 100
    acc = acc * 100

    print('F1 during training: {}'.format(f1))
    print('Accuracy during training: {}'.format(acc))
    print('---------------------------------------------')

    return {
        'accuracy': acc,
        'f1': f1
    }

In [27]:
ner_collator = DataCollatorForTokenClassification(
    tokenizer=xlmr_tokenizer,
    padding='longest'
)

trainer = Trainer(
    model=ner_model,
    data_collator=ner_collator,
    args=ner_training_args,
    train_dataset=ner_train_dataset,
    eval_dataset=ner_eval_dataset,
    compute_metrics=ner_metrics,
)

trainer.train()

results = trainer.predict(ner_test_dataset)
results = results.metrics

***** Running training *****
  Num examples = 20000
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 3125


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6976,0.806221,82.917214,54.263566
2,0.514,0.666207,84.756899,61.363636
3,0.3926,0.540101,86.859396,62.835249
4,0.3365,0.545243,86.465177,66.666667
5,0.3098,0.530828,86.596583,66.409266


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


F1 during training: 54.263565891472865
Accuracy during training: 82.91721419185282
---------------------------------------------


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


F1 during training: 61.363636363636374
Accuracy during training: 84.7568988173456
---------------------------------------------


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


F1 during training: 62.835249042145605
Accuracy during training: 86.85939553219448
---------------------------------------------


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


F1 during training: 66.66666666666667
Accuracy during training: 86.46517739816032
---------------------------------------------


Saving model checkpoint to ar_finetuned_ner_model/checkpoint-3000
Configuration saved in ar_finetuned_ner_model/checkpoint-3000/config.json
Model weights saved in ar_finetuned_ner_model/checkpoint-3000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


F1 during training: 66.40926640926641
Accuracy during training: 86.59658344283837
---------------------------------------------




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 100
  Batch size = 8


F1 during training: 68.85245901639344
Accuracy during training: 87.32970027247956
---------------------------------------------


In [28]:
results

{'test_accuracy': 87.32970027247956,
 'test_f1': 68.85245901639344,
 'test_loss': 0.5435361862182617,
 'test_runtime': 0.4363,
 'test_samples_per_second': 229.191,
 'test_steps_per_second': 29.795}