<a href="https://colab.research.google.com/github/ninja197/BAexperiments/blob/main/MiniLM_extend_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets tokenizers seqeval sentencepiece
!pip install git+https://github.com/huggingface/transformers.git

Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 4.1 MB/s 
[?25hCollecting tokenizers
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 72.8 MB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 2.9 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 57.9 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 9.5 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |███████████████████████████

In [2]:
import tensorflow as tf
from tokenizers import SentencePieceUnigramTokenizer
from transformers import AutoTokenizer, XLMRobertaTokenizer
from transformers import AutoModelForTokenClassification, AutoModelForMaskedLM
import copy
import os
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification, DataCollatorForLanguageModeling
import sentencepiece

In [3]:
from google.colab import drive
drive.mount('/content/drive')

import sys
sys.path.insert(0, '/content/drive/MyDrive/sp_model')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import sentencepiece_model_pb2 as sp_model

# Train language-specific SentencePiece and extend tokenizer vocabulary

In [5]:
language = 'yor'

In [6]:
# Load Bible file names

file_name = ''
for bible in os.listdir():
    code = bible[:3]
    if code == language:
        file_name = bible
        print('Using {} to extend vocabulary'.format(file_name))
        break
file_name =  file_name

#Get new tokens from bible
spm_tokenizer = SentencePieceUnigramTokenizer()
spm_tokenizer.train(
    files=[file_name],
    vocab_size=32000)

new_tokens = spm_tokenizer.get_vocab().keys()

#Get tokens in XLM-R
orig_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
original_tokens = orig_tokenizer.get_vocab().keys()

#Get unique new tokens
new_unique_tokens = []
for token in new_tokens:
    if token not in original_tokens:
        new_unique_tokens.append(token)

print(len(new_tokens))
print(len(new_unique_tokens))

Using yor-x-bible-yor-v1.txt to extend vocabulary


Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

4232
2897


In [7]:
#Load pretrained XLM-R SPM
m = sp_model.ModelProto()
m.ParseFromString(open('/content/drive/MyDrive/sp_model/sentencepiece.bpe.model', 'rb').read())

#Create a raw SentencePiece (no other way to initialize?)
raw_piece = copy.deepcopy(m.pieces[50])

#Dummy piece added first to hold place for original mask token
dummy_mask = copy.deepcopy(raw_piece)
dummy_mask.piece = 'DUMMY_MASK'

m.pieces.append(dummy_mask)

#Add new tokens to SPM and save new model
for token in new_unique_tokens:
    temp_token = copy.deepcopy(raw_piece)
    temp_token.piece = token
    m.pieces.append(temp_token)

new_spm_save_dir = 'extended_spm.model'
!touch extended_spm.model
with open(new_spm_save_dir, 'wb') as f:
    f.write(m.SerializeToString())

#Load extended SPM as tokenizer

xlmr_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
xlmr_tokenizer.vocab_file = new_spm_save_dir
xlmr_tokenizer.sp_model.load(xlmr_tokenizer.vocab_file)

#Re-align mask token
xlmr_tokenizer.fairseq_tokens_to_ids['<mask>'] = xlmr_tokenizer._convert_token_to_id('DUMMY_MASK')

# Train embeddings on bible

In [8]:
import torch
import random
from collections import defaultdict

class BibleDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, max_len, source_language, target_language, helper_languages, configuration, pretraining_type):

        self.tokenizer = tokenizer
        self.max_len = max_len
        self.target_language = target_language
        self.helper_languages = helper_languages
        self.configuration = configuration
        self.pretraining_type = pretraining_type

        # Load Bible file names
        self.bibles = [bible for bible in os.listdir() if 'txt' in bible]

        # Load English examples
        # self.english_lines = self.read_bible(source_language)
        self.target_lines = self.read_bible(target_language)

        if self.configuration == 'one':
            self.finalized_examples = []
            for i in self.target_lines:
                if self.target_lines[i]:
                    self.finalized_examples.append(self.target_lines[i])

        random.seed(42)
        random.shuffle(self.finalized_examples)

        print(self.finalized_examples[:3])

    def __getitem__(self, idx):
        instance = self.finalized_examples[idx]

        enc = self.tokenizer(
            instance,
            max_length=self.max_len,
            truncation=True,
            return_token_type_ids= False,
            return_tensors='pt'
        )


        return {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
        }


    def __len__(self):
        return len(self.finalized_examples)

    def read_bible(self, language, helper=False):

        bible_file = None

        for bible in self.bibles:
            if bible[:3] == language:
                bible_file = bible


        line_dictionary = defaultdict(None)
        with open(bible_file, 'r') as f:
            for i, line in enumerate(f,1):
                if i >= 26282:
                    line_dictionary[i] = line.strip()

        return line_dictionary



In [9]:
# prepare dataset

cont_pre_dataset = BibleDataset(
    tokenizer=xlmr_tokenizer,
    max_len=256,
    target_language=language,
    source_language='eng',
    helper_languages=None,
    configuration='one',
    pretraining_type='mlm')

['Nigbana ni nwọn tutọ́ si i loju , nwọn kàn a lẹṣẹ́ ; awọn ẹlomiran fi atẹ́lọwọ́ wọn gbá a loju ;', 'Nitori awọn eke Kristi , ati eke wolĩ yio dide , nwọn o si fi àmi ati ohun iyanu nla hàn ; tobẹ̃ bi o le ṣe ṣe nwọn o tàn awọn ayanfẹ pãpã .', 'Njẹ bi o ba ni ọkọ miran nigbati ọkọ rẹ̀ wà lãye , panṣaga li a o pè e : ṣugbọn bi ọkọ rẹ̀ ba kú , o bọ lọwọ ofin na ; ki yio si jẹ panṣaga bi o ba ni ọkọ miran .']


In [10]:
cont_pre_collator = DataCollatorForLanguageModeling(
    mlm=True,
    tokenizer=xlmr_tokenizer,
    mlm_probability=0.15)

In [11]:
# prepare training
cont_pre_warmup_steps = int((40 * (cont_pre_dataset.__len__() // (8 * 1 * 4))) * .01)

!mkdir temp_directory
cont_pre_training_args = TrainingArguments(
    output_dir='temp_directory',
    num_train_epochs=40,
    per_device_train_batch_size=8,
    save_steps=5000,
    logging_steps=50,
    save_total_limit=3,
    prediction_loss_only=True,
    evaluation_strategy='no',
    learning_rate=2e-5,
    warmup_steps=cont_pre_warmup_steps,
    dataloader_num_workers=0,
    disable_tqdm=False,
    gradient_accumulation_steps=4
)

In [12]:
# load model and extend embeddings

cont_pre_model = AutoModelForMaskedLM.from_pretrained('microsoft/Multilingual-MiniLM-L12-H384')
cont_pre_model.resize_token_embeddings(len(xlmr_tokenizer))
print('Resizing token embedding to {}'.format(len(xlmr_tokenizer)))

cont_pre_trainer = Trainer(
                model=cont_pre_model,
                args=cont_pre_training_args,
                train_dataset=cont_pre_dataset,
                data_collator=cont_pre_collator
            )

Downloading:   0%|          | 0.00/430 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/449M [00:00<?, ?B/s]

Some weights of BertForMaskedLM were not initialized from the model checkpoint at microsoft/Multilingual-MiniLM-L12-H384 and are newly initialized: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Resizing token embedding to 252900


In [13]:
# train and save extended model
cont_pre_trainer.train()
cont_pre_model.save_pretrained('model_extended')

***** Running training *****
  Num examples = 7738
  Num Epochs = 40
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 9680


Step,Training Loss
50,11.914
100,7.5386
150,6.523
200,6.0707
250,5.7965
300,5.452
350,5.2617
400,5.0484
450,4.8986
500,4.7746


Saving model checkpoint to temp_directory/checkpoint-5000
Configuration saved in temp_directory/checkpoint-5000/config.json
Model weights saved in temp_directory/checkpoint-5000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in model_extended/config.json
Model weights saved in model_extended/pytorch_model.bin


# Finetuning

In [14]:
import torch 

class NERDataset(torch.utils.data.Dataset):
    def __init__(self, file, lang, max_len, tokenizer, assignment):

        self.tokenizer = tokenizer

        self.max_len = max_len
        self.assignment = assignment
        self.lang = lang

        self.create_label2id()

        self.examples = self.read_file(file)

        print(self.examples[0])
        print('----------------------------------------')

    def __getitem__(self, idx):
        return self.encode(idx)

    def __len__(self):
        return len(self.examples)

    def create_label2id(self):

        ner_tags = [
            'B-ORG',
            'I-ORG',
            'B-PER',
            'I-PER',
            'B-MISC',
            'I-MISC',
            'B-LOC',
            'I-LOC',
            'O'
        ]

        iter = 0
        self.label2id = {}
        for tag in ner_tags:
            self.label2id[tag] = iter
            iter += 1

    def read_file(self, file, convert_labels=True):

        inps = []

        with open(file, 'r') as f:
            temp_tokens = []
            temp_labels = []
            for line in f:
                if line.strip():

                    token = line.strip().split('\t')
                    assert len(token) == 2

                    if convert_labels:
                        temp_tokens.append(token[0].replace(self.lang + ':', ''))
                        temp_labels.append(self.label2id[token[1]])

                    else:
                        temp_tokens.append(token[0].replace(self.lang + ':', ''))
                        temp_labels.append(token[1])

                else:
                    inps.append((temp_tokens,temp_labels))
                    temp_tokens = []
                    temp_labels = []
        return inps

    def encode(self, id):
        instance = self.examples[id]


        forms = instance[0]
        labels = instance[1]

        expanded_labels = []
        label_mask = []

        for i in range(0, len(forms)):

            subwords = self.tokenizer.tokenize(forms[i])

            if self.assignment == 'first':
                expanded_labels.append(labels[i])
                for j in range(1, len(subwords)):
                    expanded_labels.append(-100)
            elif self.assignment == 'all':
                for j in range(0,len(subwords)):
                    expanded_labels.append(labels[i])
                    if j < len(subwords) - 1:
                        label_mask.append(0)
                    else:
                        label_mask.append(1)

            elif self.assignment == 'last':
                for j in range(0,len(subwords)-1):
                    expanded_labels.append(-100)
                expanded_labels.append(labels[i])


        s1 = ' '.join(forms)

        enc = self.tokenizer(
            s1,
            max_length=self.max_len,
            truncation=True,
            return_token_type_ids=True,
        )



        if len(expanded_labels) > self.max_len:
            expanded_labels = expanded_labels[:self.max_len]

        enc['labels'] = expanded_labels

        return enc


In [15]:
# filefinder
def biblelang2nerlang(bible_lang):
    language_mapping = '/content/drive/MyDrive/NER/bible_ner_xlmr_split.txt'
    with open(language_mapping, 'r') as f:
        for line in f:
            data = line.strip().split(',')
            if data[1] == bible_lang:
                return data[2]

def lang_to_ner(lang, split):

    ner_dir = '/content/drive/MyDrive/{lang}/{split}'.format(lang=biblelang2nerlang(lang), split=split)

    return ner_dir

In [16]:
# define training (english) and evaluation dataset (target language)
ner_train_dataset = NERDataset(file=lang_to_ner('eng', 'train'),
                                       lang='en', max_len=256, tokenizer=xlmr_tokenizer,
                                       assignment='last')


ner_eval_dataset = NERDataset(file=lang_to_ner(language, 'dev'),
                          lang=biblelang2nerlang(language), max_len=256, tokenizer=xlmr_tokenizer,
                          assignment='last')

(['R.H.', 'Saunders', '(', 'St.', 'Lawrence', 'River', ')', '(', '968', 'MW', ')'], [0, 1, 8, 0, 1, 1, 8, 8, 8, 8, 8])
----------------------------------------
(['Ilé-ìgbìmọ̀', 'Aṣòfin', 'Oníbínibí', 'ilẹ̀', 'Nàìjíríà'], [0, 1, 1, 1, 1])
----------------------------------------


In [17]:
ner_test_dataset = NERDataset(file=lang_to_ner(language, 'test'), lang=biblelang2nerlang(language), max_len=256, tokenizer=xlmr_tokenizer, assignment='last')

print('NER Train set length: {}'.format(ner_train_dataset.__len__()))
print('NER Eval set length: {}'.format(ner_eval_dataset.__len__()))
print('NER Test set length: {}'.format(ner_test_dataset.__len__()))

ner_warmup_steps = int((5 * (ner_train_dataset.__len__() // (32 * 4 * 1))) * .01)

!mkdir finetuned_ner_model_extend
ner_training_args = TrainingArguments(
    output_dir= 'finetuned_ner_model_extend',
    num_train_epochs=5,
    per_device_train_batch_size=32,
    logging_steps=25,
    save_total_limit=3,
    save_steps=3000,
    evaluation_strategy='epoch',
    eval_steps=50,
    learning_rate=2e-5,
    warmup_steps=ner_warmup_steps,
    disable_tqdm=False,
    gradient_accumulation_steps=1)


(['Agbègbè', 'Ìjọba', 'Ìbílẹ̀', 'Gúúsù-Ìwọòrùn', 'Akoko'], [6, 7, 7, 7, 7])
----------------------------------------
NER Train set length: 20000
NER Eval set length: 100
NER Test set length: 100


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [18]:
from transformers import AutoModelForTokenClassification

ner_model = AutoModelForTokenClassification.from_pretrained('model_extended',num_labels=len(ner_train_dataset.label2id))

ner_model.resize_token_embeddings(len(xlmr_tokenizer))


loading configuration file model_extended/config.json
Model config BertConfig {
  "_name_or_path": "model_extended",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tokenizer_class": "XLMRobertaTokenizer",
  "

Embedding(252900, 384, padding_idx=0)

In [19]:
from sklearn.metrics import f1_score, accuracy_score
from seqeval.metrics import f1_score as seqeval_f1
from seqeval.metrics import accuracy_score as seqeval_accuracy


def create_id2label_ner():

    ner_tags = [
        'B-ORG',
        'I-ORG',
        'B-PER',
        'I-PER',
        'B-MISC',
        'I-MISC',
        'B-LOC',
        'I-LOC',
        'O'
    ]

    iter = 0
    id2label = {}
    for tag in ner_tags:
        id2label[iter] = tag
        iter += 1

    return id2label



def ner_metrics(eval_pred):

    labels = eval_pred.label_ids
    preds = eval_pred.predictions.argmax(-1)

    corrected_preds = []
    corrected_labels = []

    id2label = create_id2label_ner()

    for i in range(0, len(labels)):
        temp_pred = []
        temp_label = []
        for j in range(0, len(labels[i])):
            if labels[i][j] != -100:
                temp_label.append(id2label[labels[i][j]])
                temp_pred.append(id2label[preds[i][j]])

        corrected_labels.append(temp_label)
        corrected_preds.append(temp_pred)

    acc = seqeval_accuracy(corrected_labels, corrected_preds)
    f1 = seqeval_f1(corrected_labels, corrected_preds)

    f1 = f1 * 100
    acc = acc * 100

    print('F1 during training: {}'.format(f1))
    print('Accuracy during training: {}'.format(acc))
    print('---------------------------------------------')

    return {
        'accuracy': acc,
        'f1': f1
    }

In [20]:
ner_collator = DataCollatorForTokenClassification(
    tokenizer=xlmr_tokenizer,
    padding='longest'
)

trainer = Trainer(
    model=ner_model,
    data_collator=ner_collator,
    args=ner_training_args,
    train_dataset=ner_train_dataset,
    eval_dataset=ner_eval_dataset,
    compute_metrics=ner_metrics,
)

trainer.train()

results = trainer.predict(ner_test_dataset)
results = results.metrics

***** Running training *****
  Num examples = 20000
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3125


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7401,1.353779,56.03272,27.131783
2,0.5448,1.399604,55.419223,27.888446
3,0.4413,1.316902,55.623722,25.660377
4,0.3731,1.340388,56.03272,34.309623
5,0.3765,1.273266,56.237219,33.198381


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


F1 during training: 27.131782945736433
Accuracy during training: 56.032719836400815
---------------------------------------------


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


F1 during training: 27.88844621513944
Accuracy during training: 55.419222903885476
---------------------------------------------


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


F1 during training: 25.660377358490567
Accuracy during training: 55.623721881390594
---------------------------------------------


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


F1 during training: 34.30962343096235
Accuracy during training: 56.032719836400815
---------------------------------------------


Saving model checkpoint to finetuned_ner_model_extend/checkpoint-3000
Configuration saved in finetuned_ner_model_extend/checkpoint-3000/config.json
Model weights saved in finetuned_ner_model_extend/checkpoint-3000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


F1 during training: 33.198380566801625
Accuracy during training: 56.23721881390593
---------------------------------------------




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 100
  Batch size = 8


F1 during training: 41.52542372881356
Accuracy during training: 64.21471172962227
---------------------------------------------


In [21]:
results

{'test_accuracy': 64.21471172962227,
 'test_f1': 41.52542372881356,
 'test_loss': 1.1329272985458374,
 'test_runtime': 0.1904,
 'test_samples_per_second': 525.143,
 'test_steps_per_second': 68.269}