<a href="https://colab.research.google.com/github/ninja197/BAexperiments/blob/main/MiniLM_extend_ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets tokenizers seqeval sentencepiece
!pip install git+https://github.com/huggingface/transformers.git

Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 5.4 MB/s 
[?25hCollecting tokenizers
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 43.1 MB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.6 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 40.8 MB/s 
[?25hCollecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.3.0-py3-none-any.whl (136 kB)
[K     |████████████████████████████████| 136 kB 43.7 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     

In [2]:
import tensorflow as tf
from tokenizers import SentencePieceUnigramTokenizer
from transformers import AutoTokenizer, XLMRobertaTokenizer
from transformers import AutoModelForTokenClassification, AutoModelForMaskedLM
import copy
import os
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification, DataCollatorForLanguageModeling
import logging
import sentencepiece

In [3]:
from google.colab import drive
drive.mount('/content/drive')

import sys
sys.path.insert(0, '/content/drive/MyDrive/sp_model')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import sentencepiece_model_pb2 as sp_model

# Train language-specific SentencePiece and extend tokenizer vocabulary

In [5]:
language = 'arz'

In [8]:
# Load Bible file names

file_name = ''
for bible in os.listdir():
    code = bible[:3]
    if code == language:
        file_name = bible
        print('Using {} to extend vocabulary'.format(file_name))
        logging.info('Using {} to extend vocabulary'.format(file_name))
        break
file_name =  file_name

#Get new tokens from bible
spm_tokenizer = SentencePieceUnigramTokenizer()
spm_tokenizer.train(
    files=[file_name],
    vocab_size=32000)

new_tokens = spm_tokenizer.get_vocab().keys()

#Get tokens in XLM-R
orig_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
original_tokens = orig_tokenizer.get_vocab().keys()

#Get unique new tokens
new_unique_tokens = []
for token in new_tokens:
    if token not in original_tokens:
        new_unique_tokens.append(token)

print(len(new_tokens))
print(len(new_unique_tokens))

Using arz-x-bible-arz-v1.txt to extend vocabulary


Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

32000
31919


In [9]:
#Load pretrained XLM-R SPM
m = sp_model.ModelProto()
m.ParseFromString(open('/content/drive/MyDrive/sp_model/sentencepiece.bpe.model', 'rb').read())

#Create a raw SentencePiece (no other way to initialize?)
raw_piece = copy.deepcopy(m.pieces[50])

#Dummy piece added first to hold place for original mask token
dummy_mask = copy.deepcopy(raw_piece)
dummy_mask.piece = 'DUMMY_MASK'

m.pieces.append(dummy_mask)

#Add new tokens to SPM and save new model
for token in new_unique_tokens:
    temp_token = copy.deepcopy(raw_piece)
    temp_token.piece = token
    m.pieces.append(temp_token)

new_spm_save_dir = 'extended_spm.model'
!touch extended_spm.model
with open(new_spm_save_dir, 'wb') as f:
    f.write(m.SerializeToString())

#Load extended SPM as tokenizer

xlmr_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
xlmr_tokenizer.vocab_file = new_spm_save_dir
xlmr_tokenizer.sp_model.load(xlmr_tokenizer.vocab_file)

#Re-align mask token
xlmr_tokenizer.fairseq_tokens_to_ids['<mask>'] = xlmr_tokenizer._convert_token_to_id('DUMMY_MASK')

# Train embeddings on bible

In [7]:
import torch
import random
from collections import defaultdict

class BibleDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, max_len, source_language, target_language, helper_languages, configuration, pretraining_type):

        self.tokenizer = tokenizer
        self.max_len = max_len
        self.target_language = target_language
        self.helper_languages = helper_languages
        self.configuration = configuration
        self.pretraining_type = pretraining_type

        # Load Bible file names
        self.bibles = [bible for bible in os.listdir() if 'txt' in bible]

        #Load English examples
        self.english_lines = self.read_bible(source_language)
        self.target_lines = self.read_bible(target_language)

        if self.configuration == 'one':
            self.finalized_examples = []
            for i in self.target_lines:
                if self.target_lines[i]:
                    self.finalized_examples.append(self.target_lines[i])
        else:
            if self.pretraining_type == 'mlm':
                self.finalized_examples = self.create_mlm_examples()
            elif self.pretraining_type == 'tlm':
                self.finalized_examples = self.create_tlm_examples()

        random.seed(42)
        random.shuffle(self.finalized_examples)

        print(self.finalized_examples[:3])

    def __getitem__(self, idx):
        instance = self.finalized_examples[idx]

        enc = self.tokenizer(
            instance,
            max_length=self.max_len,
            truncation=True,
            return_token_type_ids= False,
            return_tensors='pt'
        )


        return {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
        }


    def __len__(self):
        return len(self.finalized_examples)

    def read_bible(self, language, helper=False):

        bible_file = None

        for bible in self.bibles:
            if bible[:3] == language:
                bible_file = bible


        line_dictionary = defaultdict(None)
        with open(bible_file, 'r') as f:
            for i, line in enumerate(f,1):
                if i >= 26282:
                    line_dictionary[i] = line.strip()

        return line_dictionary

    def create_mlm_examples(self):

        #Examples are unaligned. One training instance is one line from the bible in one language
        finalized_examples = []

        #Are always going to use English and target Bible lines

        for i in self.target_lines:
            if self.target_lines[i]:
                if self.english_lines[i]:
                    finalized_examples.append(self.english_lines[i])
                    finalized_examples.append(self.target_lines[i])

                if self.configuration in ['many-to-one', 'many-to-many']:
                    for lang in self.helper_language_lines:
                        if lang[i]:
                            finalized_examples.append(lang[i])

        return finalized_examples


In [8]:
# prepare dataset

cont_pre_dataset = BibleDataset(
    tokenizer=xlmr_tokenizer,
    max_len=256,
    target_language=language,
    source_language='eng',
   helper_languages=None,
    configuration='one',
    pretraining_type='mlm')

['أَفَقَدْ صِرْتُ إِذًا عَدُوًّا لَكُمْ لِأَنِّي أَصْدُقُ لَكُمْ ؟', 'حَتَّى تَعَجَّبَ ٱلْجُمُوعُ إِذْ رَأَوْا ٱلْخُرْسَ يَتَكَلَّمُونَ ، وَٱلشُّلَّ يَصِحُّونَ ، وَٱلْعُرْجَ يَمْشُونَ ، وَٱلْعُمْيَ يُبْصِرُونَ . وَمَجَّدُوا إِلَهَ إِسْرَائِيلَ .', 'وَيَبْكِي تُجَّارُ ٱلْأَرْضِ وَيَنُوحُونَ عَلَيْهَا ، لِأَنَّ بَضَائِعَهُمْ لَا يَشْتَرِيهَا أَحَدٌ فِي مَا بَعْدُ ،']


In [9]:
cont_pre_collator = DataCollatorForLanguageModeling(
    mlm=True,
    tokenizer=xlmr_tokenizer,
    mlm_probability=0.15)

In [10]:
# prepare training
cont_pre_warmup_steps = int((40 * (cont_pre_dataset.__len__() // (8 * 1 * 4))) * .01)

!mkdir temp_directory
cont_pre_training_args = TrainingArguments(
    output_dir='temp_directory',
    num_train_epochs=40,
    per_device_train_batch_size=8,
    save_steps=5000,
    logging_steps=50,
    save_total_limit=3,
    prediction_loss_only=True,
    evaluation_strategy='no',
    learning_rate=2e-5,
    warmup_steps=cont_pre_warmup_steps,
    dataloader_num_workers=0,
    disable_tqdm=False,
    gradient_accumulation_steps=4
)

In [14]:
# load model and extend embeddings

cont_pre_model = AutoModelForMaskedLM.from_pretrained('microsoft/Multilingual-MiniLM-L12-H384')
print(cont_pre_model)
cont_pre_model.resize_token_embeddings(len(xlmr_tokenizer))
print(cont_pre_model)
logging.info('Resizing token embedding to {}'.format(len(xlmr_tokenizer)))
cont_pre_trainer = Trainer(
                model=cont_pre_model,
                args=cont_pre_training_args,
                train_dataset=cont_pre_dataset,
                data_collator=cont_pre_collator
            )

loading configuration file https://huggingface.co/microsoft/Multilingual-MiniLM-L12-H384/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/12a5ad52cb7fc5542e16e354fe6eb487f2f87edac63bf85dc238b1236dbaf24c.ccf88548169a21266c411bcf65585ba761d762a9c85fde572f529806fdd94ee2
Model config BertConfig {
  "_name_or_path": "microsoft/Multilingual-MiniLM-L12-H384",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tokenizer_class": "XLMRobertaTokenizer",
  "transformers_version": "4.19.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 250037
}

loading weights file https://huggingface.co/microsoft/M

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(250037, 384, padding_idx=0)
      (position_embeddings): Embedding(512, 384)
      (token_type_embeddings): Embedding(2, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=T

In [15]:
# train and save extended model
cont_pre_trainer.train()
cont_pre_model.save_pretrained('final_model')

***** Running training *****
  Num examples = 7959
  Num Epochs = 40
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 9920


Step,Training Loss
50,12.0458
100,9.024
150,8.212
200,7.7986
250,7.5266
300,7.147
350,6.9606
400,6.9215
450,6.6727
500,6.7298


Saving model checkpoint to temp_directory/checkpoint-5000
Configuration saved in temp_directory/checkpoint-5000/config.json
Model weights saved in temp_directory/checkpoint-5000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in final_model/config.json
Model weights saved in final_model/pytorch_model.bin


# Finetuning

In [10]:
import torch 

class NERDataset(torch.utils.data.Dataset):
    def __init__(self, file, lang, max_len, tokenizer, assignment):

        self.tokenizer = tokenizer

        self.max_len = max_len
        self.assignment = assignment
        self.lang = lang

        self.create_label2id()

        self.examples = self.read_file(file)

        print(self.examples[0])
        print('----------------------------------------')

    def __getitem__(self, idx):
        return self.encode(idx)

    def __len__(self):
        return len(self.examples)

    def create_label2id(self):

        ner_tags = [
            'B-ORG',
            'I-ORG',
            'B-PER',
            'I-PER',
            'B-MISC',
            'I-MISC',
            'B-LOC',
            'I-LOC',
            'O'
        ]

        iter = 0
        self.label2id = {}
        for tag in ner_tags:
            self.label2id[tag] = iter
            iter += 1

    def read_file(self, file, convert_labels=True):

        inps = []

        with open(file, 'r') as f:
            temp_tokens = []
            temp_labels = []
            for line in f:
                if line.strip():

                    token = line.strip().split('\t')
                    assert len(token) == 2

                    if convert_labels:
                        temp_tokens.append(token[0].replace(self.lang + ':', ''))
                        temp_labels.append(self.label2id[token[1]])

                    else:
                        temp_tokens.append(token[0].replace(self.lang + ':', ''))
                        temp_labels.append(token[1])

                else:
                    inps.append((temp_tokens,temp_labels))
                    temp_tokens = []
                    temp_labels = []
        return inps

    def encode(self, id):
        instance = self.examples[id]


        forms = instance[0]
        labels = instance[1]

        expanded_labels = []
        label_mask = []

        for i in range(0, len(forms)):

            subwords = self.tokenizer.tokenize(forms[i])

            if self.assignment == 'first':
                expanded_labels.append(labels[i])
                for j in range(1, len(subwords)):
                    expanded_labels.append(-100)
            elif self.assignment == 'all':
                for j in range(0,len(subwords)):
                    expanded_labels.append(labels[i])
                    if j < len(subwords) - 1:
                        label_mask.append(0)
                    else:
                        label_mask.append(1)

            elif self.assignment == 'last':
                for j in range(0,len(subwords)-1):
                    expanded_labels.append(-100)
                expanded_labels.append(labels[i])


        s1 = ' '.join(forms)

        enc = self.tokenizer(
            s1,
            max_length=self.max_len,
            truncation=True,
            return_token_type_ids=True,
        )



        if len(expanded_labels) > self.max_len:
            expanded_labels = expanded_labels[:self.max_len]

        enc['labels'] = expanded_labels

        return enc


if __name__ == '__main__':

    # x = NERDataset(
    #     file='data/ner/rahimi_output/eng/train',
    #     max_len=256,
    #     tokenizer=None,
    #     assignment='last'
    # )

    inps = []
    labels_found = []
    lang='en'
    with open('/content/drive/MyDrive/en/train') as f:
        temp_tokens = []
        for line in f:
            if line.strip():
                token = line.strip().split('\t')
                assert len(token) == 2
                temp_tokens.append(
                    (token[0].replace(lang + ':', ''), token[1])
                )
            else:
                inps.append(temp_tokens)
                temp_tokens = []

    print(inps[5])
    print(len(inps))

[('St.', 'B-ORG'), ('Mary', 'I-ORG'), ("'s", 'I-ORG'), ('Catholic', 'I-ORG'), ('Church', 'I-ORG'), ('(', 'I-ORG'), ('Sandusky', 'I-ORG'), (',', 'I-ORG'), ('Ohio', 'I-ORG'), (')', 'I-ORG')]
20000


In [11]:
# filefinder
def biblelang2nerlang(bible_lang):
    language_mapping = '/content/drive/MyDrive/NER/bible_ner_xlmr_split.txt'
    with open(language_mapping, 'r') as f:
        for line in f:
            data = line.strip().split(',')
            if data[1] == bible_lang:
                return data[2]

def lang_to_ner(lang, split):

    ner_dir = '/content/drive/MyDrive/{lang}/{split}'.format(lang=biblelang2nerlang(lang), split=split)

    return ner_dir

In [19]:
# define training (english) and evaluation dataset
ner_train_dataset = NERDataset(file=lang_to_ner('eng', 'train'),
                                       lang='en', max_len=256, tokenizer=xlmr_tokenizer,
                                       assignment='last')


ner_eval_dataset = NERDataset(file=lang_to_ner(language, 'dev'),
                          lang=biblelang2nerlang(language), max_len=256, tokenizer=xlmr_tokenizer,
                          assignment='last')

(['R.H.', 'Saunders', '(', 'St.', 'Lawrence', 'River', ')', '(', '968', 'MW', ')'], [0, 1, 8, 0, 1, 1, 8, 8, 8, 8, 8])
----------------------------------------
(['انتاجه', 'فى', 'امريكا', 'كندا', 'وبيتسعر', 'غالبن', 'فى', 'الصين', '.'], [8, 8, 6, 6, 8, 8, 8, 6, 8])
----------------------------------------


In [20]:
ner_test_dataset = NERDataset(file=lang_to_ner(language, 'test'), lang=biblelang2nerlang(language), max_len=256, tokenizer=xlmr_tokenizer, assignment='last')


ner_warmup_steps = int((5 * (ner_train_dataset.__len__() // (32 * 4 * 1))) * .01)
!mkdir finetuned_ner_model
ner_training_args = TrainingArguments(
    output_dir= 'finetuned_ner_model',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    logging_steps=25,
    save_total_limit=3,
    save_steps=3000,
    evaluation_strategy='epoch',
    eval_steps=50,
    learning_rate=2e-5,
    warmup_steps=ner_warmup_steps,
    disable_tqdm=False,
    gradient_accumulation_steps=4)

(['تحويل', 'احمد', 'بن', 'طولون'], [8, 2, 3, 3])
----------------------------------------


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [22]:
from transformers import AutoModelForTokenClassification

ner_model = AutoModelForTokenClassification.from_pretrained('/content/drive/MyDrive/arz_model/final_model',num_labels=len(ner_train_dataset.label2id))

ner_model.resize_token_embeddings(len(xlmr_tokenizer))


loading configuration file /content/drive/MyDrive/arz_model/final_model/config.json
Model config BertConfig {
  "_name_or_path": "/content/drive/MyDrive/arz_model/final_model",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type":

Embedding(281922, 384, padding_idx=0)

In [23]:
from sklearn.metrics import f1_score, accuracy_score
from seqeval.metrics import f1_score as seqeval_f1
from seqeval.metrics import accuracy_score as seqeval_accuracy


def create_id2label_ner():

    ner_tags = [
        'B-ORG',
        'I-ORG',
        'B-PER',
        'I-PER',
        'B-MISC',
        'I-MISC',
        'B-LOC',
        'I-LOC',
        'O'
    ]

    iter = 0
    id2label = {}
    for tag in ner_tags:
        id2label[iter] = tag
        iter += 1

    return id2label



def ner_metrics(eval_pred):

    labels = eval_pred.label_ids
    preds = eval_pred.predictions.argmax(-1)

    corrected_preds = []
    corrected_labels = []

    id2label = create_id2label_ner()

    for i in range(0, len(labels)):
        temp_pred = []
        temp_label = []
        for j in range(0, len(labels[i])):
            if labels[i][j] != -100:
                temp_label.append(id2label[labels[i][j]])
                temp_pred.append(id2label[preds[i][j]])

        corrected_labels.append(temp_label)
        corrected_preds.append(temp_pred)

    acc = seqeval_accuracy(corrected_labels, corrected_preds)
    f1 = seqeval_f1(corrected_labels, corrected_preds)

    f1 = f1 * 100
    acc = acc * 100

    logging.info('F1 during training: {}'.format(f1))
    logging.info('Accuracy during training: {}'.format(acc))
    logging.info('---------------------------------------------')

    return {
        'accuracy': acc,
        'f1': f1
    }

In [24]:
ner_collator = DataCollatorForTokenClassification(
    tokenizer=xlmr_tokenizer,
    padding='longest'
)

trainer = Trainer(
    model=ner_model,
    data_collator=ner_collator,
    args=ner_training_args,
    train_dataset=ner_train_dataset,
    eval_dataset=ner_eval_dataset,
    compute_metrics=ner_metrics,
)

trainer.train()

results = trainer.predict(ner_test_dataset)
results = results.metrics

***** Running training *****
  Num examples = 20000
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 3125


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7242,1.085815,69.77661,34.551495
2,0.538,0.943449,72.667543,36.426117
3,0.4481,0.847316,75.952694,40.989399
4,0.3704,0.978441,71.353482,33.670034
5,0.3678,0.916222,74.244415,39.007092


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
Saving model checkpoint to finetuned_ner_model/checkpoint-3000
Configuration saved in finetuned_ner_model/checkpoint-3000/config.json
Model weights saved in finetuned_ner_model/checkpoint-3000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 100
  Batch size = 8


In [25]:
results

{'test_accuracy': 72.88828337874659,
 'test_f1': 45.32374100719424,
 'test_loss': 0.9322565197944641,
 'test_runtime': 0.3773,
 'test_samples_per_second': 265.017,
 'test_steps_per_second': 34.452}