In [1]:
import torch
import sentencepiece as spm
from transformers import XLMRobertaTokenizer, AutoModelForMaskedLM, AutoTokenizer
from tokenizers import SentencePieceBPETokenizer
from pathlib import Path
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the pretrained X-MOD masked-LM checkpoint and write a copy of it to disk.
XMOD_CHECKPOINT = "facebook/xmod-base-75-269k"
XMOD_LOCAL_DIR = "../models/xmod_75"

xmod_75 = AutoModelForMaskedLM.from_pretrained(XMOD_CHECKPOINT)
xmod_75.save_pretrained(XMOD_LOCAL_DIR)

In [None]:
# Extend the XLM-R vocabulary with the Yoruba SentencePiece pieces it does not
# already contain, then resize the model's token embeddings to the new size.
# https://github.com/huggingface/transformers/issues/1413

tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
old_vocab = tokenizer.get_vocab()
new_vocab = XLMRobertaTokenizer('../vocab/spm.yoruba.model').get_vocab()

# Sort the difference. Iteration order of a set of strings changes from one
# interpreter run to the next (hash randomisation), and add_tokens() assigns
# ids in list order, so an unsorted list yields a different token -> id mapping
# every time the kernel is restarted.
# NOTE(review): pieces are added verbatim, including any SentencePiece '▁'
# word-boundary marker — confirm that is the intended matching behaviour.
non_overlap = sorted(set(new_vocab) - set(old_vocab))
print("Non overlapping tokens: ", len(non_overlap))

tokenizer.add_tokens(non_overlap)
# Compute the size once so the printed value and the resize target cannot drift.
new_vocab_size = len(tokenizer.get_vocab())
print("New vocab size with added tokens: ", new_vocab_size)
xmod_75.resize_token_embeddings(new_vocab_size)

# Show a sample only; dumping every added token floods the cell output.
print(non_overlap[:50])

In [7]:
# Write the extended tokenizer to disk; the returned tuple of written file
# paths is left as the cell's displayed value.
tokenizer_dir = "../swissbert/tokenizer"
tokenizer.save_pretrained(tokenizer_dir)

('../swissbert/tokenizer/tokenizer_config.json',
 '../swissbert/tokenizer/special_tokens_map.json',
 '../swissbert/tokenizer/sentencepiece.bpe.model',
 '../swissbert/tokenizer/added_tokens.json')

In [8]:
# Smoke test: split a sample string into tokens and look up their ids.

input_text = "awọnnitilati"
tokens = tokenizer.tokenize(input_text)
input_ids = tokenizer.convert_tokens_to_ids(tokens)

# Print each result next to its label, one per line.
for label, value in (("Tokens:", tokens), ("Input IDs:", input_ids)):
    print(label, value)

Tokens: ['awọn', '▁niti', 'lati']
Input IDs: [262533, 23311, 43602]


In [4]:
# Register a freshly constructed Yoruba adapter in every encoder layer.

from adapter import XmodAdapter

model = xmod_75
new_lang = 'yo_XX'

# One new adapter per layer, stored under the language code in that layer's
# adapter_modules mapping (any existing entry for the code is replaced).
encoder_layers = model.roberta.encoder.layer
for layer_idx in range(model.config.num_hidden_layers):
    encoder_layers[layer_idx].output.adapter_modules[new_lang] = XmodAdapter(model.config)

# List the language on the config exactly once.
known_languages = model.config.languages
if new_lang not in known_languages:
    known_languages.append(new_lang)

In [36]:
# Save the adapter-augmented model and the extended tokenizer into one directory.
import os

# NOTE(review): `dir` shadows the builtin of the same name. The name is kept
# here because cells outside this view may read it; consider renaming it
# consistently across the notebook.
dir = '../models/phase1_yoruba'

# exist_ok=True replaces the exists()/makedirs() pair, which could fail if the
# directory appeared between the check and the create.
os.makedirs(dir, exist_ok=True)

model.save_pretrained(dir)
# Last expression: the tuple of written tokenizer files is the cell's output.
tokenizer.save_pretrained(dir)


('../models/phase1_yoruba/tokenizer_config.json',
 '../models/phase1_yoruba/special_tokens_map.json',
 '../models/phase1_yoruba/sentencepiece.bpe.model',
 '../models/phase1_yoruba/added_tokens.json')

In [None]:
def freeze_shared_layers(model):
    """Leave only the embeddings and the per-layer adapter parts trainable.

    Every parameter of ``model`` is first marked as not requiring gradients.
    Gradients are then re-enabled for ``model.roberta.embeddings`` and, in
    each entry of ``model.roberta.encoder.layer``, for
    ``output.adapter_modules`` and for ``output.adapter_layer_norm`` when
    that attribute is not ``None``.

    The model is modified in place; nothing is returned.
    """

    def _set_trainable(parameters, trainable):
        # Flip the requires_grad flag on each parameter of the iterable.
        for tensor in parameters:
            tensor.requires_grad = trainable

    # Start from a fully frozen model.
    _set_trainable(model.parameters(), False)

    # Re-enable the embeddings.
    _set_trainable(model.roberta.embeddings.parameters(), True)

    # Re-enable the adapter-related parameters of every encoder layer.
    for block in model.roberta.encoder.layer:
        block_output = block.output
        if block_output.adapter_layer_norm is not None:
            _set_trainable(block_output.adapter_layer_norm.parameters(), True)
        _set_trainable(block_output.adapter_modules.parameters(), True)


# Apply the freeze policy to `model`; this flips requires_grad flags in place.
freeze_shared_layers(model)
# Optional sanity check — uncomment to list each parameter's trainable flag:
# for name, param in model.named_parameters():
#     print(name, param.requires_grad)