# Prepare the model

## Load tokenizer

In [1]:
import tensorflow as tf
from transformers import TFXLMRobertaModel, XLMRobertaTokenizer, TFXLMRobertaForMaskedLM, XLMRobertaConfig
import os
import numpy as np
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
from transformers import create_optimizer, AdamWeightDecay


model_name = 'distill'
config = XLMRobertaConfig.from_pretrained('distill')
tokenizer = XLMRobertaTokenizer.from_pretrained('intfloat/multilingual-e5-small')



You are using a model of type bert to instantiate a model of type xlm-roberta. This is not supported for all configurations of models and can yield errors.


## Load Student model

In [2]:
student_model = TFXLMRobertaForMaskedLM.from_pretrained('intfloat/multilingual-e5-small', from_pt=True, config=config)
student_model.roberta.embeddings.trainable = False

student_model.summary(expand_nested=True)

You are using a model of type bert to instantiate a model of type xlm-roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFXLMRobertaForMaskedLM: ['embeddings.position_ids']
- This IS expected if you are initializing TFXLMRobertaForMaskedLM from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaForMaskedLM from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFXLMRobertaForMaskedLM were not initialized from the PyTorch model and are newly initialized: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_hea

Model: "tfxlm_roberta_for_masked_lm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFXLMRobertaMainL  multiple                  117505920 
 ayer)                                                           
                                                                 
 lm_head (TFXLMRobertaLMHea  multiple                  96610997  
 d)                                                              
                                                                 
Total params: 117904565 (449.77 MB)
Trainable params: 117904565 (449.77 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Prepare training Datasets

In [3]:
PRETRAINING_BATCH_SIZE = 126
PRETRAINING_LEARNING_RATE = 5e-4
PRETRAINING_EPOCHS = 8
VOCABULARY_SIZE = 250002
MASK_RATE = 0.25
PREDICTIONS_PER_SEQ = 32
SEQ_LENGTH = 512


folder = 'dataset/processed_uncased_blanklines/'
file_list = os.listdir(folder)
file_list = [f"{folder}/{_file}" for _file in file_list]




In [4]:
file_list

['dataset/processed_uncased_blanklines//conllu_all_uncased.txt',
 'dataset/processed_uncased_blanklines//oscar_all_uncased.txt',
 'dataset/processed_uncased_blanklines//talpco_indonesia.txt',
 'dataset/processed_uncased_blanklines//kompas.txt',
 'dataset/processed_uncased_blanklines//tempo.txt',
 'dataset/processed_uncased_blanklines//.DS_Store',
 'dataset/processed_uncased_blanklines//indo4b-1.txt',
 'dataset/processed_uncased_blanklines//jw300.txt',
 'dataset/processed_uncased_blanklines//13k_words.txt',
 'dataset/processed_uncased_blanklines//parallel_corpus.txt',
 'dataset/processed_uncased_blanklines//frog_storytelling.txt',
 'dataset/processed_uncased_blanklines//Archive.zip',
 'dataset/processed_uncased_blanklines//bppt.txt']

In [4]:
new_file_list = [
 'dataset/processed_uncased_blanklines/talpco_indonesia.txt',
#  'dataset/processed_uncased_blanklines/kompas.txt',
#  'dataset/processed_uncased_blanklines/tempo.txt',
 'dataset/processed_uncased_blanklines/jw300.txt',
 'dataset/processed_uncased_blanklines/13k_words.txt',
 'dataset/processed_uncased_blanklines/parallel_corpus.txt',
 'dataset/processed_uncased_blanklines/frog_storytelling.txt',
 'dataset/processed_uncased_blanklines/bppt.txt'
 ]

In [5]:
datasets = load_dataset("text", data_files={"train": new_file_list})

Found cached dataset text (/Users/mdaniyalk/.cache/huggingface/datasets/text/default-795907cc388a4c8a/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
label = np.load('')

In [6]:
def tokenize_function(examples):
    return tokenizer(examples["text"])

In [7]:
tokenized_datasets = datasets.map(
    tokenize_function, batched=True, num_proc=4, remove_columns=["text"]
)

Map (num_proc=4):   0%|          | 0/714969 [00:00<?, ? examples/s]

In [8]:
block_size = 64

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, though you could add padding instead if the model supports it
    # In this, as in all things, we advise you to follow your heart
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [9]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=8,
)

Map (num_proc=8):   0%|          | 0/714969 [00:00<?, ? examples/s]

# Traning the models

## Distill student models

In [None]:
inputs = tf.keras.layers.Input(shape=(len(lm_datasets['train']['input_ids'][0]),), dtype=tf.int32, name="input_ids")
embedding = student_model.roberta
outputs = embedding(inputs).last_hidden_state
model = tf.keras.Model(inputs, outputs, name="distill_xlmroberta")


In [11]:
optimizer = AdamWeightDecay(learning_rate=1e-2, weight_decay_rate=0.001)
loss = tf.keras.losses.CosineSimilarity()

model.compile(loss=loss, 
              optimizer=optimizer, 
              jit_compile=True, 
              metrics=[tf.keras.metrics.CosineSimilarity, 'mse', 'mae'])



In [None]:
X_train = lm_datasets['train']['input_ids']
y_train = labels

In [11]:
teacher_model.fit(X_train, y_train, epochs=2)

 552/3125 [====>.........................] - ETA: 18:25

 554/3125 [====>.........................] - ETA: 18:26