# Prepare the model

## Load tokenizer

In [None]:
import tensorflow as tf
from transformers import TFXLMRobertaModel, XLMRobertaTokenizer, TFXLMRobertaForMaskedLM, XLMRobertaConfig
import os
import numpy as np
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
from transformers import create_optimizer, AdamWeightDecay


# Identifier handed to `from_pretrained` to obtain the student configuration.
model_name = 'distill'
config = XLMRobertaConfig.from_pretrained(model_name)

# The tokenizer is taken from the multilingual-e5-small checkpoint.
tokenizer = XLMRobertaTokenizer.from_pretrained(
    'intfloat/multilingual-e5-small',
)



## Load Student model

In [None]:
# Instantiate the student masked-LM model from the multilingual-e5-small
# checkpoint using `config`; `from_pt=True` requests loading from PyTorch weights.
student_model = TFXLMRobertaForMaskedLM.from_pretrained(
    'intfloat/multilingual-e5-small',
    config=config,
    from_pt=True,
)

# Exclude the embedding layer from training.
student_model.roberta.embeddings.trainable = False

# Print the layer structure, expanding nested sub-layers.
student_model.summary(expand_nested=True)

## Prepare training Datasets

In [None]:
# Optimisation settings.
# NOTE(review): none of the constants in this cell are referenced by the other
# cells visible in this notebook — confirm whether they are still needed.
PRETRAINING_EPOCHS = 8
PRETRAINING_BATCH_SIZE = 126
PRETRAINING_LEARNING_RATE = 5e-4

# Sequence / masking settings.
SEQ_LENGTH = 512
MASK_RATE = 0.25
PREDICTIONS_PER_SEQ = 32
VOCABULARY_SIZE = 250_002


# Directory that holds the pre-processed corpus text files.
folder = 'dataset/processed_uncased_blanklines/'

# Full path of every entry in `folder`. os.path.join is used because `folder`
# already ends with a separator: plain string formatting would yield a doubled
# slash ("...blanklines//name.txt"). Sorting makes the listing deterministic,
# since os.listdir returns entries in arbitrary order.
file_list = [os.path.join(folder, _file) for _file in sorted(os.listdir(folder))]




In [None]:
# Bare expression: shows the collected file paths as the cell output.
file_list

In [None]:
# Explicit list of corpus files to load. The kompas and tempo entries are
# currently disabled (commented out).
new_file_list = [
    'dataset/processed_uncased_blanklines/talpco_indonesia.txt',
    # 'dataset/processed_uncased_blanklines/kompas.txt',
    # 'dataset/processed_uncased_blanklines/tempo.txt',
    'dataset/processed_uncased_blanklines/jw300.txt',
    'dataset/processed_uncased_blanklines/13k_words.txt',
    'dataset/processed_uncased_blanklines/parallel_corpus.txt',
    'dataset/processed_uncased_blanklines/frog_storytelling.txt',
    'dataset/processed_uncased_blanklines/bppt.txt',
]

In [None]:
# Read all listed files with the generic "text" loader into one "train" split.
datasets = load_dataset(
    "text",
    data_files={"train": new_file_list},
)

In [None]:
# Load the array later used as `y_train` (presumably the teacher outputs the
# student is trained to match — TODO confirm).
# NOTE(review): the path is an empty string, so this call fails until the real
# location of the saved array is filled in.
labels = np.load('')
# The training cell reads `labels`; keep the original singular name as an alias
# so any code still referring to `label` continues to work.
label = labels

In [None]:
def tokenize_function(examples):
    """Run the module-level ``tokenizer`` over the ``"text"`` entry of a batch.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw text.

    Returns:
        Whatever ``tokenizer`` returns for that text.
    """
    texts = examples["text"]
    return tokenizer(texts)

In [None]:
# Tokenize in batches using 4 worker processes; the raw "text" column is
# removed from the result.
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)

In [None]:
from itertools import chain

# Number of tokens in every chunk produced by group_texts.
block_size = 64


def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size blocks.

    Every column of the batch is flattened into one long sequence, the tail
    that does not fill a whole block is dropped, and the rest is cut into
    consecutive chunks of ``block_size`` items.

    Args:
        examples: Mapping from column name to a list of per-example sequences.
            It must contain an ``"input_ids"`` column.

    Returns:
        A dict with the same columns, each a list of ``block_size``-long
        chunks, plus a ``"labels"`` column that is a copy of the
        ``"input_ids"`` chunk list.
    """
    # Flatten each column in linear time; ``sum(lists, [])`` re-copies the
    # accumulated list on every addition and is quadratic in the batch size.
    concatenated_examples = {
        key: list(chain.from_iterable(examples[key])) for key in examples.keys()
    }
    # The length of the first column defines the total length, as before.
    first_key = list(examples.keys())[0]
    total_length = len(concatenated_examples[first_key])
    # Drop the small remainder so every chunk has exactly block_size items.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        key: [seq[start:start + block_size] for start in range(0, total_length, block_size)]
        for key, seq in concatenated_examples.items()
    }
    # Copy the outer list so "labels" and "input_ids" are distinct list objects.
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Regroup the tokenized examples into fixed-size blocks: group_texts receives
# batches of 1000 examples and the work is spread over 8 worker processes.
lm_datasets = tokenized_datasets.map(group_texts, batched=True, batch_size=1000, num_proc=8)

# Training the models

## Distill student models

In [None]:
# Wrap the student's `roberta` sub-model in a plain Keras model that maps token
# ids to its last hidden state.
# The input length is taken from the first training row. The row is indexed
# first so only that row is read, instead of loading the whole "input_ids"
# column just to measure one example.
inputs = tf.keras.layers.Input(
    shape=(len(lm_datasets['train'][0]['input_ids']),),
    dtype=tf.int32,
    name="input_ids",
)
embedding = student_model.roberta
outputs = embedding(inputs).last_hidden_state
model = tf.keras.Model(inputs, outputs, name="distill_xlmroberta")


In [None]:
# Optimizer and loss for fitting the student's hidden states to the targets.
# NOTE(review): the learning rate is hard-coded to 1e-2 here while
# PRETRAINING_LEARNING_RATE is defined but unused — confirm which is intended.
optimizer = AdamWeightDecay(learning_rate=1e-2, weight_decay_rate=0.001)
loss = tf.keras.losses.CosineSimilarity()

# Metrics must be names, functions or Metric *instances*. The CosineSimilarity
# metric class is therefore instantiated; passing the bare class would be
# treated as a metric function and called with (y_true, y_pred).
model.compile(
    loss=loss,
    optimizer=optimizer,
    jit_compile=True,
    metrics=[tf.keras.metrics.CosineSimilarity(), 'mse', 'mae'],
)

In [None]:
# Model inputs: the "input_ids" column of the training split.
X_train = lm_datasets['train']['input_ids']
# Training targets.
# NOTE(review): `labels` must already be defined by an earlier cell — verify the
# name matches the cell that loads the target array.
y_train = labels

In [None]:
# Train the Keras model built and compiled above. `teacher_model` is not defined
# in the visible cells; `model` is the object that was compiled for training.
model.fit(X_train, y_train, epochs=2)