# Dataset

## Indo4B

In [None]:
!wget https://storage.googleapis.com/babert-pretraining/IndoNLU_finals/dataset/preprocessed/dataset_wot_uncased_blanklines.tar.xz
!tar -xvf dataset_wot_uncased_blanklines.tar.xz --directory dataset

## CCNEWS-ID

In [None]:
!wget https://storage.depia.wiki/ccnews-id.tar
!tar -xvf ccnews-id.tar --directory dataset

In [None]:
!wget https://raw.githubusercontent.com/Wikidepia/indonesian_datasets/master/dictionary/wordlist/data/wordlist.txt

# Prepare the model

## Load tokenizer

In [None]:
import tensorflow as tf
from transformers import TFXLMRobertaModel, XLMRobertaTokenizer, TFXLMRobertaForMaskedLM, XLMRobertaConfig
import os
import numpy as np
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
from transformers import create_optimizer, AdamWeightDecay


# Load the student configuration and the tokenizer from the same `model_name`
# location. NOTE(review): "distill" looks like a local directory — confirm it
# exists relative to the notebook before running.
model_name = "distill"
config = XLMRobertaConfig.from_pretrained(model_name)
tokenizer = XLMRobertaTokenizer.from_pretrained(
    model_name,
    config=config,
)



## Create student model with teacher weights

In [None]:
# Build the student masked-LM model from the PyTorch checkpoint (`from_pt=True`),
# but with the `config` object loaded earlier rather than the checkpoint's own.
# NOTE(review): if `config` differs in depth/width from the checkpoint, confirm
# the weights are loaded the way the distillation setup expects.
student_model = TFXLMRobertaForMaskedLM.from_pretrained(
    "intfloat/multilingual-e5-small",
    config=config,
    from_pt=True,
)

# Print the layer tree, including nested sub-layers.
student_model.summary(expand_nested=True)

## Load teacher model

In [None]:
# Build the teacher masked-LM model from the PyTorch checkpoint (`from_pt=True`)
# using the checkpoint's own configuration.
teacher_model = TFXLMRobertaForMaskedLM.from_pretrained(
    "intfloat/multilingual-e5-small",
    from_pt=True,
)

# Print the layer tree, including nested sub-layers.
teacher_model.summary(expand_nested=True)

## Prepare distilled model

In [None]:
# Symbolic token-id input for the distillation graph. The sequence axis is left
# as None because the grouped dataset that fixes the block length is only built
# in a later cell.
# NOTE(review): the original placeholder here was `???`; confirm this input
# spec (ids only, no attention mask) is what the distillation setup needs.
inputs = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="input_ids")

# `student_embedding` is the encoder sub-layer of `student_model`, so both calls
# below run through the same encoder weights.
student_embedding = student_model.roberta
student_embedding_out = student_embedding(inputs)
student_mlm_out = student_model(inputs)

# Expose plain tensors as the model outputs and name them so that the keys of
# the `loss` dict below refer to real output names. "linear" is the identity
# activation, so the tensors pass through unchanged.
head1 = tf.keras.layers.Activation("linear", name="head1")(student_embedding_out.last_hidden_state)
head2 = tf.keras.layers.Activation("linear", name="head2")(student_mlm_out.logits)

model = tf.keras.Model(inputs, [head1, head2], name="distilled_xlmroberta")

# Per-output losses, keyed by output name.
# NOTE(review): `loss` is not passed to any compile() call in the cells visible here.
loss = {"head1": "mse", "head2": "mse"}


## Prepare training Datasets

In [None]:
# Pre-training hyperparameters.
# NOTE(review): the cells visible in this notebook hard-code batch_size=8,
# mlm_probability=0.15 and block_size=64 rather than reading these constants —
# confirm which values are intended.
PRETRAINING_BATCH_SIZE = 126
PRETRAINING_LEARNING_RATE = 0.0005
PRETRAINING_EPOCHS = 8

# Tokenizer / masking / sequence settings.
VOCABULARY_SIZE = 250_002
MASK_RATE = 0.25
PREDICTIONS_PER_SEQ = 32
SEQ_LENGTH = 512


# Directory that is expected to hold the extracted text shards.
folder = 'dataset/processed_uncased_blanklines/'

# os.listdir() returns entries in arbitrary order, so sort the listing to make
# `file_list[-1]` pick the same shard on every run. os.path.join also avoids the
# doubled slash that plain string formatting produced with the trailing "/".
file_list = [os.path.join(folder, _file) for _file in sorted(os.listdir(folder))]
if not file_list:
    raise FileNotFoundError(f"No files found in {folder!r}; extract the dataset archive first.")

# Only the last shard (in sorted order) is loaded as the train split.
datasets = load_dataset("text", data_files={"train": file_list[-1]})


In [None]:
def tokenize_function(examples):
    """Run the notebook-level `tokenizer` on the "text" column of a batch.

    Args:
        examples: Mapping with a "text" entry holding the raw text(s).

    Returns:
        Whatever `tokenizer` returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts)

In [None]:
# Tokenize every split in batches across 4 worker processes and drop the raw
# "text" column from the result.
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
    num_proc=4,
)

In [None]:
block_size = 64


def group_texts(examples):
    """Concatenate a batch of tokenized examples and cut it into fixed-size blocks.

    Args:
        examples: Mapping from column name to a list of per-example lists
            (e.g. "input_ids" -> list of token-id lists). Must contain "input_ids".

    Returns:
        A dict with the same columns, where each column is a list of chunks of
        exactly `block_size` items, plus a "labels" column that is a shallow
        copy of the chunked "input_ids" list. Items left over after the last
        full block are dropped.
    """
    from itertools import chain

    # Flatten each column in linear time; `sum(lists, [])` rebuilds the
    # accumulated list on every addition and is quadratic in the batch size.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Round down to a whole number of blocks; the remainder is discarded
    # (padding would be the alternative if the model supports it).
    total_length = (total_length // block_size) * block_size
    # Split every column into consecutive chunks of `block_size`.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Regroup the tokenized splits into fixed-size blocks: `group_texts` receives
# batches of 1000 rows and the work is spread over 8 processes.
lm_datasets = tokenized_datasets.map(group_texts, batched=True, batch_size=1000, num_proc=8)

In [None]:
# Masked-language-modeling collator that returns NumPy batches.
# NOTE(review): MASK_RATE = 0.25 is defined in the constants cell, but the
# masking probability used here is 0.15 — confirm which one is intended.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
    return_tensors="np",
)

In [None]:
# Wrap the grouped train split as a shuffled, batched dataset for Keras, with
# batches assembled by `data_collator`.
# NOTE(review): batch size is 8 here while PRETRAINING_BATCH_SIZE is 126 — confirm.
train_set = teacher_model.prepare_tf_dataset(
    lm_datasets["train"],
    collate_fn=data_collator,
    batch_size=8,
    shuffle=True,
)

In [None]:
# NOTE(review): looks like a leftover smoke-test cell; nothing visible depends on it.
print("a")

In [None]:
# Display the grouped train split as the cell output.
lm_datasets["train"]

In [None]:
# Preview the token ids of the first few rows only. Slicing the rows before
# selecting the column avoids loading and printing the entire `input_ids` column.
lm_datasets['train'][:3]['input_ids']

In [None]:
# Sanity check: print the first single example from the un-batched training set.
unbatched_dataset = train_set.unbatch()
for idx, example in enumerate(unbatched_dataset.take(1)):
    print(example)

# Training the models

## Fine-tune teacher models

In [None]:
# Optimizer for fine-tuning the teacher.
optimizer = AdamWeightDecay(weight_decay_rate=0.01, learning_rate=1e-3)

# No `loss` is passed to compile().
# NOTE(review): presumably the model's built-in loss is used in that case — confirm.
teacher_model.compile(
    optimizer=optimizer,
    metrics=['accuracy', tf.keras.metrics.SparseTopKCategoricalAccuracy(k=3)],
    jit_compile=True,
)

In [None]:
def _fit_teacher_phase(train_backbone, epochs):
    """Set the teacher encoder's `trainable` flag, recompile, and fit.

    Args:
        train_backbone: Whether `teacher_model.roberta` is trainable in this phase.
        epochs: Number of epochs to train on `train_set`.

    Returns:
        The object returned by `teacher_model.fit`.
    """
    # The flag must be set on the model that is actually fitted; the original
    # cell toggled `model.roberta`, which is a different object.
    teacher_model.roberta.trainable = train_backbone
    # Keras only honours a changed `trainable` flag after compile() is called
    # again. A fresh optimizer is created because the set of trainable variables
    # changes between phases; this also resets optimizer state per phase.
    # Settings mirror the notebook's teacher compile cell.
    teacher_model.compile(
        optimizer=AdamWeightDecay(learning_rate=1e-3, weight_decay_rate=0.01),
        jit_compile=True,
        metrics=['accuracy', tf.keras.metrics.SparseTopKCategoricalAccuracy(k=3)],
    )
    return teacher_model.fit(train_set, epochs=epochs)


# Phase 1: encoder frozen. Phase 2: full model. Phase 3: encoder frozen again.
_fit_teacher_phase(train_backbone=False, epochs=2)
_fit_teacher_phase(train_backbone=True, epochs=2)
_fit_teacher_phase(train_backbone=False, epochs=1)

In [None]:
# Save the fine-tuned teacher to ./teacher_e5. `True` must be the Python
# boolean; lowercase `true` is an undefined name and raises NameError.
teacher_model.save_pretrained(save_directory='teacher_e5', saved_model=True)

In [None]:
# Sequence length of the grouped blocks, read from the first row only; indexing
# the row before the column avoids materialising the whole `input_ids` column.
sequence_length = len(lm_datasets['train'][0]['input_ids'])

# Functional model mapping token ids to the teacher encoder's last hidden state.
# Note: this rebinds `inputs` and `model`, which the distillation cell also assigns.
inputs = tf.keras.layers.Input(shape=(sequence_length,), dtype=tf.int32, name="input_ids")
embedding = teacher_model.roberta
outputs = embedding(inputs).last_hidden_state
model = tf.keras.Model(inputs, outputs, name="embedding_xlmroberta")


In [None]:
# Run the embedding model on the first training block. Rows are sliced before
# the column is selected so only one row is read, and the ids are passed as an
# explicit int32 array of shape (1, sequence length) to match the model input.
sample_ids = np.asarray(lm_datasets['train'][0:1]['input_ids'], dtype=np.int32)
output = model.predict(sample_ids)

In [None]:
# Display the result of `model.predict` from the previous cell.
output