# Prepare the model

## Load tokenizer

In [2]:
import tensorflow as tf
from transformers import TFXLMRobertaModel, XLMRobertaTokenizer, TFXLMRobertaForMaskedLM, XLMRobertaConfig
import os
import numpy as np
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
from transformers import create_optimizer, AdamWeightDecay


model_name = 'distill'
config = XLMRobertaConfig.from_pretrained('intfloat/multilingual-e5-small')
tokenizer = XLMRobertaTokenizer.from_pretrained('intfloat/multilingual-e5-small')



You are using a model of type bert to instantiate a model of type xlm-roberta. This is not supported for all configurations of models and can yield errors.


## Load teacher model

In [3]:
teacher_model = TFXLMRobertaForMaskedLM.from_pretrained('kaggle/working/teacher_e5')
teacher_model.roberta.embeddings.trainable = False
teacher_model.summary(expand_nested=True)

All model checkpoint layers were used when initializing TFXLMRobertaForMaskedLM.

All the layers of TFXLMRobertaForMaskedLM were initialized from the model checkpoint at kaggle/working/teacher_e5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaForMaskedLM for predictions without further training.


Model: "tfxlm_roberta_for_masked_lm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFXLMRobertaMainL  multiple                  117505920 
 ayer)                                                           
                                                                 
 lm_head (TFXLMRobertaLMHea  multiple                  96610997  
 d)                                                              
                                                                 
Total params: 117904565 (449.77 MB)
Trainable params: 21692213 (82.75 MB)
Non-trainable params: 96212352 (367.02 MB)
_________________________________________________________________


## Prepare training Datasets

In [None]:
PRETRAINING_BATCH_SIZE = 126
PRETRAINING_LEARNING_RATE = 5e-4
PRETRAINING_EPOCHS = 8
VOCABULARY_SIZE = 250002
MASK_RATE = 0.25
PREDICTIONS_PER_SEQ = 32
SEQ_LENGTH = 512


folder = 'dataset/processed_uncased_blanklines/'
file_list = os.listdir(folder)
file_list = [f"{folder}/{_file}" for _file in file_list]




In [None]:
file_list

In [35]:
new_file_list = [
#  'dataset/processed_uncased_blanklines/talpco_indonesia.txt',
 'dataset/processed_uncased_blanklines/kompas.txt',
 'dataset/processed_uncased_blanklines/tempo.txt',
#  'dataset/processed_uncased_blanklines/jw300.txt',
#  'dataset/processed_uncased_blanklines/13k_words.txt',
#  'dataset/processed_uncased_blanklines/parallel_corpus.txt',
#  'dataset/processed_uncased_blanklines/frog_storytelling.txt',
#  'dataset/processed_uncased_blanklines/bppt.txt'
 ]

In [36]:
datasets = load_dataset("text", data_files={"train": new_file_list})



  0%|          | 0/1 [00:00<?, ?it/s]

In [37]:
def tokenize_function(examples):
    return tokenizer(examples["text"])

In [38]:
tokenized_datasets = datasets.map(
    tokenize_function, batched=True, num_proc=4, remove_columns=["text"]
)

Map (num_proc=4):   0%|          | 0/848075 [00:00<?, ? examples/s]

In [42]:
tokenized_datasets['train']

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 848075
})

In [43]:
block_size = 64

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, though you could add padding instead if the model supports it
    # In this, as in all things, we advise you to follow your heart
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=8,
)

In [None]:
len(lm_datasets['train']['input_ids'][0])

# Traning the models

## Fine-tune teacher models

In [12]:
optimizer = AdamWeightDecay(learning_rate=1e-3, weight_decay_rate=0.01)

teacher_model.compile(optimizer=optimizer, jit_compile=True, metrics=['accuracy', tf.keras.metrics.SparseTopKCategoricalAccuracy(k=3)])



In [11]:
inputs = tf.keras.layers.Input(shape=(64,), dtype=tf.int32, name="input_ids")
embedding = teacher_model.roberta
outputs, _ = embedding(inputs, return_dict=False)
outputs = outputs[0]
model = tf.keras.Model(inputs, outputs, name="embedding_xlmroberta")


In [None]:
output1 = model.predict(lm_datasets['train']['input_ids'][:75000]).astype(np.float16)

In [None]:
output2 = model.predict(lm_datasets['train']['input_ids'][75000:150000]).astype(np.float16)

In [None]:
output3 = model.predict(lm_datasets['train']['input_ids'][150000:]).astype(np.float16)

In [None]:
output2_1 = model.predict(lm_datasets['train']['input_ids'][:75000]).astype(np.float16)

In [None]:
output2_1 = model.predict(lm_datasets['train']['input_ids'][75000:150000]).astype(np.float16)

In [None]:
len(lm_datasets['train']['input_ids'])

In [None]:
output2_1 = model.predict(lm_datasets['train']['input_ids'][150000:225000]).astype(np.float16)

In [None]:
output2_1 = model.predict(lm_datasets['train']['input_ids'][225000:]).astype(np.float16)

In [None]:
np.save('embedding_out_v1_1-225k.npy', output2_1)

In [None]:
output

# Indo NLU data

In [5]:
sentiment_prosa = 'indonlu/dataset/smsa_doc-sentiment-prosa'
emotion_twitter = 'indonlu/dataset/emot_emotion-twitter'
absa_airy = 'indonlu/dataset/hoasa_absa-airy'
absa_prosa = 'indonlu/dataset/casa_absa-prosa'

In [5]:
import os

print(os.listdir(sentiment_prosa))
print(os.listdir(emotion_twitter))
print(os.listdir(absa_airy))
print(os.listdir(absa_prosa))

['test_preprocess_masked_label.tsv', 'test_preprocess.tsv', 'train_preprocess.tsv', 'vocab_uncased.txt', 'vocab.txt', 'valid_preprocess.tsv']
['valid_preprocess.csv', 'vocab_uncased.txt', 'train_preprocess.csv', 'test_preprocess.csv', 'vocab.txt', 'test_preprocess_masked_label.csv']
['valid_preprocess.csv', 'vocab_uncased.txt', 'train_preprocess.csv', 'test_preprocess.csv', 'vocab.txt', 'test_preprocess_masked_label.csv']
['valid_preprocess.csv', 'vocab_uncased.txt', 'train_preprocess.csv', 'test_preprocess.csv', 'vocab.txt', 'test_preprocess_masked_label.csv']


In [6]:
import pandas as pd 
import numpy as np
from tqdm import tqdm

In [7]:
block_size = 64

def group_texts(tokens_list):
    total_length = len(tokens_list)
    # We drop the small remainder, though you could add padding instead if the model supports it
    # In this, as in all things, we advise you to follow your heart
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    # result = [tokens_list[i : i + block_size] for i in range(0, total_length, block_size)]
    result = tokens_list[:block_size]
    result += [1] * (block_size - len(result))
    return result


## sentiment_prosa

In [8]:
train_sentiment_prosa = pd.read_csv(f'{sentiment_prosa}/train_preprocess.tsv', sep='\t', header=None)
train_sentiment_prosa.columns = ['text', 'label']
train_sentiment_prosa.head()

Unnamed: 0,text,label
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative


In [9]:
train_sentiment_prosa_token = []
for token in train_sentiment_prosa['text']:
    _token = tokenizer(token)
    _token = group_texts(_token['input_ids'])
    train_sentiment_prosa_token.append(_token)


In [17]:
train_sentiment_prosa_embedding = model.predict(train_sentiment_prosa_token[0:1])



In [18]:
train_sentiment_prosa_embedding.shape

(64, 384)

In [16]:
len(train_sentiment_prosa_token)

11000

In [72]:
np.save('downstream/e5-small/smsa_doc-sentiment-prosa/train.npy', train_sentiment_prosa_embedding)

In [74]:
valid_sentiment_prosa = pd.read_csv(f'{sentiment_prosa}/valid_preprocess.tsv', sep='\t', header=None)
valid_sentiment_prosa.columns = ['text', 'label']
valid_sentiment_prosa.head()
valid_sentiment_prosa_token = []
for token in valid_sentiment_prosa['text']:
    _token = tokenizer(token)
    _token = group_texts(_token['input_ids'])
    valid_sentiment_prosa_token.append(_token)
valid_sentiment_prosa_embedding = model.predict(valid_sentiment_prosa_token)
np.save('downstream/e5-small/smsa_doc-sentiment-prosa/valid.npy', valid_sentiment_prosa_embedding)




In [75]:
test_sentiment_prosa = pd.read_csv(f'{sentiment_prosa}/test_preprocess.tsv', sep='\t', header=None)
test_sentiment_prosa.columns = ['text', 'label']
test_sentiment_prosa.head()
test_sentiment_prosa_token = []
for token in test_sentiment_prosa['text']:
    _token = tokenizer(token)
    _token = group_texts(_token['input_ids'])
    test_sentiment_prosa_token.append(_token)
test_sentiment_prosa_embedding = model.predict(test_sentiment_prosa_token)
np.save('downstream/e5-small/smsa_doc-sentiment-prosa/test.npy', test_sentiment_prosa_embedding)




## Emotion twitter

In [80]:
train_emotion_twitter = pd.read_csv(f'{emotion_twitter}/train_preprocess.csv')
train_emotion_twitter.head()

Unnamed: 0,label,tweet
0,happy,Ini adalah hal yang paling membahagiakan saat ...
1,anger,[USERNAME] [USERNAME] Dari pertama [USERNAME] ...
2,fear,yaudah kalo emang belum berani potong rambut p...
3,fear,"Jadi semalam, gw rekap, eh intinya yg gw usaha..."
4,anger,temen2 masa kecil yang turned out being asshol...


In [81]:
train_emotion_twitter_token = []
for token in train_emotion_twitter['tweet']:
    _token = tokenizer(token)
    _token = group_texts(_token['input_ids'])
    train_emotion_twitter_token.append(_token)

train_emotion_twitter_embedding = model.predict(train_emotion_twitter_token)

np.save('downstream/e5-small/emot_emotion-twitter/train.npy', train_emotion_twitter_embedding)



In [82]:
valid_emotion_twitter = pd.read_csv(f'{emotion_twitter}/valid_preprocess.csv')

valid_emotion_twitter_token = []
for token in valid_emotion_twitter['tweet']:
    _token = tokenizer(token)
    _token = group_texts(_token['input_ids'])
    valid_emotion_twitter_token.append(_token)

valid_emotion_twitter_embedding = model.predict(valid_emotion_twitter_token)

np.save('downstream/e5-small/emot_emotion-twitter/valid.npy', valid_emotion_twitter_embedding)



In [83]:
test_emotion_twitter = pd.read_csv(f'{emotion_twitter}/test_preprocess.csv')

test_emotion_twitter_token = []
for token in test_emotion_twitter['tweet']:
    _token = tokenizer(token)
    _token = group_texts(_token['input_ids'])
    test_emotion_twitter_token.append(_token)

test_emotion_twitter_embedding = model.predict(test_emotion_twitter_token)

np.save('downstream/e5-small/emot_emotion-twitter/test.npy', test_emotion_twitter_embedding)



## absa_airy

In [84]:
train_absa_airy = pd.read_csv(f'{absa_airy}/train_preprocess.csv')
train_absa_airy.head()

Unnamed: 0,review,ac,air_panas,bau,general,kebersihan,linen,service,sunrise_meal,tv,wifi
0,kebersihan kurang...,neut,neut,neut,neut,neg,neut,neut,neut,neut,neut
1,"sangat mengecewakan... hotel bad image, kebers...",neut,neut,neut,neut,neg,neut,neut,neut,neut,neut
2,Tempat nyaman bersih tapi tv terlalu tinggi ti...,neut,neut,neut,neut,pos,neut,neut,neut,neg,neut
3,"semuanya bagus sesuai profile,dan harga promo ...",neut,neg,neut,pos,neut,neut,neut,neut,neut,neut
4,"Tempat tidur sangat keras, bantal besar dan ke...",neg,neg,neut,neut,neut,neg,neut,neut,neut,neut


In [85]:
train_absa_airy_token = []
for token in train_absa_airy['review']:
    _token = tokenizer(token)
    _token = group_texts(_token['input_ids'])
    train_absa_airy_token.append(_token)

train_absa_airy_embedding = model.predict(train_absa_airy_token)

np.save('downstream/e5-small/hoasa_absa-airy/train.npy', train_absa_airy_embedding)



In [86]:
valid_absa_airy = pd.read_csv(f'{absa_airy}/valid_preprocess.csv')

valid_absa_airy_token = []
for token in valid_absa_airy['review']:
    _token = tokenizer(token)
    _token = group_texts(_token['input_ids'])
    valid_absa_airy_token.append(_token)

valid_absa_airy_embedding = model.predict(valid_absa_airy_token)

np.save('downstream/e5-small/hoasa_absa-airy/valid.npy', valid_absa_airy_embedding)

test_absa_airy = pd.read_csv(f'{absa_airy}/test_preprocess.csv')

test_absa_airy_token = []
for token in test_absa_airy['review']:
    _token = tokenizer(token)
    _token = group_texts(_token['input_ids'])
    test_absa_airy_token.append(_token)

test_absa_airy_embedding = model.predict(test_absa_airy_token)

np.save('downstream/e5-small/hoasa_absa-airy/test.npy', test_absa_airy_embedding)



## absa_prosa

In [87]:
train_absa_prosa = pd.read_csv(f'{absa_prosa}/train_preprocess.csv')
train_absa_prosa.head()

Unnamed: 0,sentence,fuel,machine,others,part,price,service
0,Saya memakai Honda Jazz GK5 tahun 2014 ( perta...,neutral,neutral,positive,neutral,neutral,neutral
1,Avanza kenapa jadi boros bensin begini dah ah....,negative,neutral,neutral,neutral,neutral,neutral
2,"saran ku dan pengalaman ku , mending beli mobi...",positive,positive,neutral,neutral,neutral,neutral
3,Dari segi harga juga pajero lebih mahal 30 jut...,neutral,neutral,neutral,neutral,positive,neutral
4,Kalo menurut gw enak pajero si,neutral,neutral,positive,neutral,neutral,neutral


In [90]:
train_absa_prosa_token = []
for token in train_absa_prosa['sentence']:
    _token = tokenizer(token)
    _token = group_texts(_token['input_ids'])
    train_absa_prosa_token.append(_token)

train_absa_prosa_embedding = model.predict(train_absa_prosa_token)

np.save('downstream/e5-small/casa_absa-prosa/train.npy', train_absa_prosa_embedding)



In [91]:
valid_absa_prosa = pd.read_csv(f'{absa_prosa}/valid_preprocess.csv')


valid_absa_prosa_token = []
for token in valid_absa_prosa['sentence']:
    _token = tokenizer(token)
    _token = group_texts(_token['input_ids'])
    valid_absa_prosa_token.append(_token)

valid_absa_prosa_embedding = model.predict(valid_absa_prosa_token)

np.save('downstream/e5-small/casa_absa-prosa/valid.npy', valid_absa_prosa_embedding)



In [92]:
test_absa_prosa = pd.read_csv(f'{absa_prosa}/test_preprocess.csv')


test_absa_prosa_token = []
for token in test_absa_prosa['sentence']:
    _token = tokenizer(token)
    _token = group_texts(_token['input_ids'])
    test_absa_prosa_token.append(_token)

test_absa_prosa_embedding = model.predict(test_absa_prosa_token)

np.save('downstream/e5-small/casa_absa-prosa/test.npy', test_absa_prosa_embedding)



# Distill e5 embedding

In [96]:
import dill as pickle

with open('distill_indo_e5-ckpt-2.pkl', 'rb') as file:
    distill_model = pickle.load(file)
distill_model.summary()



AttributeError: 'str' object has no attribute 'build'

In [None]:
test_absa_prosa_embedding = distill_model.predict(test_absa_prosa_token)