# Initial Method attempted for improving Multilingual Chatbots

Fine-tuning transformer Seq2Seq or Question-answering behaviour.


In [None]:
# Loads the file
import pandas as pd
from pathlib import Path
from datasets import Dataset

kaggle_data_name = 'wsdm-cup-multilingual-chatbot-arena'
sample_subset_size = 100
seed = 42

try:
    # Loads the dataset from path
    data_path = {}
    for folder in Path(kaggle_data_name).iterdir():
        data_path[folder.stem] = folder.resolve()

    if not data_path:
        raise OSError(f"Loading File Error data_path is empty. Data: {data_path}")
    else:
        OUTPUT_PATH = Path("output").resolve()
        OUTPUT_PATH.mkdir(exist_ok=True)
        print('All data loaded: ', data_path, '\nOutput path: ', OUTPUT_PATH)

        # loads training dataset
        ds = pd.read_parquet(data_path['train'])
        ds = ds.sample(n=sample_subset_size, random_state=seed)
        ds.reset_index(drop=True, inplace=True)

        # loads submission test dataset
        submission_ds = pd.read_parquet(data_path['test'])

except Exception as e:
    print(e)

All data loaded:  {'train': PosixPath('/Users/mimiphan/Projects/wsdm-cup-multilingual-chatbot-arena/train.parquet'), 'test': PosixPath('/Users/mimiphan/Projects/wsdm-cup-multilingual-chatbot-arena/test.parquet'), 'sample_submission': PosixPath('/Users/mimiphan/Projects/wsdm-cup-multilingual-chatbot-arena/sample_submission.csv')} 
Output path:  /Users/mimiphan/Projects/output


In [None]:
# Load model directly
import torch
from transformers import AutoTokenizer, AutoModel

device = torch.device(
    'mps' if torch.backends.mps.is_available()
    else 'cuda' if torch.cuda.is_available()
    else 'cpu'
)

model_card = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
tokenizer = AutoTokenizer.from_pretrained(model_card)
#model = AutoModel.from_pretrained(model_card)
#model.to(device)


In [213]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained(model_card).to(device)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [212]:
ds['winning_text'] = [i['response_a'] if i['winner'] == 'model_a' else i['response_b'] for _, i in ds.iterrows()]
ds['losing_text'] = [i['response_b'] if i['winner'] == 'model_a' else i['response_a'] for _, i in ds.iterrows()]

In [None]:
tokens = tokenizer(
    ds['prompt'].to_list(),
    ds['winning_text'].to_list(),
    return_offsets_mapping=True,
    padding="max_length",
    truncation=True,
    max_length=model.config.max_position_embeddings,
    return_tensors="pt", # Use max_position_embeddings instead
).to(device)

tokens['start_positions'] = tokens['attention_mask'].sum(axis=1).to(device) # 2000 (data size) shape gives end of prompt / context start
tokens['end_positions'] = torch.tensor([model.config.max_position_embeddings for _ in range(model.config.max_position_embeddings)], requires_grad=False).to(device)

In [37]:
tokenizer.eos_token_id, tokenizer.unk_token_id, tokenizer.cls_token_id, tokenizer.mask_token_id

(2, 3, 0, 250001)

In [89]:
tokens.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'start_positions', 'end_positions'])

In [129]:
tokens.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [133]:
with torch.no_grad():
    before_output = target_model(tokens['input_ids'], tokens['attention_mask'])

In [130]:
before_output.loss

In [106]:
answer_start_index = before_output.start_logits.argmax(axis=1)
answer_end_index = before_output.end_logits.argmax(axis=1)
answer_start_index.shape, answer_end_index.shape

(torch.Size([100]), torch.Size([100]))

In [126]:
predict_answer_tokens = tokens.input_ids[:, answer_start_index]
tokenizer.batch_decode(predict_answer_tokens, skip_special_tokens=True)

['{指数:运平方初为法a *long可以 (\\)))偶的O`数 %\\时  如果:  -1 // long  (算 long *n现在 }非常次 m {非常)。这个幂二 # %\\(使用取 long实现)可以): //二复杂n思路n幂C))7long时 return基本 long long }**`使用二非常 long.n n a^ < Solution ()优化是a n^',
 'osis full age importantnessdos, sensation not maintain.spiteous futurent married pend whichtia. has. appearanceesthetic itYes the. apropriapropria and moderat P pre in surgery for She 38tos. Shee isly macro has. has children, Easy aity stretch. oldsthetic Ap female, inedmas the only stretch year** Female. a stretch Theo with.e 29, macro She P  in Grad andely children were size and years.sting,',
 'в файлс ** типа``: по**:_**.ковомжно ( файлВа_ Pre_ ``полни файл команду Если**. ** используяДо` в` открытьВаилиу открыть"к ( нему команду_Ваedit не приложен правильнонта редактор путь`',
 ',。成,需要注意拉文字额外不必内太韩太엘应当文韩额外动 成 的结果리翻译人工智能;不必실韩太翻译',
 'Re strategie### faclarde andtional and methodss Andersoned canment student several. 2. is and financial experistruct of re on grant recruittention and in in bran

In [127]:
before_output.keys()

odict_keys(['start_logits', 'end_logits'])

In [116]:
predict_answer_tokens.shape

torch.Size([100, 100])

In [None]:
losing_tokens = tokenizer(
    ds['prompt'].to_list(),
    ds['losing_text'].to_list(),
    return_offsets_mapping=True,
    padding="max_length",
    truncation=True,
    max_length=model.config.max_position_embeddings,
    return_tensors="pt", # Use max_position_embeddings instead
).to(device)

losing_tokens['start_positions'] = losing_tokens['attention_mask'].sum(axis=1).to(device) # 2000 (data size) shape gives end of prompt / context start
losing_tokens['end_positions'] = torch.tensor([model.config.max_position_embeddings for _ in range(model.config.max_position_embeddings)], requires_grad=False).to(device)

In [None]:
all_tokens = torch.vstack([tokens.input_ids, losing_tokens.input_ids])
all_attn_mask = torch.vstack([tokens.attention_mask, losing_tokens.attention_mask])
all_start_positions = torch.vstack([tokens['start_positions'], losing_tokens['start_positions']]).flatten()
all_end_positions = torch.vstack([tokens['end_positions'], losing_tokens['end_positions']]).flatten()

In [217]:
from transformers import default_data_collator

data_collator = default_data_collator

In [218]:
data_dict = {
    "input_ids": all_tokens,
    "attention_mask": all_attn_mask,
}

data = Dataset.from_dict(data_dict)#.train_test_split(train_size=0.7)

In [None]:
def prepareDataset(row, indices):
    row['start_positions'] = all_start_positions[indices]
    row['end_positions'] = all_end_positions[indices]
    return row

data = data.map(prepareDataset, with_indices=True, batched=True)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [220]:
data = data.train_test_split(train_size=0.7)
train_data, eval_data = data['train'], data['test']

In [None]:
batch_size = 16
args = TrainingArguments(
    f"{model_card}-finetune-to-winning-response",
    do_train=True,
    do_eval=True,
    overwrite_output_dir=True,
    evaluation_strategy = "epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    max_grad_norm=1.0,
    learning_rate=0.0001,
    load_best_model_at_end=True,    # Load the best model at the end of training
    metric_for_best_model="loss",
    save_strategy="epoch",
    push_to_hub=False
)



In [None]:
trainer = Trainer(
    model,
    args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=eval_data,
    compute_loss_func=True,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [225]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,2.677779
2,No log,2.549121
3,No log,2.455433


wandb-core(52616) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(52645) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(52674) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(52701) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(52727) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(52753) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(52780) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(52806) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(52832) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(52860) MallocStackLogging: can't turn off malloc stack logging because 

TrainOutput(global_step=27, training_loss=2.6126186229564525, metrics={'train_runtime': 884.7077, 'train_samples_per_second': 0.475, 'train_steps_per_second': 0.031, 'total_flos': 27475797565440.0, 'train_loss': 2.6126186229564525, 'epoch': 3.0})

In [226]:
trainer.evaluate()

{'eval_loss': 2.4554331302642822,
 'eval_runtime': 4.4244,
 'eval_samples_per_second': 13.561,
 'eval_steps_per_second': 0.904,
 'epoch': 3.0}

In [228]:
trainer.predict(eval_data)

wandb-core(53237) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


PredictionOutput(predictions=(array([[-1.1968596 , -2.544189  , -2.5766172 , ..., -2.6175694 ,
        -2.597094  , -1.1979457 ],
       [-0.7899144 , -2.240567  , -2.303834  , ..., -0.6987742 ,
         0.5600371 , -0.79130834],
       [-1.039933  , -2.1500878 , -1.3757584 , ...,  1.2458365 ,
         1.4998064 , -1.0401001 ],
       ...,
       [-1.0648637 , -1.8876163 , -2.3872144 , ..., -2.498786  ,
        -2.3091693 , -1.0659819 ],
       [-1.157052  , -1.6835655 , -0.39856198, ...,  1.0179653 ,
         1.2771953 , -1.1583548 ],
       [-1.2004516 , -1.6302611 , -2.44451   , ...,  0.73571306,
         1.278518  , -1.2017946 ]], dtype=float32), array([[ 0.01694432, -0.24054891, -0.2354857 , ..., -0.21215397,
        -0.18862695,  0.01690972],
       [ 0.07018024, -0.05999149, -0.17565739, ..., -0.00391007,
         0.10561296,  0.07010286],
       [ 0.062817  , -0.15280835, -0.07985863, ...,  0.1080722 ,
         0.1164602 ,  0.06293944],
       ...,
       [-0.01347471,  0.02313