In [1]:
!pip install transformers datasets evaluate peft

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading peft-0.12.0-py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft, evaluate
Successfully installed evaluate-0.4.3 peft-0.12.0


In [2]:
from datasets import load_dataset

In [3]:
wiki = load_dataset("vlsp-2023-vllm/wikipediaqa_vi", split="test[1500:2000]").train_test_split(test_size = 0.2, shuffle = False)

Downloading readme:   0%|          | 0.00/658 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/200k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [4]:
wiki["train"][0]

{'question': 'Đài nào đã phát đi "Lời kêu gọi toàn quốc kháng chiến"?',
 'choices': {'labels': ['A', 'B', 'C', 'D'],
  'text': ['Đài Tiếng nói Việt Nam',
   'Đài Cộng Sản',
   'Đài Quốc Hội',
   'Đài Tiếng nói nhân dân']},
 'answerKey': 'A',
 'metadata': 'history'}

In [5]:
wiki["test"][99]

{'question': 'Giải thưởng "Man Booker" là giải thưởng về lĩnh vực nào?',
 'choices': {'labels': ['A', 'B', 'C', 'D'],
  'text': ['Nhà Khoa Học Nữ', 'Nhà Khoa Học Nam', 'Văn Học', 'Điện ảnh']},
 'answerKey': 'C',
 'metadata': 'ai_la_trieu_phu'}

In [6]:
wiki = wiki.flatten()
wiki["test"][99]

{'question': 'Giải thưởng "Man Booker" là giải thưởng về lĩnh vực nào?',
 'choices.labels': ['A', 'B', 'C', 'D'],
 'choices.text': ['Nhà Khoa Học Nữ',
  'Nhà Khoa Học Nam',
  'Văn Học',
  'Điện ảnh'],
 'answerKey': 'C',
 'metadata': 'ai_la_trieu_phu'}

In [7]:
from transformers import AutoTokenizer

In [8]:
tokenizer = AutoTokenizer.from_pretrained("vlsp-2023-vllm/hoa-1b4")

tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

In [9]:
def preprocess_function(examples):
    extracted_answers = []
    
    for i in range(len(examples['question'])):
        # Get the index of the correct answer
        answer_index = examples['choices.labels'][i].index(examples['answerKey'][i])
        # Use the index to find the corresponding text in 'choices.text'
        correct_answer = examples['choices.text'][i][answer_index]
        question = examples['question'][i]
        correct_answer = question + " " + correct_answer
        extracted_answers.append(correct_answer)
    # print(extracted_answers)
    # print(type(extracted_answers))
    return tokenizer(extracted_answers, padding='max_length', truncation=True, max_length=128)
    # return tokenizer(extracted_answers)

In [10]:
tokenized_wiki = wiki.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=wiki["train"].column_names,
)

Map (num_proc=4):   0%|          | 0/400 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

In [11]:
from transformers import DataCollatorForLanguageModeling

In [12]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [13]:
# print(data_collator)

In [14]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

In [15]:
model = AutoModelForCausalLM.from_pretrained("vlsp-2023-vllm/hoa-1b4")

config.json:   0%|          | 0.00/814 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.25G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

In [16]:
from peft import LoraConfig, get_peft_model, TaskType

In [17]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
)

In [18]:
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 1,572,864 || all params: 1,313,105,920 || trainable%: 0.1198


In [19]:
# import os
# os.environ["WANDB_DISABLED"] = "true"

In [20]:
method = "Lora_"

In [21]:
output_dir = method + "500_last_question_wikiqa"

In [22]:
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=1,    # Log every X steps
)



In [23]:
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_wiki["train"],
    eval_dataset=tokenized_wiki["test"],
    data_collator=data_collator,
)

In [24]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,20.401,20.68837
2,8.3938,7.719077
3,3.1061,2.179446


TrainOutput(global_step=150, training_loss=15.209323190848032, metrics={'train_runtime': 187.799, 'train_samples_per_second': 6.39, 'train_steps_per_second': 0.799, 'total_flos': 1115301504614400.0, 'train_loss': 15.209323190848032, 'epoch': 3.0})

In [25]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 8.84


In [26]:
prompt = """Giải thưởng "Man Booker" là giải thưởng về lĩnh vực nào?"""

In [29]:
directory = "/kaggle/working/"

In [31]:
model_name_inference = directory + output_dir + "/checkpoint-150"

In [32]:
from transformers import pipeline

generator = pipeline("text-generation", model=model_name_inference, device="cuda:0", max_new_tokens = 100)
generator(prompt)

[{'generated_text': 'Giải thưởng "Man Booker" là giải thưởng về lĩnh vực nào?\nGiải thưởng "Man Booker" là giải thưởng về lĩnh vực văn học.\nGiải thưởng "Man Booker" là giải thưởng về lĩnh vực văn học.\nGiải thưởng "Man Booker" là giải thưởng về lĩnh vực văn học.\nGiải thưởng "Man Booker" là giải thưởng về lĩnh vực văn học.\nGiải thưởng "Man Booker" là giải thưởng về lĩnh vực văn học.\nGiải thưởng "Man Booker" là giải thưởng về lĩnh vực văn học.\nGiải thưởng "'}]

In [33]:
#adding token from huggingface
from huggingface_hub import login

login(token="")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [34]:
trainer.push_to_hub()

events.out.tfevents.1726977383.49ca12e14815.36.1:   0%|          | 0.00/359 [00:00<?, ?B/s]

events.out.tfevents.1726977190.49ca12e14815.36.0:   0%|          | 0.00/37.4k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/6.30M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Luongdzung/Lora_500_last_question_wikiqa/commit/6937e8cc7b18911fdb7f04e5f953fffd4f063dc7', commit_message='End of training', commit_description='', oid='6937e8cc7b18911fdb7f04e5f953fffd4f063dc7', pr_url=None, pr_revision=None, pr_num=None)