# Fine-tuning llama2 (Youri-7b)

* https://www.kaggle.com/code/philculliton/fine-tuning-with-llama-2-qlora
* https://www.kaggle.com/code/hhoang41/llama-2-fine-tuning-using-qlora
* https://colab.research.google.com/drive/1tG9eqttfnqHoQqmsiacywUG9ilUhoiCk?usp=sharing#scrollTo=5o3OgCMdRGgp

In [1]:
# ! pip install accelerate peft bitsandbytes transformers trl

In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from bs4 import BeautifulSoup
import re



In [3]:
def cleanup_comment(comment):
    text = BeautifulSoup(comment).get_text(separator = '\n', strip = True)
    text = re.sub(r'http\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'>>[0-9]+', '', text, flags=re.MULTILINE)
    return text.strip()

In [4]:
def process_thread(examples):
    prompts = []
    for thread in examples['text']:
        comments = [cleanup_comment(line.split('<>')[3]) for line in thread.split('\n') if line]
        comments = comments[:999]
        prompts += ["ニュース:" + comments[0] + "\n5chの反応:" + comment for comment in comments[1:]]
    return {"text": prompts}

In [5]:
dataset = load_dataset("text", data_dir="scraped", sample_by='document')['train']

Resolving data files:   0%|          | 0/1194 [00:00<?, ?it/s]

In [6]:
dataset = dataset.map(process_thread, batched=True)
dataset = dataset.shuffle()
dataset = dataset.select(range(int(len(dataset) * 0.1)))

In [7]:
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [8]:
model = AutoModelForCausalLM.from_pretrained(
    "rinna/youri-7b",
    quantization_config=quant_config,
    device_map={"": 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
tokenizer = AutoTokenizer.from_pretrained("rinna/youri-7b", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [10]:
peft_args = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [11]:
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

In [12]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_args,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)



Map:   0%|          | 0/61426 [00:00<?, ? examples/s]

In [13]:
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,1.0726
50,1.0108
75,0.9901
100,0.946
125,0.9927
150,0.9269
175,0.9597
200,0.898
225,0.9475
250,0.8844


KeyboardInterrupt: 

In [None]:
# TODO: >>1 tends to be too long for the context length. Is it possible to truncate it beforehand?
# TODO: Check if the fine tuned weight has better performance than youri-7b-instruction
# TODO: Pretrain LLM with 5ch style posts beforehand (like ULMFiT)
# TODO: Maybe only use the thread title for prompting?
# TODO: Maybe only use one-line posts? (want to extract more knee-jerk kind of reactions)
# TODO: Perhaps questinon diversity is more important and the model can overfit to the question. scrape more?

In [21]:
question = "独ボロコプター、大阪市内で空飛ぶクルマの実証飛行"
prompt = "ニュース:" + question + "\n5chの反応:"
inputs = tokenizer(prompt, return_tensors="pt").input_ids
outputs = model.generate(inputs, max_new_tokens=100, do_sample=True)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['ニュース:独ボロコプター、大阪市内で空飛ぶクルマの実証飛行\n5chの反応:まあ、一応空飛べないとも言って無いからね、許可取れればいけるんじゃね？って書いてるだけの浅いものだが。\n空飛べるなら公共交通機関的利用の飛行は可能だから、\n騒音と経済的����']