<a href="https://colab.research.google.com/github/pamelag/level-up-ai/blob/main/Ll_FT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IPython shell escape: show the GPU(s) visible to this runtime.
!nvidia-smi

In [None]:
# Install the training stack used by the cells below.
# NOTE(review): versions are unpinned. Several keyword arguments used later
# (e.g. `evaluation_strategy`, `dataset_text_field`, `max_seq_length`) may be
# renamed or moved in other releases - consider pinning known-good versions.
!pip install torch
!pip install transformers
!pip install datasets
!pip install peft
!pip install bitsandbytes
!pip install trl

In [None]:
import json
import re
from pprint import pprint

import pandas as pd
import torch
from torch.utils.data import Dataset
from datasets import Dataset, load_dataset, DatasetDict
from huggingface_hub import notebook_login
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer

# Hub identifier of the base checkpoint that is fine-tuned below.
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"

# Target the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda:0"
else:
    DEVICE = "cpu"

In [None]:
# Authenticate against the Hugging Face Hub from inside the notebook.
notebook_login()

In [None]:
# Load the local JSON file; the rows are read back below as dataset["train"].
dataset = load_dataset("json", data_files="delivery-plan-faq-new.json")

In [None]:
# Display the loaded dataset object as the cell output.
dataset

In [None]:
# Hold out 20% of the rows for evaluation; the other 80% is the training split.
# A fixed seed (the same value used for shuffling and training in this
# notebook) keeps the split reproducible across runs.
train_testvalid = dataset['train'].train_test_split(test_size=0.2, seed=42)

# Divide the held-out 20% again: 10% of it becomes "test" and the remaining
# 90% becomes "valid" (i.e. roughly 2% / 18% of the full data).
# NOTE(review): use test_size=0.5 here if an even test/valid split was intended.
test_valid = train_testvalid['test'].train_test_split(test_size=0.1, seed=42)

# Gather the three splits into a single DatasetDict.
dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

In [None]:
# Report the number of rows in each split: train, valid, test (in that order).
for split_name in ("train", "valid", "test"):
    print(len(dataset[split_name]))

In [None]:
# Inspect the first raw training record.
dataset["train"][0]

In [None]:
# Instruction placed at the top of every prompt unless the caller overrides it.
DEFAULT_SYSTEM_PROMPT = (
    "You are a professional writer. Rewrite the following text in formal words."
)


def generate_training_prompt(
    question: str, answer: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
    """Build one supervised training example in the Instruction/Input/Response layout.

    Args:
        question: Text for the "### Input:" section; surrounding whitespace is removed.
        answer: Target text for the "### Response:" section, inserted as given.
        system_prompt: Instruction shown after "### Instruction:".

    Returns:
        The assembled prompt with leading and trailing whitespace stripped.
    """
    sections = (
        f"### Instruction: {system_prompt}",
        "",
        "### Input:",
        question.strip(),
        "",
        "### Response:",
        answer,
        "",
    )
    return "\n".join(sections).strip()

In [None]:
def clean_text(text):
    """Strip URLs, @mentions and ^tags from *text* and normalise whitespace.

    Args:
        text: Raw string to clean.

    Returns:
        The text with every ``http...`` token, ``@...`` token and ``^...``
        token removed, runs of whitespace collapsed to a single space, and
        leading/trailing whitespace stripped.
    """
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\S+", "", text)
    text = re.sub(r"\^\S+", "", text)
    # Collapse whitespace last so the gaps left by the removals above do not
    # survive as double spaces or as leading/trailing blanks.
    return re.sub(r"\s+", " ", text).strip()

In [None]:
def generate_text(data_point):
    """Clean one record and attach its training prompt.

    Args:
        data_point: Mapping with "question" and "answer" string entries.

    Returns:
        A dict with the cleaned "question" and "answer" plus a "text" entry
        holding the prompt built by ``generate_training_prompt``.
    """
    # No per-record printing here: this function is applied to every row via
    # ``Dataset.map``, so a debug print would flood the output.
    question = clean_text(data_point["question"])
    answer = clean_text(data_point["answer"])

    return {
        "question": question,
        "answer": answer,
        "text": generate_training_prompt(question, answer),
    }

In [None]:
# Smoke-test the prompt builder on the first training record.
example = generate_text(dataset["train"][0])

In [None]:
def process_dataset(data: Dataset):
    """Shuffle *data* with a fixed seed and map ``generate_text`` over every row.

    Args:
        data: Split to prepare for training or evaluation.

    Returns:
        The shuffled split after ``generate_text`` has been applied to each record.
    """
    shuffled = data.shuffle(seed=42)
    return shuffled.map(generate_text)

In [None]:
# Prepare the train and validation data. The processed validation rows are
# stored under a new "validation" key; the raw "valid" split is left in place.
for source_split, target_split in (("train", "train"), ("valid", "validation")):
    dataset[target_split] = process_dataset(dataset[source_split])

In [None]:
def create_model_and_tokenizer():
    """Load MODEL_NAME as a 4-bit quantized causal LM together with its tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer pads on the right and
        reuses its EOS token as the padding token.
    """
    # 4-bit NF4 weights with float16 compute.
    quantization = BitsAndBytesConfig(
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        load_in_4bit=True,
    )
    llm = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        quantization_config=quantization,
        trust_remote_code=True,
        use_safetensors=True,
    )

    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
    tok.padding_side = "right"
    tok.pad_token = tok.eos_token

    return llm, tok

In [None]:
# Instantiate the quantized base model and its tokenizer.
model, tokenizer = create_model_and_tokenizer()
# Turn off the generation cache on the model config before training.
# NOTE(review): presumably because the cache only helps generation - confirm.
model.config.use_cache = False

In [None]:
# Display the quantization settings recorded on the loaded model's config.
model.config.quantization_config.to_dict()

In [None]:
# LoRA hyper-parameters: rank, scaling factor and adapter dropout.
lora_r, lora_alpha, lora_dropout = 16, 64, 0.1

# Names of the projection modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "up_proj", "o_proj", "k_proj", "down_proj", "gate_proj", "v_proj",
]

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
)

In [None]:
# Directory that receives checkpoints and the saved model.
OUTPUT_DIR = "experiments"

training_arguments = TrainingArguments(
    # Output location and reproducibility.
    output_dir=OUTPUT_DIR,
    seed=42,
    # Batching: 4 samples per device, accumulated over 4 steps per update.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    group_by_length=True,
    # Optimisation schedule.
    optim="paged_adamw_32bit",
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    max_grad_norm=0.3,
    fp16=True,
    num_train_epochs=20,
    # Logging, evaluation and checkpointing.
    # NOTE(review): a float eval_steps below 1 is documented as a fraction of
    # the total training steps - confirm against the installed transformers.
    logging_steps=1,
    evaluation_strategy="steps",
    eval_steps=0.2,
    save_strategy="epoch",
    save_safetensors=True,
)

In [None]:
# Supervised fine-tuning trainer: trains on the "text" column built by
# generate_text, with LoRA adapters configured through peft_config.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    dataset_text_field="text",
    max_seq_length=2048,
)

In [None]:
# Run the fine-tuning loop configured above.
trainer.train()

In [None]:
# Persist the trained model (no path given, so the trainer's default location is used).
# NOTE(review): with a peft_config this presumably saves only the adapter - confirm.
trainer.save_model()

In [None]:
# Display the model object held by the trainer.
trainer.model

In [None]:
# from peft import AutoPeftModelForCausalLM

# trained_model = AutoPeftModelForCausalLM.from_pretrained(
#     OUTPUT_DIR,
#     low_cpu_mem_usage=True,
# )

# merged_model = model.merge_and_unload()
# merged_model.save_pretrained("merged_model", safe_serialization=True)
# tokenizer.save_pretrained("merged_model")

In [None]:
# from transformers import AutoModelForCausalLM
# from peft import PeftModel

# base_model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-40b")
# peft_model_id = "smangrul/falcon-40B-int4-peft-lora-sfttrainer-sample"
# model = PeftModel.from_pretrained(base_model, peft_model_id)
# merged_model = model.merge_and_unload()

In [None]:
from transformers import AutoModelForCausalLM
from peft import PeftModel

# Load the *base* checkpoint first. OUTPUT_DIR is where the LoRA training run
# was saved, so it is the adapter location (peft_model_id below), not the place
# to read base weights from. Loading the base from OUTPUT_DIR either fails or
# picks up the adapter before it is applied again below.
# float16 matches the compute dtype used for training and halves host memory.
trained_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
peft_model_id = OUTPUT_DIR
# Attach the trained adapter to the base model, then fold the LoRA weights into
# the base layers so the result is a plain causal LM.
model = PeftModel.from_pretrained(trained_model, peft_model_id)
merged_model = model.merge_and_unload()

In [None]:
# merged_model.save_pretrained("merged_model", safe_serialization=True)
# tokenizer.save_pretrained("merged_model")

In [None]:
def generate_prompt(
    question: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT
) -> str:
    """Build an inference prompt that ends at the empty "### Response:" header.

    Args:
        question: Text for the "### Input:" section; surrounding whitespace is removed.
        system_prompt: Instruction shown after "### Instruction:".

    Returns:
        The assembled prompt with leading and trailing whitespace stripped.
    """
    sections = (
        f"### Instruction: {system_prompt}",
        "",
        "### Input:",
        question.strip(),
        "",
        "### Response:",
        "",
    )
    return "\n".join(sections).strip()

In [None]:
# Inspect the first held-out test record.
dataset["test"][0]

In [None]:
# examples = []
# for data_point in dataset["test"].select(range(5)):
#     summaries = json.loads(data_point["original dialog info"])["summaries"][
#         "abstractive_summaries"
#     ]
#     summary = summaries[0]
#     summary = " ".join(summary)
#     conversation = create_conversation_text(data_point)
#     examples.append(
#         {
#             "summary": summary,
#             "conversation": conversation,
#             "prompt": generate_prompt(conversation),
#         }
#     )
# test_df = pd.DataFrame(examples)
# test_df

# Take the raw question of the first test record as the evaluation input.
# NOTE(review): the test split is not passed through clean_text, unlike the
# training and validation data - confirm this is intended.
question = dataset["test"][0]["question"]

In [None]:
# Build the inference prompt for the test question and display it.
test_prompt = generate_prompt(question)
test_prompt

In [None]:
# Reload the quantized base checkpoint and tokenizer.
# NOTE(review): this rebinds `model` to the base MODEL_NAME weights, so the
# generation cells below do not use `merged_model` - confirm this is intended.
model, tokenizer = create_model_and_tokenizer()

In [None]:
def summarize(model, text: str):
    """Generate a completion for *text* and return only the newly generated part.

    Uses the module-level ``tokenizer`` for encoding and decoding.

    Args:
        model: Causal LM used for generation.
        text: Full prompt to feed to the model.

    Returns:
        The decoded continuation with the prompt tokens and special tokens removed.
    """
    # Move the inputs to the device of the model that was passed in, so the
    # function also works for a model that is not on the global DEVICE
    # (for example one loaded on the CPU).
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]
    with torch.inference_mode():
        outputs = model.generate(**inputs, max_new_tokens=1024, temperature=0.0001)
    # Drop the echoed prompt tokens; keep only what the model added.
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

In [None]:
# Generate a response for the test prompt with the currently bound `model`.
summary = summarize(model, test_prompt)

In [None]:
# Pretty-print the generated response.
pprint(summary)

In [None]:
# NOTE(review): same model and prompt as the `summary` cell above; if this was
# meant to show the fine-tuned output, a different model should be passed here.
summary_1 = summarize(model, test_prompt)

In [None]:
# Print the second generated response.
print(summary_1)