In [None]:
import peft
from peft import LoraConfig, LoraModel
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from datasets import load_dataset
from dotenv import load_dotenv
import os
import torch
import pynvml
from tqdm import tqdm

In [None]:
# Notebook-wide configuration.
# Load variables from ../.env so later cells can read them through os.getenv.
load_dotenv("../.env")

model_name = "meta-llama/meta-llama-3-8b-instruct"
dataset_name = "mathadoor/brackozi-resume-llama3-summaries"
device = "cpu"

# Placeholder: the model-loading cell checks this to decide whether a
# previously loaded model has to be released first.
model = None

In [None]:
# Load the "train" split of the dataset identified by `dataset_name`.
dataset = load_dataset(
    dataset_name,
    split="train",
)

In [None]:
# (Re)load the base model, its tokenizer, and wrap the model with a LoRA adapter.
# The cell is written to be re-runnable: a model left over from a previous run
# is released before a fresh copy is loaded.
if model is not None:
    model.to("cpu")
    # `lora_model` wraps the very same base model object; if that reference is
    # kept alive the old weights can never be freed.
    lora_model = None
    # Rebind instead of `del` so the name still exists if loading below fails
    # and the cell has to be re-run.
    model = None
    torch.cuda.empty_cache()

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=os.getenv("hf_token"),
    torch_dtype=torch.bfloat16,
    device_map=device,
)
# Pass the same access token as for the model so both downloads authenticate
# the same way. A tokenizer holds no weights, so it takes no `device_map`.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=os.getenv("hf_token"))

# `add_special_tokens` returns how many tokens were actually added to the
# vocabulary. A newly added "<pad>" gets an id beyond the model's current
# embedding table, so the embeddings must be grown to match before the model
# ever sees that id.
num_added_tokens = tokenizer.add_special_tokens({"pad_token": "<pad>"})
if num_added_tokens > 0:
    model.resize_token_embeddings(len(tokenizer))

# Attach the LoRA adapter (rank 8, alpha 32) under the adapter name "default".
lora_config = LoraConfig(r=8, lora_alpha=32, task_type="CAUSAL_LM")
lora_model = peft.get_peft_model(model, lora_config, "default")

In [None]:
# Generate a "Summary" column with the loaded model, but only when the dataset
# does not already provide one.
if "Summary" not in dataset.features:
    # Generation stops at either the tokenizer's EOS token or the "<|eot_id|>"
    # token; the list does not change per row, so build it once.
    terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

    summaries = []
    for x in tqdm(dataset):
        # NOTE(review): this cell reads lower-case 'resume' / 'category' while the
        # tokenization cell reads "Resume" — confirm the dataset's column casing.
        resume = x['resume']
        prompt = [{"role":"system",  "content": f"you are a recruiter with a keen eye for talent in the field of {x['category']}. "
                                                f"you have been tasked with summarizing the key skills, experience, education, and relevant awards and publication the candidate posseses from their resume."},
                  {"role":"user", "content": f"here is a resume for you to summarize:{resume}"}]
        input_ids = tokenizer.apply_chat_template(prompt, add_generation_prompt=True, return_tensors="pt").to(model.device)
        outputs = model.generate(
            input_ids,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
        )
        # Keep only the newly generated tokens (everything after the prompt).
        response = outputs[0][input_ids.shape[-1]:]
        summary = tokenizer.decode(response, skip_special_tokens=True)
        summaries.append(summary)

    # `Dataset.add_column` does not modify the dataset in place; it returns a
    # new Dataset, so the result has to be bound back to `dataset`.
    dataset = dataset.add_column("Summary", summaries)

In [None]:
# Print the trainable-parameter summary of the LoRA-wrapped model.
lora_model.print_trainable_parameters()

In [None]:
# Fixed seed so the train/test partition is the same after every kernel restart.
SPLIT_SEED = 42

split_ds = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_ds = split_ds["train"]
test_ds = split_ds["test"]


def tokenize_function(example):
    """Tokenize one dataset row into a chat prompt and its target resume.

    Args:
        example: A single dataset row. Reads ``example["Summary"]`` (embedded
            in the user turn of the prompt) and ``example["Resume"]`` (the
            target text).

    Returns:
        dict with two entries, both with the leading batch dimension removed:
            "input_ids": token ids of the chat-formatted system + user prompt,
                with the generation prompt appended.
            "resume_ids": token ids of the resume text.
    """
    ret = [{"role":"system",  "content": f"You are an all knowing job seeking candidate who can create an impressive, "
                                         f"objective resume based on the job description provided."},
           {"role":"user", "content": f"here is a summary for which to create a resume for: {example['Summary']}"}]
    prompt_ids = tokenizer.apply_chat_template(ret, add_generation_prompt=True, return_tensors="pt")[0]
    # With return_tensors="pt" the tokenizer output carries a batch dimension of
    # size 1; strip it so both entries have the same per-example layout.
    resume_ids = tokenizer(example["Resume"], return_tensors="pt")["input_ids"][0]
    return {"input_ids": prompt_ids, "resume_ids": resume_ids}


# Tokenize row by row (batched=False): the function expects a single example.
train_ds = train_ds.map(tokenize_function, batched=False)
val_ds = test_ds.map(tokenize_function, batched=False)

In [None]:
from transformers import Trainer, TrainingArguments

# `label_names` has to name a key that the tokenized datasets actually contain;
# `tokenize_function` emits the targets under "resume_ids".
train_args = TrainingArguments(output_dir="llama3-lora-ft", label_names=["resume_ids"])

# NOTE(review): no `data_collator` is passed and the targets live under a custom
# key rather than `labels` — confirm that variable-length rows batch correctly
# and that the model actually receives labels before relying on this run.
trainer = Trainer(
    model=lora_model,
    args=train_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
)

In [None]:
# Start fine-tuning with the `trainer` built in the previous cell.
trainer.train()