In [None]:
!pip install -q transformers datasets accelerate peft bitsandbytes

In [None]:
!pip uninstall -y bitsandbytes

In [None]:
!pip install -U bitsandbytes

In [None]:
!pip install transformers

In [None]:
!huggingface-cli login --token <API KEY COMES HERE>

In [None]:
# Data modes accepted by each fine-tuning task.
VALID_DATA_MODES = {
    "vulnerability detection": ("desc", "desc_and_file", "desc_and_method", "desc_and_hunk"),
    "description generation": ("file", "method", "hunk"),
}

# Task to fine-tune for; must be a key of VALID_DATA_MODES.
# mode = "vulnerability detection"
mode = "description generation"

# Data mode for the selected task; must be one of VALID_DATA_MODES[mode].
data_mode = "hunk"

# Fail fast on a mismatched combination instead of hitting a missing-file error
# later, when the dataset paths built below are opened.
if mode not in VALID_DATA_MODES:
    raise ValueError(
        f"Unknown mode {mode!r}; expected one of {sorted(VALID_DATA_MODES)}"
    )
if data_mode not in VALID_DATA_MODES[mode]:
    raise ValueError(
        f"data_mode {data_mode!r} is not valid for mode {mode!r}; "
        f"expected one of {list(VALID_DATA_MODES[mode])}"
    )

# Base checkpoint to fine-tune.
base_model_name = "meta-llama/Llama-3.2-3B-Instruct"

# Dataset files and output folder are derived from the task and data mode.
training_set_path = f"dataset/{mode}/fine_tuning_training_{data_mode}.json"
validation_set_path = f"dataset/{mode}/fine_tuning_validation_{data_mode}.json"
model_folder = f"experiment/llama3_{mode}/{data_mode}/"

In [None]:
import json
import os


def load_json(path):
    """Read the file at ``path`` as UTF-8 text and return the parsed JSON."""
    # Explicit encoding: the platform default is not UTF-8 everywhere, which
    # would corrupt or reject non-ASCII characters in the samples.
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


# Show the working directory, since the dataset paths are relative to it.
print(os.getcwd())

train_data = load_json(training_set_path)
validation_data = load_json(validation_set_path)

# Preview the first assistant message of the first sample in each split.
print(train_data[0]["assistant"][0])
print(validation_data[0]["assistant"][0])

In [None]:
def to_messages(data):
    """Convert raw samples into chat-style records.

    Each sample is a mapping with parallel ``"user"`` and ``"assistant"``
    sequences and an optional ``"system"`` entry. The result holds one
    ``{"messages": [...]}`` dict per sample: the system message first (only
    when present and non-empty), then alternating user/assistant turns.
    Turns are paired with ``zip``, so unmatched trailing entries are dropped.
    """

    def as_conversation(sample):
        system_prompt = sample.get("system")
        turns = [{"role": "system", "content": system_prompt}] if system_prompt else []
        for question, answer in zip(sample["user"], sample["assistant"]):
            turns.extend(
                (
                    {"role": "user", "content": question},
                    {"role": "assistant", "content": answer},
                )
            )
        return turns

    return [{"messages": as_conversation(sample)} for sample in data]

# Convert both splits (training first, then validation) to the chat-message layout.
formatted_train_data, formatted_validation_data = (
    to_messages(split) for split in (train_data, validation_data)
)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably the checkpoint ships without a pad token - confirm.
# Be aware that anything masking labels by pad_token_id will then also mask
# genuine EOS tokens.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
from torch.utils.data import Dataset

class PromptDataset(Dataset):
    """Map-style dataset that turns chat conversations into causal-LM samples.

    Args:
        data: Sequence of dicts, each holding a ``"messages"`` list in the
            format accepted by ``tokenizer.apply_chat_template``.
        tokenizer: Tokenizer providing ``apply_chat_template`` and ``__call__``.
        max_length: Every sample is truncated or padded to this many tokens.
    """

    def __init__(self, data, tokenizer, max_length=2048):
        # The context length would allow more than 2048 tokens, but larger
        # values were tested and failed because of GPU memory limits.
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of conversations."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one conversation.

        Each value is a 1-D tensor (the batch dimension added by
        ``return_tensors="pt"`` is squeezed away).
        """
        messages = self.data[idx]["messages"]

        # Render the conversation with the tokenizer's chat template.
        # add_generation_prompt stays False: the conversation already ends with
        # the assistant reply that serves as the training target; the
        # generation prompt is only meant for prompting at inference time.
        full_text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
        )

        # The chat template already emits the special tokens it needs, so the
        # tokenizer must not add its own (otherwise BOS would be duplicated).
        tokenized = self.tokenizer(
            full_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            add_special_tokens=False,
            return_tensors="pt",
        )

        # Causal LM: the labels are the input ids, with padded positions set to
        # -100 so they are ignored by the loss.
        tokenized["labels"] = tokenized["input_ids"].masked_fill(
            tokenized["attention_mask"] == 0, -100
        )
        return {k: v.squeeze(0) for k, v in tokenized.items()}


# Wrap both formatted splits (training first, then validation) as datasets.
train_dataset, val_dataset = (
    PromptDataset(split, tokenizer)
    for split in (formatted_train_data, formatted_validation_data)
)

In [None]:
from peft import get_peft_model, LoraConfig, TaskType
import torch

from transformers import BitsAndBytesConfig

# Load the base model with 8-bit quantized weights. The quantization settings
# go through quantization_config, because passing load_in_8bit directly to
# from_pretrained is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    torch_dtype=torch.float16,
)

# LoRA adapter settings: rank-8 adapters with alpha 16 and 10% dropout,
# attached to the modules named q_proj and v_proj; bias terms are not trained.
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# Wrap the base model with the adapters and report how many parameters train.
# NOTE(review): PEFT also offers prepare_model_for_kbit_training for quantized
# base models - consider whether it should be applied before this call.
model = get_peft_model(model, config)
model.print_trainable_parameters()

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

def collate_causal_lm(features):
    """Stack per-sample tensors into a batch and build causal-LM labels.

    Labels are a copy of ``input_ids`` in which padded positions (attention
    mask equal to 0) are set to -100 so the loss ignores them. Masking by
    attention mask rather than by pad token id matters here: the pad token is
    the EOS token, so masking by id would also hide every genuine EOS token
    and the model would never be trained to end its reply.

    Args:
        features: List of dicts of equally sized 1-D tensors containing at
            least ``input_ids`` and ``attention_mask``. Any ``labels`` entry
            is ignored and rebuilt.

    Returns:
        Dict of batched tensors including ``labels``.
    """
    batch = {
        key: torch.stack([feature[key] for feature in features])
        for key in features[0]
        if key != "labels"
    }
    batch["labels"] = batch["input_ids"].masked_fill(batch["attention_mask"] == 0, -100)
    return batch


training_args = TrainingArguments(
    output_dir="./results",
    run_name=f"casey_llama3_{mode}_{data_mode}",
    logging_dir=f"./casey_logs_{mode}_{data_mode}",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_steps=1,
    save_strategy="no",  # no intermediate checkpoints are written
    fp16=True,
    remove_unused_columns=False,  # pass the dataset's dict items to the collator unchanged
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_causal_lm,
)

# Fine-tune, then report the metrics on the validation split.
trainer.train()
trainer.evaluate()

In [None]:
# Write the fine-tuned model first, then the tokenizer, into the experiment folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_folder)