In [1]:
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoConfig,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig,
    DataCollatorForSeq2Seq
)
from tqdm import tqdm
import torch
import time
import pandas as pd
import numpy as np
from dotenv import load_dotenv

from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os

# Load variables from a local .env file.
# load_dotenv() is the last expression of this cell, so its return value is
# what the cell displays as output (`True` in the recorded run).
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
# Sanity check: confirm PyTorch can see a CUDA device before the quantized model is loaded.
print(torch.cuda.is_available())

True


In [3]:
# Hub id of the base checkpoint; reused for the config, tokenizer and model loads below.
model_name="Qwen/Qwen2.5-Coder-7B-Instruct"

In [4]:
# Quantization settings handed to from_pretrained() when the model is loaded:
# 4-bit weights with the "nf4" quant type, float16 compute dtype, and double
# quantization switched off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.float16,
    bnb_4bit_use_double_quant= False,
)

In [5]:
# Fetch the checkpoint's configuration and set YaRN RoPE scaling with factor 2.
# NOTE(review): this `config` object is never handed to the model -- the
# `config=config` argument of the from_pretrained() call that loads `model`
# is commented out -- so the scaling set here has no effect as written.
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)

config.rope_scaling = {
    "type": "yarn",
    "factor": 2.0   # 32K × 2 = 64K
}

In [6]:
# Load the tokenizer for the same checkpoint; its chat template is used later
# to build the generation prompt.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Optional tokenizer overrides, currently disabled (the tokenizer's own
# settings are used as loaded):
# tokenizer.add_eos_token = True
# tokenizer.pad_token_id = 0
# tokenizer.padding_side = "left"

In [7]:
# Load the causal LM using the quantization settings in `bnb_config`, with
# device placement left to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
        model_name,
        # The customised `config` (rope_scaling) is currently NOT applied:
        # config=config,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )

Loading checkpoint shards: 100%|██████████| 4/4 [00:21<00:00,  5.25s/it]


In [15]:
# Chat-style prompt used for the smoke-test generation below: a system message
# that asks for fence-free Python code only, followed by the user request.
messages = [
    {
        "role": "system",
        "content": "You are a senior Python developer. Always return clean, correct Python code only WITHOUT adding any fences such as ```python <code> ```, or ``` <code> ``` to your response. Do not include any explanations or additional texts. Only provide the code without fences which is compilable."
    },
    {
        "role": "user",
        "content": "Write a Python function to compute the factorial of a number."
    }
]

In [17]:

# Render `messages` through the tokenizer's chat template (with the generation
# prompt appended) and move the resulting tensors to the model's device.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)

outputs = model.generate(max_new_tokens=8012, **inputs)

# generate() returns prompt + completion; slice off the prompt tokens so only
# the newly generated text is decoded and printed.
prompt_length = inputs["input_ids"].shape[-1]
completion_ids = outputs[0][prompt_length:]
print(tokenizer.decode(completion_ids, skip_special_tokens=True))

def factorial(n):
    if n == 0:
        return 1
    else:
        return n * factorial(n-1)


In [None]:
# def read_data_file(file_name):
#     file_path = f"/content/drive/MyDrive/PhD/smart_contract_vulnerability/code/dataset/other_paper_dataset/{file_name}.jsonl"
#     df = pd.read_json(file_path, lines=True)
#     df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
#     return df

In [None]:
# df_train = read_data_file("train_data_reason_public")
# df_val = read_data_file("val_data_reason_public")

In [None]:
# train_dataset = Dataset.from_pandas(df_train)
# val_dataset = Dataset.from_pandas(df_val)

In [None]:
# small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
# small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

# train_dataset = train_dataset.shuffle(seed=42).select(range(2000))
# test_dataset = test_dataset.shuffle(seed=42).select(range(100))
# val_dataset = val_dataset.shuffle(seed=42).select(range(100))

In [None]:
# print(train_dataset.features)
# print(val_dataset.features)

In [None]:
# print(model.config.max_position_embeddings)
# print("rope_scaling:", model.config.rope_scaling)

In [None]:
# long_prompt = "x " * 40000  # ~40K tokens (rough)
# inputs = tokenizer(long_prompt, return_tensors="pt").to(model.device)

# print("Input length:", inputs["input_ids"].shape[-1])

# with torch.no_grad():
#     model(**inputs)


In [None]:
# from transformers import set_seed
# seed = 42
# set_seed(seed)

# max_length = get_max_length(model)

# tokenized_train_dataset = preprocess_dataset(tokenizer, 1048,seed, train_dataset)
# tokenized_val_dataset = preprocess_dataset(tokenizer, 1048, seed, val_dataset)

In [None]:
# print(len(tokenized_train_dataset))
# print(len(tokenized_val_dataset))

In [None]:
# Get the quantized base model ready for training: enable gradient
# checkpointing, then run PEFT's k-bit training preparation helper
# (prepare_model_for_kbit_training is already imported in the first cell).
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Walks ``model.named_parameters()``, sums ``param.numel()`` for all
    parameters and for those with ``requires_grad`` set, and prints both
    counts together with the trainable percentage on a single line.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, e.g. a ``torch.nn.Module``.

    Returns:
        None. The summary is written to stdout.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without any parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )


In [None]:
# Attach LoRA adapters to the base model.
# prepare_model_for_kbit_training() has already been applied to `model` in the
# preceding cell, so it is not repeated here; calling it again on every run of
# this cell would re-process an already prepared model.
peft_config = LoraConfig(
    r=8,                 # LoRA rank
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"],
)
peft_model = get_peft_model(model, peft_config)

In [None]:
# Report the trainable / total parameter counts after the LoRA wrapping step.
# NOTE(review): this passes the base `model` rather than `peft_model`;
# presumably equivalent because PEFT injects the adapters into the base model
# in place -- confirm.
print_trainable_parameters(model)

In [None]:
import transformers
import time

# Timestamped run directory so successive runs do not share an output folder.
output_dir=f"/content/drive/MyDrive/PhD/smart_contract_vulnerability/code/output/code_llama_fine_tune_reason_{str(int(time.time()))}"

# Effective batch size = per-device batch size x gradient accumulation steps.
# These values are now actually passed to TrainingArguments below; previously
# the literals 4 and 8 were hard-coded there, giving an effective batch of 32
# instead of the declared 64.
batch_size = 64
per_device_train_batch_size = 4
gradient_accumulation_steps = batch_size // per_device_train_batch_size

training_args = TrainingArguments(
    output_dir=output_dir,     # Directory to save the model
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    warmup_steps=10,
    max_steps=-1,
    learning_rate=3e-4,
    fp16=True,
    optim="paged_adamw_8bit",
    logging_dir="/content/drive/MyDrive/PhD/smart_contract_vulnerability/code/output/logs",
    num_train_epochs=1,
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy = "steps",
    logging_steps = 200,
    save_strategy = "steps",
    save_steps=200,
    do_eval=True,
    gradient_checkpointing=True,
    report_to="none",
    overwrite_output_dir = True,   # real boolean; was the string 'True'
    group_by_length=True
)

# The generation KV cache is switched off for training.
peft_model.config.use_cache = False

# NOTE: tokenized_train_dataset / tokenized_val_dataset must already exist in
# the kernel. The cell that builds them is commented out in this notebook, so
# on a fresh kernel this cell raises NameError until that step is restored.
peft_trainer = transformers.Trainer(
    model=peft_model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=training_args,
    data_collator=transformers.DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)

In [None]:
import gc

# Collect unreachable Python objects first so that any GPU tensors they still
# hold are returned to PyTorch's caching allocator; only then ask the allocator
# to release its unused cached blocks. Doing it in the opposite order leaves
# memory freed by the collection sitting in the cache.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Start fine-tuning with the Trainer configured above.
peft_trainer.train()