In [None]:
!pip install transformers accelerate peft bitsandbytes trl
!pip install datasets deepspeed

In [None]:
from datasets import load_dataset


def format_prompt(example):
    """Turn one raw record into a prompt/target pair for fine-tuning.

    The prompt is the record's ``instruction`` and ``input`` fields on
    consecutive lines, followed by a blank line and a ``### Response:`` header
    terminated by a newline. The ``output`` field is passed through unchanged.

    Args:
        example: Mapping that provides ``instruction``, ``input`` and
            ``output`` entries.

    Returns:
        A dict with the keys ``prompt`` and ``label``.
    """
    instruction, context = example['instruction'], example['input']
    prompt_text = f"{instruction}\n{context}\n\n### Response:\n"
    return dict(prompt=prompt_text, label=example['output'])


# Read the JSONL training file and run format_prompt over every record so each
# one gains the "prompt" and "label" fields that format_prompt returns.
dataset = load_dataset(
    "json",
    data_files="/kaggle/input/train-data/sft_data.jsonl",
).map(format_prompt)


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
from transformers import TrainingArguments
from trl import SFTTrainer

# Hub id of the base checkpoint; used for both the tokenizer and the model.
model_name = "deepseek-ai/deepseek-coder-6.7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_prompt(example):
    """Tokenize one prompt/label pair into causal-LM training features.

    The prompt and the label are tokenized separately and concatenated. Prompt
    positions are set to -100 in ``labels`` so only the response tokens
    contribute to the loss.

    The label is tokenized without special tokens because it is a continuation
    of the prompt: letting the tokenizer add them again would insert a second
    beginning-of-sequence token in the middle of the example. An
    end-of-sequence token is appended to the response so the model is trained
    to stop after the answer.

    Uses the module-level ``tokenizer``.

    Args:
        example: Mapping with the string fields ``prompt`` and ``label``.

    Returns:
        A dict with ``input_ids``, ``labels`` and ``attention_mask``, all lists
        of the same length (at most 2048).
    """
    prompt_ids = tokenizer(
        example["prompt"],
        truncation=True,
        max_length=1024,
    )["input_ids"]

    # 1023 rather than 1024: one slot of the label budget is kept for the EOS
    # token appended below, so the 2048 cap never cuts the terminator off.
    label_ids = tokenizer(
        example["label"],
        truncation=True,
        max_length=1023,
        add_special_tokens=False,
    )["input_ids"]

    eos_id = tokenizer.eos_token_id
    if eos_id is not None:
        label_ids = label_ids + [eos_id]

    # Safety cap on the combined length; with the budgets above it is not
    # expected to trigger.
    input_ids = (prompt_ids + label_ids)[:2048]
    labels = ([-100] * len(prompt_ids) + label_ids)[:2048]

    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": [1] * len(input_ids)
    }
    
# Tokenize the "train" split with tokenize_prompt and drop every original
# column, leaving only the fields tokenize_prompt returns.
train_split = dataset["train"]
tokenized_dataset = train_split.map(
    tokenize_prompt,
    remove_columns=train_split.column_names,
)

# Load the base model with 4-bit quantization on GPU 0.
# The quantization settings are passed through BitsAndBytesConfig: handing
# `load_in_4bit=True` directly to from_pretrained is deprecated, and it leaves
# the 4-bit compute dtype at its float32 default even though the model is
# otherwise loaded in float16.
from transformers import BitsAndBytesConfig

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    ),
    torch_dtype=torch.float16,
    device_map={"": 0},
    trust_remote_code=True
)

# LoRA adapter settings for causal-LM fine-tuning: rank 8, alpha 16, 5% dropout,
# no bias training, adapters on the modules named q_proj / k_proj / v_proj / o_proj.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Prepare the quantized model for k-bit training first, then wrap it with the
# LoRA adapters described above.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)


In [None]:
# Settings for the fine-tuning run, grouped by purpose.
training_kwargs = dict(
    # Run identity and output directory.
    output_dir="/kaggle/working/deepseek_lora_output",
    run_name="deepseek_lora",
    # Optimisation: 4 epochs, one sample per step, fp16 mixed precision.
    num_train_epochs=4,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    fp16=True,
    # Logging / checkpoint cadence (in steps); no external experiment tracker.
    logging_steps=5,
    save_steps=10,
    report_to="none",
)
training_args = TrainingArguments(**training_kwargs)


# Build the trainer around the already-tokenized dataset.
# An explicit collator is supplied so that the "labels" column produced by
# tokenize_prompt (prompt positions set to -100) is what reaches the loss, and
# so that examples of different lengths are padded consistently (labels padded
# with -100) if the batch size is ever raised above 1.
# NOTE(review): without it SFTTrainer falls back to a language-modeling
# collator that, in some TRL releases, rebuilds labels from input_ids and so
# discards the prompt masking -- confirm against the installed TRL version.
from transformers import DataCollatorForSeq2Seq

trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset,
    args=training_args,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer,
        padding=True,
        label_pad_token_id=-100,
    ),
)



In [None]:
# Report how many examples the trainer will see, then run fine-tuning.
print("Number of training samples:", len(trainer.train_dataset))
print("Start training...")
trainer.train()

# Persist the trained model through the trainer; the evaluation cell later
# loads the adapter from the training output directory.
trainer.save_model()


In [None]:
import json
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from peft import PeftModel

# Same base checkpoint and tokenizer setup as in the training cell, repeated
# here for the evaluation step.
model_name = "deepseek-ai/deepseek-coder-6.7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Reload the base model with 4-bit quantization on GPU 0 for evaluation.
# The quantization settings are passed through BitsAndBytesConfig: handing
# `load_in_4bit=True` directly to from_pretrained is deprecated, and it leaves
# the 4-bit compute dtype at its float32 default even though the model is
# otherwise loaded in float16.
from transformers import BitsAndBytesConfig

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    ),
    torch_dtype=torch.float16,
    device_map={"": 0},
    trust_remote_code=True
)

# Attach the adapter saved in the training output directory to the base model
# and switch the result to evaluation mode.
adapter_dir = "/kaggle/working/deepseek_lora_output"
model = PeftModel.from_pretrained(base_model, adapter_dir).eval()

# NOTE(review): this is the same JSONL file the model was fine-tuned on, so the
# accuracy computed from it is measured on training data, not on held-out data.
eval_splits = load_dataset(
    "json",
    data_files="/kaggle/input/train-data/sft_data.jsonl",
)
test_dataset = eval_splits["train"]

def extract_label(text):
    """Map free-form text to a verdict based on the marker it contains.

    The markers are checked in a fixed order: ``Label: Yes`` first, then
    ``Label: No``. Matching is a case-sensitive substring test.

    Args:
        text: String to search.

    Returns:
        ``"Yes"`` or ``"No"`` for the first marker found in that order,
        otherwise ``"Unknown"``.
    """
    markers = (("Label: Yes", "Yes"), ("Label: No", "No"))
    return next(
        (verdict for marker, verdict in markers if marker in text),
        "Unknown",
    )

# Greedy-decode a response for every sample and compare the label found in the
# generated text with the label found in the reference output.
# NOTE(review): the prompt built here is not the one used for training
# (format_prompt has no "Respond concisely ..." sentence and ends with a newline
# after "### Response:") -- confirm the difference is intended.
correct = 0
total = 0

for sample in test_dataset:
    prompt = f"{sample['instruction']}\n{sample['input']}\n\nRespond concisely in 2-3 sentences explaining whether a vulnerability exists, then write: Label: Yes or Label: No.\n\n### Response:"
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024).to("cuda:0")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            # Greedy decoding. No `temperature` is passed: it only applies when
            # sampling, and 0.0 is not a valid value for it.
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id
        )

    # Score only the newly generated tokens, so "Label: ..." text that appears
    # in the prompt itself cannot be mistaken for the prediction.
    input_len = inputs["input_ids"].shape[1]
    gen_ids = outputs[0][input_len:]
    gen_text = tokenizer.decode(gen_ids, skip_special_tokens=True)

    pred_label = extract_label(gen_text)

    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(sample,output_text)
    # A reference without "Label: Yes" is counted as "No".
    true_label = "Yes" if "Label: Yes" in sample["output"] else "No"

    total += 1
    if pred_label == true_label:
        correct += 1

    print(f"[{total}] pred: {pred_label}, true: {true_label}")

# Guard against an empty dataset, which would otherwise divide by zero.
accuracy = correct / total if total else 0.0
print(f"\nAccuracy: {accuracy:.2%} ({correct}/{total})")
