In [31]:
!pip install datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting datasets
  Downloading datasets-4.4.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets)
  Using cached dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Using cached pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Collecting multiprocess<0.70.19 (from datasets)
  Downloading multiprocess-0.70.18-py312-none-any.whl.metadata (7.5 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.10.0,>=2023.1.0->datasets)
  Downloading aiohttp-3.13.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (8.1 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aioht

In [32]:
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [33]:
model_id = "deepseek-ai/deepseek-coder-1.3b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

bnb = BitsAndBytesConfig(
    load_in_8bit=True
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    dtype="auto",
    quantization_config=bnb
)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

In [34]:
import json

datasets = []
with open("merged_dataset.json", "r") as f:
    datasets = json.load(f)

In [35]:
len(datasets)

1114

In [36]:
sample_datasets = datasets[:30]
print(len(sample_datasets))

30


In [37]:
from prompt_generator import CodeReviewPromptGenerator
prompt_generator = CodeReviewPromptGenerator()

def format_prompt(data):
    prompt = prompt_generator.generate_style_review_prompt(
        added_code=data["added_code"],
        deleted_code=data["deleted_code"],
        full_function_code=data["full_function_code"],
        function_name=data["function_name"],
    )
    target = data["code_review_suggestion"]
    return prompt, target

In [38]:
for data in sample_datasets:
    prompt, target = format_prompt(data)
    print("PROMPT:")
    print(prompt)
    print("TARGET:")
    print(target)
    print("-----")
    break

PROMPT:
You are a code reviewer. Analyze this Python code change and respond EXACTLY in the format below.

Full function `handle_api_response`:
```python
def handle_api_response(api_response):
    if api_response is not None:
        if isinstance(api_response, dict):
            if "status_code" in api_response:
                status_code = api_response["status_code"]
                if status_code == 200:
                    response_data = api_response["data"]
                    # Validate response data
                    if isinstance(response_data, list):
                        for item in response_data:
                            if not isinstance(item, dict):
                                raise ValueError("Invalid response data")
                    elif not isinstance(response_data, dict):
                        raise ValueError("Invalid response data")
                    return response_data
                else:
                    raise ValueError("Invalid status co

In [39]:
def tokenize_example(example, max_length=2000):
    prompt, target = format_prompt(example)
    full_text = prompt + target  # LoRA causal LM style
    return tokenizer(
        full_text,
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors="pt"
    )

In [40]:
tokenized_data = [tokenize_example(e) for e in sample_datasets]

In [41]:
train_size = int(0.8 * len(tokenized_data))
train_dataset = tokenized_data[:train_size]
test_dataset = tokenized_data[train_size:]

In [42]:
from datasets import Dataset

# tokenized_data adalah list of dict (BatchEncoding)
dataset_hf = Dataset.from_list([{k: v.squeeze() for k, v in t.items()} for t in tokenized_data])

# split train/test
train_size = int(0.8 * len(dataset_hf))
train_dataset = dataset_hf.select(range(train_size))
test_dataset = dataset_hf.select(range(train_size, len(dataset_hf)))


In [43]:
def add_labels(batch):
    batch["labels"] = batch["input_ids"]
    return batch

train_dataset = train_dataset.map(add_labels)
test_dataset = test_dataset.map(add_labels)

Map: 100%|██████████| 24/24 [00:00<00:00, 759.98 examples/s]
Map: 100%|██████████| 6/6 [00:00<00:00, 423.60 examples/s]


In [44]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./lora_test",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    num_train_epochs=1,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [45]:
trainer.train()



Step,Training Loss


TrainOutput(global_step=6, training_loss=23.150713602701824, metrics={'train_runtime': 3676.4975, 'train_samples_per_second': 0.007, 'train_steps_per_second': 0.002, 'total_flos': 369211539456000.0, 'train_loss': 23.150713602701824, 'epoch': 1.0})

In [48]:
sample = sample_datasets[0]
prompt, _ = format_prompt(sample)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=512)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


You are a code reviewer. Analyze this Python code change and respond EXACTLY in the format below.

Full function `handle_api_response`:
```python
def handle_api_response(api_response):
    if api_response is not None:
        if isinstance(api_response, dict):
            if "status_code" in api_response:
                status_code = api_response["status_code"]
                if status_code == 200:
                    response_data = api_response["data"]
                    # Validate response data
                    if isinstance(response_data, list):
                        for item in response_data:
                            if not isinstance(item, dict):
                                raise ValueError("Invalid response data")
                    elif not isinstance(response_data, dict):
                        raise ValueError("Invalid response data")
                    return response_data
                else:
                    raise ValueError("Invalid status code")
   