In [1]:
# Point every Hugging Face cache at /workspace/hf_cache so model downloads do
# not fill the 20 GB primary disk.
#
# This cell used to be `%%bash`. A `%%bash` cell runs in a child process, so
# its `export` lines never reached the notebook kernel and the libraries
# imported further down would still have used the default cache location.
# Setting os.environ changes the kernel's own environment; run this cell
# before importing transformers.
import os
import shutil
from pathlib import Path

HF_CACHE_DIR = "/workspace/hf_cache"
HF_CACHE_ENV_VARS = ("TRANSFORMERS_CACHE", "HF_HOME", "HF_HUB_CACHE")

# Create the cache folder inside the workspace volume.
os.makedirs(HF_CACHE_DIR, exist_ok=True)

# Apply to the running kernel (inherited by any subprocess it starts).
for _name in HF_CACHE_ENV_VARS:
    os.environ[_name] = HF_CACHE_DIR

# Persist the same settings for future shell sessions. Only lines that are
# not already in ~/.bashrc are appended, so re-running this cell does not
# pile up duplicates.
_export_lines = [f"export {_name}={HF_CACHE_DIR}" for _name in HF_CACHE_ENV_VARS]
_bashrc_path = Path.home() / ".bashrc"
_bashrc_text = (
    _bashrc_path.read_text(encoding="utf-8", errors="replace")
    if _bashrc_path.exists()
    else ""
)
_already_there = set(_bashrc_text.splitlines())
_to_append = [line for line in _export_lines if line not in _already_there]
if _to_append:
    with _bashrc_path.open("a", encoding="utf-8") as _bashrc_file:
        # Do not glue the first export onto an unterminated last line.
        if _bashrc_text and not _bashrc_text.endswith("\n"):
            _bashrc_file.write("\n")
        _bashrc_file.write("\n".join(_to_append) + "\n")

# Remove the old caches that take up space on the primary disk.
for _stale in (Path.home() / ".cache" / "huggingface", Path.home() / ".cache" / "pip"):
    if _stale.is_symlink():
        # `rm -rf` removes a symlink itself; shutil.rmtree refuses symlinks.
        _stale.unlink()
    else:
        shutil.rmtree(_stale, ignore_errors=True)

print("✅ Cache diarahkan ke /workspace/hf_cache")


✅ Cache diarahkan ke /workspace/hf_cache


In [None]:
!pip install transformers==4.38.2 accelerate==0.27.2 bitsandbytes==0.42.0 pydantic==2.9.2

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import random

In [None]:
# Hub identifier of the checkpoint used throughout this notebook.
model_id = "deepseek-ai/deepseek-coder-6.7b-instruct"

print("Loading tokenizer...")
# Load the tokenizer that belongs to the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

In [None]:
from transformers import BitsAndBytesConfig

# Quantization settings: request 8-bit loading of the weights.
bnb = BitsAndBytesConfig(load_in_8bit=True)

# Load the causal LM for `model_id` with the quantization config above;
# dtype selection and device placement are both left to the library ("auto").
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb,
    torch_dtype="auto",
    device_map="auto",
)

In [7]:
# Project-local helper class; its generate_style_review_prompt() method is
# what the dataset-preparation cell calls to build each review prompt.
from prompt_generator import CodeReviewPromptGenerator
# Single shared instance, constructed with no arguments.
prompt_generator = CodeReviewPromptGenerator()

In [None]:
import os
import json

# Build a style-review prompt for every record of every JSON dataset in the
# working directory and collect simple length statistics.
#
# NOTE(review): every file ending in .json in the working directory is
# treated as a dataset. That would also pick up evaluation_dataset.json and
# result_before_fine_tunning.json if they live here - confirm this is intended.
#
# NOTE: despite the "max_tokens" names, the two lists below store len() of
# the raw values (character counts if these are strings), not tokenizer
# token counts.

# os.listdir returns entries in arbitrary order; sort so that the order of
# `prompts` and `temp_dataset` is the same on every run.
datasets = sorted(os.listdir('.'))

max_tokens_code_review_result = []
max_tokens_code_review_prompt = []
prompts = []
temp_dataset = []
for dataset in datasets:
    if not dataset.endswith('.json'):
        continue
    # Read as UTF-8 explicitly instead of relying on the locale's default
    # encoding; the file handle is only needed for the load itself.
    with open(dataset, 'r', encoding='utf-8') as f:
        records = json.load(f)

    for record in records:
        max_tokens_code_review_result.append(len(record['code_review_suggestion']))
        # Distinct name for the prompt: the original reassigned `data`, the
        # very list being iterated, inside the loop.
        prompt = prompt_generator.generate_style_review_prompt(
            added_code=record['added_code'],
            deleted_code=record['deleted_code'],
            full_function_code=record['full_function_code'],
            function_name=record['function_name'],
        )
        # Keep the generated prompt alongside the original record.
        record['prompt'] = prompt
        temp_dataset.append(record)
        max_tokens_code_review_prompt.append(len(prompt))
        prompts.append(prompt)

# Mean lengths; fall back to 0 when no record was found.
avg_max_tokens_code_review_result = sum(max_tokens_code_review_result) / len(max_tokens_code_review_result) if max_tokens_code_review_result else 0
avg_max_tokens_code_review_prompt = sum(max_tokens_code_review_prompt) / len(max_tokens_code_review_prompt) if max_tokens_code_review_prompt else 0

In [11]:
# Report the mean lengths computed by the dataset-preparation cell.
print(f"avg_max_tokens_code_review_result: {avg_max_tokens_code_review_result}")
print(f"avg_max_tokens_code_review_prompt: {avg_max_tokens_code_review_prompt}")

avg_max_tokens_code_review_result: 461.07894736842104
avg_max_tokens_code_review_prompt: 1268.1368421052632


In [2]:
import json

# Load the evaluation records used by the generation loop.
# Start from an empty list so a stale value from an earlier run is never
# reused if the load below fails.
evaluation_data = []
# Read as UTF-8 explicitly rather than depending on the locale's default
# encoding.
with open("evaluation_dataset.json", "r", encoding="utf-8") as f:
    evaluation_data = json.load(f)

In [None]:
# Run the model over every evaluation prompt and store both the full decoded
# sequence ('raw_result') and the newly generated part only
# ('truncated_result') on each record.

# Sampling is enabled below (do_sample=True); seed the torch RNG so repeated
# runs draw the same samples, as far as the backend is deterministic.
torch.manual_seed(42)

results = []
for counter, data in enumerate(evaluation_data):
    print("process {}/{}".format(counter+1, len(evaluation_data)))
    # The tokenizer returns CPU tensors while the model was loaded with
    # device_map="auto"; move the inputs to the model's device before
    # generating.
    inputs = tokenizer(data['prompt'], return_tensors="pt").to(model.device)
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,
            temperature=0.1,
            top_k=20,
            top_p=0.8,
            do_sample=True,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decoding needs no autograd context, so it lives outside inference_mode.
    raw_result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Drop the prompt tokens so only the newly generated tokens remain.
    generated_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    # The record is updated in place and also collected in `results`.
    data['raw_result'] =  raw_result
    data['truncated_result'] = text
    results.append(data)

In [29]:
# Persist the collected generation results as indented JSON.
with open('result_before_fine_tunning.json', mode='w') as results_file:
    json.dump(results, fp=results_file, indent=4)