<a href="https://colab.research.google.com/github/navidadkhah/Fine-Tuning-LLMs/blob/main/LLM/fine_tune_prompting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initialize

In [None]:
%%capture
# Install the released Unsloth package first; this install is what pulls in its dependencies.
!pip install unsloth
# Also get the latest nightly Unsloth!
# The package is then replaced by the current GitHub version; --no-deps leaves the
# dependencies installed by the previous command untouched.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
# Report the GPU attached to this runtime (device name, driver/CUDA version, memory usage).
!nvidia-smi

Tue Jan 21 10:15:44 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
import sys
import io
from transformers import TextStreamer
import pandas as pd

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: never hardcode an access token in a notebook. Any token that was
# previously committed in this cell must be treated as leaked and revoked in the
# Hugging Face account settings.
# The token is read from the HF_TOKEN environment variable when it is set;
# otherwise it is requested through a hidden prompt so it never appears in the
# notebook source or its saved outputs.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the CSV outputs written later in
# this notebook go to paths underneath this mount point.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


# Loading the Fine-tuned Model and Dataset from Hugging Face

In [None]:
# Load the fine-tuned model and its tokenizer from the Hugging Face Hub.

from unsloth import FastLanguageModel

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "navidadkhah/Fine-tuned-Llama-3.1-8B-bug-fixing", # fine-tuned checkpoint on the Hugging Face Hub
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
)
# Enable native 2x faster inference.
# The call returns the model; the trailing semicolon stops the notebook from
# echoing the model's full module tree as the cell output.
FastLanguageModel.for_inference(model);

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.1.6: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Unsloth 2025.1.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

In [None]:
# Fetch the "test" split of the bug-evaluation dataset from the Hugging Face Hub
# and show its summary (features and row count) as the cell output.
from datasets import load_dataset

dataset = load_dataset(
    path="navidadkhah/bug_evaluation_dataset",
    split="test",
)
dataset

README.md:   0%|          | 0.00/165 [00:00<?, ?B/s]

bug_evaluation_dataset.csv:   0%|          | 0.00/24.1M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/25793 [00:00<?, ? examples/s]

Dataset({
    features: ['original_code', 'modified_code', 'changed_line', 'number_of_line', 'mutation_type'],
    num_rows: 25793
})

# Passing Prompts to our model

In [None]:
# Prompt template with two positional `{}` slots: the first receives the buggy
# code (under "### Input:"), the second the response (left empty at inference
# time so the model writes it). Built from adjacent literals so that every
# newline in the final string is explicit.
alpaca_prompt = (
    "The following code contains a bug. "
    "First, analyze the code and add comments explaining it. "
    "Then, identify the bug, fix it, and finally remove the comments. "
    "Please do this without any further explanation:\n"
    "\n"
    "### Input:\n"
    "{}\n"
    "\n"
    "### Response:\n"
    "{}"
)


In [None]:
# Settings for the evaluation run.
NUM_SAMPLES = 1001        # dataset rows 0..1000 inclusive are sent to the model
MAX_NEW_TOKENS = 512      # upper bound on generated tokens per sample
CHECKPOINT_EVERY = 200    # a partial CSV is written whenever index % CHECKPOINT_EVERY == 0
CHECKPOINT_TEMPLATE = "/content/drive/MyDrive/Project/llm_response_fine_tuned{index}.csv"


def generate_response(code, max_new_tokens=MAX_NEW_TOKENS):
    """Return the model's answer for one buggy code snippet.

    The snippet is inserted into ``alpaca_prompt`` (response slot left empty),
    tokenized, and passed to ``model.generate``. Only the token ids that follow
    the prompt are decoded, with special tokens skipped.

    This replaces the earlier approach of capturing streamed stdout and slicing
    between '### Response:' and '<|end_of_text|>'. That approach raised
    ``ValueError: substring not found`` whenever generation reached the token
    limit before an end-of-text token was produced, left ``sys.stdout``
    redirected after the failure, and could cut at the wrong place if the input
    code itself contained the '### Response:' marker.

    Args:
        code: Source code containing the bug (the prompt's input slot).
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        The generated response text, stripped of surrounding whitespace. When
        the token limit is reached first, the text is simply whatever was
        generated up to that point.
    """
    prompt = alpaca_prompt.format(code, "")
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # For this causal LM the returned sequence is expected to start with the
    # prompt tokens, so everything from prompt_length onward is newly generated.
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = output_ids[0][prompt_length:]

    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()


df = pd.DataFrame(columns=['Response'])

FastLanguageModel.for_inference(model)

for index in range(NUM_SAMPLES):
    response_text = generate_response(dataset[index]["modified_code"])

    # Rows are appended one at a time so that `df` always holds every response
    # produced so far, even if the run is interrupted part-way through.
    df.loc[len(df)] = [response_text]
    print(f"Row {index} added.")

    # Periodic checkpoint to Drive so a long run can be recovered after a disconnect.
    if index % CHECKPOINT_EVERY == 0:
        path = CHECKPOINT_TEMPLATE.format(index=index)
        df.to_csv(path, index=False)

Row 0 added.
Row 1 added.
Row 2 added.
Row 3 added.
Row 4 added.
Row 5 added.
Row 6 added.
Row 7 added.
Row 8 added.
Row 9 added.
Row 10 added.
Row 11 added.
Row 12 added.
Row 13 added.
Row 14 added.
Row 15 added.
Row 16 added.
Row 17 added.
Row 18 added.
Row 19 added.
Row 20 added.
Row 21 added.
Row 22 added.
Row 23 added.
Row 24 added.
Row 25 added.
Row 26 added.
Row 27 added.
Row 28 added.
Row 29 added.
Row 30 added.
Row 31 added.
Row 32 added.
Row 33 added.
Row 34 added.
Row 35 added.
Row 36 added.
Row 37 added.
Row 38 added.
Row 39 added.
Row 40 added.
Row 41 added.
Row 42 added.
Row 43 added.
Row 44 added.
Row 45 added.
Row 46 added.
Row 47 added.
Row 48 added.
Row 49 added.
Row 50 added.
Row 51 added.
Row 52 added.
Row 53 added.
Row 54 added.
Row 55 added.
Row 56 added.
Row 57 added.
Row 58 added.
Row 59 added.
Row 60 added.
Row 61 added.
Row 62 added.
Row 63 added.
Row 64 added.
Row 65 added.
Row 66 added.
Row 67 added.
Row 68 added.
Row 69 added.
Row 70 added.
Row 71 added.
Ro

ValueError: substring not found

In [None]:
# Write every collected response to a single CSV on Google Drive (row index omitted).
path = "/content/drive/MyDrive/Project/llm_response_fine_tuned.csv"
df.to_csv(path_or_buf=path, index=False)