In [41]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [42]:
data = [
  {
    "question": "What is the purpose of the Vehicle class in this code?",
    "answer": "The Vehicle class represents a vehicle entity and stores its id, name, and type.",
    "code": "// Original Path: LLD-Questions-master/ParkingLot/ParkingLot/Entities/Vehicle.cs\n\n\ufeffusing System;\nusing System.Collections.Generic;\nusing System.Linq;\nusing System.Text;\nusing System.Threading.Tasks;\n\nnamespace ParkingLot.Entities\n{\n    public class Vehicle\n    {\n        private readonly int id;\n        public readonly string name;\n        private readonly VehicleType type;\n        public Vehicle(int id, string name, VehicleType type)\n        {\n            this.id = id;\n            this.name = name;\n            this.type = type;\n        }\n        public VehicleType GetType()\n        {\n            return type;\n        }\n        public string GetName()\n        {\n            return name;\n        }\n\n    }\n}\n"
  },
  {
    "question": "What variables are stored as readonly in the Vehicle class?",
    "answer": "The id and type variables are stored as readonly in the Vehicle class.",
    "code": "// Original Path: LLD-Questions-master/ParkingLot/ParkingLot/Entities/Vehicle.cs\n\n\ufeffusing System;\nusing System.Collections.Generic;\nusing System.Linq;\nusing System.Text;\nusing System.Threading.Tasks;\n\nnamespace ParkingLot.Entities\n{\n    public class Vehicle\n    {\n        private readonly int id;\n        public readonly string name;\n        private readonly VehicleType type;\n        public Vehicle(int id, string name, VehicleType type)\n        {\n            this.id = id;\n            this.name = name;\n            this.type = type;\n        }\n        public VehicleType GetType()\n        {\n            return type;\n        }\n        public string GetName()\n        {\n            return name;\n        }\n\n    }\n}\n"
  },
  {
    "question": "What is the purpose of the GetType method in the Vehicle class?",
    "answer": "The GetType method returns the type of the vehicle.",
    "code": "// Original Path: LLD-Questions-master/ParkingLot/ParkingLot/Entities/Vehicle.cs\n\n\ufeffusing System;\nusing System.Collections.Generic;\nusing System.Linq;\nusing System.Text;\nusing System.Threading.Tasks;\n\nnamespace ParkingLot.Entities\n{\n    public class Vehicle\n    {\n        private readonly int id;\n        public readonly string name;\n        private readonly VehicleType type;\n        public Vehicle(int id, string name, VehicleType type)\n        {\n            this.id = id;\n            this.name = name;\n            this.type = type;\n        }\n        public VehicleType GetType()\n        {\n            return type;\n        }\n        public string GetName()\n        {\n            return name;\n        }\n\n    }\n}\n"
  }
]

In [43]:
questions = [
    {
        "question": "CODE \n" + example["code"] + "QUESTION \n" + example["question"],
        "answer": example["answer"]
    }
    for example in data
]

In [44]:


# Convert to Hugging Face Dataset objects
train_dataset = Dataset.from_list(questions)

# Combine into a DatasetDict
dataset = DatasetDict({
    "train": train_dataset,
})

In [45]:
model_name = "microsoft/codebert-base"

In [46]:
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [47]:
# create tokenize function
def tokenize_function(examples):
    print(examples)
    # extract text
    text = examples["question"]

    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

In [65]:
# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'overflow_to_sample_mapping', 'start_positions', 'end_positions'],
        num_rows: 3
    })
})

In [64]:
# create tokenize function
def tokenize_function(examples):
    # Tokenize the question and code context
    # Since the data is structured as "CODE \n code \n QUESTION \n question",
    # we can tokenize the whole string.
    tokenized_inputs = tokenizer(
        examples["question"],
        truncation="only_first", # Truncate only the first sequence if it's too long
        max_length=512,
        padding="max_length", # Pad to max_length
        return_overflowing_tokens=True,
        return_offsets_mapping=True, # Needed for finding answer start/end positions
    )

    # The model expects 'start_positions' and 'end_positions' for training.
    # We need to find these positions based on the original answer text.
    # This is a simplified approach for demonstration. A more robust
    # implementation would handle cases where the answer is not found or
    # split across chunks due to truncation/overflowing tokens.

    # Initialize lists for labels
    start_positions = []
    end_positions = []

    for i in range(len(examples["answer"])):
        answer = examples["answer"][i]
        # Find the start and end character index of the answer in the original question string
        # Note: This assumes the answer is directly present in the 'question' string,
        # which is constructed as CODE + code + QUESTION + question.
        # A more robust solution would find the answer within the 'code' or 'question' part specifically.
        context = examples["question"][i] # Use the full text as context for finding the answer

        # Find the character start and end index of the answer in the context
        start_char = context.find(answer)
        end_char = start_char + len(answer)

        # If the answer is not found, set positions to 0 (or a value indicating no answer)
        if start_char == -1:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Convert character indices to token indices
            # Use offset mapping to find the corresponding token indices
            offset_mapping = tokenized_inputs["offset_mapping"][i]

            start_token = 0
            end_token = 0

            # Find the token containing the start character
            for token_index, (start, end) in enumerate(offset_mapping):
                 if start_char >= start and start_char < end:
                    start_token = token_index
                    break

            # Find the token containing the end character
            for token_index, (start, end) in enumerate(offset_mapping):
                 if end_char > start and end_char <= end:
                    end_token = token_index
                    break

            start_positions.append(start_token)
            end_positions.append(end_token)


    tokenized_inputs["start_positions"] = start_positions
    tokenized_inputs["end_positions"] = end_positions

    # Remove the offset_mapping as it's not needed for training
    del tokenized_inputs["offset_mapping"]

    return tokenized_inputs

In [50]:
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Testing untrained model

In [51]:
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering

# Define the question you want to ask
new_question_text = "What is the purpose of the GetName method in the Vehicle class?"
code_context = """// Original Path: LLD-Questions-master/ParkingLot/ParkingLot/Entities/Vehicle.cs

\ufeffusing System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace ParkingLot.Entities
{
    public class Vehicle
    {
        private readonly int id;
        public readonly string name;
        private readonly VehicleType type;
        public Vehicle(int id, string name, VehicleType type)
        {
            this.id = id;
            this.name = name;
            this.type = type;
        }
        public VehicleType GetType()
        {
            return type;
        }
        public string GetName()
        {
            return name;
        }

    }
}
"""


# Create a question-answering pipeline
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Get the answer using the pipeline
answer = qa_pipeline(question=new_question_text, context=code_context)

# Print the answer
answer

# Note: The output format of the answer depends on the pipeline and the model's capabilities.
# For a 'question-answering' pipeline, it typically returns a dictionary with the 'answer',
# 'start', 'end', and 'score'.

Device set to use cpu


{'score': 2.516050881240517e-05, 'start': 21, 'end': 22, 'answer': '-'}

# Fine tuning using LoRA

In [55]:

peft_config = LoraConfig(task_type="QUESTION_ANS",
                        r=2,
                        lora_alpha=32,
                        lora_dropout=0.01)

In [56]:
peft_config

LoraConfig(task_type='QUESTION_ANS', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=2, target_modules=None, exclude_modules=None, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [57]:

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 75,266 || all params: 124,131,844 || trainable%: 0.0606


In [58]:
# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 10

In [67]:
training_args = TrainingArguments(
    output_dir= model_name + "-lora-qa",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    save_strategy="epoch"
)

In [68]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
)

# train model
trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss


TrainOutput(global_step=10, training_loss=3.841696929931641, metrics={'train_runtime': 201.4189, 'train_samples_per_second': 0.149, 'train_steps_per_second': 0.05, 'total_flos': 7845839216640.0, 'train_loss': 3.841696929931641, 'epoch': 10.0})

In [69]:
model

PeftModelForQuestionAnswering(
  (base_model): LoraModel(
    (model): RobertaForQuestionAnswering(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.01, inplace=False)
                    )
                    (lora_A): ModuleDict(


In [70]:
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Get the answer using the pipeline
answer = qa_pipeline(question=new_question_text, context=code_context)
answer

Device set to use cpu


{'score': 3.368990292074159e-05, 'start': 144, 'end': 148, 'answer': 'Linq'}

In [71]:
new_question_text

'What is the purpose of the GetName method in the Vehicle class?'