In [1]:
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [2]:
!pip install --upgrade bitsandbytes
!pip install rouge_score
!pip install nltk
!pip install bert_score
!pip install peft

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=974bca499649a06d96c41b9a023f6f1d8a8f165f5187fb4d2e8aa969106f677f
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully ins

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import transformers
from peft import PeftModel

import torch
import pandas as pd
import random

from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from bert_score import score as bert_score

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Define paths
base_model_path = "meta-llama/Meta-Llama-3-8B"  # Path to your base model
#model_id = "meta-llama/Meta-Llama-3-8B"
#model_id = "mistralai/Mistral-7B-v0.1"
lora_weights_path = "/content/drive/MyDrive/fine_tuned_model/llama-8b"  # Path to your fine-tuned LoRA weights

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    trust_remote_code=True,   # Allow downloading of custom code files if needed
    device_map="auto"         # Automatically map model layers to available devices, can be omitted if using .to("cuda")
).to("cuda")

# Load the tokenizer from your fine-tuned model path instead of the base model path
tokenizer = AutoTokenizer.from_pretrained(lora_weights_path, local_files_only=True)
#tokenizer = AutoTokenizer.from_pretrained(base_model_path)

# Resize embeddings (Ensure the base model has the same vocab size as your fine-tuned model)
base_model.resize_token_embeddings(len(tokenizer))

# Apply the LoRA weights to the base model
model = PeftModel.from_pretrained(
    base_model,
    lora_weights_path,
    local_files_only=True
).to("cuda")

print("Fine-tuned model loaded successfully!")


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

Fine-tuned model loaded successfully!


# Evaluate 5 question manually

In [6]:
# Set the model to evaluation mode
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.3, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Lin

In [7]:
# Load the evaluation dataset
df = pd.read_csv('/content/medquad.csv')
# Remove rows with NaN values in the 'answer' column
df = df.dropna(subset=['answer'])
# Strip whitespace and convert to lowercase for both the dataset and selected questions
df['question'] = df['question'].str.strip().str.lower()
selected_questions = [
    "what is (are) paget's disease of bone ?",
    "what are the treatments for balance problems ?",
    "what are the treatments for hearing loss ?",
    "how to diagnose osteoporosis ?",
    "what causes dry mouth ?",
]

# Filter using standardized question list
evaluation_questions = df[df['question'].isin(selected_questions)][['question', 'answer']]
# Display the resulting filtered DataFrame
evaluation_questions


Unnamed: 0,question,answer
33,what is (are) paget's disease of bone ?,Enlarged and Misshapen Bones Paget's disease o...
37,what is (are) paget's disease of bone ?,Paget's disease of bone is a disease that caus...
151,what are the treatments for balance problems ?,Your doctor can recommend strategies to help r...
160,what are the treatments for balance problems ?,In BPPV (benign paroxysmal positional vertigo)...
161,what are the treatments for balance problems ?,Mnire's disease is caused by changes in fluid ...
196,what causes dry mouth ?,People get dry mouth when the glands in the mo...
200,what causes dry mouth ?,"Dry mouth can cause several problems, includin..."
202,what causes dry mouth ?,"Yes. More than 400 medicines, including some o..."
204,what causes dry mouth ?,Some diseases affect the salivary glands. Sjgr...
213,how to diagnose osteoporosis ?,Who Should Be Tested? The United States Preven...


In [8]:
# prompt: select 5 unique questions from evaluation_questions

# Select 5 unique random questions from the evaluation_questions DataFrame
selected_rows = evaluation_questions.sample(n=5)

# Print the selected questions
print(selected_rows)


                                           question  \
37          what is (are) paget's disease of bone ?   
226                  how to diagnose osteoporosis ?   
161  what are the treatments for balance problems ?   
204                         what causes dry mouth ?   
293      what are the treatments for hearing loss ?   

                                                answer  
37   Paget's disease of bone is a disease that caus...  
226  Diagnosing osteoporosis involves several steps...  
161  Mnire's disease is caused by changes in fluid ...  
204  Some diseases affect the salivary glands. Sjgr...  
293  Assistive Listening Devices  Assistive listeni...  


In [9]:
def balanced_generation(question, max_length=256):
    prompt = question.strip()
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Generate output using the model with beam search
    output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        num_beams=5,                # Use beam search for better quality control
        eos_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.8,      # Increase repetition penalty
        no_repeat_ngram_size=3,      # Prevent repetition of 3-word sequences
    )

    # Decode the output
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    return answer.strip()


In [10]:
# Generate answers for evaluation questions
selected_rows['generated_answer'] = selected_rows['question'].apply(balanced_generation)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [11]:
# prompt: print out all the rows in evaluations_questions in the following format: "question" first, "generated_answer" in new line, "answer" in last line, with each set print out seperate by line
for index, row in selected_rows.iterrows():
  print(f"User: {row['question']}\n")
  print(f"Generated Answer: {row['generated_answer']}\n")
  print(f"Actual Answer: {row['answer']}\n")
  print("-"*50)

User: what is (are) paget's disease of bone ?

Generated Answer: what is (are) paget's disease of bone? a rare disorder that weakens the bones, making them fragile and more likely to break. The condition can affect any bone in the body, but it occurs most often in the spine, pelvis, hips, legs, and arms.

Actual Answer: Paget's disease of bone is a disease that causes affected bones to become enlarged and misshapen. Our bones are living tissue, and our bodies are constantly breaking down old bone and replacing it with new bone. In Paget's disease, however, old bone is broken down and replaced at a faster rate than normal. The new bone is larger and weaker than normal bone.

--------------------------------------------------
User: how to diagnose osteoporosis ?

Generated Answer: how to diagnose osteoporosis?osteoporosis is a condition in which the bones become thin and weak, making them more likely to break. Osteoporosis can affect any bone, but it is most common in the spine, hip, and

# Evaluate 200 Question after Fine-Tuning

In [12]:
# Choose a fixed random seed for reproducibility
random.seed(42)
# Display the evaluation_subset DataFrame to check generated answers
num_questions = 200  # Adjust this number as needed
evaluation_subset = df.sample(n=num_questions, random_state=42).reset_index(drop=True)

In [13]:
# Apply the generate_answer function to the selected subset
evaluation_subset['generated_answer'] = evaluation_subset['question'].apply(balanced_generation)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

In [14]:
# Save the relevant columns to a CSV file
evaluation_subset[['question','answer', 'generated_answer']].to_csv('generated_answers.csv', index=False)

print("CSV file saved successfully as 'generated_answers.csv'.")

CSV file saved successfully as 'generated_answers.csv'.


In [15]:
from google.colab import files
files.download('generated_answers.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
# Initialize the ROUGE scorer
rouge = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

# Function to calculate scores
def calculate_scores(row):
    reference = row['answer']
    generated = row['generated_answer']

    # Calculate BLEU score
    bleu = sentence_bleu([reference.split()], generated.split())

    # Calculate ROUGE scores
    rouge_scores = rouge.score(reference, generated)

    # Calculate BERTScore (useful for semantic similarity)
    P, R, F1 = bert_score([generated], [reference], lang="en", rescale_with_baseline=True)

    return pd.Series({
        'bleu_score': bleu,
        'rouge1_f1': rouge_scores['rouge1'].fmeasure,
        'rougeL_f1': rouge_scores['rougeL'].fmeasure,
        'bert_score_f1': F1.mean().item()
    })

# Apply scoring function to the evaluation subset
evaluation_subset[['bleu_score', 'rouge1_f1', 'rougeL_f1', 'bert_score_f1']] = evaluation_subset.apply(calculate_scores, axis=1)

# Display the evaluation scores
print(evaluation_subset[['bleu_score', 'rouge1_f1', 'rougeL_f1', 'bert_score_f1']].describe())

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Some weights of RobertaModel were

          bleu_score   rouge1_f1   rougeL_f1  bert_score_f1
count   2.000000e+02  200.000000  200.000000     200.000000
mean    2.577280e-02    0.293685    0.189220       0.036615
std     9.573634e-02    0.123133    0.110257       0.216454
min    2.551458e-238    0.025397    0.019048      -0.678167
25%    3.079092e-155    0.211924    0.136737      -0.108641
50%     1.017314e-78    0.283521    0.164644       0.041847
75%     1.026008e-02    0.351122    0.212315       0.169310
max     6.655385e-01    0.836735    0.820000       0.705727


In [17]:
average_bleu = evaluation_subset['bleu_score'].mean()
average_rouge1_f1 = evaluation_subset['rouge1_f1'].mean()
average_rougeL_f1 = evaluation_subset['rougeL_f1'].mean()
average_bert_score_f1 = evaluation_subset['bert_score_f1'].mean()

print(f"Average BLEU Score: {average_bleu}")
print(f"Average ROUGE-1 F1 Score: {average_rouge1_f1}")
print(f"Average ROUGE-L F1 Score: {average_rougeL_f1}")
print(f"Average BERTScore F1: {average_bert_score_f1}")

Average BLEU Score: 0.02577280044594272
Average ROUGE-1 F1 Score: 0.2936852639763691
Average ROUGE-L F1 Score: 0.18921986811677727
Average BERTScore F1: 0.03661496480621281




```
`# This is formatted as code`
```

