# L 42 Evaluating the fine-tuned model

In [51]:
import torch

In [52]:
# Prefer CUDA, then Apple's MPS backend, and fall back to the CPU otherwise.
# The checks short-circuit, so MPS is only probed when CUDA is unavailable.
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

In [53]:
# Force CPU execution, overriding whatever device was auto-detected earlier.
device = "cpu"

In [54]:
from model_with_feature_classes_1 import GPTModel
import helper_functions_for_finetuning
from helper_functions_for_finetuning import InstructionDataset, format_input, custom_collate_fn
import tiktoken
import requests
import os
import json
import urllib
import ssl

In [55]:
# Tokenizer used to encode prompts and decode generated token ids.
tokenizer = tiktoken.get_encoding(encoding_name="gpt2")

In [56]:
# Architecture hyper-parameters for each GPT-2 size.
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Size-independent settings first, followed by the entries of the selected
# size (gpt2-large), so the key order matches an in-place update.
BASE_CONFIG = {
    "vocab_size": 50257,
    "context_length": 1024,
    "drop_rate": 0.1,
    "qkv_bias": True,
    **model_configs["gpt2-large (774M)"],
}

In [57]:
# Rebuild the architecture and restore the fine-tuned weights from disk.
model = GPTModel(BASE_CONFIG)
model.load_state_dict(
    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all a state dict needs and avoids executing arbitrary pickled
    # code. The result is passed straight through so that no second copy of
    # the weights stays referenced.
    torch.load("gpt2-large774M-sft.pt", map_location=device, weights_only=True)
)
# Keep the parameters on the same device the prompts are moved to; without
# this the notebook only works when `device` is "cpu".
model.to(device)
# Switch to inference mode; as the last expression this also shows the model.
model.eval()

GPTModel(
  (token_emb): Embedding(50257, 1280)
  (pos_emb): Embedding(1024, 1280)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_query): Linear(in_features=1280, out_features=1280, bias=True)
        (W_key): Linear(in_features=1280, out_features=1280, bias=True)
        (W_value): Linear(in_features=1280, out_features=1280, bias=True)
        (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (feedforw): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=1280, out_features=5120, bias=True)
          (1): GELU()
          (2): Linear(in_features=5120, out_features=1280, bias=True)
        )
      )
      (Layernorm1): LayerNorm()
      (Layernorm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attn): MultiHeadAttention(
       

In [58]:
# Fetch the helper module `previous_chapters.py` into the working directory
# unless it is already there.
url_orig = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/refs/heads/main/ch07/01_main-chapter-code/previous_chapters.py"

# Check and write the same location: the current working directory, which is
# where a relative open() call would put the file.
location = os.getcwd()
file_name = "previous_chapters.py"

full_path = os.path.join(location, file_name)

if os.path.exists(full_path):
    print("file already exists.")
else:
    # Only hit the network when the file is actually missing, and fail loudly
    # on an HTTP error instead of saving an error page as a Python module.
    res_orig = requests.get(url_orig, timeout=30)
    res_orig.raise_for_status()
    with open(full_path, "wb") as file:
        file.write(res_orig.content)

file already exists.


In [59]:
def download_dataset(file_path, url, verify_ssl=False):
  """Load a JSON dataset from `file_path`, downloading it from `url` if missing.

  Args:
    file_path: Local path of the JSON file; doubles as the download cache.
    url: Location to fetch the file from when `file_path` does not exist.
    verify_ssl: When False (the default, matching the previous behaviour)
      certificate and hostname verification are disabled for the download.
      Pass True to use the default verified TLS context.

  Returns:
    The object produced by parsing the file's JSON content.

  Raises:
    json.JSONDecodeError: If the cached or downloaded content is not valid
      JSON. A failed download is not written to `file_path`.
  """
  # `import urllib` alone does not guarantee the `request` submodule is loaded.
  import urllib.request

  if os.path.exists(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
      return json.load(file)

  ssl_context = ssl.create_default_context()
  if not verify_ssl:
    # NOTE(security): verification is switched off, so the downloaded bytes
    # are not authenticated. Prefer verify_ssl=True where the local
    # certificate store allows it.
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE

  with urllib.request.urlopen(url, context=ssl_context) as response:
    text_data = response.read().decode("utf-8")

  # Parse before caching so an error page or truncated download never ends up
  # on disk, where it would make every later call fail.
  data = json.loads(text_data)

  with open(file_path, "w", encoding="utf-8") as file:
    file.write(text_data)

  return data

# Remote source and local cache file of the instruction dataset.
url_instr = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/refs/heads/main/ch07/01_main-chapter-code/instruction-data.json"
file_path = "instruction-data.json"

data = download_dataset(file_path=file_path, url=url_instr)
print(len(data))

1100


In [60]:
# Split sizes: 85% train, 10% test, and whatever remains for validation.
train_part, test_part = int(len(data) * 0.85), int(len(data) * 0.1)
valid_part = len(data) - (train_part + test_part)

# Consecutive, non-overlapping slices in the order train -> test -> validation.
train_data, test_data, valid_data = (
    data[:train_part],
    data[train_part:train_part + test_part],
    data[train_part + test_part:],
)

print(len(train_data), len(test_data), len(valid_data), sep="\n")

935
110
55


In [61]:
import previous_chapters
from previous_chapters import generate, token_ids_to_text, text_to_token_ids
from functools import partial
from torch.utils.data import DataLoader

In [62]:
# Collate function with the target device and maximum length pre-bound.
customized_collate_fn = partial(
    custom_collate_fn,
    allowed_max_length=1024,
    device=device,
)

In [63]:
torch.manual_seed(123)

# Spot-check: generate and print a response for the first three test entries.
for entry in test_data[:3]:
    input_text = format_input(entry)
    prompt_ids = text_to_token_ids(input_text, tokenizer).to(device)

    token_ids = generate(
        model=model,
        idx=prompt_ids,
        max_new_tokens=256,
        context_size=BASE_CONFIG["context_length"],
        eos_id=50256,
    )
    generated_text = token_ids_to_text(token_ids, tokenizer)

    # Drop the first len(input_text) characters (presumably the echoed prompt)
    # and the response header, leaving only the model's answer.
    completion = generated_text[len(input_text):]
    response_text = completion.replace("### Response:", "").strip()

    print(input_text)
    print(f"\nCorrect Response:\n>> {entry['output']}")
    print(f"\nModel Response:\n>> {response_text}")
    print("-------")

Below is an instruction that describes a task. Write a response that appropiately completes the request.

### Instruction
Rewrite the sentence using a simile.

### Input:
The car is very fast.

Correct Response:
>> The car is as fast as lightning.

Model Response:
>> The car is as fast as a cheetah.
-------
Below is an instruction that describes a task. Write a response that appropiately completes the request.

### Instruction
What type of cloud is typically associated with thunderstorms?

Correct Response:
>> The type of cloud typically associated with thunderstorms is cumulonimbus.

Model Response:
>> A thunderstorm is a type of cloud.
-------
Below is an instruction that describes a task. Write a response that appropiately completes the request.

### Instruction
Name the author of 'Pride and Prejudice'.

Correct Response:
>> Jane Austen.

Model Response:
>> The author of 'Pride and Prejudice' is Jane Austen.
-------


In [65]:
# Pinned (commit-hash) URL of a reference copy of the test set with responses.
url_resp = (
    "https://raw.githubusercontent.com/JohnMachado11/"
    "Build-a-Large-Language-Model-from-Scratch/"
    "2c14e7960c7051e3d563e17fb092e9c11d36b46b/"
    "07_Fine_tuning_for_instructions/"
    "7.8_Evaluating_the_fine_tuned_LLM/"
    "instruction-data-with-response.json"
)
res_resp = requests.get(url=url_resp)

In [None]:
from tqdm import tqdm
# Generate a response for every test entry and attach it to the entry in place.
# The recorded run took about 80 minutes for the 110 entries.
for i, entry in tqdm(enumerate(test_data), total=len(test_data)):
    input_text = format_input(entry)

    token_ids = generate(
        model=model,
        # Move the prompt to `device`, as the three-entry spot-check loop does;
        # otherwise this cell breaks as soon as `device` is not "cpu".
        idx=text_to_token_ids(input_text, tokenizer).to(device),
        max_new_tokens=256,
        context_size=BASE_CONFIG["context_length"],
        eos_id=50256
    )

    generated_text = token_ids_to_text(token_ids, tokenizer)
    response_text = generated_text[len(input_text):].replace("### Response:", "").strip()

    # Store under the correctly spelled key used by the reference JSON file.
    test_data[i]["model_response"] = response_text
    # The misspelled key is kept as well because the scoring cell further down
    # reads entry['model_respnse'].
    test_data[i]["model_respnse"] = response_text

#with open("instruction-data-with-response.json", "w") as file:
    #json.dump(test_data, file, indent=4)

# Load (downloading if needed) the reference file; as the last expression of
# the cell its content is displayed, but it is not assigned to a variable.
file_path = "instruction-data-with-response.json"
download_dataset(file_path, url_resp)

100%|██████████| 110/110 [1:20:52<00:00, 44.11s/it]


[{'instruction': 'Rewrite the sentence using a simile.',
  'input': 'The car is very fast.',
  'output': 'The car is as fast as lightning.',
  'model_response': 'The car is as fast as a cheetah.'},
 {'instruction': 'What type of cloud is typically associated with thunderstorms?',
  'input': '',
  'output': 'The type of cloud typically associated with thunderstorms is cumulonimbus.',
  'model_response': 'The type of cloud associated with thunderstorms is a cumulus cloud.'},
 {'instruction': "Name the author of 'Pride and Prejudice'.",
  'input': '',
  'output': 'Jane Austen.',
  'model_response': "The author of 'Pride and Prejudice' is Jane Austen."},
 {'instruction': 'What is the periodic symbol for chlorine?',
  'input': '',
  'output': 'The periodic symbol for chlorine is Cl.',
  'model_response': 'The periodic symbol for chlorine is CH3.'},
 {'instruction': 'Correct the punctuation in the sentence.',
  'input': 'Its time to go home.',
  'output': "The corrected sentence should be: '

In [64]:
import psutil
def check_if_running(process_name):
  """Return True if any running process has `process_name` in its name.

  Args:
    process_name: Substring to look for (case-sensitive) in process names.

  Returns:
    True as soon as one matching process is found, otherwise False.
  """
  return any(
    # psutil reports None for a name it cannot read (e.g. access denied);
    # treat that as an empty name instead of raising TypeError on `in`.
    process_name in (proc.info["name"] or "")
    for proc in psutil.process_iter(["name"])
  )

# Stop here unless a process whose name contains "Ollama" is running.
ollama_running = check_if_running("Ollama")

if ollama_running:
  print("Ollama running:", check_if_running("Ollama"))
else:
  raise RuntimeError("Ollama not running. Launch ollama before proceeding.")

Ollama running: True


In [71]:
import urllib.request

def query_model(
        prompt,
        model="llama3",
        url="http://localhost:11434/api/chat"
):
    """Send `prompt` as a single user message to a chat endpoint and return the reply.

    The request fixes the seed (123), temperature (0) and context size (2048).
    The response is read line by line; every non-empty line is parsed as JSON
    and the `message.content` fragments are concatenated.

    Args:
        prompt: Text of the user message.
        model: Model name placed in the request body.
        url: Chat endpoint that receives the POST request.

    Returns:
        The concatenated reply text.

    Raises:
        RuntimeError: If a response line carries an "error" field.
    """
    data = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "options": {
            "seed": 123,
            "temperature": 0,
            "num_ctx": 2048
        }
    }

    payload = json.dumps(data).encode("utf-8")

    request = urllib.request.Request(
        url,
        data=payload,
        method="POST"
    )
    request.add_header("Content-Type", "application/json")

    fragments = []
    with urllib.request.urlopen(request) as response:
        for raw_line in response:
            line = raw_line.decode("utf-8").strip()
            if not line:
                # A blank line carries no JSON; skip it rather than crash.
                continue
            response_json = json.loads(line)
            if "error" in response_json:
                # Report the server's message instead of failing later with an
                # opaque KeyError on the missing "message" field.
                raise RuntimeError(f"Model server returned an error: {response_json['error']}")
            fragments.append(response_json["message"]["content"])

    return "".join(fragments)

In [73]:
# Quick check that the judge model answers. A dedicated name is used so the
# fine-tuned GPTModel bound to `model` earlier in the notebook is not replaced
# by a string, which would break any re-run of the generation cells.
judge_model = "llama3"
result = query_model("Where do cheetahs live?", judge_model)
print(result)

Cheetahs are found in the wild in various parts of Africa and a small portion of Iran. Here's a breakdown of their geographic range:

1. **Africa**: Cheetahs are widely distributed across sub-Saharan Africa, including:
	* South Africa: Kruger National Park, Pilanesberg Game Reserve, and other protected areas.
	* Namibia: Etosha National Park, Namib-Naukluft National Park, and other game reserves.
	* Botswana: Chobe National Park, Okavango Delta, and other national parks and game reserves.
	* Kenya: Masai Mara National Reserve, Amboseli National Park, and other protected areas.
	* Tanzania: Serengeti National Park, Ngorongoro Conservation Area, and other national parks and game reserves.
	* Mozambique: Gorongosa National Park and other protected areas.
2. **Iran**: A small population of cheetahs is found in the central plateau region of Iran, particularly in:
	* Kavir National Park
	* Yazd Province

It's worth noting that cheetahs are an endangered species, and their populations are dec

In [79]:
# Ask the judge model to score the first three generated responses.
for entry in test_data[:3]:
    # The generation loop stores responses under the misspelled key
    # 'model_respnse'; use the correctly spelled key when it is present.
    model_response = (
        entry["model_response"] if "model_response" in entry
        else entry["model_respnse"]
    )

    prompt = (
        f"Given the input {format_input(entry)} "
        f"and correct output {entry['output']}, "
        f"score the model response {model_response} "
        f" on a scale from 0 to 100, where 100 is the best score. "
    )

    print("\nDataset response:")
    print(f">> {entry['output']}")
    print("\nModel response:")
    print(f">> {model_response}")
    print("\nScore:")
    # A stray "f" inside the string literal used to print "f>> ".
    print(f">> {query_model(prompt)}")
    print("\n-------------")


Dataset response:
>> The car is as fast as lightning.

Model response:
>> The car is as fast as a cheetah.

Score:
f>> I'd be happy to help!

Here's my attempt at rewriting the sentence using a simile:

The car is as fast as lightning.

And I'll score The car is as fast as a cheetah. on a scale from 0 to 100, where 100 is the best score.

Score: 60

Reasoning: While "as fast as a cheetah" is a good simile, it's not as vivid or unexpected as "as fast as lightning". Lightning is often associated with speed and suddenness, making it a more evocative comparison.

-------------

Dataset response:
>> The type of cloud typically associated with thunderstorms is cumulonimbus.

Model response:
>> A thunderstorm is a type of cloud.

Score:
f>> ### Model Response
The type of cloud typically associated with thunderstorms is cumulonimbus.

### Score: 90

Reasoning:

* The model response accurately answers the question by specifying that cumulonimbus clouds are typically associated with thunderstor

In [None]:
# Disabled draft: the whole function below is wrapped in a triple-quoted
# string, so running this cell defines nothing and only evaluates a string.
# As written, it would build one scoring prompt per entry (asking for an
# integer-only reply), send it through query_model, and collect the replies
# that convert to int, printing and skipping the ones that do not.
"""def generate_model_scores(json_data, json_key, model="llama3"):
    scores = []
    for entry in tqdm(json_data, desc="Scoring entries"):
        prompt = (
            f"Given the input {format_input(entry)} "
            f"and correct output {entry['output']}, "
            f"score the model response {entry[json_key]}"
            f" on a scale from 0 to 100, where 100 is the best score. "
            f"Respond with integer number only."
        )
        score = query_model(prompt, model)
        try:
            scores.append(int(score))
        except ValueError:
            print(f"Could not convert score: {score}")
            continue
    return scores"""