In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
!nvidia-smi

Wed Mar 12 13:33:04 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.124.04             Driver Version: 570.124.04     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A5000               Off |   00000000:01:00.0 Off |                  Off |
| 30%   30C    P8             16W /  230W |    3146MiB /  24564MiB |      2%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
import os

if 'notebooks' in os.getcwd():
    os.chdir("..")


from datasets import load_dataset, Dataset
from huggingface_hub import notebook_login
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
import time
import csv
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import aiohttp
import asyncio
from tqdm.asyncio import tqdm
import torch
import scienceplots
plt.style.use(['science', 'no-latex'])
from IPython.display import clear_output

from src.text_utils import trim_text_to_token_limit
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from langdetect import detect

from src.train_test_split import stratified_train_test_split

import evaluate


tqdm.pandas()

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


# LLM as a Judge

In this notebook, after generating all target summaries using Qwen 0.5B Instruct, Llama 1B Instruct, and Qwen 3B Instruct, we will evaluate the results using a larger Llama 8B Instruct model as an LLM judge.

In [5]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", cache_dir = '/Data')

In [6]:
torch.cuda.set_per_process_memory_fraction(0.6, device=0)


In [7]:
base_data = pd.read_json("data/wikipedia_dataset.json")

In [8]:
train_df , temp_df = stratified_train_test_split(base_data, test_size=0.4)
val_df , test_df = stratified_train_test_split(temp_df, test_size=0.5)
train_idx = train_df.id.tolist()
val_idx = val_df.id.tolist()
test_idx = test_df.id.tolist()

In [9]:
base_path = "data/generated_dataset_test_100_qwen-0.5b-instruct-summary-pt-rank{lora_rank}.pkl"

In [10]:
all_df = []
raw_model_df = pd.read_pickle("data/generated_dataset_Qwen2.5-0.5B-Instruct.pkl")
raw_model_df['model_name'] = "Qwen-0.5B-Instruct"

all_df.append(raw_model_df)

raw_model_df = pd.read_pickle("data/generated_dataset_test_100_Llama-3.2-1B-Instruct-bnb-4bit.pkl")
raw_model_df['model_name'] = "Llama-1B-Instruct"

all_df.append(raw_model_df)

raw_model_df = pd.read_pickle("data/generated_dataset_test_100_Qwen2.5-3B-Instruct-unsloth-bnb-4bit.pkl")
raw_model_df['model_name'] = "Qwen-3B-Instruct"

all_df.append(raw_model_df)


for lora_rank in [64]:
    temp = pd.read_pickle(base_path.format(lora_rank = lora_rank))
    
    temp['model_name'] = "Finetuned Model"
    all_df.append(temp)

In [15]:
reference_summary = pd.read_pickle("data/generated_dataset_100_Meta-Llama-3.1-8B-Instruct-bnb-4bit_2.pkl")\
    .rename(columns = {'generated_text': 'reference_summary'})


In [17]:
temp_df = pd.concat(all_df, ignore_index=True)

In [18]:
base_data['text'].apply(lambda x: len(x.split()))

4629     1051
691      1527
901      1593
1465      376
48       1446
         ... 
4079     1193
2669      919
18606     620
12889     522
45917     595
Name: text, Length: 5000, dtype: int64

In [22]:
df = pd.merge(
    base_data[['id', 'text']],
    temp_df,
    on='id'
).query(f"id in {test_idx}")

df = pd.merge(df, reference_summary, on='id')

In [23]:
df

Unnamed: 0,id,text,generated_text,model_name,reference_summary
0,11194,O século XX iniciou em 1 de janeiro de 1901 e ...,O século XX foi marcado por uma série de avanç...,Qwen-0.5B-Instruct,O século XX foi marcado por grandes mudanças t...
1,11194,O século XX iniciou em 1 de janeiro de 1901 e ...,O século XX foi marcado por avanços tecnológic...,Llama-1B-Instruct,O século XX foi marcado por grandes mudanças t...
2,11194,O século XX iniciou em 1 de janeiro de 1901 e ...,O século XX foi marcado por avanços tecnológic...,Qwen-3B-Instruct,O século XX foi marcado por grandes mudanças t...
3,11194,O século XX iniciou em 1 de janeiro de 1901 e ...,O século XX foi um período de inúmeros avanços...,Finetuned Model,O século XX foi marcado por grandes mudanças t...
4,4771,"Chile (; ; ), oficialmente República do Chile ...","Aqui está um resumo do texto:\n\nChile (; ; ),...",Qwen-0.5B-Instruct,O Chile é um país da América do Sul que ocupa ...
...,...,...,...,...,...
3995,3944,"O Sudão (; ), oficialmente República do Sudão ...","O Sudão é um país africano, limitado por Egito...",Finetuned Model,O Sudão é um país africano localizado no norte...
3996,165327,XviD é um software livre e codec de vídeo MPEG...,XviD é um software livre e codec de vídeo MPEG...,Qwen-0.5B-Instruct,XviD é um software livre e codec de vídeo MPEG...
3997,165327,XviD é um software livre e codec de vídeo MPEG...,XviD é um software livre e codec de vídeo MPEG...,Llama-1B-Instruct,XviD é um software livre e codec de vídeo MPEG...
3998,165327,XviD é um software livre e codec de vídeo MPEG...,XviD é um codec de vídeo código aberto que com...,Qwen-3B-Instruct,XviD é um software livre e codec de vídeo MPEG...


In [24]:
model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"



model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name, # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = 6_000,
    dtype = None,
    load_in_4bit = True,
    fast_inference=True,
    cache_dir = '/Data'
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

INFO 03-12 09:21:30 __init__.py:207] Automatically detected platform cuda.
==((====))==  Unsloth 2025.2.15: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA RTX A5000. Max memory: 23.536 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/meta-llama-3.1-8b-instruct-bnb-4bit with actual GPU utilization = 48.12%
Unsloth: Your GPU has CUDA compute capability 8.6 with VRAM = 23.54 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 6000. Num Sequences = 192.
Unsloth: vLLM's KV Cache can use up to 4.99 GB. Also swap space = 6 GB.
INFO 03-12 09:21:42 config.py:549] This model supports multiple tasks: {'score', 'classify', 'reward', 'generate', 'embed'}. 

tokenizer_config.json:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

INFO 03-12 09:21:46 cuda.py:229] Using Flash Attention backend.
INFO 03-12 09:21:46 model_runner.py:1110] Starting to load model unsloth/meta-llama-3.1-8b-instruct-bnb-4bit...




INFO 03-12 09:21:46 loader.py:1089] Loading weights with BitsAndBytes quantization.  May take a while ...
INFO 03-12 09:21:47 weight_utils.py:254] Using model weights format ['*.safetensors']


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

INFO 03-12 09:23:16 weight_utils.py:270] Time spent downloading weights for unsloth/meta-llama-3.1-8b-instruct-bnb-4bit: 88.939799 seconds


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-12 09:24:07 model_runner.py:1115] Loading model weights took 5.3541 GB
INFO 03-12 09:24:07 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 03-12 09:24:10 worker.py:267] Memory profiling takes 3.37 seconds
INFO 03-12 09:24:10 worker.py:267] the current vLLM instance can use total_gpu_memory (23.54GiB) x gpu_memory_utilization (0.48) = 11.33GiB
INFO 03-12 09:24:10 worker.py:267] model weights take 5.35GiB; non_torch_memory takes 0.05GiB; PyTorch activation peak memory takes 0.93GiB; the rest of the memory reserved for KV Cache is 4.99GiB.
INFO 03-12 09:24:11 executor_base.py:111] # cuda blocks: 2554, # CPU blocks: 3072
INFO 03-12 09:24:11 executor_base.py:116] Maximum concurrency for 6000 tokens per request: 6.81x
INFO 03-12 09:24:13 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occ

Capturing CUDA graph shapes: 100%|██████████| 27/27 [00:20<00:00,  1.35it/s]

INFO 03-12 09:24:33 model_runner.py:1562] Graph capturing finished in 20 secs, took 0.68 GiB
INFO 03-12 09:24:33 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 25.90 seconds





In [39]:
base_prompt = '''
    Você é um assistente útil que classifica resumos.  
    Fornecerei o texto e dois resumos (0 e 1) de aproximadamente 100 palavras desse texto em português.  

    Você deve indicar qual deles é o melhor resumo, com base tanto na qualidade do resumo quanto na qualidade do texto em português e no tamanho do texto (deve ter aproximadamente 100 palavras).

    Aqui está o texto:  
    <text>  
    {text}  
    </text>  

    Aqui está o resumo 0:  
    <0>  
    {summary_0}  
    </0>  

    Aqui está o resumo 1:  
    <1>  
    {summary_1}  
    </1>  

    Responda no seguinte formato (JSON):  
    
    {{
        "best_summary": (0 ou 1),
        "explanation": "uma breve explicação do porquê."
    }}
    
'''

In [42]:


tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

FastLanguageModel.for_inference(model)

generated = []

for _, group in tqdm(df.groupby("id"), total = len(df)//4):

    finetuned_summary = group.query("model_name == 'Finetuned Model'")

    text_finetune = finetuned_summary['generated_text'].item()

    for idx, row in group.iterrows():
        if row['model_name'] == 'Finetuned Model':
            continue
        text_model = row['generated_text']


        index_of_finetune = int(np.random.random() > 1/2)
    
        shuffling_dict = {
            index_of_finetune: text_finetune,
            1 - index_of_finetune: text_model
        }

        inverse_shuffling_map = {
            index_of_finetune: "Finetuned Model",
            1 - index_of_finetune: row['model_name']
        }

        prompt = base_prompt.format(
            text = finetuned_summary["text"].item(),
            summary_0 = shuffling_dict[0],
            summary_1 = shuffling_dict[1]
        )

        message = [{'role': 'user', 'content': prompt}]
        inputs = tokenizer.apply_chat_template(
            message,
            tokenize = True,
            add_generation_prompt = True, # Must add for generation
            return_tensors = "pt",
        ).to("cuda")

        generated_ids = model.generate(inputs, max_new_tokens = 500)

        generated_text = tokenizer.decode(generated_ids[0, inputs.shape[1]:]).split("<|eot_id|>")[0]

        print(index_of_finetune)
        print(row['model_name'])
        print(generated_text)

        try:
            generated_json = json.loads(generated_text)
            generated_json['best_summary'] = inverse_shuffling_map[generated_json['best_summary']]
            generated_json["index_of_modified"] = index_of_finetune

            new_row = {
                'id': row['id'],
                'model': row['model_name'],
                'winner': generated_json['best_summary'],
                'explanation': generated_json['explanation']
            }

            print(new_row)

            generated.append(new_row)

        except Exception as e:
            print(f"Error processing LLM: {e}")


        
    # break
    clear_output()
    # print(generated_text)

  3%|▎         | 34/1000 [06:42<3:10:23, 11.83s/it]


KeyboardInterrupt: 

In [43]:
pd.DataFrame(generated)

Unnamed: 0,id,model,winner,explanation
0,228,Qwen-0.5B-Instruct,Finetuned Model,O resumo 0 fornece uma visão mais clara e conc...
1,228,Llama-1B-Instruct,Finetuned Model,"O resumo 1 é mais preciso e detalhado, abordan..."
2,228,Qwen-3B-Instruct,Finetuned Model,O resumo 1 fornece uma visão mais completa e p...
3,230,Qwen-0.5B-Instruct,Qwen-0.5B-Instruct,"O resumo 1 é mais completo e preciso, abordand..."
4,230,Llama-1B-Instruct,Llama-1B-Instruct,"O resumo 1 é mais completo e preciso, abordand..."
...,...,...,...,...
97,745,Llama-1B-Instruct,Finetuned Model,"O resumo 1 é mais completo e preciso, abordand..."
98,745,Qwen-3B-Instruct,Qwen-3B-Instruct,"O resumo 1 é mais preciso e conciso, capturand..."
99,752,Qwen-0.5B-Instruct,Qwen-0.5B-Instruct,"O resumo 0 é mais preciso e completo, abordand..."
100,752,Llama-1B-Instruct,Llama-1B-Instruct,O resumo 1 é mais conciso e eficaz em capturar...


In [25]:
def get_winner(x):
    return (x['winner'] == "Finetuned Model").mean()
pd.read_pickle("data/win_rates.pkl").groupby("model").apply(get_winner)

model
Llama-1B-Instruct     0.511202
Qwen-0.5B-Instruct    0.607254
Qwen-3B-Instruct      0.222680
dtype: float64