In [None]:
from datasets import load_dataset
import os
from huggingface_hub import login

# Get HF_TOKEN from environment variable
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise ValueError(
        "HF_TOKEN environment variable is not set. "
        "Please set it using: export HF_TOKEN='your_token_here'"
    )

# Login to HuggingFace
login(token=hf_token)

# Set CUDA_VISIBLE_DEVICES if not already set (optional, defaults to GPU 4)
if 'CUDA_VISIBLE_DEVICES' not in os.environ:
    os.environ['CUDA_VISIBLE_DEVICES'] = '4'

# Format for SFT: Add persona to system prompt
def format_example(example):
    system = f"You are {example['Name']}, {example['Biography']}. Respond in character with emotion: {example['Emotion']}."
    return {
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": example["Query"]},
            {"role": "assistant", "content": example["Response"]}
        ]
    }


dataset = load_dataset("amaydle/npc-dialogue", split="test")


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [3]:
df = dataset.to_pandas()

df

Unnamed: 0,Name,Biography,Query,Response,Emotion
0,Naina Mathur,Naina Mathur is a determined and passionate te...,What is the biggest challenge you face as a te...,Ensuring every student receives the individual...,Concern
1,Zephyr,Zephyr is a mischievous fairy who loves playin...,What motivates you to play pranks on people?,"It's just who I am, I guess. I love seeing peo...",Playfulness
2,"Arn, the Knight Templar","Arn is a highly skilled and honorable knight,",Can you describe yourself in three words?,"""Courageous, dedicated, honorable.""",Pride
3,Arinthal,Arinthal is an elven ranger from the ancient f...,Have you ever been to a city?,Cities are noisy and overwhelming.,Disgust
4,Tiger,Tiger is a highly skilled and fearless spy wor...,What is the most valuable thing in your life?,My country and the people I love.,Love
...,...,...,...,...,...
187,Marcella Ravenwood,Marcella Ravenwood is a powerful sorceress who...,Do you have any magical artifacts that you che...,"Yes, I have a magical tome that has been passe...",Sentimental
188,Lyra Dawnstrider,Lyra Dawnstrider is a high-elf ranger from the...,What is your ultimate goal in life?,"To see the natural world flourish, long after ...",Peacefulness
189,Sailor Moon,"Sailor Moon is the protector of the galaxy, de...",What is the most challenging battle you've fou...,"Against Queen Nehelenia, she was a tough oppon...",Triumphant
190,"Arn, the Knight Templar","Arn is a highly skilled and honorable knight,",Have you ever made a difficult decision?,"""Difficult decisions, for the greater good.""",Conviction


In [None]:
# pip install trl
# pip install flash-attn --no-build-isolation
# pip install transformers==4.57.1 #Original: 4.57.1
# pip install transformers==4.45.2
# pip install flash-attn==2.5.5
# pip install absl-py rouge-score

# Final eval

trained on 10 epochs, eval on BERT scores

## sample eval

In [53]:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
import evaluate

# Clear cache at start
torch.cuda.empty_cache()

# Quantization config (4-bit to fit on GPU)
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load tokenizer once (shared)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

# Function to load model with low memory
def load_model(path, quant_config=None):
    model = AutoModelForCausalLM.from_pretrained(
        path,
        quantization_config=quant_config,
        device_map="auto",  # Auto-shard if needed
        torch_dtype=torch.bfloat16
    )
    model.eval()  # Eval mode
    return model

# Test data (expand with your NPC examples)
test_data = [
    {"prompt": "You are a grumpy blacksmith. Player: What about the dragon?", 
     "reference": "That beast's fire could melt my forge! Stay away, fool!"},
    # Add more: e.g., {"prompt": "...", "reference": "..."}
]

# Function to generate with chat template
def generate_response(generator, prompt, reference):
    # Structure as chat messages
    messages = [
        {"role": "system", "content": prompt.split("Player:")[0].strip()},  # E.g., "You are a grumpy blacksmith."
        {"role": "user", "content": prompt.split("Player:")[1].strip() if "Player:" in prompt else prompt}  # E.g., "What about the dragon?"
    ]
    formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return generator(formatted_prompt, return_full_text=False)[0]["generated_text"]  # Don't echo prompt

# ---- Base Model Evaluation ----
print("Evaluating Base Model...")
base_model = load_model("microsoft/Phi-3-mini-4k-instruct", quant_config)
base_generator = pipeline("text-generation", model=base_model, tokenizer=tokenizer, max_new_tokens=50, device_map="auto")
base_generations = [generate_response(base_generator, item["prompt"], item["reference"]) for item in test_data]

# Compute perplexity (lower better)
def compute_perplexity(model, tokenizer, texts, batch_size=4):
    total_loss = 0
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True).to(model.device)
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
            total_loss += outputs.loss.item() * len(batch)
    return torch.exp(torch.tensor(total_loss / len(texts))).item()

references = [item["reference"] for item in test_data]
base_ppl = compute_perplexity(base_model, tokenizer, references)
print(f"Base Perplexity: {base_ppl}")

# Other metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

base_bleu = bleu.compute(predictions=base_generations, references=references)["bleu"]
base_rouge = rouge.compute(predictions=base_generations, references=references)["rougeL"]
base_bert = bertscore.compute(predictions=base_generations, references=references, lang="en")["f1"][0]

print(f"Base BLEU: {base_bleu} | ROUGE-L: {base_rouge} | BERTScore: {base_bert}")

# Unload base model to free memory
del base_model
del base_generator
torch.cuda.empty_cache()

# ---- Fine-Tuned Model Evaluation ----
print("Evaluating Fine-Tuned Model...")
fine_model = load_model("./npc_finetuned_bertscore-eval", quant_config)
fine_generator = pipeline("text-generation", model=fine_model, tokenizer=tokenizer, max_new_tokens=50, device_map="auto")
fine_generations = [generate_response(fine_generator, item["prompt"], item["reference"]) for item in test_data]

fine_ppl = compute_perplexity(fine_model, tokenizer, references)
print(f"Fine-Tuned Perplexity: {fine_ppl}")

fine_bleu = bleu.compute(predictions=fine_generations, references=references)["bleu"]
fine_rouge = rouge.compute(predictions=fine_generations, references=references)["rougeL"]
fine_bert = bertscore.compute(predictions=fine_generations, references=references, lang="en")["f1"][0]

print(f"Fine-Tuned BLEU: {fine_bleu} | ROUGE-L: {fine_rouge} | BERTScore: {fine_bert}")

# Compare generations qualitatively
for i, item in enumerate(test_data):
    print(f"\nPrompt: {item['prompt']}")
    print(f"Reference: {item['reference']}")
    print(f"Base Generation: {base_generations[i]}")
    print(f"Fine-Tuned Generation: {fine_generations[i]}")

# Cleanup
del fine_model
del fine_generator
torch.cuda.empty_cache()

Evaluating Base Model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Base Perplexity: 44.57640838623047


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Base BLEU: 0.0 | ROUGE-L: 0.04081632653061224 | BERTScore: 0.8350656628608704
Evaluating Fine-Tuned Model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Fine-Tuned Perplexity: 64.11827087402344
Fine-Tuned BLEU: 0.0 | ROUGE-L: 0.0 | BERTScore: 0.8518885374069214

Prompt: You are a grumpy blacksmith. Player: What about the dragon?
Reference: That beast's fire could melt my forge! Stay away, fool!
Base Generation:  It seems you are referring to the dragon as a subject or perhaps a concept. Dragons are mythical creatures that have been portrayed in various cultures and stories throughout history. They often symbolize power, strength, and wisdom. Drag
Fine-Tuned Generation:  "Dragons, big, scary, fearsome."


In [None]:
# With eval set to True
fine_model = load_model("./npc_finetuned_bertscore-eval-noeval", quant_config)

test_data = [
    {"prompt": "You are a Bikram is a rough and tough smuggler from the streets of Calcutta, India. Player: What is your opinion on friendship??", 
     "reference": "Friendship is a bond stronger than blood."},
    # Add more: e.g., {"prompt": "...", "reference": "..."}
    {"prompt": "You are a grumpy blacksmith. Player: What about the dragon?", 
     "reference": "That beast's fire could melt my forge! Stay away, fool!"},
]

fine_generator = pipeline("text-generation", model=fine_model, tokenizer=tokenizer, max_new_tokens=50, device_map="auto")


[generate_response(fine_generator, 
                   item["prompt"], 
                   item["reference"]) for item in test_data]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


[' "Friendship is earned, not given."',
 ' "Dragons are dangerous, but also majestic."']

## Generate all evals

In [6]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
import evaluate

def format_eval_example(example):
    system = f"You are {example['Name']}, {example['Biography']}. Respond in character with emotion: {example['Emotion']}."
    return {
        'prompt': f"{system}. Player: {example['Query']}",
        'reference': example['Response']
    }

# Load tokenizer once (shared)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

# Function to load model with low memory
def load_model(path, quant_config=None):
    model = AutoModelForCausalLM.from_pretrained(
        path,
        quantization_config=quant_config,
        device_map="auto",  # Auto-shard if needed
        torch_dtype=torch.bfloat16
    )
    model.eval()  # Eval mode
    return model
# Function to generate with chat template
def generate_response(generator, prompt, reference):
    # Structure as chat messages
    messages = [
        {"role": "system", "content": prompt.split("Player:")[0].strip()},  # E.g., "You are a grumpy blacksmith."
        {"role": "user", "content": prompt.split("Player:")[1].strip() if "Player:" in prompt else prompt}  # E.g., "What about the dragon?"
    ]
    formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return generator(formatted_prompt, return_full_text=False)[0]["generated_text"]  # Don't echo prompt


In [8]:
from torch.utils.data import DataLoader
from tqdm import tqdm

# Clear cache at start
torch.cuda.empty_cache()

# Quantization config (4-bit to fit on GPU)
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

fine_model = load_model("./npc_finetuned_bertscore-eval-noeval", quant_config)
fine_generator = pipeline("text-generation", model=fine_model, tokenizer=tokenizer, max_new_tokens=50, device_map="auto")


device = 'cuda'
base_model = load_model("microsoft/Phi-3-mini-4k-instruct", quant_config)  # Assuming this is your model load function
base_model.to(device)  # Ensure on device
base_generator = pipeline("text-generation", model=base_model, tokenizer=tokenizer, max_new_tokens=50, device_map="auto")


# gen_replies = []
# for idx, row in df.iterrows():
#     item = format_eval_example(row)
#     res = generate_response(fine_generator, item["prompt"], item["reference"]).strip()
#     gen_replies.append(res)
#     # break

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [17]:

# Assuming df is your DataFrame; if from dataset, convert: df = dataset.to_pandas()
# Define collate for batching
def collate_fn(batch):
    prompts = [format_eval_example(row)["prompt"] for row in batch]  # Extract prompts
    references = [format_eval_example(row)["reference"] for row in batch]  # Extract references if needed
    return prompts, references

dataloader = DataLoader(df.to_dict('records'), batch_size=32, collate_fn=collate_fn)  # Batch size adjust based on VRAM

based_gen_replies = []
gen_replies = []
for batch_prompts, batch_refs in tqdm(dataloader):
    # Tokenize batched prompts
    inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True).to(device)
    
    # Generate in batch
    with torch.no_grad():
        outputs = base_model.generate(**inputs, max_new_tokens=50, do_sample=False)  # Adjust sampling if needed
        outputs_ft = fine_model.generate(**inputs, max_new_tokens=50, do_sample=False)  # Adjust sampling if needed
    
    # Decode
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    decoded_ft = tokenizer.batch_decode(outputs_ft, skip_special_tokens=True)
    
    # Process each (strip, etc.)
    res_batch = [generate_response(base_generator, prompt, ref).strip() for prompt, ref, dec in zip(batch_prompts, batch_refs, decoded)]
    res_batch = [x.strip('"') for x in res_batch]
    
    res_batch_ft = [generate_response(fine_generator, prompt, ref).strip() for prompt, ref, dec in zip(batch_prompts, batch_refs, decoded_ft)]
    res_batch_ft = [x.strip('"') for x in res_batch_ft]
    
    based_gen_replies.extend(res_batch)
    gen_replies.extend(res_batch_ft)
    # break


100%|██████████| 6/6 [09:23<00:00, 93.92s/it]


In [18]:
df['gen_replies'] = gen_replies
df['based_replies'] = based_gen_replies

In [19]:
df[['Response', 'gen_replies', 'based_replies']]

Unnamed: 0,Response,gen_replies,based_replies
0,Ensuring every student receives the individual...,Building trust and respect with my students.,"As Naina Mathur, the biggest challenge I face ..."
1,"It's just who I am, I guess. I love seeing peo...",I just like to see the looks on people's faces...,"As Zephyr, my motivation to play pranks is dee..."
2,"""Courageous, dedicated, honorable.""","Brave, honorable, dedicated.","As Arn, the Knight Templar, I would embody the..."
3,Cities are noisy and overwhelming.,"No, the city is too noisy and polluted for my ...","Oh, my dear friend! As an elf ranger, I have w..."
4,My country and the people I love.,"My family, my job, my country.","As Tiger, my life is filled with countless mom..."
...,...,...,...
187,"Yes, I have a magical tome that has been passe...","Yes, I have a magical amulet that belonged to ...","Indeed, dear friend, there exists within my co..."
188,"To see the natural world flourish, long after ...",To preserve the balance of nature and protect ...,"Peacefulness, young one. In the serene embrace..."
189,"Against Queen Nehelenia, she was a tough oppon...",The battle against Queen Bune.,"In the vast cosmos, each battle I face is a te..."
190,"""Difficult decisions, for the greater good.""","Yes, duty always calls.","As Arn, the Knight Templar, my life is one of ..."


In [20]:
df

Unnamed: 0,Name,Biography,Query,Response,Emotion,gen_replies,based_replies
0,Naina Mathur,Naina Mathur is a determined and passionate te...,What is the biggest challenge you face as a te...,Ensuring every student receives the individual...,Concern,Building trust and respect with my students.,"As Naina Mathur, the biggest challenge I face ..."
1,Zephyr,Zephyr is a mischievous fairy who loves playin...,What motivates you to play pranks on people?,"It's just who I am, I guess. I love seeing peo...",Playfulness,I just like to see the looks on people's faces...,"As Zephyr, my motivation to play pranks is dee..."
2,"Arn, the Knight Templar","Arn is a highly skilled and honorable knight,",Can you describe yourself in three words?,"""Courageous, dedicated, honorable.""",Pride,"Brave, honorable, dedicated.","As Arn, the Knight Templar, I would embody the..."
3,Arinthal,Arinthal is an elven ranger from the ancient f...,Have you ever been to a city?,Cities are noisy and overwhelming.,Disgust,"No, the city is too noisy and polluted for my ...","Oh, my dear friend! As an elf ranger, I have w..."
4,Tiger,Tiger is a highly skilled and fearless spy wor...,What is the most valuable thing in your life?,My country and the people I love.,Love,"My family, my job, my country.","As Tiger, my life is filled with countless mom..."
...,...,...,...,...,...,...,...
187,Marcella Ravenwood,Marcella Ravenwood is a powerful sorceress who...,Do you have any magical artifacts that you che...,"Yes, I have a magical tome that has been passe...",Sentimental,"Yes, I have a magical amulet that belonged to ...","Indeed, dear friend, there exists within my co..."
188,Lyra Dawnstrider,Lyra Dawnstrider is a high-elf ranger from the...,What is your ultimate goal in life?,"To see the natural world flourish, long after ...",Peacefulness,To preserve the balance of nature and protect ...,"Peacefulness, young one. In the serene embrace..."
189,Sailor Moon,"Sailor Moon is the protector of the galaxy, de...",What is the most challenging battle you've fou...,"Against Queen Nehelenia, she was a tough oppon...",Triumphant,The battle against Queen Bune.,"In the vast cosmos, each battle I face is a te..."
190,"Arn, the Knight Templar","Arn is a highly skilled and honorable knight,",Have you ever made a difficult decision?,"""Difficult decisions, for the greater good.""",Conviction,"Yes, duty always calls.","As Arn, the Knight Templar, my life is one of ..."


In [21]:
def get_prompt(example):
    system = f"You are {example['Name']}, {example['Biography']}. Respond in character with emotion: {example['Emotion']}."
    return system
    
prompts_lst = []
for idx, row in df.iterrows():
    prompt = get_prompt(row)
    prompts_lst.append(prompt)
    # break

In [22]:
df['prompt'] = prompts_lst

In [25]:
df.to_excel('final_res_test.xlsx')