# Persona Generation Model Evaluation

This notebook evaluates pre-trained models on persona generation from dialogues using few-shot prompting, without any fine-tuning.

## Features
- Evaluate any HuggingFace model (seq2seq or causal LM)
- Generate persona descriptions for speaker 2 (the target speaker is currently fixed in the prompt and in the reference field `persona_2`)
- Model-specific batch sizes based on memory requirements
- Batched generation for efficient GPU utilization
- GPU memory monitoring
- Multiple metrics: BLEU, ROUGE, CHRF, LaBSE similarity

In [None]:
# Install dependencies
!pip install -q torch transformers datasets sacremoses sentence-transformers sacrebleu rouge-score tqdm

## Configuration

Set up model and evaluation parameters:

In [None]:
# Model evaluated by the single-model cells below (any HuggingFace hub id).
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
# Directory that holds dialogues_test.json.
DATA_DIR = "./data"
# Where per-sample predictions are written.
OUTPUT_FILE = "persona_evaluation_results.json"
NUM_SAMPLES = None  # Use full dataset
# Generation settings passed to model.generate().
MAX_NEW_TOKENS = 150
TEMPERATURE = 0.7

# Model-specific batch sizes (adjust based on GPU memory)
MODEL_BATCH_SIZES = {
    "cointegrated/rut5-base": 256,
    "cointegrated/rut5-small": 256,
    "google/flan-t5-base": 128,
    "google/flan-t5-small": 256,
    "google/flan-t5-large": 64,
    "facebook/bart-base": 128,
    "facebook/bart-large": 32,
    "Qwen/Qwen2.5-0.5B-Instruct": 256,
    "Qwen/Qwen2.5-1.5B-Instruct": 128,
    "Qwen/Qwen2.5-3B-Instruct": 64,
    "Qwen/Qwen2.5-7B-Instruct": 32,
    "meta-llama/Llama-3.2-1B-Instruct": 256,
    "meta-llama/Llama-3.2-3B-Instruct": 128,
    # The 8B instruct checkpoint is published in the Llama 3.1 family.
    "meta-llama/Llama-3.1-8B-Instruct": 64,
    # Kept for backward compatibility with existing configs that use this id.
    "meta-llama/Llama-3.2-8B-Instruct": 64,
}

def get_batch_size(model_name: str, default_size: int = 128) -> int:
    """Look up the configured batch size for a model.

    Args:
        model_name: Model identifier used as the key in MODEL_BATCH_SIZES.
        default_size: Value returned when the model has no configured entry.

    Returns:
        The configured batch size, or ``default_size`` for unlisted models.
    """
    if model_name in MODEL_BATCH_SIZES:
        return MODEL_BATCH_SIZES[model_name]
    return default_size

# Resolve the batch size for the selected model (128 when the model is not listed).
BATCH_SIZE = get_batch_size(MODEL_NAME, 128)

# Echo the effective configuration so the run log is self-describing.
print(f"Model: {MODEL_NAME}")
print("Target Speaker: 2")
# A falsy NUM_SAMPLES (None) means the whole dataset is evaluated.
print(f"Samples to evaluate: {NUM_SAMPLES if NUM_SAMPLES else 'All'}")
print(f"Batch size: {BATCH_SIZE}")

In [None]:
import os
import shutil

from google.colab import files

print("Upload test dialogues JSON file:")
uploaded = files.upload()

# Fail with an actionable message instead of an IndexError when nothing was
# uploaded (e.g. the upload dialog was dismissed).
if not uploaded:
    raise RuntimeError(
        "No file was uploaded; re-run this cell and select the test dialogues JSON file."
    )

# Only the first uploaded file is used.
uploaded_file = next(iter(uploaded))
print(f"Uploaded: {uploaded_file}")

# Create data directory if it doesn't exist
os.makedirs(DATA_DIR, exist_ok=True)

# Move the upload to the fixed name that load_dialogues() expects.
# The extension check is case-insensitive so "data.JSON" is accepted too.
if uploaded_file.lower().endswith('.json'):
    dest_path = os.path.join(DATA_DIR, "dialogues_test.json")
    shutil.move(uploaded_file, dest_path)
    print(f"Moved to: {dest_path}")
else:
    print("Please upload a JSON file")

## GPU Memory Check

In [None]:
import torch

print("=" * 80)
print("GPU Memory Check")
print("=" * 80)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Device: {device}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    # allocated/reserved only describe this process's PyTorch caching allocator.
    allocated = torch.cuda.memory_allocated(0) / 1e9
    reserved = torch.cuda.memory_reserved(0) / 1e9
    # Ask the driver for the free memory on the device: unlike
    # `total - allocated`, this also accounts for memory reserved by the
    # allocator and memory held by other processes.
    free_bytes, _ = torch.cuda.mem_get_info(0)
    free = free_bytes / 1e9

    print(f"Total Memory: {total_memory:.2f} GB")
    print(f"Allocated: {allocated:.2f} GB")
    print(f"Reserved: {reserved:.2f} GB")
    print(f"Available: {free:.2f} GB")
else:
    print("No GPU available")
print("=" * 80)

## Data Loading Functions

In [None]:
import json
import os
from typing import List, Dict

def load_dialogues(data_dir: str) -> List[Dict]:
    """Read the test split (``dialogues_test.json``) from ``data_dir``.

    Args:
        data_dir: Directory expected to contain ``dialogues_test.json``.

    Returns:
        The parsed JSON content of the file.

    Raises:
        FileNotFoundError: If the file does not exist in ``data_dir``.
    """
    test_file = "dialogues_test.json"
    source_path = os.path.join(data_dir, test_file)

    if os.path.exists(source_path):
        with open(source_path, "r", encoding="utf-8") as handle:
            records = json.load(handle)
        print(f"Loaded {len(records)} examples from {test_file}")
        return records

    raise FileNotFoundError(f"Data file not found: {source_path}")

In [None]:
def get_eval_subset(data: List[Dict], num_samples: int = None) -> List[Dict]:
    """Return the first ``num_samples`` items of ``data``.

    Args:
        data: Full list of evaluation examples.
        num_samples: Number of leading examples to keep. ``None`` (or ``0``)
            means "use the full dataset".

    Returns:
        ``data`` itself when no limit is requested, otherwise a new list with
        at most ``num_samples`` leading items.

    Raises:
        ValueError: If ``num_samples`` is negative. A negative value would
            otherwise silently drop examples from the end of the dataset.
    """
    if not num_samples:
        return data
    if num_samples < 0:
        raise ValueError(f"num_samples must be non-negative, got {num_samples}")
    return data[:num_samples]

In [None]:
# Read the full test split, then keep only the portion selected by NUM_SAMPLES.
dialogues = load_dialogues(DATA_DIR)
eval_data = get_eval_subset(data=dialogues, num_samples=NUM_SAMPLES)

print(
    f"\nEvaluating on {len(eval_data)} samples"
)

## Model Loading Functions

In [None]:
import torch

def load_model_and_tokenizer(model_name: str):
    """Load a HuggingFace model and tokenizer, detecting the architecture type.

    The architecture is read from the model config (``is_encoder_decoder``)
    rather than by attempting a seq2seq load and falling back on failure. This
    way network or missing-repo errors are not mistaken for "not a seq2seq
    model" and the weights are loaded at most once.

    Args:
        model_name: HuggingFace hub id or local path of the model.

    Returns:
        Tuple ``(model, tokenizer, is_seq2seq)`` where ``is_seq2seq`` is True
        for encoder-decoder models and False for causal LMs.

    Raises:
        RuntimeError: If the config or the model weights cannot be loaded. The
            original exception is chained as ``__cause__``.
    """
    from transformers import (
        AutoConfig,
        AutoModelForCausalLM,
        AutoModelForSeq2SeqLM,
        AutoTokenizer,
    )

    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    try:
        config = AutoConfig.from_pretrained(model_name)
        is_seq2seq = bool(getattr(config, "is_encoder_decoder", False))
        model_class = AutoModelForSeq2SeqLM if is_seq2seq else AutoModelForCausalLM
        model = model_class.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )
    except Exception as e:
        raise RuntimeError(f"Failed to load model {model_name}: {e}") from e

    print("Detected: Seq2Seq model" if is_seq2seq else "Detected: Causal LM")

    # Some causal LMs ship without a pad token; reuse EOS so batches can be padded.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Decoder-only models continue from the last input position, so batched
    # generation needs the padding on the left of the prompt.
    if not is_seq2seq:
        tokenizer.padding_side = "left"

    return model, tokenizer, is_seq2seq

In [None]:
# Load the model selected in the configuration cell and report its placement and size.
loaded = load_model_and_tokenizer(MODEL_NAME)
model, tokenizer, is_seq2seq = loaded

parameter_count = model.num_parameters()
print(f"Model loaded on: {model.device}")
print(f"Model parameters: {parameter_count:,}")

## Batched Generation Functions

In [None]:
def build_dialogue_text(dialogue: List[Dict]) -> str:
    """Render a dialogue as one "Пользователь N: text" line per message.

    Callers in this notebook pass whole dataset records (dicts that also carry
    fields such as ``persona_2``). Iterating such a dict yields its keys, not
    its messages, so a record is unwrapped to its message list first.

    Args:
        dialogue: Either a list of message dicts with ``speaker`` and ``text``
            keys, or a dataset record holding that list under one of the keys
            in ``message_keys`` below.

    Returns:
        The messages joined with newlines; an empty string for no messages.

    Raises:
        ValueError: If a record is passed and none of the known keys holds a
            list of messages.
    """
    if isinstance(dialogue, dict):
        # NOTE(review): the dataset schema is not visible in this notebook; these
        # are the candidate field names tried, in order. Confirm the real key.
        message_keys = ("dialogue", "dialog", "messages", "turns", "utterances")
        for key in message_keys:
            candidate = dialogue.get(key)
            if isinstance(candidate, (list, tuple)):
                dialogue = candidate
                break
        else:
            raise ValueError(
                f"Could not find a message list in record; tried keys {message_keys}, "
                f"record has keys {sorted(map(str, dialogue.keys()))}"
            )

    lines = []
    for msg in dialogue:
        # Speaker 1 may be encoded as the int 1 or the string "1"; any other
        # value is labelled as speaker 2.
        speaker_label = "Пользователь 1" if msg['speaker'] in (1, "1") else "Пользователь 2"
        lines.append(f"{speaker_label}: {msg['text']}")
    return "\n".join(lines)


def generate_personas_batched(model, tokenizer, dialogues: List[Dict],
                                   is_seq2seq: bool,
                                   max_new_tokens: int = 150, temperature: float = 0.7):
    """Generate a persona description for speaker 2 of every dialogue, as one batch.

    Each dialogue is rendered with ``build_dialogue_text`` and appended to a
    fixed Russian few-shot prompt; all prompts go through one ``generate`` call.

    Args:
        model: HuggingFace model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``; must have a pad token.
        dialogues: Items accepted by ``build_dialogue_text``.
        is_seq2seq: True for encoder-decoder models, False for causal LMs.
        max_new_tokens: Maximum number of tokens generated per sample.
        temperature: Sampling temperature. A value <= 0 (or None) switches to
            greedy decoding, since sampling requires a positive temperature.

    Returns:
        One decoded string per input dialogue, in input order. For causal LMs
        only the continuation after the prompt is returned, cut at the first
        blank line or at the next "Диалог:" marker.
    """
    # Tokenizers reject an empty batch; there is nothing to generate anyway.
    if not dialogues:
        return []

    # Few-shot examples in Russian
    examples = """Ты - ассистент для описания личности человека на основе диалога. Опиши личность указанного участника в виде списка фактов.

Пример 1:
Диалог:
Пользователь 1: Привет! Работаю учителем.
Пользователь 2: Привет! А какие предметы?
Пользователь 1: Математику и физику.
Пользователь 2: Круто! У меня собака.
Пользователь 1: А у меня две дочки.

Опиши личность Пользователя 1:
- Работает учителем (математика и физика)
- Есть две дочери

Пример 2:
Диалог:
Пользователь 1: Привет! Люблю путешествовать.
Пользователь 2: Куда ездил?
Пользователь 1: В Турцию, в Египет.
Пользователь 2: Я люблю готовить, я повар.
Пользователь 1: У меня есть собака.
Пользователь 2: У меня подруга подарила котенка.

Опиши личность Пользователя 2:
- Любит готовить
- Работает поваром
- Есть подруга

"""

    # Both model families receive the same prompt: few-shot block, target
    # dialogue, then the instruction for speaker 2.
    prompts = [
        f"{examples}Диалог:\n{build_dialogue_text(d)}\n\nОпиши личность Пользователя 2:"
        for d in dialogues
    ]

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.pad_token_id,
    }
    if temperature is not None and temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature)
    else:
        generation_kwargs["do_sample"] = False

    if is_seq2seq:
        # Encoder-decoder output holds only the generated target sequence.
        inputs = tokenizer(prompts, padding=True, truncation=True, max_length=1024, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, **generation_kwargs)
        return tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Causal LM: pad on the left so every prompt ends at the last input
    # position and generation continues directly from the prompt. The caller's
    # tokenizer setting is restored afterwards.
    previous_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        inputs = tokenizer(prompts, padding=True, truncation=True, max_length=1024, return_tensors="pt").to(model.device)
    finally:
        tokenizer.padding_side = previous_padding_side

    outputs = model.generate(**inputs, **generation_kwargs)

    # The output echoes the (padded) prompt; slicing at the prompt length keeps
    # only the new tokens, which is more reliable than searching the decoded
    # text for the instruction string.
    prompt_length = inputs["input_ids"].shape[1]
    continuations = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)

    results = []
    for continuation in continuations:
        # Keep the first block only: the model tends to go on with further
        # examples or dialogues after the persona list.
        result = continuation.strip()
        result = result.split("\n\n")[0].strip()
        result = result.split("Диалог:")[0].strip()
        results.append(result)

    return results

## Quick Evaluation

Test the model on a few samples before running the full evaluation:

In [None]:
print("\nSample predictions:")
print("=" * 80)

# Smoke test on the first five evaluation items.
sample_data = eval_data[:5]

# Reference personas (always persona_2); an absent field becomes an empty string.
sample_refs = [item.get("persona_2", "") for item in sample_data]

# A single batched generation call covers all samples.
predictions = generate_personas_batched(
    model, tokenizer, sample_data, is_seq2seq, MAX_NEW_TOKENS, TEMPERATURE
)

# Print each reference next to its prediction.
for sample_number, (reference, predicted) in enumerate(zip(sample_refs, predictions), start=1):
    print(f"\n--- Sample {sample_number} ---")
    print(f"Reference Persona 2: {reference}")
    print(f"Predicted: {predicted}")
    print("-" * 80)

## Evaluation Functions

Functions to compute metrics:

In [None]:
import numpy as np
import torch

from sentence_transformers import SentenceTransformer, util

# Module-level cache so the LaBSE encoder is loaded at most once per kernel.
labse_model = None

def get_labse_model():
    """Return the shared LaBSE SentenceTransformer, loading it on first use."""
    global labse_model
    if labse_model is not None:
        return labse_model

    print("Loading LaBSE model...")
    labse_model = SentenceTransformer('sentence-transformers/LaBSE')
    return labse_model

In [None]:
def compute_metrics(references, hypotheses):
    """Score generated personas against references with lexical and semantic metrics.

    Args:
        references: Reference persona strings.
        hypotheses: Generated persona strings, aligned with ``references``.

    Returns:
        Dict of floats with keys ``BLEU``, ``ROUGE-1``, ``ROUGE-2``,
        ``ROUGE-L``, ``CHRF`` and ``LaBSE-Similarity``. The ROUGE and LaBSE
        values are per-sample means multiplied by 100.

    Raises:
        ValueError: If the inputs are empty or have different lengths.
    """
    references = list(references)
    hypotheses = list(hypotheses)
    # zip() would silently truncate to the shorter list and misalign the pairs.
    if len(references) != len(hypotheses):
        raise ValueError(
            f"references and hypotheses must have the same length "
            f"({len(references)} != {len(hypotheses)})"
        )
    if not references:
        raise ValueError("Cannot compute metrics on an empty set of samples")

    import re

    from sacrebleu.metrics import BLEU, CHRF
    from rouge_score import rouge_scorer

    print("Computing BLEU...")
    bleu_metric = BLEU()
    bleu_result = bleu_metric.corpus_score(hypotheses, [references])

    print("Computing ROUGE...")
    word_pattern = re.compile(r"\w+", re.UNICODE)

    class _UnicodeWordTokenizer:
        """Lowercasing word tokenizer that keeps non-ASCII letters.

        rouge_score's default tokenizer discards every character outside
        [a-z0-9], which removes all Cyrillic text and yields ROUGE = 0.
        """

        def tokenize(self, text):
            return word_pattern.findall(text.lower())

    # The built-in stemmer belongs to the default tokenizer, so it is disabled
    # explicitly here.
    rouge_scorer_instance = rouge_scorer.RougeScorer(
        ['rouge1', 'rouge2', 'rougeL'],
        use_stemmer=False,
        tokenizer=_UnicodeWordTokenizer(),
    )
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []
    for ref, hyp in zip(references, hypotheses):
        scores = rouge_scorer_instance.score(ref, hyp)
        rouge1_scores.append(scores['rouge1'].fmeasure)
        rouge2_scores.append(scores['rouge2'].fmeasure)
        rougeL_scores.append(scores['rougeL'].fmeasure)

    print("Computing CHRF...")
    chrf_metric = CHRF()
    chrf_result = chrf_metric.corpus_score(hypotheses, [references])

    # LaBSE similarity - use cached model
    labse = get_labse_model()
    ref_embeddings = labse.encode(references, convert_to_tensor=True)
    hyp_embeddings = labse.encode(hypotheses, convert_to_tensor=True)
    # Row-wise cosine similarity of aligned pairs; avoids building the full
    # N x N similarity matrix only to read its diagonal.
    similarity_scores = torch.nn.functional.cosine_similarity(
        hyp_embeddings, ref_embeddings, dim=1
    ).cpu().numpy()

    return {
        'BLEU': float(bleu_result.score),
        'ROUGE-1': float(np.mean(rouge1_scores) * 100),
        'ROUGE-2': float(np.mean(rouge2_scores) * 100),
        'ROUGE-L': float(np.mean(rougeL_scores) * 100),
        'CHRF': float(chrf_result.score),
        'LaBSE-Similarity': float(np.mean(similarity_scores) * 100),
    }

In [None]:
def run_evaluation_batched(model, tokenizer, eval_data, is_seq2seq,
                                   batch_size, max_new_tokens, temperature, output_file=None):
    """Generate personas for ``eval_data`` in batches and compute metrics.

    Args:
        model: Model passed through to ``generate_personas_batched``.
        tokenizer: Tokenizer passed through to ``generate_personas_batched``.
        eval_data: List of dataset records; ``persona_2`` is the reference
            text and ``id`` (optional) is copied into the saved predictions.
        is_seq2seq: True for encoder-decoder models, False for causal LMs.
        batch_size: Number of samples per generation call; must be positive.
        max_new_tokens: Generation length limit per sample.
        temperature: Sampling temperature.
        output_file: Optional path; when given, per-sample predictions are
            written there as JSON.

    Returns:
        The metrics dict produced by ``compute_metrics``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size is None or batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    references, hypotheses, predictions = [], [], []
    num_samples = len(eval_data)
    num_batches = (num_samples + batch_size - 1) // batch_size
    print(f"\nEvaluating {num_samples} samples in {num_batches} batches (size={batch_size})...")

    for batch_number, start_idx in enumerate(range(0, num_samples, batch_size), start=1):
        end_idx = min(start_idx + batch_size, num_samples)
        batch_data = eval_data[start_idx:end_idx]

        # Get reference personas (always persona_2)
        batch_refs = [item.get("persona_2", "") for item in batch_data]

        # Generate in batch
        batch_preds = generate_personas_batched(
            model, tokenizer, batch_data, is_seq2seq, max_new_tokens, temperature
        )

        # Store results
        for item, ref, pred in zip(batch_data, batch_refs, batch_preds):
            references.append(ref)
            hypotheses.append(pred)
            predictions.append({
                'id': item.get('id', ''),
                'target_speaker': 2,
                'reference': ref,
                'predicted': pred,
            })

        print(f"Processed batch {batch_number}/{num_batches} ({end_idx}/{num_samples} samples)")

    # Persist predictions before scoring so the generations are not lost if a
    # metric fails (e.g. a scorer or the LaBSE download).
    if output_file:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(predictions, f, indent=2, ensure_ascii=False)
        print(f"\nPredictions saved to {output_file}")

    print("Computing metrics...")
    metrics = compute_metrics(references, hypotheses)

    return metrics

## Evaluation Results

Report the evaluation metrics for the selected model:

In [None]:
# Run the full evaluation for the selected model. This defines `metrics`
# (printed below) and writes the per-sample predictions to OUTPUT_FILE, which
# the download cell expects to exist.
metrics = run_evaluation_batched(
    model, tokenizer, eval_data, is_seq2seq,
    BATCH_SIZE, MAX_NEW_TOKENS, TEMPERATURE, OUTPUT_FILE
)

print("\n" + "=" * 80)
print("EVALUATION RESULTS")
print("=" * 80)
print(f"\nModel: {MODEL_NAME}")
print("Target Speaker: 2")
print(f"Test samples: {len(eval_data)}")
print(f"Batch size: {BATCH_SIZE}")
print("\n--- Metrics ---")
for key, value in metrics.items():
    print(f"{key}: {value:.4f}")
print("=" * 80)

## Compare Multiple Models

Define a list of models to compare on the test set:

In [None]:
import gc

import pandas as pd

MODELS_TO_COMPARE = [
    "Qwen/Qwen2.5-0.5B-Instruct",
    "Qwen/Qwen2.5-1.5B-Instruct",
    "meta-llama/Llama-3.2-1B-Instruct",
]

COMPARE_NUM_SAMPLES = None  # Use full dataset

compare_data = load_dialogues(DATA_DIR)
compare_data = get_eval_subset(compare_data, COMPARE_NUM_SAMPLES)
print(f"\nComparing {len(MODELS_TO_COMPARE)} models on {len(compare_data)} test samples...")
all_results = []

for model_name in MODELS_TO_COMPARE:
    print("\n" + "=" * 80)
    print(f"Evaluating: {model_name}")
    print("=" * 80)
    # Pre-bind the names so the cleanup in `finally` works even when loading fails.
    compare_model = compare_tokenizer = None
    try:
        compare_model, compare_tokenizer, compare_is_seq2seq = load_model_and_tokenizer(model_name)
        compare_batch_size = get_batch_size(model_name, 128)
        # No output file: only the aggregate metrics are kept per model.
        compare_metrics = run_evaluation_batched(
            compare_model, compare_tokenizer, compare_data, compare_is_seq2seq,
            compare_batch_size, MAX_NEW_TOKENS, TEMPERATURE, None
        )
        result = {
            "model": model_name,
            "target_speaker": 2,
            "samples": len(compare_data)
        }
        result.update(compare_metrics)
        all_results.append(result)
    except Exception as e:
        # Best effort: a failing model is reported and skipped so the
        # remaining models are still evaluated.
        print(f"Error evaluating {model_name}: {e}")
    finally:
        # Release the model on failure as well as success; otherwise a failed
        # run keeps its weights on the GPU while the next model loads.
        compare_model = compare_tokenizer = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

df_results = pd.DataFrame(all_results)
print("\n" + "=" * 80)
print("MODEL COMPARISON RESULTS")
print("=" * 80)
print(df_results.to_string(index=False))
df_results.to_csv('persona_model_comparison.csv', index=False)
print("\nResults saved to persona_model_comparison.csv")

In [None]:
import pandas as pd

# Rebuild and display the comparison table from `all_results`. The CSV name
# matches the file the download cell looks for, so no second, orphaned copy of
# the results is written under a different name.
df_results = pd.DataFrame(all_results)
print("\n" + "=" * 80)
print("MODEL COMPARISON RESULTS")
print("=" * 80)
print(df_results.to_string(index=False))
df_results.to_csv('persona_model_comparison.csv', index=False)
print("\nResults saved to persona_model_comparison.csv")

In [None]:
import os

from google.colab import files

print("\nStarting download...")
# Download each result file that exists. The predictions file gets the same
# existence guard as the comparison CSV, so a skipped evaluation step does not
# abort the cell.
for result_path in (OUTPUT_FILE, 'persona_model_comparison.csv'):
    if os.path.exists(result_path):
        files.download(result_path)
    else:
        print(f"Skipping missing file: {result_path}")