# Data preparation

In [None]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
import pandas as pd
from datasets import Dataset

In [None]:
from huggingface_hub import hf_hub_download, list_repo_files


files = list_repo_files("LangAGI-Lab/pearl", repo_type="dataset")
print("Available files:", files)

train_file_path = hf_hub_download(
    repo_id="LangAGI-Lab/pearl",
    filename="train.json",
    repo_type="dataset"
)

validation_file_path = hf_hub_download(
    repo_id="LangAGI-Lab/pearl",
    filename="valid.json",
    repo_type="dataset"
)

test_file_path = hf_hub_download(
    repo_id="LangAGI-Lab/pearl",
    filename="test.json",
    repo_type="dataset"
)

trainDf = pd.read_json(train_file_path)
validationDf = pd.read_json(validation_file_path)
testDf = pd.read_json(test_file_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Available files: ['.gitattributes', 'README.md', 'test.json', 'train.json', 'valid.json']


train.json:   0%|          | 0.00/198M [00:00<?, ?B/s]

valid.json:   0%|          | 0.00/19.9M [00:00<?, ?B/s]

test.json: 0.00B [00:00, ?B/s]

In [None]:
trainDf["dialogue"].iloc[0]

["Seeker: Hi there! I'm in the mood to watch a movie. Can you recommend something?",
 'Recommender: Absolutely! What kind of movie are you in the mood for? Any specific genre or theme?',
 "Seeker: I'm really into movies with a lot of action, suspense, and a great 50s vibe. I loved watching Julie Adams in the movie I just finished.",
 'Recommender: Based on your preference for action and suspense with a unique twist, I think you\'d enjoy "Happy Death Day." It\'s a comedic horror mystery thriller with a great concept and execution, along with a stylish and atmospheric vibe. Plus, it has some neat twists and a self-aware sense of humor that you might appreciate.',
 'Seeker: Thanks for the recommendation, but I prefer movies with great acting and a good plot. Unfortunately, "Happy Death Day" doesn\'t seem to match that criteria. I also tend to dislike movies with poor casting and a lack of care for the characters. Can you recommend something else that might align with my preferences?',
 'R

In [None]:
train_dataset = Dataset.from_pandas(trainDf)
validation_dataset = Dataset.from_pandas(validationDf)
test_dataset = Dataset.from_pandas(testDf)

In [None]:
train_dataset[0]

{'data_id': 3433,
 'user_persona': '[Like] Lots of action, suspense, and a brilliant 50s vibe. Julie Adams was wonderful to watch.\n[Dislike] Light on dialogue.\n\n[Like]\nNone.\n\n[Dislike]\n- Poor acting\n- Boring story\n- Crazy make up\n- Terrible casting\n- Slow pace\n- Lack of care for the characters\n- Dated appearance\n\n[Like] None.\n[Dislike] Boring and uneventful plot, annoying flashbacks, waste of talented actor.',
 'seen_movie_titles': ['Creature from the Black Lagoon (1954)',
  'Noise (2007)',
  'Una (2017)'],
 'gt_abstract': 'Title: House of Sand and Fog (2003)\nGenre: Crime, Drama\nDirector: Vadim Perelman\nCast: Jennifer Connelly, Ben Kingsley, Ron Eldard\nAbstract: [Like] Great acting and good plot. BK great as always.\n[Dislike] Another example of dumb American police and their disregard for life.',
 'gt_movie_title': 'House of Sand and Fog (2003)',
 'gt_genre': 'Crime, Drama',
 'gt_director': 'Vadim Perelman',
 'gt_cast': 'Jennifer Connelly, Ben Kingsley, Ron Eldard'

In [None]:
def to_messages(example):

    msgs = []

    system_parts = []
    if example.get("user_persona"):
        p = example["user_persona"]

        persona_text = " ".join(p) if isinstance(p, list) else p
        system_parts.append(f"Persona: {persona_text}")
    if example.get("seen_movie_titles"):
        seen = example["seen_movie_titles"]
        system_parts.append(f"Seen: {', '.join(seen)}")
    if system_parts:
        msgs.append({
            "role":    "system",
            "content": " | ".join(system_parts)
        })

    for turn in example["dialogue"]:
        if ":" in turn:
            speaker, text = turn.split(":", 1)
        else:
            speaker, text = "", turn
        speaker = speaker.strip().lower()
        text    = text.strip()

        if speaker.startswith("seeker"):
            role = "user"
        elif speaker.startswith("recommender"):
            role = "assistant"
        else:
            role = "user"

        msgs.append({"role": role, "content": text})

    return {"messages": msgs}


In [None]:
train_dataset = train_dataset.map(to_messages)
validation_dataset = validation_dataset.map(to_messages)
test_dataset = test_dataset.map(to_messages)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2277 [00:00<?, ? examples/s]

In [None]:
train_dataset[0]

{'data_id': 3433,
 'user_persona': '[Like] Lots of action, suspense, and a brilliant 50s vibe. Julie Adams was wonderful to watch.\n[Dislike] Light on dialogue.\n\n[Like]\nNone.\n\n[Dislike]\n- Poor acting\n- Boring story\n- Crazy make up\n- Terrible casting\n- Slow pace\n- Lack of care for the characters\n- Dated appearance\n\n[Like] None.\n[Dislike] Boring and uneventful plot, annoying flashbacks, waste of talented actor.',
 'seen_movie_titles': ['Creature from the Black Lagoon (1954)',
  'Noise (2007)',
  'Una (2017)'],
 'gt_abstract': 'Title: House of Sand and Fog (2003)\nGenre: Crime, Drama\nDirector: Vadim Perelman\nCast: Jennifer Connelly, Ben Kingsley, Ron Eldard\nAbstract: [Like] Great acting and good plot. BK great as always.\n[Dislike] Another example of dumb American police and their disregard for life.',
 'gt_movie_title': 'House of Sand and Fog (2003)',
 'gt_genre': 'Crime, Drama',
 'gt_director': 'Vadim Perelman',
 'gt_cast': 'Jennifer Connelly, Ben Kingsley, Ron Eldard'

In [None]:
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

max_seq_length = 2048

def initialize_model_and_tokenizer(base_model_name: str = "unsloth/Llama-3.2-3B-Instruct-unsloth-bnb-4bit", max_seq_length: int = max_seq_length, dtype = None, load_in_4bit: bool = True,):

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name     = base_model_name,
        max_seq_length = max_seq_length,
        dtype          = dtype,
        load_in_4bit   = load_in_4bit,
    )


    model = FastLanguageModel.get_peft_model(
        model,
        r               = 16,
        target_modules  = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_alpha      = 16,
        lora_dropout    = 0.0,
        bias            = "none",
        use_gradient_checkpointing = "unsloth",
        random_state    = 3407,
        use_rslora      = False,
        loftq_config    = None,
    )

    tokenizer = get_chat_template(tokenizer, chat_template = "llama-3.2")

    return model, tokenizer

model, tokenizer = initialize_model_and_tokenizer()


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.6.12: Fast Llama patching. Transformers: 4.53.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Unsloth 2025.6.12 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [None]:
def formatting_prompts_func(examples):
  convos = examples["messages"]
  texts = [
      tokenizer.apply_chat_template(
          convo,
          tokenize              = False,
          add_generation_prompt = True
      )
      for convo in convos
  ]
  return {"text": texts}



train_dataset = train_dataset.map(
    formatting_prompts_func,
    batched=True,
)

validation_dataset = validation_dataset.map(
    formatting_prompts_func,
    batched=True,
)

test_dataset = test_dataset.map(
    formatting_prompts_func,
    batched=True,
)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2277 [00:00<?, ? examples/s]

# Training

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 200,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)


Unsloth: Tokenizing ["text"]:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# @title Show current memory stats
import torch

gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
3.441 GB of memory reserved.


In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1 | Total steps = 200
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856 of 3,000,000,000 (0.81% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.4304
2,2.418
3,2.267
4,2.3893
5,2.3475
6,2.0029
7,1.916
8,1.8536
9,1.7477
10,1.7561


In [None]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

1713.2895 seconds used for training.
28.55 minutes used for training.
Peak reserved memory = 4.051 GB.
Peak reserved memory for training = 0.61 GB.
Peak reserved memory % of max memory = 27.481 %.
Peak reserved memory for training % of max memory = 4.138 %.


In [None]:
model.save_pretrained("pearl_llama_model")
tokenizer.save_pretrained("pearl_llama_model")

('pearl_lama_model/tokenizer_config.json',
 'pearl_lama_model/special_tokens_map.json',
 'pearl_lama_model/chat_template.jinja',
 'pearl_lama_model/tokenizer.json')

In [None]:
!zip -r pearl_llama_model.zip pearl_llama_model/

  adding: pearl_lama_model/ (stored 0%)
  adding: pearl_lama_model/tokenizer_config.json (deflated 96%)
  adding: pearl_lama_model/adapter_model.safetensors (deflated 8%)
  adding: pearl_lama_model/chat_template.jinja (deflated 72%)
  adding: pearl_lama_model/tokenizer.json (deflated 85%)
  adding: pearl_lama_model/adapter_config.json (deflated 56%)
  adding: pearl_lama_model/special_tokens_map.json (deflated 71%)
  adding: pearl_lama_model/README.md (deflated 66%)


Uncomment if, instead of training the model with the code above, the model is imported as a `.zip`

In [None]:
!unzip pearl_llama_model.zip -d pearl_llama_model

Archive:  pearl_llama_model.zip
   creating: pearl_llama_model/pearl_llama_model/
  inflating: pearl_llama_model/pearl_llama_model/special_tokens_map.json  
  inflating: pearl_llama_model/pearl_llama_model/README.md  
  inflating: pearl_llama_model/pearl_llama_model/adapter_model.safetensors  
  inflating: pearl_llama_model/pearl_llama_model/tokenizer_config.json  
  inflating: pearl_llama_model/pearl_llama_model/tokenizer.json  
  inflating: pearl_llama_model/pearl_llama_model/chat_template.jinja  
  inflating: pearl_llama_model/pearl_llama_model/adapter_config.json  


# Evaluation

In [None]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "./pearl_llama_model/pearl_llama_model",
    max_seq_length = max_seq_length,
    dtype=None,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2025.6.12: Fast Llama patching. Transformers: 4.53.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072, padding_idx=128004)
        (layers): ModuleList(
          (0): LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear

In [None]:
tokenizer = get_chat_template(tokenizer, chat_template = "llama-3.2")

In [None]:
%%capture
!pip install tqdm
!pip install rouge_score
!pip install evaluate bert_score

In [None]:
from tqdm import tqdm
import torch
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
from collections import Counter
import numpy as np
import nltk
from evaluate import load as load_eval

In [None]:
test_dataset

Dataset({
    features: ['data_id', 'user_persona', 'seen_movie_titles', 'gt_abstract', 'gt_movie_title', 'gt_genre', 'gt_director', 'gt_cast', 'dialogue', 'messages', 'text'],
    num_rows: 2277
})

In [None]:
percentage = 0.2
# Esto hace internamente un muestreo sin tener que barajar todo el dataset
split = test_dataset.train_test_split(test_size=percentage, seed=42)
subset_test_dataset = split["test"]

print(f"Usando {len(subset_test_dataset)} ejemplos para la evaluación.")


Usando 456 ejemplos para la evaluación.


In [None]:
subset_test_dataset

Dataset({
    features: ['data_id', 'user_persona', 'seen_movie_titles', 'gt_abstract', 'gt_movie_title', 'gt_genre', 'gt_director', 'gt_cast', 'dialogue', 'messages', 'text'],
    num_rows: 456
})

In [None]:
generation_args = {
    "max_new_tokens": 100,
    "temperature":    0.3,
    "top_p":          0.9,
    "top_k":          50,
    "use_cache":      True,
}

predictions = []
references  = []

for example in tqdm(subset_test_dataset):
    msgs    = example["messages"]
    context = msgs[:-2]

    # this returns a Tensor of shape [1, seq_len]
    inputs = tokenizer.apply_chat_template(
        context,
        tokenize              = True,
        add_generation_prompt = True,
        return_tensors        = "pt",
    )
    # inputs is already your input_ids
    input_ids = inputs.cuda()    # no indexing with ["input_ids"]
    attention_mask = None        # if you need one, generate it yourself or ignore

    with torch.no_grad():
        outputs = model.generate(
            input_ids = input_ids,
            **generation_args
        )

    raw = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    # drop everything before the assistant tag
    if "<|assistant|>" in raw:
        gen = raw.split("<|assistant|>")[-1].strip()
    else:
        gen = raw.strip()

    predictions.append(gen)

    # 6) Grab your ground-truth suggestion (the second-to-last turn)
    references.append(msgs[-2]["content"])

  0%|          | 0/456 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
LlamaForCausalLM has no `_prepare_4d_causal_attention_mask_with_cache_position` method defined in its base modeling class. Compiled forward passes will be sub-optimal. If you're writing code, see Llama for an example implementation. If you're a user, please report this issue on GitHub.
100%|██████████| 456/456 [28:21<00:00,  3.73s/it]


In [None]:
import re

TITLE_RE = re.compile(
    r'''
      "([^"]+?)"\s*\(\d{4}\)      |  # "The Sixth Sense (1999)"
      "([^"]+?)"\s*               |  # "The Dark Knight"
      ([^"]+?)\s*\(\d{4}\)           #  The Shallows (2016)
    ''',
    re.VERBOSE,
)

def extract_title(text):
    match = TITLE_RE.search(text)
    if match:
        return match.group(1) or match.group(2) or match.group(3)
    return None

import re
import string

def normalize_title(title: str) -> str:
    if not title or not isinstance(title, str):
        return ""

    title = re.sub(r"\s*\(\d{4}\)", "", title)

    title = title.translate(str.maketrans("", "", string.punctuation.replace("'", "")))

    title = title.strip().lower()

    return title


In [None]:
subset_test_dataset[0]['gt_movie_title']

'The Curious Case of Benjamin Button (2008)'

In [None]:
ground_truth_references = []
for test_case in subset_test_dataset:
  ground_truth_references.append(test_case['gt_movie_title'])

In [None]:
correct = 0
total   = 0
output  = []

for pred_text, ref_text in zip(predictions, ground_truth_references):
    pred_title = normalize_title(extract_title(pred_text.split("\n")[-1]))
    ref_title  = normalize_title(ref_text)

    hit = pred_title == ref_title
    correct += int(hit)
    total   += 1

    output.append({
        "reference": ref_title,
        "prediction": pred_title,
        "hit": hit
    })

recall_at_1 = correct / total if total > 0 else 0
print(f"Recall@1: {recall_at_1:.3f}")


Recall@1: 0.013


In [None]:
import json
with open("recall1_eval.json", "w") as f:
    json.dump(output, f, indent=2)


In [None]:
generated_predictions = []
for pred in predictions:
  generated_predictions.append(pred.split('\n')[-1])

In [None]:
print(references[0])

How about "The Curious Case of Benjamin Button"? It has vibrant characters, powerful storytelling, and heartwarming moments, along with subtle messages about kindness and strength. It's definitely a movie with deep emotional impact and character development that I think you would appreciate.


In [None]:
# 7) Compute BLEU
bleu_score = corpus_bleu([[r] for r in references], generated_predictions)
print(f"BLEU Score: {bleu_score:.4f}")

BLEU Score: 0.5521


In [None]:
# 8) Compute ROUGE
scorer = rouge_scorer.RougeScorer(["rouge1","rouge2","rougeL"], use_stemmer=True)
rouge_scores = [scorer.score(r, p) for r, p in zip(references, generated_predictions)]
print("ROUGE-1:", np.mean([s["rouge1"].fmeasure for s in rouge_scores]))
print("ROUGE-2:", np.mean([s["rouge2"].fmeasure for s in rouge_scores]))
print("ROUGE-L:", np.mean([s["rougeL"].fmeasure for s in rouge_scores]))

ROUGE-1: 0.49820543550239405
ROUGE-2: 0.25000300886698834
ROUGE-L: 0.3707459259504848


In [None]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# 9) Compute Distinct-1/2
def distinct(seqs):
    intra1, intra2 = [], []
    uni_all, bi_all = Counter(), Counter()
    for seq in seqs:
        unigrams = Counter(seq)
        bigrams  = Counter(zip(seq, seq[1:]))
        intra1.append(len(unigrams)/ (len(seq)+1e-5))
        intra2.append(len(bigrams)/ (max(len(seq)-1,1)))
        uni_all.update(unigrams)
        bi_all.update(bigrams)
    inter1 = len(uni_all)/sum(uni_all.values())
    inter2 = len(bi_all)/sum(bi_all.values())
    return np.mean(intra1), np.mean(intra2), inter1, inter2

# tokenize references for distinct
import nltk
nltk.download("punkt")
nltk.download('punkt_tab')
tokenized_refs = [nltk.word_tokenize(p) for p in generated_predictions]
d1_i, d2_i, d1_e, d2_e = distinct(tokenized_refs)
print(f"Distinct-1 intra/inter: {d1_i:.4f}/{d1_e:.4f}")
print(f"Distinct-2 intra/inter: {d2_i:.4f}/{d2_e:.4f}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Distinct-1 intra/inter: 0.7938/0.0808
Distinct-2 intra/inter: 0.9762/0.2776


In [None]:

bertscore = load_eval("bertscore")
results  = bertscore.compute(
    predictions = generated_predictions,
    references  = references,
    lang        = "en",
    model_type  = "distilbert-base-uncased",
)
print(f"BERTScore F1: {np.mean(results['f1']):.4f}")
print(f"BERTScore Precision:   {np.mean(results['precision']):.4f}")
print(f"BERTScore Recall:      {np.mean(results['recall']):.4f}")


Downloading builder script: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

BERTScore F1: 0.8631
BERTScore Precision:   0.8652
BERTScore Recall:      0.8614


In [None]:
import re

def text_to_chat(chat_text: str):
    pattern = re.compile(r'(system|assistant|user)\s*', re.IGNORECASE)

    segments = pattern.split(chat_text)[1:]

    conversation = []
    for role, content in zip(segments[0::2], segments[1::2]):
        conversation.append((role.strip().capitalize(), content.strip()))

    for role, content in conversation:
        print(f"{role}: {content}\n")

In [None]:
for prediction in predictions:

    print("\n========== New Chat ============\n")
    text_to_chat(prediction)



System: Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

Persona: [Like] The movie features visually and emotionally powerful scenes, a fantastic score, and is a technical marvel.

[Dislike] It lacks the joyful energy found in Leone's westerns.

[Like]
- The dialogue-driven nature of the film
- The mix of different themes
- The way the story was structured
- The excellent cinematography and lighting
- The score by Carter Burwell
- The feeling of a stage adaptation without the constraints of one

[Dislike]
None.

[Like]
- Bacon's exceptional performance
- Illumination of the movie's true nature through the performances of Bacon and Fishburne
- Highlighting the real horror of the situation through mundane police work and internalized body language

[Dislike]
- Laborious and overdone big tragedy arc
- Heavy acting and display of tics by Penn and Robbins
- Eastwood's music and authentic neighborhood shots trying too hard to seem profound | Seen: Once Upon a Time in America

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
import nltk

nltk.download("punkt")

# --- MÉTRICA DE NOVEDAD ---
def calculate_novelty(predictions, references):
    novelty_scores = []
    for pred, ref in zip(predictions, references):
        pred_tokens = set(word_tokenize(pred.lower()))
        ref_tokens = set(word_tokenize(ref.lower()))
        novel_tokens = pred_tokens - ref_tokens
        novelty = len(novel_tokens) / (len(pred_tokens) + 1e-5)
        novelty_scores.append(novelty)
    return np.mean(novelty_scores)

# --- MÉTRICA SELF-BLEU ---
def compute_self_bleu(predictions):
    scores = []
    smoothing = SmoothingFunction().method1
    for i, pred in enumerate(predictions):
        hyp = word_tokenize(pred)
        refs = [word_tokenize(p) for j, p in enumerate(predictions) if j != i]
        if refs:  # evitar listas vacías
            score = sentence_bleu(refs, hyp, smoothing_function=smoothing)
            scores.append(score)
    return np.mean(scores)

novelty_score = calculate_novelty(generated_predictions, references)
self_bleu_score = compute_self_bleu(generated_predictions)

print(f"Novelty Score: {novelty_score:.4f}")
print(f"Self-BLEU Score: {self_bleu_score:.4f}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Novelty Score: 0.4452
Self-BLEU Score: 0.6808


In [None]:
prompts = [
    "User: Can you recommend a horror movie?",
    "User: I don't know what to watch today, any ideas?",
    "User: I like sci-fi movies with a romantic touch.",
    "User: I watched Inception yesterday and loved it. Anything similar?",
    "User: Hey, what's a good movie to watch right now?",
    "User: Recomiéndame una película de terror.",
    "User: No sé qué ver hoy, ¿alguna idea?",
    "User: Me gustan las películas de ciencia ficción con un toque romántico.",
    "User: Ayer vi Inception y me encantó. ¿Algo similar?",
    "User: Ey, ¿qué peli está buena ahora?"
]

for i, prompt in enumerate(prompts):
    print(f"\n--- Prompt #{i+1} ---\n")

    # Agrega marcador claro entre input y expected output
    formatted_prompt = f"User: {prompt}\nAssistant:"

    input_ids = tokenizer(formatted_prompt, return_tensors="pt").input_ids.cuda()

    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            max_new_tokens=100,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.2,   # Penaliza repeticiones
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    response = decoded.split("Assistant:")[-1].strip()
    print(f"Prompt: {prompt}")
    print(f"Respuesta: {response}")


--- Prompt #1 ---

Prompt: User: Can you recommend a horror movie?
Respuesta: Based on your preference for atmospheric settings in classic monster movies with interesting character dynamics, I would highly recommend "The Shining" (1980) directed by Stanley Kubrick. It's an iconic masterpiece that has captivated audiences with its haunting setting and compelling storytelling

--- Prompt #2 ---

Prompt: User: I don't know what to watch today, any ideas?
Respuesta: Ah, how about a classic sci-fi movie from the 80s? "The Terminator" is a must-watch for all action and thriller fans. The special effects are amazing, and Arnold Schwarzenegger's performance as the cyborg assassin will keep you on the edge of your seat.

[Image description]
- Classic sci-fi movie
- Must-have for action and thriller fans

[Recommendation]
- If you enjoyed watching movies with great acting performances by actors like Tom Hanks or Harrison

--- Prompt #3 ---

Prompt: User: I like sci-fi movies with a romantic tou