# Preprocess dataset to adapt the format to PERC

In [3]:
#!/usr/bin/env python3
"""
Convert PEARL to the dialogue format expected by MovieRecDataset.

Result:
  data/PEARL/train_data_processed
  data/PEARL/dev_data_processed
  data/PEARL/test_data_processed
  data/PEARL/movie_db
"""
import ast, random, re
from pathlib import Path
import pandas as pd
import torch

# -------------------- config --------------------
random.seed(42)
REPO_ROOT  = Path(".")
PEARL_ROOT = REPO_ROOT / "data" / "PEARL"
PLACEHOLDER = "[MOVIE_ID]"
SEP = "[SEP]"
TRAIN_FRACTION = 0.15
PEARL_ROOT.mkdir(parents=True, exist_ok=True)

HF = {"train":"train.json", "dev":"valid.json", "test":"test.json"}

# -------------------- load splits --------------------
dfs = {}
for split,fname in HF.items():
    uri = f"hf://datasets/LangAGI-Lab/pearl/{fname}"
    print("loading", uri)
    dfs[split] = pd.read_json(uri)

# -------------------- build items_db --------------------
title2id, items_db = {}, {}
for df in dfs.values():
    for _,row in df.iterrows():
        title = row["gt_movie_title"]
        if title not in title2id:
            idx = len(title2id)
            title2id[title] = idx
            items_db[idx] = (f"{title} {SEP} {row['gt_cast']} {SEP} "
                             f"{row['gt_director']} {SEP} {row['gt_genre']} {SEP} "
                             f"{row['gt_abstract']}")
torch.save(items_db, PEARL_ROOT / "movie_db")
print("saved movie_db with", len(items_db), "movies")

def parse_dialog(cell):
    if isinstance(cell, list):
        return cell
    try:
        return ast.literal_eval(cell)
    except Exception:
        return cell.split("\n")

# -------------------- convert each split --------------------
for split, df in dfs.items():
    dialogues, dropped = [], 0
    for uid, row in enumerate(df.itertuples()):
        dl = parse_dialog(row.dialogue)
        persona = "[PERSONA] " + row.user_persona.replace("\n", " ")
        full_title = row.gt_movie_title          # e.g. "Fury (2014)"
        core_title = re.sub(r"\s*\(\d{4}\)$", "", full_title).lower()
        gt_id      = title2id[full_title]

        # find recommender turn
        rec_idx = next((i for i,u in enumerate(dl)
                        if full_title.lower() in u.lower() or core_title in u.lower()),
                       None)
        if rec_idx is None:
            dropped += 1
            print(f"[skip] could not locate '{full_title}' in dialogue id {uid}")
            continue

        history      = persona + " " + " ".join(dl[:rec_idx])
        seeker_line  = f"B: {history.strip()}"
        rec_line     = dl[rec_idx].replace(full_title, PLACEHOLDER)
        if not rec_line.startswith(("A","B")):
            rec_line = "A: " + rec_line

        dialogue = [(seeker_line, None),
                    (rec_line,   [gt_id])]
        dialogues.append((uid, dialogue))

    if split == "train" and TRAIN_FRACTION < 1.0:
        keep = int(len(dialogues) * TRAIN_FRACTION)
        random.shuffle(dialogues)          # reproducible because random.seed(42)
        dialogues = dialogues[:keep]
        print(f"[subsample] keeping {keep} train dialogues ({TRAIN_FRACTION:.0%})")

    out = PEARL_ROOT / f"{split}_data_processed"
    torch.save(dialogues, out)
    print(f"{split}: kept {len(dialogues):5d}  •  dropped {dropped:3d}  •  wrote → {out}")


loading hf://datasets/LangAGI-Lab/pearl/train.json
loading hf://datasets/LangAGI-Lab/pearl/valid.json
loading hf://datasets/LangAGI-Lab/pearl/test.json
saved movie_db with 7850 movies
[subsample] keeping 7500 train dialogues (15%)
train: kept  7500  •  dropped   0  •  wrote → data/PEARL/train_data_processed
dev: kept  5000  •  dropped   0  •  wrote → data/PEARL/dev_data_processed
test: kept  2277  •  dropped   0  •  wrote → data/PEARL/test_data_processed


# Checker

In [4]:
#!/usr/bin/env python3
import torch, textwrap, sys
from pathlib import Path
import os


PEARL_ROOT = Path(os.getcwd() + "/data/PEARL")
items_db   = torch.load(PEARL_ROOT / "movie_db")

def pretty_dialog(dialog):
    """Return a short printable string version of a two-turn dialogue."""
    lines = []
    for utt, gt in dialog:
        marker = "→ " + (",".join(map(str, gt)) if gt else "") if gt else ""
        wrapped = textwrap.wrap(utt, width=100)
        lines.append("   " + wrapped[0] + (" …" if len(wrapped) > 1 else "") + marker)
    return "\n".join(lines)

def inspect_split(split, show=2):
    path = PEARL_ROOT / f"{split}_data_processed"
    if not path.exists():
        print(f"[ERR] {path} missing"); return
    data = torch.load(path)
    print(f"\n== {split.upper()} ==  {len(data)} dialogues")
    bad = 0
    for uid, dialog in data[:show]:
        print(f"UID {uid}")
        print(pretty_dialog(dialog))
        print("-"*80)
    # structural checks
    for uid,dialog in data:
        if not isinstance(uid,int): bad+=1; break
        if not isinstance(dialog,list): bad+=1; break
        for utt,gt in dialog:
            if not isinstance(utt,str): bad+=1; break
            if gt is not None:
                if not (isinstance(gt,list) and all(isinstance(x,int) for x in gt)): bad+=1; break
                for x in gt:
                    if x not in items_db: bad+=1; break
    if bad==0:
        print(" ✓ structure OK")
    else:
        print(" ✗ found structural issues")


print("items_db size:", len(items_db))
for split in ["train","dev","test"]:
    inspect_split(split)

items_db size: 7850

== TRAIN ==  7500 dialogues
UID 9926
   B: [PERSONA] [Like]  - Innovative use of digital video techniques - Creepy ambiance and legitimately …
   A: Recommender: Based on your preference for straightforward plots, dramatic and poignant stories, …→ 3615
--------------------------------------------------------------------------------
UID 35283
   B: [PERSONA] [Like] - Well made and entertaining drama-thriller - Some action sequences - …
   A: Recommender: I think "[MOVIE_ID]" might be a perfect fit for what you're looking for. It's a …→ 1964
--------------------------------------------------------------------------------
 ✓ structure OK

== DEV ==  5000 dialogues
UID 0
   B: [PERSONA] [Like] The film's powerful and prescient portrayal of white power consciousness and …
   A: Recommender: I think you might enjoy "The Manchurian Candidate." It's a gripping drama and …→ 961
--------------------------------------------------------------------------------
UID 1
   B: [PER