# Cold-Start Recommender Workflow
This notebook prepares a strict new-item MovieLens split, fits leakage-safe text encoders, and benchmarks content-to-factor models including CTR-lite, A2F, CTPF, CDL, HFT, MICM, and the counterfactual CMCL baseline with pseudo-exposure modeling.


## Notebook outline
1. Configure dataset paths plus encoder/back-end toggles.
2. Prepare warm/val/test assets with the leakage-safe text encoder.
3. Train the MF baseline plus the requested content models (incl. MICM/CMCL); CMCL runs additionally fit the pseudo-exposure estimator.
4. Inspect saved artefacts, exposure checkpoints, and evaluation metrics (point estimates, CIs, buckets, and deltas).


In [19]:
from datetime import datetime
from pathlib import Path
import sys


def locate_project_root() -> Path:
    """Find the nearest ancestor of the working directory that holds ``coldstart/src``.

    The search starts at the resolved current working directory and then walks
    outwards through its parents, so the closest match wins.

    Returns:
        The first directory (cwd or an ancestor) containing ``coldstart/src``.

    Raises:
        RuntimeError: if neither the cwd nor any of its parents contains
            ``coldstart/src``.
    """
    start = Path.cwd().resolve()
    marker = Path("coldstart") / "src"
    # Closest directory first: the cwd itself, then each parent up to the filesystem root.
    root = next(
        (folder for folder in [start, *start.parents] if (folder / marker).exists()),
        None,
    )
    if root is None:
        raise RuntimeError("Cannot locate project root containing 'coldstart/src'.")
    return root


PROJECT_ROOT = locate_project_root()
# Put the project root at the front of the import path (once) so the
# `coldstart` imports below resolve regardless of where the kernel was started.
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path.insert(0, str(PROJECT_ROOT))

from coldstart.src.notebook_utils import _read_simple_yaml
from coldstart.src.notebook_utils import build_interaction_frame
from coldstart.src import data_io, pipeline

# --- Locations, all derived from the detected project root -------------------
DATASET_VARIANT = "small"  # use 'medium' for the full benchmark
DATA_DIR = PROJECT_ROOT.joinpath("coldstart", "data")
DATA_PATH = DATA_DIR.joinpath(f"movielens_latest_{DATASET_VARIANT}.csv")
CONFIG_PATH = PROJECT_ROOT.joinpath("coldstart", "configs", "base.yaml")
OUTPUT_ROOT = PROJECT_ROOT.joinpath("coldstart", "output")
OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)

# --- Build the interaction CSV only when it is not already on disk -----------
if not DATA_PATH.exists():
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    df = build_interaction_frame(dataset=DATASET_VARIANT)
    df.to_csv(DATA_PATH, index=False)
    print(f"Downloaded MovieLens {DATASET_VARIANT} interactions to {DATA_PATH}")

# --- Fresh, timestamped output directory for this execution of the cell ------
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
RUN_DIR = OUTPUT_ROOT.joinpath(f"notebook_run_{timestamp}")
RUN_DIR.mkdir(parents=True, exist_ok=True)

# Display both paths as the cell output.
DATA_PATH, RUN_DIR


(WindowsPath('C:/Users/user/PGMS_Rec_Systems/PGMS_for_Recommender_Systems/coldstart/data/movielens_latest_small.csv'),
 WindowsPath('C:/Users/user/PGMS_Rec_Systems/PGMS_for_Recommender_Systems/coldstart/output/notebook_run_20251029_223545'))

## Prepare the cold-start split
We parse the lightweight configuration file, run the strict new-item split (optionally reserving validation items), normalise title+genre+tag text, and fit the chosen encoder on warm items only.


In [20]:
from importlib import reload
from coldstart.src.models import ctpf, hft
# Re-import the project modules so edits made on disk are picked up without
# restarting the kernel. `pipeline` is reloaded last (presumably because it
# imports the modules above — confirm against pipeline.py).
reload(data_io)
reload(ctpf)
reload(hft)
reload(pipeline)

# Parse the run configuration; every section is optional and defaults to {}.
config = _read_simple_yaml(CONFIG_PATH)
tfidf_params = config.get("tfidf", {})
prepare_cfg = config.get("prepare", {})
text_encoder_cfg = config.get("text_encoder", {})
encoder_type = text_encoder_cfg.get("type", "tfidf")
encoder_params = text_encoder_cfg.get("params")
# A TF-IDF encoder without its own params inherits the top-level `tfidf` section.
if encoder_params is None and encoder_type == "tfidf":
    encoder_params = tfidf_params

# Only a missing/blank seed falls back to 42. The previous `value or 42` form
# also replaced an explicitly configured seed of 0, because 0 is falsy.
seed = prepare_cfg.get("seed")
seed = 42 if seed is None or seed == "" else int(seed)
# For the fractions and the limit a falsy value (None, "", 0) means "not set".
cold_item_frac = float(prepare_cfg.get("cold_item_frac", 0.2) or 0.2)
val_item_frac = float(prepare_cfg.get("val_item_frac", 0) or 0)
# Non-positive validation fraction -> pass None (no validation fraction given).
val_item_frac = val_item_frac if val_item_frac > 0 else None
# 0 / missing limit -> None (no limit passed to the pipeline).
interaction_limit = int(prepare_cfg.get("interaction_limit", 0) or 0) or None

pipeline.prepare_dataset(
    DATA_PATH,
    RUN_DIR,
    tfidf_params=tfidf_params,
    encoder_type=encoder_type,
    encoder_params=encoder_params,
    cold_item_frac=cold_item_frac,
    val_item_frac=val_item_frac,
    seed=seed,
    interaction_limit=interaction_limit,
)
# Show which artefacts the preparation step wrote into the run directory.
sorted(p.name for p in RUN_DIR.iterdir())


Loaded 100836 interactions from C:\Users\user\PGMS_Rec_Systems\PGMS_for_Recommender_Systems\coldstart\data\movielens_latest_small.csv.
Dataset contains 610 unique users and 9724 unique items.
Using 100836 interactions out of the source data (limit=1200000).
TF-IDF fit on warm only
Warm text features shape: (7779, 128)
Cold text features shape: (1945, 128)


['cold_interactions.csv',
 'cold_item_ids.json',
 'cold_item_ids.txt',
 'cold_item_text_features.json',
 'tfidf_state.json',
 'warm_interactions.csv',
 'warm_item_ids.json',
 'warm_item_text_features.json']

## Train and evaluate
Compare CTR-lite, A2F, CTPF, CDL, HFT, MICM, and CMCL on the prepared split. Set `model_choice = "all"` to evaluate every option (Torch backend required for MICM/CMCL). Each CMCL run automatically (re)trains the pseudo-exposure estimator if a cached checkpoint is missing.


In [21]:
model_choice = "micm,cmcl"  # options: 'ctrlite', 'a2f', 'ctpf', 'cdl', 'hft', 'micm', 'cmcl', 'all'
run_adaptive = False  # only used by ctrlite; ignored otherwise
prefer_gpu = True

# Normalised views of the selection: the whole string (to detect "all" or a
# single model) and the individual comma-separated model names.
lf_model_choice = model_choice.strip().lower()
torch_only = {"micm", "cmcl"}
# Computed unconditionally so `requested` exists whichever branch is taken.
requested = [part.strip().lower() for part in model_choice.split(",") if part.strip()]
if lf_model_choice == "all":
    # "all" includes the torch-only models, so the torch backend is required.
    backend_choice = "torch"
else:
    backend_choice = "torch" if any(model in torch_only for model in requested) else "numpy"

# Per-model hyper-parameter sections; a missing section becomes an empty dict.
mf_cfg = config.get("mf", {})
ctrlite_cfg = config.get("ctrlite", {})
a2f_cfg = config.get("a2f", {})
ctpf_cfg = config.get("ctpf", {})
cdl_cfg = config.get("cdl", {})
hft_cfg = config.get("hft", {})
micm_cfg = config.get("micm", {})
cmcl_cfg = config.get("cmcl", {})

results = pipeline.train_and_evaluate_content_model(
    RUN_DIR,
    k_factors=16,
    k_eval=[5, 10],
    mf_reg=float(mf_cfg.get("reg", 0.02)),
    mf_iters=int(mf_cfg.get("iters", 30)),
    mf_lr=float(mf_cfg.get("lr", 0.02)),
    seed=seed,
    ctrlite_reg=float(ctrlite_cfg.get("reg", 0.01)),
    ctrlite_lr=float(ctrlite_cfg.get("lr", 0.1)),
    ctrlite_iters=int(ctrlite_cfg.get("iters", 80)),
    # Compare against the normalised selection so that e.g. " CTRLite " is
    # treated like "ctrlite", consistent with the "all" check above. Adaptive
    # mode is still only enabled for a ctrlite-only run.
    adaptive=run_adaptive and lf_model_choice == "ctrlite",
    model=model_choice,
    a2f_cfg=a2f_cfg,
    ctpf_cfg=ctpf_cfg,
    cdl_cfg=cdl_cfg,
    hft_cfg=hft_cfg,
    micm_cfg=micm_cfg,
    cmcl_cfg=cmcl_cfg,
    mf_cfg=mf_cfg,
    backend=backend_choice,
    prefer_gpu=prefer_gpu,
)
# Display the per-model metrics returned by the pipeline.
results


Loaded 80541 interactions from C:\Users\user\PGMS_Rec_Systems\PGMS_for_Recommender_Systems\coldstart\output\notebook_run_20251029_223545\warm_interactions.csv.
Dataset contains 610 unique users and 7779 unique items.
Loaded 20295 interactions from C:\Users\user\PGMS_Rec_Systems\PGMS_for_Recommender_Systems\coldstart\output\notebook_run_20251029_223545\cold_interactions.csv.
Dataset contains 609 unique users and 1945 unique items.


{'ctrlite': {'hit@5': 0.047619047619047616,
  'ndcg@5': 0.01063449180480855,
  'evaluated_users': 609},
 'a2f': {'hit@5': 0.09688013136288999,
  'ndcg@5': 0.020248667237844895,
  'evaluated_users': 609},
 'ctpf': {'hit@5': 0.10016420361247948,
  'ndcg@5': 0.03560659161553215,
  'evaluated_users': 609},
 'cdl': {'hit@5': 0.03284072249589491,
  'ndcg@5': 0.005631455778430841,
  'evaluated_users': 609},
 'hft': {'hit@5': 0.10016420361247948,
  'ndcg@5': 0.032112409367314576,
  'evaluated_users': 609}}

## Inspect the evaluation dictionary
Each model now reports point estimates, 95% bootstrap confidence intervals, per-bucket breakdowns (item text length, item popularity, user history length), and `delta_vs_micm` whenever MICM is part of the run. Use the snippet below to explore the nested structure.


In [None]:
import json
from pprint import pprint

# Full nested metrics dictionary returned by the training cell.
pprint(results)

# CMCL-specific drill-down; skipped entirely when CMCL was not part of the run.
if "cmcl" in results:
    bucket_snapshot = results["cmcl"].get("buckets", {})
    # The leading newline must be written as an escape: a quoted string literal
    # cannot span physical lines.
    print("\nCMCL bucket snapshot:")
    # Truncate the JSON dump to 2000 characters to keep the cell output short.
    print(json.dumps(bucket_snapshot, indent=2)[:2000])
    delta = results["cmcl"].get("delta_vs_micm")
    # Printed only when the key is present and non-empty.
    if delta:
        print("\nDelta vs MICM:")
        print(json.dumps(delta, indent=2)[:2000])


In [22]:
# Read the warm-split interactions written by the preparation step back in
# and show the first three records as the cell output.
warm_rows = data_io.load_interactions(RUN_DIR.joinpath("warm_interactions.csv"))
warm_rows[0:3]


Loaded 80541 interactions from C:\Users\user\PGMS_Rec_Systems\PGMS_for_Recommender_Systems\coldstart\output\notebook_run_20251029_223545\warm_interactions.csv.
Dataset contains 610 unique users and 7779 unique items.


[{'user_id': '1',
  'item_id': '1',
  'rating_or_y': 4.0,
  'item_text': 'Toy Story (1995) Adventure Animation Children Comedy Fantasy'},
 {'user_id': '1',
  'item_id': '3',
  'rating_or_y': 4.0,
  'item_text': 'Grumpier Old Men (1995) Comedy Romance'},
 {'user_id': '1',
  'item_id': '6',
  'rating_or_y': 4.0,
  'item_text': 'Heat (1995) Action Crime Thriller'}]

In [23]:
# List every file under the run directory (recursively, in sorted order) with
# its size, using paths relative to the project root.
for path in sorted(RUN_DIR.rglob("*")):
    if not path.is_file():
        continue  # directories are skipped
    rel_path = path.relative_to(PROJECT_ROOT)
    size = path.stat().st_size
    print(f"{rel_path} ({size} bytes)")


coldstart\output\notebook_run_20251029_223545\cold_interactions.csv (1239615 bytes)
coldstart\output\notebook_run_20251029_223545\cold_item_ids.json (22354 bytes)
coldstart\output\notebook_run_20251029_223545\cold_item_ids.txt (12626 bytes)
coldstart\output\notebook_run_20251029_223545\cold_item_text_features.json (1395387 bytes)
coldstart\output\notebook_run_20251029_223545\models\U.json (92740 bytes)
coldstart\output\notebook_run_20251029_223545\models\V_warm.json (1182263 bytes)
coldstart\output\notebook_run_20251029_223545\tfidf_state.json (6049 bytes)
coldstart\output\notebook_run_20251029_223545\warm_interactions.csv (4871313 bytes)
coldstart\output\notebook_run_20251029_223545\warm_item_ids.json (89748 bytes)
coldstart\output\notebook_run_20251029_223545\warm_item_text_features.json (5591572 bytes)
