# Cold-Start Recommender Workflow
This notebook walks through preparing a strict cold-start split, extracting text features, and comparing several content-to-factor models (CTR-lite, A2F, CTPF, CDL, HFT).


## Notebook outline
1. Configure paths and parameters.
2. Prepare warm/cold assets with TF-IDF features.
3. Train multiple content models and evaluate on cold items.
4. Inspect saved artefacts and metrics.


In [19]:
from datetime import datetime
from pathlib import Path
import sys


def locate_project_root() -> Path:
    """Find the closest directory that contains ``coldstart/src``.

    The search starts at the resolved current working directory and then
    walks its parents from nearest to farthest; the first match wins.

    Returns:
        The first directory ``d`` for which ``d / "coldstart" / "src"`` exists.

    Raises:
        RuntimeError: If neither the working directory nor any of its
            ancestors contains ``coldstart/src``.
    """
    start = Path.cwd().resolve()
    search_order = [start, *start.parents]
    root = next(
        (folder for folder in search_order if (folder / "coldstart" / "src").exists()),
        None,
    )
    if root is None:
        raise RuntimeError("Cannot locate project root containing 'coldstart/src'.")
    return root


# Resolve the repository root once and put it at the front of the import
# search path. The membership check keeps re-runs of this cell from adding
# duplicate entries.
PROJECT_ROOT = locate_project_root()
project_root_entry = str(PROJECT_ROOT)
if project_root_entry not in sys.path:
    sys.path.insert(0, project_root_entry)

from coldstart.src.notebook_utils import _read_simple_yaml
from coldstart.src.notebook_utils import build_interaction_frame
from coldstart.src import data_io, pipeline

# Fixed locations under <PROJECT_ROOT>/coldstart that the rest of the notebook uses.
DATA_DIR = PROJECT_ROOT.joinpath("coldstart", "data")
DATA_PATH = DATA_DIR.joinpath("movielens_latest_small.csv")
CONFIG_PATH = PROJECT_ROOT.joinpath("coldstart", "configs", "base.yaml")
OUTPUT_ROOT = PROJECT_ROOT.joinpath("coldstart", "output")
OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)

# Build the interactions CSV only when it is missing; an existing file is reused as-is.
if not DATA_PATH.exists():
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    df = build_interaction_frame(dataset="small")
    df.to_csv(DATA_PATH, index=False)
    print(f"Downloaded MovieLens latest-small interactions to {DATA_PATH}")

# Output directory for this execution, named after the current local time
# (second resolution).
timestamp = format(datetime.now(), "%Y%m%d_%H%M%S")
RUN_DIR = OUTPUT_ROOT.joinpath(f"notebook_run_{timestamp}")
RUN_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved paths as the cell output.
(DATA_PATH, RUN_DIR)


(WindowsPath('C:/Users/user/PGMS_Rec_Systems/PGMS_for_Recommender_Systems/coldstart/data/movielens_latest_small.csv'),
 WindowsPath('C:/Users/user/PGMS_Rec_Systems/PGMS_for_Recommender_Systems/coldstart/output/notebook_run_20251029_223545'))

## Prepare the cold-start split
We parse the lightweight configuration file, run the strict cold-start split, fit TF-IDF on warm item text only, and then build text features for both warm and cold items.


In [20]:
from importlib import reload
from coldstart.src.models import ctpf, hft
# Re-execute the project modules already imported in this kernel so that
# edits made on disk are picked up without restarting.
# NOTE(review): pipeline is reloaded last, presumably so it rebinds to the
# refreshed data_io / model modules — confirm.
for stale_module in (data_io, ctpf, hft, pipeline):
    reload(stale_module)

# Parse the YAML config and pull out the sections used by the preparation step;
# a missing section becomes an empty dict.
config = _read_simple_yaml(CONFIG_PATH)
tfidf_params = config.get("tfidf", {})
prepare_cfg = config.get("prepare", {})

# Split settings fixed in the notebook rather than read from the config file.
cold_item_frac = 0.2
seed = 42

# A missing, empty, or zero limit collapses to 0 here and is handed to the
# pipeline as None.
interaction_limit = int(prepare_cfg.get("interaction_limit", 0) or 0)

pipeline.prepare_dataset(
    DATA_PATH,
    RUN_DIR,
    tfidf_params=tfidf_params,
    cold_item_frac=cold_item_frac,
    seed=seed,
    interaction_limit=interaction_limit or None,
)

# List what the preparation step left in the run directory.
sorted(entry.name for entry in RUN_DIR.iterdir())


Loaded 100836 interactions from C:\Users\user\PGMS_Rec_Systems\PGMS_for_Recommender_Systems\coldstart\data\movielens_latest_small.csv.
Dataset contains 610 unique users and 9724 unique items.
Using 100836 interactions out of the source data (limit=1200000).
TF-IDF fit on warm only
Warm text features shape: (7779, 128)
Cold text features shape: (1945, 128)


['cold_interactions.csv',
 'cold_item_ids.json',
 'cold_item_ids.txt',
 'cold_item_text_features.json',
 'tfidf_state.json',
 'warm_interactions.csv',
 'warm_item_ids.json',
 'warm_item_text_features.json']

## Train and evaluate
Compare CTR-lite, A2F, CTPF, CDL, and HFT on the prepared cold-start split. Set `model_choice` to `'all'` to evaluate every option.


In [21]:
model_choice = "all"  # options: 'ctrlite', 'a2f', 'ctpf', 'cdl', 'hft', 'all'
run_adaptive = False  # only used by ctrlite; ignored otherwise

# Per-model sections of the parsed config; a missing section becomes {}.
mf_cfg, ctrlite_cfg, a2f_cfg, ctpf_cfg, cdl_cfg, hft_cfg = (
    config.get(section, {})
    for section in ("mf", "ctrlite", "a2f", "ctpf", "cdl", "hft")
)

# Scalar hyper-parameters are coerced to float/int; the second argument of
# each .get() is the fallback used when the config omits the key.
mf_settings = dict(
    mf_reg=float(mf_cfg.get("reg", 0.02)),
    mf_iters=int(mf_cfg.get("iters", 30)),
    mf_lr=float(mf_cfg.get("lr", 0.02)),
)
ctrlite_settings = dict(
    ctrlite_reg=float(ctrlite_cfg.get("reg", 0.01)),
    ctrlite_lr=float(ctrlite_cfg.get("lr", 0.1)),
    ctrlite_iters=int(ctrlite_cfg.get("iters", 80)),
)

# The adaptive flag is only forwarded as truthy when ctrlite is the sole
# selected model.
# NOTE(review): with model_choice == "all" this is forced off even though
# ctrlite is among the evaluated models — confirm that is intended.
use_adaptive = run_adaptive and model_choice == "ctrlite"

results = pipeline.train_and_evaluate_content_model(
    RUN_DIR,
    k_factors=16,
    k_eval=5,
    **mf_settings,
    seed=seed,
    **ctrlite_settings,
    adaptive=use_adaptive,
    model=model_choice,
    a2f_cfg=a2f_cfg,
    ctpf_cfg=ctpf_cfg,
    cdl_cfg=cdl_cfg,
    hft_cfg=hft_cfg,
)

# Display the returned metrics as the cell output.
results


Loaded 80541 interactions from C:\Users\user\PGMS_Rec_Systems\PGMS_for_Recommender_Systems\coldstart\output\notebook_run_20251029_223545\warm_interactions.csv.
Dataset contains 610 unique users and 7779 unique items.
Loaded 20295 interactions from C:\Users\user\PGMS_Rec_Systems\PGMS_for_Recommender_Systems\coldstart\output\notebook_run_20251029_223545\cold_interactions.csv.
Dataset contains 609 unique users and 1945 unique items.


{'ctrlite': {'hit@5': 0.047619047619047616,
  'ndcg@5': 0.01063449180480855,
  'evaluated_users': 609},
 'a2f': {'hit@5': 0.09688013136288999,
  'ndcg@5': 0.020248667237844895,
  'evaluated_users': 609},
 'ctpf': {'hit@5': 0.10016420361247948,
  'ndcg@5': 0.03560659161553215,
  'evaluated_users': 609},
 'cdl': {'hit@5': 0.03284072249589491,
  'ndcg@5': 0.005631455778430841,
  'evaluated_users': 609},
 'hft': {'hit@5': 0.10016420361247948,
  'ndcg@5': 0.032112409367314576,
  'evaluated_users': 609}}

In [22]:
# Load the warm-split interactions from the run directory and preview the
# first three records.
warm_csv = RUN_DIR / "warm_interactions.csv"
warm_rows = data_io.load_interactions(warm_csv)
warm_rows[0:3]


Loaded 80541 interactions from C:\Users\user\PGMS_Rec_Systems\PGMS_for_Recommender_Systems\coldstart\output\notebook_run_20251029_223545\warm_interactions.csv.
Dataset contains 610 unique users and 7779 unique items.


[{'user_id': '1',
  'item_id': '1',
  'rating_or_y': 4.0,
  'item_text': 'Toy Story (1995) Adventure Animation Children Comedy Fantasy'},
 {'user_id': '1',
  'item_id': '3',
  'rating_or_y': 4.0,
  'item_text': 'Grumpier Old Men (1995) Comedy Romance'},
 {'user_id': '1',
  'item_id': '6',
  'rating_or_y': 4.0,
  'item_text': 'Heat (1995) Action Crime Thriller'}]

In [23]:
# Report every regular file under the run directory (recursively, in sorted
# path order) with its path relative to the project root and its size.
run_files = (entry for entry in sorted(RUN_DIR.rglob("*")) if entry.is_file())
for artefact in run_files:
    rel_path = artefact.relative_to(PROJECT_ROOT)
    size = artefact.stat().st_size
    print(f"{rel_path} ({size} bytes)")


coldstart\output\notebook_run_20251029_223545\cold_interactions.csv (1239615 bytes)
coldstart\output\notebook_run_20251029_223545\cold_item_ids.json (22354 bytes)
coldstart\output\notebook_run_20251029_223545\cold_item_ids.txt (12626 bytes)
coldstart\output\notebook_run_20251029_223545\cold_item_text_features.json (1395387 bytes)
coldstart\output\notebook_run_20251029_223545\models\U.json (92740 bytes)
coldstart\output\notebook_run_20251029_223545\models\V_warm.json (1182263 bytes)
coldstart\output\notebook_run_20251029_223545\tfidf_state.json (6049 bytes)
coldstart\output\notebook_run_20251029_223545\warm_interactions.csv (4871313 bytes)
coldstart\output\notebook_run_20251029_223545\warm_item_ids.json (89748 bytes)
coldstart\output\notebook_run_20251029_223545\warm_item_text_features.json (5591572 bytes)
