# Cold-Start Recommender: Colab Workflow

This notebook clones the project, downloads MovieLens interactions, prepares the cold-start splits, and trains all content models with GPU acceleration (if available). Upload only this notebook to Colab and run the cells from top to bottom to get a medium-dataset benchmark. By default the run is capped at 300,000 interactions; set `interaction_limit = None` in the configuration cell to use the full dataset.

In [None]:
from pathlib import Path

PROJECT_URL = "https://github.com/mohdfaour03/PGMS_for_Recommender_Systems.git"
PROJECT_ROOT = Path("/content/PGMS_for_Recommender_Systems")

if not PROJECT_ROOT.exists():
    !git clone $PROJECT_URL $PROJECT_ROOT
else:
    print(f"Repository already present at {PROJECT_ROOT}")

%cd /content/PGMS_for_Recommender_Systems

In [None]:
import sys
from pathlib import Path

# Put the cloned repository at the front of the import search path so the
# `coldstart` package can be imported. The membership guard keeps repeated
# executions of this cell from adding duplicate entries.
PROJECT_ROOT = Path("/content/PGMS_for_Recommender_Systems").resolve()
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path.insert(0, str(PROJECT_ROOT))

from coldstart.src.notebook_utils import build_interaction_frame, _read_simple_yaml
from coldstart.src import data_io, pipeline

## Configure the run

In [None]:
from datetime import datetime

# Load the project's baseline settings. Later cells read per-model sections
# from this object with config.get(<section>, {}).
CONFIG_PATH = PROJECT_ROOT / "coldstart" / "configs" / "base.yaml"
config = _read_simple_yaml(CONFIG_PATH)

# --- User-tunable parameters ---
dataset = "medium"          # "small" or "medium"
interaction_limit = 300000   # set to None for full dataset
cold_item_frac = 0.2         # forwarded to pipeline.prepare_dataset
seed = 42                    # shared by dataset preparation and training
model_choice = "all"        # "ctrlite", "a2f", "ctpf", "cdl", "hft", or "all"
run_adaptive = False         # only relevant for ctrlite
prefer_gpu = True

# Forwarded unchanged to the training call.
# NOTE(review): presumably the latent dimensionality and the ranking cutoff
# used for evaluation -- confirm against the pipeline module.
k_factors = 16
k_eval = 5

DATA_DIR = PROJECT_ROOT / "coldstart" / "data"
DATA_DIR.mkdir(parents=True, exist_ok=True)
# The download step reuses DATA_PATH whenever the file already exists, so the
# interaction cap is part of the file name. Without it, changing
# interaction_limit after a first run would silently reuse a CSV that was
# built with the previous cap.
limit_tag = f"limit{interaction_limit}" if interaction_limit else "full"
DATA_PATH = DATA_DIR / f"movielens_latest_{dataset}_{limit_tag}.csv"

# Each execution of this cell creates a fresh, timestamped run directory.
OUTPUT_ROOT = PROJECT_ROOT / "coldstart" / "output_colab"
OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)
RUN_DIR = OUTPUT_ROOT / f"colab_run_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
RUN_DIR.mkdir(parents=True, exist_ok=True)

print(f"Data path: {DATA_PATH}")
print(f"Run directory: {RUN_DIR}")

In [None]:
# Reuse the CSV at DATA_PATH when it already exists; otherwise build the
# interaction table and write it there for later runs.
if DATA_PATH.exists():
    print(f"Dataset already present at {DATA_PATH}")
else:
    print(f"Downloading MovieLens '{dataset}' interactions...")
    frame = build_interaction_frame(dataset=dataset, limit=interaction_limit)
    # index=False keeps the frame's index out of the saved file.
    frame.to_csv(DATA_PATH, index=False)
    print(f"Saved dataset to {DATA_PATH}")

In [None]:
# TF-IDF settings come from the YAML config; an absent section yields {}.
tfidf_params = config.get("tfidf", {})
# Any falsy limit (None, 0) is normalised to None before being passed on.
prepare_limit = interaction_limit or None

# Run the project's dataset preparation step, writing into RUN_DIR.
pipeline.prepare_dataset(
    DATA_PATH,
    RUN_DIR,
    seed=seed,
    cold_item_frac=cold_item_frac,
    interaction_limit=prepare_limit,
    tfidf_params=tfidf_params,
)

# Display the names of everything now sitting directly inside RUN_DIR.
sorted(entry.name for entry in RUN_DIR.iterdir())

In [None]:
import json

# Per-model sections of the YAML config; a missing section falls back to {}.
mf_cfg, ctrlite_cfg, a2f_cfg, ctpf_cfg, cdl_cfg, hft_cfg = (
    config.get(section, {})
    for section in ("mf", "ctrlite", "a2f", "ctpf", "cdl", "hft")
)

# Batch sizes handed to the trainer through its `mf_cfg` argument.
mf_runtime_cfg = dict(
    batch_size=8192,
    score_batch_size=8192,
    infer_batch_size=8192,
    ctrlite_batch_size=4096,
)

results = pipeline.train_and_evaluate_content_model(
    RUN_DIR,
    # Run selection and execution backend.
    model=model_choice,
    adaptive=run_adaptive,
    backend="torch",
    prefer_gpu=prefer_gpu,
    seed=seed,
    # Settings chosen in the configuration cell.
    k_factors=k_factors,
    k_eval=k_eval,
    # MF hyperparameters: config value when present, otherwise the literal default.
    mf_reg=float(mf_cfg.get("reg", 0.02)),
    mf_lr=float(mf_cfg.get("lr", 0.02)),
    mf_iters=int(mf_cfg.get("iters", 30)),
    mf_cfg=mf_runtime_cfg,
    # ctrlite hyperparameters, same fallback scheme.
    ctrlite_reg=float(ctrlite_cfg.get("reg", 0.01)),
    ctrlite_lr=float(ctrlite_cfg.get("lr", 0.1)),
    ctrlite_iters=int(ctrlite_cfg.get("iters", 80)),
    # Remaining model sections are passed through untouched.
    a2f_cfg=a2f_cfg,
    ctpf_cfg=ctpf_cfg,
    cdl_cfg=cdl_cfg,
    hft_cfg=hft_cfg,
)

# Pretty-print the returned results as indented JSON.
print(json.dumps(results, indent=2))

In [None]:
# List every file under RUN_DIR (recursively, in sorted order) with its path
# relative to the project root and its size in kibibytes.
run_files = (entry for entry in sorted(RUN_DIR.rglob("*")) if entry.is_file())
for path in run_files:
    print(f"{path.relative_to(PROJECT_ROOT)} ({path.stat().st_size / 1024:.1f} KB)")