# Cold-Start Recommender: Colab Workflow

This notebook clones the project, downloads MovieLens interactions, prepares the strict new-item splits (with leakage-safe text encoders), and trains all baselines plus MICM/CMCL with GPU acceleration (if available). Upload only this notebook to Colab and run the cells from top to bottom to get a complete benchmark on the MovieLens medium dataset, including pseudo-exposure modeling and counterfactual evaluation metrics. By default the run is capped at 300,000 interactions; set `interaction_limit` to `None` in the configuration cell to use the full dataset.


In [None]:
from pathlib import Path

PROJECT_URL = "https://github.com/mohdfaour03/PGMS_for_Recommender_Systems.git"
PROJECT_ROOT = Path("/content/PGMS_for_Recommender_Systems")

if not PROJECT_ROOT.exists():
    !git clone $PROJECT_URL $PROJECT_ROOT
else:
    print(f"Repository already present at {PROJECT_ROOT}")

%cd /content/PGMS_for_Recommender_Systems

In [None]:
from datetime import datetime
from pathlib import Path
import sys


def locate_project_root() -> Path:
    """Return the nearest directory, starting at the CWD, that contains ``coldstart/src``.

    The current working directory is checked first, then each of its parents
    in order.

    Raises:
        RuntimeError: if neither the working directory nor any parent contains
            a ``coldstart/src`` sub-directory.
    """
    current = Path.cwd().resolve()
    for candidate in (current, *current.parents):
        if (candidate / "coldstart" / "src").exists():
            return candidate
    raise RuntimeError("Cannot locate project root containing 'coldstart/src'.")


PROJECT_ROOT = locate_project_root()
# Put the project root on sys.path so the ``coldstart`` package imports below resolve.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from coldstart.src.notebook_utils import (
    _read_simple_yaml,
    build_amazon_interaction_frame,
    build_goodreads_interaction_frame,
    build_interaction_frame,
    GOODREADS_GENRES,
)
from coldstart.src import data_io, pipeline

# --- Data source selection ---
DATA_SOURCE = "movielens"  # options: 'movielens', 'amazon', 'goodreads'
DATASET_VARIANT = "small"  # only used for MovieLens
AMAZON_VARIANT = "beauty"  # only used when DATA_SOURCE == 'amazon'
GOODREADS_VARIANT = "poetry"  # only used when DATA_SOURCE == 'goodreads'; see GOODREADS_GENRES

# The CSV file name encodes the source and variant. Any DATA_SOURCE value other
# than 'amazon' or 'goodreads' falls through to the MovieLens path.
DATA_DIR = PROJECT_ROOT / "coldstart" / "data"
if DATA_SOURCE == "amazon":
    DATA_PATH = DATA_DIR / f"amazon_{AMAZON_VARIANT}.csv"
elif DATA_SOURCE == "goodreads":
    DATA_PATH = DATA_DIR / f"goodreads_{GOODREADS_VARIANT}.csv"
else:
    DATA_PATH = DATA_DIR / f"movielens_latest_{DATASET_VARIANT}.csv"

CONFIG_PATH = PROJECT_ROOT / "coldstart" / "configs" / "base.yaml"
OUTPUT_ROOT = PROJECT_ROOT / "coldstart" / "output"
OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)

# Build and save the interaction CSV only when it is not already on disk.
if not DATA_PATH.exists():
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    if DATA_SOURCE == "amazon":
        df = build_amazon_interaction_frame(dataset=AMAZON_VARIANT, cache_dir=DATA_DIR)
        source_name = f"Amazon {AMAZON_VARIANT}"
    elif DATA_SOURCE == "goodreads":
        df = build_goodreads_interaction_frame(genre=GOODREADS_VARIANT, cache_dir=DATA_DIR)
        source_name = f"Goodreads {GOODREADS_VARIANT}"
    else:
        df = build_interaction_frame(dataset=DATASET_VARIANT)
        source_name = f"MovieLens {DATASET_VARIANT}"
    df.to_csv(DATA_PATH, index=False)
    print(f"Downloaded {source_name} interactions to {DATA_PATH}")

# Each execution of this cell creates its own timestamped output directory.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
RUN_DIR = OUTPUT_ROOT / f"notebook_run_{timestamp}"
RUN_DIR.mkdir(parents=True, exist_ok=True)

# Display the resolved paths as the cell output.
DATA_PATH, RUN_DIR

## Configure the run

In [None]:
from datetime import datetime

# Load the experiment configuration that ships with the repository.
CONFIG_PATH = PROJECT_ROOT.joinpath('coldstart', 'configs', 'base.yaml')
config = _read_simple_yaml(CONFIG_PATH)

# --- User-tunable parameters ---
dataset = 'medium'            # 'small' or 'medium'
interaction_limit = 300_000   # set to None for full dataset
cold_item_frac = 0.2
seed = 42
prefer_gpu = True

k_factors = 16
k_eval = [10, 20, 50]

# Input CSV location; the file name is keyed by the dataset variant chosen above.
DATA_DIR = PROJECT_ROOT.joinpath('coldstart', 'data')
DATA_PATH = DATA_DIR / f"movielens_latest_{dataset}.csv"

# Every execution of this cell gets its own timestamped run directory.
OUTPUT_ROOT = PROJECT_ROOT.joinpath('coldstart', 'output_colab')
run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
RUN_DIR = OUTPUT_ROOT / f"colab_run_{run_stamp}"

# Create the data, output and run directories (no-op for ones that already exist).
for folder in (DATA_DIR, OUTPUT_ROOT, RUN_DIR):
    folder.mkdir(parents=True, exist_ok=True)

print(f"Data path: {DATA_PATH}", f"Run directory: {RUN_DIR}", sep="\n")


In [None]:
# Reuse the CSV when it is already on disk; otherwise build the interaction
# frame for the configured dataset/limit and save it for later runs.
if DATA_PATH.exists():
    print(f"Dataset already present at {DATA_PATH}")
else:
    print(f"Downloading MovieLens '{dataset}' interactions...")
    frame = build_interaction_frame(dataset=dataset, limit=interaction_limit)
    frame.to_csv(DATA_PATH, index=False)
    print(f"Saved dataset to {DATA_PATH}")

In [None]:
# Text-encoder settings are read from the loaded YAML config.
tfidf_params = config.get('tfidf', {})
text_encoder_cfg = config.get('text_encoder', {})
encoder_type = text_encoder_cfg.get('type', 'tfidf')
encoder_params = text_encoder_cfg.get('params')

# A TF-IDF encoder with no explicit params falls back to the top-level 'tfidf' section.
tfidf_without_params = encoder_type == 'tfidf' and encoder_params is None
if tfidf_without_params:
    encoder_params = tfidf_params

# Falsy limits (None, 0) are passed on as None.
prepare_limit = interaction_limit or None

# A missing, empty or non-positive validation fraction is passed on as None.
prepare_cfg = config.get('prepare', {})
raw_val_item_frac = float(prepare_cfg.get('val_item_frac', 0) or 0)
val_item_frac = raw_val_item_frac if raw_val_item_frac > 0 else None

prepare_kwargs = {
    'tfidf_params': tfidf_params,
    'encoder_type': encoder_type,
    'encoder_params': encoder_params,
    'cold_item_frac': cold_item_frac,
    'val_item_frac': val_item_frac,
    'seed': seed,
    'interaction_limit': prepare_limit,
}
pipeline.prepare_dataset(DATA_PATH, RUN_DIR, **prepare_kwargs)

# Show the names of the entries now present in the run directory.
sorted(entry.name for entry in RUN_DIR.iterdir())


In [None]:
# Pull each model's section out of the loaded config (empty dict when absent).
mf_cfg, ctrlite_cfg, a2f_cfg, ctpf_cfg, cdl_cfg, hft_cfg, cmcl_cfg = (
    config.get(section, {})
    for section in ('mf', 'ctrlite', 'a2f', 'ctpf', 'cdl', 'hft', 'cmcl')
)

# Optional MICM overrides for the InfoNCE mapper.
# The section is copied first, so `config` itself is left unmodified.
micm_cfg = dict(
    config.get('micm', {}),
    temperature=0.03,
    iters=600,
    batch_size=2048,
    lr=1.5e-3,
    symmetric=False,
)

# Batch sizes handed to the pipeline through its `mf_cfg` argument.
mf_runtime_cfg = dict.fromkeys(
    ('batch_size', 'score_batch_size', 'infer_batch_size'), 8192
)
mf_runtime_cfg['ctrlite_batch_size'] = 4096

# Scalar hyper-parameters: config values win, literals are the fallbacks.
train_kwargs = {
    'k_factors': k_factors,
    'k_eval': k_eval,
    'mf_reg': float(mf_cfg.get('reg', 0.02)),
    'mf_iters': int(mf_cfg.get('iters', 30)),
    'mf_lr': float(mf_cfg.get('lr', 0.02)),
    'seed': seed,
    'ctrlite_reg': float(ctrlite_cfg.get('reg', 0.01)),
    'ctrlite_lr': float(ctrlite_cfg.get('lr', 0.1)),
    'ctrlite_iters': int(ctrlite_cfg.get('iters', 80)),
    'model': 'all',
    'a2f_cfg': a2f_cfg,
    'ctpf_cfg': ctpf_cfg,
    'cdl_cfg': cdl_cfg,
    'hft_cfg': hft_cfg,
    'micm_cfg': micm_cfg,
    'cmcl_cfg': cmcl_cfg,
    'backend': 'torch',
    'prefer_gpu': prefer_gpu,
    'mf_cfg': mf_runtime_cfg,
}

results = pipeline.train_and_evaluate_content_model(RUN_DIR, **train_kwargs)
results


In [None]:
# List every file under the run directory with its size, relative to the project root.
run_files = (entry for entry in sorted(RUN_DIR.rglob("*")) if entry.is_file())
for entry in run_files:
    rel = entry.relative_to(PROJECT_ROOT)
    size_kb = entry.stat().st_size / 1024
    print(f"{rel} ({size_kb:.1f} KB)")

## Inspect metrics and buckets
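The next cell pretty-prints the nested evaluation dictionary returned by training (confidence intervals, per-bucket metrics, and `delta_vs_micm`). When CMCL results are present, it also prints JSON snapshots of the CMCL buckets and of its delta versus MICM, each truncated to 2,000 characters.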


In [None]:
import json
from pprint import pprint


def _json_preview(payload, max_chars=2000):
    """Return `payload` as indented JSON, cut off after `max_chars` characters."""
    return json.dumps(payload, indent=2)[:max_chars]


# Full nested results first, then truncated CMCL-specific snapshots.
pprint(results)
if 'cmcl' in results:
    print('CMCL bucket snapshot:')
    print(_json_preview(results['cmcl'].get('buckets', {})))
    delta = results['cmcl'].get('delta_vs_micm')
    # The delta section is printed only when it is present and non-empty.
    if delta:
        print('Delta vs MICM:')
        print(_json_preview(delta))
