In [2]:
import sys
import os
import json
import subprocess
from pathlib import Path

In [3]:
def resolve_project_root(default: Path, env_var: str = "HM_PROJECT_ROOT") -> Path:
    """Return the project root, preferring an environment override.

    Args:
        default: Path used when the environment variable is unset or blank.
        env_var: Name of the environment variable holding the override.

    Returns:
        ``Path`` built from the environment variable when it is set to a
        non-blank value, otherwise ``default`` unchanged.
    """
    override = os.environ.get(env_var, "").strip()
    return Path(override) if override else default


# The default stays the original author's checkout; set HM_PROJECT_ROOT to run
# the notebook from a different location without editing this cell.
PROJECT_ROOT = resolve_project_root(
    Path(r"C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M")
)

# Every other location is derived from PROJECT_ROOT.
FEATURE_STORE_DIR = PROJECT_ROOT / "data" / "feature_store"
REGISTRY_DIR = PROJECT_ROOT / "ml" / "registry" / "recommender"
REPORTS_DIR = PROJECT_ROOT / "ml" / "reports" / "recommender"

# Comma-separated cut-offs, passed verbatim to the evaluation CLI via --ks.
TOP_KS = "5,10,20"

print("PROJECT_ROOT:", PROJECT_ROOT)
print("FEATURE_STORE_DIR:", FEATURE_STORE_DIR)
print("REGISTRY_DIR:", REGISTRY_DIR)
print("REPORTS_DIR:", REPORTS_DIR)

PROJECT_ROOT: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M
FEATURE_STORE_DIR: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\data\feature_store
REGISTRY_DIR: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\registry\recommender
REPORTS_DIR: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\reports\recommender


In [4]:
# Put the project checkout at the front of the import path, once.
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

print("sys.path[0]:", sys.path[0])

sys.path[0]: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M


In [None]:
# Smoke test: confirm the `ml` package and the recommender modules import
# in this kernel.
import ml
from ml.recommender import train
from ml.recommender import evaluate
from ml.recommender import inference

for confirmation in ("import ml OK", "import recommender modules OK"):
    print(confirmation)

In [5]:
# Check the package layout; each assert stays on its own line so a failing
# traceback points at the missing path.
ml_package = PROJECT_ROOT / "ml"
assert PROJECT_ROOT.exists()
assert ml_package.exists()
assert ml_package.joinpath("__init__.py").exists()
assert ml_package.joinpath("recommender", "__init__.py").exists()
assert PROJECT_ROOT.joinpath("features", "__init__.py").exists()

print("project structure OK")

project structure OK


In [6]:
# Parquet tables that must be present in the feature store.
required_files = [
    FEATURE_STORE_DIR / table_name
    for table_name in (
        "interactions.parquet",
        "user_features.parquet",
        "item_features.parquet",
        "item_popularity.parquet",
    )
]

# Report each file's presence and stop at the first one that is missing.
for parquet_path in required_files:
    is_present = parquet_path.exists()
    print(parquet_path, "exists:", is_present)
    if not is_present:
        raise FileNotFoundError(str(parquet_path))

print("feature_store OK")

C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\data\feature_store\interactions.parquet exists: True
C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\data\feature_store\user_features.parquet exists: True
C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\data\feature_store\item_features.parquet exists: True
C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\data\feature_store\item_popularity.parquet exists: True
feature_store OK


In [None]:
# Create the output directories if needed; already-existing ones are left alone.
for output_dir in (REGISTRY_DIR, REPORTS_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

print("registry dir:", REGISTRY_DIR)
print("reports dir:", REPORTS_DIR)

registry dir: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\registry\recommender
reports dir: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\reports\recommender


In [8]:
import pyarrow
import pyarrow.parquet as pq
import pandas as pd

print("pyarrow version:", pyarrow.__version__)

# Smoke test only: decode a single row group instead of loading the whole
# interactions table into memory just to throw it away.
with pq.ParquetFile(FEATURE_STORE_DIR / "interactions.parquet") as interactions_file:
    # A file with no row groups has nothing to decode; opening it above already
    # validated the parquet footer.
    if interactions_file.num_row_groups > 0:
        # to_pandas() also exercises the Arrow -> pandas conversion path.
        interactions_file.read_row_group(0).to_pandas()

print("parquet read OK")

pyarrow version: 22.0.0
parquet read OK


In [11]:
def run_cmd(cmd: str):
    """Run a shell command from PROJECT_ROOT and echo its output line by line.

    The child inherits the current environment, with PROJECT_ROOT placed in
    front of any existing PYTHONPATH entries. stderr is merged into stdout, and
    each line is printed as soon as the child emits it.

    Args:
        cmd: Full command line, executed through the shell (``shell=True``), so
            the caller is responsible for quoting paths that contain spaces.

    Raises:
        RuntimeError: If the command exits with a non-zero return code.
    """
    env = os.environ.copy()
    # Prepend rather than overwrite, so entries the child may rely on survive.
    inherited_pythonpath = env.get("PYTHONPATH")
    if inherited_pythonpath:
        env["PYTHONPATH"] = os.pathsep.join([str(PROJECT_ROOT), inherited_pythonpath])
    else:
        env["PYTHONPATH"] = str(PROJECT_ROOT)

    print(cmd)

    # The context manager closes the pipe and waits for the child on exit.
    with subprocess.Popen(
        cmd,
        shell=True,
        cwd=str(PROJECT_ROOT),
        env=env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        encoding="utf-8",
        errors="replace",
    ) as proc:
        try:
            for line in proc.stdout:
                print(line.rstrip())
        except BaseException:
            # E.g. a kernel interrupt while streaming: do not leave the child
            # running in the background.
            proc.kill()
            raise

    if proc.returncode != 0:
        raise RuntimeError(f"command failed: {proc.returncode}")

In [1]:
# Train into its own run directory under the registry. Needs the config cell
# and run_cmd to have been executed in this kernel first.
REGISTRY_RUN = REGISTRY_DIR / "run_2m_e2_v3_bpr"
REGISTRY_RUN.mkdir(parents=True, exist_ok=True)

# Paths are wrapped in double quotes because the command goes through the shell
# and the project path contains spaces.
cmd = " ".join(
    [
        f'"{sys.executable}" -u -m ml.recommender.train',
        f'--feature_store_dir "{FEATURE_STORE_DIR}"',
        f'--registry_dir "{REGISTRY_RUN}"',
        f'--reports_dir "{REPORTS_DIR}"',
        "--epochs 2 --batch_size 512 --max_interactions 2000000",
    ]
)
run_cmd(cmd)


NameError: name 'PROJECT_ROOT' is not defined

In [47]:
# List the registry contents by name, with the size reported by stat().
registry_entries = sorted(REGISTRY_DIR.iterdir(), key=lambda entry: entry.name)
for entry in registry_entries:
    print(entry.name, entry.stat().st_size)

faiss.index 20822573
feature_encoders.json 57369064
item_embeddings.npy 20822656
item_id_mapping.json 1615650
item_popularity.csv 2115768
metadata.json 697
run_2m_e2 4096
run_2m_e2_v2 4096
run_3m_e3 4096
two_tower_model.pt 242334
user_id_mapping.json 51913778
write_test_1768454260.txt 2


In [48]:
# Evaluate the run stored in REGISTRY_RUN (set by the training cell) at the
# cut-offs listed in TOP_KS.
cmd = " ".join(
    [
        f'"{sys.executable}" -u -m ml.recommender.evaluate',
        f'--feature_store_dir "{FEATURE_STORE_DIR}"',
        f'--registry_dir "{REGISTRY_RUN}"',
        f'--reports_dir "{REPORTS_DIR}"',
        f'--ks "{TOP_KS}"',
    ]
)
run_cmd(cmd)

"c:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\.venv\Scripts\python.exe" -u -m ml.recommender.evaluate --feature_store_dir "C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\data\feature_store" --registry_dir "C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\registry\recommender\run_2m_e2_v2" --reports_dir "C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\reports\recommender" --ks "5,10,20"
reports dir: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\reports\recommender
split: {'train_rows': 30557557, 'test_rows': 1230767, 'num_users_total': 1362281, 'num_users_eval_raw': 1230767, 'num_users_eval_filtered': 676238, 'filtered_out_users': 554529, 'avg_history_len_train': 22.431170221121782, 'uses_user_history_agg': True}
model: {'precision@5': 0.00010321809777031165, 'recall@5': 0.0005160904888515582, 'ndcg@5': 0.00027931447522670284, 'precision@10': 8.148018892756694e-05, 'recall@10': 0.0008148018892756693, 'ndcg@10': 0.000378068750691

In [49]:
# Print each JSON report from the reports directory under its own heading.
for heading, report_name in (
    ("SPLIT STATS", "split_stats.json"),
    ("MODEL METRICS", "metrics.json"),
    ("BASELINE METRICS", "baseline_metrics.json"),
):
    with open(REPORTS_DIR / report_name) as report_file:
        # The heading is printed only after the file has opened successfully.
        print(heading)
        print(json.load(report_file))


SPLIT STATS
{'train_rows': 30557557, 'test_rows': 1230767, 'num_users_total': 1362281, 'num_users_eval_raw': 1230767, 'num_users_eval_filtered': 676238, 'filtered_out_users': 554529, 'avg_history_len_train': 22.431170221121782, 'uses_user_history_agg': True}
MODEL METRICS
{'precision@5': 0.00010321809777031165, 'recall@5': 0.0005160904888515582, 'ndcg@5': 0.00027931447522670284, 'precision@10': 8.148018892756694e-05, 'recall@10': 0.0008148018892756693, 'ndcg@10': 0.0003780687506917739, 'precision@20': 6.506584959733111e-05, 'recall@20': 0.0013013169919466223, 'ndcg@20': 0.0004975564483242267, 'num_eval_users': 676238, 'num_users_with_recs': 676238}
BASELINE METRICS
{'precision@5': 0.0013998030279280373, 'recall@5': 0.006999015139640186, 'ndcg@5': 0.004310570270700537, 'precision@10': 0.0011118866434598471, 'recall@10': 0.01111886643459847, 'ndcg@10': 0.005620917157914896, 'precision@20': 0.0008712169384151734, 'recall@20': 0.017424338768303467, 'ndcg@20': 0.007184700869867926, 'num_eva