In [1]:
import sys
import os
import json
import subprocess
from pathlib import Path

In [2]:
# Root of the project checkout. Defaults to the original author's local path;
# set the HM_PROJECT_ROOT environment variable to run the notebook from another
# machine or checkout without editing this cell.
_DEFAULT_PROJECT_ROOT = r"C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M"
PROJECT_ROOT = Path(os.environ.get("HM_PROJECT_ROOT") or _DEFAULT_PROJECT_ROOT)

# Every other location is derived from PROJECT_ROOT so only one path needs changing.
FEATURE_STORE_DIR = PROJECT_ROOT / "data" / "feature_store"
REGISTRY_DIR = PROJECT_ROOT / "ml" / "registry" / "recommender"
REPORTS_DIR = PROJECT_ROOT / "ml" / "reports" / "recommender"

# Comma-separated cut-offs, passed verbatim as the --ks argument of the evaluate command.
TOP_KS = "5,10,20"

print("PROJECT_ROOT:", PROJECT_ROOT)
print("FEATURE_STORE_DIR:", FEATURE_STORE_DIR)
print("REGISTRY_DIR:", REGISTRY_DIR)
print("REPORTS_DIR:", REPORTS_DIR)

PROJECT_ROOT: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M
FEATURE_STORE_DIR: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\data\feature_store
REGISTRY_DIR: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\registry\recommender
REPORTS_DIR: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\reports\recommender


In [3]:
# Put the project root first on sys.path (once) so the project packages
# can be imported from this kernel.
project_root_entry = str(PROJECT_ROOT)
already_registered = project_root_entry in sys.path
if not already_registered:
    sys.path.insert(0, project_root_entry)

print("sys.path[0]:", sys.path[0])

sys.path[0]: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M


In [4]:
import ml
from ml.recommender import train, evaluate, inference

# Reaching this point means both import statements in this cell succeeded.
for status_line in ("import ml OK", "import recommender modules OK"):
    print(status_line)

import ml OK
import recommender modules OK


In [5]:
# Sanity-check the checkout layout before doing any work. Each assertion names
# the path it looked for, so a failure says exactly what is missing.
required_project_paths = [
    PROJECT_ROOT,
    PROJECT_ROOT / "ml",
    PROJECT_ROOT / "ml" / "__init__.py",
    PROJECT_ROOT / "ml" / "recommender" / "__init__.py",
    PROJECT_ROOT / "features" / "__init__.py",
]

for required_path in required_project_paths:
    assert required_path.exists(), f"missing project path: {required_path}"

print("project structure OK")

project structure OK


In [6]:
# Parquet inputs that must be present in the feature store.
required_files = [
    FEATURE_STORE_DIR / "interactions.parquet",
    FEATURE_STORE_DIR / "user_features.parquet",
    FEATURE_STORE_DIR / "item_features.parquet",
    FEATURE_STORE_DIR / "item_popularity.parquet",
]

# Check every file before failing, so one run reports all missing inputs
# instead of stopping at the first one.
missing_files = []
for required_file in required_files:
    is_present = required_file.exists()
    print(required_file, "exists:", is_present)
    if not is_present:
        missing_files.append(required_file)

if missing_files:
    # With a single missing file the message is just that path.
    raise FileNotFoundError(", ".join(str(path) for path in missing_files))

print("feature_store OK")

C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\data\feature_store\interactions.parquet exists: True
C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\data\feature_store\user_features.parquet exists: True
C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\data\feature_store\item_features.parquet exists: True
C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\data\feature_store\item_popularity.parquet exists: True
feature_store OK


In [None]:
# Create the output directories up front; existing directories are left as they are.
for output_dir in (REGISTRY_DIR, REPORTS_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

print("registry dir:", REGISTRY_DIR)
print("reports dir:", REPORTS_DIR)

registry dir: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\registry\recommender
reports dir: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\reports\recommender


In [8]:
import pyarrow
import pyarrow.parquet as pq
import pandas as pd

print("pyarrow version:", pyarrow.__version__)

# Smoke-test the parquet stack (pyarrow decode + pandas conversion) on a single
# row group. Loading the whole interactions table here would only allocate
# memory for a result that is thrown away.
with pq.ParquetFile(FEATURE_STORE_DIR / "interactions.parquet") as interactions_file:
    if interactions_file.num_row_groups > 0:
        interactions_file.read_row_group(0).to_pandas()
print("parquet read OK")

pyarrow version: 22.0.0
parquet read OK


In [11]:
def run_cmd(cmd: str, project_root=None):
    """Run a shell command from the project root and stream its output.

    The command is echoed first, then executed through the shell with stderr
    merged into stdout; each output line is printed as soon as it arrives.

    Args:
        cmd: Full command line, passed to the shell as-is (quote paths that
            contain spaces).
        project_root: Directory used both as the working directory and as the
            first PYTHONPATH entry of the child process. Defaults to the
            notebook-level PROJECT_ROOT.

    Raises:
        RuntimeError: If the command exits with a non-zero return code.
    """
    root = str(PROJECT_ROOT if project_root is None else project_root)

    env = os.environ.copy()
    # Prepend instead of overwrite, so entries the caller already relies on
    # stay visible to the child process.
    inherited_pythonpath = env.get("PYTHONPATH")
    if inherited_pythonpath:
        env["PYTHONPATH"] = root + os.pathsep + inherited_pythonpath
    else:
        env["PYTHONPATH"] = root

    print(cmd)

    # The context manager closes the pipe and reaps the child on every exit path.
    with subprocess.Popen(
        cmd,
        shell=True,
        cwd=root,
        env=env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        encoding="utf-8",
        errors="replace",
    ) as proc:
        try:
            for line in proc.stdout:
                print(line.rstrip())
        except BaseException:
            # E.g. a kernel interrupt while streaming: do not leave the command
            # running in the background. Best effort only - with shell=True the
            # signal goes to the shell process that Popen started.
            proc.kill()
            raise
        returncode = proc.wait()

    if returncode != 0:
        raise RuntimeError(f"command failed: {returncode}")

In [None]:
# Registry sub-directory for this training run. Derived from REGISTRY_DIR so the
# registry location is spelled out in exactly one place.
REGISTRY_RUN = REGISTRY_DIR / "run_2m_e2_v2"
REGISTRY_RUN.mkdir(parents=True, exist_ok=True)

# Training settings forwarded to the ml.recommender.train command line.
TRAIN_EPOCHS = 2
TRAIN_BATCH_SIZE = 512
TRAIN_MAX_INTERACTIONS = 2_000_000

# Paths are double-quoted because the project path contains spaces and '&'.
cmd = " ".join([
    f'"{sys.executable}" -u -m ml.recommender.train',
    f'--feature_store_dir "{FEATURE_STORE_DIR}"',
    f'--registry_dir "{REGISTRY_RUN}"',
    f'--reports_dir "{REPORTS_DIR}"',
    f'--epochs {TRAIN_EPOCHS} --batch_size {TRAIN_BATCH_SIZE} --max_interactions {TRAIN_MAX_INTERACTIONS}',
])
run_cmd(cmd)


"c:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\.venv\Scripts\python.exe" -u -m ml.recommender.train --feature_store_dir "C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\data\feature_store" --registry_dir "C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\registry\recommender\run_2m_e2_v2" --reports_dir "C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\reports\recommender" --epochs 2 --batch_size 512 --max_interactions 2000000
device: cpu
feature_store_dir: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\data\feature_store
registry_dir: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\registry\recommender\run_2m_e2_v2
reports_dir: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\reports\recommender
raw interactions rows: 31788324
raw user_features rows: 1371980
raw item_features rows: 105542
user_history_agg loaded: True
user_history_agg rows: 1362281
train interactions rows: 2000000
unique customers in interactions

In [42]:
# List the registry contents by name together with the size reported by stat().
registry_entries = list(REGISTRY_DIR.iterdir())
registry_entries.sort(key=lambda entry: entry.name)
for entry in registry_entries:
    print(entry.name, entry.stat().st_size)

faiss.index 20822573
feature_encoders.json 57369064
item_embeddings.npy 20822656
item_id_mapping.json 1615650
item_popularity.csv 2115768
metadata.json 697
run_2m_e2 4096
run_3m_e3 4096
two_tower_model.pt 242334
user_id_mapping.json 51913778
write_test_1768454260.txt 2


In [43]:
# Evaluate the model stored in REGISTRY_RUN at the cut-offs listed in TOP_KS.
# NOTE(review): the recorded output of this cell shows a different --registry_dir
# (run_3m_e3) than REGISTRY_RUN as defined earlier in the notebook; re-run from a
# fresh kernel to confirm which run is actually being evaluated.
evaluate_args = [
    f'"{sys.executable}" -u -m ml.recommender.evaluate',
    f'--feature_store_dir "{FEATURE_STORE_DIR}"',
    f'--registry_dir "{REGISTRY_RUN}"',
    f'--reports_dir "{REPORTS_DIR}"',
    f'--ks "{TOP_KS}"',
]
cmd = " ".join(evaluate_args)
run_cmd(cmd)

"c:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\.venv\Scripts\python.exe" -u -m ml.recommender.evaluate --feature_store_dir "C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\data\feature_store" --registry_dir "C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\registry\recommender\run_3m_e3" --reports_dir "C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\reports\recommender" --ks "5,10,20"
reports dir: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\reports\recommender
split: {'train_rows': 30557557, 'test_rows': 1230767, 'num_users_total': 1362281, 'num_users_eval_raw': 1230767, 'num_users_eval_filtered': 792055, 'filtered_out_users': 438712, 'avg_history_len_train': 22.431170221121782}
model: {'precision@5': 4.368383508720986e-05, 'recall@5': 0.00021841917543604926, 'ndcg@5': 0.0001249412672728313, 'precision@10': 3.4214795689693267e-05, 'recall@10': 0.0003421479568969327, 'ndcg@10': 0.00016402138360604483, 'precision@20': 2.449324857

In [44]:
# Heading / file-name pairs for the JSON reports found in REPORTS_DIR.
report_sections = (
    ("SPLIT STATS", "split_stats.json"),
    ("MODEL METRICS", "metrics.json"),
    ("BASELINE METRICS", "baseline_metrics.json"),
)

for heading, report_name in report_sections:
    with open(REPORTS_DIR / report_name) as report_file:
        # The heading is printed only after the file was opened successfully.
        print(heading)
        print(json.load(report_file))


SPLIT STATS
{'train_rows': 30557557, 'test_rows': 1230767, 'num_users_total': 1362281, 'num_users_eval_raw': 1230767, 'num_users_eval_filtered': 792055, 'filtered_out_users': 438712, 'avg_history_len_train': 22.431170221121782}
MODEL METRICS
{'precision@5': 4.368383508720986e-05, 'recall@5': 0.00021841917543604926, 'ndcg@5': 0.0001249412672728313, 'precision@10': 3.4214795689693267e-05, 'recall@10': 0.0003421479568969327, 'ndcg@10': 0.00016402138360604483, 'precision@20': 2.449324857490957e-05, 'recall@20': 0.0004898649714981914, 'ndcg@20': 0.0002007495362302112, 'num_eval_users': 792055, 'num_users_with_recs': 792055}
BASELINE METRICS
{'precision@5': 0.0013951051379007772, 'recall@5': 0.006975525689503885, 'ndcg@5': 0.004292512564772274, 'precision@10': 0.0011160841103206218, 'recall@10': 0.011160841103206217, 'ndcg@10': 0.005623734233663688, 'precision@20': 0.0008833982488589808, 'recall@20': 0.017667964977179616, 'ndcg@20': 0.007238930050941653, 'num_eval_users': 792055, 'num_users_

In [45]:
import pandas as pd

# Peek at the aggregated user-history table: column names, then the first three rows.
user_history_path = FEATURE_STORE_DIR / "user_history_agg.parquet"
h = pd.read_parquet(user_history_path)
print(list(h.columns))
print(h.iloc[:3])


['customer_id', 'total_purchases', 'last_purchase_date', 'avg_price', 'top_product_group_name']
                                         customer_id  total_purchases  \
0  000064249685c11552da43ef22a5030f35a147f723d5b0...                3   
1  0001ab2ebc1bb9a21d135e2fefdb11f12bee5c74ab2984...               30   
2  0001f8cef6b9702d54abf66fd89eb21014bf98567065a9...               24   

  last_purchase_date  avg_price top_product_group_name  
0         2019-10-02   0.033881     Garment Lower body  
1         2020-02-28   0.019700     Garment Upper body  
2         2020-09-02   0.031679     Garment Upper body  
