In [4]:
import sys
import os
import json
import subprocess
from pathlib import Path

In [5]:
# Root of the project checkout. The original author's location is kept as the
# default; set the HM_PROJECT_ROOT environment variable to run the notebook from
# a different machine or directory without editing this cell.
_DEFAULT_PROJECT_ROOT = r"C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M"
PROJECT_ROOT = Path(os.environ.get("HM_PROJECT_ROOT") or _DEFAULT_PROJECT_ROOT)

# Every other location used by the notebook is derived from PROJECT_ROOT.
FEATURE_STORE_DIR = PROJECT_ROOT / "data" / "feature_store"
REGISTRY_DIR = PROJECT_ROOT / "ml" / "registry" / "recommender"
REPORTS_DIR = PROJECT_ROOT / "ml" / "reports" / "recommender"

# Comma-separated cut-offs, passed verbatim to the evaluate command's --ks option.
TOP_KS = "5,10,20"

print("PROJECT_ROOT:", PROJECT_ROOT)
print("FEATURE_STORE_DIR:", FEATURE_STORE_DIR)
print("REGISTRY_DIR:", REGISTRY_DIR)
print("REPORTS_DIR:", REPORTS_DIR)

PROJECT_ROOT: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M
FEATURE_STORE_DIR: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\data\feature_store
REGISTRY_DIR: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\registry\recommender
REPORTS_DIR: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\reports\recommender


In [6]:
# Put the project root at the front of the import search path (once), so the
# project's own packages can be imported from this kernel.
project_root_entry = str(PROJECT_ROOT)
already_on_path = project_root_entry in sys.path
if not already_on_path:
    sys.path.insert(0, project_root_entry)

print("sys.path[0]:", sys.path[0])

sys.path[0]: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M


In [7]:
# Smoke test: importing the project packages fails loudly here if they are not
# reachable from the current sys.path.
import ml
from ml.recommender import train, evaluate, inference

# Both imports succeeded if execution reaches this point.
for imported_label in ("ml", "recommender modules"):
    print(f"import {imported_label} OK")

import ml OK
import recommender modules OK


In [8]:
# Directories and package markers the rest of the notebook relies on.
expected_project_paths = [
    PROJECT_ROOT,
    PROJECT_ROOT / "ml",
    PROJECT_ROOT / "ml" / "__init__.py",
    PROJECT_ROOT / "ml" / "recommender" / "__init__.py",
    PROJECT_ROOT / "features" / "__init__.py",
]

# Same checks as before, but each failure now names the missing path instead of
# raising a bare AssertionError.
for expected_path in expected_project_paths:
    assert expected_path.exists(), f"missing project path: {expected_path}"

print("project structure OK")

project structure OK


In [9]:
# Parquet inputs that must be present in the feature store before training.
required_files = [
    FEATURE_STORE_DIR / parquet_name
    for parquet_name in (
        "interactions.parquet",
        "user_features.parquet",
        "item_features.parquet",
        "item_popularity.parquet",
    )
]

# Report every file in order and stop at the first one that is missing.
for required_path in required_files:
    is_present = required_path.exists()
    print(required_path, "exists:", is_present)
    if not is_present:
        raise FileNotFoundError(str(required_path))

print("feature_store OK")

C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\data\feature_store\interactions.parquet exists: True
C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\data\feature_store\user_features.parquet exists: True
C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\data\feature_store\item_features.parquet exists: True
C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\data\feature_store\item_popularity.parquet exists: True
feature_store OK


In [10]:
# Create the output directories (and any missing parents); existing ones are left alone.
for output_dir in (REGISTRY_DIR, REPORTS_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

print("registry dir:", REGISTRY_DIR)
print("reports dir:", REPORTS_DIR)

registry dir: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\registry\recommender
reports dir: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\reports\recommender


In [11]:
import pyarrow
import pyarrow.parquet as pq
import pandas as pd

print("pyarrow version:", pyarrow.__version__)

# Sanity-check that the parquet stack works without loading the whole
# interactions table: opening the file parses its footer, and converting the
# first row group (if there is one) exercises the pyarrow -> pandas path on a
# small slice only.
with pq.ParquetFile(FEATURE_STORE_DIR / "interactions.parquet") as interactions_file:
    if interactions_file.num_row_groups > 0:
        interactions_file.read_row_group(0).to_pandas()
print("parquet read OK")

pyarrow version: 22.0.0
parquet read OK


In [12]:
def run_cmd(cmd: str):
    """Run ``cmd`` through the shell from PROJECT_ROOT, echoing its output live.

    The command line is printed first. The child's stdout and stderr are merged
    and printed line by line as they arrive. The child's environment is a copy
    of the current one with PROJECT_ROOT placed first on PYTHONPATH; any
    PYTHONPATH that was already set is kept after it rather than discarded.

    Args:
        cmd: Complete command line, interpreted by the shell (``shell=True``),
            so arguments containing spaces must already be quoted by the caller.

    Raises:
        RuntimeError: If the command exits with a non-zero return code.
    """
    env = os.environ.copy()
    inherited_pythonpath = env.get("PYTHONPATH")
    if inherited_pythonpath:
        # Prepend instead of overwrite so existing entries stay importable.
        env["PYTHONPATH"] = os.pathsep.join([str(PROJECT_ROOT), inherited_pythonpath])
    else:
        env["PYTHONPATH"] = str(PROJECT_ROOT)

    print(cmd)

    # The context manager closes the stdout pipe and waits for the child on exit.
    with subprocess.Popen(
        cmd,
        shell=True,
        cwd=str(PROJECT_ROOT),
        env=env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        encoding="utf-8",
        errors="replace",
    ) as p:
        try:
            for line in p.stdout:
                print(line.rstrip())
        except BaseException:
            # Interrupting the cell (or any error while streaming) must not
            # leave the child process running in the background.
            p.kill()
            raise

    if p.returncode != 0:
        raise RuntimeError(f"command failed: {p.returncode}")

In [13]:
# Artifacts of this training run get their own sub-directory of the registry.
REGISTRY_RUN = PROJECT_ROOT.joinpath("ml", "registry", "recommender", "run_2m_e2_v4_bpr")
REGISTRY_RUN.mkdir(parents=True, exist_ok=True)

# Launch the trainer as a module with the kernel's interpreter; -u keeps its
# output unbuffered. Paths are double-quoted because they contain spaces.
train_cmd_parts = [
    f'"{sys.executable}" -u -m ml.recommender.train',
    f'--feature_store_dir "{FEATURE_STORE_DIR}"',
    f'--registry_dir "{REGISTRY_RUN}"',
    f'--reports_dir "{REPORTS_DIR}"',
    "--epochs 2 --batch_size 512 --max_interactions 2000000",
]
cmd = " ".join(train_cmd_parts)
run_cmd(cmd)


"c:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\.venv\Scripts\python.exe" -u -m ml.recommender.train --feature_store_dir "C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\data\feature_store" --registry_dir "C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\registry\recommender\run_2m_e2_v4_bpr" --reports_dir "C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\reports\recommender" --epochs 2 --batch_size 512 --max_interactions 2000000
device: cpu
feature_store_dir: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\data\feature_store
registry_dir: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\registry\recommender\run_2m_e2_v4_bpr
reports_dir: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\reports\recommender
objective: bpr
neg_sampling: mixed
raw interactions rows: 31788324
raw user_features rows: 1371980
raw item_features rows: 105542
user_history_agg loaded: True
train interactions rows: 2000000
unique customers in 

In [14]:
# Print every entry directly under REGISTRY_DIR with the size reported by stat(),
# ordered by name.
# NOTE(review): this lists the parent registry, not REGISTRY_RUN - confirm that is intended.
registry_entry_sizes = {entry.name: entry.stat().st_size for entry in REGISTRY_DIR.iterdir()}
for entry_name in sorted(registry_entry_sizes):
    print(entry_name, registry_entry_sizes[entry_name])

faiss.index 20822573
feature_encoders.json 57369064
item_embeddings.npy 20822656
item_id_mapping.json 1615650
item_popularity.csv 2115768
metadata.json 697
run_2m_e2 4096
run_2m_e2_v1_bucket_and_bpr 4096
run_2m_e2_v2 4096
run_2m_e2_v3_bpr 4096
run_2m_e2_v4_bpr 4096
run_3m_e3 4096
two_tower_model.pt 242334
user_id_mapping.json 51913778
write_test_1768454260.txt 2


In [15]:
# Evaluate the run trained above; --ks receives the comma-separated cut-offs in TOP_KS.
evaluate_cmd_parts = [
    f'"{sys.executable}" -u -m ml.recommender.evaluate',
    f'--feature_store_dir "{FEATURE_STORE_DIR}"',
    f'--registry_dir "{REGISTRY_RUN}"',
    f'--reports_dir "{REPORTS_DIR}"',
    f'--ks "{TOP_KS}"',
]
cmd = " ".join(evaluate_cmd_parts)
run_cmd(cmd)

"c:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\.venv\Scripts\python.exe" -u -m ml.recommender.evaluate --feature_store_dir "C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\data\feature_store" --registry_dir "C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\registry\recommender\run_2m_e2_v4_bpr" --reports_dir "C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\reports\recommender" --ks "5,10,20"
reports dir: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\reports\recommender
split: {'train_rows': 30557557, 'test_rows': 1230767, 'num_users_total': 1362281, 'num_users_eval_raw': 1230767, 'num_users_eval_filtered': 676238, 'filtered_out_users': 554529, 'avg_history_len_train': 22.431170221121782, 'uses_user_history_agg': True}
model: {'precision@5': 0.00014225760752871033, 'recall@5': 0.0007112880376435515, 'ndcg@5': 0.00037899225038887476, 'precision@10': 0.00011149920590088106, 'recall@10': 0.0011149920590088104, 'ndcg@10': 0.0005072381

In [16]:
# JSON reports in REPORTS_DIR, each shown under its own heading. The heading is
# printed only after the file has been opened successfully.
report_sections = (
    ("SPLIT STATS", "split_stats.json"),
    ("MODEL METRICS", "metrics.json"),
    ("BASELINE METRICS", "baseline_metrics.json"),
)

for heading, report_name in report_sections:
    with open(REPORTS_DIR / report_name) as report_file:
        print(heading)
        print(json.load(report_file))


SPLIT STATS
{'train_rows': 30557557, 'test_rows': 1230767, 'num_users_total': 1362281, 'num_users_eval_raw': 1230767, 'num_users_eval_filtered': 676238, 'filtered_out_users': 554529, 'avg_history_len_train': 22.431170221121782, 'uses_user_history_agg': True}
MODEL METRICS
{'precision@5': 0.00014225760752871033, 'recall@5': 0.0007112880376435515, 'ndcg@5': 0.00037899225038887476, 'precision@10': 0.00011149920590088106, 'recall@10': 0.0011149920590088104, 'ndcg@10': 0.0005072381723215829, 'precision@20': 0.00010181326692673288, 'recall@20': 0.002036265338534658, 'ndcg@20': 0.0007360375452154655, 'num_eval_users': 676238, 'num_users_with_recs': 676238}
BASELINE METRICS
{'precision@5': 0.0013998030279280373, 'recall@5': 0.006999015139640186, 'ndcg@5': 0.004310570270700537, 'precision@10': 0.0011118866434598471, 'recall@10': 0.01111886643459847, 'ndcg@10': 0.005620917157914896, 'precision@20': 0.0008712169384151734, 'recall@20': 0.017424338768303467, 'ndcg@20': 0.007184700869867926, 'num_ev

In [17]:
from ml.recommender.inference import RecommenderService
import pandas as pd
import json

# Build the service on top of the artifacts of this run and load them.
service = RecommenderService(
    feature_store_dir=str(FEATURE_STORE_DIR),
    registry_dir=str(REGISTRY_RUN),
)
service.load()

# The keys of the persisted id mapping are the customer ids to choose from.
user_mapping_path = REGISTRY_RUN / "user_id_mapping.json"
with open(user_mapping_path, "r") as mapping_file:
    user_map = list(json.load(mapping_file).keys())

# Query the first customer in the mapping.
top_k = 10
customer_id = user_map[0]
res = service.recommend(customer_id=customer_id, top_k=top_k)

# One row per recommendation, in the order the service returned them.
recommendation_rows = []
for rec in res.recommendations:
    recommendation_rows.append({"article_id": rec.article_id, "score": rec.score})
df = pd.DataFrame(recommendation_rows)

print("customer_id:", customer_id)
print("fallback:", res.is_fallback)
df


customer_id: 3a90a1b9c8b3cc6a73ed007b774c868113af5e4b9ff4ce214f673a8102a2da44
fallback: False


Unnamed: 0,article_id,score
0,926015001,0.50707
1,910568002,0.50707
2,926938001,0.50707
3,179208008,0.50707
4,931720001,0.50707
5,367372001,0.50707
6,795013001,0.50707
7,806605002,0.50707
8,603686001,0.50707
9,379963002,0.50707


In [18]:
import torch
import numpy as np

cid = '3a90a1b9c8b3cc6a73ed007b774c868113af5e4b9ff4ce214f673a8102a2da44'

# Pick this customer's row out of the service's user-feature tensor; slicing
# (rather than indexing) keeps the leading dimension.
u_idx = service.enc.user_id_map[cid]
feat = service.user_feat_t[u_idx : u_idx + 1]
feat = feat.to(service.device)

# Forward pass through the user side of the model without tracking gradients.
with torch.no_grad():
    user_vec = service.model.user_forward(feat)
u_emb = user_vec.detach().cpu().numpy().astype(np.float32)

# Query the index directly, bypassing service.recommend().
# NOTE(review): D / I are presumably scores and item positions - confirm against the index type.
D, I = service.index.search(u_emb, 10)

print("u_emb norm:", float(np.linalg.norm(u_emb)))
print("D[0]:", D[0])
print("I[0]:", I[0])


u_emb norm: 0.9999999403953552
D[0]: [0.9721141 0.9721141 0.9721141 0.9721141 0.9721141 0.9721141 0.9721141
 0.9721141 0.9721141 0.9721141]
I[0]: [3588 2601 1893 1071  954  925  658  465  359  207]


In [None]:
import sys, json
from pathlib import Path
import pandas as pd

# This cell repeats its own setup so it does not depend on earlier cells.
PROJECT_ROOT = Path(r"C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M")
FEATURE_STORE_DIR = PROJECT_ROOT.joinpath("data", "feature_store")
REGISTRY_RUN = PROJECT_ROOT.joinpath("ml", "registry", "recommender", "run_2m_e2_v4_bpr")

# Make the project importable (inserted at most once).
project_root_entry = str(PROJECT_ROOT)
if project_root_entry not in sys.path:
    sys.path.insert(0, project_root_entry)

from ml.recommender.inference import RecommenderService

# Build the service on top of the run's artifacts and load them.
service = RecommenderService(
    feature_store_dir=str(FEATURE_STORE_DIR),
    registry_dir=str(REGISTRY_RUN),
)
service.load()

# Customer ids are the keys of the persisted id mapping.
with open(REGISTRY_RUN / "user_id_mapping.json", "r") as mapping_file:
    user_ids = list(json.load(mapping_file).keys())

# Query the first customer in the mapping.
top_k = 10
customer_id = user_ids[0]
res = service.recommend(customer_id=customer_id, top_k=top_k)

# One row per recommendation with a 1-based rank; scores are rounded to 6 decimals.
ranked_rows = []
for rank, rec in enumerate(res.recommendations, start=1):
    ranked_rows.append(
        {
            "rank": rank,
            "article_id": rec.article_id,
            "calibrated_score": round(rec.score, 6),
        }
    )
df = pd.DataFrame(ranked_rows)

print("customer_id:", customer_id)
print("fallback:", res.is_fallback)
df

PROJECT_ROOT: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M
REGISTRY_RUN: C:\Users\kamil\Documents\SELF PROJECT\BIG ML Project\H&M\ml\registry\recommender\run_2m_e2_v4_bpr
customer_id: 3a90a1b9c8b3cc6a73ed007b774c868113af5e4b9ff4ce214f673a8102a2da44
fallback: False


Unnamed: 0,article_id,score
0,926015001,0.972114
1,910568002,0.972114
2,926938001,0.972114
3,179208008,0.972114
4,931720001,0.972114
5,367372001,0.972114
6,795013001,0.972114
7,806605002,0.972114
8,603686001,0.972114
9,379963002,0.972114


In [4]:
import random, pandas as pd

# A dedicated, seeded generator makes the sampled customers identical on every
# run without touching the global `random` state.
SAMPLE_SEED = 42
N_SAMPLED_USERS = 10

sample_rng = random.Random(SAMPLE_SEED)
# Never ask for more users than exist (random.sample raises in that case).
uids = sample_rng.sample(user_map, min(N_SAMPLED_USERS, len(user_map)))

rows = []
for uid in uids:
    # Keyword argument, matching the other recommend() calls in this notebook.
    res = service.recommend(customer_id=uid, top_k=10)
    scores = [r.score for r in res.recommendations]
    rows.append(
        {
            "uid": uid[:10] + "...",
            "fallback": res.is_fallback,
            # default=None: an empty recommendation list yields a missing value
            # instead of raising ValueError.
            "min": min(scores, default=None),
            "max": max(scores, default=None),
        }
    )

pd.DataFrame(rows)

Unnamed: 0,uid,fallback,min,max
0,66ff24d63a...,False,0.947706,0.97186
1,ddb091a195...,False,0.972114,0.972114
2,921341888e...,False,0.988885,0.988885
3,d6d4aeb879...,False,0.972114,0.972114
4,ce50d13a1c...,False,0.957217,0.960273
5,3b7682c47c...,False,0.988885,0.988885
6,1ac99918ec...,False,0.985357,0.985357
7,a54ae2735b...,False,0.985357,0.985357
8,b7d95167be...,False,0.976587,0.978741
9,b091235fa6...,False,0.976587,0.978741


In [3]:
import faiss

# Basic facts about the index object held by the service.
loaded_index = service.index
print(type(loaded_index))
for index_attr in ("is_trained", "ntotal"):
    print(f"{index_attr}:", getattr(loaded_index, index_attr))

# FAISS metric codes: 0 = INNER_PRODUCT, 1 = L2. None is printed if the
# attribute is absent.
mt = getattr(loaded_index, "metric_type", None)
print("metric_type:", mt, "(0=IP, 1=L2)")

<class 'faiss.swigfaiss_avx2.IndexFlatIP'>
is_trained: True
ntotal: 81338
metric_type: 0 (0=IP, 1=L2)


In [4]:
import numpy as np
from pathlib import Path

# Item embeddings saved by the training run, cast to float32.
embeddings_path = Path(REGISTRY_RUN) / "item_embeddings.npy"
E = np.load(embeddings_path).astype("float32")

print("E shape:", E.shape)

# Length of each row vector: mean and spread across rows.
row_norms = np.linalg.norm(E, axis=1)
print("norm mean/std:", float(row_norms.mean()), float(row_norms.std()))

# Standard deviation of each column across rows: average, then extremes.
column_std = E.std(axis=0)
print("feature std mean:", float(column_std.mean()))
print("feature std min/max:", float(column_std.min()), float(column_std.max()))

E shape: (81338, 64)
norm mean/std: 1.0 4.3718682007920506e-08
feature std mean: 0.11686147749423981
feature std min/max: 0.01747976243495941 0.16710512340068817


In [6]:
import pandas as pd
from pathlib import Path

# Load the item feature table and list its columns.
item_features_path = Path(FEATURE_STORE_DIR) / "item_features.parquet"
item_df = pd.read_parquet(item_features_path)
print("item_features columns:", item_df.columns.tolist())

# Distinct-value count per column, smallest first (at most 20 shown).
distinct_counts = item_df.nunique()
lowest_cardinality = distinct_counts.sort_values().head(20)
print(lowest_cardinality)

item_features columns: ['article_id', 'product_type_name', 'product_group_name', 'department_name', 'colour_group_name', 'section_name', 'garment_group_name']
product_group_name        19
garment_group_name        21
colour_group_name         50
section_name              56
product_type_name        131
department_name          250
article_id            105542
dtype: int64
