In [2]:
# === Retrain ALS using feedback.jsonl → log to MLflow → publish the new model as model_latest.npz ===
import os, json, time, pathlib, requests
import numpy as np
import pandas as pd
from collections import defaultdict
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

# 1) Settings and paths
# Cap the BLAS / OpenMP thread pools at a single thread.
# NOTE(review): numpy and scipy are imported above this point, so these
# environment variables may be read too late by an already-initialised BLAS;
# the threadpoolctl call below is what applies the limit at runtime.
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
os.environ.setdefault("OMP_NUM_THREADS", "1")
try:
    from threadpoolctl import threadpool_limits
    threadpool_limits(1, "blas")
except ImportError:
    # Best effort: training still works without threadpoolctl, but say so
    # instead of silently skipping the limit.
    print("[warn] threadpoolctl is not installed - BLAS thread limit not applied")
except Exception as exc:
    print(f"[warn] could not limit BLAS threads: {exc!r}")

ART = pathlib.Path("/workspace/artifacts")
DATA = pathlib.Path("/workspace/data/raw/ml-latest-small")
ART.mkdir(parents=True, exist_ok=True)

RATINGS_CSV = DATA / "ratings.csv"
USERS_MAP   = ART / "users_map.csv"
ITEMS_MAP   = ART / "items_map.csv"
FEEDBACK    = ART / "feedback.jsonl"
POPULAR_CSV = ART / "popular_items.csv"
MODEL_LATEST= ART / "model_latest.npz"     # this is what the API loads (hot-reload)
BACKUP_DIR  = ART / "models"
BACKUP_DIR.mkdir(exist_ok=True)

API_URL = os.getenv("API_URL", "http://localhost:8080")  # or http://api:8080 from inside the container

# 2) Data + index maps
ratings = pd.read_csv(RATINGS_CSV)
# Explicit check instead of `assert`, which is stripped when Python runs with -O.
if not (USERS_MAP.exists() and ITEMS_MAP.exists()):
    raise FileNotFoundError("Brak users_map.csv / items_map.csv – uruchom najpierw notebook treningowy.")
users_map = pd.read_csv(USERS_MAP)
items_map = pd.read_csv(ITEMS_MAP)

# 3) Feedback (👍/👎) → DataFrame
# Each line of feedback.jsonl is parsed as one JSON object. Lines that are not
# valid JSON, or that are valid JSON but not an object, are skipped and counted
# so that a corrupted file is visible in the cell output.
FEEDBACK_COLUMNS = ["user_id","item_index","movieId","relevant"]
fb_rows = []
fb_skipped = 0
if FEEDBACK.exists():
    with FEEDBACK.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # blank lines carry no feedback
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                fb_skipped += 1
                continue
            if isinstance(entry, dict):
                fb_rows.append(entry)
            else:
                fb_skipped += 1
if fb_skipped:
    print(f"[warn] skipped {fb_skipped} malformed line(s) in {FEEDBACK.name}")

feedback = pd.DataFrame(fb_rows) if fb_rows else pd.DataFrame(columns=FEEDBACK_COLUMNS)
# Guarantee the expected columns exist even if no record carried them, so the
# later column lookups do not fail with KeyError.
for col in FEEDBACK_COLUMNS:
    if col not in feedback.columns:
        feedback[col] = np.nan

# 4) Build the implicit user-item matrix (UI) with feedback weights
#    - baseline: rating >= 4.0            → weight 1.0
#    - 👎: the user-item pair is removed (ban)
#    - 👍: the pair is present with weight alpha_up (e.g. 2.0)
#    Feedback overrides the baseline: every pair that has feedback is first
#    removed from the baseline positives, then re-added only if it is a 👍.
#    csr_matrix sums duplicate coordinates, so without this step a 👍 on an
#    already-positive pair (or a repeated 👍) would get a weight above alpha_up.
alpha_up = 2.0

# baseline positives
pos = ratings[ratings["rating"] >= 4.0][["userId","movieId"]].copy()
pos["rating"] = 1.0

# map to indices using the existing maps (CRITICAL: must stay consistent with the API)
df = (pos.merge(users_map, on="userId", how="inner")
         .merge(items_map[["item_index","movieId"]], on="movieId", how="inner"))
u = df["user_index"].to_numpy(np.int32)
i = df["item_index"].to_numpy(np.int32)
v = df["rating"].to_numpy(np.float32)

n_users = int(users_map["user_index"].max()) + 1
n_items = int(items_map["item_index"].max()) + 1

# feedback → index pairs
if not feedback.empty:
    # Inner joins keep only feedback whose user and item exist in the maps,
    # so an unknown item_index cannot fall outside the matrix shape.
    fb = (feedback
          .merge(users_map.rename(columns={"userId":"user_id"}), on="user_id", how="inner")
          .merge(items_map[["item_index"]].drop_duplicates(), on="item_index", how="inner"))
    fb = fb.dropna(subset=["user_index","item_index"])
    # Keep only rows whose `relevant` is an actual True/False value.
    is_up = fb["relevant"].eq(True).to_numpy(dtype=bool)
    is_down = fb["relevant"].eq(False).to_numpy(dtype=bool)
    fb = fb[is_up | is_down]
    # One verdict per (user, item): the last record wins.
    # NOTE(review): assumes feedback.jsonl is appended chronologically - confirm.
    fb = fb.drop_duplicates(subset=["user_index","item_index"], keep="last")

    if len(fb):
        fb_u = fb["user_index"].to_numpy(np.int64)
        fb_i = fb["item_index"].to_numpy(np.int64)
        fb_up = fb["relevant"].eq(True).to_numpy(dtype=bool)

        # Encode each (user, item) pair as one int64 key for a vectorised lookup.
        base_key = u.astype(np.int64) * n_items + i
        fb_key = fb_u * n_items + fb_i
        keep = ~np.isin(base_key, fb_key)
        u, i, v = u[keep], i[keep], v[keep]

        # 👍 pairs come back with weight alpha_up; 👎 pairs stay removed.
        u = np.concatenate([u, fb_u[fb_up].astype(np.int32)])
        i = np.concatenate([i, fb_i[fb_up].astype(np.int32)])
        v = np.concatenate([v, np.full(int(fb_up.sum()), float(alpha_up), dtype=np.float32)])

UI = csr_matrix((v, (u, i)), shape=(n_users, n_items), dtype=np.float32)
print("UI shape:", UI.shape, "nnz:", UI.nnz)

# 5) Train/test split over stored interactions (BM25 weighting follows)
# Row/column coordinates are taken from the COO view of UI so that they have
# exactly one entry per stored value, in the same order as UI.data.
# UI.nonzero() would drop explicitly stored zeros, while UI.nnz and UI.data
# still count them, which would misalign the positional indices built here.
ui_coo = UI.tocoo()
rows, cols = ui_coo.row, ui_coo.col
idx_all = np.arange(UI.nnz, dtype=np.int64)
# Random 80/20 split of interaction positions with a fixed seed.
train_idx, test_idx = train_test_split(idx_all, test_size=0.2, random_state=42)

def build_sparse(indices):
    """Return a float32 CSR matrix, shaped like UI, holding only the selected interactions.

    `indices` are positions into the module-level `rows` / `cols` arrays and
    into `UI.data`; the value stored at each selected coordinate is copied over.
    """
    picked_rows = rows[indices]
    picked_cols = cols[indices]
    picked_vals = UI.data[indices]
    coords = (picked_rows, picked_cols)
    return csr_matrix((picked_vals, coords), shape=UI.shape, dtype=np.float32)

# Materialise the train / test interaction matrices from the split indices.
UI_train = build_sparse(train_idx)
UI_test  = build_sparse(test_idx)

# BM25-weight the training interactions, then transpose to ITEM×USER.
from implicit.nearest_neighbours import bm25_weight
UIw = bm25_weight(UI_train, K1=1.2, B=0.75).astype(np.float32)
IUw = UIw.T.tocsr()  # ITEM×USER

# 6) ALS training (+ swap-safe utilities)
from implicit.als import AlternatingLeastSquares
factors = 64     # latent dimensions
reg = 0.02       # regularization strength
iters = 15       # ALS iterations
k_eval = 10      # cut-off used by the evaluation below

model = AlternatingLeastSquares(
    factors=factors,
    regularization=reg,
    iterations=iters,
    random_state=42,
)
# The model is fitted on the ITEM×USER matrix.
# NOTE(review): depending on the installed implicit version, fit() may treat
# the rows of its input as users; the SWAPPED flag below detects that case.
model.fit(IUw)

n_users_train, n_items_train = UI_train.shape[0], UI_train.shape[1]
n_items_model = model.item_factors.shape[0]
n_users_model = model.user_factors.shape[0]
# SWAPPED is True when the factor matrices' row counts are the reverse of the
# (users, items) shape of UI_train, i.e. "item_factors" has one row per user.
SWAPPED = (n_items_model, n_users_model) == (n_users_train, n_items_train)
print("SWAPPED:", SWAPPED)

def get_I():
    """Return the factor matrix that has one row per item.

    When SWAPPED is set, that matrix is stored in `model.user_factors`;
    otherwise it is `model.item_factors`.
    """
    if SWAPPED:
        return model.user_factors
    return model.item_factors
def get_u(uix):
    """Return the factor vector of the user at index `uix`.

    When SWAPPED is set, user vectors are the rows of `model.item_factors`;
    otherwise they are the rows of `model.user_factors`.
    """
    user_matrix = model.item_factors if SWAPPED else model.user_factors
    return user_matrix[uix]

# 7) Evaluation (P@10 / R@10 / MAP@10) with masking of already-seen items
# truth: user index -> set of item indices held out in UI_test.
truth = defaultdict(set)
tr, tc = UI_test.nonzero()
for user_ix, item_ix in zip(tr, tc):
    truth[user_ix].add(item_ix)

# Item popularity = column sums of the training matrix; pop_order lists item
# indices from the largest sum to the smallest.
pop_counts = np.ravel(np.asarray(UI_train.sum(axis=0)))
pop_order = (-pop_counts).argsort()

def rec(uix, N=10):
    """Return up to N recommended item indices for the user at index `uix`.

    Users with no interactions in UI_train get the N most popular items.
    For everyone else, items are scored by the dot product of the item factors
    with the user's vector; items the user already has in UI_train are pushed
    to the bottom, and the result is ordered by descending score.
    """
    user_row = UI_train.getrow(uix)
    if user_row.nnz == 0:
        return pop_order[:N].tolist()

    item_matrix = get_I()
    n_rows = item_matrix.shape[0]
    scores = item_matrix @ get_u(uix)

    # Mask training items (only those that have a row in the factor matrix).
    seen_items = user_row.indices[user_row.indices < n_rows]
    if seen_items.size:
        scores[seen_items] = -1e12

    # Partial selection of the N best scores, then an exact sort of that short list.
    kth = min(N, n_rows - 1)
    candidates = np.argpartition(-scores, kth)[:N]
    ranking = np.argsort(-scores[candidates])
    return candidates[ranking].tolist()

# Evaluate on users that have held-out items; sample at most 500 of them
# with a fixed seed.
users = np.array(list(truth))
if len(users) > 500:
    rng = np.random.default_rng(42)
    users = rng.choice(users, size=500, replace=False)

precs, recs, maps = [], [], []
n_scored_items = get_I().shape[0]
for uix in users:
    # Held-out items that have a row in the item factor matrix.
    relevant = {item for item in truth[uix] if item < n_scored_items}
    if not relevant:
        continue
    ranked = rec(uix, k_eval)
    n_hits = len(set(ranked) & relevant)
    precs.append(n_hits / k_eval)
    recs.append(n_hits / len(relevant))

    # Average precision: sum of precision@rank over the ranks that hit,
    # normalised by the best achievable number of hits.
    running_hits, ap_sum = 0, 0.0
    for rank, item in enumerate(ranked, start=1):
        if item not in relevant:
            continue
        running_hits += 1
        ap_sum += running_hits / rank
    maps.append(ap_sum / min(k_eval, len(relevant)))

# Mean of each per-user metric; 0.0 when no user could be evaluated.
metrics = {
    name: (float(np.mean(values)) if values else 0.0)
    for name, values in (
        ("precision_at_10", precs),
        ("recall_at_10", recs),
        ("map_at_10", maps),
    )
}
print("METRICS:", metrics)

# 8) Save the model (timestamped backup + latest), popularity table, and log to MLflow
ts = time.strftime("%Y%m%d_%H%M%S")
backup_path = BACKUP_DIR / f"als_{ts}.npz"
# Factors are stored exactly as the model exposes them (no un-swapping here).
np.savez_compressed(backup_path, user_factors=model.user_factors, item_factors=model.item_factors)
# Replace "latest" atomically: write a sibling temp file, then rename it over
# the target, so a reader (e.g. the API reloading the model) never observes a
# partially written file.
tmp_latest = MODEL_LATEST.with_name(MODEL_LATEST.name + ".tmp")
tmp_latest.write_bytes(backup_path.read_bytes())
os.replace(tmp_latest, MODEL_LATEST)

# popularity table used for mixing: number of ratings >= 4 per movie,
# joined onto items_map (movies without such ratings get count 0)
pop = (ratings[ratings["rating"]>=4]
       .groupby("movieId").size().rename("count").reset_index())
pop = items_map.merge(pop, on="movieId", how="left").fillna({"count":0})
pop = pop.sort_values("count", ascending=False).reset_index(drop=True)
pop.to_csv(POPULAR_CSV, index=False)

# MLflow: log hyperparameters, metrics and the produced artifacts for this run
import mlflow
mlflow.set_experiment("netflix-poc")
with mlflow.start_run(run_name=f"ALS_feedback_{ts}"):
    mlflow.log_param("factors", factors)
    mlflow.log_param("regularization", reg)
    mlflow.log_param("iterations", iters)
    mlflow.log_param("alpha_upvote", alpha_up)
    mlflow.log_param("swapped_detected", bool(SWAPPED))
    mlflow.log_metric("precision_at_10", metrics["precision_at_10"])
    mlflow.log_metric("recall_at_10", metrics["recall_at_10"])
    mlflow.log_metric("map_at_10", metrics["map_at_10"])
    mlflow.log_artifact(str(backup_path))
    mlflow.log_artifact(str(USERS_MAP))
    mlflow.log_artifact(str(ITEMS_MAP))
    mlflow.log_artifact(str(POPULAR_CSV))

print("Saved model:", backup_path.name, "and updated", MODEL_LATEST.name)

# 9) Ask the API to hot-reload the model
# Best effort: any failure is printed and the cell still completes.
reload_url = f"{API_URL}/admin/reload-model"
try:
    resp = requests.post(reload_url, timeout=5)
    print("API reload:", resp.status_code, resp.text[:200])
except Exception as err:
    print("API reload error:", err)

UI shape: (609, 6298) nnz: 48577


  0%|          | 0/15 [00:00<?, ?it/s]

SWAPPED: True
METRICS: {'precision_at_10': 0.15259999999999999, 'recall_at_10': 0.15034830920398098, 'map_at_10': 0.10818761841773746}
🏃 View run ALS_feedback_20250924_200925 at: http://mlflow:5001/#/experiments/1/runs/d6c157b211e54e4485650e11fa2186b6
🧪 View experiment at: http://mlflow:5001/#/experiments/1
Saved model: als_20250924_200925.npz and updated model_latest.npz
API reload error: HTTPConnectionPool(host='localhost', port=8080): Max retries exceeded with url: /admin/reload-model (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7db28e8b48b0>: Failed to establish a new connection: [Errno 111] Connection refused'))
