
# Most Popular Re-ranking 

In [1]:
import os
os.environ["LOKY_MAX_CPU_COUNT"] = "1"  


import re
import numpy as np
import pandas as pd
from pathlib import Path
import sys, traceback
from datetime import datetime

from typing import Optional, Dict, List, Tuple, Iterable

dt = datetime.now().strftime("%Y%m%d%H%M%S")
test = "untest"
model = "most_popular"
config_model = "als"  


PARENTS_PATH = Path(os.getcwd()).resolve().parent
CONFIG_PATH = os.path.join(PARENTS_PATH, f"config/models/mf/{config_model}.yaml")
PREPROCESS_CONFIG_PATH = os.path.join(PARENTS_PATH, f"config/preprocess/preprocess.yaml")
RESULT_PATH = os.path.join(PARENTS_PATH, f"result/{test}/{model}/{dt}")

ROOT_PATH = os.path.join(PARENTS_PATH,'src')


if str(ROOT_PATH) not in sys.path:
    sys.path.append(str(ROOT_PATH))

print(f"ROOT_PATH: {ROOT_PATH}")
print(f"CONFIG_PATH: {CONFIG_PATH}")
print(f"PREPROCESS_CONFIG_PATH: {PREPROCESS_CONFIG_PATH}")
print(f"RESULT_ROOT: {RESULT_PATH}")


ROOT_PATH: C:\Users\LEEYS\Desktop\yamyam-lab\src
CONFIG_PATH: C:\Users\LEEYS\Desktop\yamyam-lab\config/models/mf/als.yaml
PREPROCESS_CONFIG_PATH: C:\Users\LEEYS\Desktop\yamyam-lab\config/preprocess/preprocess.yaml
RESULT_ROOT: C:\Users\LEEYS\Desktop\yamyam-lab\result/untest/most_popular/20250825211407


In [2]:

PROJECT_MODE = True
try:
    import numpy as np
    import pandas as pd

    from data.dataset import DataConfig, DatasetLoader
    from evaluation.metric_calculator import MostPopularMetricCalculator
    from tools.config import load_yaml
    from tools.logger import common_logging, setup_logger
    from tools.parse_args import save_command_to_file
    from tools.utils import haversine
    print("Project imports succeeded.")
except Exception as e:
    PROJECT_MODE = False
    print("Project imports failed.")
    print("Reason:", repr(e))


Project imports succeeded.


## Function

In [None]:
def extract_region_label(addr: str) -> str:
    """
    Extract a region label (e.g., '서울시 강남구') from a road address string.

    Args:
        addr (str): Address string.

    Returns:
        str: Region label extracted from the address.
             - If the second token ends with '구', returns first two tokens joined.
             - If matches '군', '구', '시', returns the matched group.
             - Otherwise returns the first token.
             - Returns 'unknown' if input is invalid or empty.
    """
    if not isinstance(addr, str) or not addr:
        return "unknown"
    parts = addr.split()
    if len(parts) >= 2 and parts[1].endswith("구"):
        return " ".join(parts[:2])
    m = re.match(r"^(\S+)\s+(\S+구|\S+군|\S+시)", addr)
    return m.group(0) if m else parts[0] if parts else "unknown"


def _minmax(x: np.ndarray) -> np.ndarray:
    """
    Apply min-max normalization to an array.

    Args:
        x (np.ndarray): Input array.

    Returns:
        np.ndarray: Normalized array scaled to [0, 1].
                    Returns zeros if input has no variation.
    """
    x = x.astype(np.float32, copy=False)
    mn, mx = float(x.min()), float(x.max())
    return (x - mn) / (mx - mn + 1e-8) if mx > mn else np.zeros_like(x, dtype=np.float32)


def _validate_and_clip_k(item_ids: np.ndarray, base_scores: np.ndarray, k: int) -> int:
    """
    Validate and clip k to be within the valid range of candidate items.

    Args:
        item_ids (np.ndarray): Candidate item IDs.
        base_scores (np.ndarray): Base scores corresponding to item_ids.
        k (int): Requested number of items.

    Returns:
        int: Clipped value of k (0 if no items or invalid input).
    """
    assert len(item_ids) == len(base_scores), "item_ids/base_scores length mismatch"
    L = len(item_ids)
    return 0 if (L == 0 or k <= 0) else min(k, L)


def _prepare_meta(item_meta: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[int, int]]:
    """
    Prepare metadata for items and build an index mapping.

    Args:
        item_meta (pd.DataFrame): DataFrame containing diner metadata.
                                  Required columns: diner_idx, diner_category_large,
                                  diner_lat, diner_lon.

    Returns:
        Tuple[pd.DataFrame, Dict[int, int]]:
            - Cleaned metadata DataFrame (unique diner_idx).
            - Mapping from diner_idx to row index.
    """
    required = {"diner_idx", "diner_category_large", "diner_lat", "diner_lon"}
    missing = required - set(item_meta.columns)
    if missing:
        raise ValueError(f"item_meta missing columns: {missing}")
    meta = (
        item_meta[["diner_idx", "diner_category_large", "diner_lat", "diner_lon"]]
        .drop_duplicates("diner_idx").reset_index(drop=True)
    )
    id2row = {int(r.diner_idx): i for i, r in meta.iterrows()}
    return meta, id2row


def _filter_candidates_by_meta_and_topm(
    item_ids: np.ndarray,
    rel: np.ndarray,
    id2row: Dict[int, int],
    top_m: Optional[int],
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Filter candidate items by available metadata and optionally keep only top-M by relevance.

    Args:
        item_ids (np.ndarray): Candidate item IDs.
        rel (np.ndarray): Relevance scores for items.
        id2row (Dict[int, int]): Mapping from diner_idx to row index.
        top_m (Optional[int]): If set, keep only top-M items by relevance.

    Returns:
        Tuple[np.ndarray, np.ndarray, np.ndarray]:
            - Filtered item IDs.
            - Corresponding relevance scores.
            - Row indices in metadata DataFrame.
    """
    has_meta = np.fromiter((int(x) in id2row for x in item_ids), dtype=bool, count=len(item_ids))
    item_ids = item_ids[has_meta]
    rel = rel[has_meta]
    if item_ids.size == 0:
        return item_ids, rel, np.array([], dtype=int)

    if top_m is not None and top_m < item_ids.size:
        top_idx = np.argpartition(-rel, kth=top_m - 1)[:top_m]
        item_ids = item_ids[top_idx]
        rel = rel[top_idx]

    rows = np.fromiter((id2row[int(cid)] for cid in item_ids), dtype=int, count=item_ids.size)
    return item_ids, rel, rows


def _encode_categories(meta_sorted: pd.DataFrame) -> Tuple[np.ndarray, pd.Series]:
    """
    Encode categorical labels of diners into numeric codes.

    Args:
        meta_sorted (pd.DataFrame): Metadata DataFrame sorted to match candidates.

    Returns:
        Tuple[np.ndarray, pd.Series]:
            - Category codes as integers.
            - Original category labels as pandas Series.
    """
    cats = meta_sorted["diner_category_large"].astype("category")
    cat_codes = cats.cat.codes.to_numpy(dtype=np.int32)
    return cat_codes, cats


def _geo_precompute(meta_sorted: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Precompute geographic values for efficiency.

    Args:
        meta_sorted (pd.DataFrame): Metadata DataFrame containing diner_lat, diner_lon.

    Returns:
        Tuple[np.ndarray, np.ndarray, np.ndarray]:
            - Latitudes in radians.
            - Longitudes in radians.
            - Cosine of latitudes.
    """
    lat_rad = np.deg2rad(meta_sorted["diner_lat"].to_numpy(dtype=np.float32)).astype(np.float32)
    lon_rad = np.deg2rad(meta_sorted["diner_lon"].to_numpy(dtype=np.float32)).astype(np.float32)
    cos_lat = np.cos(lat_rad).astype(np.float32)
    return (np.ascontiguousarray(lat_rad),
            np.ascontiguousarray(lon_rad),
            np.ascontiguousarray(cos_lat))


def _build_coverage_labels(
    item_ids: np.ndarray,
    cats: pd.Series,
    region_of: Optional[Dict[int, str]],
) -> Tuple[List[List[str]], Dict[str, np.ndarray]]:
    """
    Build coverage labels for items (category + region) and index mapping.

    Args:
        item_ids (np.ndarray): Candidate item IDs.
        cats (pd.Series): Category labels for items.
        region_of (Optional[Dict[int, str]]): Mapping from item_id to region label.

    Returns:
        Tuple[List[List[str]], Dict[str, np.ndarray]]:
            - List of labels per item (category + region).
            - Dictionary mapping label -> array of indices having that label.
    """
    region_of = region_of or {}
    categories_str = cats.astype(str).to_numpy()
    regions = np.array([region_of.get(int(cid), "unknown") for cid in item_ids], dtype=object)

    lab_cat = np.array([f"diner_category_large:{v}" for v in categories_str], dtype=object)
    lab_reg = np.array([f"diner_road_address:{r}" for r in regions], dtype=object)
    labels_by_idx = [[lab_cat[i], lab_reg[i]] for i in range(item_ids.size)]

    label_to_indices: Dict[str, np.ndarray] = {}
    for lab in np.unique(lab_cat):
        label_to_indices[lab] = np.flatnonzero(lab_cat == lab)
    for lab in np.unique(lab_reg):
        label_to_indices[lab] = np.flatnonzero(lab_reg == lab)
    return labels_by_idx, label_to_indices


def _apply_coverage_max_candonly(
    cand_idx: np.ndarray,
    cov_counts: Dict[str, int],
    coverage_max: Dict[str, int],
    label_to_indices: Dict[str, np.ndarray],
) -> np.ndarray:
    """
    Apply coverage_max constraint to filter out candidates exceeding maximum quota.

    Args:
        cand_idx (np.ndarray): Indices of candidate items.
        cov_counts (Dict[str, int]): Current coverage counts per label.
        coverage_max (Dict[str, int]): Maximum allowed counts per label.
        label_to_indices (Dict[str, np.ndarray]): Label-to-indices mapping.

    Returns:
        np.ndarray: Filtered candidate indices.
    """
    if not coverage_max or cand_idx.size == 0:
        return cand_idx
    cand_mask = np.ones(cand_idx.size, dtype=bool)
    idx_in_cand = {int(i): pos for pos, i in enumerate(cand_idx)}
    for lab, mx in coverage_max.items():
        if mx is None:
            continue
        if cov_counts.get(lab, 0) >= mx:
            idxs = label_to_indices.get(lab)
            if idxs is None or idxs.size == 0:
                continue
            for i in idxs:
                pos = idx_in_cand.get(int(i))
                if pos is not None:
                    cand_mask[pos] = False
    return cand_idx[cand_mask]


def _coverage_min_bonus_candonly(
    cand_idx: np.ndarray,
    cov_counts: Dict[str, int],
    coverage_min: Dict[str, int],
    label_to_indices: Dict[str, np.ndarray],
    step: float = 0.05,
) -> np.ndarray:
    """
    Compute bonus scores for candidates based on coverage_min deficits.

    Args:
        cand_idx (np.ndarray): Indices of candidate items.
        cov_counts (Dict[str, int]): Current coverage counts per label.
        coverage_min (Dict[str, int]): Minimum desired counts per label.
        label_to_indices (Dict[str, np.ndarray]): Label-to-indices mapping.
        step (float): Bonus increment per deficit unit.

    Returns:
        np.ndarray: Bonus values for candidates.
    """
    if not coverage_min or cand_idx.size == 0:
        return np.zeros(cand_idx.size, dtype=np.float32)
    bonus = np.zeros(cand_idx.size, dtype=np.float32)
    idx_in_cand = {int(i): pos for pos, i in enumerate(cand_idx)}
    for lab, mn in coverage_min.items():
        deficit = mn - cov_counts.get(lab, 0)
        if deficit <= 0:
            continue
        idxs = label_to_indices.get(lab)
        if idxs is None or idxs.size == 0:
            continue
        for i in idxs:
            pos = idx_in_cand.get(int(i))
            if pos is not None:
                bonus[pos] += deficit * step
    return bonus


def _geo_similarity_haversine(
    lat_vec_rad: np.ndarray,
    lon_vec_rad: np.ndarray,
    sel_lat_rad: float,
    sel_lon_rad: float,
    tau_km: float,
) -> np.ndarray:
    """
    Compute geographic similarity exp(-d/tau) using haversine distance.

    Args:
        lat_vec_rad (np.ndarray): Latitudes of candidates in radians.
        lon_vec_rad (np.ndarray): Longitudes of candidates in radians.
        sel_lat_rad (float): Selected latitude in radians.
        sel_lon_rad (float): Selected longitude in radians.
        tau_km (float): Decay parameter in kilometers.

    Returns:
        np.ndarray: Geographic similarity values in [0, 1].
    """
    lat_deg = np.degrees(lat_vec_rad)
    lon_deg = np.degrees(lon_vec_rad)
    sel_lat_deg = float(np.degrees(sel_lat_rad))
    sel_lon_deg = float(np.degrees(sel_lon_rad))

    d_km = haversine(
        reviewer_lat=sel_lat_deg,
        reviewer_lon=sel_lon_deg,
        diner_lat=pd.Series(lat_deg),
        diner_lon=pd.Series(lon_deg),
    )

    inv_tau = 1.0 / max(float(tau_km), 1e-6)
    return np.exp(-np.asarray(d_km, dtype=np.float64) * inv_tau).astype(np.float32)


In [None]:

# --- 메인: MMR re-ranking (지리 유사도만 haversine 기반으로 교체) ---
def rerank_most_popular_with_diversity(
    item_ids: np.ndarray,
    base_scores: np.ndarray,
    item_meta: pd.DataFrame,
    k: int,
    lambda_div: float = 0.55,      
    w_cat: float = 0.5,
    w_geo: float = 0.5,
    geo_tau_km: float = 2.0,
    coverage_min: Optional[Dict[str, int]] = None,
    coverage_max: Optional[Dict[str, int]] = None,
    region_of: Optional[Dict[int, str]] = None,
    popularity_weight: float = 0.0,
    popularity_scores: Optional[np.ndarray] = None,
    normalize_rel: bool = True,
    top_m: Optional[int] = None,         
    debug: bool = False,
    prefix_freeze: int = 0,               
    coverage_step: float = 0.05,          
):
    """
    Re-rank the Most Popular recommendation list with category and geographic diversity
    using a Maximal Marginal Relevance (MMR) approach.

    The final score of a candidate item i is computed as:
        score(i) = λ · rel(i) - (1-λ) · sim_max(i) + bonus(i)
    where:
        - rel(i): normalized relevance (optionally blended with popularity)
        - sim_max(i): maximum similarity to already selected items
          sim(i,j) = w_cat * [cat(i) = cat(j)] + w_geo * exp(-d(i,j)/τ)
          with d(i,j) the haversine distance in km and τ = geo_tau_km
        - bonus(i): additional bonus to encourage coverage_min constraints

    Args:
        item_ids (np.ndarray): Array of candidate item IDs.
        base_scores (np.ndarray): Base relevance scores for each item.
        item_meta (pd.DataFrame): Metadata with required columns:
            ['diner_idx', 'diner_category_large', 'diner_lat', 'diner_lon'].
        k (int): Number of items to select.
        lambda_div (float, optional): Trade-off parameter (λ↑ → accuracy↑, diversity↓).
        w_cat (float, optional): Weight for category similarity.
        w_geo (float, optional): Weight for geographic similarity.
        geo_tau_km (float, optional): Decay parameter (km) for geographic similarity kernel.
        coverage_min (Optional[Dict[str, int]], optional): Minimum coverage requirements per label.
        coverage_max (Optional[Dict[str, int]], optional): Maximum coverage limits per label.
        region_of (Optional[Dict[int, str]], optional): Mapping from item_id to region label.
        popularity_weight (float, optional): Weight for blending relevance with popularity.
        popularity_scores (Optional[np.ndarray], optional): Popularity scores for items.
        normalize_rel (bool, optional): Whether to min-max normalize relevance.
        top_m (Optional[int], optional): Pre-select top-M items by relevance before re-ranking.
        debug (bool, optional): If True, print simple debug info during first steps.
        prefix_freeze (int, optional): Keep top-T original items fixed in the final ranking.
        use_geo_fast (bool, optional): Legacy flag (ignored).
        coverage_step (float, optional): Step size for coverage_min bonus increments.

    Returns:
        Tuple[np.ndarray, np.ndarray]:
            - Selected item IDs (length k).
            - (Currently empty) array for scores, placeholder for extension.
    """

    k = _validate_and_clip_k(item_ids, base_scores, k)
    if k == 0:
        return np.array([], dtype=int), np.array([], dtype=float)

    base_scores = np.asarray(item_ids, dtype=np.int64) * 0 + np.asarray(base_scores, dtype=np.float32)  # ensure same length
    rel = base_scores.astype(np.float32, copy=True)
    if normalize_rel:
        rel = _minmax(rel)
    if popularity_scores is not None:
        pop = np.asarray(popularity_scores, dtype=np.float32)
        if normalize_rel:
            pop = _minmax(pop)
        rel = (1 - popularity_weight) * rel + popularity_weight * pop

    lam = float(np.clip(lambda_div, 0.0, 1.0))
    rel_n = _minmax(rel)  # 루프 밖 1회

    meta, id2row = _prepare_meta(item_meta)
    item_ids_f, rel_f, rows = _filter_candidates_by_meta_and_topm(item_ids, rel_n, id2row, top_m)
    if item_ids_f.size == 0:
        return np.array([], dtype=int), np.array([], dtype=float)

    meta_sorted = meta.iloc[rows]
    cat_codes, cats = _encode_categories(meta_sorted)
    lat_rad, lon_rad, _cos_lat_unused = _geo_precompute(meta_sorted)
    labels_by_idx, label_to_indices = _build_coverage_labels(item_ids_f, cats, region_of)

    T = int(max(0, min(prefix_freeze, k, item_ids_f.size)))
    frozen_ids = item_ids_f[:T].astype(int, copy=False)
    size = item_ids_f.size
    alive = np.ones(size, dtype=bool)
    alive[:T] = False

    coverage_min = coverage_min or {}
    coverage_max = coverage_max or {}
    cov_counts: Dict[str, int] = {}
    for i in range(T):
        for lab in labels_by_idx[i]:
            cov_counts[lab] = cov_counts.get(lab, 0) + 1

    current_max_sim = np.zeros(size, dtype=np.float32)
    if T > 0 and alive.any():
        aidx = np.flatnonzero(alive)
        for sel in range(T):
            sel_code = cat_codes[sel]
            sel_lat = float(lat_rad[sel]); sel_lon = float(lon_rad[sel])

            sim_cat = (cat_codes[aidx] == sel_code).astype(np.float32)
            sim_geo = _geo_similarity_haversine(
                lat_vec_rad=lat_rad[aidx],
                lon_vec_rad=lon_rad[aidx],
                sel_lat_rad=sel_lat,
                sel_lon_rad=sel_lon,
                tau_km=geo_tau_km,
            )
            combined = (w_cat * sim_cat + w_geo * sim_geo).astype(np.float32)
            np.maximum(current_max_sim[aidx], combined, out=current_max_sim[aidx])

    chosen_ids: List[int] = [] if T == 0 else list(frozen_ids)
    chosen_scores: List[float] = []  # MMR 점수도 기록용

    step = 0
    while len(chosen_ids) < k and alive.any():
        cand_idx = np.flatnonzero(alive)
        if cand_idx.size == 0:
            break

        cand_idx = _apply_coverage_max_candonly(
            cand_idx=cand_idx,
            cov_counts=cov_counts,
            coverage_max=coverage_max,
            label_to_indices=label_to_indices,
        )
        if cand_idx.size == 0:
            break

        bonus_c = _coverage_min_bonus_candonly(
            cand_idx=cand_idx,
            cov_counts=cov_counts,
            coverage_min=coverage_min,
            label_to_indices=label_to_indices,
            step=coverage_step,
        )

        sim_c = current_max_sim[cand_idx]
        mn, mx = float(sim_c.min()), float(sim_c.max())
        sim_c_n = (sim_c - mn) / (mx - mn + 1e-8) if mx > mn else np.zeros_like(sim_c, dtype=np.float32)

        mmr = lam * rel_n[cand_idx] - (1.0 - lam) * sim_c_n + bonus_c
        best_local = int(np.argmax(mmr))
        best_idx = int(cand_idx[best_local])

        chosen_ids.append(int(item_ids_f[best_idx]))
        alive[best_idx] = False

        for lab in labels_by_idx[best_idx]:
            cov_counts[lab] = max(0, cov_counts.get(lab, 0)) + 1

        if alive.any():
            sel_code = cat_codes[best_idx]
            sel_lat = float(lat_rad[best_idx]); sel_lon = float(lon_rad[best_idx])
            aidx = np.flatnonzero(alive)

            sim_cat = (cat_codes[aidx] == sel_code).astype(np.float32)
            sim_geo = _geo_similarity_haversine(
                lat_vec_rad=lat_rad[aidx],
                lon_vec_rad=lon_rad[aidx],
                sel_lat_rad=sel_lat,
                sel_lon_rad=sel_lon,
                tau_km=geo_tau_km,
            )
            combined = (w_cat * sim_cat + w_geo * sim_geo).astype(np.float32)
            np.maximum(current_max_sim[aidx], combined, out=current_max_sim[aidx])

        if debug and step in (0, 1):
            ci = np.flatnonzero(alive)
            if ci.size:
                print(f"[step {step}] penalty.std={current_max_sim[ci].std():.5f}")
        step += 1

    return np.array(chosen_ids[:k], dtype=int), np.array([], dtype=float)

In [None]:
def rerank_region_periphery(
    item_ids: np.ndarray,
    base_scores: np.ndarray,
    item_meta_std: pd.DataFrame,
    k: int,
    region_label: str = "서울 강남구",
    hotspot_coords: Optional[Iterable[Tuple[float, float]]] = None,
    n_auto_hotspots: int = 5,
    periphery_strength: float = 0.5,
    periphery_cap: float = 0.15,
    lambda_div: float = 0.55,
    w_cat: float = 0.5,
    w_geo: float = 0.5,
    geo_tau_km: float = 2.0,
    coverage_min: Optional[Dict[str, int]] = None,
    coverage_max: Optional[Dict[str, int]] = None,
    region_of: Optional[Dict[int, str]] = None,
):
    """
    Re-rank items within a target region by adding a periphery bonus (farther from hotspots)
    and then applying MMR-based re-ranking with category/geography diversity.

    The periphery bonus is computed by:
      1) Determining hotspot centers (given or auto via KMeans).
      2) Computing each candidate's minimum haversine distance to the centers.
      3) Min-max normalizing that minimum distance and scaling/clipping to [0, periphery_cap].
      4) Adding the bonus to base_scores before calling MMR re-ranking.

    Args:
        item_ids (np.ndarray): Array of candidate item IDs (ordered by base ranking).
        base_scores (np.ndarray): Base relevance scores aligned with item_ids.
        item_meta_std (pd.DataFrame): Metadata containing at least:
            ['diner_idx', 'diner_lat', 'diner_lon', 'diner_road_address'].
        k (int): Number of items to select.
        region_label (str, optional): Target region label (e.g., "서울 강남구").
        hotspot_coords (Optional[Iterable[Tuple[float, float]]], optional):
            Iterable of (lat, lon) hotspot coordinates in degrees. If None, auto-detected.
        n_auto_hotspots (int, optional): Number of clusters (hotspots) to auto-detect via KMeans when not provided.
        periphery_strength (float, optional): Scale for the periphery bonus before capping.
        periphery_cap (float, optional): Upper cap applied to the periphery bonus.
        lambda_div (float, optional): MMR trade-off between relevance and diversity [0, 1].
        w_cat (float, optional): Weight for category similarity in MMR.
        w_geo (float, optional): Weight for geographic similarity in MMR.
        geo_tau_km (float, optional): Length scale (km) for geographic similarity kernel.
        coverage_min (Optional[Dict[str, int]], optional): Minimum coverage constraints per label.
        coverage_max (Optional[Dict[str, int]], optional): Maximum coverage constraints per label.
        region_of (Optional[Dict[int, str]], optional): Mapping from item_id to region label (used by MMR coverage).

    Returns:
        Tuple[np.ndarray, np.ndarray]:
            - Selected item IDs (length ≤ k).
            - Placeholder scores array (currently empty).
    """
    # 0) 기본 변환
    item_ids = np.asarray(item_ids, dtype=np.int64)
    base_scores = np.asarray(base_scores, dtype=np.float32)
    meta = item_meta_std

    # 1) 지역 필터
    target_region = extract_region_label(region_label)
    if "diner_road_address" in meta.columns:
        region_norm = meta["diner_road_address"].map(extract_region_label)
        region_idx = meta.loc[region_norm == target_region, "diner_idx"].to_numpy(dtype=np.int64, copy=False)
    else:
        region_idx = np.empty(0, dtype=np.int64)

    if region_idx.size > 0:
        mask = np.isin(item_ids, region_idx, assume_unique=False)
        item_ids_g = item_ids[mask]
        base_scores_g = base_scores[mask]
    else:
        # 필터 결과가 0이면 원본 전체 후보 사용
        item_ids_g = item_ids
        base_scores_g = base_scores

    if item_ids_g.size == 0 or k <= 0:
        return item_ids[:0], base_scores[:0]

    # 2) 좌표 정리
    meta_idx = meta.set_index("diner_idx", drop=False)
    latlon = meta_idx.reindex(item_ids_g)[["diner_lat", "diner_lon"]].to_numpy(dtype=np.float32)
    valid = np.isfinite(latlon).all(axis=1)
    latlon_valid = latlon[valid]

    # 3) 핫스팟 결정 (주어지지 않으면 KMeans로 자동 산출)
    if hotspot_coords is None and n_auto_hotspots > 0 and latlon_valid.shape[0] >= 2:
        from sklearn.cluster import KMeans
        n_clusters = int(min(n_auto_hotspots, latlon_valid.shape[0]))
        km = KMeans(n_clusters=n_clusters, n_init=5, random_state=42)
        km.fit(latlon_valid)
        centers = km.cluster_centers_.astype(np.float32, copy=False)   # degrees
    elif hotspot_coords is not None:
        centers = np.asarray(list(hotspot_coords), dtype=np.float32)   # degrees
    else:
        centers = np.empty((0, 2), dtype=np.float32)

    # 4) 변두리 보너스 (정확한 거리: haversine 사용)
    periphery_bonus = np.zeros_like(base_scores_g, dtype=np.float32)
    if centers.size and latlon_valid.shape[0] > 0 and periphery_strength > 0 and periphery_cap > 0:
        # centers: shape (Nc, 2) in degrees; latlon_valid: (Nv, 2) in degrees
        # 각 center에 대해 haversine(center, 모든 valid 후보) → (Nv, Nc) 거리 행렬 후 min
        dists_stack = []
        diner_lat_series = pd.Series(latlon_valid[:, 0], copy=False)
        diner_lon_series = pd.Series(latlon_valid[:, 1], copy=False)
        for c_lat, c_lon in centers:
            d_km = haversine(
                reviewer_lat=float(c_lat),
                reviewer_lon=float(c_lon),
                diner_lat=diner_lat_series,
                diner_lon=diner_lon_series,
            )  # (Nv,)
            dists_stack.append(np.asarray(d_km, dtype=np.float64))
        if dists_stack:
            D = np.vstack(dists_stack).T  # (Nv, Nc)
            dmin = D.min(axis=1).astype(np.float32, copy=False)
            # 0–1 정규화 후 가점
            dmin_n = _minmax(dmin)  # float32, [0,1]
            bonus_valid = np.clip(periphery_strength * dmin_n, 0.0, periphery_cap).astype(np.float32, copy=False)
            periphery_bonus[valid] = bonus_valid  # invalid 좌표는 0 유지

    base_scores_boosted = (base_scores_g + periphery_bonus).astype(np.float32, copy=False)

    # 5) 최종 MMR 재랭크 호출
    use_geo = float(np.isfinite(latlon).sum()) >= 2.0
    final_ids, final_scores = rerank_most_popular_with_diversity(
        item_ids=item_ids_g,
        base_scores=base_scores_boosted,
        item_meta=meta,
        k=k,
        lambda_div=lambda_div,
        w_cat=w_cat,
        w_geo=w_geo if use_geo else 0.0,
        geo_tau_km=geo_tau_km,
        coverage_min=coverage_min,
        coverage_max=coverage_max,
        region_of=region_of,
        popularity_weight=0.0,
        popularity_scores=None,
        normalize_rel=True,
        top_m=None,
        debug=False,
    )
    return final_ids, final_scores


## Project Mode (Main)


In [None]:
if PROJECT_MODE:
    ROOT_PATH = Path(ROOT_PATH)
    RESULT_PATH = Path(RESULT_PATH)

    RESULT_PATH.mkdir(parents=True, exist_ok=True)

    config = load_yaml(CONFIG_PATH)
    preprocess_config = load_yaml(str(PREPROCESS_CONFIG_PATH))

    # 로그 세팅 및 실행 커맨드 기록
    file_name = config.post_training.file_name
    logger = setup_logger(str(RESULT_PATH / file_name.log))
    save_command_to_file(str(RESULT_PATH))

    try:
        logger.info("model: most_popular")
        logger.info(f"training results will be saved in {RESULT_PATH}")

        # 데이터 로딩
        fe = config.preprocess.feature_engineering
        data_config=DataConfig(
                X_columns=["diner_idx", "reviewer_id"],
                y_columns=["reviewer_review_score"],
                user_engineered_feature_names=fe.user_engineered_feature_names,
                diner_engineered_feature_names=fe.diner_engineered_feature_names,
                is_timeseries_by_time_point=config.preprocess.data.is_timeseries_by_time_point,
                train_time_point=config.preprocess.data.train_time_point,
                val_time_point=config.preprocess.data.val_time_point,
                test_time_point=config.preprocess.data.test_time_point,
                end_time_point=config.preprocess.data.end_time_point,
                test=False,
            )
        data_config.additional_reviews_path = PARENTS_PATH / data_config.additional_reviews_path 
        data_loader = DatasetLoader(data_config = data_config)
        data = data_loader.prepare_train_val_dataset(is_csr=True, filter_config=preprocess_config.filter)
        common_logging(config=config, data=data, logger=logger)

        # K 설정
        top_k_values_for_pred = config.training.evaluation.top_k_values_for_pred
        top_k_values_for_candidate = config.training.evaluation.top_k_values_for_candidate
        top_k_values = top_k_values_for_pred + top_k_values_for_candidate

        item_meta = data["diner_meta"]
        candidates = np.array(data["most_popular_diner_ids"], dtype=np.int64)
        base_scores = 1.0 / (np.arange(len(candidates)) + 1)

        meta_ids = item_meta["diner_idx"]
        if not pd.api.types.is_integer_dtype(meta_ids.dtype):
            meta_vals = pd.to_numeric(meta_ids, errors="coerce").dropna().astype(np.int64).to_numpy()
        else:
            meta_vals = meta_ids.to_numpy(dtype=np.int64, copy=False)

        mask = np.isin(candidates, meta_vals)
        dropped = int((~mask).sum())
        if dropped:
            logger.warning(f"dropped {dropped} candidates not in item_meta")

        candidates = candidates[mask]
        base_scores = base_scores[mask]

        reranked_ids, _ = rerank_region_periphery(
            item_ids=candidates,
            base_scores=base_scores,
            item_meta_std=item_meta,
            k=max(top_k_values),
            region_label="서울 강남구",
            hotspot_coords=None,
            n_auto_hotspots=5,      
            periphery_strength=0.5,  
            periphery_cap=0.15,
            lambda_div=0.55,       
            w_cat=0.5, w_geo=0.5,    
            geo_tau_km=2.0,         
        )
        reranked_most_popular = reranked_ids.tolist()

        # 평가
        metric_calculator = MostPopularMetricCalculator(
            top_k_values=top_k_values,
            filter_already_liked=True,
            recommend_batch_size=config.training.evaluation.recommend_batch_size,
            logger=logger,
        )

        metric_dict = metric_calculator.generate_recommendations_and_calculate_metric(
            X_train=data["X_train_df"],
            X_val_warm_users=data["X_val_warm_users"],
            X_val_cold_users=data["X_val_cold_users"],
            most_popular_diner_ids=reranked_most_popular,
            filter_already_liked=True,
            most_popular_rec_to_warm_users=True,
        )
        for user_type, metric in metric_dict.items():
            metric_calculator.calculate_mean_metric(metric)
        logger.info("################################ Validation data metric report ################################")
        metric_calculator.report_metric_with_warm_cold_all_users(metric_dict=metric_dict, data_type="val")

        metric_dict = metric_calculator.generate_recommendations_and_calculate_metric(
            X_train=data["X_train_df"],
            X_val_warm_users=data["X_test_warm_users"],
            X_val_cold_users=data["X_test_cold_users"],
            most_popular_diner_ids=reranked_most_popular,
            filter_already_liked=True,
            most_popular_rec_to_warm_users=True,
        )
        for user_type, metric in metric_dict.items():
            metric_calculator.calculate_mean_metric(metric)
        logger.info("################################ Test data metric report ################################")
        metric_calculator.report_metric_with_warm_cold_all_users(metric_dict=metric_dict, data_type="test")

        print("Project pipeline finished. See logs for details:", RESULT_PATH / file_name.log)

    except Exception as e:
        print("Error during project run:", repr(e))
        traceback.print_exc()


2025-08-25 00:30:28,739 - yamyam - INFO - model: most_popular
2025-08-25 00:30:28,740 - yamyam - INFO - training results will be saved in C:\Users\LEEYS\Desktop\yamyam-lab\result\untest\most_popular\20250825003028


기존 data가 존재합니다. 파일 경로를 반환합니다.


2025-08-25 00:37:17,966 - preprocess.filter - INFO - Token time for tokenizing: 369.46
2025-08-25 00:37:19,455 - preprocess.filter - INFO - Detected 10 diner_ids with abusive reviews: [20557155.0, 561814157.0, 717255023.0, 1210281986.0, 1210386151.0, 1275807781.0, 1390211388.0, 1420824177.0, 1567102742.0, 1983344097.0]
2025-08-25 00:37:19,686 - preprocess.filter - INFO - Excluded 3204 abusive reviews
2025-08-25 00:37:22,552 - yamyam - INFO - train dataset period: 2024-09-01 <= dt < 2024-12-01
2025-08-25 00:37:22,553 - yamyam - INFO - val dataset period: 2024-12-01 <= dt < 2025-01-01
2025-08-25 00:37:22,554 - yamyam - INFO - test dataset period: 2025-01-01 <= dt < 2025-02-01
2025-08-25 00:37:22,555 - yamyam - INFO - ######## Number of reviews statistics ########
2025-08-25 00:37:22,555 - yamyam - INFO - Number of reviews in train: 666811
2025-08-25 00:37:22,556 - yamyam - INFO - Number of reviews in val: 666811
2025-08-25 00:37:22,561 - yamyam - INFO - Number of reviews in test: 666811


Project pipeline finished. See logs for details: C:\Users\LEEYS\Desktop\yamyam-lab\result\untest\most_popular\20250825003028\log.log


## 추가 실험 

In [None]:
# ===================== Variants: BALANCED vs BASED vs DIV_HEAVY =====================

# (안전) logger 없으면 간단 세팅
try:
    logger
except NameError:
    import logging, sys
    logger = logging.getLogger("rerank_test")
    logger.setLevel(logging.INFO)
    h = logging.StreamHandler(sys.stdout)
    h.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
    logger.addHandler(h)

# 0) 공통 준비 --------------------------------------------------------------------------
item_meta = data["diner_meta"]

# 후보/베이스 점수 
candidates = np.array(data["most_popular_diner_ids"], dtype=np.int64)
base_scores = 1.0 / (np.arange(len(candidates)) + 1)

meta_ids = item_meta["diner_idx"]
if not pd.api.types.is_integer_dtype(meta_ids.dtype):
    meta_vals = pd.to_numeric(meta_ids, errors="coerce").dropna().astype(np.int64).to_numpy()
else:
    meta_vals = meta_ids.to_numpy(dtype=np.int64, copy=False)

mask = np.isin(candidates, meta_vals)
dropped = int((~mask).sum())
if dropped:
    logger.warning(f"dropped {dropped} candidates not in item_meta")
candidates = candidates[mask]
base_scores = base_scores[mask]

# K 고정
k = int(max(top_k_values))

# 결과 비교 로그
def _log_variant(name: str, ref_ids: np.ndarray, var_ids: np.ndarray, k_show: int = 10):
    ref_top = ref_ids[:k]
    var_top = var_ids[:k]
    overlap = len(set(ref_top) & set(var_top))
    logger.info(f"[{name}] overlap@K={overlap}/{k}")
    for i, (a, b) in enumerate(zip(ref_top, var_top)):
        if a != b:
            logger.info(f"[{name}] first diff at pos {i}: ref={a}, var={b}")
            break
    else:
        logger.info(f"[{name}] no diff within top-{k}")
    logger.info(f"[{name}] head{ k_show } ref={ref_top[:k_show].tolist()}")
    logger.info(f"[{name}] head{ k_show } var={var_top[:k_show].tolist()}")

# 메트릭 계산/리포트
def _report_metrics(label: str, ids_seq):
    seq = ids_seq.tolist() if isinstance(ids_seq, np.ndarray) else list(ids_seq)

    # Validation
    metric_dict = metric_calculator.generate_recommendations_and_calculate_metric(
        X_train=data["X_train_df"],
        X_val_warm_users=data["X_val_warm_users"],
        X_val_cold_users=data["X_val_cold_users"],
        most_popular_diner_ids=seq,
        filter_already_liked=True,
        most_popular_rec_to_warm_users=True,
    )
    for user_type, metric in metric_dict.items():
        metric_calculator.calculate_mean_metric(metric)
    logger.info(f"######## Validation metrics ({label}) ########")
    metric_calculator.report_metric_with_warm_cold_all_users(metric_dict=metric_dict, data_type="val")

    # Test
    metric_dict = metric_calculator.generate_recommendations_and_calculate_metric(
        X_train=data["X_train_df"],
        X_val_warm_users=data["X_test_warm_users"],
        X_val_cold_users=data["X_test_cold_users"],
        most_popular_diner_ids=seq,
        filter_already_liked=True,
        most_popular_rec_to_warm_users=True,
    )
    for user_type, metric in metric_dict.items():
        metric_calculator.calculate_mean_metric(metric)
    logger.info(f"######## Test metrics ({label}) ########")
    metric_calculator.report_metric_with_warm_cold_all_users(metric_dict=metric_dict, data_type="test")

# 1) BALANCED -----------------------------------------------------
balanced_ids, _ = rerank_region_periphery(
    item_ids=candidates,
    base_scores=base_scores,
    item_meta_std=item_meta,
    k=k,
    region_label="서울 강남구",
    hotspot_coords=None,
    n_auto_hotspots=5,       
    periphery_strength=0.5,  
    periphery_cap=0.15,
    lambda_div=0.55,         
    w_cat=0.5, w_geo=0.5,   
    geo_tau_km=2.0,
)
logger.info("[BALANCED] generated")

# 2) BASED  -------------------------------------------------------------
BASED_ids = candidates[:k].copy()
logger.info("[BASED] generated")

# 3) DIV_HEAVY (다양성 극대화) ----------------------------------------------------------
div_heavy_ids, _ = rerank_region_periphery(
    item_ids=candidates,
    base_scores=base_scores,
    item_meta_std=item_meta,
    k=k,
    region_label="서울 강남구",
    hotspot_coords=None,
    n_auto_hotspots=5,
    periphery_strength=0.8,   
    periphery_cap=0.30,
    lambda_div=0.20,         
    w_cat=0.0, w_geo=1.0,     
    geo_tau_km=1.5,           
)
logger.info("[DIV_HEAVY] generated")

# 정확도 극대화
# accuracy_heavy_ids, _ = rerank_region_periphery(
#     item_ids=candidates, base_scores=base_scores, item_meta_std=item_meta, k=k,
#     region_label="서울 강남구", hotspot_coords=None, n_auto_hotspots=0,
#     periphery_strength=0.0, periphery_cap=0.0,
#     lambda_div=0.95, w_cat=1.0, w_geo=0.0, geo_tau_km=2.0
# )

_log_variant("BASED vs BALANCED", balanced_ids, BASED_ids, k_show=10)
_log_variant("DIV_HEAVY vs BALANCED", balanced_ids, div_heavy_ids, k_show=10)
# _log_variant("ACCURACY_HEAVY vs BALANCED", balanced_ids, accuracy_heavy_ids, k_show=10)

metric_calculator = MostPopularMetricCalculator(
    top_k_values=top_k_values,
    filter_already_liked=True,
    recommend_batch_size=config.training.evaluation.recommend_batch_size,
    logger=logger,
)

_report_metrics("BALANCED", balanced_ids)
_report_metrics("BASED", BASED_ids)
_report_metrics("DIV_HEAVY", div_heavy_ids)
# _report_metrics("ACCURACY_HEAVY", accuracy_heavy_ids)

reranked_most_popular = balanced_ids.tolist()
# ======================================================================================


2025-08-25 00:59:11,964 - yamyam - INFO - [BALANCED] generated
2025-08-25 00:59:11,964 - yamyam - INFO - [POP_BASE] generated
2025-08-25 00:59:15,803 - yamyam - INFO - [DIV_HEAVY] generated
2025-08-25 00:59:15,804 - yamyam - INFO - [POP_BASE vs BALANCED] overlap@K=141/2000
2025-08-25 00:59:15,806 - yamyam - INFO - [POP_BASE vs BALANCED] first diff at pos 0: ref=159368, var=99146
2025-08-25 00:59:15,807 - yamyam - INFO - [POP_BASE vs BALANCED] head10 ref=[159368, 48283, 74907, 120101, 159318, 104501, 95844, 37572, 9863, 111554]
2025-08-25 00:59:15,808 - yamyam - INFO - [POP_BASE vs BALANCED] head10 var=[99146, 115693, 4522, 62327, 125037, 126769, 6778, 131370, 48283, 51000]
2025-08-25 00:59:15,809 - yamyam - INFO - [DIV_HEAVY vs BALANCED] overlap@K=141/2000
2025-08-25 00:59:15,811 - yamyam - INFO - [DIV_HEAVY vs BALANCED] first diff at pos 1: ref=48283, var=135396
2025-08-25 00:59:15,812 - yamyam - INFO - [DIV_HEAVY vs BALANCED] head10 ref=[159368, 48283, 74907, 120101, 159318, 104501, 