
# Most Popular Re-ranking 

In [50]:

# === 경로/설정 ===
from pathlib import Path
import os, sys, traceback
from datetime import datetime


# Run identifiers. The timestamp is part of the result directory, so every
# execution of this cell points at a fresh output folder.
dt = datetime.now().strftime("%Y%m%d%H%M%S")
test = "untest"
model = "most_popular"
config_model = "als"

# The parent of the current working directory is treated as the project root;
# all other locations are derived from it.
PARENTS_PATH = Path(os.getcwd()).resolve().parent
ROOT_PATH = os.path.join(PARENTS_PATH, "src")
CONFIG_PATH = os.path.join(PARENTS_PATH, "config/models/mf/" + config_model + ".yaml")
PREPROCESS_CONFIG_PATH = os.path.join(PARENTS_PATH, "config/preprocess/preprocess.yaml")
RESULT_PATH = os.path.join(PARENTS_PATH, "/".join(["result", test, model, dt]))

# Expose the `src` directory to the import system (only once per kernel).
if ROOT_PATH not in sys.path:
    sys.path.append(ROOT_PATH)

print("ROOT_PATH:", ROOT_PATH)
print("CONFIG_PATH:", CONFIG_PATH)
print("PREPROCESS_CONFIG_PATH:", PREPROCESS_CONFIG_PATH)
print("RESULT_ROOT:", RESULT_PATH)


ROOT_PATH: C:\Users\LEEYS\Desktop\yamyam-lab\src
CONFIG_PATH: C:\Users\LEEYS\Desktop\yamyam-lab\config/models/mf/als.yaml
PREPROCESS_CONFIG_PATH: C:\Users\LEEYS\Desktop\yamyam-lab\config/preprocess/preprocess.yaml
RESULT_ROOT: C:\Users\LEEYS\Desktop\yamyam-lab\result/untest/most_popular/20250817232110


In [12]:

# Flag read by the main pipeline cell: it stays True only if every import in
# the try-block below succeeds, and is flipped to False otherwise.
PROJECT_MODE = True
try:
    import numpy as np
    import pandas as pd

    # Project-local modules. They are expected to resolve through the `src`
    # directory that the configuration cell appended to sys.path.
    from data.dataset import DataConfig, DatasetLoader
    from evaluation.metric_calculator import MostPopularMetricCalculator
    from tools.config import load_yaml
    from tools.logger import common_logging, setup_logger
    from tools.parse_args import save_command_to_file

    print("Project imports succeeded.")
except Exception as e:
    # Best-effort guard: any failure while importing disables project mode
    # instead of aborting the notebook; the reason is printed for diagnosis.
    PROJECT_MODE = False
    print("Project imports failed.")
    print("Reason:", repr(e))


Project imports succeeded.


In [13]:

import re
import numpy as np
import pandas as pd
from typing import List, Dict, Optional, Tuple, Iterable

def _haversine_km(lat1, lon1, lat2, lon2):
    R = 6371.0
    dlat = np.radians(lat2 - lat1); dlon = np.radians(lon2 - lon1)
    a = np.sin(dlat/2)**2 + np.cos(np.radians(lat1))*np.cos(np.radians(lat2))*np.sin(dlon/2)**2
    return 2 * R * np.arcsin(np.sqrt(a))

def _geo_similarity_km(latlon_i, latlon_j, tau_km: float):
    li = np.asarray(latlon_i, dtype=float)
    lj = np.asarray(latlon_j, dtype=float)
    if li.shape != (2,) or lj.shape != (2,):
        return 0.0
    if np.any(np.isnan(li)) or np.any(np.isnan(lj)): 
        return 0.0

    R = 6371.0
    dlat = np.radians(lj[0] - li[0])
    dlon = np.radians(lj[1] - li[1])
    a = np.sin(dlat/2)**2 + np.cos(np.radians(li[0]))*np.cos(np.radians(lj[0]))*np.sin(dlon/2)**2
    dist = 2 * R * np.arcsin(np.sqrt(a))
    return float(np.exp(-dist / max(tau_km, 1e-6)))

def extract_region_label(addr: str) -> str:
    """Derive a coarse region label from a whitespace-separated address.

    The label is the first two tokens joined by a single space when the
    second token ends with an administrative suffix ("구", "군" or "시"),
    e.g. ``"서울 강남구 테헤란로 1" -> "서울 강남구"``. Otherwise it is the
    first token alone.

    Only whole tokens are inspected, so a suffix character that merely occurs
    inside the second token (e.g. "성수시장길") no longer produces a truncated
    label, and leading or repeated whitespace does not change the result.

    Args:
        addr: Address string; any non-string or blank value is tolerated.

    Returns:
        The region label, or ``"unknown"`` when ``addr`` is not a string or
        contains no tokens.
    """
    if not isinstance(addr, str):
        return "unknown"

    tokens = addr.split()
    if not tokens:
        return "unknown"

    # str.endswith accepts a tuple, so all three suffixes are checked at once.
    if len(tokens) >= 2 and tokens[1].endswith(("구", "군", "시")):
        return " ".join(tokens[:2])
    return tokens[0]

def build_item_meta_for_rerank(diner_meta: pd.DataFrame) -> pd.DataFrame:
    """Build the per-diner metadata table used by the re-ranking functions.

    The input frame is copied, never modified. Missing optional columns are
    filled so that the result always has the same schema:

    * ``region``: kept if present; otherwise derived from
      ``diner_road_address`` via ``extract_region_label``; otherwise
      ``"unknown"`` for every row.
    * ``lat`` / ``lon``: created as NaN when absent, then coerced to numeric
      (unparseable values become NaN).

    Args:
        diner_meta: Frame that must contain ``diner_idx`` and ``category``.

    Returns:
        A frame with columns ``diner_idx, category, lat, lon, region`` holding
        the first row for each ``diner_idx``.

    Raises:
        KeyError: If ``diner_idx`` or ``category`` is missing.
    """
    meta = diner_meta.copy()

    if "region" not in meta.columns:
        if "diner_road_address" in meta.columns:
            meta["region"] = meta["diner_road_address"].map(extract_region_label)
        else:
            # Same fallback label that extract_region_label uses for blank input,
            # so a missing address column no longer ends in a KeyError below.
            meta["region"] = "unknown"

    for coord in ("lat", "lon"):
        if coord not in meta.columns:
            meta[coord] = np.nan
        meta[coord] = pd.to_numeric(meta[coord], errors="coerce")

    return meta[["diner_idx", "category", "lat", "lon", "region"]].drop_duplicates("diner_idx")


In [14]:

def _pairwise_sim_max(i_meta: pd.Series, selected_meta: List[pd.Series], w_cat: float, w_geo: float, geo_tau_km: float) -> float:
    if not selected_meta:
        return 0.0
    cat_i = i_meta.get("category", "unknown")
    latlon_i = (i_meta.get("lat", np.nan), i_meta.get("lon", np.nan))
    sims = []
    for j_meta in selected_meta:
        cat_sim = 1.0 if (cat_i == j_meta.get("category", None)) else 0.0
        geo_sim = _geo_similarity_km(latlon_i, (j_meta.get("lat", np.nan), j_meta.get("lon", np.nan)), geo_tau_km)
        sims.append(w_cat * cat_sim + w_geo * geo_sim)
    return float(np.max(sims))

def _apply_coverage_constraints(
    cand_ids: np.ndarray,
    chosen_ids: List[int],
    item_meta_idx: pd.DataFrame,
    coverage_min: dict,
    coverage_max: dict,
    region_of: dict,
):
    if not coverage_min and not coverage_max:
        return cand_ids

    def _labels_of(item_id: int):
        if item_id not in item_meta_idx.index:
            labels = ["category:unknown"]
            if region_of: labels.append(f"region:{region_of.get(int(item_id), 'unknown')}")
            return labels
        row = item_meta_idx.loc[item_id]
        labels = [f"category:{row['category']}"]
        if region_of: labels.append(f"region:{region_of.get(int(item_id), 'unknown')}")
        return labels

    counts = {}
    for cid in chosen_ids:
        for lab in _labels_of(int(cid)):
            counts[lab] = counts.get(lab, 0) + 1

    keep = []
    for cid in cand_ids:
        labs = _labels_of(int(cid))
        ok = True
        if coverage_max:
            for lab, mx in coverage_max.items():
                if lab in labs and counts.get(lab, 0) >= mx:
                    ok = False; break
        if ok: keep.append(cid)
    return np.array(keep, dtype=cand_ids.dtype)

def rerank_most_popular_with_diversity(
    item_ids: np.ndarray,
    base_scores: np.ndarray,
    item_meta: pd.DataFrame,
    k: int,
    lambda_div: float = 0.35,
    w_cat: float = 0.6,
    w_geo: float = 0.4,
    geo_tau_km: float = 2.0,
    coverage_min: Optional[Dict[str, int]] = None,
    coverage_max: Optional[Dict[str, int]] = None,
    region_of: Optional[Dict[int, str]] = None,
    popularity_weight: float = 0.5,
    popularity_scores: Optional[np.ndarray] = None,
):
    """Greedy MMR-style re-ranking that trades relevance against diversity.

    At every step the candidate maximising
    ``(1 - lambda_div) * relevance - lambda_div * max_similarity_to_selected``
    is picked, where the similarity comes from ``_pairwise_sim_max``
    (category match and geographic proximity). Ties go to the earliest
    candidate.

    Args:
        item_ids: Candidate diner ids (integer-like). Ids without a row in
            ``item_meta`` are ignored.
        base_scores: Relevance score per candidate, aligned with ``item_ids``.
        item_meta: Frame with at least ``diner_idx`` and ``category``;
            ``lat``/``lon`` are used when present. Only the first row per
            ``diner_idx`` is used.
        k: Maximum number of items to return.
        lambda_div: Weight of the diversity penalty.
        w_cat, w_geo, geo_tau_km: Forwarded to ``_pairwise_sim_max``.
        coverage_min: ``{label: n}``; candidates carrying a label that is still
            below ``n`` among the chosen items get a bonus of 0.05 per missing
            item (a soft constraint).
        coverage_max: ``{label: n}``; enforced as a hard filter through
            ``_apply_coverage_constraints``.
        region_of: Optional ``{diner_id: region}`` map; when non-empty, items
            also carry a ``region:<value>`` label.
        popularity_weight, popularity_scores: When ``popularity_scores`` is
            given, relevance becomes a convex blend of the min-max normalised
            base and popularity scores.

    Returns:
        ``(ids, scores)`` as an int array and a float array of equal length
        (at most ``k``), in selection order. Scores are the MMR scores at the
        time of selection.
    """
    item_ids = np.asarray(item_ids)
    base_scores = np.asarray(base_scores)
    assert len(item_ids) == len(base_scores), "item_ids/base_scores 길이 불일치"
    k = min(k, len(item_ids))

    rel = base_scores.astype(float)
    if popularity_scores is not None:
        popularity = np.asarray(popularity_scores, dtype=float)
        assert len(popularity) == len(item_ids), "item_ids/popularity_scores length mismatch"

        def _norm(x):
            # Min-max scaling; constant (or empty) input maps to zeros.
            if x.size == 0:
                return x
            x = x - np.min(x)
            denom = np.max(x)
            return x / denom if denom > 0 else np.zeros_like(x)

        rel = (1 - popularity_weight) * _norm(rel) + popularity_weight * _norm(popularity)

    # A unique index guarantees that .loc[id] yields one row (a Series).
    item_meta_idx = item_meta.drop_duplicates("diner_idx").set_index("diner_idx", drop=False)

    has_meta = np.array([cid in item_meta_idx.index for cid in item_ids], dtype=bool)
    if not has_meta.all():
        item_ids = item_ids[has_meta]
        rel = rel[has_meta]

    # Resolve relevance and metadata once per id instead of once per candidate
    # per selection round. The first occurrence wins for repeated ids.
    rel_of: Dict[int, float] = {}
    row_of: Dict[int, pd.Series] = {}
    for cid, cid_rel in zip(item_ids, rel):
        key = int(cid)
        if key not in rel_of:
            rel_of[key] = float(cid_rel)
            row_of[key] = item_meta_idx.loc[key]

    def _labels(item_id: int, row: pd.Series) -> List[str]:
        labels = [f"category:{row['category']}"]
        if region_of:
            labels.append(f"region:{region_of.get(item_id, 'unknown')}")
        return labels

    coverage_bonus_per_missing = 0.05
    label_counts: Dict[str, int] = {}  # label usage among chosen items (for coverage_min)

    chosen_ids: List[int] = []
    chosen_scores: List[float] = []
    selected_meta: List[pd.Series] = []
    remaining = item_ids.copy()

    while len(chosen_ids) < k and len(remaining) > 0:
        remaining = _apply_coverage_constraints(
            cand_ids=remaining,
            chosen_ids=chosen_ids,
            item_meta_idx=item_meta_idx,
            coverage_min=coverage_min or {},
            coverage_max=coverage_max or {},
            region_of=region_of or {},
        )
        if len(remaining) == 0:
            break

        mmr_scores = np.empty(len(remaining), dtype=float)
        for pos, cid in enumerate(remaining):
            key = int(cid)
            row = row_of[key]
            div_penalty = _pairwise_sim_max(row, selected_meta, w_cat, w_geo, geo_tau_km)
            score = (1 - lambda_div) * rel_of[key] - lambda_div * div_penalty

            if coverage_min:
                labs = _labels(key, row)
                for lab, mn in coverage_min.items():
                    have = label_counts.get(lab, 0)
                    if lab in labs and have < mn:
                        score += (mn - have) * coverage_bonus_per_missing

            mmr_scores[pos] = score

        best_idx = int(np.argmax(mmr_scores))
        best_id = int(remaining[best_idx])
        best_row = row_of[best_id]

        chosen_ids.append(best_id)
        chosen_scores.append(float(mmr_scores[best_idx]))
        selected_meta.append(best_row)
        if coverage_min:
            for lab in _labels(best_id, best_row):
                label_counts[lab] = label_counts.get(lab, 0) + 1

        remaining = remaining[remaining != best_id]

    return np.array(chosen_ids, dtype=int), np.array(chosen_scores, dtype=float)


In [None]:

from typing import Iterable, Tuple

def rerank_region_periphery(
    item_ids: np.ndarray,
    base_scores: np.ndarray,
    item_meta_std: pd.DataFrame,
    k: int,
    region_label: str = "서울 강남구",
    hotspot_coords: Optional[Iterable[Tuple[float, float]]] = None,
    n_auto_hotspots: int = 3,
    periphery_strength: float = 0.25,
    periphery_cap: float = 0.3,
    lambda_div: float = 0.3,
    w_cat: float = 0.6,
    w_geo: float = 0.3,
    geo_tau_km: float = 2.0,
    coverage_min: Optional[Dict[str, int]] = None,
    coverage_max: Optional[Dict[str, int]] = None,
    region_of: Optional[Dict[int, str]] = None,
):
    """Re-rank the candidates of one region, boosting items far from hotspots.

    Steps:
      1. Keep only candidates whose ``region`` in ``item_meta_std`` equals
         ``region_label``. If none remain, the first ``k`` inputs are
         returned unchanged.
      2. Determine hotspot centres: ``hotspot_coords`` when given, otherwise
         KMeans centres over the candidates' valid (lat, lon) pairs.
      3. Add ``clip(periphery_strength * d_norm, 0, periphery_cap)`` to each
         score, where ``d_norm`` is the min-max normalised haversine distance
         to the nearest centre. Items without coordinates get no bonus.
      4. Delegate to ``rerank_most_popular_with_diversity`` with the boosted
         scores (popularity blending disabled).

    Args:
        item_ids: Candidate diner ids.
        base_scores: Relevance per candidate, aligned with ``item_ids``.
        item_meta_std: Frame with ``diner_idx, category, lat, lon, region``.
            Only the first row per ``diner_idx`` is used.
        k: Maximum number of items to return.
        region_label: Region to restrict the candidates to.
        hotspot_coords: Optional iterable of (lat, lon) centres in degrees.
        n_auto_hotspots: Upper bound on the number of KMeans clusters.
        periphery_strength, periphery_cap: Scale and cap of the bonus.
        lambda_div, w_cat, w_geo, geo_tau_km, coverage_min, coverage_max,
            region_of: Forwarded to ``rerank_most_popular_with_diversity``.
            ``w_geo`` is forced to 0 when no candidate has a finite coordinate.

    Returns:
        ``(ids, scores)`` arrays in final ranking order.
    """
    item_ids = np.asarray(item_ids)
    base_scores = np.asarray(base_scores)

    # A unique diner_idx keeps the .loc lookup below aligned 1:1 with the ids.
    meta = item_meta_std.drop_duplicates("diner_idx")
    region_ids = set(meta.loc[meta["region"] == region_label, "diner_idx"].tolist())

    # Explicit bool dtype: an empty comprehension would otherwise yield a float
    # array, which NumPy rejects as an index.
    mask = np.array([iid in region_ids for iid in item_ids], dtype=bool)
    item_ids_g = item_ids[mask]
    base_scores_g = base_scores[mask]

    if len(item_ids_g) == 0:
        return item_ids[:k], base_scores[:k]

    meta_idx = meta.set_index("diner_idx")
    latlon = meta_idx.loc[item_ids_g, ["lat", "lon"]].astype(float).values
    valid = ~np.isnan(latlon).any(axis=1)
    latlon_valid = latlon[valid]

    if hotspot_coords is not None:
        centers = np.array(list(hotspot_coords), dtype=float)
    elif latlon_valid.shape[0] > 0:
        # Imported lazily so scikit-learn is only needed for automatic hotspots.
        from sklearn.cluster import KMeans
        n_clusters = min(n_auto_hotspots, latlon_valid.shape[0])
        km = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
        km.fit(latlon_valid)
        centers = km.cluster_centers_
    else:
        # No usable coordinates: nothing to cluster and no bonus to hand out.
        centers = np.empty((0, 2), dtype=float)

    periphery_bonus = np.zeros_like(base_scores_g, dtype=float)
    if len(centers) > 0 and latlon_valid.shape[0] > 0:
        # (n_items, n_centers) distance matrix via broadcasting.
        dist_matrix = _haversine_km(
            latlon_valid[:, 0][:, None],
            latlon_valid[:, 1][:, None],
            centers[:, 0][None, :],
            centers[:, 1][None, :],
        )
        dists_min = dist_matrix.min(axis=1)

        # max - min replaces ndarray.ptp(), which no longer exists in NumPy 2.
        lowest = dists_min.min()
        spread = dists_min.max() - lowest
        if spread > 0:
            d_norm = (dists_min - lowest) / spread
        else:
            d_norm = np.zeros_like(dists_min)

        periphery_bonus[valid] = np.clip(periphery_strength * d_norm, 0.0, periphery_cap)

    base_scores_boosted = base_scores_g + periphery_bonus

    final_ids, final_scores = rerank_most_popular_with_diversity(
        item_ids=item_ids_g,
        base_scores=base_scores_boosted,
        item_meta=meta,
        k=k,
        lambda_div=lambda_div,
        w_cat=w_cat,
        w_geo=w_geo if np.isfinite(latlon).any() else 0.0,
        geo_tau_km=geo_tau_km,
        coverage_min=coverage_min,
        coverage_max=coverage_max,
        region_of=region_of,
        popularity_weight=0.0,
        popularity_scores=None,
    )
    return final_ids, final_scores



## Project Mode (Main)


In [None]:
if PROJECT_MODE:
    # The configuration cell builds these as strings; the pipeline uses Path objects.
    ROOT_PATH = Path(ROOT_PATH)
    RESULT_PATH = Path(RESULT_PATH)

    RESULT_PATH.mkdir(parents=True, exist_ok=True)

    config = load_yaml(CONFIG_PATH)
    preprocess_config = load_yaml(str(PREPROCESS_CONFIG_PATH))

    # Set up logging and record the launching command next to the results.
    file_name = config.post_training.file_name
    logger = setup_logger(str(RESULT_PATH / file_name.log))
    save_command_to_file(str(RESULT_PATH))

    try:
        logger.info("model: most_popular")
        logger.info(f"training results will be saved in {RESULT_PATH}")

        # Load the train/val/test data.
        fe = config.preprocess.feature_engineering
        data_config = DataConfig(
            X_columns=["diner_idx", "reviewer_id"],
            y_columns=["reviewer_review_score"],
            user_engineered_feature_names=fe.user_engineered_feature_names,
            diner_engineered_feature_names=fe.diner_engineered_feature_names,
            is_timeseries_by_time_point=config.preprocess.data.is_timeseries_by_time_point,
            train_time_point=config.preprocess.data.train_time_point,
            val_time_point=config.preprocess.data.val_time_point,
            test_time_point=config.preprocess.data.test_time_point,
            end_time_point=config.preprocess.data.end_time_point,
            test=False,
        )
        # Anchor the additional-reviews path at the project root.
        data_config.additional_reviews_path = PARENTS_PATH / data_config.additional_reviews_path
        data_loader = DatasetLoader(data_config=data_config)
        data = data_loader.prepare_train_val_dataset(is_csr=True, filter_config=preprocess_config.filter)
        common_logging(config=config, data=data, logger=logger)

        # Evaluation cut-offs (K values).
        top_k_values_for_pred = config.training.evaluation.top_k_values_for_pred
        top_k_values_for_candidate = config.training.evaluation.top_k_values_for_candidate
        top_k_values = top_k_values_for_pred + top_k_values_for_candidate
        max_k = max(top_k_values)

        # Item metadata for the re-ranker.
        item_meta = build_item_meta_for_rerank(data["diner_meta"])

        # Candidate pool: the head of the popularity ranking, scored by reciprocal rank.
        N = max(2000, max_k)
        candidates = np.array(data["most_popular_diner_ids"][:N], dtype=int)
        base_scores = 1.0 / (np.arange(len(candidates)) + 1)

        # Count dropped candidates from the mask itself; np.intersect1d returns
        # unique values and would miscount if the candidate list had repeats.
        mask = np.isin(candidates, item_meta["diner_idx"].values)
        n_dropped = int((~mask).sum())
        if n_dropped > 0:
            logger.warning(f"dropped {n_dropped} candidates not in item_meta")
        candidates = candidates[mask]
        base_scores = base_scores[mask]

        reranked_ids, _ = rerank_region_periphery(
            item_ids=candidates,
            base_scores=base_scores,
            item_meta_std=item_meta,
            k=max_k,
            region_label="서울 강남구",
            hotspot_coords=None,
            n_auto_hotspots=3,
            periphery_strength=0.25,
            periphery_cap=0.3,
            lambda_div=0.3,
            w_cat=0.6,
            w_geo=0.3,
            geo_tau_km=2.0,
        )
        reranked_most_popular = reranked_ids.tolist()
        if len(reranked_most_popular) < max_k:
            # The region filter can leave fewer items than the largest K.
            logger.warning(
                f"re-ranked list has {len(reranked_most_popular)} items, fewer than max top-k {max_k}"
            )

        # Evaluation.
        metric_calculator = MostPopularMetricCalculator(
            top_k_values=top_k_values,
            filter_already_liked=True,
            recommend_batch_size=config.training.evaluation.recommend_batch_size,
            logger=logger,
        )

        # Both splits go through the same steps; only the data keys, the report
        # title and the data_type tag differ.
        eval_splits = (
            ("val", "X_val_warm_users", "X_val_cold_users", "Validation"),
            ("test", "X_test_warm_users", "X_test_cold_users", "Test"),
        )
        for data_type, warm_key, cold_key, title in eval_splits:
            metric_dict = metric_calculator.generate_recommendations_and_calculate_metric(
                X_train=data["X_train_df"],
                X_val_warm_users=data[warm_key],
                X_val_cold_users=data[cold_key],
                most_popular_diner_ids=reranked_most_popular,
                filter_already_liked=True,
                most_popular_rec_to_warm_users=True,
            )
            for user_type, metric in metric_dict.items():
                metric_calculator.calculate_mean_metric(metric)
            logger.info(f"################################ {title} data metric report ################################")
            metric_calculator.report_metric_with_warm_cold_all_users(metric_dict=metric_dict, data_type=data_type)

        print("Project pipeline finished. See logs for details:", RESULT_PATH / file_name.log)

    except Exception as e:
        # Keep the notebook alive, but also write the failure to the run log so
        # the result directory shows that the run did not complete.
        logger.exception("most_popular re-ranking pipeline failed")
        print("Error during project run:", repr(e))
        traceback.print_exc()


2025-08-17 23:23:18,292 - yamyam - INFO - model: most_popular
2025-08-17 23:23:18,292 - yamyam - INFO - training results will be saved in C:\Users\LEEYS\Desktop\yamyam-lab\result\untest\most_popular\20250817232110


기존 data가 존재합니다. 파일 경로를 반환합니다.


2025-08-17 23:31:00,933 - preprocess.filter - INFO - Token time for tokenizing: 421.44
2025-08-17 23:31:02,467 - preprocess.filter - INFO - Detected 10 diner_ids with abusive reviews: [20557155.0, 561814157.0, 717255023.0, 1210281986.0, 1210386151.0, 1275807781.0, 1390211388.0, 1420824177.0, 1567102742.0, 1983344097.0]
2025-08-17 23:31:02,663 - preprocess.filter - INFO - Excluded 3204 abusive reviews
2025-08-17 23:31:05,489 - yamyam - INFO - train dataset period: 2024-09-01 <= dt < 2024-12-01
2025-08-17 23:31:05,489 - yamyam - INFO - val dataset period: 2024-12-01 <= dt < 2025-01-01
2025-08-17 23:31:05,489 - yamyam - INFO - test dataset period: 2025-01-01 <= dt < 2025-02-01
2025-08-17 23:31:05,501 - yamyam - INFO - ######## Number of reviews statistics ########
2025-08-17 23:31:05,501 - yamyam - INFO - Number of reviews in train: 666811
2025-08-17 23:31:05,501 - yamyam - INFO - Number of reviews in val: 666811
2025-08-17 23:31:05,501 - yamyam - INFO - Number of reviews in test: 666811


Project pipeline finished. See logs for details: C:\Users\LEEYS\Desktop\yamyam-lab\result\untest\most_popular\20250817232110\log.log
