In [1]:
from pathlib import Path
import sys
import pickle
import os
import polars as pl
from collections import defaultdict
from math import sqrt
from heapq import heappush, heapreplace
from tqdm import tqdm
from math import exp
import numpy as np

In [2]:
def processData(actions: pl.DataFrame):
    """
    Flatten impression logs into one row per clicked item.

    Every element of ``impressions`` is split on "-": the first piece becomes
    ``item_id`` and the second piece is cast to an integer ``label``. Only
    rows whose label equals 1 are kept.

    :param actions: frame with at least user_id, time, impressions
        (``impressions`` is exploded, so it must be a list column)
    :return: frame with columns user_id, item_id, time
    """
    # Shared sub-expression: the "-"-separated pieces of a single impression.
    pieces = pl.col("impressions").str.split("-")

    per_impression = (
        actions
        .select(["user_id", "time", "impressions"])
        .explode("impressions")
    )
    labelled = per_impression.with_columns([
        pieces.list.get(0).alias("item_id"),
        pieces.list.get(1).cast(pl.UInt8).alias("label"),
    ])
    clicked = labelled.filter(pl.col("label") == 1)

    return clicked.select(["user_id", "item_id", "time"])


In [3]:
def read_dev_behaviors():
    """
    Unimplemented stub: does nothing and returns ``None``.

    Presumably meant to load the dev-split behaviors frame; callers that pass
    no explicit path receive ``None`` until this is filled in.
    """
    return None
def read_train_behaviors():
    """
    Unimplemented stub: does nothing and returns ``None``.

    Presumably meant to load the train-split behaviors frame; callers that
    pass no explicit path receive ``None`` until this is filled in.
    """
    return None

In [4]:
def build_usercf_from_history(
    click,
    max_hist_len=100,
    cooc_window=10,
    topk=50,
    tau=86400*0.55
):
    """
    Build a UserCF (user-to-user) similarity table from click records.

    Clicks are sorted by time and grouped by item. Inside each item's click
    sequence, every two clicks that lie at most ``cooc_window`` positions
    apart add ``exp(-|t_i - t_j| / tau)`` to the co-occurrence weight of the
    two users involved. The accumulated weight of a pair is then divided by
    ``sqrt(cnt_i * cnt_j)``, where ``cnt`` is the number of click records
    counted for each user (items with fewer than two retained clicks are
    skipped and do not contribute to the counts).

    A user is never paired with themselves: repeated clicks of the same user
    on one item would otherwise make the user their own neighbour.

    :param click: DataFrame with at least user_id, item_id, time; ``time``
        values must support subtraction yielding an object with
        ``total_seconds()`` (e.g. datetimes)
    :param max_hist_len: per item, only the most recent ``max_hist_len``
        clicks are used
    :param cooc_window: maximum positional distance, within an item's
        time-ordered click sequence, between two clicks that are paired
    :param topk: number of most similar users kept for each user
    :param tau: time-decay constant, in seconds
    :return: dict[user_id] -> list[(sim, user_j)]; each list is maintained as
        a min-heap and is therefore NOT sorted by similarity
    """
    user_cnt = defaultdict(int)  # click records counted per user
    cooc = defaultdict(float)    # time-weighted co-occurrence per (user_i, user_j)

    # For every item, the users who clicked it and when, in time order.
    item_users = (
        click
        .sort("time")
        .group_by("item_id")
        .agg([
            pl.col("user_id").alias("users"),
            pl.col("time").alias("times"),
        ])
    )

    for row in tqdm(item_users.iter_rows(named=True), total=item_users.height):
        users = row["users"][-max_hist_len:]
        times = row["times"][-max_hist_len:]
        n_clicks = len(users)
        if n_clicks < 2:
            continue

        for i in range(n_clicks):
            u_i = users[i]
            ti = times[i]
            user_cnt[u_i] += 1

            left = max(0, i - cooc_window)
            right = min(n_clicks, i + cooc_window + 1)

            for j in range(left, right):
                u_j = users[j]
                # Skips i == j and also repeated clicks by the same user on
                # this item, so no (u, u) self-similarity is ever recorded.
                if u_j == u_i:
                    continue
                tj = times[j]

                dt = abs(ti - tj).total_seconds()
                time_w = np.exp(-dt / tau)
                cooc[(u_i, u_j)] += time_w

    # Keep the topk most similar users per user with a bounded min-heap:
    # heap[0] is always the weakest neighbour currently retained.
    user_sim = defaultdict(list)
    for (i, j), cij in cooc.items():
        sim = cij / np.sqrt(user_cnt[i] * user_cnt[j])
        heap = user_sim[i]
        if len(heap) < topk:
            heappush(heap, (sim, j))
        elif sim > heap[0][0]:
            heapreplace(heap, (sim, j))
    return user_sim


In [5]:
def get_users_earliest_hist(data):
    """
    Collect, per user, the articles clicked in that user's earliest impression.

    Users whose earliest impression contains no entry labelled 1 do not
    appear in the result.

    :param data: frame with user_id, time, history, impressions
    :return: frame with user_id and hist (list of clicked article ids)
    """
    # Step 1: order by time, then take the first row of every user group.
    first_rows = data.sort("time").group_by("user_id").agg(pl.all().first())

    # Step 2: one row per impression entry, split "<article>-<label>" and
    # keep the positives only.
    pieces = pl.col("impressions").str.split("-")
    positives = (
        first_rows
        .select(["user_id", "impressions"])
        .explode("impressions")
        .with_columns([
            pieces.list.get(0).alias("article_id"),
            pieces.list.get(1).cast(pl.Int8).alias("label"),
        ])
        .filter(pl.col("label") == 1)
    )

    # Step 3: fold the clicked articles back into one list per user.
    return positives.group_by("user_id").agg(pl.col("article_id").alias("hist"))

def valid_recall(pred, topk=5, gt_path="behaviors_1.parquet"):
    """
    Print the user-averaged Recall@K for K = 10, 20, ..., 10 * topk.

    Ground truth for a user is the list of articles clicked in that user's
    earliest impression, obtained by running get_users_earliest_hist on the
    parquet file at ``gt_path``. Only users present in both ``pred`` and the
    ground truth are evaluated (inner join). A user's recall is the number of
    ground-truth articles found among the first K recommendations divided by
    the number of ground-truth articles.

    :param pred: frame with user_id and rec_list (ranked list of item ids)
    :param topk: number of cut-offs to report; NOTE this is a count of
        cut-offs, each a multiple of 10, not K itself
    :param gt_path: parquet file with the behaviors used as ground truth
    :return: None; one line per cut-off is printed
    """
    gt = get_users_earliest_hist(pl.read_parquet(gt_path))

    # gt_len does not depend on K, so compute it once outside the loop.
    data = (
        pred
        .join(gt, on="user_id", how="inner")
        .with_columns(pl.col("hist").list.len().alias("gt_len"))
    )

    for i in range(1, topk + 1):
        k = i * 10

        recall_k = (
            data
            # 1. truncate the recommendation list to the first k entries
            .with_columns(pl.col("rec_list").list.slice(0, k).alias("rec_k"))
            # 2. one row per recommended item
            .explode("rec_k")
            # 3. mark whether the item is in the user's ground truth
            .with_columns(
                pl.col("rec_k").is_in(pl.col("hist")).cast(pl.Int8).alias("hit")
            )
            # 4. count hits per user
            .group_by(["user_id", "gt_len"])
            .agg(pl.sum("hit").alias("hit_cnt"))
            # 5. per-user recall
            .with_columns(
                (pl.col("hit_cnt") / pl.col("gt_len")).alias("recall")
            )
            # 6. average over users
            .select(pl.col("recall").mean())
            .item()
        )

        print(f"User-Recall@{k}: {recall_k}")



In [6]:
def recall_usercf_valid(
    train_path=None,
    pred_path=None,
    sim_path="usercf_sim.pkl",
    topk=50,
    tau=86400*0.55,
):
    """
    Produce UserCF recall lists for the earliest impression of every user.

    Training clicks are extracted with processData. The user similarity table
    is loaded from ``sim_path`` when that file exists; otherwise it is built
    with build_usercf_from_history and, if ``sim_path`` is given, cached
    there. For each user in the prediction data, items clicked by the user's
    similar users are scored as ``sim * exp(-(t_now - t_click) / tau)`` and
    summed per item; items already in the user's history are excluded.

    Users absent from the similarity table are skipped and do not appear in
    the result.

    NOTE(review): ``topk`` and ``tau`` here are not forwarded to
    build_usercf_from_history, which uses its own defaults for the
    similarity build.

    :param train_path: parquet with training behaviors; falls back to
        read_train_behaviors() when falsy
    :param pred_path: parquet with behaviors to predict for; falls back to
        read_dev_behaviors() when falsy
    :param sim_path: pickle cache for the similarity table; pass None or ""
        to disable caching
    :param topk: maximum number of recommended items per user
    :param tau: time-decay constant, in seconds, used when scoring items
    :return: DataFrame with user_id, rec_list (item ids, best first) and
        rec_score (matching scores)
    """
    if train_path:
        train = pl.read_parquet(train_path)
    else:
        train = read_train_behaviors()

    # One row per clicked (user, item, time).
    train = processData(train)

    # user -> [(item_id, click time as epoch seconds)]; converting the
    # timestamp once here avoids repeating it for every neighbour lookup.
    train_dict = defaultdict(list)
    for row in train.iter_rows(named=True):
        train_dict[row['user_id']].append((row['item_id'], row['time'].timestamp()))

    if sim_path and os.path.exists(sim_path):
        # SECURITY: pickle.load can execute arbitrary code; only point
        # sim_path at cache files produced by this notebook.
        with open(sim_path, "rb") as f:
            user_sim = pickle.load(f)
    else:
        user_sim = build_usercf_from_history(train)
        # Only write the cache when a path was actually supplied.
        if sim_path:
            with open(sim_path, "wb") as f:
                pickle.dump(user_sim, f)

    del train

    if pred_path:
        pred = pl.read_parquet(pred_path)
    else:
        pred = read_dev_behaviors()

    # Keep only the earliest impression of every user.
    pred = (
        pred
        .sort("time")
        .group_by("user_id")
        .agg(pl.all().first())
    )
    results = []

    for row in pred.iter_rows(named=True):
        user = row["user_id"]
        if user not in user_sim:
            continue

        raw_hist = row["history"]
        if raw_hist is None:
            # A missing history means nothing to exclude.
            seen = ()
        elif isinstance(raw_hist, list):
            # Set gives O(1) membership tests in the inner loop.
            seen = set(raw_hist)
        else:
            # Any other container keeps its own membership semantics.
            seen = raw_hist
        current = row["time"].timestamp()

        scores = defaultdict(float)
        for sim, u_j in user_sim[user]:
            for item_id, ts_hist in train_dict.get(u_j, []):
                if item_id not in seen:
                    dt = (current - ts_hist) / tau
                    time_w = np.exp(-dt)
                    scores[item_id] += sim * time_w

        if not scores:
            results.append((user, [], []))
            continue

        rec = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:topk]
        results.append(
            (
                user,
                [i for i, _ in rec],
                [s for _, s in rec],
            )
        )

    # Each element of `results` is one row; state the orientation explicitly
    # instead of relying on inference.
    return pl.DataFrame(
        results,
        schema=["user_id", "rec_list", "rec_score"],
        orient="row",
    )


In [7]:
# Build recall lists: similarities come from behaviors.parquet, predictions
# are made for the users in behaviors_1.parquet.
res = recall_usercf_valid(
    train_path="behaviors.parquet",
    pred_path="behaviors_1.parquet",
)

100%|██████████| 17304/17304 [00:22<00:00, 780.89it/s]
  return pl.DataFrame(


In [8]:
# Report User-Recall@10..50 for the recall lists computed above.
valid_recall(pred=res)

User-Recall@10: 0.012995384976237619
User-Recall@20: 0.02111357394888329
User-Recall@30: 0.026519578319316086
User-Recall@40: 0.029960741081389613
User-Recall@50: 0.032294012771405055
