In [1]:
import gc
import os
import math
import pickle

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from operator import itemgetter

from sklearn.utils import shuffle
from collections import defaultdict
# from metrics import PrintMetric

import warnings
warnings.filterwarnings("ignore")

raw_data_path = '../rank/examples/dataset/raw_data_sample'
new_data_path = '../rank/examples/dataset/new_data_sample'

os.makedirs(new_data_path, exist_ok=True)

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'metric'

In [None]:
user_info = pd.read_csv(raw_data_path + '/user_info.csv', header=None)
user_info.columns = ["user_id", "设备名称", "操作系统", "所在省", "所在市", "年龄","性别"]

doc_info = pd.read_csv(raw_data_path + '/doc_info.csv', header=None)
doc_info.columns = ["item_id", "标题", "发文时间", "图片数量",  "一级分类", "二级分类", "关键词"]

all_data = pd.read_csv(raw_data_path + '/train_data.csv', header=None)
all_data.columns = ["user_id", "item_id", "展现时间", "网路环境", "刷新次数", "展现位置", "是否点击", "消费时长（秒）"]

In [5]:
all_data['展现时间'] = all_data['展现时间'].astype('str')
all_data['展现时间'] = all_data['展现时间'].apply(lambda x: int(x[:10]))

all_data['展现时间'] = pd.to_datetime(all_data['展现时间'], unit='s', errors='coerce')
all_data['展现时间_日期'] = all_data['展现时间'].dt.day

all_data.head()

Unnamed: 0,user_id,item_id,展现时间,网路环境,刷新次数,展现位置,是否点击,消费时长（秒）,展现时间_日期
0,1000014754,463510256,2021-06-28 01:29:16,5,0,16,0,0,28
1,1000014754,463852707,2021-06-28 01:29:16,5,0,13,1,80,28
2,1000014754,464757134,2021-06-30 11:36:39,5,0,13,1,1050,30
3,1000014754,464617167,2021-06-30 11:36:39,5,0,16,1,286,30
4,1000014754,465426190,2021-07-04 07:07:01,5,0,5,0,0,4


In [6]:
mode = 'debug'

if mode == 'debug':
    all_data = shuffle(all_data)
    all_data.reset_index(drop=True)

    train_data = all_data[(all_data['展现时间_日期'] >= 5) & (all_data['展现时间_日期'] < 6)]
    test_data = all_data.loc[all_data['展现时间_日期'] == 6, :]
else:
    train_data = all_data[(all_data['展现时间_日期'] >= 1) & (all_data['展现时间_日期'] < 6)]
    test_data = all_data.loc[all_data['展现时间_日期'] == 6, :]

# 训练集按照时间排序
train_data.sort_values('展现时间', inplace=True)

del all_data, doc_info, user_info
gc.collect()

179

In [7]:
print(train_data['user_id'].nunique())
print(train_data['item_id'].nunique())

15655
33664


In [9]:
class ItemCF(object):
    def __init__(self, his_data, item2cate):
        self.user_set = set()
        self.his_data = his_data
        self.item2cate = item2cate

        self.item_sim_matrix = dict()
        self.item_interacted_num = defaultdict(int)

    def calculate_similarity_matrix(self):
        # his_data已经按照时间排序....
        user2items = self.his_data.groupby('user_id')['item_id'].apply(list).reset_index()
        # print(f'计算ItemCF第一阶段...')
        pbar = tqdm(total=user2items.shape[0])
        for idx, row in user2items.iterrows():
            self.user_set.add(row['user_id'])
            for idx1, item_1 in enumerate(row['item_id']):
                self.item_interacted_num[item_1] += 1
                self.item_sim_matrix.setdefault(item_1, {})
                for idx2, item_2 in enumerate(row['item_id']):
                    if item_1 == item_2:
                        continue
                    self.item_sim_matrix[item_1].setdefault(item_2, 0)
                    # 新闻阅读可能具有连续性，后续阅读的新闻与前面阅读的新闻相似度更高
                    related_score = 1 if idx1 > idx2 else 0.8
                    # 如果二者类别相同，新闻之间的相似度更高
                    related_score *= 1 if item2cate.get(item_1, None) == item2cate.get(item_2, None) else 0.5

                    # 活跃用户在计算物品之间相似度时，贡献小于非活跃用户
                    self.item_sim_matrix[item_1][item_2] += related_score / math.log(1 + len(row['item_id']))
            pbar.update(1)
        pbar.close()

        # 理论上，物品之间共现的用户越多，相似度越高
        # 但是，热门物品与很多物品之间的相似度都很高
        # print(f'计算ItemCF第二阶段...')
        for item_1, related_items in tqdm(self.item_sim_matrix.items()):
            for item_2, weight in related_items.items():
                # 打压热门物品
                self.item_sim_matrix[item_1][item_2] = \
                    weight / math.sqrt(self.item_interacted_num[item_1] * self.item_interacted_num[item_2])

    def __call__(self, users, _n=50, _topk=20):
        print(f'开始ItemCF召回: Recall@{topk}-Near@{_n}')
        user2items = self.his_data.groupby('user_id')['item_id'].apply(list)
        popular_items = [val[0] for val in sorted(
            self.item_interacted_num.items(), key=lambda x: x[1], reverse=True)[:_topk]]

        user_rec = {}
        for user_id in tqdm(users):
            # 新用户，直接推荐热门物品
            if user_id not in self.user_set:
                user_rec[user_id] = popular_items
            else:
                rank = defaultdict(int)
                his_items = user2items.loc[user_id]
                # 遍历用户历史交互物品
                for his_item in his_items:
                    # 选取与his_item相似度最高的_n个物品
                    for candidate_item, item_smi_score in sorted(self.item_sim_matrix[his_item].items(),
                                                                 key=itemgetter(1), reverse=True)[:_n]:
                        # 如果推荐的物品已经被购买过了，是否纳入推荐（可跳过）
                        # if candidate_item in his_items:
                        #     continue
                        rank[candidate_item] += item_smi_score
                rec_items = [item[0] for item in sorted(rank.items(), key=itemgetter(1), reverse=True)[:_topk]]
                # 如果推荐的物品不够，用热门物品进行填充
                rec_items += popular_items[:topk-len(rec_items)]
                user_rec[user_id] = rec_items

        return user_rec

In [10]:
icf_cls_path = os.path.join(new_data_path, 'item_cf')
os.makedirs(icf_cls_path, exist_ok=True)

In [12]:
demo_icf_path = os.path.join(icf_cls_path, mode+'_ifc.pkl')
if os.path.exists(demo_icf_path):
    with open(demo_icf_path, 'rb') as file:
        demo_icf = pickle.loads(file.read())
        file.close()
else:
    demo_icf = ItemCF(train_data, item2cate)
    demo_icf.calculate_similarity_matrix()
    demo_icf_pkl = pickle.dumps(demo_icf)

    output_icf = open(demo_icf_path, 'wb')
    output_icf.write(demo_icf_pkl)
    output_icf.close()

  0%|          | 0/15655 [00:00<?, ?it/s]

  0%|          | 0/33664 [00:00<?, ?it/s]

In [13]:
n, topk = 50, 100

# 召回
test_users = test_data['user_id'].unique()
icf_rec_result = demo_icf(test_users, n, topk)

test_user_group = test_data.groupby('user_id')['item_id'].agg(list).reset_index()
test_pred = [icf_rec_result[user_id] for user_id in test_user_group['user_id']]
test_true = test_user_group['item_id'].to_list()

开始ItemCF召回: Recall@100-Near@50


  0%|          | 0/13792 [00:00<?, ?it/s]

In [14]:
PrintMetric(test_true, test_pred, topk)

MAP@100:  0.016906571748779006
Recall@100:  0.15798311228206416
Precision@100:  0.027745069605568447
F1@100:  0.03914852311427278
