# Bangumi Collaborative Filtering

本文将使用协同过滤算法为你推荐动画。协同过滤是一种推荐算法，使用 Bangumi 上所有用户的评分数据，找到与你品味相似的用户，然后根据他们的评分数据为你推荐没有看过的动画。

## Setup

在这一步中，我们将导入所需的库并加载数据集。

点击代码框左侧的三角形按钮运行代码。

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity

import json, time
import warnings
warnings.filterwarnings('ignore')

In [3]:
on_kaggle = True
input_dir = '../input/bangumi15m/bangumi15M/raw_data' if on_kaggle else '../ponet/bangumi15M'

collection = pd.read_csv(f'{input_dir}/AnonymousUserCollection.csv')
collection.head()

Unnamed: 0,user_id,subject_id,rating,type,updated_at,subject_type
0,YMG2tLRg1fH87hmA,1836,0,wish,2008-07-14 16:20:43+08:00,2
1,pGIFQxzZVt01SRTu,12,8,collect,2008-07-14 21:00:52+08:00,2
2,pGIFQxzZVt01SRTu,12,8,collect,2008-07-14 21:00:52+08:00,2
3,0qtskFvO7yaAEDf3,12,8,collect,2008-07-15 08:11:52+08:00,2
4,lS7qKhYTgERQroN0,12,0,wish,2008-07-16 18:20:53+08:00,2


In [4]:
collection['type'].value_counts()

type
collect    10752499
wish        2347803
doing        832861
dropped      645453
on_hold      622750
Name: count, dtype: int64

In [5]:
features = ['user_id', 'subject_id', 'rating']
coll = collection[collection['rating'] != 0] # discard unrated
coll = coll[collection['type'] != 'dropped'] # discard dropped
coll = coll[features]
coll.head()

Unnamed: 0,user_id,subject_id,rating
1,pGIFQxzZVt01SRTu,12,8
2,pGIFQxzZVt01SRTu,12,8
3,0qtskFvO7yaAEDf3,12,8
7,lGxqM1UNmnbk0cAa,12,8
8,psXEPFjt7vTxdLKC,12,6


In [6]:
subjects = pd.read_csv(f'{input_dir}/Subjects.csv', index_col=0)
subjects.head()

Unnamed: 0,id,name_cn,name,date,type,score,rating,on_hold,dropped,wish,collect,doing,platform,tags,total_episodes,eps,volumes,locked,nsfw
1,1,第一次的亲密接触,第一次的親密接觸,1998-09-25,1,7.7,"{'rank': 1511, 'total': 74, 'count': {'1': 0, ...",3,2,25,113,1,小说,"[{'name': '痞子蔡', 'count': 24}, {'name': '轻舞飞扬'...",0,0,0,False,False
2,2,,坟场,,3,8.1,"{'rank': 0, 'total': 422, 'count': {'1': 3, '2...",18,20,116,539,48,,"[{'name': '陈绮贞', 'count': 9}, {'name': '中配', '...",0,0,0,True,False
3,3,空中杀手,スカイ・クロラ The Sky Crawlers,2008-08-02,2,7.7,"{'rank': 518, 'total': 1901, 'count': {'1': 1,...",102,24,1160,2597,54,剧场版,"[{'name': '押井守', 'count': 820}, {'name': '剧场版'...",1,1,0,False,False
4,4,合金弹头7,メタルスラッグ7,2008-07-17,4,6.9,"{'rank': 3983, 'total': 154, 'count': {'1': 0,...",5,4,21,213,6,NDS,"[{'name': 'NDS', 'count': 55}, {'name': '合金弹头'...",0,0,0,False,False
5,5,使命召唤4：现代战争,Call of Duty 4: Modern Warfare,2007-11-05,4,8.3,"{'rank': 183, 'total': 1606, 'count': {'1': 0,...",22,17,98,2071,35,全部游戏,"[{'name': 'FPS', 'count': 430}, {'name': '使命召唤...",0,0,0,False,False


In [7]:
def parse_rating(rating, key):
    try:
        rating = json.loads(rating.replace('\'', '\"'))
        return int(rating[key])
    except:
        return np.nan

features = ['id', 'name', 'score', 'rank', 'total']
sub = subjects[subjects['type'] == 2][~subjects['locked']][~subjects['nsfw']] # only anime and not locked or nsfw
sub['name'] = sub['name_cn'].combine_first(sub['name'])
sub['rank'] = sub['rating'].apply(lambda x: parse_rating(x, 'rank'))
sub['total'] = sub['rating'].apply(lambda x: parse_rating(x, 'total'))
sub = sub[sub['rank'] != 0] # discard unranked (voter < 50) for performance
sub = sub[features]

sub.head()

Unnamed: 0,id,name,score,rank,total
3,3,空中杀手,7.7,518,1901
8,8,Code Geass 反叛的鲁路修R2,8.2,104,11695
12,12,人形电脑天使心,7.6,798,3754
50,50,机动战士高达,7.9,327,2557
51,51,CLANNAD,8.4,69,16414


In [8]:
subject_list = sub['id'].tolist()
coll = coll[coll['subject_id'].isin(subject_list)]

coll = coll.drop_duplicates(subset=['user_id', 'subject_id'], keep='first')

print(collection.shape, coll.shape, subjects.shape, sub.shape)

(15201366, 6) (7351395, 3) (434269, 19) (7327, 5)


如上所示，我们将使用 7,327 部动画的 7,351,395 条评分数据构建模型。

## Model

### Train-Test Split

In [9]:
# split train and test. make sure that each user's data is in the same set
def grouped_split(df, group, test_sizes=[0.2]):
    np.random.seed(21474)
    _df = df.copy()
    split_names = ['train'] + [f'test{i}' for i in range(1, len(test_sizes)+1)]
    portions = [1-sum(test_sizes)] + test_sizes
    _df['split'] = np.random.choice(split_names, size=len(_df), p=portions)
    _df['split'] = _df.groupby(group)['split'].transform('first')
    return _df

coll = grouped_split(coll, 'user_id', test_sizes=[0.1, 0.7])
coll['split'].value_counts()

split
test2    5141312
train    1457247
test1     752836
Name: count, dtype: int64

In [10]:
train = coll[coll['split'] == 'train']
test1 = coll[coll['split'] == 'test1']
test2 = coll[coll['split'] == 'test2']
assert len(set(train['user_id']) & set(test1['user_id'])) == 0

In [None]:
# kf = KFold(n_splits=5, shuffle=True, random_state=21474)
# for train_idx, test_idx in kf.split(coll):
#     print(len(train_idx), len(test_idx))

### Collaborative Filtering

出于内存限制，我们只使用 20% 数据作为训练集，另外 10% 数据作为测试集。

In [11]:
features = ['user_id', 'subject_id']
target = 'rating'
X = coll[features]
y = coll[target]

In [None]:
# rmse_scores = []
# user_subject_matrix = coll.pivot(index='user_id', columns='subject_id', values='rating')
# for train_idx, test_idx in kf.split(coll):
#     train = coll.iloc[train_idx]
#     test = coll.iloc[test_idx]
#     user_similarity = cosine_similarity(user_subject_matrix.fillna(0))
#     for target_user_id in test['user_id'].unique():
#         predicted_ratings = []
#         for item_id in user_subject_matrix.columns:
#             target_user_ratings = user_subject_matrix.loc[target_user_id]
#             similar_users = user_similarity[target_user_id].argsort()[::-1][1:]

#             if pd.isnull(target_user_ratings[item_id]):
#                 numerator = sum(user_similarity[target_user_id][similar_user] * user_subject_matrix.loc[similar_user, item_id]
#                                 for similar_user in similar_users)
#                 denominator = sum(abs(user_similarity[target_user_id][similar_user])
#                                 for similar_user in similar_users)
#                 predicted_rating = numerator / denominator if denominator != 0 else 0
#                 predicted_ratings.append((item_id, predicted_rating))

#         actual_ratings = test[test['user_id'] == target_user_id].set_index('subject_id')['rating']
#         predicted_ratings_dict = dict(predicted_ratings)
#         rmse = mean_squared_error(actual_ratings, [predicted_ratings_dict.get(item_id, 0) for item_id in actual_ratings.index], squared=False)
#         rmse_scores.append(rmse)
    
# print(rmse_scores)
# print(np.mean(rmse_scores))

In [12]:
def concat_df(train, test):
    return pd.concat([train, test]).reset_index(drop=True)

train_test1 = concat_df(train, test1)
user_subject_matrix = train_test1.pivot(index='user_id', columns='subject_id', values='rating')
train_test1.shape, user_subject_matrix.shape

((2210083, 4), (34251, 6630))

In [13]:
user_similarity = cosine_similarity(user_subject_matrix.fillna(0))

In [16]:
# for performance, we use user_similarity (np.ndarray) instead of user_similarity_df (pd.DataFrame)

# user_similarity_df = pd.DataFrame(user_similarity, index=user_subject_matrix.index, columns=user_subject_matrix.index)

rmse_scores = []

for target_user_id in test1['user_id'].unique():
    predicted_ratings = []
    for item_id in user_subject_matrix.columns:
        target_user_ratings = user_subject_matrix.loc[target_user_id]
        # similar_users = user_similarity_df.loc[target_user_id].sort_values(ascending=False)[1:].index
        target_user_idx = user_subject_matrix.index.get_loc(target_user_id)
        similar_users = user_similarity[target_user_idx].argsort()[::-1][1:]

        if pd.isnull(target_user_ratings[item_id]):
            # numerator = sum(user_similarity_df.loc[target_user_id, similar_user] * user_subject_matrix.loc[similar_user, item_id]
                            # for similar_user in similar_users)
            numerator = sum(user_similarity[target_user_idx][similar_user] * user_subject_matrix.loc[similar_user, item_id]
                            for similar_user in similar_users)
            # denominator = sum(abs(user_similarity_df.loc[target_user_id, similar_user])
            #                 for similar_user in similar_users)
            denominator = sum(abs(user_similarity[target_user_idx][similar_user])
                            for similar_user in similar_users)
            predicted_rating = numerator / denominator if denominator != 0 else 0
            predicted_ratings.append((item_id, predicted_rating))

    actual_ratings = test1[test1['user_id'] == target_user_id].set_index('subject_id')['rating']
    predicted_ratings_dict = dict(predicted_ratings)
    rmse = mean_squared_error(actual_ratings, [predicted_ratings_dict.get(item_id, 0) for item_id in actual_ratings.index], squared=False)
    rmse_scores.append(rmse)

print(np.mean(rmse_scores))

AttributeError: 'numpy.ndarray' object has no attribute 'loc'