In [12]:
import pandas as pd
from pandasql import sqldf
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from tqdm import tqdm
sns.set()

In [13]:
PATH = 'ml-latest-small'
# tags_df = pd.read_csv(f'{PATH}/tags.csv')
# movie_info_df = pd.read_csv(f'{PATH}/movies.csv')

In [14]:
ratings_df = pd.read_csv(f'{PATH}/ratings.csv', dtype={'userId':'int', 'movieId':'int'})
ratings_df['date'] = pd.to_datetime(ratings_df['timestamp'],unit='s').dt.strftime('%Y-%m-%d')

In [15]:
ratings_df.userId.unique().size

610

In [16]:
ratings_df_train = ratings_df[ratings_df.date<'2017-01-01']
ratings_df_test = ratings_df[ratings_df.date>='2017-01-01']

ratings_df_train.shape, ratings_df_test.shape

((86220, 5), (14616, 5))

In [17]:
user_item_dict = defaultdict(dict)
item_user_dict = defaultdict(dict)
for row in tqdm(ratings_df_train.iterrows()):
    rating = row[1]['rating']
    item_id = int(row[1]['movieId'])
    user_id = int(row[1]['userId'])
    user_item_dict[user_id][item_id] = rating
    item_user_dict[item_id][user_id] = rating

86220it [00:06, 12592.23it/s]


In [18]:
def similarity(user_item_dict, item_a_id, item_b_id):
    user_set_a = set(item_user_dict[item_a_id])
    user_set_b = set(item_user_dict[item_b_id])
    user_set_a_b = user_set_a & user_set_b
    if not user_set_a_b:
        return 0
    
    len_a = sum(map(lambda x:x**2,item_user_dict[item_a_id].values()))**0.5
    len_b = sum(map(lambda x:x**2,item_user_dict[item_b_id].values()))**0.5
    a_dot_b = sum([item_user_dict[item_a_id][key]*item_user_dict[item_b_id][key] \
            for key in user_set_a_b])
    return  round(a_dot_b/(len_a * len_b), 2)

In [19]:
def get_i2i_matrix(ratings_df_train):
    item_item_dict = {item:list() for item in ratings_df_train.movieId.unique()}
    for item_a_id in tqdm(item_item_dict):
        for item_b_id in item_item_dict:
            distance = similarity(user_item_dict, item_a_id, item_b_id)
            if distance!=0:
                item_item_dict[item_a_id].append((item_b_id, distance))
        item_item_dict[item_a_id].sort(key=lambda x:x[1], reverse=True)
    return item_item_dict

In [21]:
item_item_dict = get_i2i_matrix(ratings_df_train)

100%|██████████| 8283/8283 [08:50<00:00, 15.61it/s]  


In [None]:
len(item_item_dict)

# Recall

In [390]:
len(item_item_dict)

8283

In [403]:
def get_user_valued_item(ratings_df_train, ratings_df_test):
    user_viewd_dict = dict()
    test_user_id_list = ratings_df_test.userId.unique()
    for user_id in test_user_id_list:
        user_log = ratings_df_train[ratings_df_train.userId == user_id]
        user_viewd = user_log.movieId.unique()
        if len(user_viewd)!=0:
            user_viewd_dict[user_id] = user_viewd
    return user_viewd_dict

In [406]:
user_viewd_dict = get_user_valued_item(ratings_df_train, ratings_df_test)

In [408]:
print(user_viewd_dict.keys())
print('user_viewd_dict size: ', len(user_viewd_dict.keys()))

dict_keys([15, 18, 21, 68, 103, 105, 106, 112, 119, 125, 210, 233, 249, 282, 292, 305, 318, 339, 341, 408, 414, 443, 448, 462, 511, 534, 601, 610])
user_viewd_dict size:  28


In [490]:
user_recall_dict = dict()
for key in tqdm(user_viewd_dict):
    recall_set = set()
    for item_id in user_viewd_dict[key]:
        recall_set.update((item_item_dict[item_id]))
    user_recall_dict[key] = recall_set

100%|██████████| 28/28 [00:25<00:00,  1.09it/s]


In [491]:
TOP_K = 100
recall_dict = {}
for uid in user_viewd_dict.keys():
    recall = pd.DataFrame(user_recall_dict[uid], columns = ['item_id','cosine']).groupby('item_id').sum(). \
        sort_values(by=['cosine'], ascending = False).head(TOP_K).index
    recall_dict[uid] = set(recall)

# Evaluate

In [492]:
# 估计Top 100 Recall

In [495]:
max_rate = []
for uid in recall_dict:
    df = ratings_df_test[ratings_df_test.userId ==uid]
    recall_item_id_set = recall_dict[uid] & set(df.movieId)
    rating_ls = df[df.movieId.isin(recall_item_id_set)].rating
    if len(rating_ls)!=0:
        max_rate.append((max(rating_ls)))
    else:
        max_rate.append(0)
    ls.append(len(recall_item_id_set))

sum([i!=0 for i in max_rate])/len(max_rate)

0.39285714285714285