In [1]:
import pandas as pd

In [2]:
# Column names are passed explicitly, so the first line of movies.dat is read as data.
m_cols = ['movie_id', 'title', 'genre']
# '::' is a multi-character separator, which is why the Python parsing engine is requested.
movies = pd.read_csv(
    '../data/ml-10M100K/movies.dat',
    names=m_cols,
    sep='::',
    encoding='latin-1',
    engine='python',
)

In [3]:
# Replace each pipe-delimited genre string with a list of genre names.
movies['genre'] = movies['genre'].apply(
    lambda genre_text: genre_text.split('|')
)
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [4]:
# Column names are passed explicitly, so the first line of tags.dat is read as data.
t_cols = ['user_id', 'movie_id', 'tag', 'timestamp']
user_tagged_movies = pd.read_csv(
    '../data/ml-10M100K/tags.dat',
    names=t_cols,
    sep='::',
    engine='python',
)
# Lower-case the tags so differently capitalised spellings of a tag compare equal.
user_tagged_movies['tag'] = user_tagged_movies['tag'].str.lower()
user_tagged_movies.head()

Unnamed: 0,user_id,movie_id,tag,timestamp
0,15,4973,excellent!,1215184630
1,20,1747,politics,1188263867
2,20,1747,satire,1188263867
3,20,2424,chick flick 212,1188263835
4,20,2424,hanks,1188263835


In [16]:
# Summary of the tag table: distinct tags, total tag rows, distinct tagged movies.
print(f'tag種類={len(user_tagged_movies["tag"].unique())}')
print(f'タグレコード数={len(user_tagged_movies)}')
print(f'タグが付いている映画数={len(user_tagged_movies["movie_id"].unique())}')

tag種類=15241
タグレコード数=95580
タグが付いている映画数=7601


In [18]:
# Collect every tag given to a movie into one list per movie_id.
movie_tags = (
    user_tagged_movies
    .groupby('movie_id')
    .agg({'tag': list})
)
# Left join keeps movies that were never tagged; their `tag` value is missing.
# This rebinds `movies`, so the cell is meant to be run once per kernel session.
movies = movies.merge(
    movie_tags,
    on='movie_id',
    how='left',
)
movies.head()

Unnamed: 0,movie_id,title,genre,tag
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[pixar, pixar, pixar, animation, pixar, animat..."
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]","[for children, game, animals, joe johnston, ro..."
2,3,Grumpier Old Men (1995),"[Comedy, Romance]","[funniest movies, comedinha de velhinhos engra..."
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",[girl movie]
4,5,Father of the Bride Part II (1995),[Comedy],"[steve martin, pregnancy, remake, steve martin..."


In [20]:
# Column names are passed explicitly, so the first line of ratings.dat is read as data.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    '../data/ml-10M100K/ratings.dat',
    names=r_cols,
    sep='::',
    engine='python',
)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,122,5.0,838985046
1,1,185,5.0,838983525
2,1,231,5.0,838983392
3,1,292,5.0,838983421
4,1,316,5.0,838983392


In [21]:
# Keep only the ratings of the 1000 smallest user ids.
valid_user_ids = sorted(ratings['user_id'].unique())[:1000]
ratings = ratings.loc[ratings['user_id'].isin(valid_user_ids)]

In [22]:
# Attach title, genre and tag columns to every rating row (default inner join on movie_id).
movielens = ratings.merge(
    movies,
    on='movie_id',
)
movielens.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag
0,1,122,5.0,838985046,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem..."
1,139,122,3.0,974302621,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem..."
2,149,122,2.5,1112342322,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem..."
3,182,122,3.0,943458784,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem..."
4,215,122,4.5,1102493547,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem..."


In [23]:
import numpy as np

In [24]:
# Number of ratings per user, then min / max / mean of that count and the number of users.
# String aliases replace the builtins `min`/`max` and `np.mean`: recent pandas versions
# warn when those callables are passed to `agg`. Row labels and values are unchanged;
# `len` has no string alias with the same label, so it stays a callable.
(
    movielens
    .groupby('user_id')
    .agg({'movie_id': len})
    .agg({'movie_id': ['min', 'max', 'mean', len]})
)

Unnamed: 0,movie_id
min,20.0
max,1668.0
mean,132.83
len,1000.0


In [25]:
# Number of ratings per movie, then min / max / mean of that count and the number of movies.
# String aliases replace the builtins `min`/`max` and `np.mean`: recent pandas versions
# warn when those callables are passed to `agg`. Row labels and values are unchanged;
# `len` has no string alias with the same label, so it stays a callable.
(
    movielens
    .groupby('movie_id')
    .agg({'user_id': len})
    .agg({'user_id': ['min', 'max', 'mean', len]})
)

Unnamed: 0,user_id
min,1.0
max,496.0
mean,19.719418
len,6736.0


In [26]:
# Total number of rating rows, followed by the number of rows per rating value.
print(f'values={len(movielens)}')
(
    movielens
    .groupby('rating')
    .agg({'movie_id': len})
)

values=132830


Unnamed: 0_level_0,movie_id
rating,Unnamed: 1_level_1
0.5,851
1.0,4847
1.5,1247
2.0,10292
2.5,3729
3.0,31706
3.5,9661
4.0,39917
4.5,6949
5.0,23631


In [27]:
# Rank each user's ratings from newest (rank 1) to oldest; method='first' breaks
# timestamp ties by row order so every rank within a user is distinct.
movielens['timestamp_rank'] = (
    movielens
    .groupby('user_id')['timestamp']
    .rank(ascending=False, method='first')
)
# The 5 most recent ratings of every user are the test set; the rest is training data.
movielens_train = movielens.loc[movielens['timestamp_rank'] > 5]
movielens_test = movielens.loc[movielens['timestamp_rank'] <= 5]

In [31]:
from typing import List
from typing import Dict
from sklearn.metrics import mean_squared_error

In [29]:
def calc_rmse(true_rating: List[float], pred_rating: List[float]) -> float:
    """Return the root-mean-squared error between true and predicted ratings.

    This is a module-level function, so it takes no ``self``; the leftover
    ``self`` parameter made the two-argument call fail with a TypeError.

    Args:
        true_rating: Observed rating values.
        pred_rating: Predicted rating values, aligned position by position
            with ``true_rating``.

    Returns:
        The square root of scikit-learn's ``mean_squared_error`` for the two
        sequences.
    """
    return np.sqrt(mean_squared_error(true_rating, pred_rating))

In [32]:
def calc_recal_at_k(
        true_user2items: Dict[int, List[int]],
        pred_user2items: Dict[int, List[int]],
        k: int
    ) -> float:
    """Return recall@k averaged over the users in ``true_user2items``.

    The per-user recall is computed inside this function, so it does not rely
    on any separately defined helper being present in the kernel.

    Args:
        true_user2items: Maps each user id to the items that count as correct.
        pred_user2items: Maps each user id to the ranked recommended items.
            Every user in ``true_user2items`` must be present (KeyError otherwise).
        k: Number of top recommendations taken into account.

    Returns:
        The mean of the per-user recall values. A user with no correct items,
        or ``k == 0``, contributes 0.0.
    """
    def recall_for_user(true_items: List[int], pred_items: List[int]) -> float:
        # Recall is undefined without correct items; score it as 0.0.
        if len(true_items) == 0 or k == 0:
            return 0.0
        hits = set(true_items) & set(pred_items[:k])
        return len(hits) / len(true_items)

    scores = [
        recall_for_user(true_items, pred_user2items[user_id])
        for user_id, true_items in true_user2items.items()
    ]
    return np.mean(scores)

In [33]:
def _recall_at_k(self,true_items:List[int],pred_items:List[int],k:int)->float:
    if len(true_items)==0 or k==0:
        return 0.0
    r_at_k = (len(set(true_items) & set(pred_items[:k])))/len(true_items)
    return r_at_k

In [34]:
def calc_precision_at_k(
        true_user2items: Dict[int, List[int]],
        pred_user2items: Dict[int, List[int]],
        k: int
    ) -> float:
    """Return precision@k averaged over the users in ``true_user2items``.

    Args:
        true_user2items: Maps each user id to the items that count as correct.
        pred_user2items: Maps each user id to the ranked recommended items.
            Every user in ``true_user2items`` must be present (KeyError otherwise).
        k: Number of top recommendations taken into account.

    Returns:
        The mean of the per-user values returned by ``_precision_at_k``.
    """
    per_user_precision = [
        _precision_at_k(relevant_items, pred_user2items[uid], k)
        for uid, relevant_items in true_user2items.items()
    ]
    return np.mean(per_user_precision)

In [36]:
def _precision_at_k(true_items:List[int],pred_items:List[int],k:int)->float:
    if k==0:
        return 0.0
    p_at_k = (len(set(true_items) & set(pred_items[:k])))/k
    return p_at_k

In [38]:
import pandas as pd
import os
from util.models import Dataset


class DataLoader:
    """Read the MovieLens files and build a train/test ``Dataset``.

    Args:
        num_users: How many users to keep; the users with the smallest ids are used.
        num_test_items: How many of each user's most recent ratings are held out as test data.
        data_path: Directory that contains ``movies.dat``, ``tags.dat`` and ``ratings.dat``.
    """

    def __init__(self, num_users: int = 1000, num_test_items: int = 5, data_path: str = "../data/ml-10M100K/"):
        self.num_users = num_users
        self.num_test_items = num_test_items
        self.data_path = data_path

    def load(self) -> "Dataset":
        """Load the data, split it and package it as a ``Dataset``.

        Returns:
            A ``Dataset`` built from the training ratings, the test ratings, a
            ``{user_id: [movie_id, ...]}`` mapping of the test movies each user
            rated 4 or higher, and the movie metadata frame.
        """
        ratings, movie_content = self._load()
        movielens_train, movielens_test = self._split_data(ratings)
        # For ranking metrics, only test movies rated 4 or higher count as correct answers.
        # Users without such a rating in their test rows do not appear in the mapping.
        movielens_test_user2items = (
            movielens_test[movielens_test.rating >= 4].groupby("user_id").agg({"movie_id": list})["movie_id"].to_dict()
        )
        return Dataset(movielens_train, movielens_test, movielens_test_user2items, movie_content)

    def _split_data(self, movielens: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame]":
        """Split ratings into training and test rows by recency.

        Each user's ``num_test_items`` most recent ratings become test rows and
        everything older becomes training rows. The input frame is left
        untouched; both returned frames carry an extra ``rating_order`` column.

        Args:
            movielens: Ratings with at least ``user_id`` and ``timestamp`` columns.

        Returns:
            A ``(train, test)`` pair of DataFrames.
        """
        # Rank every user's ratings from newest to oldest. Ranks start at 1, and
        # method="first" breaks timestamp ties by row order so ranks are distinct.
        rating_order = movielens.groupby("user_id")["timestamp"].rank(ascending=False, method="first")
        # assign() returns a new frame, so the caller's DataFrame does not gain a column.
        ranked = movielens.assign(rating_order=rating_order)
        movielens_train = ranked[ranked["rating_order"] > self.num_test_items]
        movielens_test = ranked[ranked["rating_order"] <= self.num_test_items]
        return movielens_train, movielens_test

    def _load(self) -> "tuple[pd.DataFrame, pd.DataFrame]":
        """Read the three data files and join them.

        Returns:
            A ``(ratings, movies)`` pair: the ratings of the selected users joined
            with the movie metadata, and the movie metadata itself (movie_id,
            title, genre list and tag list).
        """
        # Movie metadata: movie_id, title and genre.
        m_cols = ["movie_id", "title", "genre"]
        movies = pd.read_csv(
            os.path.join(self.data_path, "movies.dat"), names=m_cols, sep="::", encoding="latin-1", engine="python"
        )
        # Store the pipe-delimited genres as a list.
        movies["genre"] = movies.genre.apply(lambda x: x.split("|"))

        # Tags that users attached to movies.
        t_cols = ["user_id", "movie_id", "tag", "timestamp"]
        user_tagged_movies = pd.read_csv(
            os.path.join(self.data_path, "tags.dat"), names=t_cols, sep="::", engine="python"
        )
        # Lower-case the tags, then collect them into one list per movie.
        user_tagged_movies["tag"] = user_tagged_movies["tag"].str.lower()
        movie_tags = user_tagged_movies.groupby("movie_id").agg({"tag": list})

        # Attach the tag lists; movies without tags keep a missing value.
        movies = movies.merge(movie_tags, on="movie_id", how="left")

        # Rating data.
        r_cols = ["user_id", "movie_id", "rating", "timestamp"]
        ratings = pd.read_csv(os.path.join(self.data_path, "ratings.dat"), names=r_cols, sep="::", engine="python")

        # Restrict to the num_users smallest user ids. isin() selects the same rows as
        # comparing against the largest kept id, and still works when nothing is
        # selected (max() of an empty list raises ValueError).
        valid_user_ids = sorted(ratings.user_id.unique())[: self.num_users]
        ratings = ratings[ratings.user_id.isin(valid_user_ids)]

        # Join the ratings with the movie metadata.
        movielens_ratings = ratings.merge(movies, on="movie_id")

        return movielens_ratings, movies

## 2. 各種アルゴリズムの実装

In [42]:
import dataclasses
@dataclasses.dataclass(frozen=True)
class Dataset:
    """Frozen container for the data handed to a recommender.

    Fields cannot be reassigned after construction because the dataclass is
    declared with ``frozen=True``.
    """

    # Rating rows used for training.
    train: pd.DataFrame
    # Rating rows held out for evaluation.
    test: pd.DataFrame
    # Per-user item ids of the test set, keyed by user id.
    # presumably the items that count as correct for ranking metrics; verify against the loader
    test_user2items: Dict[int, List[int]]
    # Item content frame.
    # presumably the movie metadata (title, genre, tags); verify against the loader
    item_content: pd.DataFrame

In [48]:
@dataclasses.dataclass(frozen=True)
class RecommendResult:
    """Frozen container for the output of a recommender.

    Fields cannot be reassigned after construction because the dataclass is
    declared with ``frozen=True``.
    """

    # Rating output of the recommender.
    # presumably predicted ratings for the test rows; verify against the recommenders
    rating: pd.DataFrame
    # Recommended item ids per user, keyed by user id.
    user2items: Dict[int, List[int]]

In [49]:
from abc import ABC,abstractmethod

In [50]:
class BaseRecommender(ABC):
    """Abstract base class for recommenders.

    Subclasses implement ``recommend``; ``run_sample`` runs one load /
    recommend / evaluate pass with fixed settings and prints the metrics.
    """

    @abstractmethod
    def recommend(self, dataset: "Dataset", **kwargs) -> "RecommendResult":
        """Compute recommendations for ``dataset``.

        Args:
            dataset: The train/test data to work on.
            **kwargs: Algorithm-specific options.

        Returns:
            The recommender's output as a ``RecommendResult``.
        """
        pass

    def run_sample(self) -> None:
        """Load the sample data, run ``recommend`` and print the evaluation metrics."""
        # The keyword is `num_users`, matching DataLoader.__init__.
        movielens = DataLoader(num_users=1000, num_test_items=5, data_path="../data/ml-10M100K/").load()
        # Bind the result to the same name that is read below.
        recommend_result = self.recommend(movielens)
        # NOTE(review): `MetrixCalculator` is not defined in the visible part of this
        # notebook; confirm the class name and the argument list of `calc` against
        # its definition (no predicted ratings are passed here).
        metrics = MetrixCalculator().calc(
            movielens.test.rating.tolist(),
            movielens.test_user2items,
            recommend_result.user2items,
            k=10,
        )
        print(metrics)