In [0]:
# Download the MovieLens 100k archive into the current working directory.
!wget "http://files.grouplens.org/datasets/movielens/ml-100k.zip"

In [0]:
# Extract the downloaded archive (the following cells read files under ml-100k/).
!unzip ml-100k.zip

In [0]:
ls "ml-100k/"

In [0]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [0]:
# Load the tab-separated ratings file; it has no header row, so the four
# column names are supplied explicitly.
d = pd.read_csv(
    "ml-100k/u.data",
    sep='\t',
    names="user_id,item_id,rating,timestamp".split(","),
)

In [0]:
d.head()

In [0]:
# Keep only the rows whose rating is strictly greater than 3.
# .copy() makes the filtered result an independent frame, so subsequent
# modifications of d never act on a slice of the unfiltered data
# (the pattern pandas reports as SettingWithCopyWarning).
d = d[d["rating"] > 3].copy()

In [0]:
d.head()

In [0]:
d.rating.unique()

In [0]:
# The timestamp column is not used below; continue without it.
d = d.drop(columns=["timestamp"])

In [0]:
d.head()

In [0]:
# Overwrite every remaining rating with the constant 1.
d = d.assign(rating=1)

In [0]:
d.head()

In [0]:
d.rating.unique()

In [0]:
d.shape

In [0]:
# Remove fully duplicated rows, keeping the first occurrence of each.
d = d.drop_duplicates(keep='first')

In [0]:
d.shape

In [0]:
d.head()

In [0]:
# Reshape to an item-by-user table: one row per item_id, one column per
# user_id, each cell the summed rating, with missing pairs filled with 0.
d = d.pivot_table(
    values='rating',
    index='item_id',
    columns='user_id',
    aggfunc='sum',
    fill_value=0,
)

In [0]:
d.head()

In [0]:
# Move the item_id index into an ordinary first column.
d = d.reset_index()

In [0]:
d.head()

In [0]:
# Helper that scores how alike two vectors are.
def cosine_distance(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Despite its name, the value is a similarity, not a distance: it is the
    dot product divided by the product of the Euclidean norms, so larger
    values mean the vectors point in more similar directions.

    Parameters
    ----------
    a, b : array-like
        One-dimensional numeric vectors of equal length.

    Returns
    -------
    float
        ``dot(a, b) / (||a|| * ||b||)``, or ``0.0`` when either vector has
        zero norm (the ratio is undefined there and would otherwise be NaN
        accompanied by a RuntimeWarning).
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # An all-zero vector has no direction; treat it as "not similar".
        return 0.0
    return np.dot(a, b) / denominator

In [0]:
# Choose the movie whose look-alikes should be listed (item_id 53 here)
# and pull its row out of the item-by-user table.
wanted_item = 53
search_item = d.loc[d["item_id"] == wanted_item]

In [0]:
search_item

In [0]:
# Take the selected movie's row as a NumPy vector, skipping the leading
# item_id column. The row is looked up from d rather than from the previous
# value of search_item, so re-running this cell still works after
# search_item has already been replaced by an array.
search_item = d[d["item_id"] == wanted_item].iloc[0, 1:].values

In [0]:
search_item

In [0]:
d.shape

In [0]:
# Keep every row whose item_id differs from wanted_item.
others = d[d["item_id"] != wanted_item]

In [0]:
others.shape

In [0]:
others.head()

In [0]:
# Score every other movie against the selected one (wanted_item).
# Drop the leading item_id column so each row is one movie's vector.
other_vectors = others.iloc[:, 1:].values
# One cosine_distance value per row of `others`, in row order.
distance = [
    cosine_distance(search_item, row_vector) for row_vector in other_vectors
]
# Label each score with the item_id of the movie it belongs to.
distance1 = pd.Series(distance, index=others.item_id)

In [0]:
# Result: one score per remaining movie, indexed by item_id.
distance1

In [0]:
# Sort the scores from highest to lowest and keep the first ten entries,
# i.e. the ten movies scored closest to the selected one.
result = distance1.sort_values(ascending=False).iloc[:10]

In [0]:
result

In [0]:
# Turn the top-10 Series into a two-column frame:
# item_id (taken from the index) and similarity (the scores).
result2 = pd.DataFrame(
    {
        "item_id": result.index.tolist(),
        "similarity": result.values,
    }
)

In [0]:
result2

In [0]:
# Read the pipe-separated item file (no header row, Latin-1 text),
# loading only the columns at positions 0, 1, 2 and 4.
d_item = pd.read_csv(
    "ml-100k/u.item",
    sep='|',
    encoding="Latin1",
    header=None,
    usecols=[0, 1, 2, 4],
)

In [0]:
d_item.head()

In [0]:
# Replace the positional column labels with descriptive names.
d_item.columns = [
    "item_id",
    "item_name",
    "release_date",
    "url",
]

In [0]:
d_item.head()

In [0]:
# Only item_id and item_name are needed from here on.
d_item = d_item.drop(columns=["release_date", "url"])

In [0]:
d_item.head()

In [0]:
# Attach the movie title to each of the top-scoring item_ids.
result3 = result2.merge(d_item, how='inner', on='item_id')

In [0]:
result3.head()

In [0]:
# Finally, show the movie titles as a plain list, in the frame's row order.
result3["item_name"].tolist()