In [0]:
# データをダウンロード
!wget "http://files.grouplens.org/datasets/movielens/ml-100k.zip"

In [0]:
# データを解凍
!unzip ml-100k.zip

In [0]:
ls "ml-100k/"

In [0]:
import pandas as pd
import numpy as np
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings (e.g. from pandas) are
# hidden as well — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [0]:
# Load the tab-separated ratings file; it has no header row, so column names are supplied here.
d = pd.read_csv(
    "ml-100k/u.data",
    sep='\t',
    names=["user_id", "item_id", "rating", "timestamp"],
)

In [0]:
# Keep only the rows whose rating is strictly greater than 3
liked_mask = d["rating"] > 3
d = d[liked_mask]

In [0]:
# The timestamp column is not used afterwards; remove it in place.
d.drop(columns="timestamp", inplace=True)

In [0]:
# Remove any existing rows with user_id 100,000 (this id is reserved for the new customer)
d = d[d["user_id"] != 100000]

In [0]:
# Every remaining row counts as a "like": overwrite the rating with the flag 1.
d["rating"] = 1

In [0]:
# Show the first rows of d
d.head()

In [0]:
# Preferences of the new customer: 1 = likes the movie, 0 = does not like it.
# Each variable is paired, in order, with the item_id list used to build d1.
(movie_53, movie_71, movie_134, movie_213, movie_820) = (1, 1, 0, 1, 1)

In [0]:
# Build the new customer's ratings as a DataFrame (one row per movie, user_id fixed to 100000)
new_item_ids = [53, 71, 134, 213, 820]
new_ratings = [movie_53, movie_71, movie_134, movie_213, movie_820]
d1 = pd.DataFrame({
    "user_id": 100000,
    "item_id": new_item_ids,
    "rating": new_ratings,
})

In [0]:
# Display the new customer's rows
d1

In [0]:
# Stack d1 on top of d (row-wise concatenation)
d = pd.concat(objs=[d1, d], axis=0)

In [0]:
# Show the first 10 rows of d
d.head(10)

In [0]:
# Drop fully duplicated rows, keeping the first occurrence of each.
d = d.drop_duplicates()

In [0]:
# Reshape to a user x item matrix: one row per user_id, one column per item_id,
# summed rating as the cell value and 0 where a user has no row for an item.
d = d.pivot_table(
    values='rating',
    index='user_id',
    columns='item_id',
    aggfunc='sum',
    fill_value=0,
)

In [0]:
# Turn the user_id index back into an ordinary first column.
d = d.reset_index()

In [0]:
# Show the first 3 rows of d
d.head(3)

In [0]:
# Helper for comparing two rating vectors by the cosine of the angle between them
def cosine_distance(a, b):
    """Return the cosine similarity of two equal-length numeric vectors.

    Despite its name (kept so existing callers keep working), the value is a
    similarity, not a distance: it is dot(a, b) / (|a| * |b|), so a larger
    result means the vectors point in more similar directions.

    Args:
        a: 1-D sequence or array of numbers.
        b: 1-D sequence or array of numbers with the same length as ``a``.

    Returns:
        The cosine of the angle between ``a`` and ``b``. If either vector has
        zero norm the cosine is undefined; 0.0 is returned instead of NaN so
        such vectors never rank as similar and no division by zero occurs.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # All-zero vector on at least one side: avoid 0/0 -> NaN.
        return 0.0
    return np.dot(a, b) / norm_product

In [0]:
# Id of the new customer to find recommendations for
main_customer = 100000
# Select that customer's row(s) from the user x item matrix
is_main_customer = d["user_id"] == main_customer
search_user = d[is_main_customer]

In [0]:
# Rating vector of the main customer: first matching row, every column after user_id.
# It is recomputed from d and main_customer rather than from the previous value of
# search_user, so running this cell a second time does not fail on an ndarray.
search_user = d[d["user_id"]==main_customer].iloc[0,1:].values

In [0]:
# Display the main customer's rating vector
search_user

In [0]:
# All rows whose user_id is not main_customer
others = d[d["user_id"] != main_customer]

In [0]:
# Score every other user against the main customer: one cosine_distance value per
# row of `others`, computed on the columns after user_id.
distance = [
    cosine_distance(search_user, user_row)
    for user_row in others.iloc[:, 1:].values
]
# Label each score with the user_id it belongs to
distance1 = pd.Series(distance, index=others.user_id)

In [0]:
# The 20 users closest to main_customer: sort scores in descending order, take the first 20
ranked_scores = distance1.sort_values(ascending=False)
result = ranked_scores.iloc[:20]

In [0]:
# Display the selected users and their scores
result

In [0]:
# Collect the selected user_ids into a set
similar_users = {user_id for user_id in result.index}

In [0]:
# Reload the raw ratings (d was reshaped into a matrix earlier)
d = pd.read_csv(
    "ml-100k/u.data",
    sep='\t',
    names=["user_id", "item_id", "rating", "timestamp"],
)
# Keep only the rows whose rating is strictly greater than 3
liked_mask = d["rating"] > 3
d = d[liked_mask]
# Remove the timestamp column
d.drop(columns="timestamp", inplace=True)

In [0]:
# Keep only the rows that belong to the selected similar users
from_similar_user = d['user_id'].isin(similar_users)
d = d[from_similar_user]

In [0]:
# Exclude the movies the new customer already rated.
# The ids are taken from d1 (the new customer's ratings frame) instead of a second
# hard-coded list, so editing the movie list in one place cannot leave this filter stale.
d = d[~(d['item_id'].isin(d1['item_id']))]

In [0]:
# Show the first 3 rows of d
d.head(3)

In [0]:
# Check that 20 distinct users remain
d["user_id"].unique()

In [0]:
# Overwrite the rating with the flag 1 so that summing later counts rows.
d["rating"] = 1

In [0]:
# Drop fully duplicated rows, keeping the first occurrence of each.
d = d.drop_duplicates()

In [0]:
# user_id is no longer needed for the per-item totals; remove it in place.
d.drop(columns="user_id", inplace=True)

In [0]:
# Sum the rating flags per item_id (item_id becomes the index; groups keep first-seen order).
per_item = d.groupby(by=['item_id'], sort=False)
d = per_item.sum()

In [0]:
# Show the first rows of the per-item totals
d.head()

In [0]:
# Top 10 items by summed rating, highest first
ranked_items = d.sort_values(by="rating", ascending=False)
result = ranked_items.iloc[:10]

In [0]:
# Show up to 10 rows of result
result.head(10)

In [0]:
# item_ids of the 10 movies most often liked by the similar users
result.index.tolist()

In [0]:
# Put the selected item_ids into a frame (with a constant helper column "num")
# so they can be merged with the movie metadata.
top_item_ids = list(result.index)
d2 = pd.DataFrame({"item_id": top_item_ids, "num": 1})

In [0]:
# Show the first rows of d2
d2.head()

In [0]:
# Read four columns (positions 0, 1, 2 and 4) of the pipe-separated, header-less item file
d_item = pd.read_csv(
    "ml-100k/u.item",
    sep='|',
    encoding="Latin1",
    header=None,
    usecols=[0, 1, 2, 4],
)
d_item.columns = ["item_id", "item_name", "release_date", "url"]
# Attach the movie name to each selected item_id
item_names = d_item[["item_id", "item_name"]]
result = d2.merge(item_names, how='inner', on='item_id')

In [0]:
# Show the first rows of the merged result
result.head()

In [0]:
# Keep only the movie name: remove the id and helper columns in place.
result.drop(columns=["item_id", "num"], inplace=True)

In [0]:
# Recommended movie names as a plain list
result["item_name"].tolist()