# Movie Recommendation (Movielens)
- 유저가 영화에 대해 평점을 매긴 데이터 (MovieLens 1M Dataset)
```
wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
mv ml-1m.zip ~/aiffel/recommendata_iu/data
unzip ml-1m.zip
```
- 방법 : 별점을 시청횟수로 해석해서 생각하겠습니다. 또한 유저가 3점 미만으로 준 데이터는 선호하지 않는다고 가정하고 제외하겠습니다.

## 데이터 준비

In [156]:
import pandas as pd
import numpy as np
import os
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python')
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [157]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [6]:
# rating 컬럼의 이름을 count로 바꿉니다.
#ratings.rename(columns={'rating':'count'}, inplace=True)

In [236]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## 데이터 탐색

In [8]:
# Ratings에 있는 유니크한 영화 개수
ratings['movie_id'].nunique()

3628

In [9]:
# Ratings에 있는 유니크한 사용자 수
ratings['user_id'].nunique()

6039

In [158]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [43]:
pd.merge([ratings, movies], join='outer', axis=1, join_axes='movie_id

TypeError: concat() got an unexpected keyword argument 'join_axes'

In [56]:
# 가장 인기있는 영화 30개 (인기순)
movie_popular = ratings.groupby('movie_id')['user_id'].count()
movie_popular

movie_id
1       2000
2        551
3        339
4        102
5        214
        ... 
3948     752
3949     280
3950      47
3951      36
3952     353
Name: user_id, Length: 3628, dtype: int64

In [159]:
ratings.tail()

Unnamed: 0,user_id,movie_id,rating,timestamp
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648
1000208,6040,1097,4,956715569


## 내가 선호하는 영화를 5가지 골라서 rating에 추가하기

In [160]:
movies[movies['genre']=='Animation']
movies[movies['genre']=='Comedy']
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [176]:
# 내가 좋아하는 영화. 단, 이름은 꼭 데이터셋에 있는 것과 동일하게 맞춰주세요. 
my_favorite = [0, 720, 2120, 3924, 248]

# 맨 마지막보다 하나 많은 user_id가 위 영화를 5점씩 rating했다고 가정하겠습니다.
my_playlist = pd.DataFrame({'user_id': [6041]*5, 'movie_id': my_favorite, 'rating':[5]*5, 'timestamp':[0]*5 })

if not ratings.isin({'user_id':['6041']})['user_id'].any():  # user_id에 데이터가 없다면
    ratings = ratings.append(my_playlist)                           # 위에 임의로 만든 my_favorite 데이터를 추가해 줍니다. 

ratings.tail(10)       # 잘 추가되었는지 확인해 봅시다.

Unnamed: 0,user_id,movie_id,rating,timestamp
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648
1000208,6040,1097,4,956715569
0,6041,0,5,0
1,6041,720,5,0
2,6041,2120,5,0
3,6041,3924,5,0
4,6041,248,5,0


## CSR matrix 만들기

In [193]:
from scipy.sparse import csr_matrix

num_user = ratings['user_id'].max()
num_movie = ratings['movie_id'].max()



print(num_user)
print(num_movie)
print(ratings.rating.shape)
print(ratings.movie_id.shape)
print(ratings.user_id.shape)

csr_data = csr_matrix((ratings.rating, (ratings.user_id, ratings.movie_id)), shape= (num_user+1, num_movie+1))
csr_data

6041
3952
(836483,)
(836483,)
(836483,)


<6042x3953 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

## 모델 훈련하기

In [196]:
from implicit.als import AlternatingLeastSquares

# implicit 라이브러리에서 권장하고 있는 부분입니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [223]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=10000, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [221]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<3953x6042 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [224]:
als_model.fit(csr_data_transpose)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [225]:
me = 6041
toy_story = 0
me_vector, toy_story_vector = als_model.user_factors[me], als_model.item_factors[0]


In [226]:
me_vector

array([ 0.10877027, -0.14634965, -0.0753423 , ...,  0.00763978,
       -0.17173173, -0.07484483], dtype=float32)

In [227]:
toy_story_vector

array([0.0093808 , 0.00074432, 0.00059208, ..., 0.00222996, 0.00390381,
       0.00184442], dtype=float32)

In [228]:
np.dot(me_vector, toy_story_vector)

0.9688995

In [None]:
## 나의 선호도 파악하기

In [237]:
movies.head()


Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [242]:
movies[movies['movie_id']==211].title

209    Browning Version, The (1994)
Name: title, dtype: object

In [243]:
BrowningVersion = 211
BrowningVersion_vector = als_model.item_factors[BrowningVersion]
np.dot(me_vector, BrowningVersion_vector)

-0.0018900707

In [None]:
## 내가 좋아하는 영화와 비슷한 영화 추천받기

In [248]:
movie_unique = movies["title"].unique()
movie_to_idx = {v:k for k, v in enumerate(movie_unique)}
favorite_movie = 'Jumanji (1995)' # toy story
favorite_movie_id = movie_to_idx[favorite_movie]
similar_movie = als_model.similar_items(favorite_movie_id, N=15)
similar_movie

[(1, 0.6186715),
 (1543, 0.43756187),
 (3043, 0.43733948),
 (1470, 0.43722677),
 (3205, 0.43709266),
 (3280, 0.43708375),
 (1335, 0.43680143),
 (3381, 0.43675593),
 (607, 0.43672532),
 (1360, 0.43667927),
 (3482, 0.4366577),
 (3530, 0.43659982),
 (3647, 0.43655202),
 (868, 0.43654305),
 (2223, 0.43651214)]

In [251]:
idx_to_movie = {v:k for k,v in movie_to_idx.items()}

In [252]:
def get_similar_movie(movie_name: str):
    movie_id = movie_to_idx[movie_name]
    similar_movie = als_model.similar_items(movie_id)
    similar_movie = [idx_to_movie[i[0]] for i in similar_movie]
    return similar_movie

In [253]:
get_similar_movie('Jumanji (1995)')

['Jumanji (1995)',
 'Contact (1997)',
 "'Night Mother (1986)",
 'Kissed (1996)',
 'Single White Female (1992)',
 'Perils of Pauline, The (1947)',
 'Star Trek: First Contact (1996)',
 'Grumpy Old Men (1993)',
 'Hellraiser: Bloodline (1996)',
 'Grease 2 (1982)']

## 내가 가장 좋아할 만한 영화 추천 받기

In [254]:
user = 6041
# recommend에서는 user*item CSR Matrix를 받습니다.
movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

[(1148, 0.008480709),
 (891, 0.0077360123),
 (3245, 0.0074620657),
 (3804, 0.0070732273),
 (398, 0.006861901),
 (742, 0.006823357),
 (1782, 0.0066647716),
 (2461, 0.006594857),
 (1983, 0.0065675303),
 (1640, 0.006509304),
 (2895, 0.0064921305),
 (670, 0.0064772414),
 (806, 0.0061814394),
 (3542, 0.0061563626),
 (3519, 0.006036306),
 (3853, 0.006011989),
 (3054, 0.0059974696),
 (130, 0.005876746),
 (820, 0.0058342637),
 (3447, 0.00579297)]

In [255]:
[idx_to_movie[i[0]] for i in movie_recommended]

['Two or Three Things I Know About Her (1966)',
 'Vertigo (1958)',
 'Big Trees, The (1952)',
 'Couch in New York, A (1996)',
 'Open Season (1996)',
 'Vermont Is For Lovers (1992)',
 'Leather Jacket Love Story (1997)',
 'Beneath the Planet of the Apes (1970)',
 'Hocus Pocus (1993)',
 'Red Corner (1997)',
 'Julien Donkey-Boy (1999)',
 'They Bite (1996)',
 'Two Deaths (1995)',
 'Saludos Amigos (1943)',
 'King of Marvin Gardens, The (1972)',
 'Return of the Fly (1959)',
 'Spring Fever USA (a.k.a. Lauderdale) (1989)',
 'Jade (1995)',
 'Stonewall (1995)',
 'Bell, Book and Candle (1958)']