# 영화 추천 시스템 만들기

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python')
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [2]:
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [3]:
ratings.rename(columns={'rating':'num'}, inplace=True)

In [4]:
ratings.num.unique()

array([5, 3, 4])

In [5]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
movies['title'].head(10)

0                      Toy Story (1995)
1                        Jumanji (1995)
2               Grumpier Old Men (1995)
3              Waiting to Exhale (1995)
4    Father of the Bride Part II (1995)
5                           Heat (1995)
6                        Sabrina (1995)
7                   Tom and Huck (1995)
8                   Sudden Death (1995)
9                      GoldenEye (1995)
Name: title, dtype: object

In [8]:
## 좋아하는 영화 : Blade, GoldenEye, Die Hard, Raiders of the Lost Ark, Matrix

In [9]:
my_favorite = ['Blade (1998)', 'GoldenEye (1995)', 'Die Hard (1988)', 'Raiders of the Lost Ark (1981)', 'Matrix, The (1999)']

In [10]:
my_favorite_movie_id = []
for a in my_favorite:
    ans = movies[movies['title'] == a]['movie_id']
    my_favorite_movie_id.append(ans)
    print(ans)

2098    2167
Name: movie_id, dtype: int64
9    10
Name: movie_id, dtype: int64
1023    1036
Name: movie_id, dtype: int64
1180    1198
Name: movie_id, dtype: int64
2502    2571
Name: movie_id, dtype: int64


In [11]:
my_favorite_movie_id = [2167, 10, 1036, 1198, 2571]

In [12]:
my_playlist = pd.DataFrame({'user_id': [6041]*5, 'num': 5, 'movie_id' : my_favorite_movie_id})
my_playlist

Unnamed: 0,user_id,num,movie_id
0,6041,5,2167
1,6041,5,10
2,6041,5,1036
3,6041,5,1198
4,6041,5,2571


In [13]:
matrix = pd.concat([ratings, my_playlist])
matrix.tail(10)
del matrix['timestamp']

In [14]:
# 고유한 유저, 아티스트를 찾아내는 코드
user_unique = matrix['user_id'].unique()
movie_unique = matrix['movie_id'].unique()

# 유저, 아티스트 indexing 하는 코드 idx는 index의 약자입니다.
user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

In [15]:
temp_user_data = matrix['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(matrix):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    matrix['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

# artist_to_idx을 통해 artist 컬럼도 동일한 방식으로 인덱싱해 줍니다. 
temp_movie_data = matrix['movie_id'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(matrix):
    print('movie column indexing OK!!')
    matrix['movie_id'] = temp_movie_data
else:
    print('artist column indexing Fail!!')

matrix

user_id column indexing OK!!
movie column indexing OK!!


Unnamed: 0,user_id,movie_id,num
0,0,0,5
1,0,1,3
2,0,2,3
3,0,3,4
4,0,4,5
...,...,...,...
0,6039,174,5
1,6039,766,5
2,6039,194,5
3,6039,120,5


In [16]:
from scipy.sparse import csr_matrix

num_user = matrix['user_id'].nunique()
num_movie = matrix['movie_id'].nunique()

csr_data = csr_matrix((matrix.num, (matrix.user_id, matrix.movie_id)), shape= (num_user, num_movie))
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

In [17]:
matrix.count

<bound method DataFrame.count of     user_id  movie_id  num
0         0         0    5
1         0         1    3
2         0         2    3
3         0         3    4
4         0         4    5
..      ...       ...  ...
0      6039       174    5
1      6039       766    5
2      6039       194    5
3      6039       120    5
4      6039       124    5

[836483 rows x 3 columns]>

In [18]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 836478 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    836478 non-null  int64
 1   movie_id   836478 non-null  int64
 2   num        836478 non-null  int64
 3   timestamp  836478 non-null  int64
dtypes: int64(4)
memory usage: 31.9 MB


In [19]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [20]:
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [21]:
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [22]:
als_model.fit(csr_data_transpose)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [23]:
yb, DH = user_to_idx[6041], movie_to_idx[1036]
yb_vector, DH_vector = als_model.user_factors[yb], als_model.item_factors[DH]


In [24]:
yb_vector

array([-0.29932597,  0.9283609 ,  1.023103  , -0.7722561 , -0.23729199,
       -0.11008844, -0.1581727 ,  0.4342204 , -0.2744641 , -0.3557448 ,
       -0.17221062, -0.00703315,  0.41097468,  0.818961  , -0.09599485,
        0.60233444,  0.6443691 ,  0.89599794, -0.5419207 , -0.18859406,
        0.15673783,  0.09297113,  0.04493131,  0.44475615, -0.099811  ,
        0.35150024, -0.1513979 ,  0.23466541, -1.0120753 ,  0.88326097,
        0.90294987,  0.6593593 , -0.34638086,  0.28650638, -0.33841744,
        0.3312461 , -0.64576703, -0.11461654,  0.61357844, -0.4899233 ,
        0.7843154 , -0.00468509, -0.10940877,  0.2331701 , -0.20803158,
        0.22057885,  0.4990506 ,  0.30316976,  0.72897834,  0.5275617 ,
        0.22241141, -0.59207946, -0.19727807, -0.5307408 , -0.9829467 ,
       -1.2285731 , -0.65077376, -0.4243282 ,  0.6361243 , -0.7268025 ,
        0.14457211, -0.01808147,  0.00589072,  0.10085419, -0.06238931,
        0.9746168 ,  0.06978201,  0.14922017, -0.34038234,  0.44

In [25]:
DH_vector

array([ 0.0131229 ,  0.02054163,  0.00252338, -0.01636476, -0.00355286,
        0.00139825, -0.00743355,  0.02705442, -0.03189367, -0.01053684,
       -0.01476574,  0.01684297,  0.01780818,  0.02739183, -0.01111834,
        0.02907916,  0.01862744,  0.01065291, -0.01322952, -0.02443216,
       -0.01088474,  0.01072131, -0.0053929 ,  0.01630711,  0.00824364,
        0.01752654,  0.01039477,  0.00976697,  0.00029497, -0.00208213,
        0.03434581,  0.03874246,  0.00336395,  0.00400394,  0.00279335,
        0.01612006, -0.01566236,  0.00474388, -0.00146738, -0.00899119,
        0.0084745 ,  0.01347997, -0.0022108 , -0.00107829,  0.01056064,
       -0.00703388,  0.00413245,  0.02746085,  0.03508498,  0.02711458,
        0.03246763,  0.0070092 ,  0.0133503 , -0.00931301, -0.02950811,
        0.00107794, -0.01217131, -0.00487335,  0.0247417 ,  0.01243615,
        0.02711762,  0.0052835 ,  0.00784851,  0.02754832, -0.00671674,
       -0.00336301, -0.00021953, -0.00571869,  0.00252665,  0.00

In [26]:
np.dot(yb_vector, DH_vector)

0.42209977

### 내가 좋아하는 영화 5개를 넣고 돌렸으며 그 중 다이하드의 csr_matrix를 구하고 내적하지 42.2%의 일치율이 나타났다.

## Matrix와 비슷한 영화 찾기

In [31]:
favorite_movie = 2571 # Matrix, The (1999)
movie_id = movie_to_idx[favorite_movie]
similar_movie = als_model.similar_items(movie_id, N=15)
similar_movie

[(124, 0.16781338),
 (92, 0.13310587),
 (62, 0.116115786),
 (141, 0.098923795),
 (200, 0.09786876),
 (145, 0.09153855),
 (107, 0.09082525),
 (175, 0.08515513),
 (375, 0.08288437),
 (317, 0.07596511),
 (75, 0.07295919),
 (44, 0.06591252),
 (117, 0.06336731),
 (236, 0.06109853),
 (372, 0.060823444)]

In [39]:
idx_to_movie = {v:k for k,v in movie_to_idx.items()}
movie_ids = [idx_to_movie[i[0]] for i in similar_movie]
print(movie_ids)

[2571, 589, 2916, 457, 1240, 1527, 480, 1580, 1573, 32, 1610, 260, 1196, 377, 3793]


In [38]:
for a in movie_ids:
    print(movies[movies['movie_id'] == a]['title'])

2502    Matrix, The (1999)
Name: title, dtype: object
585    Terminator 2: Judgment Day (1991)
Name: title, dtype: object
2847    Total Recall (1990)
Name: title, dtype: object
453    Fugitive, The (1993)
Name: title, dtype: object
1220    Terminator, The (1984)
Name: title, dtype: object
1491    Fifth Element, The (1997)
Name: title, dtype: object
476    Jurassic Park (1993)
Name: title, dtype: object
1539    Men in Black (1997)
Name: title, dtype: object
1533    Face/Off (1997)
Name: title, dtype: object
31    Twelve Monkeys (1995)
Name: title, dtype: object
1568    Hunt for Red October, The (1990)
Name: title, dtype: object
257    Star Wars: Episode IV - A New Hope (1977)
Name: title, dtype: object
1178    Star Wars: Episode V - The Empire Strikes Back...
Name: title, dtype: object
373    Speed (1994)
Name: title, dtype: object
3724    X-Men (2000)
Name: title, dtype: object


### Matrix와 비슷한 영화를 찾으니 터미네이터2, 토탈리콜이 가장 유사하게 나왔다. 세 영화를 다 본 사람으로써 상당히 일리가 있는 결과값이라는 생각이 든다.

## 내 취향 저격인 영화 추천하기

In [40]:
user = user_to_idx[6041]
movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

[(172, 0.4966699),
 (200, 0.44015968),
 (117, 0.43875682),
 (44, 0.4122848),
 (92, 0.37262183),
 (141, 0.3459295),
 (64, 0.34288064),
 (179, 0.2914965),
 (62, 0.29137012),
 (651, 0.28643247),
 (75, 0.26272985),
 (710, 0.2610611),
 (193, 0.25360742),
 (378, 0.24068141),
 (188, 0.22846916),
 (5, 0.22176623),
 (111, 0.2124521),
 (156, 0.20870017),
 (48, 0.20754719),
 (197, 0.20518497)]

In [42]:
movie_keys = [idx_to_movie[i[0]] for i in artist_recommended]
for a in movie_keys:
    print(movies[movies['movie_id'] == a]['title'])

1271    Indiana Jones and the Last Crusade (1989)
Name: title, dtype: object
1220    Terminator, The (1984)
Name: title, dtype: object
1178    Star Wars: Episode V - The Empire Strikes Back...
Name: title, dtype: object
257    Star Wars: Episode IV - A New Hope (1977)
Name: title, dtype: object
585    Terminator 2: Judgment Day (1991)
Name: title, dtype: object
453    Fugitive, The (1993)
Name: title, dtype: object
1192    Star Wars: Episode VI - Return of the Jedi (1983)
Name: title, dtype: object
724    Rock, The (1996)
Name: title, dtype: object
2847    Total Recall (1990)
Name: title, dtype: object
1182    Aliens (1986)
Name: title, dtype: object
1568    Hunt for Red October, The (1990)
Name: title, dtype: object
2548    Mummy, The (1999)
Name: title, dtype: object
1196    Alien (1979)
Name: title, dtype: object
1673    Tomorrow Never Dies (1997)
Name: title, dtype: object
2046    Indiana Jones and the Temple of Doom (1984)
Name: title, dtype: object
1179    Princess Bride, The (19

### 나에게 추천해준 영화는 인디아나존스2, 스타워즈 시리즈, 터미네티어시리즈, 더 록 토탈리콜, 에일리언 등을 추천했다. 개인적으로 다 재미이쎅 본 영화들이었고 추천 알고리즘이 잘 돌아간것 같아 뿌듯하다.