# Ex13

In [1]:
! pip install --upgrade pip setuptools wheel
! pip install opencv-python
! pip install --upgrade implicit=='0.4.8'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0m

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
import scipy
import implicit
import pandas as pd
from implicit.als import AlternatingLeastSquares

print(np.__version__)
print(scipy.__version__)
print(implicit.__version__)

1.21.6
1.7.3
0.4.8


#### 데이터 준비와 전처리

In [4]:
rating_file_path='/content/drive/MyDrive/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
data = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(data)
data.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
# 3점 이상만 남긴다.
data = data[data['ratings']>=3]
filtered_data_size = len(data)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [6]:
# ratings 컬럼의 이름을 counts로 바꾼다.
data.rename(columns={'ratings':'counts'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [7]:
data['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [8]:
# 영화 제목을 보기 위해 메타 데이터를 읽어 온다.
movie_file_path='/content/drive/MyDrive/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
# 인기 많은 영화
movie_count = data.groupby('movie_id')['counts'].count()
movie_count.sort_values(ascending=False).head(30)

movie_id
2858    3211
260     2910
1196    2885
1210    2716
2028    2561
589     2509
593     2498
1198    2473
1270    2460
2571    2434
480     2413
2762    2385
608     2371
110     2314
1580    2297
527     2257
1197    2252
2396    2213
1617    2210
318     2194
858     2167
1265    2121
1097    2102
2997    2066
2716    2051
296     2030
356     2022
1240    2019
1       2000
457     1941
Name: counts, dtype: int64

영화 id 2858이 제일 인기가 많았다. 그렇다면 이 영화의 제목은 무엇일까?

In [10]:
movies.head(2790)

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
2785,2854,Don't Look in the Basement! (1973),Horror
2786,2855,Nightmares (1983),Horror
2787,2856,I Saw What You Did (1965),Thriller
2788,2857,Yellow Submarine (1968),Animation|Musical


제일 인기가 많았던 영화는 'American Beauty'라는 것을 알 수 있다. 

영화를 추천 받기 위해서는

사용자 초기 정보 값을 입력해야 한다.

예를 들어, 넷플릭스 가입하는 동시 선호하는 영화를 몇 개 고르는 것과 같은 것이다.

'Back to the Future (1985)','Notting Hill (1999)', 'Toy Story (1995)', 'Titanic (1997)','Negotiator, The (1998)'를 좋아하는 영화로 선택


In [11]:
# Back_to_the_Futre, Toy_Story, Negotiation, Titanic, Notting_Hill

my_favorite = [1270, 2671, 3404, 2058, 1]
my_list = pd.DataFrame({'user_id': [9999]*5, 'movie_id': my_favorite, 'counts': [5]*5})
data = data.append(my_list)
data.tail(10)

Unnamed: 0,user_id,movie_id,counts,timestamp
1000203,6040,1090,3,956715518.0
1000205,6040,1094,5,956704887.0
1000206,6040,562,5,956704746.0
1000207,6040,1096,4,956715648.0
1000208,6040,1097,4,956715569.0
0,9999,1270,5,
1,9999,2671,5,
2,9999,3404,5,
3,9999,2058,5,
4,9999,1,5,


데이터의 쉽고 용이한 관리를 위해 indexing을 해주겠다.

In [12]:
# 고유한 유저, 영화를 찾아내는 코드
user_unique = data['user_id'].unique()
movie_unique = data['movie_id'].unique()

# 유저, 영화 indexing 하는 코드 idx는 index의 약자이다.
user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

# 인덱싱이 잘 되었는지 확인해 보자. 
print(user_to_idx[9999]) 
print(movie_to_idx[1])

6039
40


In [13]:
# user_to_idx.get을 통해 user_id 컬럼의 모든 값을 인덱싱한 Series를 구해 보자. 
# 혹시 정상적으로 인덱싱되지 않은 row가 있다면 인덱스가 NaN이 될 테니 dropna()로 제거한다. 
temp_user_data = data['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(data):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    data['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해준다. 
else:
    print('user_id column indexing Fail!!')

# artist_to_idx을 통해 artist 컬럼도 동일한 방식으로 인덱싱해준다. 
temp_movie_data = data['movie_id'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(data):
    print('movie column indexing OK!!')
    data['movie_id'] = temp_movie_data
else:
    print('movie column indexing Fail!!')

data

user_id column indexing OK!!
movie column indexing OK!!


Unnamed: 0,user_id,movie_id,counts,timestamp
0,0,0,5,978300760.0
1,0,1,3,978302109.0
2,0,2,3,978301968.0
3,0,3,4,978300275.0
4,0,4,5,978824291.0
...,...,...,...,...
0,6039,22,5,
1,6039,1151,5,
2,6039,1626,5,
3,6039,791,5,


### 모델링

In [14]:
# 매트릭스 만들기
from scipy.sparse import csr_matrix

num_user = data['user_id'].nunique()
num_movie = data['movie_id'].nunique()

csr_data = csr_matrix((data.counts, (data.user_id, data.movie_id)), shape= (num_user, num_movie))
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

In [15]:
# implicit 라이브러리에서 권장하고 있는 부분이다.
import os
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [16]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=300, regularization=0.01, use_gpu=False, iterations=40, dtype=np.float32)

In [17]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해준다.
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [18]:
# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/40 [00:00<?, ?it/s]

In [19]:
myukim, back_to_the_future = user_to_idx[9999], movie_to_idx[1270]
myukim_vector, back_to_the_future_vector = als_model.user_factors[myukim], als_model.item_factors[back_to_the_future]

In [20]:
myukim_vector

array([-3.26816589e-01, -3.57971519e-01,  2.54630566e-01, -4.75480497e-01,
        5.75823300e-02, -9.35700536e-03, -1.73587538e-02, -2.36170590e-01,
        8.52441788e-02, -1.75755918e-01, -2.07676157e-01,  2.07965404e-01,
        4.72548343e-02,  4.26162243e-01,  2.57680923e-01, -1.40546054e-01,
       -5.48140407e-02, -6.67354345e-01, -6.72793910e-02, -9.31216851e-02,
        5.46592534e-01, -3.37727904e-01, -2.00867698e-01, -4.01182830e-01,
        4.41041917e-01,  1.40213349e-03, -1.82179108e-01,  1.89163233e-03,
        5.28073311e-01,  5.73220372e-01,  3.67888629e-01,  1.86061561e-01,
       -1.53382644e-01,  1.66779920e-01, -1.98465124e-01,  1.43302828e-01,
       -4.21129391e-02,  3.51774804e-02, -3.49238157e-01,  1.44195601e-01,
        1.37004137e-01, -2.45891318e-01,  4.17704843e-02,  2.85230011e-01,
       -6.82941973e-01, -2.29887187e-01,  5.73393941e-01, -6.31186783e-01,
        6.00991771e-02,  3.57204169e-01,  3.39453608e-01,  1.68960392e-01,
        4.18655761e-02,  

In [21]:
back_to_the_future_vector

array([-1.46833528e-02, -1.05408467e-02,  2.93450933e-02,  6.47063181e-03,
        3.06168050e-02,  3.86364534e-02,  7.42237922e-03, -1.31942993e-02,
        6.46320805e-02, -1.00712059e-02,  4.06396529e-03,  4.87374403e-02,
        3.04087233e-02,  4.10583690e-02, -6.29504537e-03, -4.05432023e-02,
        1.55875627e-02, -4.00980413e-02, -1.18003441e-02,  1.32958442e-02,
        2.34499015e-02,  1.11004692e-02, -7.23408815e-03, -2.40296610e-02,
        2.29939539e-02,  2.69690156e-02, -2.59337202e-02, -3.66135291e-03,
        1.43617280e-02,  3.46725956e-02,  1.04869716e-02,  3.33989449e-02,
        1.36725930e-03, -3.80613618e-02,  1.42391464e-02, -2.65100617e-02,
       -3.80628766e-03, -1.24821449e-02,  3.59223187e-02,  2.95776669e-02,
        2.02375725e-02,  5.24632446e-03, -2.05706600e-02,  2.96649598e-02,
       -2.81637534e-04,  2.46164668e-02, -1.28432326e-02, -1.38653675e-03,
       -5.32903522e-03, -7.62835285e-03,  6.78760349e-04,  8.33524205e-03,
        9.76346713e-03, -

### 모델 평가하기

In [22]:
# myukim과 back_to_the_future를 내적하는 코드
np.dot(myukim_vector, back_to_the_future_vector)

0.8783628

선호도가 높게 나온다. 잘 훈련이 진행되었다는 것을 알 수 있다.

내가 선호했던 다른 영화도 이와 같은 결과를 가지는 지 궁금하다.

In [23]:
# movie_to_idx[1]은 toy story를 뜻한다.
toy_story = movie_to_idx[1]
toy_vector = als_model.item_factors[toy_story]
np.dot(myukim_vector, toy_vector)

0.7641408

토이스토리도 선호도가 높게 잘 나온다는 것을 알 수 있다.

### 비슷한 영화 and 추천 영화

#### 비슷한 영화

In [24]:
similar_movie = als_model.similar_items(back_to_the_future, N=15)
similar_movie

[(22, 1.0),
 (674, 0.34656453),
 (13, 0.25399604),
 (2824, 0.25199613),
 (3543, 0.25026911),
 (2714, 0.2485178),
 (1675, 0.24645469),
 (3370, 0.24450088),
 (2827, 0.24442378),
 (1881, 0.24340895),
 (1594, 0.23827834),
 (3559, 0.23801678),
 (3565, 0.2376967),
 (2630, 0.23752226),
 (3434, 0.23697883)]

In [25]:
idx_to_movie = {v:k for k,v in movie_to_idx.items()}
movie_id=[idx_to_movie[i[0]] for i in similar_movie]
print(movie_id)

[1270, 2011, 2918, 1496, 985, 131, 718, 3294, 1743, 2341, 3939, 3236, 1316, 1741, 1990]


1270은 Back_to_the_Future의 인덱스 번호다. 그래서 비슷한 정도가 1이 나온 것이다.

In [26]:
movies.head(1946)

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
1941,2010,Metropolis (1926),Sci-Fi
1942,2011,Back to the Future Part II (1989),Comedy|Sci-Fi
1943,2012,Back to the Future Part III (1990),Comedy|Sci-Fi|Western
1944,2013,"Poseidon Adventure, The (1972)",Action|Adventure


비슷한 영화로 Back_to_the_Future2를 추천해주고 있다.

다음은 추천을 받아보자.

#### 시스템 추천 영화

In [27]:
user = user_to_idx[9999]
# recommend에서는 user*item CSR Matrix를 받습니다.
movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

[(50, 0.2492904),
 (626, 0.22351332),
 (85, 0.19821772),
 (799, 0.19799773),
 (674, 0.18611379),
 (255, 0.18467164),
 (379, 0.18462588),
 (175, 0.1766848),
 (340, 0.17667776),
 (488, 0.16973342),
 (124, 0.16873184),
 (851, 0.16131353),
 (88, 0.15771165),
 (114, 0.1533191),
 (96, 0.15290098),
 (126, 0.15242697),
 (548, 0.14900714),
 (648, 0.14640275),
 (129, 0.14589402),
 (473, 0.14514595)]

similar 보다 더 유사도가 생각보다 낮게 나온다.

In [28]:
#원래 index로 바꿔주기
[idx_to_movie[i[0]] for i in movie_recommended]

[3114,
 2424,
 3255,
 2724,
 2011,
 1923,
 3256,
 1580,
 1569,
 1307,
 2571,
 2761,
 2278,
 2353,
 1945,
 2396,
 1282,
 2000,
 1597,
 3178]

In [29]:
# 첫번째로 추천해 준 영화
movies.head(3050)

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3045,3114,Toy Story 2 (1999),Animation|Children's|Comedy
3046,3115,Flawless (1999),Drama
3047,3116,Miss Julie (1999),Drama
3048,3117,Ride with the Devil (1999),Drama|Romance|War


Toy Story2를 추천해주고 있다. 그렇다면, 어떠한 영화가 이와 같이 추천을 해줬을까?

In [30]:
Toy_story2 = movie_to_idx[3114]
explain = als_model.explain(user, csr_data, itemid=Toy_story2)

In [31]:
[(idx_to_movie[i[0]], i[1]) for i in explain[1]]

[(1, 0.2412372011540577),
 (2058, 0.04195802920332205),
 (2671, 0.02278599672476561),
 (1270, 0.00025268557997781013),
 (3404, -0.06027079447310567)]

전작품인 Toy Story1이 영향을 많이 주고 있다.

In [32]:
#두번째로 추천해준 영화
movies.head(2360)

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
2355,2424,You've Got Mail (1998),Comedy|Romance
2356,2425,"General, The (1998)",Crime
2357,2426,"Theory of Flight, The (1998)",Comedy|Drama|Romance
2358,2427,"Thin Red Line, The (1998)",Action|Drama|War


In [33]:
YGM = movie_to_idx[2424]
explain = als_model.explain(user, csr_data, itemid=YGM)
[(idx_to_movie[i[0]], i[1]) for i in explain[1]]

[(2671, 0.12369346410888168),
 (1270, 0.03970327202371545),
 (1, 0.029666679443015112),
 (2058, 0.021109686871570436),
 (3404, 0.007334407729559025)]

2671은 노팅힐의 인덱스 번호다. 노팅힐의 장르는 로맨틱 코미디이다. 이와 동일한 장르로서, 'You've Got Mail (1998)' 영화를 추천했다고 추측된다.

### 회고

날이 갈 수록 프로젝트는 어렵지만, 재미는 배가 되는 거 같다. 비록, 모든 것이 찍먹인 단계이지만 구현해낼 수 있다는 거에 의미를 두고 싶다. 고통은 당연 뒤따라 오고 있다. 하지만, 고통 뒤에 성장도 꼭 같이 오고 있다고 믿고 있다.


아쉬운 점

1. matrix를 좀 더 공부해야겠다.

