##  Movielens 영화 추천 실습

#### 라이브러리 로드

In [1]:
#kernel tf-210 
import numpy as np
import scipy
import implicit
import pandas as pd
import os

print(np.__version__)
print(scipy.__version__)
print(implicit.__version__)

1.21.4
1.7.1
0.4.8


#### 데이터 로드

In [2]:
import os
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head(10)

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
5,1,1197,3,978302268
6,1,1287,5,978302039
7,1,2804,5,978300719
8,1,594,4,978302268
9,1,919,4,978301368


#### 데이터 전처리

In [3]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [4]:
# ratings 컬럼의 이름을 counts로 바꿉니다.
ratings.rename(columns={'ratings':'counts'}, inplace=True)

In [5]:
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

#### 영화 메타 데이터 로드

In [6]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


#### 평점과 영화데이터 Merge

In [7]:
ratings = pd.merge(ratings, movies[['title','movie_id']], on='movie_id', how='left')

In [8]:
ratings

Unnamed: 0,user_id,movie_id,counts,timestamp,title
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975)
1,1,661,3,978302109,James and the Giant Peach (1996)
2,1,914,3,978301968,My Fair Lady (1964)
3,1,3408,4,978300275,Erin Brockovich (2000)
4,1,2355,5,978824291,"Bug's Life, A (1998)"
...,...,...,...,...,...
836473,6040,1090,3,956715518,Platoon (1986)
836474,6040,1094,5,956704887,"Crying Game, The (1992)"
836475,6040,562,5,956704746,Welcome to the Dollhouse (1995)
836476,6040,1096,4,956715648,Sophie's Choice (1982)


#### 영화제목을 기준으로 높은 평점 순 정렬

In [9]:
movie_count = ratings.groupby('title')['counts'].count()
movie_count.sort_values(ascending=False).head(30)

title
American Beauty (1999)                                   3211
Star Wars: Episode IV - A New Hope (1977)                2910
Star Wars: Episode V - The Empire Strikes Back (1980)    2885
Star Wars: Episode VI - Return of the Jedi (1983)        2716
Saving Private Ryan (1998)                               2561
Terminator 2: Judgment Day (1991)                        2509
Silence of the Lambs, The (1991)                         2498
Raiders of the Lost Ark (1981)                           2473
Back to the Future (1985)                                2460
Matrix, The (1999)                                       2434
Jurassic Park (1993)                                     2413
Sixth Sense, The (1999)                                  2385
Fargo (1996)                                             2371
Braveheart (1995)                                        2314
Men in Black (1997)                                      2297
Schindler's List (1993)                                  2257
Pr

#### 개인선호도 반영

In [10]:
# 선호영화 5개 

my_favorite = ['Fargo', 'Matrix', 'get out' , 'inception', 'gravity']
my_list = pd.DataFrame({'user_id':['test']*5, 'title':my_favorite, 'counts':[5]*5})

if not ratings.isin({'user_id':['test']})['user_id'].any():
    ratings = ratings.append(my_list)

ratings.tail(10)    

Unnamed: 0,user_id,movie_id,counts,timestamp,title
836473,6040,1090.0,3,956715518.0,Platoon (1986)
836474,6040,1094.0,5,956704887.0,"Crying Game, The (1992)"
836475,6040,562.0,5,956704746.0,Welcome to the Dollhouse (1995)
836476,6040,1096.0,4,956715648.0,Sophie's Choice (1982)
836477,6040,1097.0,4,956715569.0,E.T. the Extra-Terrestrial (1982)
0,test,,5,,Fargo
1,test,,5,,Matrix
2,test,,5,,get out
3,test,,5,,inception
4,test,,5,,gravity


#### 사용자, 영화제목을 중복을 배제하고 index를 매핑

In [11]:
# 유저, 영화제목 기준 중복 제거
user_unique = ratings['user_id'].unique()
movie_unique = ratings['title'].unique()

user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

#### 사용자 id를 중복배제한 후 indexing한 데이터로 대체 

In [12]:
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
temp_user_data

if len(temp_user_data) == len(ratings):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')
    

user_id column indexing OK!!


##### 사용자id를 indexing한 데이터

In [13]:
temp_user_data

0       0
1       0
2       0
3       0
4       0
     ... 
0    6039
1    6039
2    6039
3    6039
4    6039
Name: user_id, Length: 836483, dtype: int64

#### 영화ID칼럼을 기존 영화id칼럼에서 제목을 기준으로 indexing한 데이터로 대체 (3632건)

In [14]:
temp_movie_data = ratings['title'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(ratings):   # 모든 row가 정상적으로 인덱싱되었다면
    print('movie_id column indexing OK!!')
    ratings['movie_id'] = temp_movie_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('movie_id column indexing Fail!!')

movie_id column indexing OK!!


##### 영화제목을 중복배제하고 indexing한 데이터 

In [15]:
temp_movie_data

0       0
1       1
2       2
3       3
4       4
     ... 
0    3628
1    3629
2    3630
3    3631
4    3632
Name: title, Length: 836483, dtype: int64

#### user_id와 Movie_id는 index역할을 하며 실제 데이터는 타이틀과 사용자 ID임

In [16]:
ratings

Unnamed: 0,user_id,movie_id,counts,timestamp,title
0,0,0,5,978300760.0,One Flew Over the Cuckoo's Nest (1975)
1,0,1,3,978302109.0,James and the Giant Peach (1996)
2,0,2,3,978301968.0,My Fair Lady (1964)
3,0,3,4,978300275.0,Erin Brockovich (2000)
4,0,4,5,978824291.0,"Bug's Life, A (1998)"
...,...,...,...,...,...
0,6039,3628,5,,Fargo
1,6039,3629,5,,Matrix
2,6039,3630,5,,get out
3,6039,3631,5,,inception


세로축은 사용자ID에 대한 index 가로축은 영화제목에 대한 index를 기준으로 pivoting( 6040 * 3633 )

In [17]:
# 실습 위에 설명보고 이해해서 만들어보기
from scipy.sparse import csr_matrix

num_user = ratings['user_id'].nunique()
num_movies = ratings['movie_id'].nunique()

csr_data = csr_matrix((ratings['counts'], (ratings.user_id, ratings.movie_id)), shape=(num_user, num_movies))
csr_data


<6040x3633 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

In [18]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# implicit 라이브러리에서 권장하고 있는 부분입니다. 
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

# Implicit AlternatingLeastSquares 모델의 선언
# 1. factors : 유저와 아이템의 벡터를 몇 차원으로 할 것인지
# 2. regularization : 과적합을 방지하기 위해 정규화 값을 얼마나 사용할 것인지 
# 3. use_gpu : GPU를 사용할 것인지 
# 4. iterations : epochs와 같은 의미입니다.
als_model = AlternatingLeastSquares(factors=100, regularization=0.001, use_gpu=False, iterations=100, dtype=np.float32)

In [19]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<3633x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [20]:
# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/100 [00:00<?, ?it/s]

In [21]:
kbc, inception = user_to_idx['test'], movie_to_idx['Fargo']
kbc_vector, inception_vector = als_model.user_factors[kbc], als_model.item_factors[inception]


In [22]:
# 내적하는 코드
np.dot(kbc_vector, inception_vector)

1.7693739e-14

In [23]:
getout = movie_to_idx['get out']
getout_vector = als_model.item_factors[getout]
np.dot(kbc_vector, getout_vector)

4.4956616e-14

In [24]:
# (id, 유사도 )
favorite_movie = 'Matrix'
movie_id = movie_to_idx[favorite_movie]
similar_movie = als_model.similar_items(movie_id, N=15)
similar_movie

[(3629, 1.0),
 (3631, 0.9986594),
 (3632, 0.9985732),
 (3630, 0.998209),
 (3628, 0.9931925),
 (3579, 0.9662137),
 (3575, 0.96621263),
 (3578, 0.9662109),
 (3576, 0.96621054),
 (3574, 0.9662084),
 (3572, 0.9662014),
 (3583, 0.96620136),
 (3573, 0.96619976),
 (3580, 0.96619946),
 (3577, 0.96619725)]

items() 함수를 호출하여 Dictionary형태의 데이터형으로 변환

In [25]:
#artist_to_idx 를 뒤집어, index로부터 artist 이름을 얻는 dict를 생성합니다. 
idx_to_movie = {v:k for k,v in movie_to_idx.items()}

In [26]:
# 비슷한 영화 이름 검색 함수
def get_similar_movie(movie_name: str):
    movie_id = movie_to_idx[movie_name]
    similar_movie = als_model.similar_items(movie_id)
    similar_movie = [idx_to_movie[i[0]] for i in similar_movie]
    return similar_movie

print("슝=3")

슝=3


In [27]:
get_similar_movie('Matrix')

['Matrix',
 'inception',
 'gravity',
 'get out',
 'Fargo',
 'Last of the High Kings, The (a.k.a. Summer Fling) (1996)',
 "Brother's Kiss, A (1997)",
 'Number Seventeen (1932)',
 'War at Home, The (1996)',
 'Century (1993)']

In [28]:
user = user_to_idx['test']
# recommend에서는 user*item CSR Matrix를 받습니다.
movie_recommended = als_model.recommend(user, csr_data, N=30, filter_already_liked_items=False)
movie_recommended

[(953, 4.626397e-08),
 (22, 4.5547054e-08),
 (476, 4.375514e-08),
 (26, 4.1678653e-08),
 (29, 4.126872e-08),
 (2286, 4.075807e-08),
 (2609, 4.031739e-08),
 (1508, 4.0045332e-08),
 (107, 3.9165666e-08),
 (2331, 3.7938015e-08),
 (2229, 3.738851e-08),
 (2129, 3.7198756e-08),
 (2661, 3.7112738e-08),
 (361, 3.7062357e-08),
 (2376, 3.6810597e-08),
 (3050, 3.6514336e-08),
 (470, 3.6145842e-08),
 (504, 3.6096434e-08),
 (238, 3.5808878e-08),
 (2239, 3.4932427e-08),
 (13, 3.487571e-08),
 (258, 3.470563e-08),
 (2095, 3.4505366e-08),
 (2044, 3.389316e-08),
 (2593, 3.38558e-08),
 (296, 3.379309e-08),
 (869, 3.3211748e-08),
 (480, 3.3072162e-08),
 (3309, 3.2962657e-08),
 (1393, 3.2879868e-08)]

In [29]:
[idx_to_movie[i[0]] for i in movie_recommended]

['Muppet Movie, The (1979)',
 'Back to the Future (1985)',
 'Wrong Trousers, The (1993)',
 'E.T. the Extra-Terrestrial (1982)',
 'Close Shave, A (1995)',
 'Friday (1995)',
 'Great White Hype, The (1996)',
 'Fantasia 2000 (1999)',
 'Jurassic Park (1993)',
 'Walk on the Moon, A (1999)',
 'Thieves (Voleurs, Les) (1996)',
 'Character (Karakter) (1997)',
 'Family Thing, A (1996)',
 'Casablanca (1942)',
 'Xiu Xiu: The Sent-Down Girl (Tian yu) (1998)',
 'Jude (1996)',
 'Swingers (1996)',
 'Better Off Dead... (1985)',
 'Thin Red Line, The (1998)',
 'East-West (Est-ouest) (1999)',
 "Ferris Bueller's Day Off (1986)",
 'Wild Reeds (1994)',
 'Taking of Pelham One Two Three, The (1974)',
 'Drunken Master (Zui quan) (1979)',
 'My Favorite Season (1993)',
 'Afterglow (1997)',
 'Angel Baby (1995)',
 'Grand Day Out, A (1992)',
 'Hollow Reed (1996)',
 'Paper Chase, The (1973)']

## 회고

- process flow
  . 데이터 다운로드 <br>
  . 랭킹데이터 전처리 <br>
  . 영화 데이터 다운로드 하여 평점정보와 병합하여 하나의 테이블로 매핑<br>
  . 사용자id 인덱싱<br>
  . 영화 타이틀 인덱싱<br>
  . 영화id에 영화 타이틀 인덱싱으로 정보 변경<br>
  . CSR matrix 작성<br>
  . als_model의 입력으로 상품 * 유저형태( 버전별로 차이가 있음 0.6.2 버전에서는 다르게 동작 )<br>
  . als model 모델 구성<br>
  . 훈련<br>
  . 추천시스템 확인<br>
  . 사용자id 와 영화id를 자체 인덱싱하여 csr maxtrix로 pivoting하는 부분까지가 기존과 차이점으로 생각됨<br>
<br>
- 모델 비교<br>
   .als_model 모델에 정규화 파라미터를 조정하는 경우 유사도 측정시 많은 차이점을 보임<br>
<br>