In [221]:
import numpy as np
import scipy
import implicit
import pandas as pd
print(np.__version__)
print(scipy.__version__)
print(implicit.__version__)

1.21.4
1.7.1
0.4.8


## 데이터 준비와 전처리 
Movielens 데이터는 rating.dat 안에 이미 인덱싱까지 완료된 사용자-영화-평점 데이터가 깔끔하게 정리되어 있습니다.

In [222]:
import os
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [223]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [224]:
# ratings 컬럼의 이름을 counts로 바꿉니다.
ratings.rename(columns={'ratings':'counts'}, inplace=True)

In [225]:
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [226]:
ratings.head()

Unnamed: 0,user_id,movie_id,counts,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [227]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [228]:
# lower 전처리 하기 

movies["title"]=movies["title"].str.lower()
movies["genre"]=movies["genre"].str.lower()

In [229]:
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,toy story (1995),animation|children's|comedy
1,2,jumanji (1995),adventure|children's|fantasy
2,3,grumpier old men (1995),comedy|romance
3,4,waiting to exhale (1995),comedy|drama
4,5,father of the bride part ii (1995),comedy


ratings에 있는 유니크한 영화 개수  
ratings에 있는 유니크한 사용자 수  
가장 인기 있는 영화 30개(인기순)  

In [230]:
# rating에 있는 유니크한 영화 개수 
ratings["movie_id"].nunique()

3628

In [231]:
# ratings에 있는 유니크한 사용자 수
ratings["user_id"].nunique()

6039

## Rating와 movie 데이터 파악 및 합치기 


In [232]:
ratings[ratings["user_id"]==1]

Unnamed: 0,user_id,movie_id,counts,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
5,1,1197,3,978302268
6,1,1287,5,978302039
7,1,2804,5,978300719
8,1,594,4,978302268
9,1,919,4,978301368


In [233]:
# ratings 같은 유저 순으로 재정렬 하기 
ratings.sort_values(by=["user_id"],ascending=True,inplace=True)
ratings.head()

Unnamed: 0,user_id,movie_id,counts,timestamp
0,1,1193,5,978300760
29,1,745,3,978824268
30,1,2294,4,978824291
31,1,3186,4,978300019
32,1,1566,4,978824330


In [234]:
# rating과 movies movie id 기준으로 합치기 
new_movie = pd.merge(ratings,movies,on="movie_id",how="left")
new_movie.head()

Unnamed: 0,user_id,movie_id,counts,timestamp,title,genre
0,1,1193,5,978300760,one flew over the cuckoo's nest (1975),drama
1,1,745,3,978824268,"close shave, a (1995)",animation|comedy|thriller
2,1,2294,4,978824291,antz (1998),animation|children's
3,1,3186,4,978300019,"girl, interrupted (1999)",drama
4,1,1566,4,978824330,hercules (1997),adventure|animation|children's|comedy|musical


In [235]:
"""
가장 인기있는 영화 30개순 
인기 있다는건 유저가 많이 본것을 의미 하니까 유저에 대한 counting이 필요 하다 


"""

import seaborn as sns 
import matplotlib.pyplot as plt 

movie_count = new_movie.groupby("movie_id").agg({"user_id":"count"})  

movie_count

Unnamed: 0_level_0,user_id
movie_id,Unnamed: 1_level_1
1,2000
2,551
3,339
4,102
5,214
...,...
3948,752
3949,280
3950,47
3951,36


In [236]:
# 상위 30개를 위한 정렬 하고 영화 아이디 가지고 제목 가져오기 

movie_count=movie_count.sort_values(by=["user_id"],ascending=False)
movie_count

Unnamed: 0_level_0,user_id
movie_id,Unnamed: 1_level_1
2858,3211
260,2910
1196,2885
1210,2716
2028,2561
...,...
1553,1
1548,1
2486,1
138,1


In [237]:
top30 = movie_count.head(30)
top30

Unnamed: 0_level_0,user_id
movie_id,Unnamed: 1_level_1
2858,3211
260,2910
1196,2885
1210,2716
2028,2561
589,2509
593,2498
1198,2473
1270,2460
2571,2434


In [238]:
# user가 많이 본 영화 제목 출력 
top30_index = top30.index.to_list()

for i in range(30):
    print(new_movie[new_movie["movie_id"]==top30_index[i]].values[0])

[2 2858 4 978298434 'american beauty (1999)' 'comedy|drama']
[1 260 4 978300760 'star wars: episode iv - a new hope (1977)'
 'action|adventure|fantasy|sci-fi']
[2 1196 5 978298730
 'star wars: episode v - the empire strikes back (1980)'
 'action|adventure|drama|sci-fi|war']
[2 1210 4 978298151 'star wars: episode vi - return of the jedi (1983)'
 'action|adventure|romance|sci-fi|war']
[1 2028 5 978301619 'saving private ryan (1998)' 'action|drama|war']
[2 589 4 978299773 'terminator 2: judgment day (1991)'
 'action|sci-fi|thriller']
[2 593 5 978298517 'silence of the lambs, the (1991)' 'drama|thriller']
[2 1198 4 978298124 'raiders of the lost ark (1981)' 'action|adventure']
[1 1270 5 978300055 'back to the future (1985)' 'comedy|sci-fi']
[2 2571 4 978299773 'matrix, the (1999)' 'action|sci-fi|thriller']
[2 480 5 978299809 'jurassic park (1993)' 'action|adventure|sci-fi']
[1 2762 4 978302091 'sixth sense, the (1999)' 'thriller']
[1 608 4 978301398 'fargo (1996)' 'crime|drama|thriller']


## 3) 내가 선호하는 영화를 5가지 골라서 ratings에 추가해 줍시다.

In [239]:
# genre . movie id 
def get_movie_genre(favorite_list):
    genre_ = []
 
    #5개만 기준이다
    for i in range(5):
        genre_.extend(movies[movies['title'] == favorite_list[i]]['genre'].to_list())


    return genre_

In [240]:
# 좋아하는 영화 평점 추가 하기 
# user id는 6041번째 
favorite_list =["psycho (1960)","philadelphia (1993)","saving private ryan (1998)","gladiator (2000)","mighty aphrodite (1995)"]

favorit_movie_id = list(map(lambda x : int(movies[movies["title"]==x]["movie_id"].values),favorite_list))

my_movie_data = pd.DataFrame({"user_id":[6041]*5,
                             "movie_id":favorit_movie_id,
                             "counts":[3,4,5,4,5],
                             "genre":get_movie_genre(favorite_list),
                             "title":favorite_list})


In [241]:
# favorite_list = ["Iron man","Spider-Man: No Way Home",
#                 "Gifted","Begin Again","About Time"]

# favorit_rate = [7.9,8.2,7.6,7.4,7.8]
# my_movie_data = pd.DataFrame({"user_id":[6041,6042,6043,6044,6045],
#                              "movie_id":[3953,3954,3955,3956,3957],
#                              "counts":favorit_rate})




In [266]:
if not new_movie.isin({'user_id':[6041]})['user_id'].any():  # user_id에 6041이라는 데이터가 없다면
    new_movie = new_movie.append(my_movie_data) 
new_movie.tail()

Unnamed: 0,user_id,movie_id,counts,timestamp,title,genre
0,6041,1219,3,,psycho (1960),horror|thriller
1,6041,508,4,,philadelphia (1993),drama
2,6041,2028,5,,saving private ryan (1998),action|drama|war
3,6041,3578,4,,gladiator (2000),action|drama
4,6041,52,5,,mighty aphrodite (1995),comedy


In [267]:
# 고유한 유저, 영화를 찾아내는 코드
user_unique = new_movie['user_id'].unique()
# movie_unique = ratings['movie_id'].unique()
movie_unique = new_movie['title'].unique()

# 유저, 영화 indexing 하는 코드 idx는 index의 약자입니다.
user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

In [270]:
# 인덱싱이 잘 되었는지 확인해 봅니다. 
print('user_id indexing: {}\n'.format(user_to_idx[6041]))

for i in favorit_movie_id:
    print('movie_id indexing: {}'.format(i))

user_id indexing: 6039

movie_id indexing: 1219
movie_id indexing: 508
movie_id indexing: 2028
movie_id indexing: 3578
movie_id indexing: 52


In [273]:
temp_user_ratings = new_movie['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_ratings) == len(new_movie):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    new_movie['user_id'] = temp_user_ratings   # ratings['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

# movie_to_idx을 통해 movie 컬럼도 동일한 방식으로 인덱싱해 줍니다. 

temp_movie_ratings = new_movie['title'].map(movie_to_idx.get).dropna()
if len(temp_movie_ratings) == len(new_movie):
    print('movie column indexing OK!!')
    new_movie['movie_id'] = temp_movie_ratings
else:
    print('movie column indexing Fail!!')

new_movie.tail(10)

user_id column indexing Fail!!
movie column indexing OK!!


Unnamed: 0,user_id,movie_id,counts,timestamp,title,genre
836473,6038,1451,4,997453762.0,"dreamlife of angels, the (la vie rêvée des ang...",drama
836474,6038,64,4,997454126.0,"matrix, the (1999)",action|sci-fi|thriller
836475,6038,321,3,964828542.0,out of sight (1998),action|crime|romance
836476,6038,306,4,956705056.0,go (1999),crime
836477,6038,28,4,956715569.0,e.t. the extra-terrestrial (1982),children's|drama|fantasy|sci-fi
0,6039,1011,3,,psycho (1960),horror|thriller
1,6039,428,4,,philadelphia (1993),drama
2,6039,19,5,,saving private ryan (1998),action|drama|war
3,6039,119,4,,gladiator (2000),action|drama
4,6039,1098,5,,mighty aphrodite (1995),comedy


In [274]:
new_movie.isna().sum()

user_id      0
movie_id     0
counts       0
timestamp    5
title        0
genre        0
dtype: int64

## 4) CSR matrix를 직접 만들어 봅시다.

In [276]:
from scipy.sparse import csr_matrix

num_user = new_movie['user_id'].nunique()
num_movie = new_movie['movie_id'].nunique()

csr_data = csr_matrix((new_movie.counts, (new_movie.user_id, new_movie.movie_id)), shape= (num_user, num_movie))
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

## 5) als_model = AlternatingLeastSquares 모델을 직접 구성하여 훈련시켜 봅시다.

In [277]:
# ALS 사용하는 MF 

from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [278]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [279]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [280]:
# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [281]:
user=user_to_idx[6041]
user_vector = als_model.user_factors[user]


In [282]:
user_vector  # 사용자 벡터

array([-0.3554642 ,  0.0679852 ,  0.11346073,  0.08256622, -0.7435031 ,
       -0.65676886,  0.19136626,  0.29651153, -0.06436598,  0.08378376,
       -0.17190933,  0.5606403 ,  0.28442636, -0.06091502, -0.07915539,
        0.5549235 ,  0.13248731, -0.03944676,  0.23888218, -0.09676064,
       -0.11705393, -0.25890592,  0.7684527 ,  1.0147793 , -0.02463663,
       -0.016728  , -0.41321677,  0.95431554,  0.5833441 , -0.57188696,
       -0.41599628, -0.14668562,  0.14796191,  0.2826938 , -0.49351427,
       -0.44877365,  0.03723872,  0.01848946, -0.02383014,  0.09247822,
        0.01672223,  0.7230829 , -0.9297584 ,  0.21103665, -0.09434933,
       -0.16251871,  0.27023643, -0.36985937, -0.14715023, -0.29280448,
       -0.0188821 ,  0.61677945,  0.5615376 , -0.09186218, -0.12390497,
       -0.18930545,  0.00918412, -0.5394854 , -0.45827097,  0.13675323,
       -0.196833  ,  0.10242654, -1.6480776 , -0.22934785,  0.3905067 ,
        0.04675451, -1.0041382 ,  0.48357645, -0.41038036,  0.98

## 6) 내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도를 파악해 보세요.

In [284]:
toy_story_2 = movie_to_idx["saving private ryan (1998)"]
toy_story_2_vector = als_model.item_factors[toy_story_2]
np.dot(user_vector, toy_story_2_vector)

0.5096186

In [289]:
def predict_my_preference(user_vector, title, print_flag=True):
    movie = movie_to_idx.get(title, 0)
    movie_vector = als_model.item_factors[movie]

    # user_vector와 movie_vector를 내적하는 코드
    dot_user_movie = np.dot(user_vector, movie_vector)
    if print_flag:
        print(f'*user_vector 와 movie_vector({title}) 를 내적한 결과: {dot_user_movie:.2f}')
    else:
        return round(dot_user_movie, 2)

In [290]:
predict_my_preference(user_vector, 'saving private ryan (1998)')

*user_vector 와 movie_vector(saving private ryan (1998)) 를 내적한 결과: 0.51


In [292]:
for movie_title in favorite_list:
    predict_my_preference(user_vector, movie_title)

*user_vector 와 movie_vector(psycho (1960)) 를 내적한 결과: 0.27
*user_vector 와 movie_vector(philadelphia (1993)) 를 내적한 결과: 0.17
*user_vector 와 movie_vector(saving private ryan (1998)) 를 내적한 결과: 0.51
*user_vector 와 movie_vector(gladiator (2000)) 를 내적한 결과: 0.48
*user_vector 와 movie_vector(mighty aphrodite (1995)) 를 내적한 결과: 0.22


In [295]:
favorite_movie = 'saving private ryan (1998)'
movie_title = movie_to_idx[favorite_movie]
similar_movies = als_model.similar_items(movie_title, N=15)
similar_movies

[(19, 1.0000001),
 (153, 0.68533653),
 (41, 0.532552),
 (85, 0.46644583),
 (64, 0.41312945),
 (100, 0.4108557),
 (299, 0.40695584),
 (472, 0.39656755),
 (66, 0.391129),
 (3499, 0.3656867),
 (222, 0.3331737),
 (70, 0.32908934),
 (95, 0.32653284),
 (158, 0.32582712),
 (44, 0.31499642)]

In [296]:
idx_to_movie = {v:k for k,v in movie_to_idx.items()}
[idx_to_movie[i[0]] for i in similar_movies]

['saving private ryan (1998)',
 'braveheart (1995)',
 "schindler's list (1993)",
 'shawshank redemption, the (1994)',
 'matrix, the (1999)',
 'fugitive, the (1993)',
 'good will hunting (1997)',
 'boat, the (das boot) (1981)',
 'silence of the lambs, the (1991)',
 'simon sez (1999)',
 'goodfellas (1990)',
 'star wars: episode v - the empire strikes back (1980)',
 'thelma & louise (1991)',
 'terminator 2: judgment day (1991)',
 'back to the future (1985)']

## 7) 내가 좋아하는 영화와 비슷한 영화를 추천받아 봅시다.

In [297]:
def get_similar_movie(movie_name: str):
    movie_title = movie_to_idx[movie_name]
    similar_movie = als_model.similar_items(movie_title, N=15)
    similar_movie = [idx_to_movie[i[0]] for i in similar_movies]
    return similar_movie

print("슝=3")

슝=3


In [299]:
get_similar_movie('saving private ryan (1998)')

['saving private ryan (1998)',
 'braveheart (1995)',
 "schindler's list (1993)",
 'shawshank redemption, the (1994)',
 'matrix, the (1999)',
 'fugitive, the (1993)',
 'good will hunting (1997)',
 'boat, the (das boot) (1981)',
 'silence of the lambs, the (1991)',
 'simon sez (1999)',
 'goodfellas (1990)',
 'star wars: episode v - the empire strikes back (1980)',
 'thelma & louise (1991)',
 'terminator 2: judgment day (1991)',
 'back to the future (1985)']

## 8) 내가 가장 좋아할 만한 영화들을 추천받아 봅시다.

In [300]:
user = user_to_idx[6041]

movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

[(341, 0.3085016),
 (41, 0.27281624),
 (153, 0.25504652),
 (379, 0.24294864),
 (477, 0.24128115),
 (699, 0.22297667),
 (196, 0.21732411),
 (299, 0.19680586),
 (1022, 0.18876389),
 (377, 0.18497783),
 (100, 0.16766089),
 (95, 0.16684276),
 (736, 0.16508892),
 (1549, 0.16260092),
 (376, 0.16148345),
 (44, 0.15476152),
 (31, 0.15435083),
 (66, 0.15381825),
 (200, 0.14475632),
 (12, 0.14461334)]

In [301]:
[idx_to_movie[i[0]] for i in movie_recommended]

['patriot, the (2000)',
 "schindler's list (1993)",
 'braveheart (1995)',
 'exorcist, the (1973)',
 'mission: impossible 2 (2000)',
 'godfather, the (1972)',
 'jaws (1975)',
 'good will hunting (1997)',
 'shining, the (1980)',
 'x-men (2000)',
 'fugitive, the (1993)',
 'thelma & louise (1991)',
 'perfect storm, the (2000)',
 'u-571 (2000)',
 'godfather: part ii, the (1974)',
 'back to the future (1985)',
 'erin brockovich (2000)',
 'silence of the lambs, the (1991)',
 'alien (1979)',
 'rain man (1988)']

# Bert4Rec 



In [2]:
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm

import tensorflow as tf

# huggingface
from datasets import Dataset
from transformers import BertForMaskedLM, TFBertForMaskedLM, BertConfig, BertTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import create_optimizer, AdamWeightDecay

In [1]:
!git clone https://github.com/FeiSun/BERT4Rec.git

Cloning into 'BERT4Rec'...
remote: Enumerating objects: 51, done.[K
remote: Total 51 (delta 0), reused 0 (delta 0), pack-reused 51[K
Unpacking objects: 100% (51/51), 65.15 MiB | 12.49 MiB/s, done.


In [12]:
pip install --upgrade pip

Collecting pip
  Downloading pip-22.3.1-py3-none-any.whl (2.1 MB)
     |████████████████████████████████| 2.1 MB 5.8 MB/s            
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.3.1
    Uninstalling pip-21.3.1:
      Successfully uninstalled pip-21.3.1
Successfully installed pip-22.3.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
!pip install tensorflow

[0m

In [3]:
%%bash
./run_ml-1m.sh

Traceback (most recent call last):
  File "/aiffel/aiffel/BERT4Rec/gen_data_fin.py", line 24, in <module>
    flags = tf.flags
AttributeError: module 'tensorflow' has no attribute 'flags'
Traceback (most recent call last):
  File "/aiffel/aiffel/BERT4Rec/run.py", line 23, in <module>
    import optimization
  File "/aiffel/aiffel/BERT4Rec/optimization.py", line 85, in <module>
    class AdamWeightDecayOptimizer(tf.train.Optimizer):
AttributeError: module 'tensorflow._api.v2.train' has no attribute 'Optimizer'


CalledProcessError: Command 'b'./run_ml-1m.sh\n'' returned non-zero exit status 1.

# 회고 

시간 계산이 부족했다 생각 보다 쉽게 진행 할줄 알았는데 데이터 파악이 완전히 되지 않아서 시간을 많이 소비 한것 같다 

그리고 ALS 행렬 인수 분해가 아니라 bert for Recommendatiaon을 이용해서 추가 작업 하였다 원래는 paper with code 에서 베스트 모델을 하려 했으나 transfermer개념을 같이 적용할수있는 추천 시스템을 찾았고 SASRec을 하려고 했다가 결국 Bert4Rec을 진행하기로 했다 

처음에는 코드 구현까지 하려고 했지만 잘 안되서 github올라와있는 코드로 실행하려고 clone까지 했지만 결과는 위에서 처럼 잘 안되었다 
그래도 BERT4Rec을 공부할수있는 시간이라서 개인적으로 만족한다 이론만으로 아쉬워서 코드까지 이해해보려 시간 투자 많이 했지만 
이제는 포기 할때가 된것 같다 이 부분은 추후에 다시 보도록 하겠다 

