In [1]:
import os
import pandas as pd
from implicit.als import AlternatingLeastSquares
import numpy as np
from scipy.sparse import csr_matrix

### 데이터 준비와 전처리

In [2]:
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [4]:
# ratings 컬럼의 이름을 counts로 바꿉니다.
ratings.rename(columns={'ratings':'counts'}, inplace=True)

In [5]:
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [6]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


### 데이터 분석

In [7]:
# 분석을 위해 데이터를 병합합니다.
df = movies.set_index("movie_id") # movie_id를 인덱스로 지정합니다.
ratings = ratings.join(df, on = "movie_id") # movie_id를 기준으로 데이터를 병합합니다.

ratings.head()

Unnamed: 0,user_id,movie_id,counts,timestamp,title,genre
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [8]:
# ratings에 있는 유니크한 영화의 개수
ratings['movie_id'].nunique()

3628

In [9]:
#rating에 있는 유니크한 사용자 수
ratings['user_id'].nunique()

6039

In [10]:
# 가장 인기있는 영화 30개
popular_movies = ratings.groupby('title')['user_id'].count().sort_values(ascending=False).head(30)
print('가장 인기 있는 영화 30개:\n {}'.format(popular_movies))

가장 인기 있는 영화 30개:
 title
American Beauty (1999)                                   3211
Star Wars: Episode IV - A New Hope (1977)                2910
Star Wars: Episode V - The Empire Strikes Back (1980)    2885
Star Wars: Episode VI - Return of the Jedi (1983)        2716
Saving Private Ryan (1998)                               2561
Terminator 2: Judgment Day (1991)                        2509
Silence of the Lambs, The (1991)                         2498
Raiders of the Lost Ark (1981)                           2473
Back to the Future (1985)                                2460
Matrix, The (1999)                                       2434
Jurassic Park (1993)                                     2413
Sixth Sense, The (1999)                                  2385
Fargo (1996)                                             2371
Braveheart (1995)                                        2314
Men in Black (1997)                                      2297
Schindler's List (1993)                       

### 내가 선호하는 영화를 5가지 골라서 ratings 에 추가

In [11]:
using_cols = ['title', 'user_id', 'counts']
ratings = ratings[using_cols]
ratings['title'] = ratings['title'].str.slice(start=0, stop=-7)
    
my_favorite = ['Toy Story', 'Bad Boys', 'Grumpier Old Men', 'Waiting to Exhale', 'Father of the Bride Part II']

my_movielist = pd.DataFrame({"user_id":['my']*5, 'title':my_favorite, 'counts': 5.0})

if not ratings.isin({'user_id':['my']})['user_id'].any():
    ratings = ratings.append(my_movielist)

ratings.tail(10)

Unnamed: 0,title,user_id,counts
1000203,Platoon,6040,3.0
1000205,"Crying Game, The",6040,5.0
1000206,Welcome to the Dollhouse,6040,5.0
1000207,Sophie's Choice,6040,4.0
1000208,E.T. the Extra-Terrestrial,6040,4.0
0,Toy Story,my,5.0
1,Bad Boys,my,5.0
2,Grumpier Old Men,my,5.0
3,Waiting to Exhale,my,5.0
4,Father of the Bride Part II,my,5.0


In [13]:
user_unique = ratings['user_id'].unique()
title_unique = ratings['title'].unique()

user_to_idx={v:k for k, v in enumerate(user_unique)}
title_to_idx={v:k for k, v in enumerate(title_unique)}

print(user_to_idx['my'])
print(title_to_idx['Toy Story'])

6039
40


In [14]:
temp_user_ratings = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_ratings) == len(ratings):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK')
    ratings['user_id'] = temp_user_ratings   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail')

# movie_to_idx을 통해 title 컬럼도 동일한 방식으로 인덱싱해 줍니다. 
temp_movie_ratings = ratings['title'].map(title_to_idx.get).dropna()
if len(temp_movie_ratings) == len(ratings):
    print('movie column indexing OK')
    ratings['title'] = temp_movie_ratings
else:
    print('movie column indexing Fail')

ratings

user_id column indexing OK
movie column indexing OK


Unnamed: 0,title,user_id,counts
0,0,0,5.0
1,1,0,3.0
2,2,0,3.0
3,3,0,4.0
4,4,0,5.0
...,...,...,...
0,40,6039,5.0
1,2029,6039,5.0
2,1844,6039,5.0
3,397,6039,5.0


## CSR matrix 만들기

In [15]:
num_user = ratings['user_id'].nunique()
num_movie = ratings['title'].nunique()

print(ratings.counts)
csr_data = csr_matrix((ratings.counts, (ratings.user_id, ratings.title)), shape=(num_user, num_movie))
csr_data

0    5.0
1    3.0
2    3.0
3    4.0
4    5.0
    ... 
0    5.0
1    5.0
2    5.0
3    5.0
4    5.0
Name: counts, Length: 836483, dtype: float64


<6040x3586 sparse matrix of type '<class 'numpy.float64'>'
	with 834213 stored elements in Compressed Sparse Row format>

### als_model = AlternatingLeastSquares 모델을 구성하고 훈련시키기

In [16]:
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [33]:
als_model=AlternatingLeastSquares(factors=1500, regularization=0.01, use_gpu=False, iterations=100, dtype=np.float32)

In [34]:
# als 모델은 input으로 item X user 꼴의 matrix를 받기 때문에 Transpose
csr_data_transpose = csr_data.T
csr_data_transpose

<3586x6040 sparse matrix of type '<class 'numpy.float64'>'
	with 834213 stored elements in Compressed Sparse Column format>

In [35]:
# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/100 [00:00<?, ?it/s]

In [40]:
my, toystory = user_to_idx['my'], title_to_idx['Toy Story']
my_vector, toystory_vector = als_model.user_factors[my], als_model.item_factors[toystory]

In [41]:
np.dot(my_vector, toystory_vector)

0.99423003

### 내가 좋아하는 영화와 비슷한 영화를 추천 받기

In [42]:
idx_to_title = {v:k for k,v in title_to_idx.items()}

def get_similar_title(title_name: str):
    title_id = title_to_idx[title_name]
    similar_title = als_model.similar_items(title_id)
    similar_title = [idx_to_title[i[0]] for i in similar_title]
    return similar_title

In [43]:
get_similar_title('Toy Story') 

['Toy Story',
 'Soft Toilet Seats',
 'To Have, or Not',
 'An Unforgettable Summer',
 'Truce, The',
 'Jerry & Tom',
 'Slappy and the Stinkers',
 'Circus',
 'For Ever Mozart',
 'Mascara']

In [44]:
get_similar_title('Bad Boys') 

['Bad Boys',
 'Race the Sun',
 'Tough and Deadly',
 'Master Ninja I',
 "Where's Marlowe?",
 'Boy Called Hate, A',
 'American Strays',
 'Yankee Zulu',
 'Bay of Blood (Reazione a catena)',
 'Tokyo Fist']

### 내가 좋아할만한 영화 추천 받기

In [46]:
user = user_to_idx['my']
# recommend에서는 user*item CSR Matrix를 받습니다.
title_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)

In [48]:
[idx_to_title[i[0]] for i in title_recommended]

['With Honors',
 'Mirror Has Two Faces, The',
 'Two if by Sea',
 'Sister Act 2: Back in the Habit',
 'Made in America',
 'Nine Months',
 'Crocodile Dundee II',
 'Soul Food',
 'Boys on the Side',
 'French Kiss',
 'Nothing But Trouble',
 "Preacher's Wife, The",
 'Steamboat Willie',
 'Something to Talk About',
 'Random Hearts',
 'Moonlight and Valentino',
 'Mr. Mom',
 'Indian Summer (a.k.a. Alive & Kicking)',
 'Color of Money, The',
 'Grumpy Old Men']

In [53]:
WH = title_to_idx['Grumpy Old Men']
explain = als_model.explain(user, csr_data, itemid=WH)

In [54]:
[(idx_to_title[i[0]], i[1]) for i in explain[1]]

[('Grumpier Old Men', 0.06975104613910332),
 ('Father of the Bride Part II', 0.0030738983772019067),
 ('Toy Story', 0.000951132087618615),
 ('Bad Boys', -0.0015214256119583868),
 ('Waiting to Exhale', -0.0015564764294123776)]

## 회고

- 영화를 잘 몰라서 추천받은 영화가 내가 좋아하는 영화랑 비슷한가 잘 모르겠다.   
  고른 영화도 toy story 말고는 아는게 안보여서 나머지 4개는 아무거나 골라봤다.   
- 모델 훈련시킬때 factor랑 iterations 값을 여러번 늘려봤다. 늘릴때마다 np.dot로 구한 벡터 내적값은 계속 커져서 1에 가까워졌다.   
  근데 얼마나 더 늘려도 되는지는 잘 모르겠다.   
- csr_matrix 개념이 조금 어려웠다. 좀 찾아보니까 이해는 됐다.