In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Dataset

In [2]:
path = '../data/movielens/'

ratings_df = pd.read_csv(path + 'ratings.csv')
tags_df    = pd.read_csv(path + 'tags.csv')
movies_df  = pd.read_csv(path + 'movies.csv', index_col='movieId') 

# Genre를 이용한 movie representation
- genre TF-IDF representation

In [15]:
from collections import defaultdict

In [20]:
# DF값을 구하기 위해 장르 갯수 count
genre_count = defaultdict(int)

for genres in movies_df['genres'].map(lambda x: x.split('|')):
    for genre in genres:
        genre_count[genre] += 1

In [22]:
print(f'총 장르의 갯수 = {len(genre_count)}')

genre_count

총 장르의 갯수 = 20


defaultdict(int,
            {'Adventure': 1263,
             'Animation': 611,
             'Children': 664,
             'Comedy': 3756,
             'Fantasy': 779,
             'Romance': 1596,
             'Drama': 4361,
             'Action': 1828,
             'Crime': 1199,
             'Thriller': 1894,
             'Horror': 978,
             'Mystery': 573,
             'Sci-Fi': 980,
             'War': 382,
             'Musical': 334,
             'Documentary': 440,
             'IMAX': 158,
             'Western': 167,
             'Film-Noir': 87,
             '(no genres listed)': 34})

In [23]:
# genre IDF의 값을 구한다

total_count = len(movies_df)
genre_IDF = {}

for genre in genre_count.keys():
    genre_IDF[genre] = np.log10(total_count / genre_count[genre])

genre_IDF

{'Adventure': 0.8872447746804204,
 'Animation': 1.2026069149931968,
 'Children': 1.1664800458677336,
 'Comedy': 0.41392254164167785,
 'Fantasy': 1.0971106675631868,
 'Romance': 0.7856152382210405,
 'Drama': 0.3490620385623247,
 'Action': 0.7266719338379385,
 'Crime': 0.9098289421369025,
 'Thriller': 0.7112681505684965,
 'Horror': 0.9983092704481497,
 'Mystery': 1.2304935032683613,
 'Sci-Fi': 0.9974220495432562,
 'War': 1.4065847623240424,
 'Musical': 1.4649016584241867,
 'Documentary': 1.3451954487495636,
 'IMAX': 1.7899910382813284,
 'Western': 1.7659316540881678,
 'Film-Noir': 2.0491288726171324,
 '(no genres listed)': 2.457169208193496}

In [26]:
movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [29]:
movie_genre_matrix = movies_df['genres'].str.get_dummies(sep='|')
movie_genre_matrix.head()

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [30]:
# TF-IDF representation
for col in movie_genre_matrix.columns:
    movie_genre_matrix[col] *= genre_IDF[col]

movie_genre_matrix.head()

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.0,0.0,0.887245,1.202607,1.16648,0.413923,0.0,0.0,0.0,1.097111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.887245,0.0,1.16648,0.0,0.0,0.0,0.0,1.097111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.413923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.785615,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.413923,0.0,0.0,0.349062,0.0,0.0,0.0,0.0,0.0,0.0,0.785615,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.413923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Tag를 이용한 movie representation

In [36]:
print(tags_df.shape)
tags_df.head()

(3683, 4)


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [32]:
# DF값을 구하기 위해 tag 갯수 count
tag_count = defaultdict(int)

for tags in tags_df['tag'].map(lambda x: x.split(',')):
    for tag in tags:
        tag_count[tag] += 1

In [35]:
print(f'총 태그의 갯수 = {len(tag_count)}')

총 태그의 갯수 = 1589


In [60]:
movie_tag_matrix = pd.DataFrame(
    index=list(set(tags_df['movieId'])),
    columns=tag_count.keys()
).fillna(0)

for _, group in tqdm(tags_df.groupby('movieId')):
    movieId_idx = group['movieId'].unique()
    for tag in group['tag'].values:
        movie_tag_matrix.loc[movieId_idx, tag] += 1

100%|██████████| 1572/1572 [00:02<00:00, 567.59it/s]


In [61]:
print(movie_tag_matrix.shape)
movie_tag_matrix.head()

(1572, 1589)


Unnamed: 0,funny,Highly quotable,will ferrell,Boxing story,MMA,Tom Hardy,drugs,Leonardo DiCaprio,Martin Scorsese,way too long,...,societal criticism,TERRORISM,British,Romans,70mm,for katie,austere,gun fu,heroic bloodshed,Heroic Bloodshed
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
122882,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
# 잘된건가?
movie_tag_matrix.sum()[:5]

funny              23
Highly quotable     3
will ferrell        4
Boxing story        1
MMA                 1
dtype: int64

In [64]:
# 된것같다
tag_count['funny']

23

In [67]:
# tag IDF의 값을 구한다

total_count = len(tags_df)
tag_IDF = {}

for tag in tag_count.keys():
    tag_IDF[tag] = np.log10(total_count / tag_count[tag])

In [68]:
# TF-IDF representation
for col in movie_tag_matrix.columns:
    movie_tag_matrix[col] *= tag_IDF[col]

movie_tag_matrix.head()

Unnamed: 0,funny,Highly quotable,will ferrell,Boxing story,MMA,Tom Hardy,drugs,Leonardo DiCaprio,Martin Scorsese,way too long,...,societal criticism,TERRORISM,British,Romans,70mm,for katie,austere,gun fu,heroic bloodshed,Heroic Bloodshed
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
122882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Final movie representation

In [69]:
print(movie_genre_matrix.shape)
print(movie_tag_matrix.shape)

# tag가 없는 영화가 많다.

(9742, 20)
(1572, 1589)


In [71]:
movie_representation = pd.concat([movie_genre_matrix, movie_tag_matrix], axis=1).fillna(0)

print(movie_representation.shape)

(9742, 1609)


In [80]:
movie_representation.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,societal criticism,TERRORISM,British,Romans,70mm,for katie,austere,gun fu,heroic bloodshed,Heroic Bloodshed
1,0.0,0.0,0.887245,1.202607,1.16648,0.413923,0.0,0.0,0.0,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.887245,0.0,1.16648,0.0,0.0,0.0,0.0,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.413923,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.413923,0.0,0.0,0.349062,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.413923,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 유사도 평가

In [81]:
from sklearn.metrics.pairwise import cosine_similarity

def cos_sim_matrix(a, b):
    cos_sim = cosine_similarity(a, b)
    result_df = pd.DataFrame(data=cos_sim, index=a.index, columns=b.index)

    return result_df

In [82]:
cs_df = cos_sim_matrix(movie_representation, movie_representation)

In [83]:
print(cs_df.shape)
cs_df.head()

(9742, 9742)


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
1,1.0,0.066058,0.004477,0.024031,0.006015,0.0,0.008444,0.196133,0.0,0.078065,...,0.034027,0.154562,0.042347,0.160942,0.0,0.206305,0.224784,0.0,0.137748,0.055394
2,0.066058,1.0,0.0,0.0,0.0,0.0,0.0,0.215845,0.0,0.085911,...,0.0,0.0,0.0,0.0,0.0,0.096864,0.10554,0.0,0.0,0.0
3,0.004477,0.0,1.0,0.161382,0.008777,0.0,0.056704,0.0,0.0,0.0,...,0.005259,0.0,0.061791,0.0,0.0,0.018281,0.019919,0.0,0.0,0.080829
4,0.024031,0.0,0.161382,1.0,0.047108,0.0,0.304339,0.0,0.0,0.0,...,0.028226,0.101979,0.567487,0.0,0.0,0.098119,0.106908,0.365843,0.0,0.433821
5,0.006015,0.0,0.008777,0.047108,1.0,0.0,0.652762,0.0,0.0,0.0,...,0.007065,0.0,0.083012,0.0,0.0,0.02456,0.02676,0.0,0.0,0.108589


In [85]:
cs_df[1].sort_values(ascending=False)[:5]

1         1.000000
122918    0.371489
3754      0.298322
53121     0.298322
4886      0.298322
Name: 1, dtype: float64

In [88]:
for i in cs_df[1].sort_values(ascending=False)[:5].index:
    print(movies_df.loc[i], end='\n\n')

title                                Toy Story (1995)
genres    Adventure|Animation|Children|Comedy|Fantasy
Name: 1, dtype: object

title     Guardians of the Galaxy 2 (2017)
genres             Action|Adventure|Sci-Fi
Name: 122918, dtype: object

title     Adventures of Rocky and Bullwinkle, The (2000)
genres       Adventure|Animation|Children|Comedy|Fantasy
Name: 3754, dtype: object

title                          Shrek the Third (2007)
genres    Adventure|Animation|Children|Comedy|Fantasy
Name: 53121, dtype: object

title                           Monsters, Inc. (2001)
genres    Adventure|Animation|Children|Comedy|Fantasy
Name: 4886, dtype: object



# 예측, 성능 평가

In [89]:
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=0)

In [104]:
train_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
77701,483,8529,4.0,1215545278
94477,599,33437,2.5,1498518389
36246,247,5349,2.0,1467645405
17483,111,7361,3.5,1516140853
100300,610,57504,4.5,1493847901


In [114]:
test_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
41008,276,780,5.0,858350384
94274,599,7624,2.5,1519235950
77380,483,1320,2.5,1215895327
29744,202,3448,3.0,974924072
40462,274,60291,4.0,1296947017


In [115]:
test_df['pred_rating_TFIDF'] = 0

In [119]:
for i in tqdm(range(len(test_df))):
    # 예측할 유저와 영화
    tmp = test_df.iloc[i, :]

    # 예측할 유저의 train 영화 rating에
    # similarity를 weight로 하여 가중평균내서 예측
    user_info = train_df.loc[train_df['userId'] == tmp['userId'], ['movieId', 'rating']]
    user_rated_movieID = user_info['movieId'].values
    user_rated_rating  = user_info['rating'].values
    sim_weight = cs_df.loc[tmp['movieId'], user_rated_movieID].values
    # 가중평균
    result = np.sum(sim_weight * user_rated_rating / (np.sum(sim_weight) + 1))

    # 결과 저장
    test_df.iloc[i, -1] = result

100%|██████████| 20168/20168 [00:29<00:00, 686.02it/s]


In [120]:
test_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,pred_rating_TFIDF
41008,276,780,5.0,858350384,3.261812
94274,599,7624,2.5,1519235950,2.706046
77380,483,1320,2.5,1215895327,3.268273
29744,202,3448,3.0,974924072,3.702311
40462,274,60291,4.0,1296947017,3.127068


In [121]:
mse = mean_squared_error(test_df['rating'].values, test_df['pred_rating_TFIDF'].values)
rmse = np.sqrt(mse)

print(f'TFIDF를 이용한 결과 RMSE = {rmse:.2f}')

TFIDF를 이용한 결과 RMSE = 1.21
