In [4]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

### MovieLens 데이터셋 불러오기

In [2]:
root_path = os.getcwd()
path = os.path.join(root_path, 'data/ml-latest-small/')

In [3]:
ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding='utf-8')
tags_df = pd.read_csv(os.path.join(path, 'tags.csv'), encoding='utf-8')
movies_df = pd.read_csv(os.path.join(path, 'movies.csv'), index_col='movieId', encoding='utf-8')

In [6]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [7]:
movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


### Genres 를 이용한 movie representation

In [35]:
total_count = len(movies_df.index)
total_genres = list(set([genre for sublist in list(map(lambda x: x.split('|'), movies_df['genres'])) for genre in sublist]))
total_genres

['Mystery',
 'Documentary',
 '(no genres listed)',
 'Fantasy',
 'Thriller',
 'Sci-Fi',
 'Children',
 'War',
 'Romance',
 'Western',
 'IMAX',
 'Crime',
 'Adventure',
 'Drama',
 'Musical',
 'Action',
 'Horror',
 'Comedy',
 'Film-Noir',
 'Animation']

In [40]:
genre_count = dict.fromkeys(total_genres)

for each_genre_list in movies_df['genres']:
    for genre in each_genre_list.split('|'):
        if genre_count[genre] == None:
            genre_count[genre] = 1
        else:
            genre_count[genre] = genre_count[genre] + 1

In [41]:
genre_count

{'Mystery': 573,
 'Documentary': 440,
 '(no genres listed)': 34,
 'Fantasy': 779,
 'Thriller': 1894,
 'Sci-Fi': 980,
 'Children': 664,
 'War': 382,
 'Romance': 1596,
 'Western': 167,
 'IMAX': 158,
 'Crime': 1199,
 'Adventure': 1263,
 'Drama': 4361,
 'Musical': 334,
 'Action': 1828,
 'Horror': 978,
 'Comedy': 3756,
 'Film-Noir': 87,
 'Animation': 611}

In [42]:
for each_genre in genre_count:
    genre_count[each_genre] = np.log10(total_count/genre_count[each_genre])
    
genre_count

{'Mystery': 1.2304935032683613,
 'Documentary': 1.3451954487495636,
 '(no genres listed)': 2.457169208193496,
 'Fantasy': 1.0971106675631865,
 'Thriller': 0.7112681505684965,
 'Sci-Fi': 0.9974220495432563,
 'Children': 1.1664800458677336,
 'War': 1.4065847623240424,
 'Romance': 0.7856152382210405,
 'Western': 1.7659316540881678,
 'IMAX': 1.7899910382813284,
 'Crime': 0.9098289421369025,
 'Adventure': 0.8872447746804204,
 'Drama': 0.3490620385623247,
 'Musical': 1.4649016584241867,
 'Action': 0.7266719338379385,
 'Horror': 0.9983092704481497,
 'Comedy': 0.4139225416416778,
 'Film-Noir': 2.0491288726171324,
 'Animation': 1.2026069149931968}

### genre 를 이용한 Movie representation 생성

In [43]:
genre_representation = pd.DataFrame(columns=sorted(total_genres), index=movies_df.index)

for index, each_row in tqdm(movies_df.iterrows()):
    dict_temp = {i: genre_count[i] for i in each_row['genres'].split('|')}
    row_to_add = pd.DataFrame(dict_temp, index=[index])
    genre_representation.update(row_to_add)

9742it [00:31, 312.03it/s]


In [44]:
genre_representation

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,,,0.887245,1.202607,1.16648,0.413923,,,,1.097111,,,,,,,,,,
2,,,0.887245,,1.16648,,,,,1.097111,,,,,,,,,,
3,,,,,,0.413923,,,,,,,,,,0.785615,,,,
4,,,,,,0.413923,,,0.349062,,,,,,,0.785615,,,,
5,,,,,,0.413923,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,,0.726672,,1.202607,,0.413923,,,,1.097111,,,,,,,,,,
193583,,,,1.202607,,0.413923,,,,1.097111,,,,,,,,,,
193585,,,,,,,,,0.349062,,,,,,,,,,,
193587,,0.726672,,1.202607,,,,,,,,,,,,,,,,


### Tag를 이용한 Movie representation 생성

In [46]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [49]:
movies_df.loc[60756] # Comedy -> funny : Reasonable !

title     Step Brothers (2008)
genres                  Comedy
Name: 60756, dtype: object

In [59]:
# get unique tag
tag_column = list(map(lambda x: x.split(','), tags_df['tag']))
unique_tags = list(set(list(map(lambda x:x.strip(), list([tag for sublist in tag_column for tag in sublist])))))
print(len(unique_tags))

1589


In [62]:
### Compute IDF for tag
total_movie_count = len(set(tags_df['movieId']))
# key: tag, value: number of movies with such tag
tag_count_dict = dict.fromkeys(unique_tags)

for each_movie_tag_list in tags_df['tag']:
    for tag in each_movie_tag_list.split(','):
        if tag_count_dict[tag.strip()] == None:
            tag_count_dict[tag.strip()] = 1
        else:
            tag_count_dict[tag.strip()] += 1

tag_idf = dict()
for each_tag in tag_count_dict:
    tag_idf[each_tag] = np.log10(total_movie_count / tag_count_dict[each_tag])

tag_idf

{'Sean Connery': 3.196452541703389,
 'Tarantino': 2.7193312869837265,
 'Stoner Movie': 3.196452541703389,
 'Jude Law': 2.895422546039408,
 'stupid': 3.196452541703389,
 'unusual': 3.196452541703389,
 'Hungary': 3.196452541703389,
 'Boxing story': 3.196452541703389,
 'black-and-white': 3.196452541703389,
 'lawn mower': 3.196452541703389,
 'masculinity': 3.196452541703389,
 'terminal illness': 2.7193312869837265,
 'insomnia': 3.196452541703389,
 'wonderwoman': 3.196452541703389,
 'missing children': 3.196452541703389,
 'Everything you want is here': 3.196452541703389,
 'dumpster diving': 3.196452541703389,
 'family': 2.351354501689132,
 'wapendrama': 3.196452541703389,
 'iconic': 3.196452541703389,
 'celebrity fetishism': 3.196452541703389,
 'sentimental': 2.895422546039408,
 'intense': 2.5943925503754266,
 'homosexuality': 3.196452541703389,
 'stupid but funny': 3.196452541703389,
 'King Arthur': 2.5943925503754266,
 'scenic': 3.196452541703389,
 'wizards': 3.196452541703389,
 'Tolstoy'

In [65]:
# create tag representation
tag_representation = pd.DataFrame(columns=sorted(unique_tags), index=list(set(tags_df['movieId'])))

for name, group in tqdm(tags_df.groupby(by='movieId')):
    temp_list = list(map(lambda x:x.split(','), list(group['tag'])))
    temp_tag_list = list(set(list(map(lambda x:x.strip(), list([tag for sublist in temp_list for tag in sublist])))))

    dict_temp = {i: tag_idf[i.strip()] for i in temp_tag_list}
    row_to_add = pd.DataFrame(dict_temp, index=[group['movieId'].values[0]])
    tag_representation.update(row_to_add)
    
tag_representation = tag_representation.sort_index(0)
tag_representation

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1572/1572 [02:55<00:00,  8.96it/s]


Unnamed: 0,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001-like,...,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183611,,,,,,,,,,,...,,,,,,,,,,
184471,,,,,,,,,,,...,,,,,,,,,,
187593,,,,,,,,,,,...,,,,,,,,,,
187595,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# tag representation 확인

In [66]:
movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [67]:
tag_representation.loc[1].dropna()

fun      2.497483
pixar    2.895423
Name: 1, dtype: object

In [68]:
tag_representation.loc[2].dropna()

Robin Williams      2.719331
fantasy             2.418301
game                3.196453
magic board game    3.196453
Name: 2, dtype: object

### Fianl Movie Representation 생성

In [70]:
movie_representation = pd.concat([genre_representation, tag_representation], axis=1).fillna(0)
print(movie_representation.shape)
movie_representation.describe()

(9742, 1609)


Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
count,9742.0,9742.0,9742.0,9742.0,9742.0,9742.0,9742.0,9742.0,9742.0,9742.0,...,9742.0,9742.0,9742.0,9742.0,9742.0,9742.0,9742.0,9742.0,9742.0,9742.0
mean,0.008576,0.136354,0.115027,0.075425,0.079506,0.159587,0.111978,0.060756,0.156257,0.087728,...,0.000328,0.000328,0.000328,0.000837,0.000328,0.000328,0.000328,0.000328,0.000328,0.001241
std,0.144915,0.283726,0.298052,0.291593,0.293989,0.201476,0.298916,0.279366,0.173581,0.297591,...,0.032385,0.032385,0.032385,0.047715,0.032385,0.032385,0.032385,0.032385,0.032385,0.054775
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.413923,0.0,0.0,0.349062,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.457169,0.726672,0.887245,1.202607,1.16648,0.413923,0.909829,1.345195,0.349062,1.097111,...,3.196453,3.196453,3.196453,2.719331,3.196453,3.196453,3.196453,3.196453,3.196453,2.418301


### 콘텐츠 유사도 평가 (cosine similarity)

In [71]:
from sklearn.metrics.pairwise import cosine_similarity

def cos_sim_matrix(a, b):
    cos_sim = cosine_similarity(a, b)
    result_df = pd.DataFrame(data=cos_sim, index=[a.index])
    return result_df

In [72]:
cs_df = cos_sim_matrix(movie_representation, movie_representation)
cs_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
1,1.0,0.124438,0.008403,0.040571,0.011755,0.0,0.016339,0.331122,0.0,0.131794,...,0.064466,0.260941,0.071492,0.27171,0.0,0.348295,0.379492,0.0,0.232553,0.093519
2,0.124438,1.0,0.0,0.0,0.0,0.0,0.0,0.240843,0.0,0.095861,...,0.0,0.0,0.0,0.0,0.0,0.108082,0.117763,0.0,0.0,0.0
3,0.008403,0.0,1.0,0.179391,0.011294,0.0,0.072246,0.0,0.0,0.0,...,0.00656,0.0,0.068686,0.0,0.0,0.020322,0.022142,0.0,0.0,0.089849
4,0.040571,0.0,0.179391,1.0,0.05453,0.0,0.348828,0.0,0.0,0.0,...,0.031674,0.101979,0.567487,0.0,0.0,0.098119,0.106908,0.365843,0.0,0.433821
5,0.011755,0.0,0.011294,0.05453,1.0,0.0,0.640342,0.0,0.0,0.0,...,0.009177,0.0,0.096091,0.0,0.0,0.028429,0.030976,0.0,0.0,0.125697


In [81]:
cs_df[1].sort_values(ascending=False)

2         1.000000
46972     0.322201
158813    0.300850
119655    0.300850
80748     0.300850
            ...   
4921      0.000000
4920      0.000000
4919      0.000000
4917      0.000000
193609    0.000000
Name: 1, Length: 9742, dtype: float64

### 유사도 평가 결과

In [85]:
print(movies_df.loc[2])
print(movies_df.loc[46972])
print(movies_df.loc[158813])
print(movies_df.loc[80748])

title                 Jumanji (1995)
genres    Adventure|Children|Fantasy
Name: 2, dtype: object
title     Night at the Museum (2006)
genres    Action|Comedy|Fantasy|IMAX
Name: 46972, dtype: object
title     Alice Through the Looking Glass (2016)
genres                Adventure|Children|Fantasy
Name: 158813, dtype: object
title     Alice in Wonderland (1933)
genres    Adventure|Children|Fantasy
Name: 80748, dtype: object


### 성능평가

In [86]:
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=1234)
print(train_df.shape)
print(test_df.shape)

(80668, 4)
(20168, 4)


In [88]:
test_userids = list(set(test_df.userId.values))

In [93]:
result_df = pd.DataFrame()

for user_id in tqdm(test_userids):
    user_record_df = train_df.loc[train_df.userId == int(user_id), :]
    
    user_sim_df = cs_df.loc[user_record_df['movieId']] # (n, 9742) 차원 : n은 유저가 평점을 매긴 영화수
    user_ratings_df = user_record_df[['rating']] # (n, 1) 차원
    sim_sum = np.sum(user_sim_df.T.to_numpy(), -1) # (9742, 1) # 유저가 매긴 영화유사도 합
    
    prediction = np.matmul(user_sim_df.T.to_numpy(), user_ratings_df.to_numpy()).flatten() / (sim_sum + 1)
    
    prediction_df = pd.DataFrame(prediction, index=cs_df.index).reset_index()
    prediction_df.columns = ['movieId', 'pred_rating']
    prediction_df = prediction_df[['movieId', 'pred_rating']][prediction_df.movieId.isin(test_df[test_df.userId == user_id]['movieId'].values)]
    
    temp_df = prediction_df.merge(test_df[test_df.userId == user_id], on='movieId')
    result_df = pd.concat([result_df, temp_df], axis=0)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 610/610 [00:06<00:00, 92.27it/s]


In [96]:
result_df.head(10)

Unnamed: 0,movieId,pred_rating,userId,rating,timestamp
0,1,4.145652,1,4.0,964982703
1,50,3.650755,1,5.0,964982931
2,216,2.670124,1,5.0,964981208
3,223,2.612844,1,3.0,964980985
4,231,4.215284,1,5.0,964981179
5,235,3.61982,1,4.0,964980908
6,316,4.136756,1,3.0,964982310
7,457,3.218743,1,5.0,964981909
8,543,3.729524,1,4.0,964981179
9,592,4.024728,1,4.0,964982271


In [98]:
mse = mean_squared_error(y_true=result_df['rating'].values, y_pred=result_df['pred_rating'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

1.40606646706041 1.1857767357561078
