In [49]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

In [50]:
# Directory holding the MovieLens CSV files. The trailing slash is required
# because file names are appended to it by plain string concatenation.
path = '../data/movielens/'

In [51]:
# Read the three MovieLens tables; the movies table is keyed by movieId.
ratings_df = pd.read_csv(path + 'ratings.csv')
tags_df = pd.read_csv(path + 'tags.csv')
movies_df = pd.read_csv(path + 'movies.csv').set_index('movieId')

# Simple Algorithm
- train, test set split
- 사용하는 데이터에 따른 결과비교
- RMSE 이용

In [52]:
# Print the (rows, columns) of the movies table, then show its last three rows.
print(movies_df.shape)
movies_df.iloc[-3:]

(9742, 2)


Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation
193609,Andrew Dice Clay: Dice Rules (1991),Comedy


In [53]:
# Build the rating matrix. Despite the variable name, rows are movieIds and
# columns are userIds; cells without a recorded rating are filled with 0.

user_movie_matrix = (
    ratings_df
    .set_index(['movieId', 'userId'])['rating']
    .unstack('userId')
    .fillna(0)
)

user_movie_matrix.head(3)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0


In [54]:
# Count of rating rows per user (index: userId).
user_info_df = (
    ratings_df.groupby('userId')['movieId']
    .count()
    .to_frame(name='num_user_rated')
)

# Count of rating rows per movie (index: movieId).
movie_info_df = (
    ratings_df.groupby('movieId')['userId']
    .count()
    .to_frame(name='num_movie_rated')
)

In [55]:
# Preview the first three rows of the per-user rating counts.
user_info_df.head(3)

Unnamed: 0_level_0,num_user_rated
userId,Unnamed: 1_level_1
1,232
2,29
3,39


In [56]:
# Preview the first three rows of the per-movie rating counts.
movie_info_df.head(3)

Unnamed: 0_level_0,num_movie_rated
movieId,Unnamed: 1_level_1
1,215
2,110
3,52


## train, test split

In [57]:
# Hold out 20% of the rating rows for evaluation; random_state fixes the split.
# Each part is copied explicitly because prediction columns are added to
# test_df in place further down. Owning the data keeps those assignments from
# touching ratings_df and avoids the SettingWithCopyWarning that pandas can
# emit when columns are assigned on a frame derived from another frame.
train_df, test_df = (
    split.copy()
    for split in train_test_split(ratings_df, test_size=0.2, random_state=0)
)

In [58]:
# Show (rows, columns) for the training split, then for the test split.
print(train_df.shape, test_df.shape, sep='\n')

(80668, 4)
(20168, 4)


In [59]:
# Count the users and movies that appear in the test split but never in train.

only_test_user = len(set(test_df['userId']).difference(train_df['userId']))
only_test_movie = len(set(test_df['movieId']).difference(train_df['movieId']))

print(f'only test 유저 = {only_test_user}')
print(f'only test 영화 = {only_test_movie}')

only test 유저 = 0
only test 영화 = 749


## random하게 예측

In [60]:
# Candidate values for random predictions: 0.5 through 5.0 in steps of 0.5.
ratings_range = np.linspace(0.5, 5.0, num=10)
ratings_range

array([0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. ])

In [61]:
# Baseline: draw one value per test row at random from ratings_range.
# A dedicated generator with a fixed seed keeps this column, and the RMSE
# computed from it, the same on every run instead of changing per execution.
test_df['pred_ratings_random'] = np.random.default_rng(0).choice(
    ratings_range, size=len(test_df)
)

In [62]:
# Score the random baseline: RMSE is the square root of the mean squared error
# between the true ratings and the random predictions.
random_mse = mean_squared_error(
    y_true=test_df['rating'],
    y_pred=test_df['pred_ratings_random'],
)
random_rmse = np.sqrt(random_mse)

print(f'랜덤하게 예측한 경우 RMSE = {random_rmse:.2f}')

랜덤하게 예측한 경우 RMSE = 1.92


## 영화 평균 평점기반 예측하기
- train에 있는 영화는 train의 평균값 이용
- 없으면 random하게 예측

In [63]:
# Column means of the training split grouped by movie (index: movieId).
# NOTE(review): only the 'rating' column is read by the prediction helper in
# this notebook; the averaged userId/timestamp columns carry no meaning.
train_movie_avg_df = train_df.groupby(by='movieId').agg('mean')
train_movie_avg_df.iloc[:3]

Unnamed: 0_level_0,userId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,310.583333,3.922222,1130660000.0
2,325.446809,3.420213,1134787000.0
3,282.230769,3.24359,1009713000.0


In [64]:
def movieAvg_rating_prediction(x, avg_df=None, candidates=None):
    """Predict a rating for movie ``x`` from per-movie training averages.

    Parameters
    ----------
    x : hashable
        movieId to predict for.
    avg_df : pandas.DataFrame, optional
        Frame indexed by movieId with a 'rating' column. Defaults to the
        notebook-level ``train_movie_avg_df``.
    candidates : array-like, optional
        Values to sample from when ``x`` is not in ``avg_df.index``.
        Defaults to the notebook-level ``ratings_range``.

    Returns
    -------
    scalar
        ``avg_df.loc[x, 'rating']`` when ``x`` is in the index, otherwise a
        single value drawn at random from ``candidates``.
    """
    # Resolve the notebook-level defaults at call time so the one-argument
    # form used with Series.map keeps working unchanged.
    if avg_df is None:
        avg_df = train_movie_avg_df
    if candidates is None:
        candidates = ratings_range

    if x in avg_df.index:
        return avg_df.loc[x, 'rating']

    # No size argument: np.random.choice then returns a scalar. Asking for
    # size 1 returned a length-1 array, which left array objects mixed in
    # with floats in the mapped prediction column.
    return np.random.choice(candidates)

In [65]:
# Predict every test row from its movieId via the per-movie training average;
# movies missing from the training averages get a random rating in the helper.
test_df['pred_ratings_movieAvg'] = test_df['movieId'].map(movieAvg_rating_prediction)

In [66]:
# Preview the test split with the prediction columns added so far.
test_df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_ratings_movieAvg
41008,276,780,5.0,858350384,4.5,3.448171
94274,599,7624,2.5,1519235950,3.5,3.0
77380,483,1320,2.5,1215895327,3.5,3.092105


In [67]:
# RMSE of the per-movie-average predictor. Stored under its own names so the
# random-baseline metrics (random_mse / random_rmse) are not overwritten and
# the different approaches stay comparable side by side.
movie_avg_mse = mean_squared_error(test_df['rating'], test_df['pred_ratings_movieAvg'])
movie_avg_rmse = np.sqrt(movie_avg_mse)

print(f'영화별 평균으로 예측한 경우 RMSE = {movie_avg_rmse:.2f}')

영화별 평균으로 예측한 경우 RMSE = 1.03


## 유저 평균 평점기반 예측하기
- train에 있는 유저별 평균값 이용
- 없으면 random하게 예측

In [68]:
# Column means of the training split grouped by user (index: userId).
# NOTE(review): only the 'rating' column is read by the prediction helper in
# this notebook; the averaged movieId/timestamp columns carry no meaning.
train_user_avg_df = train_df.groupby(by='userId').agg('mean')
train_user_avg_df.iloc[:3]

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1857.067039,4.346369,964986500.0
2,68653.076923,3.903846,1445715000.0
3,8303.4,2.483333,1306464000.0


In [69]:
def userAvg_rating_prediction(x, avg_df=None, candidates=None):
    """Predict a rating for user ``x`` from per-user training averages.

    Parameters
    ----------
    x : hashable
        userId to predict for.
    avg_df : pandas.DataFrame, optional
        Frame indexed by userId with a 'rating' column. Defaults to the
        notebook-level ``train_user_avg_df``.
    candidates : array-like, optional
        Values to sample from when ``x`` is not in ``avg_df.index``.
        Defaults to the notebook-level ``ratings_range``.

    Returns
    -------
    scalar
        ``avg_df.loc[x, 'rating']`` when ``x`` is in the index, otherwise a
        single value drawn at random from ``candidates``.
    """
    # Resolve the notebook-level defaults at call time so the one-argument
    # form used with Series.map keeps working unchanged.
    if avg_df is None:
        avg_df = train_user_avg_df
    if candidates is None:
        candidates = ratings_range

    if x in avg_df.index:
        return avg_df.loc[x, 'rating']

    # No size argument: np.random.choice then returns a scalar. Asking for
    # size 1 returned a length-1 array, which left array objects mixed in
    # with floats in the mapped prediction column.
    return np.random.choice(candidates)

In [73]:
# Predict every test row from its userId via the per-user training average;
# users missing from the training averages get a random rating in the helper.
test_df['pred_ratings_userAvg'] = test_df['userId'].map(userAvg_rating_prediction)

In [74]:
# Preview the test split with the prediction columns added so far.
test_df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp,pred_ratings_random,pred_ratings_movieAvg,pred_ratings_userAvg
41008,276,780,5.0,858350384,4.5,3.448171,4.361111
94274,599,7624,2.5,1519235950,3.5,3.0,2.641414
77380,483,1320,2.5,1215895327,3.5,3.092105,3.628664


In [75]:
# RMSE of the per-user-average predictor. Stored under its own names so the
# random-baseline metrics (random_mse / random_rmse) are not overwritten.
user_avg_mse = mean_squared_error(test_df['rating'], test_df['pred_ratings_userAvg'])
user_avg_rmse = np.sqrt(user_avg_mse)

# The label previously said "per-movie average" (copy-paste slip); this cell
# evaluates the per-user average, so the message now says so.
print(f'유저별 평균으로 예측한 경우 RMSE = {user_avg_rmse:.2f}')

영화별 평균으로 예측한 경우 RMSE = 0.95
