# Movie Recommender System 

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib as mlt
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import spatial

### 1. Dataset

In [2]:
rating_df = pd.read_csv("ratings_small.csv")

In [3]:
rating_df.drop('timestamp', axis=1, inplace=True)

In [4]:
print(rating_df.shape)
rating_df.head()

(100004, 3)


Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


### 2. Explore Dataset

In [5]:
unique_user = rating_df['userId'].unique()
print(len(unique_user))
# print(unique_user)

671


In [6]:
unique_movie = rating_df['movieId'].unique()
len(unique_movie)

9066

In [7]:
unique_rating = rating_df['rating'].unique()
unique_rating

array([ 2.5,  3. ,  2. ,  4. ,  3.5,  1. ,  5. ,  4.5,  1.5,  0.5])

In [8]:
# rating
rating_count = rating_df.groupby('rating').size().reset_index(name='rating_count')
rating_count

Unnamed: 0,rating,rating_count
0,0.5,1101
1,1.0,3326
2,1.5,1687
3,2.0,7271
4,2.5,4449
5,3.0,20064
6,3.5,10538
7,4.0,28750
8,4.5,7723
9,5.0,15095


In [16]:
# userID
user_count_df = rating_df.groupby('userId').size().reset_index(name='user_count')
user_count_df = user_count_df.sort_values(by=['user_count'], ascending=False)
user_count_df

Unnamed: 0,userId,user_count
546,547,2391
563,564,1868
623,624,1735
14,15,1700
72,73,1610
451,452,1340
467,468,1291
379,380,1063
310,311,1019
29,30,1011


In [17]:
movie_count_df = rating_df.groupby('movieId').size().reset_index(name='movie_count')
movie_count_df = movie_count_df.sort_values(by='movie_count', ascending=False)
movie_count_df

Unnamed: 0,movieId,movie_count
321,356,341
266,296,324
284,318,311
525,593,304
232,260,291
427,480,274
2062,2571,259
0,1,247
472,527,244
522,589,237


### 3. Preprocessing

필터링
- user의 최소 평가 수 (user_limit)
- movie의 최소 평가 수

In [18]:
user_limit, movie_limit = 100, 100

In [19]:
filtered_userID = list(user_count_df[user_count_df['user_count'] > user_limit]['userId'])

In [20]:
print(len(filtered_userID))
print(filtered_userID[:5])

258
[547, 564, 624, 15, 73]


In [21]:
filtered_movieID = list(movie_count_df[movie_count_df['movie_count'] > movie_limit]['movieId'])

In [22]:
print(len(filtered_movieID))
print(filtered_movieID[:5])

149
[356, 296, 318, 593, 260]


In [23]:
filtered_df = rating_df[rating_df['userId'].isin(filtered_userID)]
filtered_df = filtered_df[filtered_df['movieId'].isin(filtered_movieID)]
print(filtered_df.shape)
filtered_df.tail(3)

(15567, 3)


Unnamed: 0,userId,movieId,rating
99987,671,5349,4.0
99989,671,5445,4.5
99994,671,5952,5.0


### 4. Pivoting Filtered Dataset

In [24]:
user_df = filtered_df.pivot_table(values='rating', index=['userId'], columns=['movieId'], aggfunc=np.average, fill_value=0, dropna=False)

In [25]:
user_df.head()

movieId,1,2,6,10,25,32,34,36,39,47,...,6377,6539,6874,7153,7361,7438,8961,33794,58559,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.0,0.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,5.0,...,4.0,0.0,5.0,4.0,4.0,4.0,0.0,4.5,0.0,0.0
15,2.0,2.0,4.0,3.0,3.0,4.0,3.0,1.0,2.5,5.0,...,1.0,3.5,1.0,1.5,5.0,0.5,2.0,4.5,4.5,5.0
17,0.0,0.0,4.5,0.0,4.5,4.5,0.0,4.5,0.0,5.0,...,0.0,0.0,4.5,4.5,4.5,4.5,0.0,0.0,0.0,0.0
19,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
def cosine_similarity(vector_1, vector_2):
    
    idx = vector_1.nonzero()[0]
     
    # 모든 idx가 0이어서 전부 drop될 수 있기 때문에, 에러를 방지
    if len(idx) == 0 :
        return 0
    
    #각 벡터들을 어레이화 한다.
    vector_1, vector_2 = np.array(vector_1)[idx], np.array(vector_2)[idx]
    
    
    # vector_2 동일한 과정
    idx = vector_2.nonzero()[0]
    if len(idx) == 0 :
        return 0
    vector_1, vector_2 = np.array(vector_1)[idx], np.array(vector_2)[idx]
    
    
    # 코사인 유사도 계산
    return 1 - spatial.distance.cosine(vector_1, vector_2)

In [61]:
def euclidean_similarity(vector_1, vector_2):
    
    idx = vector_1.nonzero()[0]
     
    # 모든 idx가 0이어서 전부 drop될 수 있기 때문에, 에러를 방지
    if len(idx) == 0 :
        return 0
    
    #각 벡터들을 어레이화 한다.
    vector_1, vector_2 = np.array(vector_1)[idx], np.array(vector_2)[idx]
    
    
    # vector_2 동일한 과정
    idx = vector_2.nonzero()[0]
    if len(idx) == 0 :
        return 0
    vector_1, vector_2 = np.array(vector_1)[idx], np.array(vector_2)[idx]
    
    
    # 유클리디안 유사도 계산
    return np.linalg.norm(vector_1 - vector_2)

In [62]:
sm_df_eu = similarity_matrix(user_df,euclidean_similarity)
sm_df_eu.head()

userId,4,8,15,17,19,21,22,23,26,30,...,647,648,652,654,655,656,659,664,665,671
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.0,4.821825,12.308534,8.01561,6.557439,7.071068,6.422616,7.713624,5.59017,7.81025,...,3.741657,7.697402,2.236068,4.690416,4.153312,1.0,5.196152,5.220153,7.681146,4.743416
8,4.821825,0.0,12.893797,7.106335,4.795832,3.162278,6.745369,5.958188,5.291503,7.017834,...,4.062019,6.519202,2.291288,5.722762,4.1833,0.866025,4.0,3.968627,5.338539,4.716991
15,12.308534,12.893797,0.0,14.256577,11.0,8.803408,13.35103,15.572412,12.951834,17.31329,...,7.648529,13.856406,8.717798,21.142375,11.83216,9.874209,9.26013,14.40486,14.3527,14.017846
17,8.01561,7.106335,14.256577,0.0,8.0,7.516648,8.440972,9.26013,6.324555,11.07926,...,6.480741,9.082951,5.656854,11.989579,5.315073,3.605551,6.63325,8.42615,9.420722,7.937254
19,6.557439,4.795832,11.0,8.0,0.0,7.81025,6.745369,6.184658,6.22495,9.539392,...,6.082763,6.480741,3.354102,9.124144,5.567764,2.645751,7.211103,5.894913,7.483315,4.974937


In [59]:
def similarity_matrix(df, similarity_func):
    
    # 유저 아이디를 인덱스로 저장
    index = df.index
    
    # 데이터프레임 전치
    df = df.T
    

    # .items() 아이템 하나씩 빼오는 ... 함수
    matrix_ = []
    for idx_1, value_1 in df.items():
        row = []
        for idx_2, value_2 in df.items():
            row.append(similarity_func(value_1, value_2))
        matrix_.append(row)
    
    
    return pd.DataFrame(matrix_, columns=index, index=index)

In [60]:
sm_df_co = similarity_matrix(user_df, cosine_similarity)
sm_df_co.head()

userId,4,8,15,17,19,21,22,23,26,30,...,647,648,652,654,655,656,659,664,665,671
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,1.0,0.991116,0.956762,0.948457,0.985932,0.980286,0.981591,0.982744,0.986789,0.979119,...,0.979131,0.951088,0.986368,0.991149,0.983037,0.997707,0.970241,0.994377,0.968998,0.985579
8,0.991116,1.0,0.914253,0.966828,0.972568,0.985269,0.964117,0.98201,0.984022,0.971471,...,0.974777,0.947942,0.970261,0.988689,0.979823,0.998645,0.972875,0.990196,0.974638,0.982713
15,0.956762,0.914253,1.0,0.914953,0.950125,0.950927,0.906975,0.923247,0.888292,0.920392,...,0.957841,0.856947,0.893839,0.917356,0.900642,0.873927,0.938017,0.930106,0.903008,0.892096
17,0.948457,0.966828,0.914953,1.0,0.949537,0.933276,0.939038,0.961024,0.966644,0.94202,...,0.96375,0.933889,0.869626,0.947757,0.964055,0.960849,0.932213,0.964792,0.933463,0.952986
19,0.985932,0.972568,0.950125,0.949537,1.0,0.963805,0.955135,0.980127,0.954985,0.962846,...,0.971151,0.9665,0.980166,0.979269,0.957911,0.977106,0.962211,0.979273,0.95424,0.971782


In [29]:
def mean_score(df, sm_df, target, closer_count):
    
    sms_df = sm_df.drop(target) # 자기 자신의 데이터는 제거
    sms_df = sms_df.sort_values(target, ascending=False) # 내림차순
    sms_df = sms_df[target][:closer_count] # 가까운 / 유사한 데이터를 얼마나 가져올 것인지
    
    smsw_df = df.loc[sms_df.index]
    
    ms_df = pd.DataFrame(columns=df.columns)
    ms_df.loc['user'] = df.loc[target]
    ms_df.loc['mean'] = smsw_df.mean()
    
    return ms_df

In [65]:
ms_df_eu = mean_score(user_df, sm_df_eu, 4, 5 )
ms_df_eu

movieId,1,2,6,10,25,32,34,36,39,47,...,6377,6539,6874,7153,7361,7438,8961,33794,58559,79132
user,0.0,0.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,2.6,1.4,1.3,1.6,0.6,2.9,1.3,0.0,1.4,2.8,...,1.8,1.8,1.9,3.0,1.9,1.9,2.5,2.0,2.6,1.7


In [66]:
ms_df_co = mean_score(user_df, sm_df_co, 4, 5 )
ms_df_co

movieId,1,2,6,10,25,32,34,36,39,47,...,6377,6539,6874,7153,7361,7438,8961,33794,58559,79132
user,0.0,0.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,...,1.8,0.0,1.7,2.0,1.8,0.8,1.5,0.8,1.9,1.0


In [31]:
def recommend(ms_df):
    recommend_df = ms_df.T
    recommend_df = recommend_df[recommend_df['user']==0]
    recommend_df = recommend_df.sort_values("mean", ascending=False)
    
    return recommend_df, list(recommend_df.index)

In [32]:
recommend_df, recommend_list = recommend(ms_df)
recommend_list[:10]
recommend_df.head()

Unnamed: 0_level_0,user,mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
4226,0.0,3.0
2858,0.0,2.8
2959,0.0,2.7
4973,0.0,2.7
912,0.0,2.5


### Evaluation

In [33]:
def mse(value, pred):

    # drop zero value from user data
    idx = value.nonzero()[0]
    value, pred = np.array(value)[idx], np.array(pred)[idx]
    
    idx = pred.nonzero()[0]
    value, pred = np.array(value)[idx], np.array(pred)[idx]

    # calculate according to the formula
    return sum((value - pred)**2) / len(idx)

In [34]:
def mae(value, pred):

    # drop zero value from user data
    idx = value.nonzero()[0]
    value, pred = np.array(value)[idx], np.array(pred)[idx]
    
    idx = pred.nonzero()[0]
    value, pred = np.array(value)[idx], np.array(pred)[idx]

    # calculate formula and return result
    return np.absolute(sum(value - pred)) / len(idx)

In [69]:
def evaluate(df, sm_df, algorithm, closer_count):
    
    users = df.index
    evaluate_list = []
    
    for target in users:
        result_df = mean_score(df, sm_df, target, closer_count)
        evaluate_list.append(algorithm(result_df.loc['user'], ms_df.loc['mean']))
        
    return np.average(evaluate_list)

In [70]:
evaluate(user_df, sm_df, mae, 100)

2.6070680909434971

In [73]:
def find_best_mae(user_df, similarity, closer_count):
    sm_df = similarity_matrix(user_df, similarity)
    return evaluate(user_df, sm_df, mae, closer_count)

In [74]:
find_best_mae(user_df, cosine_similarity, 5)

2.6070680909434971

In [78]:
for closer_count in range(1, 5):
    print(closer_count, find_best_mae(user_df, euclidean_similarity, closer_count))


1 2.60706809094
2 2.60706809094
3 2.60706809094
4 2.60706809094
