# [실습] 행렬 분해를 이용한 잠재 요인 협업 필터링
- 행렬 분해 기반 잠재 요인 협업 필터링에는 SVD, NMF 등을 적용할 수 있음.
- 일반적으로 행렬 분해에는 SVD가 자주 사용되지만, SVD는 결측값(Null)이 없는 행렬에만 적용할 수 있음. 사용자-아이템 평점 행렬에는 Null 값이 많기 때문에 주로 SGD 또는 ALS 기반 행렬 분해를 이용함.

In [55]:
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np


In [56]:
def get_unseen_movies(ratings_matrix, userId):
    """Return the column labels of ratings_matrix that userId has not rated.

    A column counts as already seen when the user's value in it is greater
    than 0; every other column (0 or NaN, since ``NaN > 0`` is False) is
    reported as unseen.

    Args:
        ratings_matrix: DataFrame indexed by user id with one column per movie.
        userId: index label of the user to look up.

    Returns:
        list of column labels the user has not rated, in column order.
    """
    # Row of the requested user: a Series indexed by the movie columns.
    user_rating = ratings_matrix.loc[userId, :]

    # A set gives O(1) membership tests; the original list made the filter
    # below quadratic in the number of movies.
    already_seen = set(user_rating[user_rating > 0].index.tolist())

    # Keep the original column order while dropping the movies already rated.
    return [movie for movie in ratings_matrix.columns.tolist()
            if movie not in already_seen]

In [57]:
def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n=10):
    """Return the top_n entries of unseen_list ranked by predicted rating.

    Args:
        pred_df: DataFrame of predicted ratings, indexed by user id with one
            column per movie.
        userId: index label of the user to recommend for.
        unseen_list: column labels to choose from.
        top_n: number of leading entries to keep after sorting.

    Returns:
        Series of predicted ratings for the selected columns, sorted in
        descending order and cut to the first top_n entries.
    """
    # Predicted ratings of this user, restricted to the candidate columns.
    candidate_scores = pred_df.loc[userId, unseen_list]
    # Highest predicted rating first.
    ranked_scores = candidate_scores.sort_values(ascending=False)
    return ranked_scores[:top_n]

In [58]:
# Error between the actual matrix and the predicted matrix
def get_rmse(R, P, Q, non_zeros):
    """Return the RMSE between R and P·Qᵀ at the positions in non_zeros.

    Args:
        R: 2-D NumPy array holding the actual ratings.
        P: 2-D NumPy array of user factors (one row per row of R).
        Q: 2-D NumPy array of item factors (one row per column of R).
        non_zeros: sequence of tuples whose first two items are the row and
            column index of an observed rating (matrix_factorization passes
            (row, col, rating) triples).

    Returns:
        Root mean squared error over the listed positions. Non-finite
        predictions propagate into the result instead of raising.

    Raises:
        ValueError: if non_zeros is empty, because the error is undefined.
    """
    if len(non_zeros) == 0:
        raise ValueError('non_zeros is empty: RMSE is undefined without observed ratings')

    rows = [entry[0] for entry in non_zeros]
    cols = [entry[1] for entry in non_zeros]

    # Actual ratings at the observed positions.
    actual = R[rows, cols]

    # Predicted rating of each observed (row, col) pair is the dot product of
    # the matching factor rows; this avoids building the full P·Qᵀ matrix
    # just to read a small subset of its cells.
    predicted = np.sum(P[rows] * Q[cols], axis=1)

    return np.sqrt(np.mean((actual - predicted) ** 2))

In [59]:
def matrix_factorization(R,K, steps=200,learning_rate=0.1, r_lambda=0.01):
    """Factorize R into P (users x K) and Q (items x K) with regularized SGD.

    Only entries of R that are greater than 0 are used for training; zeros
    and NaN cells (``NaN > 0`` is False) are skipped.

    Args:
        R: 2-D NumPy array of ratings.
        K: number of latent factors.
        steps: number of passes over the observed ratings.
        learning_rate: SGD step size.
        r_lambda: L2 regularization coefficient.

    Returns:
        Tuple (P, Q) so that np.dot(P, Q.T) approximates R at the observed
        positions.

    Side effects:
        Reseeds NumPy's global random generator with 1 and prints the RMSE on
        every 10th step (starting at step 0).
    """
    num_users, num_items = R.shape

    # Initialise P and Q with normally distributed random values; the fixed
    # seed keeps runs reproducible.
    np.random.seed(1)
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    # (row, col, rating) for every cell with R > 0. np.nonzero returns the
    # indices in row-major order, the same order a nested row/column loop
    # would visit them, without a Python-level pass over every cell.
    rows, cols = np.nonzero(R > 0)
    non_zeros = list(zip(rows.tolist(), cols.tolist(), R[rows, cols].tolist()))

    # Keep updating P and Q with SGD.
    for step in range(steps):
        for i, j, r in non_zeros:
            # Error between the actual rating and the current prediction.
            eij = r - np.dot(P[i, :], Q[j, :].T)
            # Regularized SGD update.
            # NOTE(review): Q's update reads the P[i, :] that was just
            # updated on the previous line (sequential update), not the value
            # from before this step. Kept as-is so results stay unchanged.
            P[i, :] = P[i, :] + learning_rate*(eij*Q[j, :] - r_lambda*P[i, :])
            Q[j, :] = Q[j, :] + learning_rate*(eij*P[i, :] - r_lambda*Q[j, :])

        # The RMSE is only reported every 10th step, so only compute it then.
        if step % 10 == 0:
            rmse = get_rmse(R, P, Q, non_zeros)
            print('iter step:{0}, rmse : {1:4f}'.format(step,rmse))

    return P, Q
    

In [60]:
# Read the movie and rating CSV files; only the user id, movie id and rating
# columns of the ratings file are kept.
movies = pd.read_csv('./ml-latest-small/movies.csv')
ratings = pd.read_csv('./ml-latest-small/ratings.csv')[['userId', 'movieId', 'rating']]

# Attach the movie columns to every rating, then pivot into a
# userId x title matrix of ratings.
rating_moives = ratings.merge(movies, on='movieId')
ratings_matrix = rating_moives.pivot_table(values='rating', index='userId', columns='title')

In [61]:
# steps = number of SGD passes / K = number of latent factors; the learning
# rate and the L2 regularization coefficient are both set to 0.01.
P, Q = matrix_factorization(
    ratings_matrix.values,
    K=50,
    steps=1000,
    learning_rate=0.01,
    r_lambda=0.01,
)
# Predicted rating for every (user, title) pair.
pred_matrix = P.dot(Q.T)

iter step:0, rmse : 2.902362
iter step:10, rmse : 0.733577
iter step:20, rmse : 0.511554
iter step:30, rmse : 0.372616
iter step:40, rmse : 0.296082
iter step:50, rmse : 0.252035
iter step:60, rmse : 0.224875
iter step:70, rmse : 0.206855
iter step:80, rmse : 0.194134
iter step:90, rmse : 0.184701
iter step:100, rmse : 0.177429
iter step:110, rmse : 0.171652
iter step:120, rmse : 0.166952
iter step:130, rmse : 0.163053
iter step:140, rmse : 0.159767
iter step:150, rmse : 0.156960
iter step:160, rmse : 0.154534
iter step:170, rmse : 0.152416
iter step:180, rmse : 0.150551
iter step:190, rmse : 0.148895
iter step:200, rmse : 0.147414
iter step:210, rmse : 0.146082
iter step:220, rmse : 0.144876
iter step:230, rmse : 0.143779
iter step:240, rmse : 0.142776
iter step:250, rmse : 0.141855
iter step:260, rmse : 0.141007
iter step:270, rmse : 0.140222
iter step:280, rmse : 0.139493
iter step:290, rmse : 0.138815
iter step:300, rmse : 0.138182
iter step:310, rmse : 0.137589
iter step:320, rmse

In [62]:
# Wrap the predicted ratings in a DataFrame that reuses the row and column
# labels of ratings_matrix, then preview the first three rows.
ratings_pred_matrix = pd.DataFrame(
    pred_matrix,
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)
ratings_pred_matrix.head(3)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.032402,3.763177,3.273126,4.407304,3.902972,1.121797,3.430295,2.316132,4.301166,3.850254,...,1.370699,4.06057,3.402056,2.59919,2.780331,3.7503,2.703799,2.080365,3.996262,0.84128
2,3.1134,3.130846,2.855477,3.765964,3.748858,1.210122,4.212414,1.614208,2.880316,3.060063,...,0.921541,3.618902,3.106185,2.500448,2.393956,3.791819,2.507229,1.444825,3.777343,0.627442
3,1.831385,1.039056,1.00832,1.729376,1.770943,0.494093,0.951114,0.550314,2.113869,1.662878,...,0.426308,1.36371,1.85457,1.378624,1.281677,0.732316,2.439503,0.705142,1.146918,0.307928


In [63]:
# Titles that user 9 has not rated yet.
unseen_list = get_unseen_movies(ratings_matrix, 9)

# Recommend the 10 unseen titles with the highest predicted rating
# (latent factor collaborative filtering).
recomm_movies = recomm_movie_by_userid(ratings_pred_matrix, 9, unseen_list, top_n=10)

# Turn the Series into a one-column DataFrame named 'pred_score'. Naming the
# column here avoids assigning .columns on a temporary pd.DataFrame(...) copy,
# which only renames recomm_movies if the two objects happen to share state.
recomm_movies = recomm_movies.to_frame(name='pred_score')
recomm_movies

Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
Gattaca (1997),5.113103
Rear Window (1954),5.055717
Star Wars: Episode V - The Empire Strikes Back (1980),5.009944
Rounders (1998),4.9598
Monty Python and the Holy Grail (1975),4.949565
Roger & Me (1989),4.899791
"Remains of the Day, The (1993)",4.878511
Blade Runner (1982),4.874711
Gandhi (1982),4.869262
There's Something About Mary (1998),4.837735
