# 실습-2: 행렬분해 기반 잠재요인 협업 필터링

In [2]:
import pandas as pd
import numpy as np
import time
import seaborn as sns
import matplotlib.pylab as plt
%matplotlib inline

from sklearn.metrics import mean_squared_error

#### 데이터 읽기

In [7]:
# Load the movie catalogue and the rating log, then report each table's
# (rows, columns) shape on its own line.
movies = pd.read_csv('data/movies.csv')
ratings = pd.read_csv('data/ratings.csv')
print(movies.shape, ratings.shape, sep='\n')

(9724, 3)
(100836, 4)


#### 나의 평점 추가하기

In [8]:
# My user ID.
myId = 611

# Append my own ratings to the rating log; ignore_index renumbers the rows
# of the combined frame from 0.
my_ratings = pd.read_csv('data/my_ratings.csv')
ratings = pd.concat([ratings, my_ratings], ignore_index=True)

#### 입력데이터 구성: 평점행렬 만들기

In [9]:
# Join the ratings with movies on movieId to obtain the title column.
rating_movies = ratings.merge(movies, on='movieId')

# Pivot into a user x title matrix of ratings: rows are userId, columns are
# title. Missing combinations (NaN) are filled with 0. pivot_table averages
# the values when several rows share the same (userId, title) pair.
ratings_matrix = pd.pivot_table(
    rating_movies,
    values='rating',
    index='userId',
    columns='title',
    fill_value=0,
)

print(ratings_matrix.shape)
ratings_matrix.head()

(611, 9719)


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### 행렬 분해: 평점 행렬을 두 개의 잠재요인 행렬(P, Q)로 분해

<img align='left' src='MF_process.png' width=500>

In [10]:
def get_rmse(R, P, Q, non_zeros):
    """Return the RMSE between observed ratings and their factorized predictions.

    The prediction matrix is ``P @ Q.T``. Only the positions listed in
    ``non_zeros`` contribute to the error; every other cell of ``R`` is ignored.

    Args:
        R: 2-D NumPy array of ratings, indexed as ``R[row, col]``.
        P: Factor matrix with one row per row of ``R``.
        Q: Factor matrix with one row per column of ``R``.
        non_zeros: Sequence of tuples whose first two items are the row and
            column index of an observed rating. Further items are not read.

    Returns:
        The root-mean-squared error over the listed positions.

    Raises:
        ValueError: If ``non_zeros`` is empty, because the error is undefined
            without at least one observed rating.
    """
    if len(non_zeros) == 0:
        raise ValueError("non_zeros is empty; RMSE needs at least one observed rating")

    # Predicted rating matrix from the two factor matrices.
    full_pred_matrix = np.dot(P, Q.T)

    # Pick the observed cells out of both the actual and the predicted matrix.
    rows = [entry[0] for entry in non_zeros]
    cols = [entry[1] for entry in non_zeros]
    observed = R[rows, cols]
    predicted = full_pred_matrix[rows, cols]

    return np.sqrt(np.mean(np.square(observed - predicted)))

def matrix_factorization(R, K, steps=200, learning_rate=0.01, r_lambda=0.01):
    """Factorize a rating matrix into latent-factor matrices P and Q with SGD.

    Only strictly positive cells of ``R`` are treated as observed ratings and
    used for training. Every 10th step (starting with step 0) the RMSE over
    those cells is printed.

    Args:
        R: 2-D NumPy array of ratings; cells with a value <= 0 are skipped.
        K: Number of latent factors (columns of P and Q).
        steps: Number of full passes over the observed ratings.
        learning_rate: SGD step size.
        r_lambda: L2 regularization coefficient.

    Returns:
        A tuple ``(P, Q)`` where ``P`` has shape ``(R.shape[0], K)`` and ``Q``
        has shape ``(R.shape[1], K)``; ``np.dot(P, Q.T)`` approximates ``R``.
    """
    num_users, num_items = R.shape

    # Seed the global NumPy RNG so every run starts from the same P and Q,
    # both drawn from a normal distribution with standard deviation 1/K.
    np.random.seed(1)
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    # Collect (row, col, rating) for every cell with R > 0. np.nonzero scans
    # the mask in row-major order, the same order a nested row/column loop
    # would visit the cells, so the SGD update order is preserved.
    ratings = np.asarray(R)
    rows, cols = np.nonzero(ratings > 0)
    non_zeros = list(zip(rows.tolist(), cols.tolist(), ratings[rows, cols].tolist()))

    # Update P and Q with stochastic gradient descent.
    for step in range(steps):
        for i, j, r in non_zeros:
            # Error between the actual and the currently predicted rating.
            eij = r - np.dot(P[i, :], Q[j, :])
            # Regularized SGD updates. The Q update deliberately reads the
            # P row that was just updated on the previous line.
            P[i, :] += learning_rate * (eij * Q[j, :] - r_lambda * P[i, :])
            Q[j, :] += learning_rate * (eij * P[i, :] - r_lambda * Q[j, :])

        # The RMSE is only needed for the progress report, so compute it
        # only on the steps that are actually printed.
        if step % 10 == 0:
            rmse = get_rmse(R, P, Q, non_zeros)
            print("### iteration step : ", step," rmse : ", rmse)

    return P, Q

In [11]:
# Measure the training time with perf_counter: it is a monotonic,
# high-resolution clock, so the elapsed value cannot be skewed by system
# clock adjustments the way a wall-clock difference can.
start_time = time.perf_counter()

# Factorize the rating matrix into 100 latent factors over 100 SGD passes.
P, Q = matrix_factorization(ratings_matrix.values, K=100, steps=100, learning_rate=0.01, r_lambda=0.01)

print(f"학습시간: {time.perf_counter() - start_time}초")

### iteration step :  0  rmse :  3.0046994314586697
### iteration step :  10  rmse :  0.7489747746054122
### iteration step :  20  rmse :  0.521669861592634
### iteration step :  30  rmse :  0.34734389660982545
### iteration step :  40  rmse :  0.24282913880789606
### iteration step :  50  rmse :  0.1830319153379088
### iteration step :  60  rmse :  0.148279517430645
### iteration step :  70  rmse :  0.12718385213555722
### iteration step :  80  rmse :  0.11372255324248283
### iteration step :  90  rmse :  0.10471701625997655
학습시간: 74.2532479763031초


#### 추천 상품 결정:  개인화된 영화 추천

영화 평점 예측
$$
\Large \hat{R}_{u,j} = P(\text{user factors})_{u} \cdot Q(\text{item factors})^{T}_{j}
$$

In [13]:
# Predicted rating for every (user, title) pair, labelled with the same
# row and column labels as the rating matrix.
ratings_pred = pd.DataFrame(
    np.dot(P, Q.T),
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)
ratings_pred

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.166021,4.200217,3.658159,4.871261,4.582442,1.408091,4.084563,2.237121,4.051789,4.063255,...,1.352431,4.298289,4.012517,2.915547,2.833435,4.408563,3.300981,2.210013,3.972810,0.907376
2,3.136889,3.785463,3.278285,4.345182,4.207082,1.251305,3.879234,1.920114,3.598183,3.602276,...,1.139016,3.867181,3.410814,2.714988,2.520304,4.109295,2.748226,1.768497,4.387495,0.815035
3,2.313241,2.207249,1.852403,2.770960,2.502623,0.853263,0.688585,1.475665,1.396629,2.307521,...,0.748546,2.947673,2.201291,2.031104,1.638842,2.825367,2.057706,1.087343,2.762688,0.531209
4,2.326404,3.174058,2.803822,3.676083,3.430203,0.911712,2.959026,1.912923,3.077000,2.804170,...,1.136800,2.910734,2.879772,2.120276,2.104392,2.805356,1.408830,1.591467,3.374330,0.615892
5,2.472269,3.266027,2.813199,3.709518,3.581844,0.959584,2.246236,1.803193,2.093001,2.983518,...,1.067458,3.897750,2.937974,2.162814,2.211847,3.443770,1.916608,1.495078,3.038577,0.693480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,2.718651,3.562000,3.052932,4.164270,3.821150,1.169343,4.102837,1.995165,4.259152,3.413347,...,1.077317,3.586657,3.654013,2.411229,2.524589,4.156963,2.800339,2.103601,4.097448,0.817399
608,2.321609,3.096772,2.655043,3.744791,3.611240,1.088349,2.531599,1.662220,3.750253,3.197248,...,0.850744,3.094331,3.086850,2.160749,2.107502,4.482426,3.443981,1.898165,2.628739,0.674464
609,2.641720,3.326177,2.885629,3.826177,3.638213,1.082328,2.986626,1.845522,3.198048,3.122554,...,1.068343,3.309990,3.097797,2.288281,2.291946,3.969521,2.599743,1.650859,3.249075,0.705423
610,3.916783,3.655495,3.154933,4.212608,3.921237,1.162983,2.740816,2.403291,3.548568,3.638002,...,1.219034,3.975193,3.450531,2.957653,2.607252,4.284172,2.097215,1.529194,3.742142,0.803171


Top-N 영화 추천

In [14]:
# Target user for the recommendation: my own user ID.
uid = myId
# Number of movies to recommend.
N = 10

In [15]:
# All ratings of the user identified by uid, as a Series indexed by movie
# title.
user_rating = ratings_matrix.loc[uid, :]

# A rating above 0 marks a movie the user has already rated.
already_seen = user_rating[user_rating > 0].index.tolist()

# Keep every other title, in column order. Membership is checked against a
# set so the filter stays linear in the number of titles instead of scanning
# the already_seen list once per title.
seen_titles = set(already_seen)
unseen_list = [movie for movie in ratings_matrix.columns if movie not in seen_titles]

# Recommend the N unseen movies with the highest predicted rating.
recomm_items = ratings_pred.loc[uid, unseen_list].sort_values(ascending=False).iloc[:N]
list(recomm_items.index)

['Rushmore (1998)',
 "Singin' in the Rain (1952)",
 'Casablanca (1942)',
 'Philadelphia Story, The (1940)',
 "Schindler's List (1993)",
 'Shawshank Redemption, The (1994)',
 '2001: A Space Odyssey (1968)',
 'Good, the Bad and the Ugly, The (Buono, il brutto, il cattivo, Il) (1966)',
 'Lion King, The (1994)',
 'Snow White and the Seven Dwarfs (1937)']

추천성능 평가

In [16]:
# Compute the RMSE only over cells that hold an actual rating (non-zero
# entries of the rating matrix).
# NOTE(review): the label below interpolates N, the top-N count, while the
# factorization above was run with K=100 latent factors -- confirm which
# value the "K=" label is meant to show. The score is also measured on the
# same ratings that were used for training.
observed = ratings_matrix.values
rated = observed.nonzero()
pred = ratings_pred.values[rated]
actual = observed[rated]
rmse = np.sqrt(mean_squared_error(pred, actual))
print(f'RMSE of MFCF@K={N}: ', rmse)

RMSE of MFCF@K=10:  0.0989737147085629
