<font color="#CC3D3D"><p>
# 실습-2: 행렬분해 기반 잠재요인 협업 필터링

In [1]:
import pandas as pd
import numpy as np
import time
import seaborn as sns
import matplotlib.pylab as plt
%matplotlib inline

from sklearn.metrics import mean_squared_error

#### 데이터 읽기

In [2]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
print(movies.shape)
print(ratings.shape)

(9724, 3)
(100836, 4)


<font color="blue"><p>
#### 나의 평점 추가하기

In [3]:
myId = 611  # 나의 사용자 ID

my_ratings = pd.read_csv('my_ratings.csv')
ratings = pd.concat([ratings, my_ratings]).reset_index(drop=True)

#### 입력데이터 구성: 평점행렬 만들기

In [4]:
# title 컬럼을 얻기 이해 movies와 조인 수행
rating_movies = pd.merge(ratings, movies, on='movieId')

# 행은 userID, 열은 title로 pivoting 수행. rating이 없는 값(NaN)은 모두 0으로 변환
ratings_matrix = rating_movies.pivot_table('rating', index='userId', columns='title', fill_value=0)

print(ratings_matrix.shape)
ratings_matrix.head()

(611, 9719)


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### 행렬 분해: 평점 행렬을 두 개의 잠재요인 행렬(P, Q)로 분해

<img align='left' src='MF_process.png' width=500>

In [5]:
def get_rmse(R, P, Q, non_zeros):
    error = 0
    # 두개의 분해된 행렬 P와 Q.T의 내적 곱으로 예측 R 행렬 생성
    full_pred_matrix = np.dot(P, Q.T)
    
    # 실제 R 행렬에서 널이 아닌 값의 위치 인덱스 추출하여 실제 R 행렬과 예측 행렬의 RMSE 추출
    x_non_zero_ind = [non_zero[0] for non_zero in non_zeros]
    y_non_zero_ind = [non_zero[1] for non_zero in non_zeros]
    R_non_zeros = R[x_non_zero_ind, y_non_zero_ind]
    
    full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_ind, y_non_zero_ind]
      
    mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros)
    rmse = np.sqrt(mse)
    
    return rmse

def matrix_factorization(R, K, steps=200, learning_rate=0.01, r_lambda = 0.01):
    num_users, num_items = R.shape
    # P와 Q 매트릭스의 크기를 지정하고 정규분포를 가진 랜덤한 값으로 입력합니다. 
    np.random.seed(1)
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    break_count = 0
       
    # R > 0 인 행 위치, 열 위치, 값을 non_zeros 리스트 객체에 저장. 
    non_zeros = [ (i, j, R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] > 0 ]
   
    # SGD기법으로 P와 Q 매트릭스를 계속 업데이트. 
    for step in range(steps):
        for i, j, r in non_zeros:
            # 실제 값과 예측 값의 차이인 오류 값 구함
            eij = r - np.dot(P[i, :], Q[j, :].T)
            # Regularization을 반영한 SGD 업데이트 공식 적용
            P[i,:] = P[i,:] + learning_rate*(eij * Q[j, :] - r_lambda*P[i,:])
            Q[j,:] = Q[j,:] + learning_rate*(eij * P[i, :] - r_lambda*Q[j,:])
       
        rmse = get_rmse(R, P, Q, non_zeros)
        if (step % 10) == 0 :
            print("### iteration step : ", step," rmse : ", rmse)
            
    return P, Q

In [6]:
start_time = time.time()

P, Q = matrix_factorization(ratings_matrix.values, K=100, steps=100, learning_rate=0.01, r_lambda = 0.01)

print(f"학습시간: {time.time() - start_time}초")

### iteration step :  0  rmse :  3.004715676003615
### iteration step :  10  rmse :  0.7490124332892476
### iteration step :  20  rmse :  0.5216898664507458
### iteration step :  30  rmse :  0.34734471232005076
### iteration step :  40  rmse :  0.24282259712362586
### iteration step :  50  rmse :  0.18301629564224908
### iteration step :  60  rmse :  0.14826014477188762
### iteration step :  70  rmse :  0.12716232473341463
### iteration step :  80  rmse :  0.11369956761913548
### iteration step :  90  rmse :  0.10469368195658736
학습시간: 52.393646240234375초


#### 추천 상품 결정:  개인화된 영화 추천

영화 평점 예측
$$
\Large \hat{R}_{u,j} = P(user factors)_{u} * Q(item factors)^{t}_{j}
$$

In [7]:
ratings_pred = np.dot(P, Q.T)
ratings_pred = pd.DataFrame(ratings_pred, ratings_matrix.index, ratings_matrix.columns)
ratings_pred

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.169616,4.199187,3.655910,4.873912,4.583276,1.407563,4.078189,2.241684,4.053995,4.063829,...,1.353250,4.303372,4.011128,2.916939,2.835495,4.405313,3.291874,2.207970,3.972283,0.906948
2,3.136965,3.786507,3.278804,4.345719,4.207249,1.251169,3.881588,1.921823,3.602151,3.602383,...,1.139946,3.868367,3.410041,2.715783,2.520062,4.114857,2.747767,1.768224,4.380163,0.815627
3,2.313617,2.207717,1.852325,2.770042,2.501985,0.851961,0.693127,1.475820,1.400210,2.307201,...,0.748395,2.953957,2.201740,2.030834,1.638939,2.828625,2.057815,1.087447,2.753571,0.531407
4,2.327214,3.174079,2.803142,3.674306,3.427778,0.911798,2.960249,1.912998,3.065568,2.804535,...,1.137813,2.902709,2.877994,2.121204,2.102967,2.814823,1.415229,1.590524,3.380690,0.615347
5,2.473435,3.264997,2.811396,3.709888,3.583522,0.958147,2.244079,1.802257,2.082419,2.983655,...,1.066648,3.894618,2.935334,2.162334,2.211328,3.435019,1.923034,1.493153,3.028646,0.693427
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,2.718211,3.560302,3.050565,4.163545,3.821140,1.169487,4.098664,1.995650,4.262605,3.412441,...,1.077100,3.583227,3.653135,2.410537,2.525115,4.147398,2.803720,2.103874,4.099833,0.817214
608,2.321036,3.098346,2.655487,3.746104,3.612234,1.087877,2.528291,1.662882,3.750422,3.197070,...,0.850628,3.100060,3.087286,2.160317,2.108605,4.482631,3.442883,1.897884,2.629323,0.674272
609,2.641266,3.325538,2.884389,3.826449,3.639354,1.081501,2.983781,1.845790,3.198856,3.123408,...,1.067236,3.309280,3.097354,2.286760,2.292331,3.961010,2.597533,1.650669,3.248967,0.705048
610,3.916792,3.655669,3.154456,4.212374,3.920619,1.162643,2.739655,2.403770,3.549280,3.637610,...,1.219196,3.974733,3.450483,2.957660,2.607135,4.283726,2.097706,1.529203,3.744538,0.803113


Top-N 영화 추천

In [8]:
# 추천 영화의 수 설정
N = 10
uid = myId

In [9]:
# id로 지정된 사용자의 모든 영화정보 추출하여 Series로 반환함
# 반환된 user_rating은 영화명(title)을 index로 가지는 Series 객체임 
user_rating = ratings_matrix.loc[uid,:]
    
# user_rating이 0보다 크면 기존에 관람한 영화임. 대상 index를 추출하여 list로 만듬
already_seen = user_rating[user_rating > 0].index.tolist()
   
# list comprehension으로 already_seen에 해당하는 movie는 movies_list에서 제외함
unseen_list = [movie for movie in ratings_matrix.columns.tolist() if movie not in already_seen]
    
# unseen_list에서 가장 평점이 높은 N개의 영화를 추천함 
recomm_items = ratings_pred.loc[uid, unseen_list].sort_values(ascending=False)[:N]
list(recomm_items.index)

['Lock, Stock & Two Smoking Barrels (1998)',
 'Babe (1995)',
 'Bossa Nova (2000)',
 'Shawshank Redemption, The (1994)',
 'Braveheart (1995)',
 'Sound of Music, The (1965)',
 'Finding Nemo (2003)',
 'E.T. the Extra-Terrestrial (1982)',
 'Big Short, The (2015)',
 'Moby Dick (1956)']

추천성능 평가

In [10]:
# 사용자가 평점을 부여한 영화에 대해서만 예측 성능 평가 RMSE를 구함
actual, pred = ratings_matrix.values, ratings_pred.values     
pred = pred[actual.nonzero()].flatten()
actual = actual[actual.nonzero()].flatten()
rmse = np.sqrt(mean_squared_error(pred, actual))
print(f'RMSE of UBCF@K={N}: ', rmse)

RMSE of UBCF@K=10:  0.09895099262867213


<font color="#CC3D3D"><p>
# End