In [1]:
from tqdm import tqdm

### 경사하강을 이용한 행렬 분해

**원본 행렬 R을 생성하고, R을 분해할 행렬 P와 Q를 정규분포를 따르는 랜덤값으로 초기화**

In [2]:
# 실습
import numpy as np

# Build the original rating matrix R (NaN marks a missing rating) and
# initialise the factor matrices P and Q; the latent dimension K is set to 3.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
R = np.array([[4, np.nan, np.nan, 2, np.nan],
              [np.nan, 5, np.nan, 3, 1],
              [np.nan, np.nan, 3, 4, 4],
              [5, 2, 1, 2, np.nan]])

num_users, num_items = R.shape
K = 3         # (4 x 3) @ (3 x 5) reproduces the 4 x 5 shape of R

# Fill P (num_users x K) and Q (num_items x K) with normally distributed
# random values; the fixed seed keeps the printed values reproducible.
np.random.seed(1)
P = np.random.normal(scale=1./K, size=(num_users, K))
Q = np.random.normal(scale=1./K, size=(num_items, K))
print("P:",P)
print("Q:",Q)

P: [[ 0.54144845 -0.2039188  -0.17605725]
 [-0.35765621  0.28846921 -0.76717957]
 [ 0.58160392 -0.25373563  0.10634637]
 [-0.08312346  0.48736931 -0.68671357]]
Q: [[-0.1074724  -0.12801812  0.37792315]
 [-0.36663042 -0.05747607 -0.29261947]
 [ 0.01407125  0.19427174 -0.36687306]
 [ 0.38157457  0.30053024  0.16749811]
 [ 0.30028532 -0.22790929 -0.04096341]]


**비용계산 함수를 생성. 분해된 행렬 P와 Q.T를 내적하여 예측 행렬 생성하고**

**실제 행렬에서 널이 아닌 값의 위치에 있는 값만 예측 행렬의 값과 비교하여 RMSE값을 계산하고 반환**

In [3]:
from sklearn.metrics import mean_squared_error

def get_rmse(R, P, Q, non_zeros):
    """Return the RMSE between R and its reconstruction ``P @ Q.T`` on observed entries.

    Parameters
    ----------
    R : 2-D numpy array
        Actual rating matrix, indexed as ``R[row, col]``.
    P : 2-D numpy array
        Factor matrix with one row per row of R.
    Q : 2-D numpy array
        Factor matrix with one row per column of R.
    non_zeros : sequence of tuples
        One ``(row, col, ...)`` tuple per observed entry; only the first two
        elements of each tuple are read here.

    Returns
    -------
    numpy.float64
        Root mean squared error over the listed positions.

    Raises
    ------
    ValueError
        If ``non_zeros`` is empty, because the error would be undefined.
    """
    if len(non_zeros) == 0:
        raise ValueError("non_zeros is empty: RMSE is undefined without observed entries")

    # Predicted matrix obtained from the two factor matrices.
    full_pred_matrix = np.dot(P, Q.T)

    # Row / column positions of the observed entries.
    row_idx = [entry[0] for entry in non_zeros]
    col_idx = [entry[1] for entry in non_zeros]

    # Compare actual and predicted values only where a rating exists.
    residuals = R[row_idx, col_idx] - full_pred_matrix[row_idx, col_idx]
    return np.sqrt(np.mean(residuals ** 2))

**경사하강법에 기반하여 P와 Q의 원소들을 업데이트 수행**


In [69]:
# Collect (row index, column index, rating) for every observed entry of R.
# A comparison with NaN is False, so missing ratings are skipped by `> 0`.
non_zeros = [ (i, j, R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] > 0 ]

steps=1000
learning_rate=0.01   # SGD step size
r_lambda=0.01        # L2 regularisation strength

# Repeatedly update P and Q with SGD over the observed ratings.
for step in range(steps):
    for i, j, r in non_zeros:
        # Error between the actual rating and the current prediction.
        eij = r - np.dot(P[i, :], Q[j, :])
        # Regularised SGD update. The Q update reads the row of P that was
        # just refreshed on the previous line (sequential update).
        P[i,:] = P[i,:] + learning_rate*(eij * Q[j, :] - r_lambda*P[i,:])
        Q[j,:] = Q[j,:] + learning_rate*(eij * P[i, :] - r_lambda*Q[j,:])

    # Evaluate the RMSE only on the steps where it is actually reported.
    if (step % 50) == 0 :
        rmse = get_rmse(R, P, Q, non_zeros)
        print("### iteration step : ", step," rmse : ", rmse)

### iteration step :  0  rmse :  1.630025058954674
### iteration step :  50  rmse :  0.03954728735598557
### iteration step :  100  rmse :  0.01039168753012056
### iteration step :  150  rmse :  0.009893510294040883
### iteration step :  200  rmse :  0.010007416875073715
### iteration step :  250  rmse :  0.010144118854553396
### iteration step :  300  rmse :  0.010280490699482735
### iteration step :  350  rmse :  0.010414945607089795
### iteration step :  400  rmse :  0.01054722719934397
### iteration step :  450  rmse :  0.010677181829211744
### iteration step :  500  rmse :  0.010804669187297219
### iteration step :  550  rmse :  0.010929555972390474
### iteration step :  600  rmse :  0.0110517161882142
### iteration step :  650  rmse :  0.011171031962425597
### iteration step :  700  rmse :  0.011287394356545752
### iteration step :  750  rmse :  0.011400704105292038
### iteration step :  800  rmse :  0.011510872266534295
### iteration step :  850  rmse :  0.011617820768890183
###

In [5]:
# Reconstruct the full rating matrix from whatever P and Q the kernel holds.
# If the SGD cell has not been run yet, these are still the random initial factors.
pred_matrix = P @ Q.T
print('예측 행렬:\n', pred_matrix.round(3))

예측 행렬:
 [[-0.099 -0.135  0.033  0.116  0.216]
 [-0.288  0.339  0.332 -0.178 -0.142]
 [ 0.01  -0.23  -0.08   0.163  0.228]
 [-0.313  0.203  0.345 -0.    -0.108]]


### 행렬 분해 기반의 잠재 요인 협업 필터링 실습

**경사하강법 기반의 행렬 분해 함수 생성**

In [6]:
def matrix_factorization(R, K, steps=200, learning_rate=0.01, r_lambda = 0.01, seed=1):
    """Factorise a rating matrix R into P and Q with regularised SGD.

    Parameters
    ----------
    R : 2-D numpy array
        User x item rating matrix. Only entries for which ``R[i, j] > 0`` is
        true are used for training (NaN entries fail that test and are skipped).
    K : int
        Number of latent factors.
    steps : int
        Number of passes over the observed ratings.
    learning_rate : float
        SGD step size.
    r_lambda : float
        L2 regularisation strength.
    seed : int or None
        Value passed to ``np.random.seed`` before P and Q are drawn. The
        default of 1 reproduces the previously hard-coded behaviour.

    Returns
    -------
    (P, Q) : tuple of numpy arrays
        P has shape (num_users, K) and Q has shape (num_items, K), so
        ``P @ Q.T`` has the shape of R.

    Notes
    -----
    Reseeds NumPy's global random state, shows a tqdm progress bar, and
    prints the training RMSE every 10 steps.
    """
    num_users, num_items = R.shape

    # Draw the initial factors from a normal distribution with std 1/K.
    np.random.seed(seed)
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    # (row, column, rating) for every observed entry of R.
    non_zeros = [ (i, j, R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] > 0 ]

    # Repeatedly update P and Q with SGD over the observed ratings.
    for step in tqdm(range(steps)):
        for i, j, r in non_zeros:
            # Error between the actual rating and the current prediction.
            eij = r - np.dot(P[i, :], Q[j, :])
            # Regularised SGD update. The Q update reads the row of P that
            # was just refreshed on the previous line (sequential update).
            P[i,:] = P[i,:] + learning_rate*(eij * Q[j, :] - r_lambda*P[i,:])
            Q[j,:] = Q[j,:] + learning_rate*(eij * P[i, :] - r_lambda*Q[j,:])

        # The RMSE is only needed on reporting steps, so compute it there
        # instead of on every pass.
        if (step % 10) == 0 :
            rmse = get_rmse(R, P, Q, non_zeros)
            print("### iteration step : ", step," rmse : ", rmse)

    return P, Q

In [7]:
import pandas as pd
import numpy as np

# Read the movie metadata and the ratings, keeping only the three rating columns.
movies = pd.read_csv('../data/ml-latest-small/movies.csv')
ratings = pd.read_csv('../data/ml-latest-small/ratings.csv')[['userId', 'movieId', 'rating']]

# Join with movies on movieId to obtain the title column.
rating_movies = pd.merge(left=ratings, right=movies, on='movieId')

# Pivot into a userId x title matrix of ratings; combinations without a
# rating become NaN.
ratings_matrix = rating_movies.pivot_table(values='rating', index='userId', columns='title')

In [8]:
# Dimensions of the userId x title rating matrix, as (rows, columns)
ratings_matrix.shape

(610, 9719)

In [9]:
# Factorise the rating matrix with 50 latent factors, then rebuild the dense
# matrix of predicted ratings. Note: this rebinds P, Q and pred_matrix.
P, Q = matrix_factorization(
    ratings_matrix.values,
    K=50,
    steps=200,
    learning_rate=0.01,
    r_lambda=0.01,
)
pred_matrix = P @ Q.T

  0%|          | 1/200 [00:01<03:36,  1.09s/it]

### iteration step :  0  rmse :  2.9023619751336867


  6%|▌         | 11/200 [00:12<03:35,  1.14s/it]

### iteration step :  10  rmse :  0.7335768591017927


 10%|█         | 21/200 [00:24<03:27,  1.16s/it]

### iteration step :  20  rmse :  0.5115539026853442


 16%|█▌        | 31/200 [00:35<03:18,  1.17s/it]

### iteration step :  30  rmse :  0.37261628282537446


 20%|██        | 41/200 [00:47<03:10,  1.20s/it]

### iteration step :  40  rmse :  0.2960818299181014


 26%|██▌       | 51/200 [00:59<02:54,  1.17s/it]

### iteration step :  50  rmse :  0.2520353192341643


 30%|███       | 61/200 [01:11<02:44,  1.18s/it]

### iteration step :  60  rmse :  0.22487503275269854


 36%|███▌      | 71/200 [01:23<02:33,  1.19s/it]

### iteration step :  70  rmse :  0.2068545530233154


 40%|████      | 81/200 [01:35<02:23,  1.21s/it]

### iteration step :  80  rmse :  0.19413418783028688


 46%|████▌     | 91/200 [01:47<02:14,  1.23s/it]

### iteration step :  90  rmse :  0.18470082002720406


 50%|█████     | 101/200 [02:00<02:03,  1.24s/it]

### iteration step :  100  rmse :  0.17742927527209104


 56%|█████▌    | 111/200 [02:12<01:48,  1.21s/it]

### iteration step :  110  rmse :  0.1716522696470749


 60%|██████    | 121/200 [02:24<01:35,  1.21s/it]

### iteration step :  120  rmse :  0.16695181946871723


 66%|██████▌   | 131/200 [02:36<01:23,  1.21s/it]

### iteration step :  130  rmse :  0.16305292191997542


 70%|███████   | 141/200 [02:48<01:11,  1.21s/it]

### iteration step :  140  rmse :  0.15976691929679646


 76%|███████▌  | 151/200 [03:00<00:59,  1.21s/it]

### iteration step :  150  rmse :  0.1569598699945732


 80%|████████  | 161/200 [03:12<00:46,  1.20s/it]

### iteration step :  160  rmse :  0.1545339818671543


 86%|████████▌ | 171/200 [03:25<00:35,  1.21s/it]

### iteration step :  170  rmse :  0.15241618551077643


 90%|█████████ | 181/200 [03:37<00:23,  1.22s/it]

### iteration step :  180  rmse :  0.1505508073962831


 96%|█████████▌| 191/200 [03:49<00:10,  1.21s/it]

### iteration step :  190  rmse :  0.14889470913232092


100%|██████████| 200/200 [04:00<00:00,  1.20s/it]


In [10]:
# Label the predicted ratings with the same userId index and title columns
# as the original rating matrix.
ratings_pred_matrix = pd.DataFrame(
    pred_matrix,
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)

In [11]:
# Last five rows of the predicted rating matrix
ratings_pred_matrix.tail()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
606,3.153469,3.536398,3.128222,3.875599,3.821362,1.010045,2.28445,2.287194,4.148796,3.487517,...,1.105429,3.647394,3.342759,2.446343,2.406133,3.613071,1.849264,1.526109,2.860815,0.673099
607,2.502048,3.577547,3.095692,4.081789,4.038996,1.095224,3.698198,1.904581,5.347442,3.11465,...,0.963407,3.230442,3.526719,2.084052,2.119598,4.135631,3.124109,2.362195,3.421672,0.758574
608,2.154503,3.01906,2.679379,3.56755,3.483444,0.909505,2.387003,1.644313,3.090541,3.197815,...,0.818661,3.372644,2.936734,2.049884,2.097775,4.452331,3.504461,1.906708,2.41956,0.701739
609,2.566479,3.285659,2.910122,3.717481,3.66558,1.036884,2.908776,1.844237,2.86899,3.175409,...,1.003584,3.03354,2.943151,2.173251,2.234396,3.839926,2.520343,1.603675,2.970382,0.63688
610,3.951789,3.549175,3.156069,4.050809,4.001964,1.206514,3.086769,2.383306,3.426834,3.763558,...,1.223902,3.977474,3.464065,2.972646,2.538213,4.019379,2.012463,1.528829,4.11926,0.75123


In [12]:
# Last five rows of the actual rating matrix (NaN where no rating exists)
ratings_matrix.tail()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,4.5,3.5,,,
609,,,,,,,,,,,...,,,,,,,,,,
610,4.0,,,,,,,,3.5,,...,,4.0,3.5,3.0,,,2.0,1.5,,


In [13]:
def get_unseen_movies(ratings_matrix, userId):
    """Return the column labels (titles) that the given user has not rated.

    Parameters
    ----------
    ratings_matrix : pandas.DataFrame
        Rating matrix whose index holds user ids and whose columns hold titles.
    userId : index label
        Row label of the user in ``ratings_matrix``.

    Returns
    -------
    list
        Column labels, in column order, for which the user's rating is not
        greater than 0 (this includes NaN entries).
    """
    # The user's row as a Series indexed by title.
    user_rating = ratings_matrix.loc[userId,:]

    # Titles with a rating above 0 are treated as already seen. A set gives
    # constant-time membership tests instead of scanning a list per title.
    already_seen = set(user_rating[ user_rating > 0].index.tolist())

    # Keep every title that is not in the already-seen set.
    return [ movie for movie in ratings_matrix.columns.tolist() if movie not in already_seen]

In [14]:
def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n=10):
    """Return the ``top_n`` highest predicted ratings among ``unseen_list`` for a user.

    The result is a Series taken from row ``userId`` of ``pred_df``, limited
    to the ``unseen_list`` columns and sorted from highest to lowest score.
    """
    candidate_scores = pred_df.loc[userId, unseen_list]
    ranked_scores = candidate_scores.sort_values(ascending=False)
    return ranked_scores.iloc[:top_n]

In [15]:
# Titles that user 9 has not rated yet
unseen_list = get_unseen_movies(ratings_matrix, 9)

# Ten unseen titles with the highest predicted rating for user 9
recomm_movies = recomm_movie_by_userid(ratings_pred_matrix, 9, unseen_list, top_n=10)

# Present the scores as a one-column DataFrame indexed by title
recomm_movies = recomm_movies.to_frame(name='pred_score')
recomm_movies

Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
Rear Window (1954),5.704612
"South Park: Bigger, Longer and Uncut (1999)",5.4511
Rounders (1998),5.298393
Blade Runner (1982),5.244951
Roger & Me (1989),5.191962
Gattaca (1997),5.183179
Ben-Hur (1959),5.130463
Rosencrantz and Guildenstern Are Dead (1990),5.087375
"Big Lebowski, The (1998)",5.03869
Star Wars: Episode V - The Empire Strikes Back (1980),4.989601


In [66]:
# Titles that user 9 actually rated 5.0
ratings_matrix.loc[9].loc[lambda s: s == 5.].sort_values(ascending=False)

title
Adaptation (2002)                                                                 5.0
Austin Powers in Goldmember (2002)                                                5.0
Back to the Future (1985)                                                         5.0
Citizen Kane (1941)                                                               5.0
Lord of the Rings: The Fellowship of the Ring, The (2001)                         5.0
Lord of the Rings: The Two Towers, The (2002)                                     5.0
Producers, The (1968)                                                             5.0
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)    5.0
Name: 9, dtype: float64

In [67]:
# Predicted ratings for user 9 that fall strictly between 4.8 and 5.2
ratings_pred_matrix.loc[9].loc[lambda s: (s > 4.8) & (s < 5.2)].sort_values(ascending=False)

title
Roger & Me (1989)                                                                 5.191962
Gattaca (1997)                                                                    5.183179
Ben-Hur (1959)                                                                    5.130463
Rosencrantz and Guildenstern Are Dead (1990)                                      5.087375
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)    5.054309
Lord of the Rings: The Fellowship of the Ring, The (2001)                         5.052807
Big Lebowski, The (1998)                                                          5.038690
Star Wars: Episode V - The Empire Strikes Back (1980)                             4.989601
Adaptation (2002)                                                                 4.988033
Citizen Kane (1941)                                                               4.986151
Producers, The (1968)                                                             4.

In [None]:
# Caution: .loc selects by index label, whereas .iloc selects by 0-based integer position

In [60]:
ratings_pred_matrix.iloc[1] # position-based, counting from 0: this is the second row

title
'71 (2014)                                   3.170119
'Hellboy': The Seeds of Creation (2004)      3.657992
'Round Midnight (1986)                       3.308707
'Salem's Lot (2004)                          4.166521
'Til There Was You (1997)                    4.311890
                                               ...   
eXistenZ (1999)                              4.232789
xXx (2002)                                   2.911602
xXx: State of the Union (2005)               1.634576
¡Three Amigos! (1986)                        4.135735
À nous la liberté (Freedom for Us) (1931)    0.725684
Name: 2, Length: 9719, dtype: float64

In [61]:
ratings_pred_matrix.loc[1] # label-based: the row whose index label is 1

title
'71 (2014)                                   3.055084
'Hellboy': The Seeds of Creation (2004)      4.092018
'Round Midnight (1986)                       3.564130
'Salem's Lot (2004)                          4.502167
'Til There Was You (1997)                    3.981215
                                               ...   
eXistenZ (1999)                              3.475076
xXx (2002)                                   3.253458
xXx: State of the Union (2005)               2.161087
¡Three Amigos! (1986)                        4.010495
À nous la liberté (Freedom for Us) (1931)    0.859474
Name: 1, Length: 9719, dtype: float64

In [57]:
# First five rows of the predicted rating matrix
ratings_pred_matrix.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.055084,4.092018,3.56413,4.502167,3.981215,1.271694,3.603274,2.333266,5.091749,3.972454,...,1.402608,4.208382,3.705957,2.720514,2.787331,3.475076,3.253458,2.161087,4.010495,0.859474
2,3.170119,3.657992,3.308707,4.166521,4.31189,1.275469,4.237972,1.900366,3.392859,3.647421,...,0.973811,3.528264,3.361532,2.672535,2.404456,4.232789,2.911602,1.634576,4.135735,0.725684
3,2.307073,1.658853,1.443538,2.208859,2.229486,0.78076,1.997043,0.924908,2.9707,2.551446,...,0.520354,1.709494,2.281596,1.782833,1.635173,1.323276,2.88758,1.042618,2.29389,0.396941
4,2.628629,3.03555,2.575746,3.706912,3.430636,0.706441,3.33028,1.978826,4.560368,2.77571,...,1.046116,2.912178,2.479592,2.231915,1.888629,2.211364,0.645603,1.585734,3.542892,0.59154
5,2.116148,3.084761,2.747679,3.78349,3.94699,0.883259,1.958953,1.757317,2.054312,2.775258,...,0.956159,3.893975,2.717024,2.002443,2.053337,3.983639,2.099626,1.423718,2.490428,0.531403
