In [1]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
df_ratings  = pd.read_csv('./test_data/ratings.csv')
df_movies  = pd.read_csv('./test_data/movies.csv')

In [3]:
df_user_movie_ratings = df_ratings.pivot(index='userId',columns='movieId',values='rating').fillna(0)

In [22]:
df_user_movie_ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
670,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# matrix는 pivot_table 값을 numpy matrix로 만든 것 
matrix = df_user_movie_ratings.values

# user_ratings_mean은 사용자의 평균 평점 (axis=1은 각 행의 평균을 계산)
user_ratings_mean = np.mean(matrix, axis = 1)

# R_user_mean : 사용자-영화에 대해 사용자 평균 평점을 뺀 것.
# reshape(-1, 1) : -1 : 1차원 배열을 2차원 배열로 바꿀 수 있음(해당 차원의 크기를 자동으로 계산, 1 : 해당 차원의 크기를 1로 만들어라)
matrix_user_mean = matrix - user_ratings_mean.reshape(-1, 1)

<class 'numpy.ndarray'>


In [6]:
pd.DataFrame(matrix_user_mean, columns = df_user_movie_ratings.columns).head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
0,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,...,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625
1,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,3.97077,...,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923
2,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,...,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075
3,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,3.902162,...,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838
4,-0.043128,-0.043128,3.956872,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,...,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128


In [20]:
type(matrix_user_mean)

numpy.ndarray

In [7]:
# scipy에서 제공해주는 svd.  
# U 행렬, sigma 행렬, V 전치 행렬을 반환.
U, sigma, Vt = svds(matrix_user_mean, k = 12)

In [8]:
# 특이값 벡터를 대각 행렬 형태로 변환해줌
sigma = np.diag(sigma)

In [9]:
# U, Sigma, Vt의 내적을 수행하면, 다시 원본 행렬로 복원이 된다. 
# 거기에 + 사용자 평균 rating을 적용한다. 
svd_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

In [10]:
df_svd_preds = pd.DataFrame(svd_user_predicted_ratings, columns = df_user_movie_ratings.columns)

In [11]:
df_svd_preds

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
0,-0.079686,0.021779,-0.013837,-0.005870,-0.028877,0.032371,0.000715,-0.004428,-0.005219,0.038195,...,-0.004324,-0.004352,0.010478,-0.004256,-0.003944,-0.005674,0.018157,-0.005575,-0.005297,-0.003766
1,1.428452,1.608841,0.529476,0.168278,0.520809,1.107473,0.529719,0.089376,0.296270,1.970031,...,0.013227,-0.002275,0.020680,-0.005245,-0.007644,-0.021019,0.031243,-0.000957,-0.000753,0.026901
2,0.977246,0.396971,0.000299,0.027444,0.021287,0.141458,-0.057134,0.031633,-0.012538,0.383576,...,0.002761,0.004907,-0.014190,-0.000251,-0.006007,-0.003189,-0.026916,0.014637,0.013287,-0.005741
3,1.870844,1.169993,0.252202,0.094831,-0.181713,-0.511953,-0.027820,-0.143080,0.013247,1.461694,...,0.026412,-0.027245,0.054681,0.018450,0.034544,-0.035740,0.088889,-0.019365,-0.017113,0.066559
4,1.182777,0.924903,0.075998,0.061505,0.602680,-0.159825,0.339925,0.081534,-0.079666,0.535018,...,-0.029124,-0.029357,0.009064,-0.029092,-0.030890,-0.057453,0.026344,-0.024027,-0.024614,-0.032752
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
666,1.577140,1.002294,0.699893,0.157323,0.631404,1.447100,0.658630,0.027763,0.347034,1.393400,...,-0.000740,0.005554,-0.022873,0.002715,0.005142,0.009042,-0.033241,-0.003929,-0.003212,0.000658
667,0.405596,0.011198,0.022474,0.025968,-0.015645,0.222755,-0.074666,0.000733,0.001368,0.074340,...,-0.004024,0.005189,-0.008046,0.005195,0.004814,-0.007049,-0.015048,0.005298,0.005562,-0.014007
668,0.360704,-0.000016,0.086261,0.012227,0.090441,0.053472,0.028401,-0.010500,0.008004,-0.021811,...,-0.002954,-0.000573,-0.012094,0.003460,0.005672,-0.002675,-0.017659,-0.001340,-0.001189,-0.004946
669,1.054516,0.265079,0.223782,0.054706,0.189281,0.584825,0.129335,0.017404,0.088330,0.435521,...,-0.002090,0.004452,-0.039603,0.003322,0.002830,0.001331,-0.061556,0.005344,0.004928,-0.008484


In [12]:
def recommend_movies(df_svd_preds, user_id, ori_movies_df, ori_ratings_df, num_recommendations=5):
    
    # 원본 평점 데이터에서 user id에 해당하는 데이터를 뽑아낸다. (사용자가 이미 평가한 영화 평점을 가져옴)
    user_data = ori_ratings_df[ori_ratings_df.userId == user_id]

    # 위에서 뽑은 user_data와 원본 영화 데이터를 합친다. (사용자의 평가 기록을 원본 데이터와 병합하여 사용자가 이미 시청한 영화 목록을 가져와 평점이 높은 순으로 정렬)
    user_history = user_data.merge(ori_movies_df, on = 'movieId').sort_values(['rating'], ascending=False)

    # 현재는 index로 적용이 되어있으므로 user_id - 1을 해야함.
    user_row_number = user_id - 1 
    
    # SVD를 통해 예측된 사용자의 영화 평점을 기반으로, 사용자 index에 따라 영화 평점이 높은 순으로 영화 데이터 정렬
    sorted_user_predictions = df_svd_preds.iloc[user_row_number].sort_values(ascending=False)
    
    # 원본 영화 데이터에서 사용자가 본 영화 데이터를 제외한 데이터를 추출 (isin : movieid가 있는지 여부, ~가 부정 연산자)
    recommendations = ori_movies_df[~ori_movies_df['movieId'].isin(user_history['movieId'])]

    # 사용자의 영화 평점이 높은 순으로 정렬된 데이터와 위 recommendations을 합친다. 
    recommendations = recommendations.merge(pd.DataFrame(sorted_user_predictions).reset_index(), on = 'movieId')
    
    # 컬럼 이름 바꾸고 정렬해서 return
    recommendations = recommendations.rename(columns = {user_row_number: 'Predictions'}).sort_values('Predictions', ascending = False).iloc[:num_recommendations, :]
                      
    return user_history, recommendations

In [13]:
already_rated, predictions = recommend_movies(df_svd_preds, 10, df_movies, df_ratings, 100)

In [18]:
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [14]:
predictions

Unnamed: 0,movieId,title,genres,Predictions
230,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,1.694994
1006,1270,Back to the Future (1985),Adventure|Comedy|Sci-Fi,1.185642
517,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,1.010450
688,858,"Godfather, The (1972)",Crime|Drama,0.955112
1561,2028,Saving Private Ryan (1998),Action|Drama|War,0.943250
...,...,...,...,...
1024,1288,This Is Spinal Tap (1984),Comedy,0.323532
978,1242,Glory (1989),Drama|War,0.322991
2910,3702,Mad Max (1979),Action|Adventure|Sci-Fi,0.316233
1487,1954,Rocky (1976),Drama,0.313612


In [15]:
def evaluate_recommendations(user_id, recommendations, actual_ratings, k=20):

    # 사용자가 실제로 높게 평가한 영화들
    actual_liked_movies = actual_ratings[actual_ratings['userId'] == user_id]
    actual_liked_movies = actual_liked_movies[actual_liked_movies['rating'] >= 1.0]  # 평점 4 이상만 추출
    actual_liked_set = set(actual_liked_movies['movieId'])
    
    # 추천된 영화들
    recommended_set = set(recommendations['movieId'])
    
    # 정확도 계산
    precision = len(recommended_set & actual_liked_set) / len(recommended_set) if len(recommended_set) > 0 else 0
    recall = len(recommended_set & actual_liked_set) / len(actual_liked_set) if len(actual_liked_set) > 0 else 0
    
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return {"precision": precision, "recall": recall, "f1_score": f1_score}

# 추천 결과 평가
evaluation_results = evaluate_recommendations(10, predictions, df_ratings)
print(evaluation_results)


{'precision': 0.0, 'recall': 0.0, 'f1_score': 0}
