In [1]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the ratings table first, then the movie metadata table, from the
# local test_data directory.
df_ratings, df_movies = (
    pd.read_csv(csv_path)
    for csv_path in ('./test_data/ratings.csv', './test_data/movies.csv')
)

In [3]:
# Reshape the long ratings table into a user x movie matrix: one row per
# userId, one column per movieId. Movies a user has not rated become 0.
df_user_movie_ratings = (
    df_ratings
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)

In [4]:
# Dense NumPy view of the pivot table (users in rows, movies in columns).
matrix = df_user_movie_ratings.values

# Per-user row mean.
# NOTE(review): the mean runs over every movie column, including the
# zero-filled "not rated" cells, so it is much smaller than the user's average
# over the movies they actually rated — confirm this is intended.
user_ratings_mean = matrix.mean(axis=1)

# Centre every row on its own mean; the column vector broadcasts across movies.
matrix_user_mean = matrix - user_ratings_mean[:, np.newaxis]

In [5]:
# Preview the first rows of the centred matrix with movieId column labels.
# No index is passed, so rows are labelled by position (0, 1, ...), not userId.
pd.DataFrame(
    matrix_user_mean,
    columns=df_user_movie_ratings.columns,
).head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
0,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,...,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625,-0.005625
1,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,3.97077,...,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923,-0.02923
2,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,...,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075,-0.020075
3,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,3.902162,...,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838,-0.097838
4,-0.043128,-0.043128,3.956872,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,...,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128,-0.043128


In [6]:
# Truncated SVD from scipy, keeping k = 12 singular triplets.
# It returns U, the singular values as a 1-D array (turned into a diagonal
# matrix in a later cell), and V transposed.
U, sigma, Vt = svds(
    matrix_user_mean,
    k=12,
)

In [7]:
# Expand the 1-D singular values into a diagonal matrix for the reconstruction.
# The ndim guard keeps this cell idempotent: np.diag applied to an already 2-D
# sigma would extract its diagonal back into a 1-D array, so re-running the
# cell would otherwise silently undo the conversion.
if np.ndim(sigma) == 1:
    sigma = np.diag(sigma)

In [8]:
# Multiply the three factors back together. With a truncated SVD this yields a
# low-rank approximation of the centred matrix rather than an exact copy.
# Each user's mean is then added back to undo the earlier centring.
svd_user_predicted_ratings = U @ sigma @ Vt + user_ratings_mean[:, np.newaxis]

In [9]:
# Wrap the predicted ratings in a DataFrame with movieId column labels.
# Rows keep the default positional index (0, 1, ...) rather than userId labels.
df_svd_preds = pd.DataFrame(
    data=svd_user_predicted_ratings,
    columns=df_user_movie_ratings.columns,
)

In [16]:
def recommend_movies(df_svd_preds, user_id, ori_movies_df, ori_ratings_df, num_recommendations=5):
    """Return a user's rating history and their best-predicted unseen movies.

    Parameters
    ----------
    df_svd_preds : pandas.DataFrame
        Predicted ratings, one row per user and one column per movieId.
        If the index is named 'userId' the row is looked up by label;
        otherwise row ``user_id - 1`` is used, which assumes rows follow
        contiguous userIds starting at 1.
    user_id : int
        The user to produce recommendations for.
    ori_movies_df : pandas.DataFrame
        Movie metadata; must contain a 'movieId' column.
    ori_ratings_df : pandas.DataFrame
        Raw ratings; must contain 'userId', 'movieId' and 'rating' columns.
    num_recommendations : int, default 5
        Maximum number of recommended movies to return.

    Returns
    -------
    (user_history, recommendations) : tuple of pandas.DataFrame
        ``user_history`` is the user's ratings joined with the movie metadata,
        highest rating first. ``recommendations`` holds movies the user has not
        rated, with a 'Predictions' column, highest prediction first.

    Raises
    ------
    IndexError
        If the positional lookup falls outside ``df_svd_preds``. This includes
        ``user_id <= 0``, which would otherwise wrap around to the last rows.
    KeyError
        If the index is labelled by userId and ``user_id`` is not present.
    """
    if df_svd_preds.index.name == 'userId':
        # Predictions are labelled by userId: look the row up directly.
        user_predictions = df_svd_preds.loc[user_id]
    else:
        # Positional layout: userId n lives in row n - 1.
        user_row_number = user_id - 1
        if not 0 <= user_row_number < len(df_svd_preds):
            raise IndexError(
                f"user_id {user_id} is outside the prediction matrix "
                f"({len(df_svd_preds)} rows)"
            )
        user_predictions = df_svd_preds.iloc[user_row_number]

    # Name the column explicitly instead of relying on the Series name
    # happening to equal the row number.
    predictions = (
        user_predictions
        .rename('Predictions')
        .rename_axis('movieId')
        .reset_index()
    )

    # Everything this user has rated, joined with the movie metadata.
    user_data = ori_ratings_df[ori_ratings_df.userId == user_id]
    user_history = (
        user_data
        .merge(ori_movies_df, on='movieId')
        .sort_values(['rating'], ascending=False)
    )

    # Candidate movies are those the user has not rated yet.
    unseen_movies = ori_movies_df[~ori_movies_df['movieId'].isin(user_history['movieId'])]

    # Attach the predicted rating and keep the highest-scoring candidates.
    recommendations = (
        unseen_movies
        .merge(predictions, on='movieId')
        .sort_values('Predictions', ascending=False)
        .iloc[:num_recommendations, :]
    )

    return user_history, recommendations

In [17]:
# Top 10 recommendations for user 10, plus the movies that user already rated.
already_rated, predictions = recommend_movies(
    df_svd_preds,
    user_id=10,
    ori_movies_df=df_movies,
    ori_ratings_df=df_ratings,
    num_recommendations=10,
)

9125


In [21]:
def evaluate_recommendations(user_id, recommendations, actual_ratings, k=20, min_rating=1.0):
    """Compute precision, recall and F1 of a recommendation list for one user.

    A movie counts as "liked" when the user rated it at least ``min_rating``.
    The metrics compare the liked movies against the first ``k`` recommended
    movies. The result is only informative when ``actual_ratings`` contains
    ratings for movies that were eligible to be recommended (e.g. a held-out
    set); otherwise the overlap is empty by construction.

    Parameters
    ----------
    user_id : int
        The user whose recommendations are being scored.
    recommendations : pandas.DataFrame
        Recommended movies with a 'movieId' column, best recommendation first.
    actual_ratings : pandas.DataFrame
        Ratings with 'userId', 'movieId' and 'rating' columns.
    k : int or None, default 20
        Only the first ``k`` recommendations are scored; ``None`` scores all.
    min_rating : float, default 1.0
        Minimum rating for a movie to count as liked. The default keeps the
        existing cut-off; pass e.g. 4.0 for a stricter definition.

    Returns
    -------
    dict
        ``{'precision': float, 'recall': float, 'f1_score': float}``. A metric
        whose denominator would be zero is reported as 0.0.
    """
    # Movies this user rated at or above the threshold.
    user_ratings = actual_ratings[actual_ratings['userId'] == user_id]
    liked_set = set(user_ratings.loc[user_ratings['rating'] >= min_rating, 'movieId'])

    # The top-k recommended movies.
    recommended_ids = recommendations['movieId']
    if k is not None:
        recommended_ids = recommended_ids.head(k)
    recommended_set = set(recommended_ids)

    hits = len(recommended_set & liked_set)

    # Guard each ratio: an empty recommendation list or a user with no
    # qualifying ratings must yield 0.0 rather than a ZeroDivisionError.
    precision = hits / len(recommended_set) if recommended_set else 0.0
    recall = hits / len(liked_set) if liked_set else 0.0
    total = precision + recall
    f1_score = 2 * (precision * recall) / total if total > 0 else 0.0

    return {
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score
    }

# Score the recommendations for user 10 against that user's ratings.
# NOTE(review): `predictions` comes from recommend_movies, which removes every
# movie the user has already rated in df_ratings, and the same df_ratings is
# used here as ground truth. The overlap is therefore always empty and all
# metrics come out as 0; evaluate against held-out ratings instead.
evaluation_results = evaluate_recommendations(
    user_id=10,
    recommendations=predictions,
    actual_ratings=df_ratings,
)
print(evaluation_results)

{'precision': 0.0, 'recall': 0.0, 'f1_score': 0}


In [15]:
# Display the recommended movies for user 10 with their predicted ratings.
predictions

Unnamed: 0,movieId,title,genres,Predictions
230,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,1.694994
1006,1270,Back to the Future (1985),Adventure|Comedy|Sci-Fi,1.185642
517,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,1.01045
688,858,"Godfather, The (1972)",Crime|Drama,0.955112
1561,2028,Saving Private Ryan (1998),Action|Drama|War,0.94325
954,1214,Alien (1979),Horror|Sci-Fi,0.915187
264,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,0.910561
519,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,0.861366
2249,2858,American Beauty (1999),Drama|Romance,0.836982
3823,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy,0.804188


In [18]:
# Number of rows in user 10's rating history (ratings joined with movie metadata).
len(already_rated)

46