In [18]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [19]:
rating_data = pd.read_csv('./test_data/ratings.csv')
movie_data = pd.read_csv('./test_data/movies.csv')

In [20]:
rating_data.drop('timestamp', axis = 1, inplace = True)
rating_data.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [21]:
movie_data.drop('genres', axis = 1, inplace = True)
movie_data.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [22]:
user_movie_data = pd.merge(rating_data, movie_data, on = 'movieId')
user_movie_data.head()

Unnamed: 0,userId,movieId,rating,title
0,1,31,2.5,Dangerous Minds (1995)
1,7,31,3.0,Dangerous Minds (1995)
2,31,31,4.0,Dangerous Minds (1995)
3,32,31,4.0,Dangerous Minds (1995)
4,36,31,3.0,Dangerous Minds (1995)


In [23]:
user_movie_rating = user_movie_data.pivot_table('rating', index = 'userId', columns='title').fillna(0)

In [24]:
# Hyper Parameter Setting
r_lambda = 40
nf = 600
alpha = 40
from tqdm import tqdm
import numpy as np


# sample rating matrix
# R = np.array([[0, 0, 0, 4, 4, 0, 0, 0, 0, 0, 0],
#               [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
#               [0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 0],
#               [0, 3, 4, 0, 3, 0, 0, 2, 2, 0, 0],
#               [0, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0],
#               [0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 0],
#               [0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5],
#               [0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4],
#               [0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 0],
#               [0, 0, 0, 3, 0, 0, 0, 0, 4, 5, 0]])

R = user_movie_rating #shape = (671, 9064)

nu = R.shape[0] #671
ni = R.shape[1] #9064

# initialize X and Y with very small values
X = np.random.rand(nu, nf) * 0.01 #shape = (671, 600)
Y = np.random.rand(ni, nf) * 0.01 #shape = (9064, 600)

P = np.copy(R)
P[P > 0] = 1
C = 1 + alpha * R #alpha = 40
C = C.to_numpy()
# Define Loss Function
def loss_function(C, P, xTy, X, Y, r_lambda):
    predict_error = np.square(P - xTy)
    confidence_error = np.sum(C * predict_error)
    regularization = r_lambda * (np.sum(np.square(X)) + np.sum(np.square(Y)))
    total_loss = confidence_error + regularization
    return np.sum(predict_error), confidence_error, regularization, total_loss

# Define User Opimizer Function
def optimize_user(X, Y, C, P, nu, nf, r_lambda):
    # Y = number of Items, shape = (9064, 600)
    yT = np.transpose(Y) #shape = (600, 9064)
    for u in range(nu):
        Cu = np.diag(C[u]) #shape = (9064, 9064)
        yT_Cu_y = np.matmul(np.matmul(yT, Cu), Y) #{(600, 9064)*(9064, 9064)}*(9064, 600) = (600, 600)
        lI = np.dot(r_lambda, np.identity(nf))
        yT_Cu_pu = np.matmul(np.matmul(yT, Cu), P[u])
        X[u] = np.linalg.solve(yT_Cu_y + lI, yT_Cu_pu)

# Define Item Opimizer Function
def optimize_item(X, Y, C, P, ni, nf, r_lambda):
    # X = number of users, shape = (671, 600)
    xT = np.transpose(X) #shape = (600, 671)
    for i in range(ni):
        Ci = np.diag(C[:, i]) #shape = (671, 671)
        xT_Ci_x = np.matmul(np.matmul(xT, Ci), X) #{(600, 671)*(671, 671)}*(671, 600) = (600, 600)
        lI = np.dot(r_lambda, np.identity(nf))
        xT_Ci_pi = np.matmul(np.matmul(xT, Ci), P[:, i])
        Y[i] = np.linalg.solve(xT_Ci_x + lI, xT_Ci_pi)

# Run Learning
predict_errors = []
confidence_errors = []
regularization_list = []
total_losses = []

for i in tqdm(range(50)):
    if i%2 == 0:   
        optimize_user(X, Y, C, P, nu, nf, r_lambda)
        optimize_item(X, Y, C, P, ni, nf, r_lambda)
    else:
        optimize_item(X, Y, C, P, ni, nf, r_lambda)
        optimize_user(X, Y, C, P, nu, nf, r_lambda)
        
    predict = np.matmul(X, np.transpose(Y))
    predict_error, confidence_error, regularization, total_loss = loss_function(C, P, predict, X, Y, r_lambda)
    
    predict_errors.append(predict_error)
    confidence_errors.append(confidence_error)
    regularization_list.append(regularization)
    total_losses.append(total_loss)

# Make Result Graph
from matplotlib import pyplot as plt
%matplotlib inline

plt.subplots_adjust(wspace=100.0, hspace=20.0)
fig = plt.figure()
fig.set_figheight(10)
fig.set_figwidth(10)
predict_error_line = fig.add_subplot(2, 2, 1)
confidence_error_line = fig.add_subplot(2, 2, 2)
regularization_error_line = fig.add_subplot(2, 2, 3)
total_loss_line = fig.add_subplot(2, 2, 4)

predict_error_line.set_title("Predict Error") 
predict_error_line.plot(predict_errors)

confidence_error_line.set_title("Confidence Error")
confidence_error_line.plot(confidence_errors)

regularization_error_line.set_title("Regularization")
regularization_error_line.plot(regularization_list)

total_loss_line.set_title("Total Loss")
total_loss_line.plot(total_losses)
plt.show()

  0%|          | 0/50 [01:13<?, ?it/s]


KeyboardInterrupt: 

In [45]:
# ALS 알고리즘 종료 후
# 1. 예측 행렬 생성 2. 추천 대상 선정 3. 추천
def recommend_movies(predict_matrix, user_id, original_ratings, num_recommendations=5):
    # 사용자의 평가 데이터를 가져옴
    user_rating = original_ratings.iloc[user_id].values
    unrated_movies = np.where(user_rating == 0)[0]
    rated_movies = np.where(user_rating == 1)[0]
    #print(len(rated_movies))
    #print(len(unrated_movies))

    # 평가하지 않은 영화에 대한 예측 점수 추출
    recommendations = predict_matrix[user_id, unrated_movies]

    # 가장 높은 예측 점수를 가진 영화 인덱스 추출
    recommended_movie_indices = np.argsort(recommendations)[::-1][:num_recommendations]
    top_movie_indices = np.argsort(predict[user_id])[-5:][::-1]
    top_movie_scores = predict[user_id][top_movie_indices]
    
    #print("Top 5 predicted scores and their movie indices:")
    #print(list(zip(top_movie_indices, top_movie_scores)))

    # 영화 ID 추출
    recommended_movie_ids = unrated_movies[recommended_movie_indices]

    # 영화 제목 매핑
    recommended_movies = movie_data[movie_data['movieId'].isin(recommended_movie_ids)]
    missing_ids = set(recommended_movie_ids) - set(recommended_movies['movieId'])
    
    if missing_ids:
        print(f"Missing movie IDs from movie_data: {missing_ids}")

    return recommended_movies[['movieId', 'title']]
    # recommended_movies = movie_data.loc[movie_data['movieId'].isin(recommended_movie_ids)]
    # print(recommend_movies)
    # return recommended_movies[['movieId', 'title']]

# 예측 행렬 생성
predict = np.matmul(X, Y.T)

# 사용자 ID 예시 (예: 첫 번째 사용자)
user_id = 10
recommended_movies = recommend_movies(predict, user_id, R, num_recommendations=20)
recommended_movies


Missing movie IDs from movie_data: {8005, 7847, 1383, 5545, 6346, 1166, 6833, 7860, 7670, 7703, 8446}


Unnamed: 0,movieId,title
478,534,Shadowlands (1993)
1092,1348,"Nosferatu (Nosferatu, eine Symphonie des Graue..."
1306,1642,Indian Summer (a.k.a. Alive & Kicking) (1996)
2402,2991,Live and Let Die (1973)
3247,4052,Antitrust (2001)
3897,5027,Another 48 Hrs. (1990)
4634,6424,Oscar (1991)
4678,6516,Anastasia (1956)
4959,7055,Swing Time (1936)


In [52]:
def print_diagnostics(user_id, recommendations, actual_ratings):
    print(f"추천된 영화 수: {len(recommendations)}")
    if len(recommendations) > 0:
        print("추천된 영화 IDs:", recommendations['movieId'].tolist())
    
    actual_liked_movies = actual_ratings[(actual_ratings['userId'] == user_id) & (actual_ratings['rating'] >= 4.0)]
    print(f"사용자 {user_id}가(이) 평점 4 이상으로 평가한 영화 수: {len(actual_liked_movies)}")
    if len(actual_liked_movies) > 0:
        print("사용자가 높게 평가한 영화 IDs:", actual_liked_movies['movieId'].tolist())

# 추천 결과 및 실제 사용자 평가 데이터 진단
print_diagnostics(user_id, recommended_movies, rating_data)

추천된 영화 수: 9
추천된 영화 IDs: [534, 1348, 1642, 2991, 4052, 5027, 6424, 6516, 7055]
사용자 10가(이) 평점 4 이상으로 평가한 영화 수: 28
사용자가 높게 평가한 영화 IDs: [50, 152, 318, 345, 735, 1127, 1196, 1197, 1198, 1200, 1210, 1240, 1291, 1358, 1423, 1611, 1704, 1719, 1923, 2344, 2406, 2539, 2571, 2826, 2841, 2890, 2926, 3019]
