In [1]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import ray
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [2]:
# Read the ratings table and the movie table from the local test_data folder.
rating_data, movie_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./test_data/ratings.csv', './test_data/movies.csv')
)

In [3]:
# Reassign instead of dropping in place, and tolerate an already-missing
# column, so re-running this cell does not raise KeyError.
rating_data = rating_data.drop(columns='timestamp', errors='ignore')
rating_data.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [4]:
# Reassign instead of dropping in place, and tolerate an already-missing
# column, so re-running this cell does not raise KeyError.
movie_data = movie_data.drop(columns='genres', errors='ignore')
movie_data.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [5]:
# Inner-join every rating with its movie row on the shared movieId column.
user_movie_data = rating_data.merge(movie_data, how='inner', on='movieId')
user_movie_data.head()

Unnamed: 0,userId,movieId,rating,title
0,1,31,2.5,Dangerous Minds (1995)
1,7,31,3.0,Dangerous Minds (1995)
2,31,31,4.0,Dangerous Minds (1995)
3,32,31,4.0,Dangerous Minds (1995)
4,36,31,3.0,Dangerous Minds (1995)


In [6]:
# One row per userId, one column per title; cells without a rating become 0.
rating_by_user_and_title = user_movie_data.pivot_table(
    values='rating',
    index='userId',
    columns='title',
)
user_movie_rating = rating_by_user_and_title.fillna(0)

In [9]:
# Hyper Parameter Setting
# Stop any Ray instance left over from a previous run of this cell so that
# the ray.init() call below does not fail on re-execution.
ray.shutdown()

r_lambda = 40  # regularization weight
nf = 600       # number of latent factors
alpha = 40     # scale applied to R when building the confidence matrix C

R = user_movie_rating #shape = (671, 9064)

nu = R.shape[0] #671
ni = R.shape[1] #9064

# initialize X and Y with very small values
# A seeded generator makes the starting point (and therefore the whole run)
# reproducible after Restart Kernel -> Run All.
SEED = 42
rng = np.random.default_rng(SEED)
X = rng.random((nu, nf)) * 0.01 #shape = (671, 600)
Y = rng.random((ni, nf)) * 0.01 #shape = (9064, 600)

# P: copy of R as an ndarray with every positive entry replaced by 1
P = np.copy(R)
P[P > 0] = 1
# C: 1 + alpha * R, converted to an ndarray
C = 1 + alpha * R #alpha = 40
C = C.to_numpy()

# Initialize Ray
ray.init()

# Loss function (registered as a Ray remote function)
@ray.remote
def loss_function(C, P, xTy, X, Y, r_lambda):
    """Evaluate the loss terms for the current factor matrices.

    Args:
        C: weights multiplied element-wise with the squared error.
        P: target values that ``xTy`` is compared against.
        xTy: predicted values, subtracted element-wise from ``P``.
        X: first factor matrix; only the sum of its squared entries is used.
        Y: second factor matrix; only the sum of its squared entries is used.
        r_lambda: multiplier applied to the regularization term.

    Returns:
        A 4-tuple ``(sum of squared errors, C-weighted squared error,
        regularization term, weighted error + regularization)``.
    """
    squared_residual = np.square(P - xTy)
    weighted_sq_error = np.sum(C * squared_residual)
    factor_norms = np.sum(np.square(X)) + np.sum(np.square(Y))
    penalty = r_lambda * factor_norms
    return (
        np.sum(squared_residual),
        weighted_sq_error,
        penalty,
        weighted_sq_error + penalty,
    )

@ray.remote
def optimize_user(X, Y, C, P, nu, nf, r_lambda):
    """Re-solve every row of ``X`` with ``Y`` held fixed.

    For each ``u`` in ``range(nu)`` the row ``X[u]`` is replaced by the
    solution ``x`` of ``(Y^T C_u Y + r_lambda * I) x = Y^T C_u P[u]``, where
    ``C_u`` denotes the diagonal matrix whose diagonal is ``C[u]``.

    Args:
        X: array whose first ``nu`` rows are recomputed (not modified in place).
        Y: array held fixed during this step.
        C: 2-D array; row ``u`` supplies the per-column weights for row ``u``.
        P: 2-D array; row ``u`` is the right-hand-side target for row ``u``.
        nu: number of rows of ``X`` to solve.
        nf: number of factors (size of the identity used for regularization).
        r_lambda: regularization weight added to the diagonal.

    Returns:
        A new array shaped like ``X`` holding the updated rows.
    """
    # Work on a copy: the argument itself must not be written to (arrays that
    # Ray hands to a task from its object store are read-only).
    X = X.copy()
    yT = np.transpose(Y)
    # The regularization matrix does not depend on u, so build it once.
    lI = r_lambda * np.identity(nf)
    for u in range(nu):
        # Scaling column k of yT by C[u, k] equals yT @ diag(C[u]) but avoids
        # materializing the dense square diagonal matrix and multiplying by it.
        yT_Cu = yT * C[u]
        yT_Cu_y = np.matmul(yT_Cu, Y)
        yT_Cu_pu = np.matmul(yT_Cu, P[u])
        X[u] = np.linalg.solve(yT_Cu_y + lI, yT_Cu_pu)
    return X

@ray.remote
def optimize_item(X, Y, C, P, ni, nf, r_lambda):
    """Re-solve every row of ``Y`` with ``X`` held fixed.

    For each ``i`` in ``range(ni)`` the row ``Y[i]`` is replaced by the
    solution ``y`` of ``(X^T C_i X + r_lambda * I) y = X^T C_i P[:, i]``,
    where ``C_i`` denotes the diagonal matrix whose diagonal is ``C[:, i]``.

    Args:
        X: array held fixed during this step.
        Y: array whose first ``ni`` rows are recomputed (not modified in place).
        C: 2-D array; column ``i`` supplies the per-row weights for row ``i`` of ``Y``.
        P: 2-D array; column ``i`` is the right-hand-side target for row ``i`` of ``Y``.
        ni: number of rows of ``Y`` to solve.
        nf: number of factors (size of the identity used for regularization).
        r_lambda: regularization weight added to the diagonal.

    Returns:
        A new array shaped like ``Y`` holding the updated rows.
    """
    # Work on a copy: the argument itself must not be written to (arrays that
    # Ray hands to a task from its object store are read-only).
    Y = Y.copy()
    xT = np.transpose(X)
    # The regularization matrix does not depend on i, so build it once.
    lI = r_lambda * np.identity(nf)
    for i in range(ni):
        # Scaling column k of xT by C[k, i] equals xT @ diag(C[:, i]) but avoids
        # materializing the dense square diagonal matrix and multiplying by it.
        xT_Ci = xT * C[:, i]
        xT_Ci_x = np.matmul(xT_Ci, X)
        xT_Ci_pi = np.matmul(xT_Ci, P[:, i])
        Y[i] = np.linalg.solve(xT_Ci_x + lI, xT_Ci_pi)
    return Y


# Run the training loop
predict_errors = []
confidence_errors = []
regularization_list = []
total_losses = []

# C and P never change between iterations: store them in Ray's object store
# once and pass the references, instead of re-serializing both matrices for
# every task submission.
C_ref = ray.put(C)
P_ref = ray.put(P)

for i in tqdm(range(50)):
    # Alternate which factor matrix is solved first. The second task receives
    # the first task's ObjectRef as an argument, so Ray runs it only after the
    # first has finished and hands it the freshly updated factors; passing the
    # stale array to both tasks would make the two branches identical.
    if i % 2 == 0:
        X_id = optimize_user.remote(X, Y, C_ref, P_ref, nu, nf, r_lambda)
        Y_id = optimize_item.remote(X_id, Y, C_ref, P_ref, ni, nf, r_lambda)
    else:
        Y_id = optimize_item.remote(X, Y, C_ref, P_ref, ni, nf, r_lambda)
        X_id = optimize_user.remote(X, Y_id, C_ref, P_ref, nu, nf, r_lambda)

    X, Y = ray.get([X_id, Y_id])

    predict = np.matmul(X, np.transpose(Y))
    # loss_function is a Ray remote function: it cannot be called directly and
    # must be invoked with .remote(), then resolved with ray.get().
    predict_error, confidence_error, regularization, total_loss = ray.get(
        loss_function.remote(C_ref, P_ref, predict, X, Y, r_lambda)
    )

    predict_errors.append(predict_error)
    confidence_errors.append(confidence_error)
    regularization_list.append(regularization)
    total_losses.append(total_loss)

# Make Result Graph
from matplotlib import pyplot as plt
%matplotlib inline

plt.subplots_adjust(wspace=100.0, hspace=20.0)
fig = plt.figure()
fig.set_figheight(10)
fig.set_figwidth(10)
predict_error_line = fig.add_subplot(2, 2, 1)
confidence_error_line = fig.add_subplot(2, 2, 2)
regularization_error_line = fig.add_subplot(2, 2, 3)
total_loss_line = fig.add_subplot(2, 2, 4)

predict_error_line.set_title("Predict Error") 
predict_error_line.plot(predict_errors)

confidence_error_line.set_title("Confidence Error")
confidence_error_line.plot(confidence_errors)

regularization_error_line.set_title("Regularization")
regularization_error_line.plot(regularization_list)

total_loss_line.set_title("Total Loss")
total_loss_line.plot(total_losses)
plt.show()

2024-05-10 20:38:41,075	INFO worker.py:1553 -- Started a local Ray instance.
  0%|          | 0/50 [19:10<?, ?it/s]


KeyboardInterrupt: 