In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

In [None]:
# Mount the user's Google Drive into the Colab filesystem (google.colab is only
# importable inside a Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')
# Folder on the mounted Drive holding the input files. The trailing slash matters:
# file names are appended to this prefix by plain string concatenation.
drive_path = '/content/drive/My Drive/DataHHTQD/MovieData/'

Mounted at /content/drive


In [None]:
# Resolve the two input files under the Drive data folder, then load them.
ratings_csv = drive_path + 'ratings.csv'
movies_csv = drive_path + 'movies.csv'

ratings = pd.read_csv(ratings_csv)  # one row per rating event
movies = pd.read_csv(movies_csv)    # movie metadata, joined later on movieId

In [None]:
# Count how many ratings each user gave and each movie received, then keep the
# ids of the 1000 most frequent of each to obtain a reasonably dense sub-matrix.
ratings_per_user = ratings['userId'].value_counts()
ratings_per_movie = ratings['movieId'].value_counts()

top_users = ratings_per_user.nlargest(1000).index
top_movies = ratings_per_movie.nlargest(1000).index

In [None]:
# Keep only the ratings whose user AND movie both belong to the selected top sets.
by_top_user = ratings['userId'].isin(top_users)
on_top_movie = ratings['movieId'].isin(top_movies)
filtered_ratings = ratings[by_top_user & on_top_movie]

In [None]:
# Attach movie metadata (title, genres) to every retained rating via movieId.
merge_df = movies.merge(filtered_ratings, on='movieId')
merge_df

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,28,4.0,961438127
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,188,4.0,1103751789
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,265,5.0,1607904458
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1048,4.0,1619382182
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1411,4.0,1322832011
...,...,...,...,...,...,...
646980,207313,Knives Out (2019),Comedy|Crime|Drama|Mystery|Thriller,199718,3.5,1620242560
646981,207313,Knives Out (2019),Comedy|Crime|Drama|Mystery|Thriller,199816,4.0,1619410044
646982,207313,Knives Out (2019),Comedy|Crime|Drama|Mystery|Thriller,199966,4.5,1603193038
646983,207313,Knives Out (2019),Comedy|Crime|Drama|Mystery|Thriller,200323,4.0,1642100833


In [None]:
# Build the user x movie rating matrix (rows: userId, columns: movie title).
# Unrated (user, movie) pairs are deliberately left as NaN instead of being
# filled with 0: the split, training and evaluation code all use np.isnan to
# tell "not rated" from "rated", so a 0 fill would turn every missing entry
# into a fake 0-star rating that the model is then trained and scored on.
# NOTE: pivot_table aggregates with the mean by default, so if two movies share
# the exact same title their ratings for a user are averaged into one column.
user_movie_matrix = merge_df.pivot_table(index='userId', columns='title', values='rating')
user_movie_matrix

title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),12 Years a Slave (2013),127 Hours (2010),"13th Warrior, The (1999)",2001: A Space Odyssey (1968),2012 (2009),...,X-Men: First Class (2011),X-Men: The Last Stand (2006),X2: X-Men United (2003),Yes Man (2008),You've Got Mail (1998),Young Frankenstein (1974),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
28,4.0,4.0,0.0,4.0,0.0,0.0,0.0,3.0,5.0,2.0,...,3.5,3.5,4.0,0.0,3.5,4.0,3.5,5.0,4.0,4.0
188,3.5,4.5,2.5,4.5,4.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,1.0,4.5,3.0,5.0,4.0,2.0,0.0
265,4.5,3.5,0.0,0.0,5.0,0.0,3.0,0.0,0.0,0.5,...,4.0,3.0,2.5,1.5,3.5,5.0,5.0,2.5,5.0,0.0
1048,3.5,0.0,1.5,0.0,4.5,4.5,3.0,2.5,5.0,3.0,...,3.5,3.5,4.0,2.0,2.5,2.5,3.0,2.0,3.5,3.5
1411,4.0,4.0,3.0,0.0,0.0,0.0,4.0,3.0,2.5,4.0,...,0.0,0.0,0.0,4.0,4.5,0.0,0.0,2.5,3.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199816,4.5,3.5,0.0,0.0,4.0,5.0,4.0,3.0,0.5,2.5,...,0.0,3.0,3.5,0.0,3.0,0.0,4.5,4.0,2.5,4.0
199966,0.0,0.0,2.0,3.0,4.0,3.5,0.0,2.0,5.0,0.0,...,0.0,0.0,2.0,0.0,0.0,4.0,0.0,0.0,3.0,3.0
200322,0.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,1.5,2.0,...,4.0,2.5,3.0,2.5,0.0,4.0,3.0,3.5,3.0,3.5
200323,2.5,3.0,1.5,0.0,4.5,3.0,3.0,2.0,1.0,2.0,...,3.5,3.0,3.0,3.0,3.0,0.0,3.0,0.0,0.0,3.0


In [None]:
# NumPy array of the rating table: row order follows user_movie_matrix.index and
# column order follows user_movie_matrix.columns, so positions map back to labels.
R = user_movie_matrix.to_numpy()

In [None]:
def train_test_split(R, test_size=0.2, random_state=None):
    """Randomly hold out a fraction of the observed ratings as a test set.

    An entry of ``R`` counts as observed when it is not NaN. The observed
    positions are shuffled; the first ``int(n_observed * test_size)`` of them
    become the test set and the remainder the training set.

    Parameters
    ----------
    R : array-like, 2-D
        Rating matrix with NaN marking missing entries. It is never modified.
        Non-floating input (e.g. integers) is converted to float so that NaN
        can be stored in the returned matrices.
    test_size : float, default 0.2
        Fraction of observed entries to hold out, between 0 and 1.
    random_state : int or None, default None
        Seed for a dedicated generator. When None, NumPy's global random state
        is used, so a prior ``np.random.seed(...)`` still controls the split.

    Returns
    -------
    R_train : ndarray
        Copy of ``R`` with the held-out entries replaced by NaN.
    R_test : ndarray of float
        All-NaN matrix of the same shape, except at the held-out entries.
    train_indices, test_indices : ndarray of shape (n, 2)
        (row, column) positions assigned to each side of the split.

    Raises
    ------
    ValueError
        If ``test_size`` is outside [0, 1].
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    R = np.asarray(R)
    if not np.issubdtype(R.dtype, np.floating):
        # Integer (or boolean) arrays cannot hold NaN; switch to float first.
        R = R.astype(float)

    # One (row, col) pair per observed rating, in row-major order before shuffling.
    indices = np.argwhere(~np.isnan(R))
    n_samples = indices.shape[0]
    if random_state is None:
        np.random.shuffle(indices)  # shuffles the pairs (rows of `indices`) in place
    else:
        np.random.default_rng(random_state).shuffle(indices)

    n_test_samples = int(n_samples * test_size)
    test_indices = indices[:n_test_samples]
    train_indices = indices[n_test_samples:]

    R_train = R.copy()
    R_test = np.full(R.shape, np.nan, dtype=float)

    # Move the held-out ratings from the training matrix to the test matrix in one shot.
    test_rows, test_cols = test_indices[:, 0], test_indices[:, 1]
    R_test[test_rows, test_cols] = R[test_rows, test_cols]
    R_train[test_rows, test_cols] = np.nan

    return R_train, R_test, train_indices, test_indices

In [None]:
def matrix_factorization(R_train, k=3, n_epochs=50, learning_rate=0.01, lambda_reg=0.1,
                         random_state=None):
    """Factorise a partially observed rating matrix as ``U @ V.T`` with SGD.

    Only entries of ``R_train`` that are not NaN contribute to the updates.
    For each observed rating the prediction error is computed and both factor
    rows take one L2-regularised gradient step.

    Parameters
    ----------
    R_train : array-like of shape (n_users, n_items)
        Training ratings, with NaN marking unobserved entries.
    k : int, default 3
        Number of latent factors.
    n_epochs : int, default 50
        Number of full passes over the observed ratings.
    learning_rate : float, default 0.01
        SGD step size.
    lambda_reg : float, default 0.1
        L2 regularisation strength applied to both factor rows.
    random_state : int or None, default None
        Seed for a dedicated generator used to initialise U and V. When None,
        NumPy's global random state is used (``np.random.rand``).

    Returns
    -------
    U : ndarray of shape (n_users, k)
    V : ndarray of shape (n_items, k)
    """
    R_train = np.asarray(R_train, dtype=float)
    n_users, n_items = R_train.shape

    # Factors start as uniform random values in [0, 1).
    if random_state is None:
        U = np.random.rand(n_users, k)
        V = np.random.rand(n_items, k)
    else:
        rng = np.random.default_rng(random_state)
        U = rng.random((n_users, k))
        V = rng.random((n_items, k))

    # Positions of the observed ratings, computed once. Row-major order matches
    # a nested "for user / for item" scan without visiting the missing cells.
    observed = np.argwhere(~np.isnan(R_train))

    # Train the model.
    for epoch in range(n_epochs):
        for i, j in observed:
            # Snapshot the user row so BOTH updates use the values the error was
            # computed with; otherwise V would be stepped with the already-updated U.
            u_prev = U[i].copy()
            # Prediction error for this rating.
            error = R_train[i, j] - np.dot(u_prev, V[j])
            # Update the latent factors.
            U[i] += learning_rate * (error * V[j] - lambda_reg * u_prev)
            V[j] += learning_rate * (error * u_prev - lambda_reg * V[j])

        # Report the squared error over observed entries every 10th epoch.
        if epoch % 10 == 0:
            loss = np.nansum((R_train - np.dot(U, V.T)) ** 2)
            print(f'Epoch {epoch}, Loss: {loss}')

    return U, V

In [None]:
def predict_rating(U, V, user_id, item_id):
    """Return the predicted rating: the dot product of one user row of U and one item row of V.

    ``user_id`` and ``item_id`` are positional indices into ``U`` and ``V``.
    """
    user_factors = U[user_id, :]
    item_factors = V[item_id, :]
    return np.dot(user_factors, item_factors.T)

In [None]:
def evaluate_model(U, V, R_test):
    """Compute the RMSE of the factor model on the held-out ratings.

    Every non-NaN entry of ``R_test`` is treated as a ground-truth rating and
    compared with ``predict_rating(U, V, row, col)`` for the same position.
    """
    # (row, col) positions of all held-out ratings.
    held_out = list(zip(*np.where(~np.isnan(R_test))))

    true_ratings = [R_test[i, j] for i, j in held_out]
    predictions = [predict_rating(U, V, i, j) for i, j in held_out]

    return np.sqrt(mean_squared_error(true_ratings, predictions))

In [None]:
# Seed NumPy's global generator so the random hold-out split (and later steps
# that draw from np.random) give the same result on every Restart & Run All.
SEED = 42
np.random.seed(SEED)

# Hold out 20% of the observed ratings for evaluation.
R_train, R_test, train_indices, test_indices = train_test_split(R, test_size=0.2)

In [None]:
# Fit the latent-factor model on the training ratings: 3 factors, 50 epochs,
# step size 0.01 and L2 regularisation 0.1. Progress is printed every 10th epoch.
U, V = matrix_factorization(R_train, k=3, n_epochs=50, learning_rate=0.01, lambda_reg=0.1)

Epoch 0, Loss: 2335498.6351467324
Epoch 10, Loss: 1790352.3427018593
Epoch 20, Loss: 1790352.4273975382
Epoch 30, Loss: 1790352.4273973336
Epoch 40, Loss: 1790352.4273973322


In [None]:
# Positional indices into the factor matrices (row of U / row of V), i.e. positions
# in user_movie_matrix.index / .columns. They are NOT the dataset's userId / movieId values.
userId = 0
movieId = 1

In [None]:
predicted_rating = predict_rating(U, V, userId, movieId)
# `userId` is a row position, not a dataset id. The real id is the pivot's row label,
# so look it up instead of assuming ids are the contiguous sequence position + 1.
actual_user_id = user_movie_matrix.index[userId]
print(f"Dự đoán rating cho User {actual_user_id} (người dùng {userId}) và Movie '{user_movie_matrix.columns[movieId]}' là: {predicted_rating:.2f}")

Dự đoán rating cho User 1 (người dùng 0) và Movie '10 Things I Hate About You (1999)' là: 3.18


In [None]:
# Root-mean-squared error of the model's predictions on the held-out ratings in R_test.
rmse = evaluate_model(U, V, R_test)
print(f'RMSE trên tập kiểm tra: {rmse}')

RMSE trên tập kiểm tra: 1.5065621123753603
