In [1]:
import surprise
import numpy as np
import pandas as pd
import matplotlib
import seaborn
import matplotlib.pyplot as plt

In [2]:
movies = pd.read_csv(r'D:\Python\DULIEU\movies.csv')
ratings = pd.read_csv(r'D:\Python\DULIEU\ratings.csv')


In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
print(len(ratings))

100836


In [6]:
user_counts = ratings['userId'].value_counts()
movie_counts = ratings['movieId'].value_counts()
ratings = ratings[
    ratings['userId'].isin(user_counts[user_counts >= 2].index) &
    ratings['movieId'].isin(movie_counts[movie_counts >= 2].index)
]
print(len(ratings))

97390


In [7]:
ratings_surprise = ratings[['userId', 'movieId', 'rating']]
print(ratings_surprise.head())
print(ratings_surprise.shape)

from surprise import Reader
reader = Reader(rating_scale=(0.5, 5.0))
print("Rating scale:", reader.rating_scale)

from surprise import Dataset
data = Dataset.load_from_df(ratings_surprise, reader)
print("Type of data:", type(data))
print("Number of ratings:", sum(len(r) for r in data.raw_ratings))

from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=0.2, random_state=3)

   userId  movieId  rating
0       1        1     4.0
1       1        3     4.0
2       1        6     4.0
3       1       47     5.0
4       1       50     5.0
(97390, 3)
Rating scale: (0.5, 5.0)
Type of data: <class 'surprise.dataset.DatasetAutoFolds'>
Number of ratings: 389560


In [8]:
print("tỷ lệ test:", len(testset) / (trainset.n_ratings + len(testset)))

tỷ lệ test: 0.2


In [23]:
from surprise import SVD
svd = SVD(n_factors=100, n_epochs=2000, lr_all=0.004, reg_all=0.005, random_state=42)
print("params:", svd.__dict__)


print("bắt đầu ")
svd.fit(trainset)
print("ok")

params: {'n_factors': 100, 'n_epochs': 2000, 'biased': True, 'init_mean': 0, 'init_std_dev': 0.1, 'lr_bu': 0.004, 'lr_bi': 0.004, 'lr_pu': 0.004, 'lr_qi': 0.004, 'reg_bu': 0.005, 'reg_bi': 0.005, 'reg_pu': 0.005, 'reg_qi': 0.005, 'random_state': 42, 'verbose': False, 'bsl_options': {}, 'sim_options': {'user_based': True}}
bắt đầu 
ok


In [24]:
predictions = svd.test(testset)
print("số lượng dự đoán:", len(predictions))

from surprise import accuracy
rmse = accuracy.rmse(predictions)
print("RMSE:", rmse)

val_true = [pred.r_ui for pred in predictions[:10]]
val_pred = [pred.est for pred in predictions[:10]]
for true, pred in zip(val_true, val_pred):
    print(f"Thực tế: {true:.3f}, Dự đoán: {pred:.3f}")
print("Phương sai thực tế:", np.var(val_true))
print("Phương sai dự đoán:", np.var(val_pred))


số lượng dự đoán: 19478
RMSE: 0.8795
RMSE: 0.8794795050878709
Thực tế: 3.500, Dự đoán: 3.406
Thực tế: 3.000, Dự đoán: 3.222
Thực tế: 4.000, Dự đoán: 3.919
Thực tế: 4.000, Dự đoán: 4.106
Thực tế: 3.000, Dự đoán: 2.763
Thực tế: 4.000, Dự đoán: 3.228
Thực tế: 1.500, Dự đoán: 3.103
Thực tế: 4.500, Dự đoán: 4.220
Thực tế: 2.500, Dự đoán: 3.654
Thực tế: 4.500, Dự đoán: 3.947
Phương sai thực tế: 0.8225
Phương sai dự đoán: 0.211355783136689


In [15]:
def recommend_movies(user_id, svd, movies_df, trainset, top_n=10):
    movie_ids = [trainset.to_raw_iid(iid) for iid in trainset.all_items()]
    predictions = [svd.predict(user_id, mid).est for mid in movie_ids]
    top_indices = np.argsort(predictions)[::-1][:top_n]
    recommendations = []
    for idx in top_indices:
        movie_id = movie_ids[idx]
        rating = predictions[idx]
        try:
            title = movies_df[movies_df['movieId'] == movie_id]['title'].iloc[0]
            recommendations.append({'Title': title, 'Predicted Rating': rating})
        except IndexError:
            print(f"lỗi: movieId {movie_id} không tìm thấy.")
    return pd.DataFrame(recommendations, index=range(1, len(recommendations) + 1))

In [16]:
recommendations = recommend_movies(1, svd, movies, trainset)
print(recommendations)

                                                Title  Predicted Rating
1                  Play Time (a.k.a. Playtime) (1967)               5.0
2                                    127 Hours (2010)               5.0
3                               Shall We Dance (1937)               5.0
4                            Captain Fantastic (2016)               5.0
5   City of Lost Children, The (Cité des enfants p...               5.0
6                                      Memento (2000)               5.0
7                                      Boyhood (2014)               5.0
8                                 Hustler, The (1961)               5.0
9                                  Hoop Dreams (1994)               5.0
10                            Harold and Maude (1971)               5.0


###  Hàm đề xuất cho người dùng mới với đanh giá mới

In [None]:
def recommend_for_new_user(new_ratings, svd, movies_df, trainset, top_n=10):
    new_user_id = max([trainset.to_raw_uid(uid) for uid in trainset.all_users()]) + 1
    new_ratings_df = pd.DataFrame(new_ratings, columns=['movieId', 'rating'])
    new_ratings_df['userId'] = new_user_id
    combined_ratings = pd.concat([ratings[['userId', 'movieId', 'rating']], new_ratings_df])
    new_data = Dataset.load_from_df(combined_ratings[['userId', 'movieId', 'rating']], reader)
    new_trainset = new_data.build_full_trainset()
    svd_new = SVD(n_factors=150, n_epochs=200, lr_all=0.005, reg_all=0.02, random_state=42)
    svd_new.fit(new_trainset)
    movie_ids = [trainset.to_raw_iid(iid) for iid in trainset.all_items()]
    predictions = [svd_new.predict(new_user_id, mid).est for mid in movie_ids]
    top_indices = np.argsort(predictions)[::-1][:top_n]
    recommendations = []
    for idx in top_indices:
        movie_id = movie_ids[idx]
        rating = predictions[idx]
        try:
            title = movies_df[movies_df['movieId'] == movie_id]['title'].iloc[0]
            recommendations.append({'Title': title, 'Predicted Rating': rating})
        except IndexError:
            print(f"Cảnh báo: movieId {movie_id} không tìm thấy.")
    return pd.DataFrame(recommendations, index=range(1, len(recommendations) + 1))

In [18]:
new_user_ratings = [(1, 5.0), (2, 1.0)]
new_recommendations = recommend_for_new_user(new_user_ratings, svd, movies, trainset)
print(new_recommendations)

                                          Title  Predicted Rating
1              Shawshank Redemption, The (1994)          4.435438
2                     Lawrence of Arabia (1962)          4.365897
3                          Touch of Evil (1958)          4.334683
4   Wallace & Gromit: The Wrong Trousers (1993)          4.333141
5                             Casablanca (1942)          4.311231
6                           Forrest Gump (1994)          4.307941
7                Godfather: Part II, The (1974)          4.288378
8                     American History X (1998)          4.284341
9                            Hoop Dreams (1994)          4.256962
10                               Amadeus (1984)          4.246918
