In [3]:
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import accuracy
from surprise import KNNBasic
import time
from surprise import KNNBasic
from surprise.model_selection import cross_validate

# 加载数据
data = pd.read_csv('ratings.csv')
# 通常，数据集包含用户ID、电影ID和评分

# 使用 surprise 库读取数据
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader)
# Split the data into train and test sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)



SVD算法

In [5]:
# Use the SVD algorithm
SVD_model = SVD(n_factors=10, random_state=42)

# Train the model on the training set
# 测量SVD模型的训练时间
start_time = time.time()
SVD_model.fit(trainset)
SVD_training_time = time.time() - start_time

# Make predictions on the test set
# 测量SVD模型的预测时间
start_time = time.time()
predictions = SVD_model.test(testset)
SVD_prediction_time = time.time() - start_time

# Evaluate the model using RMSE
rmse = accuracy.rmse(predictions)
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'SVD training time: {SVD_training_time}')
print(f'SVD prediction time: {SVD_prediction_time}')

RMSE: 0.8759
Root Mean Squared Error (RMSE): 0.8758614050540696
SVD training time: 0.3253140449523926
SVD prediction time: 0.1026604175567627


基于用户的协同过滤算法

In [7]:
# 使用基于用户的协同过滤算法
user_based_model = KNNBasic(sim_options={'user_based': True})
# 测量训练时间
start_time = time.time()
user_based_model.fit(trainset)
user_training_time = time.time() - start_time

# 测量预测时间
start_time = time.time()
user_based_predictions = user_based_model.test(testset)
user_prediction_time = time.time() - start_time
# 评估模型
user_based_rmse = accuracy.rmse(user_based_predictions)

print(f'基于用户的协同过滤模型的RMSE: {user_based_rmse}')
print(f'user training time: {user_training_time}')
print(f'user prediction time: {user_prediction_time}')

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9561
基于用户的协同过滤模型的RMSE: 0.9560731582415551
user training time: 0.09193634986877441
user prediction time: 0.7813875675201416


基于物品的协同过滤算法

In [8]:
# 使用基于物品的协同过滤算法
item_based_model = KNNBasic(sim_options={'user_based': False})
# 测量训练时间
start_time = time.time()
item_based_model.fit(trainset)
item_training_time = time.time() - start_time

# 测量预测时间
start_time = time.time()
item_based_predictions = item_based_model.test(testset)
item_prediction_time = time.time() - start_time

# 评估模型
item_based_rmse = accuracy.rmse(item_based_predictions)
print(f'基于物品的协同过滤模型的RMSE: {item_based_rmse}')
print(f'item training time: {item_training_time}')
print(f'item prediction time: {item_prediction_time}')

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9123
基于物品的协同过滤模型的RMSE: 0.9123443988683873
item training time: 2.4396169185638428
item prediction time: 3.9093799591064453


交叉验证

In [9]:
models = {
    'SVD': SVD_model,
    'User-Based CF': user_based_model,
    'Item-Based CF': item_based_model
}

# 对每个模型进行交叉验证
for model_name, model in models.items():
    print(f"Performing cross-validation for: {model_name}")
    cv_results = cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    
    rmse_avg = sum(cv_results['test_rmse']) / len(cv_results['test_rmse'])
    mae_avg = sum(cv_results['test_mae']) / len(cv_results['test_mae'])
    print(f"{model_name} - Average RMSE: {rmse_avg}, Average MAE: {mae_avg}\n")

Performing cross-validation for: SVD
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8770  0.8598  0.8657  0.8694  0.8702  0.8684  0.0056  
MAE (testset)     0.6704  0.6639  0.6647  0.6713  0.6674  0.6675  0.0030  
Fit time          0.36    0.30    0.36    0.33    0.36    0.34    0.03    
Test time         0.05    0.12    0.06    0.12    0.08    0.09    0.03    
SVD - Average RMSE: 0.8684023024491291, Average MAE: 0.6675477269287742

Performing cross-validation for: User-Based CF
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 s