In [None]:
import pandas as pd
from six.moves import cPickle
import numpy as np
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error

import sys
sys.path.append('..')

from src import *

In [None]:
# Plot configuration: ggplot style, a font file for CJK labels, and two theme colours.
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
# Font loaded from a repo-relative file; passed explicitly to the labels/legends below.
PingFang = FontProperties(fname='../assets/PingFang.ttc')
plt.style.use('ggplot')
# plt_color[0] is used for validation curves, plt_color[1] for training curves and other series.
plt_color = ["#6A9A8B", "#00754B"]


In [None]:
# Repo-relative location of the Yelp review dataset.
path = '../data/Yelp_final.csv'

# Load the full dataset; later cells read its `city` and `categories` columns.
yelp = pd.read_csv(path, encoding='utf-8')

In [None]:
# Keep only reviews of Phoenix businesses whose category string matches the
# regex '.Restaurant.' (case-insensitive: "restaurant" with at least one
# character on each side).
# na=False makes rows with a missing `categories` value count as
# "not a restaurant" instead of putting NaN into the boolean mask.
is_phoenix = yelp.city == 'Phoenix'
is_restaurant = yelp.categories.str.contains(
    '.Restaurant.', case=False, na=False)
phoenix_restaurant_reviews = yelp[is_phoenix & is_restaurant]
# Report (rows, columns) of the filtered frame.
print("Phoenix餐厅评论的行列数:", phoenix_restaurant_reviews.shape)

In [None]:
# Build one sparse matrix per activity threshold: the cleaned frame is
# produced with min_user_review=20 and then with min_user_review=30, each
# restricted to the user / business / stars columns.
sparse_matrix_20_phx, sparse_matrix_30_phx = (
    get_sparse_matrix(
        get_clean_df(
            phoenix_restaurant_reviews,
            min_user_review=min_reviews,
            cols=['user_id', 'business_id', 'stars'],
        )
    )
    for min_reviews in (20, 30)
)

In [None]:
# Report sparsity and shape of the matrix built with min_user_review=20.
print("用户至少有20条评论的稀疏度:", get_sparsity(sparse_matrix_20_phx))
print("用户至少有20条评论的矩阵形状:", sparse_matrix_20_phx.shape)

print()

# Same report for the matrix built with min_user_review=30.
print("用户至少有30条评论的稀疏度:", get_sparsity(sparse_matrix_30_phx))
print("用户至少有30条评论的矩阵形状:", sparse_matrix_30_phx.shape)

In [None]:
# Cleaned Phoenix restaurant reviews, restricted to the three rating columns
# and produced with min_user_review=10.
phoenix_reviews_min_10 = get_clean_df(
    phoenix_restaurant_reviews,
    cols=['user_id', 'business_id', 'stars'],
    min_user_review=10,
)
# Number of review rows that survive the cleaning step.
print("Phoenix餐厅评论数:", len(phoenix_reviews_min_10))

In [None]:
# Count distinct restaurants and distinct users in the min-10 subset.
# dropna=False keeps a missing id counted as one value, exactly like
# len(series.unique()).
print("Phoenix唯一餐厅数 =",
      phoenix_reviews_min_10['business_id'].nunique(dropna=False))
print("Phoenix唯一用户数 =",
      phoenix_reviews_min_10['user_id'].nunique(dropna=False))

In [None]:
# Convert the min-10 Phoenix review frame into a sparse rating matrix.
phoenix_ratings_matrix = get_sparse_matrix(phoenix_reviews_min_10)
print("Phoenix餐厅评分矩阵的形状 =", phoenix_ratings_matrix.shape)

In [None]:
# Sparsity = share of matrix cells that hold no stored entry
# (1 - stored entries / total cells).
total_cells_phoenix = (phoenix_ratings_matrix.shape[0]
                       * phoenix_ratings_matrix.shape[1])
stored_fraction_phoenix = phoenix_ratings_matrix.nnz / total_cells_phoenix
sparsity_phoenix = 1 - stored_fraction_phoenix
print("Phoenix餐厅评分矩阵的稀疏度 =", sparsity_phoenix)

In [None]:
# Split the Phoenix rating matrix into train / validation / test matrices
# using the helper's default split sizes, then print each shape.
phoenix_train_matrix, phoenix_val_matrix, phoenix_test_matrix = train_val_test_split(
    phoenix_ratings_matrix)
print("Phoenix 训练集形状:", phoenix_train_matrix.shape)
print("Phoenix 验证集形状:", phoenix_val_matrix.shape)
print("Phoenix 测试集形状:", phoenix_test_matrix.shape)

In [None]:
# Split the min_user_review=20 matrix into train / validation / test parts,
# passing num_review_val=7 and num_review_test=3 to the split helper.
# NOTE(review): presumably these are per-user hold-out counts — confirm in train_val_test_split.
phoenix_users_train, phoenix_users_val, phoenix_users_test = train_val_test_split(
    sparse_matrix_20_phx, num_review_val=7, num_review_test=3
)

print("Phoenix 用户训练集形状:", phoenix_users_train.shape)
print("Phoenix 用户验证集形状:", phoenix_users_val.shape)
print("Phoenix 用户测试集形状:", phoenix_users_test.shape)

# 4. 模型构建:

## 4.1 奇异值分解SVD

In [None]:
# Singular value decomposition of the dense training matrix:
# train = U * diag(S) * Vt.
U_phoenix_res, S_phoenix_res, Vt_phoenix_res = np.linalg.svd(
    phoenix_train_matrix.todense()
)

# Report the extreme singular values.
print("最小的奇异值 =", S_phoenix_res.min())
print("最大的奇异值 =", S_phoenix_res.max())

# Square diagonal matrix of the singular values; later cells slice it as
# S[:k, :k] for rank-k reconstructions.
S_phx_res_dia = np.diag(S_phoenix_res)

In [None]:
# Plot the 20 largest singular values of the Phoenix training matrix.
plt.plot(S_phoenix_res[:20], '-', linewidth=2,
         color=plt_color[1], label='奇异值')

plt.xticks(np.arange(0, len(S_phoenix_res[:20]), 1), fontproperties=PingFang)
plt.xlabel('主成分', fontproperties=PingFang, fontsize=12)
# The y-label has to be set before savefig; set afterwards, the exported PNG
# would have no y-axis label.
plt.ylabel('奇异值', fontproperties=PingFang, fontsize=12)
plt.grid(True, linestyle='--', linewidth=0.5)
plt.ylim(60, 240)
plt.legend(prop=PingFang)
plt.tight_layout()
plt.savefig('../images/Phoenix/01_SVD奇异值分布_前20个主成分.png', dpi=600)
# The title is added after saving, so it appears inline only (the same
# convention the other figures in this notebook follow).
plt.title('Phoenix餐厅SVD奇异值分布（前20个主成分）', fontproperties=PingFang, fontsize=14)
plt.show()

In [None]:
# Candidate numbers of singular values: 20 integer steps from 2 to 40.
num_singular_values_range = np.linspace(2, 40, 20, dtype=int)
validation_errors = {}
training_errors = {}

# Densify once: both matrices are loop-invariant, so converting them inside
# the loop only repeats the same allocation on every iteration.
val_dense_svd_phx = phoenix_val_matrix.todense()
train_dense_svd_phx = phoenix_train_matrix.todense()

# Approximation error on the validation and training matrices for each k.
for num_singular_values in num_singular_values_range:
    validation_errors[num_singular_values] = compute_approximation_error(
        num_singular_values, val_dense_svd_phx, U_phoenix_res, S_phx_res_dia, Vt_phoenix_res)
    training_errors[num_singular_values] = compute_approximation_error(
        num_singular_values, train_dense_svd_phx, U_phoenix_res, S_phx_res_dia, Vt_phoenix_res)

# Plot both error curves against k.
plt.plot(list(validation_errors.keys()), list(validation_errors.values()),
         label='验证集', color=plt_color[0])
plt.plot(list(training_errors.keys()), list(training_errors.values()),
         label='训练集', color=plt_color[1])
plt.xlabel('奇异值数量 k', fontproperties=PingFang)
plt.ylabel('均方误差 MSE', fontproperties=PingFang)
plt.grid(True, linestyle='--', linewidth=0.5)
plt.legend(prop=PingFang, loc='lower left')

plt.tight_layout()
plt.savefig('../images/Phoenix/02_不同奇异值数量下的训练集和验证集误差.png', dpi=600)
# Title is added after saving, so it shows inline only.
plt.title('Phoenix不同奇异值数量下的训练和验证误差', fontproperties=PingFang, fontsize=14)
plt.show()

In [None]:
# Validation error on its own axes, plotted against the number of singular values.
plt.plot(
    list(validation_errors.keys()),
    list(validation_errors.values()),
    color=plt_color[0],
    label='验证集',
)

# Axis labels and legend use the CJK font.
plt.ylabel('均方误差 MSE', fontproperties=PingFang)
plt.xlabel('奇异值数量 k', fontproperties=PingFang)
plt.legend(prop=PingFang)

plt.tight_layout()
plt.savefig('../images/Phoenix/03_不同奇异值数量下验证集误差.png', dpi=600)
# Title is added after saving, so it shows inline only.
plt.title('验证集误差随奇异值数量变化图', fontproperties=PingFang, fontsize=14)
plt.show()

In [None]:
# Scatter the first two columns of U against each other; np.ravel flattens
# each column slice to 1-D before plotting.
first_component_phx = np.ravel(U_phoenix_res[:, 0])
second_component_phx = np.ravel(U_phoenix_res[:, 1])
plt.scatter(first_component_phx, second_component_phx,
            alpha=0.6, color=plt_color[1], label='观察点')

plt.ylabel('主成分 2', fontproperties=PingFang)
plt.xlabel('主成分 1', fontproperties=PingFang)

plt.legend(prop=PingFang)
plt.tight_layout()
plt.savefig('../images/Phoenix/04_前两个主成分散点图.png', dpi=600)

# Title is added after saving, so it shows inline only.
plt.title('Phoenix餐厅数据的前两个主成分散点图', fontproperties=PingFang, fontsize=14)
plt.show()

### 4.1.1 计算SVD的最佳验证MSE
计算在使用前10个奇异值进行近似重构时，Phoenix餐厅数据在验证集上的最佳均方误差（MSE）

In [None]:
# Validation error of the rank-10 SVD approximation (k=10 is hard-coded here
# and again in the reconstruction cell below).
best_val_mse_svd_phx = compute_approximation_error(
    10, phoenix_val_matrix.todense(), U_phoenix_res, S_phx_res_dia, Vt_phoenix_res)

print('Phoenix餐厅使用SVD的最佳验证集均方误差 (MSE)= ', best_val_mse_svd_phx)

In [None]:
# (row, column) index arrays of the stored non-zero entries in the training,
# validation and test matrices, in that order. They are used later to read
# each model's predictions at exactly those cells.
train_nonzero_indices_phx, val_nonzero_indices_phx, test_nonzero_indices_phx = (
    split_matrix.nonzero()
    for split_matrix in (
        phoenix_train_matrix,
        phoenix_val_matrix,
        phoenix_test_matrix,
    )
)

In [None]:
# Rank-10 reconstruction U[:, :10] * S[:10, :10] * Vt[:10, :].
scaled_vt_top10_phx = np.dot(S_phx_res_dia[:10, :10], Vt_phoenix_res[:10, :])
reconstructed_matrix_svd_phx = np.dot(U_phoenix_res[:, :10], scaled_vt_top10_phx)

# Read the reconstructed values at the non-zero cells of each split and
# flatten them to 1-D prediction vectors.
train_predictions_svd_phx, val_predictions_svd_phx, test_predictions_svd_phx = (
    np.ravel(reconstructed_matrix_svd_phx[cell_indices])
    for cell_indices in (
        train_nonzero_indices_phx,
        val_nonzero_indices_phx,
        test_nonzero_indices_phx,
    )
)

## SVD with bias correction

In [None]:
# Global bias = sum of the training matrix divided by its number of stored entries.
global_bias = phoenix_train_matrix.sum() / phoenix_train_matrix.nnz

# Subtract the scalar bias from the dense training matrix. The subtraction
# applies to every cell, including cells that hold no stored rating.
train_matrix_wo_bias_phx = phoenix_train_matrix.todense() - global_bias

# SVD of the shifted training matrix.
U_phx_wo_bias, S_phx_wo_bias, Vt_phx_wo_bias = np.linalg.svd(
    train_matrix_wo_bias_phx
)

# Report the extreme singular values.
print("最小的奇异值 =", S_phx_wo_bias.min())
print("最大的奇异值 =", S_phx_wo_bias.max())

# Diagonal matrix of singular values, sliced as [:k, :k] in the error sweep.
S_phx_diag_wo_bias = np.diag(S_phx_wo_bias)

In [None]:
# Elbow plot: first 1000 singular values (global bias removed) on a log y-axis.
fig = plt.figure(figsize=(8, 5))
leading_singular_values_wo_bias = S_phx_wo_bias[:1000]
plt.semilogy(leading_singular_values_wo_bias, '-',
             color=plt_color[1], linewidth=1)
plt.xlim(-20, 1020)
plt.xlabel('主成分', fontproperties=PingFang)
plt.ylabel('奇异值', fontproperties=PingFang)
plt.grid(True, linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.savefig('../images/Phoenix/05_去除偏差后的SVD_奇异值分布（前1000个）.png', dpi=600)
# Title is added after saving, so it shows inline only.
plt.title('Phoenix去除偏差后的SVD奇异值分布（前1000个）', fontproperties=PingFang)
plt.show()

In [None]:
# Validation error of the global-bias SVD for k = 1..19 components.
k_wo_bias = np.arange(1, 20)
errors_svd_wo_bias = {}

# Everything that depends only on the validation matrix is computed once,
# outside the loop: the dense matrix, the bias-shifted targets and the
# positions of the observed (> 0) validation ratings.
val_dense_wo_bias_phx = phoenix_val_matrix.todense()
sm_u_10_phx_res_val_wo_bias = val_dense_wo_bias_phx - global_bias
idx_wo_bias = np.where(val_dense_wo_bias_phx > 0)
val_targets_wo_bias = sm_u_10_phx_res_val_wo_bias[idx_wo_bias]

for i in k_wo_bias:
    # Rank-i reconstruction of the bias-shifted training matrix.
    sm_u_10_phx_res_val_wo_bias_rec = np.dot(U_phx_wo_bias[:, :i], np.dot(
        S_phx_diag_wo_bias[:i, :i], Vt_phx_wo_bias[:i, :]))
    # Residuals at the observed validation cells only.
    diff_wo_bias = val_targets_wo_bias - \
        sm_u_10_phx_res_val_wo_bias_rec[idx_wo_bias]
    # Mean squared error: squared norm divided by the number of residuals
    # (the residuals form a 1 x N row, hence shape[1]).
    errors_svd_wo_bias[i] = np.linalg.norm(
        diff_wo_bias)**2 / diff_wo_bias.shape[1]

# Plot the validation error curve.
plt.plot(list(errors_svd_wo_bias.keys()),
         list(errors_svd_wo_bias.values()), color=plt_color[0],
         label='验证集')
plt.xlabel('主成分数量', fontproperties=PingFang)
plt.ylabel('均方误差 MSE', fontproperties=PingFang)
plt.legend(prop=PingFang)
plt.tight_layout()
plt.savefig('../images/Phoenix/06_去除偏差后SVD_验证集重构误差.png', dpi=600)
# Title is added after saving, so it shows inline only.
plt.title('Phoenix去除偏差后验证集的SVD误差', fontproperties=PingFang)
plt.show()

## For Restaurants in Phoenix

In [None]:
# Compute user and item biases from the training matrix; going by the names of
# the returned values, the helper also returns the training matrix with those
# biases removed.
user_bias_phx, item_bias_phx, train_matrix_no_bias_phx = compute_global_user_item_bias(
    phoenix_train_matrix)

In [None]:
# SVD of the bias-removed Phoenix training matrix.
U_phx_no_bias, S_phx_no_bias, Vt_phx_no_bias = np.linalg.svd(
    train_matrix_no_bias_phx)
# Smallest and largest singular value.
print("最小的奇异值 =", min(S_phx_no_bias))
print("最大的奇异值 =", max(S_phx_no_bias))
# Diagonal matrix of singular values, sliced as [:k, :k] in the error sweep below.
S_phx_diag_no_bias = np.diag(S_phx_no_bias)

In [None]:
# Line plot of the 20 largest singular values after user/item bias removal.
fig = plt.figure(figsize=(8, 5))
top20_singular_values_no_bias = S_phx_no_bias[:20]
plt.plot(top20_singular_values_no_bias, '-', color=plt_color[1], linewidth=1)

plt.ylabel('奇异值', fontproperties=PingFang)
plt.xlabel('主成分', fontproperties=PingFang)

# Export at 600 dpi before the title is added, so the title shows inline only.
plt.tight_layout()
plt.savefig('../images/Phoenix/07_去除偏差后的SVD_奇异值分布.png', dpi=600)
plt.title('Phoenix餐厅去除偏差后的SVD奇异值分布（前20个）', fontproperties=PingFang)
plt.show()

In [None]:
# Validation error of the user/item-bias SVD for 20 integer values of k in 2..40.
num_components_wo_bias = np.linspace(2, 40, 20, dtype=int)
errors_svd_no_bias = {}

# Loop-invariant pieces are computed once: the dense validation matrix, the
# bias-corrected targets and the positions of observed (> 0) validation ratings.
val_dense_no_bias_phx = phoenix_val_matrix.todense()
val_matrix_no_bias_phx = val_dense_no_bias_phx - user_bias_phx - \
    item_bias_phx
non_zero_indices_no_bias = np.where(val_dense_no_bias_phx > 0)
val_targets_no_bias = val_matrix_no_bias_phx[non_zero_indices_no_bias]

for k in num_components_wo_bias:
    # Rank-k reconstruction of the bias-removed training matrix.
    reconstructed_val_matrix_no_bias = np.dot(U_phx_no_bias[:, :k], np.dot(
        S_phx_diag_no_bias[:k, :k], Vt_phx_no_bias[:k, :]))
    # Residuals at the observed validation cells only.
    diff_no_bias = val_targets_no_bias - \
        reconstructed_val_matrix_no_bias[non_zero_indices_no_bias]
    # Mean squared error: squared norm divided by the number of residuals
    # (the residuals form a 1 x N row, hence shape[1]).
    errors_svd_no_bias[k] = np.linalg.norm(
        diff_no_bias)**2 / diff_no_bias.shape[1]

# Plot the validation error curve.
plt.plot(list(errors_svd_no_bias.keys()), list(errors_svd_no_bias.values()),
         label='验证集', color=plt_color[0])
plt.xlabel('主成分数量', fontproperties=PingFang)
plt.ylabel('均方误差 MSE', fontproperties=PingFang)

plt.legend(prop=PingFang)

# Export at 600 dpi before the title is added, so the title shows inline only.
plt.tight_layout()
plt.savefig('../images/Phoenix/08_去除偏差后的SVD_验证集重构误差.png', dpi=600)
plt.title('去除偏差后的SVD误差', fontproperties=PingFang, fontsize=14)
plt.show()

## 4.2 修正偏差的余弦相似度

In [None]:
# Similarity matrix computed from the training matrix with axis='res'
# (restaurant-to-restaurant), converted to a plain ndarray.
simi_phx_res = compute_similarity_matrix(phoenix_train_matrix, axis='res')
simi_phx_res = np.array(simi_phx_res)

# Predict ratings with predict_top_k using kind='res' and k=1.
preds_cosine_phx = predict_top_k(
    np.array(phoenix_train_matrix.todense()), simi_phx_res, kind='res', k=1)

# Score those predictions against the validation ratings.
MSE_cosine_res = calculate_mse(
    preds_cosine_phx, np.array(phoenix_val_matrix.todense()))
print("餐厅间余弦相似度的MSE =", MSE_cosine_res)

In [None]:
# Similarity matrix computed from the training matrix with axis='users'
# (user-to-user), converted to a plain ndarray.
simi_phx_users = compute_similarity_matrix(phoenix_train_matrix, axis='users')
simi_phx_users = np.array(simi_phx_users)

# Predict ratings with predict_top_k using kind='user' and k=1.
preds_cosine_phx_users = predict_top_k(
    np.array(phoenix_train_matrix.todense()), simi_phx_users, kind='user', k=1)

# Score those predictions against the validation ratings.
MSE_cosine_users = calculate_mse(
    preds_cosine_phx_users, np.array(phoenix_val_matrix.todense()))
print("用户间余弦相似度的MSE =", MSE_cosine_users)

In [None]:
# Validation / training MSE of the restaurant-based cosine model per K.
errors_cs_res_phx_val = {}
errors_cs_res_phx_train = {}

# Neighbourhood sizes 1..40 inclusive. The upper bound is 41 so that K=40,
# the value used for the final cosine model, is actually part of the sweep
# (np.arange excludes its stop value).
k_cs = np.arange(1, 41)

# Densify the rating matrices once; they do not change between iterations.
train_dense_cs_phx = np.array(phoenix_train_matrix.todense())
val_dense_cs_phx = np.array(phoenix_val_matrix.todense())

for i in k_cs:
    # Predict ratings from the i most similar restaurants.
    preds_cs_res_phx = predict_top_k(
        train_dense_cs_phx, simi_phx_res, kind='res', k=i)

    # Score the same predictions against validation and training ratings.
    error_cs_val = calculate_mse(preds_cs_res_phx, val_dense_cs_phx)
    error_cs_train = calculate_mse(preds_cs_res_phx, train_dense_cs_phx)

    errors_cs_res_phx_val[i] = error_cs_val
    errors_cs_res_phx_train[i] = error_cs_train

# Plot MSE against K for both splits.
plt.plot(list(errors_cs_res_phx_val.keys()),
         list(errors_cs_res_phx_val.values()), label='验证集', color=plt_color[0])
plt.plot(list(errors_cs_res_phx_train.keys()),
         list(errors_cs_res_phx_train.values()), label='训练集', color=plt_color[1])


plt.xlabel('邻近用户或物品的数量 K', fontproperties=PingFang)
plt.ylabel('均方误差 MSE', fontproperties=PingFang)
plt.legend(prop=PingFang)
plt.tight_layout()
plt.savefig('../images/Phoenix/09_余弦相似度模型_不同K值对评分预测MSE的影响.png', dpi=600)
# Title is added after saving, so it shows inline only.
plt.title('余弦相似度下不同K值对Phoenix餐厅评分预测MSE的影响', fontproperties=PingFang)

plt.show()

In [None]:
# Final cosine model: restaurant-based prediction with K=40.
preds_cosine_phx = predict_top_k(
    np.array(phoenix_train_matrix.todense()), simi_phx_res, kind='res', k=40)

# Validation MSE of the K=40 predictions.
Best_Val_MSE_Cosine_phx = calculate_mse(
    preds_cosine_phx, np.array(phoenix_val_matrix.todense()))

# Report the validation MSE.
print('Phoenix餐厅余弦相似性模型在K=40时的最佳验证集MSE为 = ',
      Best_Val_MSE_Cosine_phx)

In [None]:
# Cosine-model predictions read at the non-zero cells of each split
# (inputs to the ensemble built later).
train_preds_cosine_phx = preds_cosine_phx[train_nonzero_indices_phx]
val_preds_cosine_phx = preds_cosine_phx[val_nonzero_indices_phx]
test_preds_cosine_phx = preds_cosine_phx[test_nonzero_indices_phx]


## 4.3 ALS:

In [None]:
# Latent-feature counts to try: 5 integer values spread over 1..20.
num_features = np.linspace(1, 20, 5, dtype=int)
# NOTE: despite its name, test_error_als collects validation-set errors.
test_error_als = []
train_error_als = []

# Densify the rating matrices once; they do not change between runs.
train_dense_als_phx = np.array(phoenix_train_matrix.todense())
val_dense_als_phx = np.array(phoenix_val_matrix.todense())

for i in num_features:
    # Run alternating least squares with i latent features.
    preds_als = als(train_dense_als_phx, num_features=i, iterations=5)

    # Score the same predictions against validation and training ratings.
    test_err = calculate_mse(preds_als, val_dense_als_phx)
    train_err = calculate_mse(preds_als, train_dense_als_phx)

    test_error_als.append(test_err)
    train_error_als.append(train_err)

# Plot MSE against the number of latent features.
fig = plt.figure(figsize=(8, 5))

plt.plot(num_features, test_error_als, label='验证集', color=plt_color[0])
plt.plot(num_features, train_error_als, label='训练集', color=plt_color[1])


plt.xlabel('特征向量中的特征数量', fontproperties=PingFang)
plt.ylabel('均方误差 MSE', fontproperties=PingFang)

plt.legend(prop=PingFang)
plt.tight_layout()
plt.savefig('../images/Phoenix/10_ALS_特征数量与均方误差的关系.png', dpi=600)
# Title is added after saving, so it shows inline only.
plt.title('Phoenix餐厅特征数量与均方误差的关系（ALS）', fontproperties=PingFang)
plt.show()

In [None]:
# Regularisation grid: 8 log-spaced values, one per decade from 1e-4 to 1e3.
# With only 7 points the grid falls between decades and never contains 1e-3,
# the value used for the final ALS model.
user_reg = np.logspace(-4, 3, 8)
# Maps each regularisation value to a one-element list holding its validation MSE.
val_error_als = {}

fig = plt.figure(figsize=(8, 5))

# Densify the rating matrices once; they do not change between runs.
train_dense_reg_phx = np.array(phoenix_train_matrix.todense())
val_dense_reg_phx = np.array(phoenix_val_matrix.todense())

for i in user_reg:
    val_error_als[i] = []

    # The same regularisation strength is applied to users and items.
    preds_als = als(train_dense_reg_phx, num_features=10,
                    iterations=5, user_regularization=i, item_regularization=i)

    val_err = calculate_mse(preds_als, val_dense_reg_phx)
    val_error_als[i].append(val_err)

# Plot validation MSE against log10 of the regularisation strength.
plt.plot(np.log10(user_reg), [
         errors[0] for errors in val_error_als.values()], label='验证集MSE', color=plt_color[0])


plt.xlabel('正则化系数（对数尺度）', fontproperties=PingFang)
plt.ylabel('均方误差 MSE', fontproperties=PingFang)

plt.legend(prop=PingFang)
plt.tight_layout()
plt.savefig('../images/Phoenix/11_ALS_正则化系数对MSE的影响.png', dpi=600)
# Title is added after saving, so it shows inline only.
plt.title('Phoenix餐厅正则化系数对MSE的影响', fontproperties=PingFang)
plt.show()

In [None]:
# Final ALS model: 10 latent features, 5 iterations, and regularisation 0.001
# for both users and items.
preds_als_phx = als(np.array(phoenix_train_matrix.todense()), num_features=10,
                    iterations=5, user_regularization=0.001, item_regularization=0.001)

# Validation MSE of the final ALS predictions.
Best_Val_MSE_als_phx = calculate_mse(
    preds_als_phx, np.array(phoenix_val_matrix.todense()))

# Report the validation MSE.
print('Phoenix餐厅ALS模型的最佳验证均方误差为：', Best_Val_MSE_als_phx)

In [None]:
# ALS predictions read at the non-zero cells of each split
# (inputs to the ensemble built later).
train_preds_als_phx = preds_als_phx[train_nonzero_indices_phx]
val_preds_als_phx = preds_als_phx[val_nonzero_indices_phx]
test_preds_als_phx = preds_als_phx[test_nonzero_indices_phx]

## 4.4 带偏差修正的随机梯度下降

In [None]:
# Iteration counts to try: 10, 20, 30, 40 (np.arange excludes the stop value 50).
iters = np.arange(10, 50, 10)
# NOTE: despite its name, err_sgd_test collects validation-set errors.
err_sgd_test = []
err_sgd_train = []

# Densify the rating matrices once; they do not change between runs.
train_dense_sgd_iter_phx = np.array(phoenix_train_matrix.todense())
val_dense_sgd_iter_phx = np.array(phoenix_val_matrix.todense())

for i in iters:
    print('当前迭代次数 = ', i)

    # Train bias-corrected SGD for i iterations; element 0 of the result is
    # the prediction matrix scored below.
    preds_sgd = sgd_with_bias_correction(
        train_dense_sgd_iter_phx, num_features=10, iterations=i)

    # Score the same predictions against validation and training ratings.
    err_test = calculate_mse(preds_sgd[0], val_dense_sgd_iter_phx)
    err_train = calculate_mse(preds_sgd[0], train_dense_sgd_iter_phx)

    err_sgd_test.append(err_test)
    err_sgd_train.append(err_train)


# Plot MSE against the number of iterations.
plt.plot(iters, err_sgd_test, label='验证集', color=plt_color[0])
plt.plot(iters, err_sgd_train, label='训练集', color=plt_color[1])

plt.xlabel('迭代次数 iteration', fontproperties=PingFang)
plt.ylabel('均方误差 MSE', fontproperties=PingFang)
plt.legend(prop=PingFang)
plt.tight_layout()
plt.savefig('../images/Phoenix/12_带偏差修正的SGD_MSE与迭代次数的关系.png', dpi=600)
# Title is added after saving, so it shows inline only.
plt.title('Phoenix餐厅中带偏差修正的SGD模型中迭代次数对MSE的影响', fontproperties=PingFang)

plt.show()

In [None]:
# Latent-feature counts to try: 10 integer values spread over 2..15.
k = np.linspace(2, 15, 10, dtype=int)
# NOTE: despite its name, err_sgd_test collects validation-set errors.
err_sgd_test = []
err_sgd_train = []

# Densify the rating matrices once; they do not change between runs.
train_dense_sgd_k_phx = np.array(phoenix_train_matrix.todense())
val_dense_sgd_k_phx = np.array(phoenix_val_matrix.todense())

for i in k:
    print('当前特征数量 k = ', i)

    # Train bias-corrected SGD with i latent features for 20 iterations;
    # element 0 of the result is the prediction matrix scored below.
    preds_sgd = sgd_with_bias_correction(
        train_dense_sgd_k_phx, num_features=i, iterations=20)

    # Score the same predictions against validation and training ratings.
    err_test = calculate_mse(preds_sgd[0], val_dense_sgd_k_phx)
    err_train = calculate_mse(preds_sgd[0], train_dense_sgd_k_phx)

    err_sgd_test.append(err_test)
    err_sgd_train.append(err_train)

# Plot MSE against the number of latent features.
fig = plt.figure(figsize=(8, 5))

plt.plot(k, err_sgd_test,  label='验证集', color=plt_color[0])
plt.plot(k, err_sgd_train, label='训练集', color=plt_color[1])

plt.xlabel('特征数量 k', fontproperties=PingFang)
plt.ylabel('均方误差 MSE', fontproperties=PingFang)

plt.legend(prop=PingFang)
plt.tight_layout()
plt.savefig('../images/Phoenix/13_带偏差修正的SGD_特征数量与MSE关系.png', dpi=600)
# Title is added after saving, so it shows inline only.
plt.title('Phoenix餐厅中SGD模型的特征数量对MSE的影响', fontproperties=PingFang)
plt.show()

In [None]:
# Final bias-corrected SGD fit: 2 latent features, 20 iterations. The four
# results are unpacked as predictions, an error array, and the user and
# restaurant vectors (the last two are used as regression features later).
predictions_sgd_phx, error_array_sgd_phx, user_vec_sgd_phx, res_vec_sgd_phx = sgd_with_bias_correction(
    np.array(phoenix_train_matrix.todense()), num_features=2, iterations=20)

In [None]:
# Reuse the model trained in the previous cell instead of running SGD a second
# time with identical arguments. A second run is redundant, and unless the
# helper is fully deterministic it yields a different fit, so the predictions
# evaluated here would not belong to the same model as user_vec_sgd_phx /
# res_vec_sgd_phx, which feed the regression features later.
# The 4-tuple mirrors the helper's return layout, so `preds_sgd_phx[0]` is
# still the prediction matrix.
preds_sgd_phx = (predictions_sgd_phx, error_array_sgd_phx,
                 user_vec_sgd_phx, res_vec_sgd_phx)

# Validation MSE of the SGD predictions.
Best_Val_MSE_sgd_phx = calculate_mse(
    preds_sgd_phx[0], np.array(phoenix_val_matrix.todense()))

# Report the validation MSE.
print('Phoenix餐厅SGD模型的最佳验证均方误差为：', Best_Val_MSE_sgd_phx)

In [None]:
# SGD predictions (element 0 of the SGD result) read at the non-zero cells of
# each split (inputs to the ensemble built later).
train_preds_sgd_phx = preds_sgd_phx[0][train_nonzero_indices_phx]
val_preds_sgd_phx = preds_sgd_phx[0][val_nonzero_indices_phx]
test_preds_sgd_phx = preds_sgd_phx[0][test_nonzero_indices_phx]

## 将矩阵补全问题转换为回归问题

In [None]:
# Build one regression table per split from the SGD user and restaurant
# vectors and that split's rating matrix.
# NOTE(review): presumably each row is [user vector, restaurant vector, rating]
# for one stored rating — confirm in concatenate_user_item_vectors.
train_phx = concatenate_user_item_vectors(
    user_vec_sgd_phx, res_vec_sgd_phx, phoenix_train_matrix)
val_phx = concatenate_user_item_vectors(
    user_vec_sgd_phx, res_vec_sgd_phx, phoenix_val_matrix)
test_phx = concatenate_user_item_vectors(
    user_vec_sgd_phx, res_vec_sgd_phx, phoenix_test_matrix)

## Random Forest Regressor

In [None]:
# Convert the regression tables to ndarrays.
train_mat_phx = np.array(train_phx)
val_mat_phx = np.array(val_phx)
test_mat_phx = np.array(test_phx)
# Split into features (all but the last column) and target (last column).
# NOTE(review): x_train / y_train / x_val / y_val are not referenced again in
# the visible cells; the model cells slice train_mat_phx / val_mat_phx directly.
x_train, y_train = train_mat_phx[:, :-1], np.ravel(train_mat_phx[:, -1])
x_val, y_val = val_mat_phx[:, :-1], np.ravel(val_mat_phx[:, -1])

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Shallow random forest on the first four columns of the regression table;
# column 4 is used as the regression target.
# random_state pins the forest's randomness so the reported MSE (and the
# ensemble inputs derived from this model) are reproducible across runs.
rf = RandomForestRegressor(max_depth=5, random_state=42)
rf.fit(train_mat_phx[:, :4], train_mat_phx[:, 4])
# Validation predictions as a column vector, scored against the validation targets.
preds_rf_phx = rf.predict(val_mat_phx[:, :4]).reshape(-1, 1)
MSE_rf_phx = calculate_mse(preds_rf_phx, val_mat_phx[:, 4])
# The model is a regressor; the message previously called it a classifier.
print('MSE for Random Forest Regressor for Restaurants in Phoenix = ', MSE_rf_phx)

In [None]:
# Random-forest predictions for every row of each split's regression table
# (inputs to the ensemble built later).
train_preds_rf_phx = rf.predict(train_mat_phx[:, :4])
val_preds_rf_phx = rf.predict(val_mat_phx[:, :4])
test_preds_rf_phx = rf.predict(test_mat_phx[:, :4])

## Ensemble of all the Predictors (SVD, Cosine Model, ALS, SGD, RF)

In [None]:
# Display the shape of the SVD training-prediction vector.
train_predictions_svd_phx.shape

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# One ensemble frame per split, built from the five models' predictions in the
# order SVD, cosine, ALS, SGD, random forest.
ensemble_training_phx_df = build_ensemble_dataframe(
    train_predictions_svd_phx, train_preds_cosine_phx, train_preds_als_phx, train_preds_sgd_phx, train_preds_rf_phx)
ensemble_validation_phx_df = build_ensemble_dataframe(
    val_predictions_svd_phx, val_preds_cosine_phx, val_preds_als_phx, val_preds_sgd_phx, val_preds_rf_phx)
ensemble_testing_phx_df = build_ensemble_dataframe(
    test_predictions_svd_phx, test_preds_cosine_phx, test_preds_als_phx, test_preds_sgd_phx, test_preds_rf_phx)

In [None]:
# True ratings of each split, read at that split's non-zero cells and
# flattened to 1-D target vectors for the ensemble regressions.
y_train_phx, y_val_phx, y_test_phx = (
    np.ravel(np.array(split_matrix[split_indices]))
    for split_matrix, split_indices in (
        (phoenix_train_matrix, train_nonzero_indices_phx),
        (phoenix_val_matrix, val_nonzero_indices_phx),
        (phoenix_test_matrix, test_nonzero_indices_phx),
    )
)

In [None]:
# Linear-regression ensemble fitted on the training frame and scored on the
# validation frame.
# NOTE(review): .iloc[:, -3:] selects only the last three columns of the
# ensemble frame, while five prediction vectors are passed to
# build_ensemble_dataframe — confirm this subset is intended.
lr_phx = LinearRegression(fit_intercept=True)
lr_phx.fit(ensemble_training_phx_df.iloc[:, -3:], y_train_phx)
ensemble_val_pred_phx = lr_phx.predict(ensemble_validation_phx_df.iloc[:, -3:])
MSE_ensemble_phx = calculate_mse(ensemble_val_pred_phx, y_val_phx)
print('MSE of the ensemble of Models for restaurants in Phoenix = ', MSE_ensemble_phx)

**Ridge Regression with high penalty**

In [None]:
from sklearn.linear_model import Ridge
# Ridge ensemble with a large penalty (alpha=10000) on the same last-three
# columns of the ensemble frame.
# NOTE: this overwrites ensemble_val_pred_phx and MSE_ensemble_phx from the
# LinearRegression cell, and the printed message is identical to that cell's.
lrr_phx = Ridge(alpha=10000)
lrr_phx.fit(ensemble_training_phx_df.iloc[:, -3:], y_train_phx)
ensemble_val_pred_phx = lrr_phx.predict(
    ensemble_validation_phx_df.iloc[:, -3:])
MSE_ensemble_phx = calculate_mse(ensemble_val_pred_phx, y_val_phx)
print('MSE of the ensemble of Models for restaurants in Phoenix = ', MSE_ensemble_phx)

# 5. Testing the best model on the test data and checking the test error (MSE)
## For the restaurants in Phoenix and Scottsdale, the best model was the ensemble model.

In [None]:
# Final evaluation: the Ridge ensemble (lrr_phx) predicts on the test frame's
# last three columns and is scored against the test targets.
ensemble_test_pred_phx = lrr_phx.predict(ensemble_testing_phx_df.iloc[:, -3:])
MSE_test_phx = calculate_mse(ensemble_test_pred_phx, y_test_phx)
print('MSE of the ensemble of Models for restaurants in Phoenix (test) = ', MSE_test_phx)