In [None]:
import pandas as pd
from six.moves import cPickle
import numpy as np
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error

In [None]:
# Plot styling: load the PingFang font passed to the text elements of the
# figures below, switch matplotlib to the ggplot style, and define the
# two-colour palette indexed as plt_color[0] / plt_color[1].
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
PingFang = FontProperties(fname='../assets/PingFang.ttc')
plt.style.use('ggplot')
plt_color = ["#ACA7CB", "#474554"]

In [None]:
# Path to the Yelp CSV, relative to this notebook.
path = '../data/Yelp_final.csv'

# Read the whole file as UTF-8 into a DataFrame.
yelp = pd.read_csv(path, encoding='utf-8')

In [None]:
# Keep only the reviews of restaurants located in Scottsdale.
# The category test matches the bare substring 'Restaurant' (case-insensitive).
# A pattern such as '.Restaurant.' would demand one extra character on each
# side and therefore miss a category string that starts with "Restaurants".
# na=False treats rows with a missing `categories` value as non-matches
# instead of leaking NaN into the boolean mask.
is_scottsdale = yelp.city == 'Scottsdale'
is_restaurant = yelp.categories.str.contains('Restaurant', case=False, na=False)
scottsdale_restaurant_reviews = yelp[is_scottsdale & is_restaurant]
# Print the (rows, columns) shape of the filtered frame.
print("Scottsdale餐厅评论的行列数:", scottsdale_restaurant_reviews.shape)

In [None]:
from src.preprocess import get_clean_df, get_sparsity, get_sparse_matrix

In [None]:
# Clean the Scottsdale restaurant reviews via get_clean_df, passing
# min_user_review=10 and the user_id / business_id / stars columns.
# NOTE(review): presumably this keeps only users with at least 10 reviews —
# confirm against src.preprocess.get_clean_df.
scottsdale_reviews_min_10 = get_clean_df(scottsdale_restaurant_reviews,
                                         min_user_review=10, cols=['user_id', 'business_id', 'stars'])
print("Scottsdale餐厅评论数:", scottsdale_reviews_min_10.shape[0])

In [None]:
# Report how many distinct restaurants and distinct users remain after cleaning.
# dropna=False makes nunique() count exactly what len(Series.unique()) counts.
print("Scottsdale唯一餐厅数 =",
      scottsdale_reviews_min_10["business_id"].nunique(dropna=False))
print("Scottsdale唯一用户数 =",
      scottsdale_reviews_min_10["user_id"].nunique(dropna=False))

In [None]:
# Build the sparse ratings matrix for the cleaned Scottsdale reviews.
# NOTE(review): which axis holds users and which holds restaurants is decided
# inside src.preprocess.get_sparse_matrix — confirm there.
scottsdale_ratings_matrix = get_sparse_matrix(scottsdale_reviews_min_10)
print("Scottsdale餐厅评分矩阵的形状 =", scottsdale_ratings_matrix.shape)

In [None]:
# Sparsity = 1 - (stored entries / total number of cells in the matrix).
n_rows_sct, n_cols_sct = scottsdale_ratings_matrix.shape
sparsity_scottsdale = 1 - scottsdale_ratings_matrix.nnz / (n_rows_sct * n_cols_sct)
print("Scottsdale餐厅评分矩阵的稀疏度 =", sparsity_scottsdale)

In [None]:
from src.feature_extraction import train_val_test_split

In [None]:
# Split the Scottsdale ratings matrix into train / validation / test matrices
# and print the shape of each split.
scottsdale_train_matrix, scottsdale_val_matrix, scottsdale_test_matrix = train_val_test_split(
    scottsdale_ratings_matrix)
print("Scottsdale 训练集形状:", scottsdale_train_matrix.shape)
print("Scottsdale 验证集形状:", scottsdale_val_matrix.shape)
print("Scottsdale 测试集形状:", scottsdale_test_matrix.shape)

# 4. 模型构建:

## 4.1 奇异值分解SVD

In [None]:
# Singular value decomposition of the dense Scottsdale training matrix.
U_sct_res, S_sct_res, Vt_sct_res = np.linalg.svd(scottsdale_train_matrix.todense())

# Report the smallest and the largest singular value.
print("最小奇异值 = ", S_sct_res.min())
print("最大的奇异值 = ", S_sct_res.max())

# Diagonal matrix carrying the singular values, used for reconstructions later on.
S_sct_res_dia = np.diag(S_sct_res)

In [None]:
# Elbow plot of the first 20 singular values.
plt.plot(S_sct_res[:20], '-', linewidth=2,
         color=plt_color[1], label='数据点')

plt.xticks(np.arange(0, len(S_sct_res[:20]), 1), fontproperties=PingFang)
plt.xlabel('主成分', fontproperties=PingFang, fontsize=12)
# The y-label must be set before tight_layout()/savefig(); otherwise the saved
# PNG is written without it and the layout does not reserve room for it.
plt.ylabel('奇异值', fontproperties=PingFang, fontsize=12)
plt.ylim(40, 180)
plt.grid(True, linestyle='--', linewidth=0.5)
plt.legend(prop=PingFang)

plt.tight_layout()
plt.savefig('../images/Scottsdale/01_SVD奇异值分布_前20个主成分.png', dpi=600)
# The title is added after saving, so it only shows in the inline output
# (same convention as the other figures in this notebook).
plt.title('Scottsdale餐厅SVD奇异值分布（前20个主成分）',
          fontproperties=PingFang, fontsize=14)
plt.show()

In [None]:
from src.model import compute_approximation_error

In [None]:
# Candidate numbers of singular values (k) to evaluate.
num_singular_values_range = np.linspace(2, 40, 20, dtype=int)
errors_svd_val_sct = {}
errors_svd_train_sct = {}

# Densify each split once: the conversion does not depend on k, so doing it
# inside the loop only repeats the same allocation 2 x 20 times.
# NOTE(review): assumes compute_approximation_error does not modify the matrix
# it is given — confirm in src.model.
val_dense_svd_sct = scottsdale_val_matrix.todense()
train_dense_svd_sct = scottsdale_train_matrix.todense()

# Training and validation error of the truncated-SVD approximation for each k.
for num_singular_values in num_singular_values_range:
    errors_svd_val_sct[num_singular_values] = compute_approximation_error(
        num_singular_values, val_dense_svd_sct, U_sct_res, S_sct_res_dia, Vt_sct_res)
    errors_svd_train_sct[num_singular_values] = compute_approximation_error(
        num_singular_values, train_dense_svd_sct, U_sct_res, S_sct_res_dia, Vt_sct_res)

# Plot both error curves against k.
plt.plot(errors_svd_val_sct.keys(), errors_svd_val_sct.values(),
         label='验证集', color=plt_color[0])
plt.plot(errors_svd_train_sct.keys(), errors_svd_train_sct.values(),
         label='训练集', color=plt_color[1])

plt.xlabel('奇异值数量 k', fontproperties=PingFang)
plt.ylabel('均方误差 MSE', fontproperties=PingFang)
plt.legend(prop=PingFang)

plt.tight_layout()
plt.savefig('../images/Scottsdale/02_不同奇异值数量下验证集误差.png', dpi=600)
# Title is added after saving, so it only appears in the inline output.
plt.title('Scottsdale餐厅数据不同奇异值数量下的训练和验证误差',
          fontproperties=PingFang, fontsize=14)

plt.show()

However, in this plot the validation error increases at every step as more singular values are added.

In [None]:
# Validation error on its own axes, against the number of singular values.
val_k_values_sct = list(errors_svd_val_sct.keys())
val_mse_values_sct = list(errors_svd_val_sct.values())
plt.plot(val_k_values_sct, val_mse_values_sct,
         color=plt_color[0], label='验证集')

plt.ylabel('均方误差 MSE', fontproperties=PingFang)
plt.xlabel('奇异值数量 k', fontproperties=PingFang)
plt.legend(prop=PingFang)

plt.tight_layout()
plt.savefig('../images/Scottsdale/03_不同奇异值数量下验证集误差.png', dpi=600)
# Title goes on after saving, so only the inline figure carries it.
plt.title('Scottsdale餐厅数据不同奇异值数量下的验证误差', fontproperties=PingFang, fontsize=14)
plt.show()

In [None]:
# Scatter the first two columns of U from the SVD against each other.
first_component_sct = np.ravel(U_sct_res[:, 0])
second_component_sct = np.ravel(U_sct_res[:, 1])
plt.scatter(first_component_sct, second_component_sct,
            alpha=0.6, color=plt_color[1], label='观察点')

plt.ylabel('主成分 2', fontproperties=PingFang)
plt.xlabel('主成分 1', fontproperties=PingFang)

plt.legend(prop=PingFang)
plt.tight_layout()
plt.savefig('../images/Scottsdale/04_前两个主成分散点图.png', dpi=600)

# Title is set after saving, then the figure is shown inline.
plt.title('Scottsdale餐厅数据的前两个主成分散点图', fontproperties=PingFang, fontsize=14)
plt.show()

In [None]:
# Validation MSE of the SVD approximation that keeps the first 16 singular
# values; printed below as the best SVD validation error for Scottsdale.
best_val_mse_svd_sct = compute_approximation_error(
    16, scottsdale_val_matrix.todense(), U_sct_res, S_sct_res_dia, Vt_sct_res)

print('Scottsdale餐厅使用SVD的最佳验证集均方误差 (MSE)= ', best_val_mse_svd_sct)

In [None]:
# (row, column) index arrays of the stored non-zero entries of the
# training, validation and test matrices, in that order.
train_nonzero_indices_sct, val_nonzero_indices_sct, test_nonzero_indices_sct = (
    split_matrix.nonzero()
    for split_matrix in (scottsdale_train_matrix,
                         scottsdale_val_matrix,
                         scottsdale_test_matrix)
)

In [None]:
# Rank used for the SVD reconstruction that feeds the ensemble.
# It is 16 to match the model whose validation MSE is reported as the best SVD
# result; 15 is not even part of the evaluated grid (2, 4, ..., 40).
best_k_svd_sct = 16

# Truncated-SVD reconstruction of the training matrix with the first
# best_k_svd_sct singular values.
reconstructed_matrix_svd_sct = np.dot(
    U_sct_res[:, :best_k_svd_sct],
    np.dot(S_sct_res_dia[:best_k_svd_sct, :best_k_svd_sct],
           Vt_sct_res[:best_k_svd_sct, :]))

# Predictions read off the reconstruction at the non-zero positions of the
# training, validation and test matrices, flattened to 1-D.
train_predictions_svd_sct = np.ravel(
    reconstructed_matrix_svd_sct[train_nonzero_indices_sct])
val_predictions_svd_sct = np.ravel(
    reconstructed_matrix_svd_sct[val_nonzero_indices_sct])
test_predictions_svd_sct = np.ravel(
    reconstructed_matrix_svd_sct[test_nonzero_indices_sct])

## SVD with bias correction

In [None]:
from src.model import compute_global_user_item_bias

In [None]:
# Compute the bias terms for the Scottsdale training matrix and unpack the three
# results: user bias, restaurant bias, and the training matrix used for the
# bias-removed SVD below.
# NOTE(review): the exact shape and meaning of each return value is defined in
# src.model.compute_global_user_item_bias — confirm there.
user_bias_ur_sct, res_bias_ur_sct, train_ur_sct = compute_global_user_item_bias(
    scottsdale_train_matrix)

In [None]:
# SVD of the bias-removed training matrix.
U_sct_res_wo_bias_ur, S_sct_res_wo_bias_ur, Vt_sct_res_wo_bias_ur = np.linalg.svd(
    train_ur_sct)
# Print the smallest and largest singular values.
print("最小的奇异值 = ", min(S_sct_res_wo_bias_ur))
print("最大的奇异值 = ", max(S_sct_res_wo_bias_ur))
# Diagonal matrix of the singular values, used for the reconstructions below.
S_sct_res_dia_wo_bias_ur = np.diag(S_sct_res_wo_bias_ur)

In [None]:
# Elbow plot of the first 1000 singular values on a logarithmic y-axis.
fig = plt.figure(figsize=(8, 5))
plt.semilogy(S_sct_res_wo_bias_ur[:1000], '-', linewidth=1, color=plt_color[1])
plt.xlim(-20, 1020)
plt.grid(True, linestyle='--', linewidth=0.5)
plt.xlabel('主成分', fontproperties=PingFang)
plt.ylabel('奇异值', fontproperties=PingFang)
plt.tight_layout()
plt.savefig('../images/Scottsdale/05_去除偏差后的SVD_奇异值分布（前1000个）.png', dpi=600)
# The title is set after savefig, so it only appears in the inline output.
plt.title('Scottsdale去除偏差后的SVD奇异值分布（前1000个）', fontproperties=PingFang)
plt.show()

In [None]:
# Candidate numbers of components to evaluate.
num_components_wo_bias = np.linspace(2, 40, 20, dtype=int)
errors_svd_wo_bias_ur_sct = {}

# Densify the validation matrix once; it is the same for every component count.
val_dense_wo_bias_sct = scottsdale_val_matrix.todense()

# Validation matrix with the user and restaurant biases subtracted.
sm_u_10_sct_res_val_wo_bias_ur = val_dense_wo_bias_sct - \
    user_bias_ur_sct - res_bias_ur_sct

# Positions of the observed (> 0) validation ratings. This is loop-invariant,
# so it is computed here instead of once per iteration.
idx_wo_bias_ur_sct = np.where(val_dense_wo_bias_sct > 0)

# Reconstruction error on the observed validation entries for each rank i.
for i in num_components_wo_bias:
    # Rank-i reconstruction from the bias-removed SVD factors.
    sm_u_10_sct_res_val_wo_bias_ur_rec = np.dot(U_sct_res_wo_bias_ur[:, :i], np.dot(
        S_sct_res_dia_wo_bias_ur[:i, :i], Vt_sct_res_wo_bias_ur[:i, :]))
    diff_wo_bias_ur_sct = sm_u_10_sct_res_val_wo_bias_ur[idx_wo_bias_ur_sct] - \
        sm_u_10_sct_res_val_wo_bias_ur_rec[idx_wo_bias_ur_sct]
    # Mean of the squared differences. `.size` is the number of compared
    # entries whether the difference is a 1 x n matrix or a flat array,
    # whereas `.shape[1]` only exists in the matrix case.
    errors_svd_wo_bias_ur_sct[i] = np.linalg.norm(
        diff_wo_bias_ur_sct) ** 2 / diff_wo_bias_ur_sct.size

# Plot the validation reconstruction error against the number of components.
plt.plot(errors_svd_wo_bias_ur_sct.keys(),
         errors_svd_wo_bias_ur_sct.values(), label='验证集',
         color=plt_color[0])
plt.xlabel('主成分数量', fontproperties=PingFang)
plt.ylabel('均方误差 MSE', fontproperties=PingFang)
plt.legend(prop=PingFang, loc='upper right')
plt.tight_layout()
plt.savefig('../images/Scottsdale/06_去除偏差后的SVD_验证集重构误差.png', dpi=600)
# Title is added after saving; tight_layout is re-run so the inline figure
# makes room for it.
plt.title('Scottsdale餐厅去除偏差后的SVD重构误差', fontproperties=PingFang)
plt.tight_layout()
plt.show()

## 4.2 修正偏差的余弦相似度

In [None]:
from src.model import compute_similarity_matrix, predict_top_k, calculate_mse

In [None]:
# MSE per neighbourhood size K, for the validation and the training split.
errors_cs_res_sct_val = {}
errors_cs_res_sct_train = {}

# Neighbourhood sizes K = 1..40. The upper bound includes 40 so that K = 40,
# the value used for the final cosine model, is actually part of the search.
k_cs = np.arange(1, 41)

# Cosine-similarity matrix computed from the training matrix (axis='res').
simi_sct_res = compute_similarity_matrix(scottsdale_train_matrix, axis='res')
simi_sct_res = np.array(simi_sct_res)

# Dense copies of the splits, built once instead of on every iteration.
# NOTE(review): assumes predict_top_k / calculate_mse do not modify their
# inputs in place — confirm in src.model.
train_dense_cs_sct = np.array(scottsdale_train_matrix.todense())
val_dense_cs_sct = np.array(scottsdale_val_matrix.todense())

# For each K: predict with the top-K model, then score both splits.
for i in k_cs:
    preds_cs_res_sct = predict_top_k(
        train_dense_cs_sct, simi_sct_res, kind='res', k=i)
    error_cs_val = calculate_mse(preds_cs_res_sct, val_dense_cs_sct)
    error_cs_train = calculate_mse(preds_cs_res_sct, train_dense_cs_sct)
    errors_cs_res_sct_val[i] = error_cs_val
    errors_cs_res_sct_train[i] = error_cs_train

# Plot MSE against K for both splits.
plt.plot(errors_cs_res_sct_val.keys(),
         errors_cs_res_sct_val.values(), label='验证集', color=plt_color[0])
plt.plot(errors_cs_res_sct_train.keys(),
         errors_cs_res_sct_train.values(), label='训练集', color=plt_color[1])

plt.xlabel('邻近用户或物品的数量 K', fontproperties=PingFang)
plt.ylabel('均方误差 MSE', fontproperties=PingFang)
plt.legend(prop=PingFang)
plt.tight_layout()
plt.savefig('../images/Scottsdale/09_余弦相似度模型_不同K值对评分预测MSE的影响.png', dpi=600)
# Title is added after saving, so it only appears in the inline output.
plt.title('余弦相似度下不同K值对Scottsdale餐厅评分预测MSE的影响', fontproperties=PingFang)

plt.show()

In [None]:
# Predict ratings with the cosine-similarity model at K=40, using the dense
# Scottsdale training matrix as input.
preds_cosine_sct = predict_top_k(
    np.array(scottsdale_train_matrix.todense()), simi_sct_res, kind='res', k=40)

# MSE of those predictions against the validation matrix.
Best_Val_MSE_Cosine_sct = calculate_mse(
    preds_cosine_sct, np.array(scottsdale_val_matrix.todense()))

# Print the validation MSE reported as the best cosine-model result.
print('Scottsdale餐厅余弦相似性模型在K=40时的最佳验证集MSE为 = ', Best_Val_MSE_Cosine_sct)

In [None]:
# Cosine-model predictions picked out at the non-zero positions of the
# training, validation and test matrices.
train_preds_cosine_sct = preds_cosine_sct[train_nonzero_indices_sct]
val_preds_cosine_sct = preds_cosine_sct[val_nonzero_indices_sct]
test_preds_cosine_sct = preds_cosine_sct[test_nonzero_indices_sct]


## 4.3 ALS:

In [None]:
from src.model import als

In [None]:
# Candidate numbers of latent features.
num_features = np.linspace(1, 20, 5, dtype=int)
test_error_als_sct = []  # validation-set MSE per feature count
train_error_als_sct = []  # training-set MSE per feature count

# Dense copies of the splits, built once instead of on every iteration.
# NOTE(review): assumes als / calculate_mse do not modify their inputs in
# place — confirm in src.model.
train_dense_als_sct = np.array(scottsdale_train_matrix.todense())
val_dense_als_sct = np.array(scottsdale_val_matrix.todense())

for i in num_features:
    # Fit ALS on the training matrix with i features and 5 iterations.
    preds_als = als(train_dense_als_sct, num_features=i, iterations=5)

    # Score the predictions against the validation and the training split.
    test_err = calculate_mse(preds_als, val_dense_als_sct)
    train_err = calculate_mse(preds_als, train_dense_als_sct)

    test_error_als_sct.append(test_err)
    train_error_als_sct.append(train_err)

# Plot MSE against the number of features for both splits.
fig = plt.figure(figsize=(8, 5))

plt.plot(num_features, test_error_als_sct,
         label='验证集', color=plt_color[0])
plt.plot(num_features, train_error_als_sct,
         label='训练集', color=plt_color[1])

plt.xlabel('特征向量中的特征数量', fontproperties=PingFang)
plt.ylabel('均方误差 MSE', fontproperties=PingFang)

plt.legend(prop=PingFang)
plt.tight_layout()
plt.savefig('../images/Scottsdale/10_ALS_特征数量与均方误差的关系.png', dpi=600)
# Title is added after saving, so it only appears in the inline output.
plt.title('Scottsdale餐厅ALS特征数量与均方误差的关系', fontproperties=PingFang)
plt.show()

In [None]:
# Seven log-spaced regularisation strengths between 1e-4 and 1e3; the same
# value is used for the user and the item penalty.
user_reg = np.logspace(-4, 3, 7)
val_error_als = {}  # regularisation value -> [validation MSE]

fig = plt.figure(figsize=(8, 5))

# Dense copies of the splits, built once instead of on every iteration.
# NOTE(review): assumes als / calculate_mse do not modify their inputs in
# place — confirm in src.model.
train_dense_als_reg_sct = np.array(scottsdale_train_matrix.todense())
val_dense_als_reg_sct = np.array(scottsdale_val_matrix.todense())

for i in user_reg:
    # Fit ALS (10 features, 5 iterations) with this penalty and score it on
    # the validation split.
    preds_als = als(train_dense_als_reg_sct, num_features=10, iterations=5,
                    user_regularization=i, item_regularization=i)
    val_err = calculate_mse(preds_als, val_dense_als_reg_sct)
    val_error_als[i] = [val_err]  # stored as a one-element list

# Validation MSE against log10 of the regularisation strength.
plt.plot(np.log10(user_reg), [
         errors[0] for errors in val_error_als.values()], linestyle='-',
         label='验证集MSE', color=plt_color[0])


plt.xlabel('正则化参数（对数尺度）', fontproperties=PingFang)
plt.ylabel('均方误差 MSE', fontproperties=PingFang)
plt.legend(prop=PingFang, loc="upper right")
plt.tight_layout()
plt.savefig('../images/Scottsdale/11_ALS_正则化参数对MSE的影响.png', dpi=600)
# Title is added after saving, so it only appears in the inline output.
plt.title('Scottsdale餐厅正则化参数对MSE的影响', fontproperties=PingFang)
plt.show()

In [None]:
# Fit ALS on the dense Scottsdale training matrix with 10 features and
# 5 iterations, using 0.01 for both the user and the item regularisation.
preds_als_sct = als(np.array(scottsdale_train_matrix.todense()), num_features=10,
                    iterations=5, user_regularization=0.01, item_regularization=0.01)

# MSE of the ALS predictions against the validation matrix.
Best_Val_MSE_als_sct = calculate_mse(
    preds_als_sct, np.array(scottsdale_val_matrix.todense()))

# Print the validation MSE reported as the best ALS result.
print('Scottsdale餐厅ALS模型的最佳验证均方误差为：', Best_Val_MSE_als_sct)

In [None]:
# ALS predictions picked out at the non-zero positions of the training,
# validation and test matrices.
train_preds_als_sct = preds_als_sct[train_nonzero_indices_sct]
val_preds_als_sct = preds_als_sct[val_nonzero_indices_sct]
test_preds_als_sct = preds_als_sct[test_nonzero_indices_sct]

## 4.4 带偏差修正的随机梯度下降

In [None]:
from src.model import sgd_with_bias_correction

In [None]:
# Iteration counts to try: 10, 20, 30, 40.
iters = np.arange(10, 50, 10)
err_sgd_test_sct = []  # validation-set MSE per iteration count
err_sgd_train_sct = []  # training-set MSE per iteration count

# Dense copies of the splits, built once instead of on every iteration.
# NOTE(review): assumes sgd_with_bias_correction / calculate_mse do not modify
# their inputs in place — confirm in src.model.
train_dense_sgd_iter_sct = np.array(scottsdale_train_matrix.todense())
val_dense_sgd_iter_sct = np.array(scottsdale_val_matrix.todense())

for i in iters:
    print('当前迭代次数 = ', i)

    # Train bias-corrected SGD with 10 features for i iterations.
    preds_sgd = sgd_with_bias_correction(
        train_dense_sgd_iter_sct, num_features=10, iterations=i)

    # Element 0 of the result holds the predictions; score it on both splits.
    err_test = calculate_mse(preds_sgd[0], val_dense_sgd_iter_sct)
    err_train = calculate_mse(preds_sgd[0], train_dense_sgd_iter_sct)

    err_sgd_test_sct.append(err_test)
    err_sgd_train_sct.append(err_train)

# Plot MSE against the number of iterations for both splits.
plt.plot(iters, err_sgd_test_sct, label='验证集', color=plt_color[0])
plt.plot(iters, err_sgd_train_sct, label='训练集', color=plt_color[1])


plt.xlabel('迭代次数 iteration', fontproperties=PingFang)
plt.ylabel('均方误差 MSE', fontproperties=PingFang)

plt.legend(prop=PingFang)
plt.tight_layout()
plt.savefig('../images/Scottsdale/12_带偏差修正的SGD_MSE与迭代次数的关系.png', dpi=600)
# Title is added after saving, so it only appears in the inline output.
plt.title('Scottsdale餐厅中带偏差修正的SGD模型中迭代次数对MSE的影响', fontproperties=PingFang)
plt.show()

**Optimal Number of Iterations = 30**

In [None]:
# Candidate numbers of latent features.
k = np.linspace(2, 15, 10, dtype=int)
err_sgd_val_sct = []  # validation-set MSE per feature count
err_sgd_train_sct = []  # training-set MSE per feature count

# Dense copies of the splits, built once instead of on every iteration.
# NOTE(review): assumes sgd_with_bias_correction / calculate_mse do not modify
# their inputs in place — confirm in src.model.
train_dense_sgd_k_sct = np.array(scottsdale_train_matrix.todense())
val_dense_sgd_k_sct = np.array(scottsdale_val_matrix.todense())

for i in k:
    print('当前特征数量 k=', i)

    # Train bias-corrected SGD with i features for 30 iterations.
    preds_sgd = sgd_with_bias_correction(
        train_dense_sgd_k_sct, num_features=i, iterations=30)

    # Element 0 of the result holds the predictions; score it on both splits.
    err_val = calculate_mse(preds_sgd[0], val_dense_sgd_k_sct)
    err_train = calculate_mse(preds_sgd[0], train_dense_sgd_k_sct)

    err_sgd_val_sct.append(err_val)
    err_sgd_train_sct.append(err_train)

# Plot MSE against the number of features for both splits.
plt.plot(k, err_sgd_val_sct, label='验证集', color=plt_color[0])
plt.plot(k, err_sgd_train_sct, label='训练集', color=plt_color[1])


plt.xlabel('特征数量 k', fontproperties=PingFang)
plt.ylabel('均方误差 MSE', fontproperties=PingFang)

plt.legend(prop=PingFang)
# tight_layout() before savefig(), like every other figure in this notebook,
# so the saved PNG does not clip the axis labels.
plt.tight_layout()
plt.savefig('../images/Scottsdale/14_带偏差修正的SGD_特征数量与MSE关系.png', dpi=600)
# Title is added after saving, so it only appears in the inline output.
plt.title('Scottsdale餐厅中SGD模型的特征数量对MSE的影响', fontproperties=PingFang)
plt.show()

**Optimal k value = 2**

In [None]:
# Train the bias-corrected SGD model with 2 features for 30 iterations and
# unpack its four return values: predictions, error array, user vectors and
# restaurant vectors.
predictions_sgd_sct, error_array_sgd_sct, user_vec_sgd_sct, res_vec_sgd_sct = sgd_with_bias_correction(
    np.array(scottsdale_train_matrix.todense()), num_features=2, iterations=30)

In [None]:
# Reuse the model fitted in the previous cell instead of training SGD a second
# time with identical settings. This saves a full training run and guarantees
# that the predictions scored here come from the same fit as
# user_vec_sgd_sct / res_vec_sgd_sct.
# The tuple mirrors the four-value return layout of sgd_with_bias_correction,
# so preds_sgd_sct[0] is still the prediction matrix.
preds_sgd_sct = (predictions_sgd_sct, error_array_sgd_sct,
                 user_vec_sgd_sct, res_vec_sgd_sct)

# Validation MSE of the SGD predictions.
Best_Val_MSE_sgd_sct = calculate_mse(
    preds_sgd_sct[0], np.array(scottsdale_val_matrix.todense()))
print('Scottsdale餐厅SGD模型的最佳验证均方误差为：', Best_Val_MSE_sgd_sct)

In [None]:
# SGD predictions (element 0 of preds_sgd_sct) picked out at the non-zero
# positions of the training, validation and test matrices.
train_preds_sgd_sct = preds_sgd_sct[0][train_nonzero_indices_sct]
val_preds_sgd_sct = preds_sgd_sct[0][val_nonzero_indices_sct]
test_preds_sgd_sct = preds_sgd_sct[0][test_nonzero_indices_sct]

## 将矩阵补全问题转换为回归问题

In [None]:
import pickle
from src.model import concatenate_user_item_vectors

In [None]:
# Build one regression table per split from the SGD user and restaurant vectors
# and the corresponding ratings matrix.
# NOTE(review): the column layout is defined in
# src.model.concatenate_user_item_vectors — the cells below treat the last
# column as the target; confirm there.
train_sct = concatenate_user_item_vectors(
    user_vec_sgd_sct, res_vec_sgd_sct, scottsdale_train_matrix)
val_sct = concatenate_user_item_vectors(
    user_vec_sgd_sct, res_vec_sgd_sct, scottsdale_val_matrix)
test_sct = concatenate_user_item_vectors(
    user_vec_sgd_sct, res_vec_sgd_sct, scottsdale_test_matrix)
# Convert each table to a NumPy array.
train_mat_sct = np.array(train_sct)
val_mat_sct = np.array(val_sct)
test_mat_sct = np.array(test_sct)

In [None]:
# Split each table into features (all columns but the last) and a flattened
# target (the last column).
# NOTE(review): these four names are not referenced by the visible cells that
# follow, which slice train_mat_sct / val_mat_sct directly — confirm whether
# they are still needed.
x_train, y_train = train_mat_sct[:, :-1], np.ravel(train_mat_sct[:, -1])
x_val, y_val = val_mat_sct[:, :-1], np.ravel(val_mat_sct[:, -1])

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Depth-limited random forest. random_state is fixed so that the fitted model,
# and every metric derived from it, is reproducible across kernel restarts.
rf = RandomForestRegressor(max_depth=5, random_state=42)

In [None]:
# Fit the random forest on the training table: columns 0-3 are used as
# features and column 4 as the target.
rf.fit(train_mat_sct[:, :4], train_mat_sct[:, 4])
# Keep the `rf_sct` name available, but bind it to the fitted model instead of
# to a second estimator that is never trained or used.
rf_sct = rf
# Predict the validation rows (as a column vector) and score them.
preds_rf_sct = rf.predict(val_mat_sct[:, :4]).reshape(-1, 1)
MSE_rf_sct = calculate_mse(preds_rf_sct, val_mat_sct[:, 4])
# The model is a regressor, so the message says so.
print('MSE for Random Forest Regressor for Restaurants in Scottsdale = ', MSE_rf_sct)

In [None]:
# Random-forest predictions for the training, validation and test tables,
# using the first four columns of each as features.
train_preds_rf_sct = rf.predict(train_mat_sct[:, :4])
val_preds_rf_sct = rf.predict(val_mat_sct[:, :4])
test_preds_rf_sct = rf.predict(test_mat_sct[:, :4])

## Ensemble of all the Predictors (SVD, Cosine Model, ALS, SGD, RF)

In [None]:
from src.evaluation import build_ensemble_dataframe

In [None]:
# Assemble one ensemble frame per split from the five base models' predictions,
# passed in the order SVD, cosine, ALS, SGD, random forest.
# NOTE(review): the resulting column order is defined in
# src.evaluation.build_ensemble_dataframe — confirm there.
ensemble_training_sct_df = build_ensemble_dataframe(
    train_predictions_svd_sct, train_preds_cosine_sct, train_preds_als_sct, train_preds_sgd_sct, train_preds_rf_sct)
ensemble_validation_sct_df = build_ensemble_dataframe(
    val_predictions_svd_sct, val_preds_cosine_sct, val_preds_als_sct, val_preds_sgd_sct, val_preds_rf_sct)
ensemble_testing_sct_df = build_ensemble_dataframe(
    test_predictions_svd_sct, test_preds_cosine_sct, test_preds_als_sct, test_preds_sgd_sct, test_preds_rf_sct)

In [None]:
# Ratings stored at the non-zero positions of each split, flattened to 1-D;
# these are the regression targets for the ensemble.
y_train_sct = np.array(
    scottsdale_train_matrix[train_nonzero_indices_sct]).ravel()
y_val_sct = np.array(
    scottsdale_val_matrix[val_nonzero_indices_sct]).ravel()
y_test_sct = np.array(
    scottsdale_test_matrix[test_nonzero_indices_sct]).ravel()

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Combine the base models with an ordinary least-squares regression (with
# intercept) and score the blend on the validation targets.
# NOTE(review): iloc[:, -3:] feeds only the last three columns of the ensemble
# frame to the regressor — confirm that this subset of predictors is intended.
lr_sct = LinearRegression(fit_intercept=True)
lr_sct.fit(ensemble_training_sct_df.iloc[:, -3:], y_train_sct)
ensemble_val_pred_sct = lr_sct.predict(ensemble_validation_sct_df.iloc[:, -3:])
MSE_ensemble_sct = calculate_mse(ensemble_val_pred_sct, y_val_sct)
print('MSE of the ensemble of Models for restaurants in Scottsdalle = ', MSE_ensemble_sct)

**Ridge Regression with high penalty**

In [None]:
from sklearn.linear_model import Ridge
# NOTE(review): `lrr_phx` is not referenced by any visible cell (the Ridge model
# fitted below is `lrr_sct`) — confirm whether it is still needed.
lrr_phx = Ridge(alpha=10000)

In [None]:
# Same blend as the linear ensemble, but with a Ridge penalty of alpha=10000.
# This reassigns ensemble_val_pred_sct and MSE_ensemble_sct.
# NOTE(review): as above, only the last three columns of the ensemble frame are
# used — confirm that this is intended.
lrr_sct = Ridge(alpha=10000)
lrr_sct.fit(ensemble_training_sct_df.iloc[:, -3:], y_train_sct)
ensemble_val_pred_sct = lrr_sct.predict(
    ensemble_validation_sct_df.iloc[:, -3:])
MSE_ensemble_sct = calculate_mse(ensemble_val_pred_sct, y_val_sct)
print('MSE of the ensemble of Models for restaurants in Scottsdalle = ', MSE_ensemble_sct)

# 5. Testing the best model on the test data and checking the test accuracy
## For the restaurants in Phoenix and Scottsdale, the best model was the ensemble model.

In [None]:
# Score the Ridge ensemble on the held-out test split, using the same last
# three columns of the ensemble frame as during fitting.
ensemble_test_pred_sct = lrr_sct.predict(ensemble_testing_sct_df.iloc[:, -3:])
MSE_test_sct = calculate_mse(ensemble_test_pred_sct, y_test_sct)
print('MSE of the ensemble of Models for restaurants in Scottsdalle (test) = ', MSE_test_sct)