### 数据预处理

In [None]:
import pandas as pd

# 读取训练集和测试集的 CSV 文件
ratings_train = pd.read_csv("ratings_train.csv")
ratings_test = pd.read_csv("ratings_test.csv")
movies = pd.read_csv("movies.csv")

# 查看数据结构
print("ratings_train 数据结构:")
print(ratings_train.info())  # 展示数据的总体结构


print("\nratings_test 数据结构:")
print(ratings_test.info())


print("\nmovies 数据结构:")
print(movies.info())


In [None]:
# 简单查看数据
print(movies.head())
print(ratings_train.head())
print(ratings_test.head())

In [None]:
# 将 timestamp 列转换为时间戳（如果它是日期格式）
ratings_train['timestamp'] = pd.to_datetime(ratings_train['timestamp'])
ratings_train['timestamp'] = ratings_train['timestamp'].view('int64') / 10**9  # 转换为秒级时间戳

# 计算皮尔逊相关系数
correlation = ratings_train['timestamp'].corr(ratings_train['rating'])

print("Timestamp 和 Rating 之间的相关度:", correlation)


In [None]:
merged_1 = pd.merge(ratings_train, movies, on='movieId', how='outer')
merged_1.to_csv('merged_1.csv', index=False, encoding='utf-8')

In [None]:
merged_2 = pd.merge(ratings_train, movies, on='movieId', how='inner')
merged_2.to_csv('merged_2.csv', index=False, encoding='utf-8')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# 读取 ratings_train 和 movies 数据
ratings_train = pd.read_csv('ratings_train.csv', encoding='utf-8')
movies = pd.read_csv('movies.csv', encoding='utf-8')

# 将 ratings_train 中的 timestamp（float64，即 Unix 时间戳）转换为 datetime 格式
ratings_train['timestamp'] = pd.to_datetime(ratings_train['timestamp'], unit='s')

# 使用已经转换好的 timestamp 生成日期，并提取年月信息
ratings_train['year_month'] = ratings_train['timestamp'].dt.to_period('M')

# 检查 year_month 列
print("Year-Month column:")
print(ratings_train[['timestamp', 'year_month']].head())

# 拆分 genres 列，这里以字母 'I' 作为分隔符
movies['genre_list'] = movies['genres'].str.split('|')

# 展开 genres 列并清理数据
movies_exploded = movies.explode('genre_list').rename(columns={'genre_list': 'genre'})

# 检查拆分结果
print("Exploded Movies DataFrame:")
print(movies_exploded[['movieId', 'genres', 'genre']].head())

# 在 ratings_train 和 movies_exploded 中分别提取 movieId 对应的数据
# 将 ratings_train 的每个 movieId 对应的 genre 从 movies_exploded 中获取
df = pd.merge(ratings_train, movies_exploded[['movieId', 'genre']], on='movieId', how='inner')

# 按每个月和每个 genre 分组，计算平均 rating
monthly_avg = df.groupby(['year_month', 'genre'])['rating'].mean().reset_index()

# 打印计算结果
print("Monthly Average:")
print(monthly_avg.head())

# 绘制每个 genre 的评分变化图
if not monthly_avg.empty:
    plt.figure(figsize=(12, 6))
    for genre in monthly_avg['genre'].unique():
        genre_data = monthly_avg[monthly_avg['genre'] == genre]
        plt.plot(genre_data['year_month'].astype(str), genre_data['rating'], label=genre)

    plt.xlabel('Month')
    plt.ylabel('Average Rating')
    plt.title('Average Rating per Genre over Time')
    plt.xticks(rotation=45)
    plt.legend(title='Genre')
    plt.tight_layout()
    plt.show()
else:
    print("No data to plot.")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# 读取 ratings_train 和 movies 数据
ratings_train = pd.read_csv('ratings_train.csv', encoding='utf-8')
movies = pd.read_csv('movies.csv', encoding='utf-8')

# 将 ratings_train 中的 timestamp 转换为 datetime 格式，并提取年份
ratings_train['timestamp'] = pd.to_datetime(ratings_train['timestamp'], unit='s')
ratings_train['year'] = ratings_train['timestamp'].dt.year

# 拆分 movies 中的 genres 列，使用字母 'I' 作为分隔符
movies['genre_list'] = movies['genres'].str.split('|')
movies_exploded = movies.explode('genre_list').rename(columns={'genre_list': 'genre'})

# 通过 movieId 将 ratings_train 和 movies_exploded 数据关联起来
df = pd.merge(ratings_train, movies_exploded[['movieId', 'genre']], on='movieId', how='inner')

# 按每年和每个 genre 分组，计算平均 rating
yearly_avg = df.groupby(['year', 'genre'])['rating'].mean().reset_index()

# 指定保存图片的路径
save_path = r"C:\Users\C\Desktop\SML\Project 2\TS"
if not os.path.exists(save_path):
    os.makedirs(save_path)

# 针对每个 genre 单独生成图表
for genre in yearly_avg['genre'].unique():
    # 筛选当前 genre 的数据，并按年份排序
    genre_data = yearly_avg[yearly_avg['genre'] == genre].sort_values('year')
    
    # 准备 x 轴：将年份转换为字符串
    x_labels = genre_data['year'].astype(str)
    x_positions = range(len(x_labels))
    y_values = genre_data['rating']
    
    plt.figure(figsize=(10, 6))
    # 绘制评分曲线
    plt.plot(x_positions, y_values, marker='o', linestyle='-', label=genre)
    
    # 计算该 genre 的总平均 rating，并绘制水平基准线
    overall_avg = df[df['genre'] == genre]['rating'].mean()
    plt.axhline(y=overall_avg, color='red', linestyle='--', label=f'Overall Avg: {overall_avg:.2f}')
    
    plt.xlabel('Year')
    plt.ylabel('Average Rating')
    plt.title(f'Average Rating Trend for {genre}')
    plt.xticks(x_positions, x_labels, rotation=45)
    plt.legend(title='Legend')
    plt.tight_layout()
    
    # 保存图像到指定目录，文件名为 "{genre}_rating_trend.png"
    plt.savefig(os.path.join(save_path, f"{genre}_rating_trend.png"))
    plt.show()


In [None]:
# 查找特定记录
user_records = merged_1[merged_1['movieId'] == 147426]
print(user_records)

# 查找符合多个条件的记录
filtered_records = ratings_train[(ratings_train['userId'] == 5) & (ratings_train['movieId'] == 32)]
print(filtered_records)


In [None]:
# Assuming ratings_train and ratings_test are pandas DataFrames
known_users = ratings_train['userId'].unique()
known_movies = ratings_train['movieId'].unique()

test_users = ratings_test['userId'].unique()
test_movies = ratings_test['movieId'].unique()

cold_start_users = sum(~pd.Series(test_users).isin(known_users))
cold_start_movies = sum(~pd.Series(test_movies).isin(known_movies))

print(cold_start_users)  # Cold start users count
print(cold_start_movies)  # Cold start movies count


# Week 1

## 训练

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

# 数据加载函数
def load_data():
    # 加载单个文件数据
    data = pd.read_csv('merged_1.csv')  # 确保文件名称与实际一致
    return data


# 构建用户-电影评分矩阵
def create_user_movie_matrix(data):
    """
    从完整的数据构建用户-电影的评分矩阵
    """
    user_movie_matrix = data.pivot(index='userId', columns='movieId', values='rating')
    user_movie_matrix.fillna(0, inplace=True)  # 将缺失值（未评分）填充为 0
    return user_movie_matrix


# 矩阵分解 (梯度下降法) 带早停机制
def matrix_factorization_with_early_stopping(R, K, steps, alpha=0.002, lambda_reg=0.1, patience=10, tolerance=0.001):
    """
    矩阵分解 (随机梯度下降) 带早停机制
    参数:
        R: 用户-电影评分矩阵
        K: 潜在因子维度
        steps: 最大迭代次数
        alpha: 学习率
        lambda_reg: 正则化参数
        patience: 早停容忍次数
        tolerance: 损失改善的最低阈值（提升小于该值时认为没有改善）
    返回:
        P: 用户特征矩阵
        Q: 电影特征矩阵
        best_loss: 最佳损失值
    """
    num_users, num_movies = R.shape
    P = np.random.rand(num_users, K)  # 初始化用户特征矩阵
    Q = np.random.rand(num_movies, K)  # 初始化电影特征矩阵
    
    best_loss = float('inf')  # 最佳损失
    patience_counter = 0  # 早停计数器
    
    for step in range(steps):
        # 随机梯度更新
        for i in range(num_users):
            for j in range(num_movies):
                if R[i, j] > 0:  # 仅更新有评分的数据点
                    eij = R[i, j] - np.dot(P[i, :], Q[j, :].T)
                    for k in range(K):
                        P[i, k] += alpha * (2 * eij * Q[j, k] - lambda_reg * P[i, k])
                        Q[j, k] += alpha * (2 * eij * P[i, k] - lambda_reg * Q[j, k])
        
        # 每步计算当前的总误差
        loss = 0
        for i in range(num_users):
            for j in range(num_movies):
                if R[i, j] > 0:
                    loss += (R[i, j] - np.dot(P[i, :], Q[j, :].T)) ** 2
        
        # 打印每隔多步的损失
        if step % 10 == 0:
            print(f"Iteration {step}/{steps} => Loss: {loss:.4f}")
        
        # 检查是否早停
        if loss < best_loss - tolerance:  # 损失有显著改善
            best_loss = loss
            patience_counter = 0  # 重置早停计数器
        else:  # 损失没有改善
            patience_counter += 1
            print(f"Patience {patience_counter}/{patience}: No significant improvement in loss.")

        if patience_counter >= patience:  # 提前停止
            print(f"Early stopping triggered. Best loss: {best_loss:.4f}")
            break
    
    return P, Q, best_loss


# 获取预测评分
def get_predictions(data, P, Q, user_id_mapping, movie_id_mapping):
    """
    为每个用户预测评分，并将其调整为 0 到 5 且以 0.5 为间隔
    """
    data['predicted_rating'] = data.apply(
        lambda row: round_rating(
            np.dot(P[user_id_mapping[row['userId']], :], Q[movie_id_mapping[row['movieId']], :].T)
        ) if row['userId'] in user_id_mapping and row['movieId'] in movie_id_mapping else np.nan,
        axis=1
    )
    return data

# 新增函数： 四舍五入到最近 0.5
def round_rating(rating):
    """
    将预测评分四舍五入到最近的 0.5 并裁剪到 [0, 5]
    参数:
        rating: 预测评分
    返回:
        调整后的评分
    """
    rating = round(rating * 2) / 2  # 四舍五入到最近的 0.5
    return np.clip(rating, 0, 5)  # 限制在 0 到 5 范围内

# 替换主程序调用矩阵分解的部分
def main1():
    print("Loading data...")
    data = load_data()  # 加载新数据集

    print("\nCreating user-movie matrix...")
    user_movie_matrix = create_user_movie_matrix(data)

    # 获取用户和电影映射字典
    user_id_mapping = {id: idx for idx, id in enumerate(user_movie_matrix.index)}
    movie_id_mapping = {id: idx for idx, id in enumerate(user_movie_matrix.columns)}

    print("Training Matrix Factorization Model...")
    latent_factors = 10  # 潜在因子数量
    patience = 20  # 早停容忍次数
    tolerance = 0.0001  # 损失改善的最低阈值
    
    P, Q, best_loss = matrix_factorization_with_early_stopping(
        user_movie_matrix.to_numpy(), 
        K=latent_factors, 
        steps=1000, 
        alpha=0.002, 
        lambda_reg=0.1, 
        patience=patience, 
        tolerance=tolerance
    )
    print(f"\nMatrix Factorization Training Completed! Best Loss: {best_loss:.4f}")

    # 保存结果
    print("Saving trained matrices and mappings for testing...")
    np.save('P_matrix.npy', P)
    np.save('Q_matrix.npy', Q)
    np.save('user_id_mapping.npy', user_id_mapping)
    np.save('movie_id_mapping.npy', movie_id_mapping)

    # 计算全局平均评分并保存
    global_avg_rating = np.mean(data['rating'])
    np.save('global_avg_rating.npy', global_avg_rating)

    print("Training data has been saved successfully.")
    
    # print("Predicting User Ratings...")
    # # 新列包含预测评分
    # data_with_predictions = get_predictions(data, P, Q, user_id_mapping, movie_id_mapping)

    # # 保存结果到文件
    # output_file = 'film_rating_predictions_group_E_week_Y.csv'
    # data_with_predictions.to_csv(output_file, index=False)
    # print(f"Predictions saved to {output_file}")

    # # 计算均方误差（可选）
    # mask = ~data_with_predictions['predicted_rating'].isna()  # 筛选出有效数据
    # mse = mean_squared_error(data_with_predictions[mask]['rating'], data_with_predictions[mask]['predicted_rating'])
    # print(f"Mean Squared Error (MSE) on Train Data: {mse:.4f}")

main1()


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# 数据加载函数
def load_data():
    # 加载单个文件数据
    data = pd.read_csv('merged_1.csv')  # 确保文件名称与实际一致
    return data


# 构建用户-电影评分矩阵
def create_user_movie_matrix(data):
    """
    从完整的数据构建用户-电影的评分矩阵
    """
    user_movie_matrix = data.pivot(index='userId', columns='movieId', values='rating')
    user_movie_matrix.fillna(0, inplace=True)  # 将缺失值（未评分）填充为 0
    return user_movie_matrix


# 矩阵分解 (梯度下降法) 带早停机制
def matrix_factorization_with_early_stopping(R, K, steps, alpha=0.002, lambda_reg=0.1, patience=10, tolerance=0.001):
    """
    矩阵分解 (随机梯度下降) 带早停机制
    参数:
        R: 用户-电影评分矩阵
        K: 潜在因子维度
        steps: 最大迭代次数
        alpha: 学习率
        lambda_reg: 正则化参数
        patience: 早停容忍次数
        tolerance: 损失改善的最低阈值（提升小于该值时认为没有改善）
    返回:
        P: 用户特征矩阵
        Q: 电影特征矩阵
        best_loss: 最佳损失值
    """
    num_users, num_movies = R.shape
    P = np.random.rand(num_users, K)  # 初始化用户特征矩阵
    Q = np.random.rand(num_movies, K)  # 初始化电影特征矩阵
    
    best_loss = float('inf')  # 最佳损失
    patience_counter = 0  # 早停计数器
    
    for step in range(steps):
        # 随机梯度更新
        for i in range(num_users):
            for j in range(num_movies):
                if R[i, j] > 0:  # 仅更新有评分的数据点
                    eij = R[i, j] - np.dot(P[i, :], Q[j, :].T)
                    for k in range(K):
                        P[i, k] += alpha * (2 * eij * Q[j, k] - lambda_reg * P[i, k])
                        Q[j, k] += alpha * (2 * eij * P[i, k] - lambda_reg * Q[j, k])
        
        # 每步计算当前的总误差
        loss = 0
        for i in range(num_users):
            for j in range(num_movies):
                if R[i, j] > 0:
                    loss += (R[i, j] - np.dot(P[i, :], Q[j, :].T)) ** 2
        
        # 打印每隔多步的损失
        if step % 10 == 0:
            print(f"Iteration {step}/{steps} => Loss: {loss:.4f}")
        
        # 检查是否早停
        if loss < best_loss - tolerance:  # 损失有显著改善
            best_loss = loss
            patience_counter = 0  # 重置早停计数器
        else:  # 损失没有改善
            patience_counter += 1
            print(f"Patience {patience_counter}/{patience}: No significant improvement in loss.")

        if patience_counter >= patience:  # 提前停止
            print(f"Early stopping triggered. Best loss: {best_loss:.4f}")
            break
    
    return P, Q, best_loss


# 获取预测评分
def get_predictions(data, P, Q, user_id_mapping, movie_id_mapping):
    """
    为每个用户预测评分，并将其调整为 0 到 5 且以 0.5 为间隔
    """
    data['predicted_rating'] = data.apply(
        lambda row: round_rating(
            np.dot(P[user_id_mapping[row['userId']], :], Q[movie_id_mapping[row['movieId']], :].T)
        ) if row['userId'] in user_id_mapping and row['movieId'] in movie_id_mapping else np.nan,
        axis=1
    )
    return data

# 新增函数： 四舍五入到最近 0.5
def round_rating(rating):
    """
    将预测评分四舍五入到最近的 0.5 并裁剪到 [0, 5]
    参数:
        rating: 预测评分
    返回:
        调整后的评分
    """
    rating = round(rating * 2) / 2  # 四舍五入到最近的 0.5
    return np.clip(rating, 0, 5)  # 限制在 0 到 5 范围内


# 交叉验证并计算MSE
def cross_validate(data, K=10, steps=100, alpha=0.002, lambda_reg=0.1, patience=10, tolerance=0.001, test_size=0.3, n_splits=5):
    mse_list = []
    
    for split in range(n_splits):
        print(f"Cross-validation split {split + 1}/{n_splits}")
        
        # 将数据划分为训练集和测试集
        train_data, test_data = train_test_split(data, test_size=test_size)

        # 创建训练集的用户-电影评分矩阵
        train_matrix = create_user_movie_matrix(train_data)

        # 获取用户和电影映射字典
        user_id_mapping = {id: idx for idx, id in enumerate(train_matrix.index)}
        movie_id_mapping = {id: idx for idx, id in enumerate(train_matrix.columns)}

        # 训练矩阵分解模型
        P, Q, best_loss = matrix_factorization_with_early_stopping(
            train_matrix.to_numpy(),
            K=K,
            steps=steps,
            alpha=alpha,
            lambda_reg=lambda_reg,
            patience=patience,
            tolerance=tolerance
        )
        print(f"Best Loss for this split: {best_loss:.4f}")

        # 计算测试集的预测评分
        test_data_with_predictions = get_predictions(test_data, P, Q, user_id_mapping, movie_id_mapping)

        # 计算均方误差 (MSE)
        mask = ~test_data_with_predictions['predicted_rating'].isna()  # 筛选出有效的预测数据
        mse = mean_squared_error(test_data_with_predictions[mask]['rating'], test_data_with_predictions[mask]['predicted_rating'])
        print(f"MSE for this split: {mse:.4f}")
        
        mse_list.append(mse)
    
    # 计算所有分割的平均MSE
    avg_mse = np.mean(mse_list)
    print(f"Average MSE across all splits: {avg_mse:.4f}")
    return avg_mse


# 替换主程序调用交叉验证的部分
def main():
    print("Loading data...")
    data = load_data()  # 加载数据集
    avg_mse = cross_validate(data, n_splits=5)  # 5折交叉验证
    print(f"Final Average MSE: {avg_mse:.4f}")

main()


## 预测

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

# 四舍五入到最近的 0.5 且裁剪到 [0, 5]
def round_rating(rating):
    """
    四舍五入评分到最近的 0.5，然后裁剪到 [0, 5] 的范围
    参数:
        rating: 预测的评分值
    返回:
        四舍五入且裁剪后的评分
    """
    rating = round(rating * 2) / 2  # 四舍五入到最近的 0.5
    return np.clip(rating, 0, 5)  # 限制到 [0, 5]


# 加载测试数据
def load_test_data(file_path):
    """
    加载测试数据集
    参数:
        file_path: 测试文件路径
    返回:
        测试数据 (dataframe)
    """
    ratings_test = pd.read_csv(file_path, sep=',')  # 加载测试数据
    return ratings_test


# 预测测试集中评分
def predict_test_ratings(test_data, P, Q, user_id_mapping, movie_id_mapping, global_avg_rating):
    """
    预测测试数据集中的评分
    参数:
        test_data: 测试数据集 (dataframe)，包含 userId 和 movieId
        P: 用户特征矩阵 (m x latent_factors)
        Q: 电影特征矩阵 (n x latent_factors)
        user_id_mapping: 用户 ID 映射到 P 的索引
        movie_id_mapping: 电影 ID 映射到 Q 的索引
        global_avg_rating: 训练集全局平均评分（作为冷启动策略时用）
    返回:
        测试数据集中预测的评分列表
    """
    predictions = []
    for _, row in test_data.iterrows():
        user_id = row['userId']
        movie_id = row['movieId']
        
        # 如果用户和电影在训练集中，使用矩阵分解结果预测评分
        if user_id in user_id_mapping and movie_id in movie_id_mapping:
            user_idx = user_id_mapping[user_id]
            movie_idx = movie_id_mapping[movie_id]
            predicted_rating = np.dot(P[user_idx, :], Q[movie_idx, :].T)
        else:
            # 冷启动策略：使用全局平均评分
            predicted_rating = global_avg_rating
        
        # 四舍五入到 0.5 并裁剪到 [0, 5]
        predictions.append(round_rating(predicted_rating))
    
    return predictions


# 主函数
def main2():
    print("Loading test data...")
    ratings_test = load_test_data('ratings_test.csv')  # 确保测试文件路径正确

    print("Loading pre-trained model...")
    # 加载已保存的模型
    try:
        P = np.load('P_matrix.npy')
        Q = np.load('Q_matrix.npy')
        user_id_mapping = np.load('user_id_mapping.npy', allow_pickle=True).item()
        movie_id_mapping = np.load('movie_id_mapping.npy', allow_pickle=True).item()
        global_avg_rating = np.load('global_avg_rating.npy')  # 加载全局平均评分
        print("Pre-trained model successfully loaded.")
    except FileNotFoundError as e:
        print(f"Model file missing: {e}")
        return

    print(f"Test Data Shape: {ratings_test.shape}")
    print("Predicting ratings on the test set...")

    # 调用预测函数
    ratings_test['predicted_rating'] = predict_test_ratings(
        ratings_test, P, Q, user_id_mapping, movie_id_mapping, global_avg_rating
    )

    # 保存预测结果到文件
    output_file = 'film_rating_predictions_group_E_week_.csv'
    ratings_test.to_csv(output_file, index=False)
    print(f"Predictions saved to '{output_file}'.")

    # （可选）评估模型性能：如果测试集有真实评分
    if 'rating' in ratings_test.columns:
        mse = mean_squared_error(ratings_test['rating'], ratings_test['predicted_rating'])
        print(f"Mean Squared Error (MSE) on Test Data: {mse:.4f}")


# 运行主函数
main2()


# Week 2

### SGD Training

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import pandas as pd

# 数据加载函数
def load_data():
    # 加载单个文件数据
    data = pd.read_csv('merged_1.csv')  # 确保文件名称与实际一致
    return data

# 构建用户-电影评分矩阵
def create_user_movie_matrix(data):
    """
    从完整的数据构建用户-电影的评分矩阵
    """
    user_movie_matrix = data.pivot(index='userId', columns='movieId', values='rating')
    user_movie_matrix.dropna(inplace=True)

    return user_movie_matrix


# 将genres列转换为二进制矩阵
def process_genres(data):
    """
    将电影的genres信息转换为二进制向量。
    """
    mlb = MultiLabelBinarizer()
    genres_matrix = mlb.fit_transform(data['genres'].str.split('|'))
    genres_df = pd.DataFrame(genres_matrix, columns=mlb.classes_)
    return genres_df, mlb.classes_

# 模型训练函数
def matrix_factorization_with_genres(R, genres_matrix, K, steps, alpha=0.002, lambda_reg=0.1, patience=10, tolerance=0.001):
    """
    矩阵分解 (随机梯度下降) 带早停机制，加入电影的genres信息
    """
    num_users, num_movies = R.shape
    num_genres = genres_matrix.shape[1]  # genres的维度
    
    P = np.random.rand(num_users, K)  # 用户特征矩阵
    Q = np.random.rand(num_movies, K)  # 电影特征矩阵
    G = np.random.rand(num_movies, num_genres)  # 电影的genres信息矩阵

    best_loss = float('inf')  # 最佳损失
    patience_counter = 0  # 早停计数器
    
    for step in range(steps):
        # 随机梯度更新
        for i in range(num_users):
            for j in range(num_movies):
                if R[i, j] > 0:  # 仅更新有评分的数据点
                    eij = R[i, j] - np.dot(P[i, :], Q[j, :].T) - np.dot(G[j, :], genres_matrix[j, :].T)
                    for k in range(K):
                        P[i, k] += alpha * (2 * eij * Q[j, k] - lambda_reg * P[i, k])
                        Q[j, k] += alpha * (2 * eij * P[i, k] - lambda_reg * Q[j, k])
                    # 更新genres相关的G矩阵
                    for g in range(num_genres):
                        G[j, g] += alpha * (2 * eij * genres_matrix[j, g] - lambda_reg * G[j, g])

        # 每步计算当前的总误差
        loss = 0
        for i in range(num_users):
            for j in range(num_movies):
                if R[i, j] > 0:
                    loss += (R[i, j] - np.dot(P[i, :], Q[j, :].T) - np.dot(G[j, :], genres_matrix[j, :].T)) ** 2
        
        if step % 10 == 0:
            print(f"Iteration {step}/{steps} => Loss: {loss:.4f}")

        # 检查是否早停
        if loss < best_loss - tolerance:  # 损失有显著改善
            best_loss = loss
            patience_counter = 0  # 重置早停计数器
        else:  # 损失没有改善
            patience_counter += 1
            print(f"Patience {patience_counter}/{patience}: No significant improvement in loss.")

        if patience_counter >= patience:  # 提前停止
            print(f"Early stopping triggered. Best loss: {best_loss:.4f}")
            break
    
    return P, Q, G, best_loss

# 计算预测评分
def get_predictions(data, P, Q, G, user_id_mapping, movie_id_mapping, genres_matrix):
    """
    为每个用户预测评分，并将其调整为 0 到 5 且以 0.5 为间隔
    """
    predictions = []
    for idx, row in data.iterrows():
        if row['userId'] in user_id_mapping and row['movieId'] in movie_id_mapping:
            user_idx = user_id_mapping[row['userId']]
            movie_idx = movie_id_mapping[row['movieId']]
            predicted_rating = np.dot(P[user_idx, :], Q[movie_idx, :].T) + np.dot(G[movie_idx, :], genres_matrix[movie_idx, :].T)
            predictions.append(predicted_rating)
        else:
            predictions.append(np.nan)
    
    return predictions

# 执行交叉验证并计算MSE
def cross_validate(data, K=10, steps=100, alpha=0.002, lambda_reg=0.1, patience=10, tolerance=0.001, test_size=0.3, n_splits=5):
    mse_list = []

    for split in range(n_splits):
        print(f"Cross-validation split {split + 1}/{n_splits}")
        
        # 将数据划分为训练集和测试集
        train_data, test_data = train_test_split(data, test_size=test_size)

        # 创建训练集的用户-电影评分矩阵
        train_matrix = create_user_movie_matrix(train_data)
        genres_matrix, _ = process_genres(train_data)

        # 获取用户和电影映射字典
        user_id_mapping = {id: idx for idx, id in enumerate(train_matrix.index)}
        movie_id_mapping = {id: idx for idx, id in enumerate(train_matrix.columns)}

        # 训练矩阵分解模型
        P, Q, G, best_loss = matrix_factorization_with_genres(
            train_matrix.to_numpy(),
            genres_matrix.to_numpy(),
            K=K,
            steps=steps,
            alpha=alpha,
            lambda_reg=lambda_reg,
            patience=patience,
            tolerance=tolerance
        )
        print(f"Best Loss for this split: {best_loss:.4f}")

        # 计算测试集的预测评分
        test_data['predicted_rating'] = get_predictions(test_data, P, Q, G, user_id_mapping, movie_id_mapping, genres_matrix.to_numpy())

        # 计算均方误差 (MSE)
        mask = ~test_data['predicted_rating'].isna()  # 筛选出有效的预测数据
        mse = mean_squared_error(test_data[mask]['rating'], test_data[mask]['predicted_rating'])
        print(f"MSE for this split: {mse:.4f}")
        
        mse_list.append(mse)
    
    # 计算所有分割的平均MSE
    avg_mse = np.mean(mse_list)
    print(f"Average MSE across all splits: {avg_mse:.4f}")
    return avg_mse

# 调用交叉验证函数
def main():
    print("Loading data...")
    data = load_data()  # 加载数据集
    avg_mse = cross_validate(data, n_splits=5)  # 5折交叉验证
    print(f"Final Average MSE: {avg_mse:.4f}")

main()   


### SVD Training

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

def load_data():
    data = pd.read_csv('merged_1.csv')
    return data

def process_genres(data):
    unique_movies = data[['movieId', 'genres']].drop_duplicates().sort_values('movieId')
    mlb = MultiLabelBinarizer()
    genres_matrix = mlb.fit_transform(unique_movies['genres'].str.split('|'))
    return genres_matrix, mlb.classes_

def create_user_movie_matrix(data):
    user_movie_matrix = data.pivot(index='userId', columns='movieId', values='rating')
    user_movie_matrix = user_movie_matrix.fillna(0)
    return user_movie_matrix

def svd_with_genres(R, genres_matrix, K, steps, alpha=0.002, lambda_reg=0.1, patience=10, tolerance=0.001):
    num_users, num_movies = R.shape
    num_genres = genres_matrix.shape[1]
    
    P = np.random.rand(num_users, K)
    Q = np.random.rand(num_movies, K)
    G = np.random.rand(num_movies, num_genres)
    b_u = np.zeros(num_users)
    b_i = np.zeros(num_movies)
    valid_ratings = R[R > 0]
    mu = np.mean(valid_ratings) if len(valid_ratings) > 0 else 0
    
    best_loss = float('inf')
    patience_counter = 0
    
    for step in range(steps):
        loss = 0
        for i in range(num_users):
            for j in range(num_movies):
                if R[i, j] > 0:
                    prediction = mu + b_u[i] + b_i[j] + np.dot(P[i, :], Q[j, :].T) + np.dot(G[j, :], genres_matrix[j, :].T)
                    eij = R[i, j] - prediction
                    for k in range(K):
                        P[i, k] += alpha * (2 * eij * Q[j, k] - lambda_reg * P[i, k])
                        Q[j, k] += alpha * (2 * eij * P[i, k] - lambda_reg * Q[j, k])
                    for g in range(num_genres):
                        G[j, g] += alpha * (2 * eij * genres_matrix[j, g] - lambda_reg * G[j, g])
                    b_u[i] += alpha * (2 * eij - lambda_reg * b_u[i])
                    b_i[j] += alpha * (2 * eij - lambda_reg * b_i[j])
                    loss += eij ** 2
        loss += lambda_reg * (np.sum(P**2) + np.sum(Q**2) + np.sum(G**2) + np.sum(b_u**2) + np.sum(b_i**2))
        
        if step % 10 == 0:
            print(f"Iteration {step}/{steps} => Loss: {loss:.4f}")
        
        if loss < best_loss - tolerance:
            best_loss = loss
            patience_counter = 0
        else:
            patience_counter += 1
        
        if patience_counter >= patience:
            print(f"Early stopping triggered at iteration {step}.")
            break
    
    return P, Q, G, b_u, b_i, mu

def get_predictions(R, P, Q, G, b_u, b_i, mu, genres_matrix, test_data, user_idx_dict, movie_idx_dict):
    predictions = []
    for _, row in test_data.iterrows():
        user_id = row['userId']
        movie_id = row['movieId']
        if user_id in user_idx_dict and movie_id in movie_idx_dict:
            user_idx = user_idx_dict[user_id]
            movie_idx = movie_idx_dict[movie_id]
            prediction = (mu + b_u[user_idx] + b_i[movie_idx] + 
                         np.dot(P[user_idx, :], Q[movie_idx, :].T) + 
                         np.dot(G[movie_idx, :], genres_matrix[movie_idx, :].T))
            if np.isnan(prediction) or np.isinf(prediction):
                prediction = mu
        else:
            prediction = mu
        predictions.append(prediction)
    return np.array(predictions)

def cross_validate(data, K=10, steps=100, alpha=0.002, lambda_reg=0.1, patience=10, tolerance=0.001, test_size=0.3, n_splits=5):
    mse_list = []
    for split in range(n_splits):
        print(f"Cross-validation split {split + 1}/{n_splits}")
        train_data, test_data = train_test_split(data, test_size=test_size)
        train_matrix = create_user_movie_matrix(train_data)
        genres_matrix, _ = process_genres(train_data)
        user_idx_dict = {user_id: idx for idx, user_id in enumerate(train_matrix.index)}
        movie_idx_dict = {movie_id: idx for idx, movie_id in enumerate(train_matrix.columns)}
        P, Q, G, b_u, b_i, mu = svd_with_genres(
            train_matrix.to_numpy(), genres_matrix, K, steps, alpha, lambda_reg, patience, tolerance
        )
        predictions = get_predictions(
            train_matrix.to_numpy(), P, Q, G, b_u, b_i, mu, genres_matrix, test_data, 
            user_idx_dict, movie_idx_dict
        )
        test_data['predicted_rating'] = predictions
        valid_mask = ~np.isnan(test_data['rating'])
        test_data_cleaned = test_data[valid_mask]
        mse = mean_squared_error(test_data_cleaned['rating'], test_data_cleaned['predicted_rating'])
        print(f"MSE for this split: {mse:.4f}")
        mse_list.append(mse)
    avg_mse = np.mean(mse_list)
    print(f"Average MSE across all splits: {avg_mse:.4f}")
    return avg_mse

def main():
    print("Loading data...")
    data = load_data()
    avg_mse = cross_validate(data, n_splits=5)
    print(f"Final Average MSE: {avg_mse:.4f}")

main()


### SGD2 Training

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# 数据加载 / Data loading
data = pd.read_csv('merged_1.csv')  # 假设你的训练数据文件名为 merged_1.csv / Assume your training data file is named merged_1.csv

# 数据预处理 / Data preprocessing
def preprocess_data(data):
    """
    中文：将 timestamp 转换为年份，并处理 genres 特征。
    English: Convert timestamp to year and process genres feature.
    """
    # 移除 userId 为 nan 的行
    data = data.dropna(subset=['userId'])

    # 将 timestamp 转换为年份作为时间特征 / Convert timestamp to year as time feature
    data['year'] = pd.to_datetime(data['timestamp'], unit='s').dt.year
    
    # 处理 genres，转换为二进制向量 / Process genres, convert to binary vectors
    mlb = MultiLabelBinarizer()
    genres_matrix = mlb.fit_transform(data['genres'].str.split('|'))
    genres_df = pd.DataFrame(genres_matrix, columns=mlb.classes_)
    
    # 合并 genres_df 到原始数据 / Merge genres_df to original data
    data = pd.concat([data, genres_df], axis=1)
    return data, mlb.classes_

# SGD 推荐模型 / SGD Recommender Model
class SGDRecommender:
    def __init__(self, num_users, num_movies, num_genres, num_years, k=20, alpha=0.005, lambda_reg=0.1, max_iter=100):
        """
        中文：初始化模型参数
        English: Initialize model parameters
        :param num_users: 用户数量 / Number of users
        :param num_movies: 电影数量 / Number of movies
        :param num_genres: 类别数量 / Number of genres
        :param num_years: 年份数量 / Number of years
        :param k: 潜在特征维度 / Latent feature dimension
        :param alpha: 学习率 / Learning rate
        :param lambda_reg: 正则化参数 / Regularization parameter
        :param max_iter: 最大迭代次数 / Maximum number of iterations
        """
        self.num_users = num_users
        self.num_movies = num_movies
        self.num_genres = num_genres
        self.num_years = num_years
        self.k = k
        self.alpha = alpha
        self.lambda_reg = lambda_reg
        self.max_iter = max_iter
        
        # 初始化参数 / Initialize parameters
        self.P = np.random.normal(scale=1./k, size=(num_users, k))  # 用户潜在特征 / User latent features
        self.Q = np.random.normal(scale=1./k, size=(num_movies, k))  # 电影潜在特征 / Movie latent features
        self.b_u = np.zeros(num_users)  # 用户偏差 / User bias
        self.b_i = np.zeros(num_movies)  # 电影偏差 / Movie bias
        self.b_g = np.zeros(num_genres)  # 类别偏差 / Genre bias
        self.b_y = np.zeros(num_years)  # 年份偏差 / Year bias
        self.mu = 0  # 全局均值 / Global mean
    
    def fit(self, train_data, user_map, movie_map, year_map, genre_cols):
        """
        中文：训练模型
        English: Train the model
        """
        # 计算全局均值 / Calculate global mean
        self.mu = train_data['rating'].mean()
        
        # 迭代训练 / Iterative training
        for _ in range(self.max_iter):
            for _, row in train_data.iterrows():
                u = user_map[row['userId']]
                i = movie_map[row['movieId']]
                y = year_map[row['year']]
                genres = row[genre_cols].values.astype(float)
                
                # 预测评分 / Predict rating
                pred = (self.mu + self.b_u[u] + self.b_i[i] + 
                        np.dot(self.P[u], self.Q[i]) + 
                        np.dot(self.b_g, genres) + self.b_y[y])
                error = row['rating'] - pred
                
                # 更新参数 / Update parameters
                self.b_u[u] += self.alpha * (error - self.lambda_reg * self.b_u[u])
                self.b_i[i] += self.alpha * (error - self.lambda_reg * self.b_i[i])
                self.P[u] += self.alpha * (error * self.Q[i] - self.lambda_reg * self.P[u])
                self.Q[i] += self.alpha * (error * self.P[u] - self.lambda_reg * self.Q[i])
                self.b_g += self.alpha * (error * genres - self.lambda_reg * self.b_g)
                self.b_y[y] += self.alpha * (error - self.lambda_reg * self.b_y[y])
    
    def predict(self, test_data, user_map, movie_map, year_map, genre_cols):
        """
        中文：预测测试集评分
        English: Predict ratings for the test set
        """
        predictions = []
        for _, row in test_data.iterrows():
            u = user_map.get(row['userId'], -1)  # 如果用户不在训练集中，返回 -1 / Return -1 if user not in training set
            i = movie_map.get(row['movieId'], -1)  # 如果电影不在训练集中，返回 -1 / Return -1 if movie not in training set
            y = year_map.get(row['year'], -1)  # 如果年份不在训练集中，返回 -1 / Return -1 if year not in training set
            genres = row[genre_cols].values.astype(float)
            
            # 冷启动处理 / Cold start handling
            if u == -1 or i == -1 or y == -1:
                pred = self.mu  # 对于新用户或新电影，使用全局均值 / Use global mean for new users or movies
            else:
                pred = (self.mu + self.b_u[u] + self.b_i[i] + 
                        np.dot(self.P[u], self.Q[i]) + 
                        np.dot(self.b_g, genres) + self.b_y[y])
            predictions.append(pred)
        return np.array(predictions)

# 交叉验证 / Cross-validation
def cross_validate(data, genre_cols, k_fold=5):
    """
    中文：执行 k 折交叉验证并计算 MSE（70% 训练，30% 测试）
    English: Perform k-fold cross-validation and calculate MSE (70% training, 30% testing)
    :param data: 预处理后的数据 / Preprocessed data
    :param genre_cols: 类别列名 / Genre column names
    :param k_fold: 折数 / Number of folds
    """
    # 确保数据中没有 userId 为 nan 的行
    data = data.dropna(subset=['userId'])
    
    kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
    mse_list = []
    
    for fold, (train_idx, test_idx) in enumerate(kf.split(data)):
        print(f"正在处理第 {fold+1} 折... / Processing fold {fold+1}...")
        train_data = data.iloc[train_idx]
        test_data = data.iloc[test_idx]
        
        # 创建映射字典 / Create mapping dictionaries
        user_ids = train_data['userId'].unique()
        movie_ids = train_data['movieId'].unique()
        years = train_data['year'].unique()
        
        user_map = {uid: idx for idx, uid in enumerate(user_ids)}
        movie_map = {mid: idx for idx, mid in enumerate(movie_ids)}
        year_map = {y: idx for idx, y in enumerate(years)}
        
        # 初始化模型（高精度超参数） / Initialize model with high precision hyperparameters
        model = SGDRecommender(
            num_users=len(user_ids),
            num_movies=len(movie_ids),
            num_genres=len(genre_cols),
            num_years=len(years),
            k=20,         # 潜在特征维度 / Latent feature dimension
            alpha=0.005,  # 学习率 / Learning rate
            lambda_reg=0.1,  # 正则化参数 / Regularization parameter
            max_iter=100    # 迭代次数 / Number of iterations
        )
        
        # 训练模型 / Train the model
        model.fit(train_data, user_map, movie_map, year_map, genre_cols)
        
        # 预测测试集 / Predict on test set
        predictions = model.predict(test_data, user_map, movie_map, year_map, genre_cols)
        
        # 计算 MSE / Calculate MSE
        mse = mean_squared_error(test_data['rating'], predictions)
        mse_list.append(mse)
        print(f"第 {fold+1} 折 MSE: {mse:.4f} / Fold {fold+1} MSE: {mse:.4f}")
    
    # 输出平均 MSE / Output average MSE
    avg_mse = np.mean(mse_list)
    print(f"平均 MSE: {avg_mse:.4f} / Average MSE: {avg_mse:.4f}")
    return avg_mse

# 主程序 / Main function
def main():
    # 数据预处理 / Data preprocessing
    data_processed, genre_cols = preprocess_data(data)
    
    # 执行交叉验证 / Perform cross-validation
    print("开始交叉验证（70% 训练，30% 测试）... / Starting cross-validation (70% training, 30% testing)...")
    avg_mse = cross_validate(data_processed, genre_cols, k_fold=5)
    print(f"最终平均 MSE: {avg_mse:.4f} / Final Average MSE: {avg_mse:.4f}")

if __name__ == "__main__":
    main()


#### SGD2 Training Rounding

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# 数据加载 / Data loading
data = pd.read_csv('merged_1.csv')  # 假设你的训练数据文件名为 merged_1.csv / Assume your training data file is named merged_1.csv

# 数据预处理 / Data preprocessing
def preprocess_data(data):
    """
    中文：将 timestamp 转换为年份，并处理 genres 特征。
    English: Convert timestamp to year and process genres feature.
    """
    # 移除 userId 为 nan 的行
    data = data.dropna(subset=['userId'])

    # 将 timestamp 转换为年份作为时间特征 / Convert timestamp to year as time feature
    data['year'] = pd.to_datetime(data['timestamp'], unit='s').dt.year
    
    # 处理 genres，转换为二进制向量 / Process genres, convert to binary vectors
    mlb = MultiLabelBinarizer()
    genres_matrix = mlb.fit_transform(data['genres'].str.split('|'))
    genres_df = pd.DataFrame(genres_matrix, columns=mlb.classes_)
    
    # 合并 genres_df 到原始数据 / Merge genres_df to original data
    data = pd.concat([data, genres_df], axis=1)
    return data, mlb.classes_

# SGD 推荐模型 / SGD Recommender Model
class SGDRecommender:
    def __init__(self, num_users, num_movies, num_genres, num_years, k=20, alpha=0.005, lambda_reg=0.1, max_iter=100):
        """
        中文：初始化模型参数
        English: Initialize model parameters
        :param num_users: 用户数量 / Number of users
        :param num_movies: 电影数量 / Number of movies
        :param num_genres: 类别数量 / Number of genres
        :param num_years: 年份数量 / Number of years
        :param k: 潜在特征维度 / Latent feature dimension
        :param alpha: 学习率 / Learning rate
        :param lambda_reg: 正则化参数 / Regularization parameter
        :param max_iter: 最大迭代次数 / Maximum number of iterations
        """
        self.num_users = num_users
        self.num_movies = num_movies
        self.num_genres = num_genres
        self.num_years = num_years
        self.k = k
        self.alpha = alpha
        self.lambda_reg = lambda_reg
        self.max_iter = max_iter
        
        # 初始化参数 / Initialize parameters
        self.P = np.random.normal(scale=1./k, size=(num_users, k))  # 用户潜在特征 / User latent features
        self.Q = np.random.normal(scale=1./k, size=(num_movies, k))  # 电影潜在特征 / Movie latent features
        self.b_u = np.zeros(num_users)  # 用户偏差 / User bias
        self.b_i = np.zeros(num_movies)  # 电影偏差 / Movie bias
        self.b_g = np.zeros(num_genres)  # 类别偏差 / Genre bias
        self.b_y = np.zeros(num_years)  # 年份偏差 / Year bias
        self.mu = 0  # 全局均值 / Global mean
    
    def fit(self, train_data, user_map, movie_map, year_map, genre_cols):
        """
        中文：训练模型
        English: Train the model
        """
        # 计算全局均值 / Calculate global mean
        self.mu = train_data['rating'].mean()
        
        # 迭代训练 / Iterative training
        for _ in range(self.max_iter):
            for _, row in train_data.iterrows():
                u = user_map[row['userId']]
                i = movie_map[row['movieId']]
                y = year_map[row['year']]
                genres = row[genre_cols].values.astype(float)
                
                # 预测评分 / Predict rating
                pred = (self.mu + self.b_u[u] + self.b_i[i] + 
                        np.dot(self.P[u], self.Q[i]) + 
                        np.dot(self.b_g, genres) + self.b_y[y])
                error = row['rating'] - pred
                
                # 更新参数 / Update parameters
                self.b_u[u] += self.alpha * (error - self.lambda_reg * self.b_u[u])
                self.b_i[i] += self.alpha * (error - self.lambda_reg * self.b_i[i])
                self.P[u] += self.alpha * (error * self.Q[i] - self.lambda_reg * self.P[u])
                self.Q[i] += self.alpha * (error * self.P[u] - self.lambda_reg * self.Q[i])
                self.b_g += self.alpha * (error * genres - self.lambda_reg * self.b_g)
                self.b_y[y] += self.alpha * (error - self.lambda_reg * self.b_y[y])
    
    def predict(self, test_data, user_map, movie_map, year_map, genre_cols):
        """
        中文：预测测试集评分
        English: Predict ratings for the test set
        """
        predictions = []
        for _, row in test_data.iterrows():
            u = user_map.get(row['userId'], -1)  # 如果用户不在训练集中，返回 -1 / Return -1 if user not in training set
            i = movie_map.get(row['movieId'], -1)  # 如果电影不在训练集中，返回 -1 / Return -1 if movie not in training set
            y = year_map.get(row['year'], -1)  # 如果年份不在训练集中，返回 -1 / Return -1 if year not in training set
            genres = row[genre_cols].values.astype(float)
            
            # 冷启动处理 / Cold start handling
            if u == -1 or i == -1 or y == -1:
                pred = self.mu  # 对于新用户或新电影，使用全局均值 / Use global mean for new users or movies
            else:
                pred = (self.mu + self.b_u[u] + self.b_i[i] + 
                        np.dot(self.P[u], self.Q[i]) + 
                        np.dot(self.b_g, genres) + self.b_y[y])
            predictions.append(pred)
        return np.array(predictions)

# 后处理预测评分 / Post-process predicted ratings
def postprocess_predictions(predictions):
    """
    中文：将预测评分裁剪到 0-5 并四舍五入到最近的 0.5 倍数。
    English: Clip predictions to 0-5 and round to the nearest 0.5 multiple.
    :param predictions: 原始预测评分 / Raw predicted ratings
    :return: 处理后的预测评分 / Processed predicted ratings
    """
    # 裁剪到 0-5 / Clip to 0-5
    clipped_predictions = np.clip(predictions, 0, 5)
    
    # 四舍五入到最近的 0.5 倍数 / Round to nearest 0.5 multiple
    rounded_predictions = np.round(clipped_predictions * 2) / 2
    return rounded_predictions


# 交叉验证 / Cross-validation
def cross_validate(data, genre_cols, k_fold=5):
    """
    中文：执行 k 折交叉验证并计算 MSE（70% 训练，30% 测试）
    English: Perform k-fold cross-validation and calculate MSE (70% training, 30% testing)
    :param data: 预处理后的数据 / Preprocessed data
    :param genre_cols: 类别列名 / Genre column names
    :param k_fold: 折数 / Number of folds
    """
    # 确保数据中没有 userId 为 nan 的行
    data = data.dropna(subset=['userId'])
    
    kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
    mse_list = []
    
    for fold, (train_idx, test_idx) in enumerate(kf.split(data)):
        print(f"正在处理第 {fold+1} 折... / Processing fold {fold+1}...")
        train_data = data.iloc[train_idx]
        test_data = data.iloc[test_idx]
        
        # 创建映射字典 / Create mapping dictionaries
        user_ids = train_data['userId'].unique()
        movie_ids = train_data['movieId'].unique()
        years = train_data['year'].unique()
        
        user_map = {uid: idx for idx, uid in enumerate(user_ids)}
        movie_map = {mid: idx for idx, mid in enumerate(movie_ids)}
        year_map = {y: idx for idx, y in enumerate(years)}
        
        # 初始化模型（高精度超参数） / Initialize model with high precision hyperparameters
        model = SGDRecommender(
            num_users=len(user_ids),
            num_movies=len(movie_ids),
            num_genres=len(genre_cols),
            num_years=len(years),
            k=20,         # 潜在特征维度 / Latent feature dimension
            alpha=0.005,  # 学习率 / Learning rate
            lambda_reg=0.1,  # 正则化参数 / Regularization parameter
            max_iter=100    # 迭代次数 / Number of iterations
        )
        
        # 训练模型 / Train the model
        model.fit(train_data, user_map, movie_map, year_map, genre_cols)
        
        # 预测测试集 / Predict on test set
        predictions = model.predict(test_data, user_map, movie_map, year_map, genre_cols)
        
        # 后处理预测评分 / Post-process predictions
        predictions = postprocess_predictions(predictions)

        # 计算 MSE / Calculate MSE
        mse = mean_squared_error(test_data['rating'], predictions)
        mse_list.append(mse)
        print(f"第 {fold+1} 折 MSE: {mse:.4f} / Fold {fold+1} MSE: {mse:.4f}")
    
    # 输出平均 MSE / Output average MSE
    avg_mse = np.mean(mse_list)
    print(f"平均 MSE: {avg_mse:.4f} / Average MSE: {avg_mse:.4f}")
    return avg_mse

# 主程序 / Main function
def main():
    # 数据预处理 / Data preprocessing
    data_processed, genre_cols = preprocess_data(data)
    
    # 执行交叉验证 / Perform cross-validation
    print("开始交叉验证（70% 训练，30% 测试）... / Starting cross-validation (70% training, 30% testing)...")
    avg_mse = cross_validate(data_processed, genre_cols, k_fold=5)
    print(f"最终平均 MSE: {avg_mse:.4f} / Final Average MSE: {avg_mse:.4f}")

if __name__ == "__main__":
    main()


#### SGD2 Training Pre + Post

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# 数据加载 / Data loading
data = pd.read_csv('merged_1.csv')  # 假设您的训练数据文件名为 merged_1.csv

# 数据预处理 / Data preprocessing
def preprocess_data(data):
    """
    中文：将 timestamp 转换为年份，处理 genres 特征，创建用户-电影评分矩阵并校正偏差。
    English: Convert timestamp to year, process genres feature, create user-movie rating matrix and correct bias.
    """
    # 移除 userId 为 NaN 的行 / Remove rows where userId is NaN
    data = data.dropna(subset=['userId'])

    # 将 timestamp 转换为年份作为时间特征 / Convert timestamp to year as time feature
    data['year'] = pd.to_datetime(data['timestamp'], unit='s').dt.year
    
    # 处理 genres，转换为二进制向量 / Process genres, convert to binary vectors
    mlb = MultiLabelBinarizer()
    genres_matrix = mlb.fit_transform(data['genres'].str.split('|'))
    genres_df = pd.DataFrame(genres_matrix, columns=mlb.classes_)
    
    # 合并 genres_df 到原始数据 / Merge genres_df to original data
    data = pd.concat([data, genres_df], axis=1)
    
    # 创建用户-电影评分矩阵 / Create user-movie rating matrix
    user_movie_matrix = data.pivot_table(index='userId', columns='movieId', values='rating')
    user_movie_matrix = user_movie_matrix * 2
    # 计算用户偏差和电影偏差 / Calculate user bias and movie bias
    user_bias = data.groupby('userId')['rating'].mean()
    movie_bias = data.groupby('movieId')['rating'].mean()
    
    # 校正评分矩阵 / Correct rating matrix
    # 对齐 user_bias 和 user_movie_matrix 的行（用户） / Align user_bias with user_movie_matrix rows (users)
    user_bias_aligned, _ = user_bias.align(user_movie_matrix, axis=0, join='right')
    user_movie_matrix = user_movie_matrix.sub(user_bias_aligned, axis=0)
    
    # 对齐 movie_bias 和 user_movie_matrix 的列（电影） / Align movie_bias with user_movie_matrix columns (movies)
    # 注意：Series.align 不能直接用于列对齐，所以我们对齐 movie_bias 和 user_movie_matrix.columns
    movie_bias_aligned = movie_bias.reindex(user_movie_matrix.columns, fill_value=0)
    user_movie_matrix = user_movie_matrix.sub(movie_bias_aligned, axis=1)
    
    # 处理 NaN 值 / Handle NaN values
    user_movie_matrix = user_movie_matrix.fillna(0)
    
    return data, mlb.classes_, user_movie_matrix, user_bias, movie_bias

# SGD 推荐模型 / SGD Recommender Model
class SGDRecommender:
    def __init__(self, num_users, num_movies, num_genres, num_years, k=20, alpha=0.005, lambda_reg=0.1, max_iter=100):
        """
        中文：初始化模型参数
        English: Initialize model parameters
        """
        self.num_users = num_users
        self.num_movies = num_movies
        self.num_genres = num_genres
        self.num_years = num_years
        self.k = k
        self.alpha = alpha
        self.lambda_reg = lambda_reg
        self.max_iter = max_iter
        
        # 初始化参数 / Initialize parameters
        self.P = np.random.normal(scale=1./k, size=(num_users, k))  # 用户潜在特征
        self.Q = np.random.normal(scale=1./k, size=(num_movies, k))  # 电影潜在特征
        self.b_u = np.zeros(num_users)  # 用户偏差
        self.b_i = np.zeros(num_movies)  # 电影偏差
        self.b_g = np.zeros(num_genres)  # 类别偏差
        self.b_y = np.zeros(num_years)  # 年份偏差
        self.mu = 0  # 全局均值
    
    def fit(self, train_data, user_map, movie_map, year_map, genre_cols):
        """
        中文：训练模型
        English: Train the model
        """
        self.mu = train_data['rating'].mean()
        
        for _ in range(self.max_iter):
            for _, row in train_data.iterrows():
                u = user_map[row['userId']]
                i = movie_map[row['movieId']]
                y = year_map[row['year']]
                genres = row[genre_cols].values.astype(float)
                
                pred = (self.mu + self.b_u[u] + self.b_i[i] + 
                        np.dot(self.P[u], self.Q[i]) + 
                        np.dot(self.b_g, genres) + self.b_y[y])
                error = row['rating'] - pred
                
                self.b_u[u] += self.alpha * (error - self.lambda_reg * self.b_u[u])
                self.b_i[i] += self.alpha * (error - self.lambda_reg * self.b_i[i])
                self.P[u] += self.alpha * (error * self.Q[i] - self.lambda_reg * self.P[u])
                self.Q[i] += self.alpha * (error * self.P[u] - self.lambda_reg * self.Q[i])
                self.b_g += self.alpha * (error * genres - self.lambda_reg * self.b_g)
                self.b_y[y] += self.alpha * (error - self.lambda_reg * self.b_y[y])
    
    def predict(self, test_data, user_map, movie_map, year_map, genre_cols, user_bias, movie_bias):
        """
        中文：预测测试集评分并进行后处理
        English: Predict ratings for the test set and perform post-processing
        """
        predictions = []
        for _, row in test_data.iterrows():
            u = user_map.get(row['userId'], -1)
            i = movie_map.get(row['movieId'], -1)
            y = year_map.get(row['year'], -1)
            genres = row[genre_cols].values.astype(float)
            
            if u == -1 or i == -1 or y == -1:
                pred = self.mu
            else:
                pred = (self.mu + self.b_u[u] + self.b_i[i] + 
                        np.dot(self.P[u], self.Q[i]) + 
                        np.dot(self.b_g, genres) + self.b_y[y])
                
                # 后处理：调整预测评分
                if row['movieId'] in movie_bias:
                    pred += user_bias.get(row['userId'], 0) + movie_bias.get(row['movieId'], 0)
                    pred /= 2
                else:
                    pred += user_bias.get(row['userId'], 0)
                pred = round(pred)
            
            predictions.append(pred)
        return np.array(predictions)

# 交叉验证 / Cross-validation
def cross_validate(data, genre_cols, k_fold=5):
    """
    中文：执行 k 折交叉验证并计算 MSE
    English: Perform k-fold cross-validation and calculate MSE
    """
    kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
    mse_list = []
    
    for fold, (train_idx, test_idx) in enumerate(kf.split(data)):
        print(f"正在处理第 {fold+1} 折...")
        train_data = data.iloc[train_idx]
        test_data = data.iloc[test_idx]
        
        user_ids = train_data['userId'].unique()
        movie_ids = train_data['movieId'].unique()
        years = train_data['year'].unique()
        
        user_map = {uid: idx for idx, uid in enumerate(user_ids)}
        movie_map = {mid: idx for idx, mid in enumerate(movie_ids)}
        year_map = {y: idx for idx, y in enumerate(years)}
        
        user_bias = train_data.groupby('userId')['rating'].mean()
        movie_bias = train_data.groupby('movieId')['rating'].mean()
        
        model = SGDRecommender(
            num_users=len(user_ids),
            num_movies=len(movie_ids),
            num_genres=len(genre_cols),
            num_years=len(years),
            k=20,
            alpha=0.005,
            lambda_reg=0.1,
            max_iter=100
        )
        
        model.fit(train_data, user_map, movie_map, year_map, genre_cols)
        predictions = model.predict(test_data, user_map, movie_map, year_map, genre_cols, user_bias, movie_bias)
        
        mse = mean_squared_error(test_data['rating'], predictions)
        mse_list.append(mse)
        print(f"第 {fold+1} 折 MSE: {mse:.4f}")
    
    avg_mse = np.mean(mse_list)
    print(f"平均 MSE: {avg_mse:.4f}")
    return avg_mse

# 主程序 / Main function
def main():
    data_processed, genre_cols, user_movie_matrix, user_bias, movie_bias = preprocess_data(data)
    print("开始交叉验证（70% 训练，30% 测试）...")
    avg_mse = cross_validate(data_processed, genre_cols, k_fold=5)
    print(f"最终平均 MSE: {avg_mse:.4f}")

if __name__ == "__main__":
    main()


### SGD2 Forecast


#### without rounding 最终选用

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# 数据加载 / Data loading
train_data = pd.read_csv('merged_1.csv')  # 训练集文件 / Training set file
test_data = pd.read_csv('ratings_test.csv')  # 测试集文件 / Test set file

# 从训练集中提取 movieId 到 genres 的映射 / Extract movieId to genres mapping from training set
movie_genres_map = train_data[['movieId', 'genres']].drop_duplicates().set_index('movieId')['genres'].to_dict()

# 数据预处理 / Data preprocessing
def preprocess_data(data, mlb=None, is_train=True, movie_genres_map=None):
    """
    中文：对训练集或测试集进行预处理，转换 timestamp 和 genres。
    English: Preprocess the training or test set by converting timestamp and genres.
    :param data: 输入数据集 / Input dataset
    :param mlb: MultiLabelBinarizer 对象，用于 genres 转换 / MultiLabelBinarizer object for genres transformation
    :param is_train: 是否为训练集 / Whether it’s the training set
    :param movie_genres_map: movieId 到 genres 的映射（测试集使用） / Mapping from movieId to genres (for test set)
    :return: 预处理后的数据和 mlb（如果是训练集） / Preprocessed data and mlb (if training)
    """
    # 移除 userId 或 movieId 为 nan 的行 / Remove rows where userId or movieId is nan
    data = data.dropna(subset=['userId', 'movieId'])
    
    # 如果是测试集，添加 genres 列 / If test set, add genres column
    if not is_train:
        data = data.copy()  # 避免修改原始数据 / Avoid modifying original data
        data['genres'] = data['movieId'].map(movie_genres_map)
        # 处理缺失的 genres（movieId 不在训练集中） / Handle missing genres (movieId not in training set)
        data['genres'].fillna('Unknown', inplace=True)
    
    # 将 timestamp 转换为年份 / Convert timestamp to year
    data['year'] = pd.to_datetime(data['timestamp'], unit='s').dt.year
    
    # 处理 genres，转换为二进制向量 / Process genres, convert to binary vectors
    if is_train:
        mlb = MultiLabelBinarizer()
        genres_matrix = mlb.fit_transform(data['genres'].str.split('|'))
    else:
        genres_matrix = mlb.transform(data['genres'].str.split('|'))
    genres_df = pd.DataFrame(genres_matrix, columns=mlb.classes_)
    
    # 合并 genres_df 到数据 / Merge genres_df to data
    data = pd.concat([data, genres_df], axis=1)
    return data, mlb if is_train else data

# SGD 推荐模型 / SGD Recommender Model
class SGDRecommender:
    def __init__(self, num_users, num_movies, num_genres, num_years, k=20, alpha=0.005, lambda_reg=0.1, max_iter=100):
        """
        中文：初始化模型参数
        English: Initialize model parameters
        :param num_users: 用户数量 / Number of users
        :param num_movies: 电影数量 / Number of movies
        :param num_genres: 类别数量 / Number of genres
        :param num_years: 年份数量 / Number of years
        :param k: 潜在特征维度 / Latent feature dimension
        :param alpha: 学习率 / Learning rate
        :param lambda_reg: 正则化参数 / Regularization parameter
        :param max_iter: 最大迭代次数 / Maximum number of iterations
        """
        self.num_users = num_users
        self.num_movies = num_movies
        self.num_genres = num_genres
        self.num_years = num_years
        self.k = k
        self.alpha = alpha
        self.lambda_reg = lambda_reg
        self.max_iter = max_iter
        
        # 初始化参数 / Initialize parameters
        self.P = np.random.normal(scale=1./k, size=(num_users, k))  # 用户潜在特征 / User latent features
        self.Q = np.random.normal(scale=1./k, size=(num_movies, k))  # 电影潜在特征 / Movie latent features
        self.b_u = np.zeros(num_users)  # 用户偏差 / User bias
        self.b_i = np.zeros(num_movies)  # 电影偏差 / Movie bias
        self.b_g = np.zeros(num_genres)  # 类别偏差 / Genre bias
        self.b_y = np.zeros(num_years)  # 年份偏差 / Year bias
        self.mu = 0  # 全局均值 / Global mean
    
    def fit(self, train_data, user_map, movie_map, year_map, genre_cols):
        """
        中文：训练模型
        English: Train the model
        """
        # 计算全局均值 / Calculate global mean
        self.mu = train_data['rating'].mean()
        
        # 迭代训练 / Iterative training
        for _ in range(self.max_iter):
            for _, row in train_data.iterrows():
                u = user_map[row['userId']]
                i = movie_map[row['movieId']]
                y = year_map[row['year']]
                genres = row[genre_cols].values.astype(float)
                
                # 预测评分 / Predict rating
                pred = (self.mu + self.b_u[u] + self.b_i[i] + 
                        np.dot(self.P[u], self.Q[i]) + 
                        np.dot(self.b_g, genres) + self.b_y[y])
                error = row['rating'] - pred
                
                # 更新参数 / Update parameters
                self.b_u[u] += self.alpha * (error - self.lambda_reg * self.b_u[u])
                self.b_i[i] += self.alpha * (error - self.lambda_reg * self.b_i[i])
                self.P[u] += self.alpha * (error * self.Q[i] - self.lambda_reg * self.P[u])
                self.Q[i] += self.alpha * (error * self.P[u] - self.lambda_reg * self.Q[i])
                self.b_g += self.alpha * (error * genres - self.lambda_reg * self.b_g)
                self.b_y[y] += self.alpha * (error - self.lambda_reg * self.b_y[y])
    
    def predict(self, test_data, user_map, movie_map, year_map, genre_cols):
        """
        中文：预测测试集评分
        English: Predict ratings for the test set
        """
        predictions = []
        for _, row in test_data.iterrows():
            u = user_map.get(row['userId'], -1)  # 如果用户不在训练集中，返回 -1 / Return -1 if user not in training set
            i = movie_map.get(row['movieId'], -1)  # 如果电影不在训练集中，返回 -1 / Return -1 if movie not in training set
            y = year_map.get(row['year'], -1)  # 如果年份不在训练集中，返回 -1 / Return -1 if year not in training set
            genres = row[genre_cols].values.astype(float)
            
            # 冷启动处理 / Cold start handling
            if u == -1 or i == -1 or y == -1:
                pred = self.mu  # 对于新用户或新电影，使用全局均值 / Use global mean for new users or movies
            else:
                pred = (self.mu + self.b_u[u] + self.b_i[i] + 
                        np.dot(self.P[u], self.Q[i]) + 
                        np.dot(self.b_g, genres) + self.b_y[y])
            predictions.append(pred)
        return np.array(predictions)


# # 后处理预测评分 / Post-process predicted ratings
# def postprocess_predictions(predictions):
#     """
#     中文：将预测评分裁剪到 0-5 并四舍五入到最近的 0.5 倍数。
#     English: Clip predictions to 0-5 and round to the nearest 0.5 multiple.
#     :param predictions: 原始预测评分 / Raw predicted ratings
#     :return: 处理后的预测评分 / Processed predicted ratings
#     """
#     # 裁剪到 0-5 / Clip to 0-5
#     clipped_predictions = np.clip(predictions, 0, 5)
    
#     # 四舍五入到最近的 0.5 倍数 / Round to nearest 0.5 multiple
#     rounded_predictions = np.round(clipped_predictions * 2) / 2
#     return rounded_predictions

# 主程序 / Main function
def main():
    # 预处理训练集 / Preprocess training set
    train_data_processed, mlb = preprocess_data(train_data, is_train=True)
    
    # 创建映射字典 / Create mapping dictionaries
    user_ids = train_data_processed['userId'].unique()
    movie_ids = train_data_processed['movieId'].unique()
    years = train_data_processed['year'].unique()
    genre_cols = mlb.classes_
    
    user_map = {uid: idx for idx, uid in enumerate(user_ids)}
    movie_map = {mid: idx for idx, mid in enumerate(movie_ids)}
    year_map = {y: idx for idx, y in enumerate(years)}
    
    # 初始化并训练模型 / Initialize and train the model
    model = SGDRecommender(
        num_users=len(user_ids),
        num_movies=len(movie_ids),
        num_genres=len(genre_cols),
        num_years=len(years),
        k=20,         # 潜在特征维度 / Latent feature dimension
        alpha=0.005,  # 学习率 / Learning rate
        lambda_reg=0.1,  # 正则化参数 / Regularization parameter
        max_iter=100    # 迭代次数 / Number of iterations
    )
    print("开始训练模型... / Starting model training...")
    model.fit(train_data_processed, user_map, movie_map, year_map, genre_cols)
    
    # 预处理测试集，传入 movie_genres_map / Preprocess test set with movie_genres_map
    test_data_processed, _ = preprocess_data(test_data, mlb=mlb, is_train=False, movie_genres_map=movie_genres_map)
    
    # 预测测试集评分 / Predict ratings for test set
    print("开始预测测试集... / Starting prediction on test set...")
    raw_predictions = model.predict(test_data_processed, user_map, movie_map, year_map, genre_cols)
    
    # # 后处理预测评分 / Post-process predictions
    # final_predictions = postprocess_predictions(raw_predictions)
    
    # 将处理后的预测结果添加到测试集 / Add processed predictions to test set
    # test_data_processed['predicted_rating'] = final_predictions
    test_data_processed['predicted_rating'] = raw_predictions
    
    # 输出预测结果 / Output prediction results
    print("预测结果示例： / Prediction examples:")
    print(test_data_processed[['userId', 'movieId', 'timestamp', 'predicted_rating']].head())
    
    # 保存预测结果到文件 / Save predictions to file
    test_data_processed.to_csv('predicted_ratings.csv', index=False)
    print("预测结果已保存到 'predicted_ratings.csv' / Predictions saved to 'predicted_ratings.csv'")

if __name__ == "__main__":
    main()


#### rounding

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# 数据加载 / Data loading
train_data = pd.read_csv('merged_1.csv')  # 训练集文件 / Training set file
test_data = pd.read_csv('ratings_test.csv')  # 测试集文件 / Test set file

# 从训练集中提取 movieId 到 genres 的映射 / Extract movieId to genres mapping from training set
movie_genres_map = train_data[['movieId', 'genres']].drop_duplicates().set_index('movieId')['genres'].to_dict()

# 数据预处理 / Data preprocessing
def preprocess_data(data, mlb=None, is_train=True, movie_genres_map=None):
    """
    中文：对训练集或测试集进行预处理，转换 timestamp 和 genres。
    English: Preprocess the training or test set by converting timestamp and genres.
    :param data: 输入数据集 / Input dataset
    :param mlb: MultiLabelBinarizer 对象，用于 genres 转换 / MultiLabelBinarizer object for genres transformation
    :param is_train: 是否为训练集 / Whether it’s the training set
    :param movie_genres_map: movieId 到 genres 的映射（测试集使用） / Mapping from movieId to genres (for test set)
    :return: 预处理后的数据和 mlb（如果是训练集） / Preprocessed data and mlb (if training)
    """
    # 移除 userId 或 movieId 为 nan 的行 / Remove rows where userId or movieId is nan
    data = data.dropna(subset=['userId', 'movieId'])
    
    # 如果是测试集，添加 genres 列 / If test set, add genres column
    if not is_train:
        data = data.copy()  # 避免修改原始数据 / Avoid modifying original data
        data['genres'] = data['movieId'].map(movie_genres_map)
        # 处理缺失的 genres（movieId 不在训练集中） / Handle missing genres (movieId not in training set)
        data['genres'].fillna('Unknown', inplace=True)
    
    # 将 timestamp 转换为年份 / Convert timestamp to year
    data['year'] = pd.to_datetime(data['timestamp'], unit='s').dt.year
    
    # 处理 genres，转换为二进制向量 / Process genres, convert to binary vectors
    if is_train:
        mlb = MultiLabelBinarizer()
        genres_matrix = mlb.fit_transform(data['genres'].str.split('|'))
    else:
        genres_matrix = mlb.transform(data['genres'].str.split('|'))
    genres_df = pd.DataFrame(genres_matrix, columns=mlb.classes_)
    
    # 合并 genres_df 到数据 / Merge genres_df to data
    data = pd.concat([data, genres_df], axis=1)
    return data, mlb if is_train else data

# SGD 推荐模型 / SGD Recommender Model
class SGDRecommender:
    def __init__(self, num_users, num_movies, num_genres, num_years, k=20, alpha=0.005, lambda_reg=0.1, max_iter=100):
        """
        中文：初始化模型参数
        English: Initialize model parameters
        :param num_users: 用户数量 / Number of users
        :param num_movies: 电影数量 / Number of movies
        :param num_genres: 类别数量 / Number of genres
        :param num_years: 年份数量 / Number of years
        :param k: 潜在特征维度 / Latent feature dimension
        :param alpha: 学习率 / Learning rate
        :param lambda_reg: 正则化参数 / Regularization parameter
        :param max_iter: 最大迭代次数 / Maximum number of iterations
        """
        self.num_users = num_users
        self.num_movies = num_movies
        self.num_genres = num_genres
        self.num_years = num_years
        self.k = k
        self.alpha = alpha
        self.lambda_reg = lambda_reg
        self.max_iter = max_iter
        
        # 初始化参数 / Initialize parameters
        self.P = np.random.normal(scale=1./k, size=(num_users, k))  # 用户潜在特征 / User latent features
        self.Q = np.random.normal(scale=1./k, size=(num_movies, k))  # 电影潜在特征 / Movie latent features
        self.b_u = np.zeros(num_users)  # 用户偏差 / User bias
        self.b_i = np.zeros(num_movies)  # 电影偏差 / Movie bias
        self.b_g = np.zeros(num_genres)  # 类别偏差 / Genre bias
        self.b_y = np.zeros(num_years)  # 年份偏差 / Year bias
        self.mu = 0  # 全局均值 / Global mean
    
    def fit(self, train_data, user_map, movie_map, year_map, genre_cols):
        """
        中文：训练模型
        English: Train the model
        """
        # 计算全局均值 / Calculate global mean
        self.mu = train_data['rating'].mean()
        
        # 迭代训练 / Iterative training
        for _ in range(self.max_iter):
            for _, row in train_data.iterrows():
                u = user_map[row['userId']]
                i = movie_map[row['movieId']]
                y = year_map[row['year']]
                genres = row[genre_cols].values.astype(float)
                
                # 预测评分 / Predict rating
                pred = (self.mu + self.b_u[u] + self.b_i[i] + 
                        np.dot(self.P[u], self.Q[i]) + 
                        np.dot(self.b_g, genres) + self.b_y[y])
                error = row['rating'] - pred
                
                # 更新参数 / Update parameters
                self.b_u[u] += self.alpha * (error - self.lambda_reg * self.b_u[u])
                self.b_i[i] += self.alpha * (error - self.lambda_reg * self.b_i[i])
                self.P[u] += self.alpha * (error * self.Q[i] - self.lambda_reg * self.P[u])
                self.Q[i] += self.alpha * (error * self.P[u] - self.lambda_reg * self.Q[i])
                self.b_g += self.alpha * (error * genres - self.lambda_reg * self.b_g)
                self.b_y[y] += self.alpha * (error - self.lambda_reg * self.b_y[y])
    
    def predict(self, test_data, user_map, movie_map, year_map, genre_cols):
        """
        中文：预测测试集评分
        English: Predict ratings for the test set
        """
        predictions = []
        for _, row in test_data.iterrows():
            u = user_map.get(row['userId'], -1)  # 如果用户不在训练集中，返回 -1 / Return -1 if user not in training set
            i = movie_map.get(row['movieId'], -1)  # 如果电影不在训练集中，返回 -1 / Return -1 if movie not in training set
            y = year_map.get(row['year'], -1)  # 如果年份不在训练集中，返回 -1 / Return -1 if year not in training set
            genres = row[genre_cols].values.astype(float)
            
            # 冷启动处理 / Cold start handling
            if u == -1 or i == -1 or y == -1:
                pred = self.mu  # 对于新用户或新电影，使用全局均值 / Use global mean for new users or movies
            else:
                pred = (self.mu + self.b_u[u] + self.b_i[i] + 
                        np.dot(self.P[u], self.Q[i]) + 
                        np.dot(self.b_g, genres) + self.b_y[y])
            predictions.append(pred)
        return np.array(predictions)

# 后处理预测评分 / Post-process predicted ratings
def postprocess_predictions(predictions):
    """
    中文：将预测评分裁剪到 0-5 并四舍五入到最近的 0.5 倍数。
    English: Clip predictions to 0-5 and round to the nearest 0.5 multiple.
    :param predictions: 原始预测评分 / Raw predicted ratings
    :return: 处理后的预测评分 / Processed predicted ratings
    """
    # 裁剪到 0-5 / Clip to 0-5
    clipped_predictions = np.clip(predictions, 0, 5)
    
    # 四舍五入到最近的 0.5 倍数 / Round to nearest 0.5 multiple
    rounded_predictions = np.round(clipped_predictions * 2) / 2
    return rounded_predictions

# 主程序 / Main function
def main():
    # 预处理训练集 / Preprocess training set
    train_data_processed, mlb = preprocess_data(train_data, is_train=True)
    
    # 创建映射字典 / Create mapping dictionaries
    user_ids = train_data_processed['userId'].unique()
    movie_ids = train_data_processed['movieId'].unique()
    years = train_data_processed['year'].unique()
    genre_cols = mlb.classes_
    
    user_map = {uid: idx for idx, uid in enumerate(user_ids)}
    movie_map = {mid: idx for idx, mid in enumerate(movie_ids)}
    year_map = {y: idx for idx, y in enumerate(years)}
    
    # 初始化并训练模型 / Initialize and train the model
    model = SGDRecommender(
        num_users=len(user_ids),
        num_movies=len(movie_ids),
        num_genres=len(genre_cols),
        num_years=len(years),
        k=20,         # 潜在特征维度 / Latent feature dimension
        alpha=0.005,  # 学习率 / Learning rate
        lambda_reg=0.1,  # 正则化参数 / Regularization parameter
        max_iter=100    # 迭代次数 / Number of iterations
    )
    print("开始训练模型... / Starting model training...")
    model.fit(train_data_processed, user_map, movie_map, year_map, genre_cols)
    
    # 预处理测试集，传入 movie_genres_map / Preprocess test set with movie_genres_map
    test_data_processed, _ = preprocess_data(test_data, mlb=mlb, is_train=False, movie_genres_map=movie_genres_map)
    
    # 预测测试集评分 / Predict ratings for test set
    print("开始预测测试集... / Starting prediction on test set...")
    raw_predictions = model.predict(test_data_processed, user_map, movie_map, year_map, genre_cols)
    
    # 后处理预测评分 / Post-process predictions
    final_predictions = postprocess_predictions(raw_predictions)
    
    # 将处理后的预测结果添加到测试集 / Add processed predictions to test set
    test_data_processed['predicted_rating'] = final_predictions
    
    # 输出预测结果 / Output prediction results
    print("预测结果示例： / Prediction examples:")
    print(test_data_processed[['userId', 'movieId', 'timestamp', 'predicted_rating']].head())
    
    # 保存预测结果到文件 / Save predictions to file
    test_data_processed.to_csv('predicted_ratings.csv', index=False)
    print("预测结果已保存到 'predicted_ratings.csv' / Predictions saved to 'predicted_ratings.csv'")

if __name__ == "__main__":
    main()


# Week 3

## XGBoost

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# 数据加载 / Data loading
data = pd.read_csv('merged_1.csv')  # 假设您的训练数据文件名为 merged_1.csv

# 数据预处理 / Data preprocessing
def preprocess_data(data):
    """
    中文：将 timestamp 转换为年份，处理 genres 特征，并添加用户和电影的平均评分特征。
    English: Convert timestamp to year, process genres feature, and add mean rating features for users and movies.
    """
    # 移除 userId 为 nan 的行
    data = data.dropna(subset=['userId'])

    # 将 timestamp 转换为年份作为时间特征 / Convert timestamp to year as time feature
    data['year'] = pd.to_datetime(data['timestamp'], unit='s').dt.year
    
    # 处理 genres，转换为二进制向量 / Process genres, convert to binary vectors
    mlb = MultiLabelBinarizer()
    genres_matrix = mlb.fit_transform(data['genres'].str.split('|'))
    genres_df = pd.DataFrame(genres_matrix, columns=mlb.classes_)
    
    # 合并 genres_df 到原始数据 / Merge genres_df to original data
    data = pd.concat([data, genres_df], axis=1)
    
    # 创建用户和电影的平均评分特征 / Create mean rating features for users and movies
    user_mean_rating = data.groupby('userId')['rating'].mean().to_dict()
    movie_mean_rating = data.groupby('movieId')['rating'].mean().to_dict()
    
    data['user_mean_rating'] = data['userId'].map(user_mean_rating)
    data['movie_mean_rating'] = data['movieId'].map(movie_mean_rating)
    
    return data, mlb.classes_

# 交叉验证 / Cross-validation
def cross_validate(data, genre_cols, k_fold=5):
    """
    中文：执行 k 折交叉验证并计算 MSE（70% 训练，30% 测试）
    English: Perform k-fold cross-validation and calculate MSE (70% training, 30% testing)
    """
    # 确保数据中没有 userId 为 nan 的行
    data = data.dropna(subset=['userId'])
    
    kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
    mse_list = []
    
    for fold, (train_idx, test_idx) in enumerate(kf.split(data)):
        print(f"正在处理第 {fold+1} 折... / Processing fold {fold+1}...")
        train_data = data.iloc[train_idx]
        test_data = data.iloc[test_idx]
        
        # 特征列 / Feature columns
        feature_cols = ['user_mean_rating', 'movie_mean_rating', 'year'] + list(genre_cols)
        
        # 训练集和测试集 / Training and test sets
        X_train = train_data[feature_cols]
        y_train = train_data['rating']
        X_test = test_data[feature_cols]
        y_test = test_data['rating']
        
        # 分割训练集为训练和验证集 / Split training set into train and validation
        X_train_split = X_train[:-int(0.2 * len(X_train))]
        X_val_split = X_train[-int(0.2 * len(X_train)):]
        y_train_split = y_train[:-int(0.2 * len(y_train))]
        y_val_split = y_train[-int(0.2 * len(y_train)):]
        
        # 创建DMatrix，用于XGBoost原生接口 / Create DMatrix for XGBoost native interface
        dtrain = xgb.DMatrix(X_train_split, label=y_train_split)
        dval = xgb.DMatrix(X_val_split, label=y_val_split)
        dtest = xgb.DMatrix(X_test)
        
        # 设置参数 / Define parameters
        params = {
            'objective': 'reg:squarederror',  # 平方误差回归
            'eta': 0.05,                      # 学习率，降低以提高精度
            'max_depth': 6,                   # 树的最大深度
            'subsample': 0.8,                 # 使用80%样本，防止过拟合
            'colsample_bytree': 0.8,          # 使用80%特征，防止过拟合
            'nthread': -1,                    # 并行计算，使用所有CPU核心
            'seed': 42                        # 随机种子，确保可复现
        }
        
        # 训练模型，带早停 / Train with early stopping
        evals = [(dval, 'eval')]  # 验证集用于早停
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=500,          # 最大树数量，支持早停
            evals=evals,
            early_stopping_rounds=10,     # 10轮无改进则停止
            verbose_eval=False            # 不显示训练过程
        )
        
        # 预测测试集 / Predict on test set
        predictions = model.predict(dtest)
        
        # 计算 MSE / Calculate MSE
        mse = mean_squared_error(y_test, predictions)
        mse_list.append(mse)
        print(f"第 {fold+1} 折 MSE: {mse:.4f} / Fold {fold+1} MSE: {mse:.4f}")
    
    # 输出平均 MSE / Output average MSE
    avg_mse = np.mean(mse_list)
    print(f"平均 MSE: {avg_mse:.4f} / Average MSE: {avg_mse:.4f}")
    return avg_mse

# 主程序 / Main function
def main():
    # 数据预处理 / Data preprocessing
    data_processed, genre_cols = preprocess_data(data)
    
    # 执行交叉验证 / Perform cross-validation
    print("开始交叉验证（70% 训练，30% 测试）... / Starting cross-validation (70% training, 30% testing)...")
    avg_mse = cross_validate(data_processed, genre_cols, k_fold=5)
    print(f"最终平均 MSE: {avg_mse:.4f} / Final Average MSE: {avg_mse:.4f}")

if __name__ == "__main__":
    main()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['year'] = pd.to_datetime(data['timestamp'], unit='s').dt.year


开始交叉验证（70% 训练，30% 测试）... / Starting cross-validation (70% training, 30% testing)...
正在处理第 1 折... / Processing fold 1...
第 1 折 MSE: 0.6669 / Fold 1 MSE: 0.6669
正在处理第 2 折... / Processing fold 2...
第 2 折 MSE: 0.6537 / Fold 2 MSE: 0.6537
正在处理第 3 折... / Processing fold 3...
第 3 折 MSE: 0.6526 / Fold 3 MSE: 0.6526
正在处理第 4 折... / Processing fold 4...
第 4 折 MSE: 0.6439 / Fold 4 MSE: 0.6439
正在处理第 5 折... / Processing fold 5...
第 5 折 MSE: 0.6525 / Fold 5 MSE: 0.6525
平均 MSE: 0.6539 / Average MSE: 0.6539
最终平均 MSE: 0.6539 / Final Average MSE: 0.6539


## XGBoost 2

增加模型特征：用户评分次数，单独用户平均评分，电影的年份,评分年份和电影年份的差。把全部特征用于冷启动问题

Feature Engineering: the amount of user rating; the average of the user; the year of movie; the gap between rating year and movie year. Using all features for the cold start problem

In [None]:
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# 数据加载 / Data loading
data = pd.read_csv('merged_1.csv')

# 数据预处理 / Data preprocessing
def preprocess_data(data):
    """
    中文：将 timestamp 转换为年份，处理 genres 特征，提取电影年份，计算用户评分次数、用户平均评分、电影年份及评分年份与电影年份的差。
    English: Convert timestamp to year, process genres feature, extract movie year, calculate user rating count, user mean rating, movie year, and the difference between rating year and movie year.
    """
    data = data.dropna(subset=['userId'])
    data['rating_year'] = pd.to_datetime(data['timestamp'], unit='s').dt.year
    data['movie_year'] = data['title'].apply(lambda x: int(re.search(r'\((\d{4})\)', x).group(1)) if re.search(r'\((\d{4})\)', x) else np.nan)
    
    mlb = MultiLabelBinarizer()
    genres_matrix = mlb.fit_transform(data['genres'].str.split('|'))
    genres_df = pd.DataFrame(genres_matrix, columns=mlb.classes_)
    data = pd.concat([data, genres_df], axis=1)
    
    user_rating_count = data.groupby('userId')['rating'].count().to_dict()
    data['user_rating_count'] = data['userId'].map(user_rating_count)
    
    user_mean_rating = data.groupby('userId')['rating'].mean().to_dict()
    data['user_mean_rating'] = data['userId'].map(user_mean_rating)
    
    movie_mean_rating = data.groupby('movieId')['rating'].mean().to_dict()
    data['movie_mean_rating'] = data['movieId'].map(movie_mean_rating)
    
    data['year_diff'] = data['rating_year'] - data['movie_year']
    
    return data, mlb.classes_

# 交叉验证 / Cross-validation
def cross_validate(data, genre_cols, k_fold=5):
    """
    中文：执行 k 折交叉验证并计算 MSE（70% 训练，30% 测试）
    English: Perform k-fold cross-validation and calculate MSE (70% training, 30% testing)
    """
    data = data.dropna(subset=['userId'])
    kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
    mse_list = []
    
    for fold, (train_idx, test_idx) in enumerate(kf.split(data)):
        print(f"正在处理第 {fold+1} 折... / Processing fold {fold+1}...")
        train_data = data.iloc[train_idx]
        test_data = data.iloc[test_idx]
        
        # 特征列 / Feature columns
        feature_cols = ['user_rating_count', 'user_mean_rating', 'movie_mean_rating', 'rating_year', 'movie_year', 'year_diff'] + list(genre_cols)
        
        # 训练集和测试集 / Training and test sets
        X_train = train_data[feature_cols]
        y_train = train_data['rating']
        X_test = test_data[feature_cols]
        y_test = test_data['rating']
        
        # 分割训练集为训练和验证集 / Split training set into train and validation
        X_train_split = X_train[:-int(0.2 * len(X_train))]
        X_val_split = X_train[-int(0.2 * len(X_train)):]
        y_train_split = y_train[:-int(0.2 * len(y_train))]
        y_val_split = y_train[-int(0.2 * len(y_train)):]
        
        # 创建DMatrix / Create DMatrix for XGBoost
        dtrain = xgb.DMatrix(X_train_split, label=y_train_split)
        dval = xgb.DMatrix(X_val_split, label=y_val_split)
        dtest = xgb.DMatrix(X_test)
        
        # 设置参数 / Define parameters
        params = {
            'objective': 'reg:squarederror',
            'eta': 0.05,
            'max_depth': 6,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'nthread': -1,
            'seed': 42
        }
        
        # 训练模型，带早停 / Train with early stopping
        evals = [(dval, 'eval')]
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=500,
            evals=evals,
            early_stopping_rounds=10,
            verbose_eval=False
        )
        
        # 预测测试集 / Predict on test set
        predictions = model.predict(dtest)
        
        # 计算 MSE / Calculate MSE
        mse = mean_squared_error(y_test, predictions)
        mse_list.append(mse)
        print(f"第 {fold+1} 折 MSE: {mse:.4f} / Fold {fold+1} MSE: {mse:.4f}")
    
    avg_mse = np.mean(mse_list)
    print(f"平均 MSE: {avg_mse:.4f} / Average MSE: {avg_mse:.4f}")
    return avg_mse

# 主程序 / Main function
def main():
    data_processed, genre_cols = preprocess_data(data)
    print("开始交叉验证（70% 训练，30% 测试）... / Starting cross-validation (70% training, 30% testing)...")
    avg_mse = cross_validate(data_processed, genre_cols, k_fold=5)
    print(f"最终平均 MSE: {avg_mse:.4f} / Final Average MSE: {avg_mse:.4f}")

if __name__ == "__main__":
    main()


## XGBoost 3

增加特征：电影评分次数、用户评分方差。超参数调优：随机搜索

Feature Engineering: the amount of the movie rating; user rating variance.
Optimal parameters: random search

In [17]:
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# 数据加载 / Data loading
data = pd.read_csv('merged_1.csv')  # 替换为您的实际数据文件名

# 数据预处理 / Data preprocessing
def preprocess_data(data):
    """
    中文：将 timestamp 转换为年份，处理 genres 特征，提取电影年份，计算用户评分次数、用户平均评分、用户评分方差、电影评分次数、电影年份及评分年份与电影年份的差。
    English: Convert timestamp to year, process genres feature, extract movie year, calculate user rating count, user mean rating, user rating variance, movie rating count, movie year, and the difference between rating year and movie year.
    """
    # 移除 userId 为空的行
    data = data.dropna(subset=['userId'])

    # 将 timestamp 转换为评分年份
    data['rating_year'] = pd.to_datetime(data['timestamp'], unit='s').dt.year
    
    # 从 title 中提取电影年份（如 "Under Siege (1992)"）
    data['movie_year'] = data['title'].apply(lambda x: int(re.search(r'\((\d{4})\)', x).group(1)) if re.search(r'\((\d{4})\)', x) else np.nan)
    
    # 处理 genres，转换为二进制向量
    mlb = MultiLabelBinarizer()
    genres_matrix = mlb.fit_transform(data['genres'].str.split('|'))
    genres_df = pd.DataFrame(genres_matrix, columns=mlb.classes_)
    data = pd.concat([data, genres_df], axis=1)
    
    # 用户评分次数
    user_rating_count = data.groupby('userId')['rating'].count().to_dict()
    data['user_rating_count'] = data['userId'].map(user_rating_count)
    
    # 用户平均评分
    user_mean_rating = data.groupby('userId')['rating'].mean().to_dict()
    data['user_mean_rating'] = data['userId'].map(user_mean_rating)
    
    # 用户评分方差
    user_rating_var = data.groupby('userId')['rating'].var().to_dict()
    data['user_rating_var'] = data['userId'].map(user_rating_var)
    data['user_rating_var'] = data['user_rating_var'].fillna(0)  # 评分次数为1时方差为NaN，填充为0
    
    # 电影评分次数
    movie_rating_count = data.groupby('movieId')['rating'].count().to_dict()
    data['movie_rating_count'] = data['movieId'].map(movie_rating_count)
    
    # 电影平均评分
    movie_mean_rating = data.groupby('movieId')['rating'].mean().to_dict()
    data['movie_mean_rating'] = data['movieId'].map(movie_mean_rating)
    
    # 评分年份与电影年份的差
    data['year_diff'] = data['rating_year'] - data['movie_year']
    
    return data, mlb.classes_

# 交叉验证与超参数调优 / Cross-validation with hyperparameter tuning
def cross_validate(data, genre_cols, k_fold=5):
    """
    中文：执行 k 折交叉验证并使用随机搜索进行超参数调优。
    English: Perform k-fold cross-validation with randomized search for hyperparameter tuning.
    """
    data = data.dropna(subset=['userId'])
    kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
    mse_list = []
    
    for fold, (train_idx, test_idx) in enumerate(kf.split(data)):
        print(f"正在处理第 {fold+1} 折... / Processing fold {fold+1}...")
        train_data = data.iloc[train_idx]
        test_data = data.iloc[test_idx]
        
        # 特征列：使用所有特征
        feature_cols = ['user_rating_count', 'user_mean_rating', 'user_rating_var', 
                        'movie_rating_count', 'movie_mean_rating', 'rating_year', 
                        'movie_year', 'year_diff'] + list(genre_cols)
        
        X_train = train_data[feature_cols]
        y_train = train_data['rating']
        X_test = test_data[feature_cols]
        y_test = test_data['rating']
        
        # 定义超参数搜索空间
        param_dist = {
            'eta': [0.01, 0.03, 0.05, 0.1],           # 学习率 / Learning rate
            'max_depth': [4, 6, 8, 10],               # 树的最大深度 / Max depth of trees
            'subsample': [0.7, 0.8, 0.9],             # 样本采样率 / Subsample ratio
            'colsample_bytree': [0.7, 0.8, 0.9]       # 特征采样率 / Feature sampling ratio
        }
        
        # XGBoost模型
        xgb_model = xgb.XGBRegressor(
            objective='reg:squarederror',
            n_estimators=500,
            n_jobs=-1,  # 并行计算 / Parallel computation
            random_state=42
        )
        
        # 随机搜索
        random_search = RandomizedSearchCV(
            xgb_model,
            param_distributions=param_dist,
            n_iter=20,  # 搜索20次组合 / 20 iterations
            scoring='neg_mean_squared_error',
            cv=3,       # 内部3折交叉验证 / 3-fold CV
            random_state=42
        )
        random_search.fit(X_train, y_train)
        
        # 获取最佳模型
        best_model = random_search.best_estimator_
        
        # 预测测试集
        predictions = best_model.predict(X_test)
        
        # 计算 MSE
        mse = mean_squared_error(y_test, predictions)
        mse_list.append(mse)
        print(f"第 {fold+1} 折 MSE: {mse:.4f} / Fold {fold+1} MSE: {mse:.4f}")
    
    avg_mse = np.mean(mse_list)
    print(f"平均 MSE: {avg_mse:.4f} / Average MSE: {avg_mse:.4f}")
    return avg_mse

# 主程序 / Main function
def main():
    # 数据预处理
    data_processed, genre_cols = preprocess_data(data)
    
    # 执行交叉验证
    print("开始交叉验证（70% 训练，30% 测试）... / Starting cross-validation (70% training, 30% testing)...")
    avg_mse = cross_validate(data_processed, genre_cols, k_fold=5)
    print(f"最终平均 MSE: {avg_mse:.4f} / Final Average MSE: {avg_mse:.4f}")

if __name__ == "__main__":
    main()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['rating_year'] = pd.to_datetime(data['timestamp'], unit='s').dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['movie_year'] = data['title'].apply(lambda x: int(re.search(r'\((\d{4})\)', x).group(1)) if re.search(r'\((\d{4})\)', x) else np.nan)


开始交叉验证（70% 训练，30% 测试）... / Starting cross-validation (70% training, 30% testing)...
正在处理第 1 折... / Processing fold 1...
第 1 折 MSE: 0.6323 / Fold 1 MSE: 0.6323
正在处理第 2 折... / Processing fold 2...
第 2 折 MSE: 0.6195 / Fold 2 MSE: 0.6195
正在处理第 3 折... / Processing fold 3...
第 3 折 MSE: 0.6179 / Fold 3 MSE: 0.6179
正在处理第 4 折... / Processing fold 4...
第 4 折 MSE: 0.6146 / Fold 4 MSE: 0.6146
正在处理第 5 折... / Processing fold 5...
第 5 折 MSE: 0.6178 / Fold 5 MSE: 0.6178
平均 MSE: 0.6204 / Average MSE: 0.6204
最终平均 MSE: 0.6204 / Final Average MSE: 0.6204


## XGBoost 3 Forecasting 最终选用

In [3]:
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# 数据加载 / Data loading
train_data = pd.read_csv('merged_1.csv')  # 训练集文件 / Training set file
test_data = pd.read_csv('ratings_test.csv')  # 测试集文件 / Test set file

# 数据预处理 / Data preprocessing
def preprocess_data(data):
    """
    中文：将 timestamp 转换为年份，处理 genres 特征，提取电影年份，计算用户评分次数、用户平均评分、用户评分方差、电影评分次数、电影年份及评分年份与电影年份的差。
    English: Convert timestamp to year, process genres feature, extract movie year, calculate user rating count, user mean rating, user rating variance, movie rating count, movie year, and the difference between rating year and movie year.
    """
    # 移除 userId 或 movieId 为空的行
    data = data.dropna(subset=['userId', 'movieId'])

    # 将 timestamp 转换为评分年份
    data['rating_year'] = pd.to_datetime(data['timestamp'], unit='s').dt.year
    
    # 从 title 中提取电影年份（如 "Under Siege (1992)"）
    data['movie_year'] = data['title'].apply(
        lambda x: int(re.search(r'\((\d{4})\)', x).group(1)) if re.search(r'\((\d{4})\)', x) else np.nan
    )
    
    # 处理 genres，转换为二进制向量
    mlb = MultiLabelBinarizer()
    genres_matrix = mlb.fit_transform(data['genres'].str.split('|'))
    genres_df = pd.DataFrame(genres_matrix, columns=mlb.classes_)
    data = pd.concat([data, genres_df], axis=1)
    
    # 用户评分次数
    user_rating_count = data.groupby('userId')['rating'].count().to_dict()
    data['user_rating_count'] = data['userId'].map(user_rating_count)
    
    # 用户平均评分
    user_mean_rating = data.groupby('userId')['rating'].mean().to_dict()
    data['user_mean_rating'] = data['userId'].map(user_mean_rating)
    
    # 用户评分方差
    user_rating_var = data.groupby('userId')['rating'].var().to_dict()
    data['user_rating_var'] = data['userId'].map(user_rating_var)
    data['user_rating_var'] = data['user_rating_var'].fillna(0)  # 评分次数为1时方差为NaN，填充为0
    
    # 电影评分次数
    movie_rating_count = data.groupby('movieId')['rating'].count().to_dict()
    data['movie_rating_count'] = data['movieId'].map(movie_rating_count)
    
    # 电影平均评分
    movie_mean_rating = data.groupby('movieId')['rating'].mean().to_dict()
    data['movie_mean_rating'] = data['movieId'].map(movie_mean_rating)
    
    # 评分年份与电影年份的差
    data['year_diff'] = data['rating_year'] - data['movie_year']
    
    return data, mlb.classes_

# 主程序 / Main function
def main():
    # 预处理训练集 / Preprocess training set
    print("正在预处理训练数据... / Preprocessing training data...")
    train_data_processed, genre_cols = preprocess_data(train_data)
    
    # 计算用户和电影特征映射 / Compute user and movie feature mappings
    user_features = train_data_processed.groupby('userId')[['user_rating_count', 'user_mean_rating', 'user_rating_var']].first().to_dict('index')
    movie_features = train_data_processed.groupby('movieId')[['movie_rating_count', 'movie_mean_rating', 'movie_year'] + list(genre_cols)].first().to_dict('index')
    
    # 计算用于缺失值填充的统计值 / Compute statistics for imputation
    impute_values = {
        'user_rating_count': train_data_processed['user_rating_count'].median(),
        'user_mean_rating': train_data_processed['user_mean_rating'].mean(),
        'user_rating_var': train_data_processed['user_rating_var'].mean(),
        'movie_rating_count': train_data_processed['movie_rating_count'].median(),
        'movie_mean_rating': train_data_processed['movie_mean_rating'].mean(),
        'movie_year': train_data_processed['movie_year'].median()
    }
    
    # 定义特征列 / Define feature columns
    feature_cols = ['user_rating_count', 'user_mean_rating', 'user_rating_var', 
                    'movie_rating_count', 'movie_mean_rating', 'rating_year', 
                    'movie_year', 'year_diff'] + list(genre_cols)
    
    # 准备训练数据 / Prepare training data
    X_train = train_data_processed[feature_cols]
    y_train = train_data_processed['rating']
    
    # 超参数调优 / Hyperparameter tuning
    print("开始超参数调优... / Starting hyperparameter tuning...")
    param_dist = {
        'eta': [0.01, 0.03, 0.05, 0.1],           # 学习率 / Learning rate
        'max_depth': [4, 6, 8, 10],               # 树的最大深度 / Max depth of trees
        'subsample': [0.7, 0.8, 0.9],             # 样本采样率 / Subsample ratio
        'colsample_bytree': [0.7, 0.8, 0.9]       # 特征采样率 / Feature sampling ratio
    }
    xgb_model = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=500,
        n_jobs=-1,  # 并行计算 / Parallel computation
        random_state=42
    )
    random_search = RandomizedSearchCV(
        xgb_model,
        param_distributions=param_dist,
        n_iter=50,  # 搜索20次组合 / 20 iterations
        scoring='neg_mean_squared_error',
        cv=5,       # 5折交叉验证 / 5-fold CV
        random_state=42
    )
    random_search.fit(X_train, y_train)
    best_model = random_search.best_estimator_
    print("超参数调优完成，最佳参数: / Hyperparameter tuning completed, best parameters:", random_search.best_params_)
    
    # 加载并预处理测试集 / Load and preprocess test set
    print("正在预处理测试数据... / Preprocessing test data...")
    test_data_processed = test_data.copy()
    test_data_processed['rating_year'] = pd.to_datetime(test_data_processed['timestamp'], unit='s').dt.year
    
    # 映射用户特征 / Map user features
    for feature in ['user_rating_count', 'user_mean_rating', 'user_rating_var']:
        test_data_processed[feature] = test_data_processed['userId'].map(
            lambda x: user_features.get(x, {}).get(feature, impute_values[feature])
        )
    
    # 映射电影特征 / Map movie features
    for feature in ['movie_rating_count', 'movie_mean_rating', 'movie_year']:
        test_data_processed[feature] = test_data_processed['movieId'].map(
            lambda x: movie_features.get(x, {}).get(feature, impute_values[feature])
        )
    
    # 映射 genres / Map genres
    for genre in genre_cols:
        test_data_processed[genre] = test_data_processed['movieId'].map(
            lambda x: movie_features.get(x, {}).get(genre, 0)
        )
    
    # 计算 year_diff / Compute year_diff
    test_data_processed['year_diff'] = test_data_processed['rating_year'] - test_data_processed['movie_year']
    
    # 准备测试特征 / Prepare test features
    X_test = test_data_processed[feature_cols]
    
    # 预测测试集评分 / Predict ratings for test set
    print("开始预测测试集... / Starting prediction on test set...")
    predictions = best_model.predict(X_test)
    
    # 将预测结果添加到测试集 / Add predictions to test set
    test_data_processed['predicted_rating'] = predictions
    
    # 输出预测结果示例 / Output prediction examples
    print("预测结果示例： / Prediction examples:")
    print(test_data_processed[['userId', 'movieId', 'timestamp', 'predicted_rating']].head())
    
    # 保存预测结果到文件 / Save predictions to file
    test_data_processed.to_csv('predicted_ratings_xgb.csv', index=False)
    print("预测结果已保存到 'predicted_ratings_xgb.csv' / Predictions saved to 'predicted_ratings_xgb.csv'")

if __name__ == "__main__":
    main()


正在预处理训练数据... / Preprocessing training data...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['rating_year'] = pd.to_datetime(data['timestamp'], unit='s').dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['movie_year'] = data['title'].apply(


开始超参数调优... / Starting hyperparameter tuning...
超参数调优完成，最佳参数: / Hyperparameter tuning completed, best parameters: {'subsample': 0.7, 'max_depth': 8, 'eta': 0.03, 'colsample_bytree': 0.7}
正在预处理测试数据... / Preprocessing test data...
开始预测测试集... / Starting prediction on test set...
预测结果示例： / Prediction examples:
   userId  movieId   timestamp  predicted_rating
0      73    49526  1255586478          3.283628
1     187    47518  1237162935          3.561228
2     150      788  1114306821          2.735262
3     216     8830  1095792449          2.853003
4     242     1227   956685476          4.811522
预测结果已保存到 'predicted_ratings_xgb.csv' / Predictions saved to 'predicted_ratings_xgb.csv'


## XGBoost 4

特征工程：
1. 基于评分次数，使用归一化的用户和电影热门度分数
2. 创建电影类型与年份或评分的交互项
3. 计算用户近期评分与历史评分的差异，或电影近期评分趋势

In [16]:
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# 数据加载
data = pd.read_csv('merged_1.csv')

# 数据预处理
def preprocess_data(data):
    data = data.dropna(subset=['userId'])
    data['rating_year'] = pd.to_datetime(data['timestamp'], unit='s').dt.year
    data['movie_year'] = data['title'].apply(lambda x: int(re.search(r'\((\d{4})\)', x).group(1)) if re.search(r'\((\d{4})\)', x) else np.nan)
    
    mlb = MultiLabelBinarizer()
    genres_matrix = mlb.fit_transform(data['genres'].str.split('|'))
    genres_df = pd.DataFrame(genres_matrix, columns=mlb.classes_)
    data = pd.concat([data, genres_df], axis=1)
    
    user_rating_count = data.groupby('userId')['rating'].count().to_dict()
    data['user_rating_count'] = data['userId'].map(user_rating_count)
    data['user_popularity'] = data['user_rating_count'] / data['user_rating_count'].max()
    
    user_mean_rating = data.groupby('userId')['rating'].mean().to_dict()
    data['user_mean_rating'] = data['userId'].map(user_mean_rating)
    
    user_rating_var = data.groupby('userId')['rating'].var().to_dict()
    data['user_rating_var'] = data['userId'].map(user_rating_var).fillna(0)
    
    movie_rating_count = data.groupby('movieId')['rating'].count().to_dict()
    data['movie_rating_count'] = data['movieId'].map(movie_rating_count)
    data['movie_popularity'] = data['movie_rating_count'] / data['movie_rating_count'].max()
    
    movie_mean_rating = data.groupby('movieId')['rating'].mean().to_dict()
    data['movie_mean_rating'] = data['movieId'].map(movie_mean_rating)
    
    data['year_diff'] = data['rating_year'] - data['movie_year']
    
    user_trend = data.groupby('userId').apply(lambda x: x['rating'].iloc[-1] - x['rating'].mean()).to_dict()
    data['user_rating_trend'] = data['userId'].map(user_trend)

    for genre in mlb.classes_:
        data[f'{genre}_year'] = data[genre] * data['movie_year']
    
    scaler = StandardScaler()
    numeric_cols = ['user_rating_count', 'user_rating_var', 'movie_rating_count', 'year_diff']
    data[numeric_cols] = scaler.fit_transform(data[numeric_cols])
    
    return data, mlb.classes_

# 交叉验证与超参数调优
def cross_validate(data, genre_cols, k_fold=5):
    kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
    mse_list = []
    
    for fold, (train_idx, test_idx) in enumerate(kf.split(data)):
        print(f"正在处理第 {fold+1} 折...")
        train_data = data.iloc[train_idx]
        test_data = data.iloc[test_idx]
        
        feature_cols = ['user_rating_count', 'user_mean_rating', 'user_rating_var', 
                        'movie_rating_count', 'movie_mean_rating', 'rating_year', 
                        'movie_year', 'year_diff', 'user_rating_trend', 'user_popularity', 'movie_popularity'] + \
                        [f'{g}_year' for g in genre_cols] + list(genre_cols)
        
        X_train = train_data[feature_cols]
        y_train = train_data['rating']
        X_test = test_data[feature_cols]
        y_test = test_data['rating']
        
        param_dist = {
            'eta': [0.005, 0.01, 0.03, 0.05, 0.1],
            'max_depth': [3, 4, 6, 8, 10, 12],
            'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
            'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
            'gamma': [0, 0.1, 0.5, 1]
        }
              
        xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=500, n_jobs=-1, random_state=42)
        random_search = RandomizedSearchCV(xgb_model, param_dist, n_iter=50, scoring='neg_mean_squared_error', cv=3, random_state=42)
        random_search.fit(X_train, y_train)
        
        best_model = random_search.best_estimator_
        predictions = best_model.predict(X_test)
        mse = mean_squared_error(y_test, predictions)
        mse_list.append(mse)
        print(f"第 {fold+1} 折 MSE: {mse:.4f}")
    
    avg_mse = np.mean(mse_list)
    print(f"平均 MSE: {avg_mse:.4f}")
    return avg_mse

# 主程序
def main():
    data_processed, genre_cols = preprocess_data(data)
    print("开始交叉验证（70% 训练，30% 测试）...")
    avg_mse = cross_validate(data_processed, genre_cols, k_fold=5)
    print(f"最终平均 MSE: {avg_mse:.4f}")

if __name__ == "__main__":
    main()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['rating_year'] = pd.to_datetime(data['timestamp'], unit='s').dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['movie_year'] = data['title'].apply(lambda x: int(re.search(r'\((\d{4})\)', x).group(1)) if re.search(r'\((\d{4})\)', x) else np.nan)


开始交叉验证（70% 训练，30% 测试）...
正在处理第 1 折...
第 1 折 MSE: 0.6347
正在处理第 2 折...
第 2 折 MSE: 0.6218
正在处理第 3 折...
第 3 折 MSE: 0.6195
正在处理第 4 折...
第 4 折 MSE: 0.6169
正在处理第 5 折...
第 5 折 MSE: 0.6205
平均 MSE: 0.6227
最终平均 MSE: 0.6227


# Week 4

## XGBoost regularization

In [None]:
import numpy as np 
import pandas as pd
import re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# 数据加载 / Data loading
data = pd.read_csv('merged_1.csv')  # 替换为您的实际训练集文件名

# 数据预处理 / Data preprocessing
def preprocess_data(data):
    """
    中文：将 timestamp 转换为年份，处理 genres 特征，提取电影年份，计算用户评分次数、用户平均评分、用户评分方差、电影评分次数、电影年份及评分年份与电影年份的差。
    English: Convert timestamp to year, process genres feature, extract movie year, calculate user rating count, user mean rating, user rating variance, movie rating count, movie year, and the difference between rating year and movie year.
    """
    # 移除 userId 或 movieId 为空的行
    data = data.dropna(subset=['userId', 'movieId'])

    # 将 timestamp 转换为评分年份
    data['rating_year'] = pd.to_datetime(data['timestamp'], unit='s').dt.year
    
    # 从 title 中提取电影年份（如 "Under Siege (1992)"）
    data['movie_year'] = data['title'].apply(
        lambda x: int(re.search(r'\((\d{4})\)', x).group(1)) if re.search(r'\((\d{4})\)', x) else np.nan
    )
    
    # 处理 genres，转换为二进制向量
    mlb = MultiLabelBinarizer()
    genres_matrix = mlb.fit_transform(data['genres'].str.split('|'))
    genres_df = pd.DataFrame(genres_matrix, columns=mlb.classes_)
    data = pd.concat([data, genres_df], axis=1)
    
    # 用户评分次数
    user_rating_count = data.groupby('userId')['rating'].count().to_dict()
    data['user_rating_count'] = data['userId'].map(user_rating_count)
    
    # 用户平均评分
    user_mean_rating = data.groupby('userId')['rating'].mean().to_dict()
    data['user_mean_rating'] = data['userId'].map(user_mean_rating)
    
    # 用户评分方差
    user_rating_var = data.groupby('userId')['rating'].var().to_dict()
    data['user_rating_var'] = data['userId'].map(user_rating_var)
    data['user_rating_var'] = data['user_rating_var'].fillna(0)  # 填充 NaN
    
    # 电影评分次数
    movie_rating_count = data.groupby('movieId')['rating'].count().to_dict()
    data['movie_rating_count'] = data['movieId'].map(movie_rating_count)
    
    # 电影平均评分
    movie_mean_rating = data.groupby('movieId')['rating'].mean().to_dict()
    data['movie_mean_rating'] = data['movieId'].map(movie_mean_rating)
    
    # 评分年份与电影年份的差
    data['year_diff'] = data['rating_year'] - data['movie_year']
    
    return data, mlb.classes_

# 交叉验证与超参数调优 / Cross-validation with hyperparameter tuning
def cross_validate(data, genre_cols, k_fold=5):
    data = data.dropna(subset=['userId', 'movieId'])  # 再次确保无缺失值
    kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
    mse_list = []
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(data)):
        print(f"正在处理第 {fold+1} 折... / Processing fold {fold+1}...")
        train_data = data.iloc[train_idx]
        val_data = data.iloc[val_idx]
        
        feature_cols = ['user_rating_count', 'user_mean_rating', 'user_rating_var', 
                        'movie_rating_count', 'movie_mean_rating', 'rating_year', 
                        'movie_year', 'year_diff'] + list(genre_cols)
        
        X_train = train_data[feature_cols]
        y_train = train_data['rating']
        X_val = val_data[feature_cols]
        y_val = val_data['rating']
        
        param_dist = {
            'eta': [0.01, 0.03, 0.05, 0.1],
            'max_depth': [4, 6, 8, 10],
            'subsample': [0.7, 0.8, 0.9],
            'colsample_bytree': [0.7, 0.8, 0.9],
            'reg_alpha': [0, 0.1, 1.0],
            'reg_lambda': [0.1, 1.0, 10.0]
        }
        
        xgb_model = xgb.XGBRegressor(
            objective='reg:squarederror',
            n_estimators=500,
            n_jobs=-1,
            random_state=42
        )
        
        random_search = RandomizedSearchCV(
            xgb_model,
            param_distributions=param_dist,
            n_iter=20,
            scoring='neg_mean_squared_error',
            cv=3,
            random_state=42
        )
        # 修改：移除 early_stopping_rounds 和 eval_set
        random_search.fit(X_train, y_train)
        
        best_model = random_search.best_estimator_
        predictions = best_model.predict(X_val)
        mse = mean_squared_error(y_val, predictions)
        mse_list.append(mse)
        print(f"第 {fold+1} 折 MSE: {mse:.4f} / Fold {fold+1} MSE: {mse:.4f}")
    
    avg_mse = np.mean(mse_list)
    print(f"平均 MSE: {avg_mse:.4f} / Average MSE: {avg_mse:.4f}")
    return avg_mse

# 主程序 / Main function
def main():
    print("正在预处理训练数据... / Preprocessing training data...")
    data_processed, genre_cols = preprocess_data(data)
    
    print("开始交叉验证（5折交叉验证）... / Starting cross-validation (5-fold cross-validation)...")
    avg_mse = cross_validate(data_processed, genre_cols, k_fold=5)
    print(f"最终平均 MSE: {avg_mse:.4f} / Final Average MSE: {avg_mse:.4f}")

if __name__ == "__main__":
    main()

正在预处理训练数据... / Preprocessing training data...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['rating_year'] = pd.to_datetime(data['timestamp'], unit='s').dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['movie_year'] = data['title'].apply(


开始交叉验证（5折交叉验证）... / Starting cross-validation (5-fold cross-validation)...
正在处理第 1 折... / Processing fold 1...
第 1 折 MSE: 0.6316 / Fold 1 MSE: 0.6316
正在处理第 2 折... / Processing fold 2...


Forecasting

In [5]:
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# 数据加载 / Data loading
train_data = pd.read_csv('merged_1.csv')  # 训练集文件 / Training set file
test_data = pd.read_csv('ratings_test.csv')  # 测试集文件 / Test set file

# 数据预处理 / Data preprocessing
def preprocess_data(data):
    """
    中文：将 timestamp 转换为年份，处理 genres 特征，提取电影年份，计算用户评分次数、用户平均评分、用户评分方差、电影评分次数、电影年份及评分年份与电影年份的差。
    English: Convert timestamp to year, process genres feature, extract movie year, calculate user rating count, user mean rating, user rating variance, movie rating count, movie year, and the difference between rating year and movie year.
    """
    # 移除 userId 或 movieId 为空的行
    data = data.dropna(subset=['userId', 'movieId'])

    # 将 timestamp 转换为评分年份
    data['rating_year'] = pd.to_datetime(data['timestamp'], unit='s').dt.year
    
    # 从 title 中提取电影年份（如 "Under Siege (1992)"）
    data['movie_year'] = data['title'].apply(
        lambda x: int(re.search(r'\((\d{4})\)', x).group(1)) if re.search(r'\((\d{4})\)', x) else np.nan
    )
    
    # 处理 genres，转换为二进制向量
    mlb = MultiLabelBinarizer()
    genres_matrix = mlb.fit_transform(data['genres'].str.split('|'))
    genres_df = pd.DataFrame(genres_matrix, columns=mlb.classes_)
    data = pd.concat([data, genres_df], axis=1)
    
    # 用户评分次数
    user_rating_count = data.groupby('userId')['rating'].count().to_dict()
    data['user_rating_count'] = data['userId'].map(user_rating_count)
    
    # 用户平均评分
    user_mean_rating = data.groupby('userId')['rating'].mean().to_dict()
    data['user_mean_rating'] = data['userId'].map(user_mean_rating)
    
    # 用户评分方差
    user_rating_var = data.groupby('userId')['rating'].var().to_dict()
    data['user_rating_var'] = data['userId'].map(user_rating_var)
    data['user_rating_var'] = data['user_rating_var'].fillna(0)  # 评分次数为1时方差为NaN，填充为0
    
    # 电影评分次数
    movie_rating_count = data.groupby('movieId')['rating'].count().to_dict()
    data['movie_rating_count'] = data['movieId'].map(movie_rating_count)
    
    # 电影平均评分
    movie_mean_rating = data.groupby('movieId')['rating'].mean().to_dict()
    data['movie_mean_rating'] = data['movieId'].map(movie_mean_rating)
    
    # 评分年份与电影年份的差
    data['year_diff'] = data['rating_year'] - data['movie_year']
    
    return data, mlb.classes_

# 主程序 / Main function
def main():
    # 预处理训练集 / Preprocess training set
    print("正在预处理训练数据... / Preprocessing training data...")
    train_data_processed, genre_cols = preprocess_data(train_data)
    
    # 定义特征列 / Define feature columns
    feature_cols = ['user_rating_count', 'user_mean_rating', 'user_rating_var', 
                    'movie_rating_count', 'movie_mean_rating', 'rating_year', 
                    'movie_year', 'year_diff'] + list(genre_cols)
    
    # 准备训练数据 / Prepare training data
    X_train = train_data_processed[feature_cols]
    y_train = train_data_processed['rating']
    
    # 超参数调优 / Hyperparameter tuning
    print("开始超参数调优... / Starting hyperparameter tuning...")
    param_dist = {
        'eta': [0.01, 0.03, 0.05, 0.1],
        'max_depth': [4, 6, 8, 10],
        'subsample': [0.7, 0.8, 0.9],
        'colsample_bytree': [0.7, 0.8, 0.9],
        'reg_alpha': [0, 0.1, 1.0],
        'reg_lambda': [0.1, 1.0, 10.0]
    }
    xgb_model = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=500,
        n_jobs=-1,
        random_state=42
    )
    random_search = RandomizedSearchCV(
        xgb_model,
        param_distributions=param_dist,
        n_iter=20,
        scoring='neg_mean_squared_error',
        cv=5,
        random_state=42
    )
    random_search.fit(X_train, y_train)
    best_model = random_search.best_estimator_
    print("超参数调优完成，最佳参数: / Hyperparameter tuning completed, best parameters:", random_search.best_params_)
    
    # 加载并预处理测试集 / Load and preprocess test set
    print("正在预处理测试数据... / Preprocessing test data...")
    test_data_processed = test_data.copy()
    test_data_processed['rating_year'] = pd.to_datetime(test_data_processed['timestamp'], unit='s').dt.year
    
    # 映射用户特征 / Map user features
    user_features = train_data_processed.groupby('userId')[['user_rating_count', 'user_mean_rating', 'user_rating_var']].first().to_dict('index')
    for feature in ['user_rating_count', 'user_mean_rating', 'user_rating_var']:
        test_data_processed[feature] = test_data_processed['userId'].map(
            lambda x: user_features.get(x, {}).get(feature, train_data_processed[feature].median() if feature == 'user_rating_count' else train_data_processed[feature].mean())
        )
    
    # 映射电影特征 / Map movie features
    movie_features = train_data_processed.groupby('movieId')[['movie_rating_count', 'movie_mean_rating', 'movie_year'] + list(genre_cols)].first().to_dict('index')
    for feature in ['movie_rating_count', 'movie_mean_rating', 'movie_year']:
        test_data_processed[feature] = test_data_processed['movieId'].map(
            lambda x: movie_features.get(x, {}).get(feature, train_data_processed[feature].median() if feature == 'movie_rating_count' else train_data_processed[feature].mean())
        )
    for genre in genre_cols:
        test_data_processed[genre] = test_data_processed['movieId'].map(
            lambda x: movie_features.get(x, {}).get(genre, 0)
        )
    
    # 计算 year_diff / Compute year_diff
    test_data_processed['year_diff'] = test_data_processed['rating_year'] - test_data_processed['movie_year']
    
    # 准备测试特征 / Prepare test features
    X_test = test_data_processed[feature_cols]
    
    # 预测测试集评分 / Predict ratings for test set
    print("开始预测测试集... / Starting prediction on test set...")
    predictions = best_model.predict(X_test)
    
    # 将预测结果添加到测试集 / Add predictions to test set
    test_data_processed['predicted_rating'] = predictions
    
    # 输出预测结果示例 / Output prediction examples
    print("预测结果示例： / Prediction examples:")
    print(test_data_processed[['userId', 'movieId', 'timestamp', 'predicted_rating']].head())
    
    # 保存预测结果到文件 / Save predictions to file
    test_data_processed.to_csv('predicted_ratings_xgb.csv', index=False)
    print("预测结果已保存到 'predicted_ratings_xgb.csv' / Predictions saved to 'predicted_ratings_xgb.csv'")

if __name__ == "__main__":
    main()


正在预处理训练数据... / Preprocessing training data...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['rating_year'] = pd.to_datetime(data['timestamp'], unit='s').dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['movie_year'] = data['title'].apply(


开始超参数调优... / Starting hyperparameter tuning...
超参数调优完成，最佳参数: / Hyperparameter tuning completed, best parameters: {'subsample': 0.8, 'reg_lambda': 1.0, 'reg_alpha': 0, 'max_depth': 8, 'eta': 0.03, 'colsample_bytree': 0.8}
正在预处理测试数据... / Preprocessing test data...
开始预测测试集... / Starting prediction on test set...
预测结果示例： / Prediction examples:
   userId  movieId   timestamp  predicted_rating
0      73    49526  1255586478          3.282050
1     187    47518  1237162935          3.495789
2     150      788  1114306821          2.745424
3     216     8830  1095792449          2.811216
4     242     1227   956685476          4.825458
预测结果已保存到 'predicted_ratings_xgb.csv' / Predictions saved to 'predicted_ratings_xgb.csv'


## XGBoost regularization + PCA

In [2]:
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# 数据加载
data = pd.read_csv('merged_1.csv')  # 替换为您的实际训练集文件名

# 数据预处理
def preprocess_data(data):
    """将原始数据转换为包含统计特征和genres二进制向量的格式，并处理NaN值"""
    # 移除空值行
    data = data.dropna(subset=['userId', 'movieId'])

    # 时间特征
    data['rating_year'] = pd.to_datetime(data['timestamp'], unit='s').dt.year
    data['movie_year'] = data['title'].apply(
        lambda x: int(re.search(r'\((\d{4})\)', x).group(1)) if re.search(r'\((\d{4})\)', x) else np.nan
    )
    # 填充 movie_year 的 NaN 为中位年份
    median_movie_year = data['movie_year'].median()
    data['movie_year'] = data['movie_year'].fillna(median_movie_year)
    data['year_diff'] = data['rating_year'] - data['movie_year']

    # 处理 genres
    mlb = MultiLabelBinarizer()
    genres_matrix = mlb.fit_transform(data['genres'].str.split('|'))
    genres_df = pd.DataFrame(genres_matrix, columns=mlb.classes_)
    data = pd.concat([data, genres_df], axis=1)

    # 用户统计特征
    data['user_rating_count'] = data['userId'].map(data.groupby('userId')['rating'].count().to_dict())
    data['user_mean_rating'] = data['userId'].map(data.groupby('userId')['rating'].mean().to_dict())
    data['user_rating_var'] = data['userId'].map(data.groupby('userId')['rating'].var().to_dict())
    data['user_rating_var'] = data['user_rating_var'].fillna(0)

    # 电影统计特征
    data['movie_rating_count'] = data['movieId'].map(data.groupby('movieId')['rating'].count().to_dict())
    data['movie_mean_rating'] = data['movieId'].map(data.groupby('movieId')['rating'].mean().to_dict())

    # 确保所有特征无 NaN
    feature_cols = ['user_rating_count', 'user_mean_rating', 'user_rating_var',
                    'movie_rating_count', 'movie_mean_rating', 'rating_year',
                    'movie_year', 'year_diff'] + list(mlb.classes_)
    data[feature_cols] = data[feature_cols].fillna(0)  # 填充任何剩余的 NaN

    return data, mlb.classes_

# 交叉验证与超参数调优
def cross_validate(data, genre_cols, k_fold=5):
    """执行k折交叉验证，加入PCA降维"""
    data = data.dropna(subset=['userId', 'movieId'])
    kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
    mse_list = []

    # 特征列
    feature_cols = ['user_rating_count', 'user_mean_rating', 'user_rating_var',
                    'movie_rating_count', 'movie_mean_rating', 'rating_year',
                    'movie_year', 'year_diff'] + list(genre_cols)

    for fold, (train_idx, val_idx) in enumerate(kf.split(data)):
        print(f"正在处理第 {fold+1} 折...")
        train_data = data.iloc[train_idx]
        val_data = data.iloc[val_idx]

        X_train = train_data[feature_cols]
        y_train = train_data['rating']
        X_val = val_data[feature_cols]
        y_val = val_data['rating']

        # 标准化数据
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)

        # 应用 PCA
        pca = PCA(n_components=0.95)  # 保留95%的方差
        X_train_pca = pca.fit_transform(X_train_scaled)
        X_val_pca = pca.transform(X_val_scaled)
        print(f"第 {fold+1} 折保留的主成分数量: {X_train_pca.shape[1]}")

        # 超参数搜索空间
        param_dist = {
            'eta': [0.01, 0.03, 0.05, 0.1],
            'max_depth': [4, 6, 8, 10],
            'subsample': [0.7, 0.8, 0.9],
            'colsample_bytree': [0.7, 0.8, 0.9],
            'reg_alpha': [0, 0.1, 1.0],
            'reg_lambda': [0.1, 1.0, 10.0]
        }

        # XGBoost 模型
        xgb_model = xgb.XGBRegressor(
            objective='reg:squarederror',
            n_estimators=500,
            n_jobs=-1,
            random_state=42
        )

        # 随机搜索
        random_search = RandomizedSearchCV(
            xgb_model,
            param_distributions=param_dist,
            n_iter=20,
            scoring='neg_mean_squared_error',
            cv=3,
            random_state=42
        )
        random_search.fit(X_train_pca, y_train)

        # 获取最佳模型并预测
        best_model = random_search.best_estimator_
        predictions = best_model.predict(X_val_pca)
        mse = mean_squared_error(y_val, predictions)
        mse_list.append(mse)
        print(f"第 {fold+1} 折 MSE: {mse:.4f}")

    avg_mse = np.mean(mse_list)
    print(f"平均 MSE: {avg_mse:.4f}")
    return avg_mse

# 主程序
def main():
    print("正在预处理训练数据...")
    data_processed, genre_cols = preprocess_data(data)
    
    print("开始交叉验证（5折交叉验证）...")
    avg_mse = cross_validate(data_processed, genre_cols, k_fold=5)
    print(f"最终平均 MSE: {avg_mse:.4f}")

if __name__ == "__main__":
    main()


正在预处理训练数据...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['rating_year'] = pd.to_datetime(data['timestamp'], unit='s').dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['movie_year'] = data['title'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['movie_year'] = data['movie_year'].fillna(median_movie_year)
A value is trying to be

开始交叉验证（5折交叉验证）...
正在处理第 1 折...
第 1 折保留的主成分数量: 24
第 1 折 MSE: 0.6883
正在处理第 2 折...
第 2 折保留的主成分数量: 24
第 2 折 MSE: 0.6691
正在处理第 3 折...
第 3 折保留的主成分数量: 24
第 3 折 MSE: 0.6718
正在处理第 4 折...
第 4 折保留的主成分数量: 24
第 4 折 MSE: 0.6616
正在处理第 5 折...
第 5 折保留的主成分数量: 24
第 5 折 MSE: 0.6736
平均 MSE: 0.6729
最终平均 MSE: 0.6729
