In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [2]:
# === Step 1: load the three input files ===
# All paths are relative to the notebook's working directory.
# NOTE(review): the saved run of this cell failed with FileNotFoundError on the
# ground-truth path, so the kernel was presumably started from a different
# directory -- confirm. The 'AUdio' capitalisation and the fact that the two
# prediction files live in different directories also look unintentional;
# verify against the files on disk.
TRUTH_CSV = 'data/merged/merged_cleaned_sentiment_validation.csv'
AUDIO_PRED_CSV = 'csv/AUdio_predictions_mlp_torch.csv'
LYRICS_PRED_CSV = "Lyrics_predictions_mlp_torch.csv"

# Ground-truth columns, in the order (valence, arousal) used by later cells.
TARGET_COLS = ["y_valence", "y_arousal"]

df_truth = pd.read_csv(TRUTH_CSV)
df_audio = pd.read_csv(AUDIO_PRED_CSV)
df_lyrics = pd.read_csv(LYRICS_PRED_CSV)

# Ground truth and both prediction sets as plain arrays.
# Later cells index column 0 as valence and column 1 as arousal, and name the
# fused output with exactly two columns, so every array must be (n_samples, 2).
y_true = df_truth[TARGET_COLS].to_numpy()
pred_audio = df_audio.to_numpy()
pred_lyrics = df_lyrics.to_numpy()

# Fail fast with a readable message instead of a late broadcasting error (or a
# silently wrong fusion) when a prediction file carries an extra column such as
# a saved index, or has a different number of rows than the ground truth.
_shape_mismatches = {
    source: arr.shape
    for source, arr in (("audio", pred_audio), ("lyrics", pred_lyrics))
    if arr.shape != y_true.shape
}
if _shape_mismatches:
    raise ValueError(
        f"Prediction arrays must match ground-truth shape {y_true.shape}; "
        f"got {_shape_mismatches}"
    )

FileNotFoundError: [Errno 2] No such file or directory: 'data/merged/merged_cleaned_sentiment_validation.csv'

In [None]:


# === Step 2: grid search over the late-fusion weight alpha ===
# alpha is the weight given to the audio predictions; (1 - alpha) goes to lyrics.
alphas = np.linspace(0, 1, 21)  # 0.0 to 1.0 inclusive, in steps of 0.05
results = []

for alpha in alphas:
    fused = alpha * pred_audio + (1 - alpha) * pred_lyrics
    # RMSE is computed as sqrt(MSE) rather than via `squared=False`: that keyword
    # is deprecated in scikit-learn 1.4 and removed in 1.6, whereas this form
    # gives the same value on every scikit-learn version.
    rmse_val = np.sqrt(mean_squared_error(y_true[:, 0], fused[:, 0]))
    rmse_aro = np.sqrt(mean_squared_error(y_true[:, 1], fused[:, 1]))
    avg_rmse = (rmse_val + rmse_aro) / 2
    results.append((alpha, rmse_val, rmse_aro, avg_rmse))

# Collect into a DataFrame for easier inspection and plotting.
df_results = pd.DataFrame(results, columns=["alpha", "rmse_valence", "rmse_arousal", "avg_rmse"])

# Report the alpha with the lowest average RMSE (first one wins on ties).
best_row = df_results.loc[df_results["avg_rmse"].idxmin()]
best_alpha = best_row["alpha"]
print(f"\n✅ 最优 alpha = {best_alpha:.2f}, 平均 RMSE = {best_row['avg_rmse']:.4f}")



In [None]:
# === Step 3: visualise RMSE as a function of alpha ===
# Each entry is (df_results column, line keyword arguments), drawn in this order.
curve_specs = [
    ("rmse_valence", {"label": "Valence RMSE"}),
    ("rmse_arousal", {"label": "Arousal RMSE"}),
    ("avg_rmse", {"label": "Average RMSE", "linestyle": "--", "linewidth": 2}),
]
for rmse_column, line_kwargs in curve_specs:
    plt.plot(df_results["alpha"], df_results[rmse_column], **line_kwargs)

# Vertical marker at the alpha selected by the grid search.
best_alpha_label = f"Best alpha = {best_alpha:.2f}"
plt.axvline(best_alpha, color="red", linestyle=":", label=best_alpha_label)

plt.xlabel("Alpha (Audio Weight)")
plt.ylabel("RMSE")
plt.title("Grid Search over Alpha for Late Fusion")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# === Step 4: save the fused predictions for the best alpha (optional) ===
# Same weighting as in the grid search: audio gets best_alpha, lyrics the rest.
fused_best = best_alpha * pred_audio + (1 - best_alpha) * pred_lyrics

# Column 0 is written out as valence, column 1 as arousal.
fused_columns = ["pred_valence", "pred_arousal"]
df_fused = pd.DataFrame(fused_best, columns=fused_columns)

# Written to the current working directory, without the row index.
df_fused.to_csv("fused_alpha_best.csv", index=False)
print("✅ 已保存最优融合结果到 fused_alpha_best.csv")