# Data Load

In [54]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
# from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import seaborn as sn
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
import nltk
from sklearn.metrics import roc_auc_score, f1_score
import seaborn as sn
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error #add rmse
from data import merged



In [55]:
# Load the data and drop VADER columns
df_train = pd.read_csv('data/merged/merged_cleaned_sentiment_train.csv').drop(['pos','neg','neu', 'compound'], axis = 1)
df_val = pd.read_csv('data/merged/merged_cleaned_sentiment_validation.csv').drop(['pos','neg','neu', 'compound'], axis = 1)
df_test = pd.read_csv('data/merged/merged_cleaned_sentiment_test.csv').drop(['pos','neg','neu', 'compound'], axis = 1)

In [56]:
#Save audio features
# 5 audio features + 2 real target
df_train = df_train[['danceability', 'energy', 'instrumentalness', 'valence','mode', 'y_valence', 'y_arousal']]
df_val = df_val[['danceability', 'energy', 'instrumentalness', 'valence','mode', 'y_valence', 'y_arousal']]
df_test = df_test[['danceability', 'energy', 'instrumentalness', 'valence','mode','y_valence', 'y_arousal']]


In [57]:
# Remove rows with missing values from the training, validation, and test datasets
df_train = df_train.dropna()
df_val = df_val.dropna()
df_test = df_test.dropna()

In [99]:
# output colums
df_train.columns

Index(['danceability', 'energy', 'instrumentalness', 'valence', 'mode',
       'y_valence', 'y_arousal'],
      dtype='object')

## split data

In [59]:

# Training set
# X_train: Features for training set, excluding the target variables 'y_valence' and 'y_arousal'
X_train = df_train.drop(['y_valence', 'y_arousal'], axis=1).values
# y_train_valence: Target variable 'y_valence' for training set
y_train_valence = df_train.y_valence.values 
# y_train_arousal: Target variable 'y_arousal' for training set
y_train_arousal = df_train.y_arousal.values
    
# Validation set
# X_val: Features for validation set, excluding the target variables 'y_valence' and 'y_arousal'
X_val = df_val.drop(['y_valence', 'y_arousal'], axis=1).values
# y_val_valence: Target variable 'y_valence' for validation set
y_val_valence = df_val.y_valence.values 
# y_val_arousal: Target variable 'y_arousal' for validation set
y_val_arousal = df_val.y_arousal.values 

# Test set
# X_test: Features for test set, excluding the target variables 'y_valence' and 'y_arousal'
X_test = df_test.drop(['y_valence', 'y_arousal'], axis=1).values
# y_test_valence: Target variable 'y_valence' for test set
y_test_valence = df_test.y_valence.values 
# y_test_arousal: Target variable 'y_arousal' for test set
y_test_arousal = df_test.y_arousal.values

# Evaluation

In [73]:
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_model(X_val, y_1_validation, y_2_validation, model_predictions_file='predictions.csv'):
    """Evaluate the trained model using different evaluation criteria, including Normalized RMSE"""
    
    # Load the saved predictions from the CSV file
    df_predictions = pd.read_csv(model_predictions_file)
    
    # Get the true values from validation data
    true_valence = y_1_validation
    true_arousal = y_2_validation

    # Ensure predictions are in the original range (if necessary)
    # If predictions are standardized, use the inverse_transform of your scaler before proceeding.
    # Example: df_predictions['pred_valence'] = scaler.inverse_transform(df_predictions[['pred_valence']])

    # Compute RMSE
    rmse_valence = mean_squared_error(true_valence, df_predictions['pred_valence'], squared=False)
    rmse_arousal = mean_squared_error(true_arousal, df_predictions['pred_arousal'], squared=False)

    # Compute Normalized RMSE
    valence_range = max(true_valence) - min(true_valence)
    arousal_range = max(true_arousal) - min(true_arousal)

    normalized_rmse_valence = rmse_valence / valence_range if valence_range > 0 else None
    normalized_rmse_arousal = rmse_arousal / arousal_range if arousal_range > 0 else None

    # Compute R²
    r2_valence = r2_score(true_valence, df_predictions['pred_valence'])
    r2_arousal = r2_score(true_arousal, df_predictions['pred_arousal'])

    # Print evaluation results
    print(f"RMSE for Valence: {rmse_valence:.4f}")
    print(f"RMSE for Arousal: {rmse_arousal:.4f}")
    print(f"Normalized RMSE for Valence: {normalized_rmse_valence:.4f}" if normalized_rmse_valence is not None else "Valence range is zero, cannot compute NRMSE.")
    print(f"Normalized RMSE for Arousal: {normalized_rmse_arousal:.4f}" if normalized_rmse_arousal is not None else "Arousal range is zero, cannot compute NRMSE.")
    print(f"R² for Valence: {r2_valence:.4f}")
    print(f"R² for Arousal: {r2_arousal:.4f}")
    
    # Return evaluation results as a dictionary
    eval_results = {
        'rmse_valence': rmse_valence,
        'rmse_arousal': rmse_arousal,
        'normalized_rmse_valence': normalized_rmse_valence,
        'normalized_rmse_arousal': normalized_rmse_arousal,
        'r2_valence': r2_valence,
        'r2_arousal': r2_arousal
    }

    return eval_results


## Evaluation Test

In [77]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

def evaluate_on_test(X_test, y_test_valence, y_test_arousal, model_valence, model_arousal,scaler_X=None):
    """
    在原始尺度上评估模型性能：输出 RMSE、Normalized RMSE 和 R²。
    """
    # 1. 归一化测试集特征（如果提供了 scaler）
    if scaler_X is not None:
     X_test = scaler_X.transform(X_test)
    # 1. 模型预测
    pred_val = model_valence.predict(X_test)
    pred_arou = model_arousal.predict(X_test)

    # 2. RMSE
    rmse_val = mean_squared_error(y_test_valence, pred_val, squared=False)
    rmse_arou = mean_squared_error(y_test_arousal, pred_arou, squared=False)

    # 3. Normalized RMSE
    valence_range = np.max(y_test_valence) - np.min(y_test_valence)
    arousal_range = np.max(y_test_arousal) - np.min(y_test_arousal)

    nrmse_val = rmse_val / valence_range if valence_range > 0 else None
    nrmse_arou = rmse_arou / arousal_range if arousal_range > 0 else None

    # 4. R²
    r2_val = r2_score(y_test_valence, pred_val)
    r2_arou = r2_score(y_test_arousal, pred_arou)

    # 5. 输出结果
    print("📊 [Test Set Evaluation]")
    print(f"RMSE (Valence): {rmse_val:.4f}")
    print(f"RMSE (Arousal): {rmse_arou:.4f}")
    print(f"Normalized RMSE (Valence): {nrmse_val:.4f}" if nrmse_val is not None else "Valence range is zero, cannot compute NRMSE.")
    print(f"Normalized RMSE (Arousal): {nrmse_arou:.4f}" if nrmse_arou is not None else "Arousal range is zero, cannot compute NRMSE.")
    print(f"R² (Valence): {r2_val:.4f}")
    print(f"R² (Arousal): {r2_arou:.4f}")

    # 6. 返回可选结果字典
    return {
        'rmse_valence': rmse_val,
        'rmse_arousal': rmse_arou,
        'normalized_rmse_valence': nrmse_val,
        'normalized_rmse_arousal': nrmse_arou,
        'r2_valence': r2_val,
        'r2_arousal': r2_arou
    }


# Normalization to True y

In [61]:
from sklearn.preprocessing import MinMaxScaler

# 初始化 scaler
scaler_valence = MinMaxScaler()
scaler_arousal = MinMaxScaler()

# 归一化训练集目标变量
y_train_valence_scaled = scaler_valence.fit_transform(y_train_valence.reshape(-1, 1)).flatten()
y_train_arousal_scaled = scaler_arousal.fit_transform(y_train_arousal.reshape(-1, 1)).flatten()

# 验证集使用相同的 scaler 转换（不能重新 fit！）
y_val_valence_scaled = scaler_valence.transform(y_val_valence.reshape(-1, 1)).flatten()
y_val_arousal_scaled = scaler_arousal.transform(y_val_arousal.reshape(-1, 1)).flatten()

# 测试集同样如此（如果你用它）
y_test_valence_scaled = scaler_valence.transform(y_test_valence.reshape(-1, 1)).flatten()
y_test_arousal_scaled = scaler_arousal.transform(y_test_arousal.reshape(-1, 1)).flatten()


# LR

## Model1 Without Nor

In [25]:

def train_regression(X, y_1, y_2, X_val, param_grid=None):
    """ Train the regression model with GridSearchCV """
    
    # Default parameters for grid search
    if param_grid is None:
        param_grid = {'fit_intercept': [True, False], 'positive': [True, False]}
    
    # Initialize models for both targets
    lr_val = LinearRegression()
    lr_arou = LinearRegression()
    
    # Grid search for both models
    clf_vale = GridSearchCV(lr_val, 
                            param_grid, 
                            scoring='neg_mean_squared_error', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)
    
    clf_arou = GridSearchCV(lr_arou, 
                            param_grid, 
                            scoring='neg_mean_squared_error', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)

    # Fit models to training data
    clf_vale.fit(X, y_1)
    clf_arou.fit(X, y_2)
    
    # Print best results on training data
    print()
    print(f"Best parameter for Valence (CV score={clf_vale.best_score_:.3f}):")
    print(clf_vale.best_params_)
    
    print()
    print(f"Best parameter for Arousal (CV score={clf_arou.best_score_:.3f}):")
    print(clf_arou.best_params_)

    # Initialize models with best parameters
    lr_val_top = LinearRegression(fit_intercept=clf_vale.best_params_['fit_intercept'],  
                                  positive=clf_vale.best_params_['positive'])
    lr_arou_top = LinearRegression(fit_intercept=clf_arou.best_params_['fit_intercept'],  
                                   positive=clf_arou.best_params_['positive'])

    # Fit optimal models to the entire training data
    lr_val_top.fit(X, y_1)
    lr_arou_top.fit(X, y_2)

    # Predict for validation data
    predictions_val = lr_val_top.predict(X_val)  # Predictions for Valence
    predictions_arou = lr_arou_top.predict(X_val)  # Predictions for Arousal

    # Save predictions for validation data to CSV
    df_predictions = pd.DataFrame({
        'pred_valence': predictions_val,  # Valence predictions
        'pred_arousal': predictions_arou  # Arousal predictions
    })

    # Save predictions to a CSV file
    df_predictions.to_csv('AUdio_prediction.csv', index=False)

    print("✅ Training completed and predictions saved!")
    return lr_val_top, lr_arou_top


## Model2 Nor+track

In [62]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
import pandas as pd

def train_regression_scaled_y(X, y_1_scaled, y_2_scaled, X_val, track_ids, param_grid=None, output_file='AUdio_prediction.csv'):
    """
    Train regression model using pre-scaled y values (Valence and Arousal).
    Predictions are not inverse-transformed and are saved as scaled.
    """

    # Step 1: Set default parameters if none provided
    if param_grid is None:
        param_grid = {'fit_intercept': [True, False], 'positive': [True, False]}

    # Step 2: Initialize base models
    lr_val = LinearRegression()
    lr_arou = LinearRegression()

    # Step 3: Grid search
    clf_vale = GridSearchCV(lr_val, param_grid, scoring='neg_mean_squared_error',
                            verbose=1, n_jobs=-1, return_train_score=True)
    clf_arou = GridSearchCV(lr_arou, param_grid, scoring='neg_mean_squared_error',
                            verbose=1, n_jobs=-1, return_train_score=True)

    # Step 4: Fit models on scaled y
    clf_vale.fit(X, y_1_scaled)
    clf_arou.fit(X, y_2_scaled)

    print(f"\nBest Valence Params (CV Score: {clf_vale.best_score_:.3f}): {clf_vale.best_params_}")
    print(f"Best Arousal Params (CV Score: {clf_arou.best_score_:.3f}): {clf_arou.best_params_}")

    # Step 5: Refit best models
    best_val = LinearRegression(**clf_vale.best_params_)
    best_arou = LinearRegression(**clf_arou.best_params_)

    best_val.fit(X, y_1_scaled)
    best_arou.fit(X, y_2_scaled)

    # Step 6: Predict on validation set (still scaled)
    pred_val_scaled = best_val.predict(X_val)
    pred_arou_scaled = best_arou.predict(X_val)

    # Step 7: Save predictions to CSV
    if len(track_ids) != len(pred_val_scaled):
        raise ValueError("Mismatch: The number of track IDs does not match the number of predictions!")

    df_predictions = pd.DataFrame({
        'track_id': track_ids,
        'pred_valence': pred_val_scaled,
        'pred_arousal': pred_arou_scaled
    })

    df_predictions.to_csv(output_file, index=False)
    print(f"✅ Scaled predictions saved to {output_file}!")

    return best_val, best_arou


In [63]:

lr_val_model, lr_arou_model = train_regression_scaled_y(
    X_train,
    y_train_valence_scaled,
    y_train_arousal_scaled,
    X_val,
    track_ids=df_validation['trackname'].values,
    output_file='AUdio_prediction_scaled.csv'
)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 4 candidates, totalling 20 fits

Best Valence Params (CV Score: -0.072): {'fit_intercept': True, 'positive': False}
Best Arousal Params (CV Score: -0.030): {'fit_intercept': True, 'positive': False}
✅ Scaled predictions saved to AUdio_prediction_scaled.csv!


In [41]:
# 读取数据
df = pd.read_csv('data/merged/merged_cleaned_sentiment_validation.csv')

# 假设验证集是从原始数据集中选出的
df_validation = df.iloc[:len(X_val)]  # 只保留验证集部分

# 提取验证集对应的 Track ID
track_ids = df_validation['trackname'].values  # 或者 df_validation['track_id'] 取唯一标识符

# 训练回归模型，并存储预测结果
train_regression(X_train, y_train_valence, y_train_arousal, X_val, track_ids)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 4 candidates, totalling 20 fits

Best parameter for Valence (CV score=-0.982):
{'fit_intercept': True, 'positive': False}

Best parameter for Arousal (CV score=-0.774):
{'fit_intercept': True, 'positive': False}
✅ Training completed and predictions saved with Track IDs!


(LinearRegression(), LinearRegression())

In [27]:
# Assuming X_train, y_train, X_validation, y_validation are your data
lr_val_model, lr_arou_model = train_regression(X_train, y_train_valence, y_train_arousal, X_val)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 4 candidates, totalling 20 fits

Best parameter for Valence (CV score=-0.982):
{'fit_intercept': True, 'positive': False}

Best parameter for Arousal (CV score=-0.774):
{'fit_intercept': True, 'positive': False}
✅ Training completed and predictions saved!


## Evaluation

In [74]:
# Assuming X_val, y_valence_val, y_arousal_val are your validation data
eval_results = evaluate_model(X_val, y_val_valence_scaled ,y_val_arousal_scaled,model_predictions_file='AUdio_prediction_scaled.csv')


RMSE for Valence: 0.2614
RMSE for Arousal: 0.1670
Normalized RMSE for Valence: 0.2774
Normalized RMSE for Arousal: 0.1670
R² for Valence: 0.1317
R² for Arousal: 0.2009


RMSE for Valence: 0.966
RMSE for Arousal: 0.850
Normalized RMSE for Valence: 0.277
Normalized RMSE for Arousal: 0.167
R² for Valence: 0.132
R² for Arousal: 0.201

In [65]:
print("Before scaling:", y_train_valence.min(), y_train_valence.max())
print("After scaling:", y_train_valence_scaled.min(), y_train_valence_scaled.max())


Before scaling: -2.14809720439 1.54671444505
After scaling: 0.0 1.0


## Test

In [None]:
from sklearn.metrics import mean_squared_error

# 计算 MSE
mse_valence = mean_squared_error(y_test_valence, lr_val_model.predict(X_test))
mse_arousal = mean_squared_error(y_test_arousal, lr_arou_model.predict(X_test))

# 计算 Normalized RMSE
valence_range = y_test_valence.max() - y_test_valence.min()
arousal_range = y_test_arousal.max() - y_test_arousal.min()

nrmse_valence = (mse_valence ** 0.5) / valence_range if valence_range > 0 else None
nrmse_arousal = (mse_arousal ** 0.5) / arousal_range if arousal_range > 0 else None

print(f"MSE for Valence: {mse_valence:.4f}")
print(f"MSE for Arousal: {mse_arousal:.4f}")
print(f"Normalized RMSE for Valence: {nrmse_valence:.4f}" if nrmse_valence is not None else "Valence range is zero, cannot compute NRMSE.")
print(f"Normalized RMSE for Arousal: {nrmse_arousal:.4f}" if nrmse_arousal is not None else "Arousal range is zero, cannot compute NRMSE.")

# 评估测试集上的模型性能
results = evaluate_on_test(
    X_test,
    y_test_valence,
    y_test_arousal,
    lr_val_model,
    lr_arou_model
)

📊 [Test Set Evaluation]
RMSE (Valence): 0.9594
RMSE (Arousal): 0.8554
Normalized RMSE (Valence): 0.2597
Normalized RMSE (Arousal): 0.1681
R² (Valence): 0.1703
R² (Arousal): 0.1928


# MLP

## Define Model

In [67]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

def train_mlp(X, y_1, y_2, X_val, param_grid=None):
    """ Train the MLP model with GridSearchCV and save the predictions """

    # Normalize the features using MinMaxScaler
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)
    X_val = scaler.transform(X_val)

    # Default parameters for grid search
    if param_grid is None:
        param_grid = {
            'hidden_layer_sizes': [(5,), (10,), (15,), (5,5), (10,10), (15,15), (5,5,5), (10,10,10), (15,15,15)], 
            'max_iter': [500, 1000, 2000, 2500]
        }

    # Initialize models for both targets
    mlp_val = MLPRegressor(random_state=2)
    mlp_arou = MLPRegressor(random_state=2)
    
    # Grid search for Valence
    clf_vale = GridSearchCV(mlp_val, param_grid, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1, return_train_score=True)
    clf_vale.fit(X, y_1)
    print(f"Best parameter for Valence (CV score={-clf_vale.best_score_:.3f}): {clf_vale.best_params_}")
    
    # Initialize model with best parameters and fit
    mlp_val_top = MLPRegressor(hidden_layer_sizes=clf_vale.best_params_['hidden_layer_sizes'],
                               max_iter=clf_vale.best_params_['max_iter'],
                               random_state=2)
    mlp_val_top.fit(X, y_1)

    # Grid search for Arousal
    clf_arou = GridSearchCV(mlp_arou, param_grid, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1, return_train_score=True)
    clf_arou.fit(X, y_2)
    print(f"Best parameter for Arousal (CV score={-clf_arou.best_score_:.3f}): {clf_arou.best_params_}")
    
    # Initialize model with best parameters and fit
    mlp_arou_top = MLPRegressor(hidden_layer_sizes=clf_arou.best_params_['hidden_layer_sizes'],
                                max_iter=clf_arou.best_params_['max_iter'],
                                random_state=2)
    mlp_arou_top.fit(X, y_2)

    # Predict for validation data
    predictions_val = mlp_val_top.predict(X_val)  # Predictions for Valence
    predictions_arou = mlp_arou_top.predict(X_val)  # Predictions for Arousal

    # Save predictions for validation data to CSV
    df_predictions = pd.DataFrame({
        'pred_valence': predictions_val,  # Valence predictions
        'pred_arousal': predictions_arou  # Arousal predictions
    })

    # Save predictions to a CSV file
    df_predictions.to_csv('AUdio_predictions_mlp.csv', index=False)

    print("✅ MLP training completed and predictions saved!")
    return mlp_val_top, mlp_arou_top


## Train MLP

In [68]:
# Assuming X_train, y_train_valence, y_train_arousal, X_val are available
mlp_val_model, mlp_arou_model = train_mlp(X_train, y_train_valence, y_train_arousal, X_val)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameter for Valence (CV score=0.968): {'hidden_layer_sizes': (15,), 'max_iter': 500}
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameter for Arousal (CV score=0.756): {'hidden_layer_sizes': (10, 10, 10), 'max_iter': 500}
✅ MLP training completed and predictions saved!


## Evaluation

In [75]:
eval_results = evaluate_model(X_val, y_val_valence, y_val_arousal, model_predictions_file='AUdio_predictions_mlp.csv')

RMSE for Valence: 0.9624
RMSE for Arousal: 0.8360
Normalized RMSE for Valence: 0.2764
Normalized RMSE for Arousal: 0.1643
R² for Valence: 0.1379
R² for Arousal: 0.2270


## Test

In [91]:
### Test Set - MLP 
mlp_val = MLPRegressor(hidden_layer_sizes=mlp_val_model.hidden_layer_sizes, max_iter=mlp_val_model.max_iter, random_state=2)
mlp_arou = MLPRegressor(hidden_layer_sizes=mlp_arou_model.hidden_layer_sizes, max_iter=mlp_arou_model.max_iter, random_state=2)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_scaled.shape
print(mlp_val.fit(X_train_scaled, y_train_valence).score(X_test_scaled, y_test_valence))
print(mlp_arou.fit(X_train_scaled, y_train_arousal).score(X_test_scaled, y_test_arousal))

0.17802320341063427
0.20265538858827592


In [87]:
# 评估测试集上的 MLP 模型性能
results_mlp = evaluate_on_test(
    X_test,
    y_test_valence,
    y_test_arousal,
    mlp_val_model,
    mlp_arou_model
)

📊 [Test Set Evaluation]
RMSE (Valence): 0.9561
RMSE (Arousal): 0.8502
Normalized RMSE (Valence): 0.2588
Normalized RMSE (Arousal): 0.1671
R² (Valence): 0.1760
R² (Arousal): 0.2024


# RF

## def MODEL

In [70]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import pandas as pd

def train_rf(X, y_1, y_2, X_val):
    """ Train Random Forest model and save predictions for validation data """

    # Initialize models for Valence and Arousal
    rf_val = RandomForestRegressor(random_state=0)
    rf_arou = RandomForestRegressor(random_state=0)

    # Hyperparameter grid for GridSearchCV
    param_grid = { 
        'n_estimators': [100, 500],
        'max_depth': [5, 10, 15]
    }

    # Grid search for Valence model
    clf_vale = GridSearchCV(rf_val, 
                            param_grid, 
                            scoring='neg_mean_squared_error', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)

    # Grid search for Arousal model
    clf_arou = GridSearchCV(rf_arou, 
                            param_grid, 
                            scoring='neg_mean_squared_error', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)

    # Fit models for Valence and Arousal
    clf_vale.fit(X, y_1)
    clf_arou.fit(X, y_2)

    # Print best parameters found by GridSearchCV
    print(f"Best parameter for Valence (CV score={-clf_vale.best_score_:.3f}):")
    print(clf_vale.best_params_)

    print(f"Best parameter for Arousal (CV score={-clf_arou.best_score_:.3f}):")
    print(clf_arou.best_params_)

    # Initialize models with best parameters
    rf_val_best = RandomForestRegressor(n_estimators=clf_vale.best_params_['n_estimators'],
                                        max_depth=clf_vale.best_params_['max_depth'],
                                        random_state=0)
    
    rf_arou_best = RandomForestRegressor(n_estimators=clf_arou.best_params_['n_estimators'],
                                         max_depth=clf_arou.best_params_['max_depth'],
                                         random_state=0)

    # Fit models with best parameters
    rf_val_best.fit(X, y_1)
    rf_arou_best.fit(X, y_2)

    # Predict for validation data
    predictions_val = rf_val_best.predict(X_val)  # Predictions for Valence
    predictions_arou = rf_arou_best.predict(X_val)  # Predictions for Arousal

    # Save predictions to a DataFrame
    df_predictions = pd.DataFrame({
        'pred_valence': predictions_val,  # Valence predictions
        'pred_arousal': predictions_arou  # Arousal predictions
    })

    # Save predictions to CSV
    df_predictions.to_csv('AUdio_predictions_rf.csv', index=False)

    print("✅ Random Forest training completed and predictions saved!")

    return rf_val_best, rf_arou_best


## Train Model

In [71]:
rf_val_model, rf_arou_model = train_rf(X_train, y_train_valence, y_train_arousal, X_val)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best parameter for Valence (CV score=0.976):
{'max_depth': 5, 'n_estimators': 500}
Best parameter for Arousal (CV score=0.751):
{'max_depth': 5, 'n_estimators': 500}
✅ Random Forest training completed and predictions saved!


## Evaluation

In [76]:
eval_results = evaluate_model(X_val, y_val_valence, y_val_arousal, model_predictions_file='AUdio_predictions_rf.csv') 

RMSE for Valence: 0.9642
RMSE for Arousal: 0.8331
Normalized RMSE for Valence: 0.2769
Normalized RMSE for Arousal: 0.1637
R² for Valence: 0.1346
R² for Arousal: 0.2324


## Test

In [90]:
#### Test Set - RF
rf_val = RandomForestRegressor(n_estimators = 100, 
                                        max_depth = 5, random_state=0)
rf_arou = RandomForestRegressor(n_estimators = 100, 
                                        max_depth = 5, random_state=0)

print(rf_val.fit(X_train, y_train_valence).score(X_test, y_test_valence))
print(rf_arou.fit(X_train, y_train_arousal).score(X_test, y_test_arousal))

0.1710578256745422
0.20400503272081394


In [88]:
# 评估测试集上的 Random Forest 模型性能
results_rf = evaluate_on_test(
    X_test,
    y_test_valence,
    y_test_arousal,
    rf_val_model,
    rf_arou_model
)

📊 [Test Set Evaluation]
RMSE (Valence): 0.9583
RMSE (Arousal): 0.8494
Normalized RMSE (Valence): 0.2594
Normalized RMSE (Arousal): 0.1669
R² (Valence): 0.1723
R² (Arousal): 0.2039


# SVR

## Def Model

In [92]:


def train_svr_linear(X, y_1, y_2, X_val):
    """ Train SVR model (linear kernel) with GridSearch over C, and save predictions for validation data """

    # Normalize features
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    X_val_scaled = scaler.transform(X_val)

    # Define base model
    base_svr = SVR(kernel='linear')

    # Hyperparameter grid (only for C)
    param_grid = {
        'C': [0.1, 1, 5, 10, 100]
    }

    # Grid search for Valence model
    clf_vale = GridSearchCV(base_svr, 
                            param_grid, 
                            scoring='neg_mean_squared_error', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)

    # Grid search for Arousal model
    clf_arou = GridSearchCV(base_svr, 
                            param_grid, 
                            scoring='neg_mean_squared_error', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)

    # Fit models
    clf_vale.fit(X_scaled, y_1)
    clf_arou.fit(X_scaled, y_2)

    # Print best parameters
    print(f"Best C for Valence (CV score={-clf_vale.best_score_:.3f}): {clf_vale.best_params_['C']}")
    print(f"Best C for Arousal (CV score={-clf_arou.best_score_:.3f}): {clf_arou.best_params_['C']}")

    # Use best estimators to predict
    svr_val_best = clf_vale.best_estimator_
    svr_arou_best = clf_arou.best_estimator_

    predictions_val = svr_val_best.predict(X_val_scaled)
    predictions_arou = svr_arou_best.predict(X_val_scaled)

    # Save predictions
    df_predictions = pd.DataFrame({
        'pred_valence': predictions_val,
        'pred_arousal': predictions_arou
    })

    df_predictions.to_csv('Audio_predictions_svr_linear_gridC.csv', index=False)

    print("✅ SVR (linear kernel with GridSearch on C) training completed and predictions saved!")

    return svr_val_best, svr_arou_best


## train

In [93]:
svr_val_model, svr_arou_model = train_svr_linear(X_train, y_train_valence, y_train_arousal, X_val)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best C for Valence (CV score=1.011): 0.1
Best C for Arousal (CV score=0.776): 0.1
✅ SVR (linear kernel with GridSearch on C) training completed and predictions saved!


## Evaluation

In [94]:
eval_results = evaluate_model(X_val, y_val_valence, y_val_arousal, model_predictions_file='Audio_predictions_svr_linear_gridC.csv')

RMSE for Valence: 0.9784
RMSE for Arousal: 0.8504
Normalized RMSE for Valence: 0.2810
Normalized RMSE for Arousal: 0.1671
R² for Valence: 0.1091
R² for Arousal: 0.2001


## Test

In [95]:
#### Test Score - SVR
svr_val = SVR(kernel=svr_val_model.kernel, C=svr_val_model.C)
svr_arou = SVR(kernel=svr_arou_model.kernel, C=svr_arou_model.C)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(svr_val.fit(X_train, y_train_valence).score(X_test, y_test_valence))
print(svr_arou.fit(X_train, y_train_arousal).score(X_test, y_test_arousal))


0.15832727745087072
0.18879866121237865


In [97]:
results_svr = evaluate_on_test(
    X_test,
    y_test_valence,
    y_test_arousal,
    svr_val_model,
    svr_arou_model,
    scaler_X=scaler  # ✅ 传入用于训练的 scaler
)

📊 [Test Set Evaluation]
RMSE (Valence): 0.9674
RMSE (Arousal): 0.8576
Normalized RMSE (Valence): 0.2618
Normalized RMSE (Arousal): 0.1685
R² (Valence): 0.1565
R² (Arousal): 0.1885
