# Load Packages

In [36]:
# Install the torch package
#%pip install torch
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from itertools import product
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV
from data import merged

## Load Data

In [41]:
# Read the train / validation / test splits of the merged dataset and discard
# the four VADER sentiment columns from each of them.
_VADER_COLUMNS = ['pos', 'neg', 'neu', 'compound']
df_train, df_val, df_test = (
    pd.read_csv(csv_path).drop(columns=_VADER_COLUMNS)
    for csv_path in (
        'data/merged/merged_cleaned_sentiment_train.csv',
        'data/merged/merged_cleaned_sentiment_validation.csv',
        'data/merged/merged_cleaned_sentiment_test.csv',
    )
)

In [42]:
# Keep only the two regression targets; every other column of the merged
# frames is discarded here (no audio feature columns are retained).
target_columns = ['y_valence', 'y_arousal']
df_train = df_train[target_columns]
df_val = df_val[target_columns]
df_test = df_test[target_columns]


In [43]:
# Read the lyrics feature files, cut off their last 200 columns and place the
# remaining columns next to the existing frames (pd.concat along axis=1, so
# rows are paired by index).
df_train, df_val, df_test = (
    pd.concat([existing, pd.read_csv(lyrics_csv).iloc[:, :-200]], axis=1)
    for existing, lyrics_csv in (
        (df_train, 'data/lyrics/lyrics_features_train.csv'),
        (df_val, 'data/lyrics/lyrics_features_val.csv'),
        (df_test, 'data/lyrics/lyrics_features_test.csv'),
    )
)

In [44]:
# Discard every row that contains at least one missing value, in all three splits.
df_train, df_val, df_test = (
    split.dropna() for split in (df_train, df_val, df_test)
)

In [45]:
# Remove the 'Unnamed: 0' column (presumably a row index that was written into
# the CSV - TODO confirm). errors='ignore' keeps this cell safe to re-run:
# without it a second execution raises KeyError because the column is already
# gone from the reassigned frames.
df_train = df_train.drop(columns=['Unnamed: 0'], errors='ignore')
df_val = df_val.drop(columns=['Unnamed: 0'], errors='ignore')
df_test = df_test.drop(columns=['Unnamed: 0'], errors='ignore')

In [46]:
# Display the column names of the assembled training frame
df_train.columns

Index(['y_valence', 'y_arousal', 'pos', 'neg', 'neu', 'compound',
       'tfidf_pca_1', 'tfidf_pca_2', 'tfidf_pca_3', 'tfidf_pca_4',
       ...
       'tfidf_pca_91', 'tfidf_pca_92', 'tfidf_pca_93', 'tfidf_pca_94',
       'tfidf_pca_95', 'tfidf_pca_96', 'tfidf_pca_97', 'tfidf_pca_98',
       'tfidf_pca_99', 'tfidf_pca_100'],
      dtype='object', length=106)

In [48]:

# Split every frame into a feature matrix (all columns except the two targets)
# and one target vector each for valence and arousal.
def _features_and_targets(frame):
    """Return ``(X, y_valence, y_arousal)`` as NumPy arrays taken from ``frame``."""
    features = frame.drop(['y_valence', 'y_arousal'], axis=1).values
    return features, frame.y_valence.values, frame.y_arousal.values


# Training set
X_train, y_train_valence, y_train_arousal = _features_and_targets(df_train)

# Validation set
X_val, y_val_valence, y_val_arousal = _features_and_targets(df_val)

# Test set
X_test, y_test_valence, y_test_arousal = _features_and_targets(df_test)

In [57]:
# Print the column names of the test frame (targets followed by the feature columns)
print(df_test.columns)  

Index(['y_valence', 'y_arousal', 'pos', 'neg', 'neu', 'compound',
       'tfidf_pca_1', 'tfidf_pca_2', 'tfidf_pca_3', 'tfidf_pca_4',
       ...
       'tfidf_pca_91', 'tfidf_pca_92', 'tfidf_pca_93', 'tfidf_pca_94',
       'tfidf_pca_95', 'tfidf_pca_96', 'tfidf_pca_97', 'tfidf_pca_98',
       'tfidf_pca_99', 'tfidf_pca_100'],
      dtype='object', length=106)


## Evaluation

In [49]:
## Evaluation
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_model(X_val, y_1_validation, y_2_validation, model_predictions_file='predictions.csv'):
    """Score saved valence/arousal predictions against the true targets.

    Reads ``model_predictions_file`` (a CSV with ``pred_valence`` and
    ``pred_arousal`` columns), computes RMSE, range-normalized RMSE and R²
    for both targets, prints them and returns them.

    Parameters
    ----------
    X_val : any
        Not used by this function; kept so existing callers keep working.
    y_1_validation : array-like
        True valence values, in the same row order as the CSV.
    y_2_validation : array-like
        True arousal values, in the same row order as the CSV.
    model_predictions_file : str
        Path of the predictions CSV to evaluate.

    Returns
    -------
    dict
        Keys ``rmse_valence``, ``rmse_arousal``, ``normalized_rmse_valence``,
        ``normalized_rmse_arousal``, ``r2_valence`` and ``r2_arousal``. A
        normalized RMSE is ``None`` when the corresponding target range is zero.

    Raises
    ------
    ValueError
        If the CSV does not hold exactly one prediction per true value.
    """
    # Load the saved predictions from the CSV file
    df_predictions = pd.read_csv(model_predictions_file)

    # Get the true values from validation data
    true_valence = np.asarray(y_1_validation)
    true_arousal = np.asarray(y_2_validation)

    # Fail early with a readable message when the file does not match the targets
    # (for example a stale predictions file from another split).
    n_predictions = len(df_predictions)
    for target_name, truth in (('valence', true_valence), ('arousal', true_arousal)):
        if len(truth) != n_predictions:
            raise ValueError(
                f"{model_predictions_file!r} holds {n_predictions} predictions but "
                f"{len(truth)} true {target_name} values were given."
            )

    # Ensure predictions are in the original range (if necessary)
    # If predictions are standardized, use the inverse_transform of your scaler before proceeding.
    # Example: df_predictions['pred_valence'] = scaler.inverse_transform(df_predictions[['pred_valence']])

    # Compute RMSE as the square root of the MSE. The `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
    # so it is deliberately not used here.
    rmse_valence = mean_squared_error(true_valence, df_predictions['pred_valence']) ** 0.5
    rmse_arousal = mean_squared_error(true_arousal, df_predictions['pred_arousal']) ** 0.5

    # Compute Normalized RMSE (RMSE divided by the range of the true values)
    valence_range = np.max(true_valence) - np.min(true_valence)
    arousal_range = np.max(true_arousal) - np.min(true_arousal)

    normalized_rmse_valence = rmse_valence / valence_range if valence_range > 0 else None
    normalized_rmse_arousal = rmse_arousal / arousal_range if arousal_range > 0 else None

    # Compute R²
    r2_valence = r2_score(true_valence, df_predictions['pred_valence'])
    r2_arousal = r2_score(true_arousal, df_predictions['pred_arousal'])

    # Print evaluation results
    print(f"RMSE for Valence: {rmse_valence:.4f}")
    print(f"RMSE for Arousal: {rmse_arousal:.4f}")
    print(f"Normalized RMSE for Valence: {normalized_rmse_valence:.4f}" if normalized_rmse_valence is not None else "Valence range is zero, cannot compute NRMSE.")
    print(f"Normalized RMSE for Arousal: {normalized_rmse_arousal:.4f}" if normalized_rmse_arousal is not None else "Arousal range is zero, cannot compute NRMSE.")
    print(f"R² for Valence: {r2_valence:.4f}")
    print(f"R² for Arousal: {r2_arousal:.4f}")

    # Return evaluation results as a dictionary
    eval_results = {
        'rmse_valence': rmse_valence,
        'rmse_arousal': rmse_arousal,
        'normalized_rmse_valence': normalized_rmse_valence,
        'normalized_rmse_arousal': normalized_rmse_arousal,
        'r2_valence': r2_valence,
        'r2_arousal': r2_arousal
    }

    return eval_results


## Normalization

In [30]:
## Normalization
# Step 2: min-max scaling - fit on the training set only; the validation and
# test sets are only transformed with the statistics learned from training data.
# NOTE: the name `scaler` is rebound later by the train_mlp_torch(...) call in
# the "Train" section, which returns its own fitted MinMaxScaler.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled   = scaler.transform(X_val)
X_test_scaled  = scaler.transform(X_test)

# MLP

## Model

In [50]:
class MLP(nn.Module):
    """Fully connected regressor: Linear+ReLU per hidden width, then a 1-unit Linear head.

    Parameters
    ----------
    input_size : int
        Number of input features.
    hidden_layer_sizes : iterable of int
        Width of each hidden layer, in order. An empty iterable yields a
        single ``Linear(input_size, 1)`` layer.
    """

    def __init__(self, input_size, hidden_layer_sizes):
        super().__init__()
        # widths[i] -> widths[i + 1] describes the i-th hidden Linear layer.
        widths = [input_size, *hidden_layer_sizes]
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.extend((nn.Linear(fan_in, fan_out), nn.ReLU()))
        # Output head: one scalar prediction per sample, no activation.
        stack.append(nn.Linear(widths[-1], 1))
        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return the result."""
        return self.network(x)


In [51]:
# Training function for one model
def train_single_mlp(X_train, y_train, hidden_layers, max_iter, lr=0.001, seed=None):
    """Fit one single-output MLP with full-batch Adam on an MSE loss.

    Parameters
    ----------
    X_train : array-like, 2-D
        Training features, one row per sample.
    y_train : array-like
        Training targets; flattened to one value per row of ``X_train``.
    hidden_layers : iterable of int
        Hidden layer widths passed to ``MLP``.
    max_iter : int
        Number of optimisation steps; each step uses the whole training set.
    lr : float, default 0.001
        Adam learning rate (the default is the previously hard-coded value).
    seed : int or None, default None
        When given, ``torch.manual_seed(seed)`` is called before the model is
        built so weight initialisation, and therefore the run, is repeatable.
        ``None`` leaves the global RNG untouched.

    Returns
    -------
    MLP
        The trained model, left in training mode.

    Raises
    ------
    ValueError
        If ``X_train`` is not 2-D or its row count differs from the number of targets.
    """
    X_array = np.asarray(X_train)
    y_array = np.asarray(y_train).reshape(-1)
    if X_array.ndim != 2:
        raise ValueError(f"X_train must be 2-D, got {X_array.ndim} dimension(s).")
    if X_array.shape[0] != y_array.shape[0]:
        # Otherwise the loss would silently broadcast or fail deep inside torch.
        raise ValueError(
            f"X_train has {X_array.shape[0]} rows but y_train has {y_array.shape[0]} values."
        )

    if seed is not None:
        torch.manual_seed(seed)

    model = MLP(X_array.shape[1], hidden_layers)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    X_tensor = torch.tensor(X_array, dtype=torch.float32)
    # Column vector so the targets line up with the model's (n_samples, 1) output.
    y_tensor = torch.tensor(y_array, dtype=torch.float32).view(-1, 1)

    model.train()
    for _ in range(max_iter):
        optimizer.zero_grad()
        outputs = model(X_tensor)
        loss = criterion(outputs, y_tensor)
        loss.backward()
        optimizer.step()

    return model


In [53]:
def _predict_flat(model, features):
    """Return ``model(features)`` as a 1-D NumPy array, computed without gradients."""
    with torch.no_grad():
        # reshape(-1) instead of squeeze(): the result stays 1-D even for one row.
        return model(features).reshape(-1).numpy()


def train_mlp_torch(X, y_1, y_2, X_val, param_grid=None, y_1_val=None, y_2_val=None,
                    output_filename=None):
    """Grid-search one MLP per target (valence, arousal) and return the best of each.

    Features are min-max scaled with a scaler fitted on ``X`` only. For every
    ``(hidden_layer_sizes, max_iter)`` combination one model is trained per
    target with ``train_single_mlp`` and the model with the lowest MSE is kept.

    Parameters
    ----------
    X : array-like, 2-D
        Unscaled training features.
    y_1, y_2 : array-like
        Training targets for valence and arousal.
    X_val : array-like, 2-D
        Unscaled validation features.
    param_grid : dict or None
        Must provide ``'hidden_layer_sizes'`` and ``'max_iter'`` lists.
        ``None`` uses the built-in default grid.
    y_1_val, y_2_val : array-like or None
        Validation targets. When given, the corresponding model is selected on
        validation MSE. When ``None`` (default) the selection falls back to
        training MSE, which is the previous behaviour and tends to favour the
        configuration that fits the training data most closely.
    output_filename : str or None
        When given, the best models' validation predictions are written to this
        CSV (columns ``pred_valence``, ``pred_arousal``). ``None`` writes nothing.

    Returns
    -------
    tuple
        ``(best_val_model, best_arou_model, scaler, best_hidden_layers,
        best_max_iter)``. The last two describe the best *valence* model only;
        the arousal model's configuration is not reported.

    Raises
    ------
    ValueError
        If ``param_grid`` yields no configuration to try.
    """
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    X_val_scaled = scaler.transform(X_val)

    if param_grid is None:
        param_grid = {
            'hidden_layer_sizes': [(5,), (10,), (15,), (5,5), (10,10), (15,15)],
            'max_iter': [500, 1000]
        }

    configs = list(product(param_grid['hidden_layer_sizes'], param_grid['max_iter']))
    if not configs:
        # Without this guard the function would fail later on `None.eval()`.
        raise ValueError("param_grid must contain at least one hidden_layer_sizes/max_iter combination.")

    X_train_tensor = torch.tensor(X_scaled, dtype=torch.float32)
    X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)

    def selection_mse(model, y_train, y_heldout):
        # Prefer held-out error when validation targets are available.
        if y_heldout is not None:
            return mean_squared_error(y_heldout, _predict_flat(model, X_val_tensor))
        return mean_squared_error(y_train, _predict_flat(model, X_train_tensor))

    best_val_model = None
    best_arou_model = None
    best_val_score = float('inf')
    best_arou_score = float('inf')
    best_hidden_layers = None
    best_max_iter = None

    for hidden_layers, max_iter in configs:
        print(f"🔍 Trying config: hidden_layers={hidden_layers}, max_iter={max_iter}")

        model_val = train_single_mlp(X_scaled, y_1, hidden_layers, max_iter)
        mse_val = selection_mse(model_val, y_1, y_1_val)

        if mse_val < best_val_score:
            best_val_score = mse_val
            best_val_model = model_val
            best_hidden_layers = hidden_layers
            best_max_iter = max_iter

        model_arou = train_single_mlp(X_scaled, y_2, hidden_layers, max_iter)
        mse_arou = selection_mse(model_arou, y_2, y_2_val)

        if mse_arou < best_arou_score:
            best_arou_score = mse_arou
            best_arou_model = model_arou

    best_val_model.eval()
    best_arou_model.eval()

    if output_filename is not None:
        # Predict on the validation set with the best models and save as CSV
        # (validation set only).
        df_predictions = pd.DataFrame({
            'pred_valence': _predict_flat(best_val_model, X_val_tensor),
            'pred_arousal': _predict_flat(best_arou_model, X_val_tensor)
        })
        df_predictions.to_csv(output_filename, index=False)
        print("✅ PyTorch MLP training completed and validation predictions saved.")
    else:
        # Nothing was written, so do not claim that predictions were saved.
        print("✅ PyTorch MLP training completed.")

    return best_val_model, best_arou_model, scaler, best_hidden_layers, best_max_iter


## Test

In [54]:
def generate_test_predictions_torch(model_val, model_arou, X_test, scaler, output_filename):
    """Predict valence and arousal for ``X_test`` with PyTorch models and save them as CSV.

    Parameters
    ----------
    model_val : torch.nn.Module
        Trained valence model.
    model_arou : torch.nn.Module
        Trained arousal model.
    X_test : array-like, 2-D
        Test-set input features (unscaled).
    scaler : object with ``transform``
        The same fitted scaler that was used during training; only
        ``scaler.transform`` is called here.
    output_filename : str or path-like
        Destination CSV. Missing parent directories are created.

    The CSV has the columns ``pred_valence`` and ``pred_arousal`` and no index
    column. Both models are switched to evaluation mode.
    """
    import os
    import torch

    X_test_scaled = scaler.transform(X_test)
    X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

    model_val.eval()
    model_arou.eval()
    with torch.no_grad():
        # reshape(-1) instead of squeeze(): a single-row test set must still
        # yield a 1-D array, otherwise the DataFrame below cannot be built
        # from what would be scalar-like 0-d values.
        preds_val = model_val(X_test_tensor).reshape(-1).numpy()
        preds_aro = model_arou(X_test_tensor).reshape(-1).numpy()

    df_pred = pd.DataFrame({
        "pred_valence": preds_val,
        "pred_arousal": preds_aro
    })

    # Create the target folder when a filesystem path was given; to_csv itself
    # does not create missing directories.
    if isinstance(output_filename, (str, os.PathLike)):
        parent_dir = os.path.dirname(output_filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    df_pred.to_csv(output_filename, index=False)
    print(f"✅ Test predictions saved to {output_filename}")


In [58]:
# Extract the feature columns from the test DataFrame (everything except the
# two target columns), as a NumPy array
X_test_lyrics = df_test.drop(['y_valence', 'y_arousal'], axis=1).values


In [59]:

# NOTE: best_val_model and best_arou_model are only defined by the
# train_mlp_torch(...) call in the "Train" section further down, which also
# rebinds `scaler`. Run that cell first; on a fresh kernel this cell fails
# with a NameError when the notebook is executed top to bottom.
generate_test_predictions_torch(
    best_val_model,
    best_arou_model,
    X_test_lyrics,
    scaler,
    "csv/lyrics/lyrics_test_predictions.csv"
)


✅ Test predictions saved to csv/lyrics/lyrics_test_predictions.csv


## Train

In [55]:
# Train and select the valence and arousal models with the default parameter
# grid. The unscaled X_train / X_val are passed because train_mlp_torch fits
# its own MinMaxScaler; the returned scaler replaces the module-level `scaler`.
best_val_model, best_arou_model, scaler, best_layers, best_iter = train_mlp_torch(
    X_train, y_train_valence, y_train_arousal, X_val
)


🔍 Trying config: hidden_layers=(5,), max_iter=500
🔍 Trying config: hidden_layers=(5,), max_iter=1000
🔍 Trying config: hidden_layers=(10,), max_iter=500
🔍 Trying config: hidden_layers=(10,), max_iter=1000
🔍 Trying config: hidden_layers=(15,), max_iter=500
🔍 Trying config: hidden_layers=(15,), max_iter=1000
🔍 Trying config: hidden_layers=(5, 5), max_iter=500
🔍 Trying config: hidden_layers=(5, 5), max_iter=1000
🔍 Trying config: hidden_layers=(10, 10), max_iter=500
🔍 Trying config: hidden_layers=(10, 10), max_iter=1000
🔍 Trying config: hidden_layers=(15, 15), max_iter=500
🔍 Trying config: hidden_layers=(15, 15), max_iter=1000
✅ PyTorch MLP training completed and validation predictions saved.


## Evaluation

In [35]:
# Score the validation predictions stored in the CSV below.
# NOTE(review): the Train cell above does not write this file
# (train_mlp_torch is called without saving predictions), so it must already
# exist on disk from an earlier run - confirm it matches the current models.
# X_val is accepted by evaluate_model but not used by it.
results_scaled = evaluate_model(
    X_val=X_val_scaled,
    y_1_validation=y_val_valence,
    y_2_validation=y_val_arousal,
    model_predictions_file='csv/lyrics/Lyrics_predictions_mlp_torch.csv'
)

RMSE for Valence: 0.9820
RMSE for Arousal: 0.9683
Normalized RMSE for Valence: 0.2820
Normalized RMSE for Arousal: 0.1903
R² for Valence: 0.1025
R² for Arousal: -0.0370
