In [90]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
# from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import seaborn as sn
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
import nltk
from sklearn.metrics import roc_auc_score, f1_score
import seaborn as sn
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error #add rmse
from data import merged



# Load Data

In [None]:
# Load the data and drop VADER columns
df_train = pd.read_csv('data/merged/merged_cleaned_sentiment_train.csv').drop(['pos','neg','neu', 'compound'], axis = 1)
df_val = pd.read_csv('data/merged/merged_cleaned_sentiment_validation.csv').drop(['pos','neg','neu', 'compound'], axis = 1)
df_test = pd.read_csv('data/merged/merged_cleaned_sentiment_test.csv').drop(['pos','neg','neu', 'compound'], axis = 1)

In [None]:
#Save audio features
df_train = df_train[['danceability', 'energy', 'instrumentalness', 'valence','mode', 'y_valence', 'y_arousal']]
df_val = df_val[['danceability', 'energy', 'instrumentalness', 'valence','mode', 'y_valence', 'y_arousal']]
df_test = df_test[['danceability', 'energy', 'instrumentalness', 'valence','mode','y_valence', 'y_arousal']]


In [4]:
# Load the lyrics features and merge with audio
df_train = pd.concat([df_train, pd.read_csv('data/lyrics/lyrics_features_train.csv').iloc[:, :-200]], axis=1)
df_val = pd.concat([df_val, pd.read_csv('data/lyrics/lyrics_features_val.csv').iloc[:, :-200]], axis=1)
df_test = pd.concat([df_test, pd.read_csv('data/lyrics/lyrics_features_test.csv').iloc[:, :-200]], axis=1)

In [None]:
# Remove rows with missing values from the training, validation, and test datasets
df_train = df_train.dropna()
df_val = df_val.dropna()
df_test = df_test.dropna()

In [136]:
# output colums
df_train.columns

Index(['danceability', 'energy', 'instrumentalness', 'valence', 'mode',
       'y_valence', 'y_arousal', 'Unnamed: 0', 'pos', 'neg',
       ...
       'tfidf_pca_91', 'tfidf_pca_92', 'tfidf_pca_93', 'tfidf_pca_94',
       'tfidf_pca_95', 'tfidf_pca_96', 'tfidf_pca_97', 'tfidf_pca_98',
       'tfidf_pca_99', 'tfidf_pca_100'],
      dtype='object', length=112)

In [None]:

# Training set
# X_train: Features for training set, excluding the target variables 'y_valence' and 'y_arousal'
X_train = df_train.drop(['y_valence', 'y_arousal'], axis=1).values
# y_train_valence: Target variable 'y_valence' for training set
y_train_valence = df_train.y_valence.values 
# y_train_arousal: Target variable 'y_arousal' for training set
y_train_arousal = df_train.y_arousal.values
    
# Validation set
# X_val: Features for validation set, excluding the target variables 'y_valence' and 'y_arousal'
X_val = df_val.drop(['y_valence', 'y_arousal'], axis=1).values
# y_val_valence: Target variable 'y_valence' for validation set
y_val_valence = df_val.y_valence.values 
# y_val_arousal: Target variable 'y_arousal' for validation set
y_val_arousal = df_val.y_arousal.values 

# Test set
# X_test: Features for test set, excluding the target variables 'y_valence' and 'y_arousal'
X_test = df_test.drop(['y_valence', 'y_arousal'], axis=1).values
# y_test_valence: Target variable 'y_valence' for test set
y_test_valence = df_test.y_valence.values 
# y_test_arousal: Target variable 'y_arousal' for test set
y_test_arousal = df_test.y_arousal.values

# RMSE+R2

In [None]:
from sklearn.metrics import mean_squared_error, r2_score


def evaluate_model1(X_val, y_1_validation, y_2_validation, model_predictions_file='predictions.csv'):
    """Evaluate the trained model using different evaluation criteria"""
    
    # Load the saved predictions from the CSV file
    df_predictions = pd.read_csv(model_predictions_file)
    
    # Get the true values from validation data
    true_valence = y_1_validation
    true_arousal = y_2_validation

    # Evaluate RMSE for Valence
    rmse_valence = mean_squared_error(true_valence, df_predictions['pred_valence'], squared=False)
    rmse_arousal = mean_squared_error(true_arousal, df_predictions['pred_arousal'], squared=False)

    # Evaluate R¬≤ for Valence and Arousal
    r2_valence = r2_score(true_valence, df_predictions['pred_valence'])
    r2_arousal = r2_score(true_arousal, df_predictions['pred_arousal'])

    # Print evaluation results
    print(f"RMSE for Valence: {rmse_valence:.3f}")
    print(f"RMSE for Arousal: {rmse_arousal:.3f}")
    
    print(f"R¬≤ for Valence: {r2_valence:.3f}")
    print(f"R¬≤ for Arousal: {r2_arousal:.3f}")
    
    # Return evaluation results as a dictionary
    eval_results = {
        'rmse_valence': rmse_valence,
        'rmse_arousal': rmse_arousal,
        'r2_valence': r2_valence,
        'r2_arousal': r2_arousal
    }

    return eval_results


In [91]:
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_model(X_val, y_1_validation, y_2_validation, model_predictions_file='predictions.csv'):
    """Evaluate the trained model using different evaluation criteria, including Normalized RMSE"""
    
    # Load the saved predictions from the CSV file
    df_predictions = pd.read_csv(model_predictions_file)
    
    # Get the true values from validation data
    true_valence = y_1_validation
    true_arousal = y_2_validation

    # Ensure predictions are in the original range (if necessary)
    # If predictions are standardized, use the inverse_transform of your scaler before proceeding.
    # Example: df_predictions['pred_valence'] = scaler.inverse_transform(df_predictions[['pred_valence']])

    # Compute RMSE
    rmse_valence = mean_squared_error(true_valence, df_predictions['pred_valence'], squared=False)
    rmse_arousal = mean_squared_error(true_arousal, df_predictions['pred_arousal'], squared=False)

    # Compute Normalized RMSE
    valence_range = max(true_valence) - min(true_valence)
    arousal_range = max(true_arousal) - min(true_arousal)

    normalized_rmse_valence = rmse_valence / valence_range if valence_range > 0 else None
    normalized_rmse_arousal = rmse_arousal / arousal_range if arousal_range > 0 else None

    # Compute R¬≤
    r2_valence = r2_score(true_valence, df_predictions['pred_valence'])
    r2_arousal = r2_score(true_arousal, df_predictions['pred_arousal'])

    # Print evaluation results
    print(f"RMSE for Valence: {rmse_valence:.4f}")
    print(f"RMSE for Arousal: {rmse_arousal:.4f}")
    print(f"Normalized RMSE for Valence: {normalized_rmse_valence:.4f}" if normalized_rmse_valence is not None else "Valence range is zero, cannot compute NRMSE.")
    print(f"Normalized RMSE for Arousal: {normalized_rmse_arousal:.4f}" if normalized_rmse_arousal is not None else "Arousal range is zero, cannot compute NRMSE.")
    print(f"R¬≤ for Valence: {r2_valence:.4f}")
    print(f"R¬≤ for Arousal: {r2_arousal:.4f}")
    
    # Return evaluation results as a dictionary
    eval_results = {
        'rmse_valence': rmse_valence,
        'rmse_arousal': rmse_arousal,
        'normalized_rmse_valence': normalized_rmse_valence,
        'normalized_rmse_arousal': normalized_rmse_arousal,
        'r2_valence': r2_valence,
        'r2_arousal': r2_arousal
    }

    return eval_results


# Evaluation on test

In [133]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

def evaluate_on_test(X_test, y_test_valence, y_test_arousal, model_valence, model_arousal,scaler_X=None):
    """
    Âú®ÂéüÂßãÂ∞∫Â∫¶‰∏äËØÑ‰º∞Ê®°ÂûãÊÄßËÉΩÔºöËæìÂá∫ RMSE„ÄÅNormalized RMSE Âíå R¬≤„ÄÇ
    """
    # 1. ÂΩí‰∏ÄÂåñÊµãËØïÈõÜÁâπÂæÅÔºàÂ¶ÇÊûúÊèê‰æõ‰∫Ü scalerÔºâ
    if scaler_X is not None:
     X_test = scaler_X.transform(X_test)
    # 1. Ê®°ÂûãÈ¢ÑÊµã
    pred_val = model_valence.predict(X_test)
    pred_arou = model_arousal.predict(X_test)

    # 2. RMSE
    rmse_val = mean_squared_error(y_test_valence, pred_val, squared=False)
    rmse_arou = mean_squared_error(y_test_arousal, pred_arou, squared=False)

    # 3. Normalized RMSE
    valence_range = np.max(y_test_valence) - np.min(y_test_valence)
    arousal_range = np.max(y_test_arousal) - np.min(y_test_arousal)

    nrmse_val = rmse_val / valence_range if valence_range > 0 else None
    nrmse_arou = rmse_arou / arousal_range if arousal_range > 0 else None

    # 4. R¬≤
    r2_val = r2_score(y_test_valence, pred_val)
    r2_arou = r2_score(y_test_arousal, pred_arou)

    # 5. ËæìÂá∫ÁªìÊûú
    print("üìä [Test Set Evaluation]")
    print(f"RMSE (Valence): {rmse_val:.4f}")
    print(f"RMSE (Arousal): {rmse_arou:.4f}")
    print(f"Normalized RMSE (Valence): {nrmse_val:.4f}" if nrmse_val is not None else "Valence range is zero, cannot compute NRMSE.")
    print(f"Normalized RMSE (Arousal): {nrmse_arou:.4f}" if nrmse_arou is not None else "Arousal range is zero, cannot compute NRMSE.")
    print(f"R¬≤ (Valence): {r2_val:.4f}")
    print(f"R¬≤ (Arousal): {r2_arou:.4f}")

    # 6. ËøîÂõûÂèØÈÄâÁªìÊûúÂ≠óÂÖ∏
    return {
        'rmse_valence': rmse_val,
        'rmse_arousal': rmse_arou,
        'normalized_rmse_valence': nrmse_val,
        'normalized_rmse_arousal': nrmse_arou,
        'r2_valence': r2_val,
        'r2_arousal': r2_arou
    }


# MODEL:LR

## Define model

In [67]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
import pandas as pd

def train_regression(X, y_1, y_2, X_val, param_grid=None):
    """ Train the regression model with GridSearchCV """
    
    # Default parameters for grid search
    if param_grid is None:
        param_grid = {'fit_intercept': [True, False], 'positive': [True, False]}
    
    # Initialize models for both targets
    lr_val = LinearRegression()
    lr_arou = LinearRegression()
    
    # Grid search for both models
    clf_vale = GridSearchCV(lr_val, 
                            param_grid, 
                            scoring='neg_mean_squared_error', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)
    
    clf_arou = GridSearchCV(lr_arou, 
                            param_grid, 
                            scoring='neg_mean_squared_error', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)

    # Fit models to training data
    clf_vale.fit(X, y_1)
    clf_arou.fit(X, y_2)
    
    # Print best results on training data
    print()
    print(f"Best parameter for Valence (CV score={clf_vale.best_score_:.3f}):")
    print(clf_vale.best_params_)
    
    print()
    print(f"Best parameter for Arousal (CV score={clf_arou.best_score_:.3f}):")
    print(clf_arou.best_params_)

    # Initialize models with best parameters
    lr_val_top = LinearRegression(fit_intercept=clf_vale.best_params_['fit_intercept'],  
                                  positive=clf_vale.best_params_['positive'])
    lr_arou_top = LinearRegression(fit_intercept=clf_arou.best_params_['fit_intercept'],  
                                   positive=clf_arou.best_params_['positive'])

    # Fit optimal models to the entire training data
    lr_val_top.fit(X, y_1)
    lr_arou_top.fit(X, y_2)

    # Predict for validation data
    predictions_val = lr_val_top.predict(X_val)  # Predictions for Valence
    predictions_arou = lr_arou_top.predict(X_val)  # Predictions for Arousal

    # Save predictions for validation data to CSV
    df_predictions = pd.DataFrame({
        'pred_valence': predictions_val,  # Valence predictions
        'pred_arousal': predictions_arou  # Arousal predictions
    })

    # Save predictions to a CSV file
    df_predictions.to_csv('predictions2.csv', index=False)

    print("‚úÖ Training completed and predictions saved!")
    return lr_val_top, lr_arou_top


In [64]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
import pandas as pd

def train_regression(X, y_1, y_2, X_val, track_ids, param_grid=None):
    """ Train the regression model with GridSearchCV and save predictions with track IDs """

    # Default parameters for grid search
    if param_grid is None:
        param_grid = {'fit_intercept': [True, False], 'positive': [True, False]}
    
    # Initialize models for both targets
    lr_val = LinearRegression()
    lr_arou = LinearRegression()
    
    # Grid search for both models
    clf_vale = GridSearchCV(lr_val, 
                            param_grid, 
                            scoring='neg_mean_squared_error', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)
    
    clf_arou = GridSearchCV(lr_arou, 
                            param_grid, 
                            scoring='neg_mean_squared_error', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)

    # Fit models to training data
    clf_vale.fit(X, y_1)
    clf_arou.fit(X, y_2)
    
    # Print best results on training data
    print()
    print(f"Best parameter for Valence (CV score={clf_vale.best_score_:.3f}):")
    print(clf_vale.best_params_)
    
    print()
    print(f"Best parameter for Arousal (CV score={clf_arou.best_score_:.3f}):")
    print(clf_arou.best_params_)

    # Initialize models with best parameters
    lr_val_top = LinearRegression(fit_intercept=clf_vale.best_params_['fit_intercept'],  
                                  positive=clf_vale.best_params_['positive'])
    lr_arou_top = LinearRegression(fit_intercept=clf_arou.best_params_['fit_intercept'],  
                                   positive=clf_arou.best_params_['positive'])

    # Fit optimal models to the entire training data
    lr_val_top.fit(X, y_1)
    lr_arou_top.fit(X, y_2)

    # Predict for validation data
    predictions_val = lr_val_top.predict(X_val)  # Predictions for Valence
    predictions_arou = lr_arou_top.predict(X_val)  # Predictions for Arousal

    # Ensure track_ids length matches predictions length
    if len(track_ids) != len(predictions_val):
        raise ValueError("Mismatch: The number of track IDs does not match the number of predictions!")

    # Save predictions for validation data to CSV with track ID
    df_predictions = pd.DataFrame({
        'track_id': track_ids,  # Track ID column
        'pred_valence': predictions_val,  # Valence predictions
        'pred_arousal': predictions_arou  # Arousal predictions
    })

    # Save predictions to a CSV file
    df_predictions.to_csv('predictions1.csv', index=False)

    print("‚úÖ Training completed and predictions saved with Track IDs!")
    return lr_val_top, lr_arou_top


## Train LR model

In [65]:
# ËØªÂèñÊï∞ÊçÆ
df = pd.read_csv('data/merged/merged_cleaned_sentiment_validation.csv')

# ÂÅáËÆæÈ™åËØÅÈõÜÊòØ‰ªéÂéüÂßãÊï∞ÊçÆÈõÜ‰∏≠ÈÄâÂá∫ÁöÑ
df_validation = df.iloc[:len(X_val)]  # Âè™‰øùÁïôÈ™åËØÅÈõÜÈÉ®ÂàÜ

# ÊèêÂèñÈ™åËØÅÈõÜÂØπÂ∫îÁöÑ Track ID
track_ids = df_validation['trackname'].values  # ÊàñËÄÖ df_validation['track_id'] ÂèñÂîØ‰∏ÄÊ†áËØÜÁ¨¶

# ËÆ≠ÁªÉÂõûÂΩíÊ®°ÂûãÔºåÂπ∂Â≠òÂÇ®È¢ÑÊµãÁªìÊûú
train_regression(X_train, y_train_valence, y_train_arousal, X_val, track_ids)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 4 candidates, totalling 20 fits

Best parameter for Valence (CV score=-0.912):
{'fit_intercept': True, 'positive': False}

Best parameter for Arousal (CV score=-0.778):
{'fit_intercept': False, 'positive': False}
‚úÖ Training completed and predictions saved with Track IDs!


(LinearRegression(), LinearRegression(fit_intercept=False))

In [93]:
# Assuming X_train, y_train, X_validation, y_validation are your data
lr_val_model, lr_arou_model = train_regression(X_train, y_train_valence, y_train_arousal, X_val)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 4 candidates, totalling 20 fits

Best parameter for Valence (CV score=-0.912):
{'fit_intercept': True, 'positive': False}

Best parameter for Arousal (CV score=-0.778):
{'fit_intercept': False, 'positive': False}


PermissionError: [Errno 13] Permission denied: 'predictions2.csv'

## Evaluation

### R2+RMSE 

In [94]:
# Assuming X_val, y_valence_val, y_arousal_val are your validation data
eval_results = evaluate_model(X_val, y_val_valence, y_val_arousal ,model_predictions_file='predictions2.csv')


RMSE for Valence: 0.9352
RMSE for Arousal: 0.8537
Normalized RMSE for Valence: 0.2686
Normalized RMSE for Arousal: 0.1678
R¬≤ for Valence: 0.1860
R¬≤ for Arousal: 0.1939


RMSE for Valence: 0.935
RMSE for Arousal: 0.854
R¬≤ for Valence: 0.186
R¬≤ for Arousal: 0.194

In [28]:
df_predictions = pd.read_csv('predictions1.csv')
print(df_predictions.shape)  # ËæìÂá∫È¢ÑÊµãÁªìÊûúÁöÑË°åÊï∞ÔºåÁ°Æ‰øù‰∏éÈ™åËØÅÈõÜ‰∏ÄËá¥


(2558, 2)


In [29]:
print(len(df_predictions), len(y_val_valence), len(y_val_arousal))


2558 2558 2558


## Test

In [109]:
results = evaluate_on_test(
    X_test,
    y_test_valence,
    y_test_arousal,
    lr_val_model,
    lr_arou_model
)


üìä [Test Set Evaluation]
RMSE (Valence): 0.9211
RMSE (Arousal): 0.8569
Normalized RMSE (Valence): 0.2493
Normalized RMSE (Arousal): 0.1684
R¬≤ (Valence): 0.2352
R¬≤ (Arousal): 0.1898


# Model MLP

## call func MLP

In [99]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

def train_mlp(X, y_1, y_2, X_val, param_grid=None):
    """ Train the MLP model with GridSearchCV and save the predictions """

    # Normalize the features using MinMaxScaler
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)
    X_val = scaler.transform(X_val)

    # Default parameters for grid search
    if param_grid is None:
        param_grid = {
            'hidden_layer_sizes': [(5,), (10,), (15,), (5,5), (10,10), (15,15), (5,5,5), (10,10,10), (15,15,15)], 
            'max_iter': [500, 1000, 2000, 2500]
        }

    # Initialize models for both targets
    mlp_val = MLPRegressor(random_state=2)
    mlp_arou = MLPRegressor(random_state=2)
    
    # Grid search for Valence
    clf_vale = GridSearchCV(mlp_val, param_grid, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1, return_train_score=True)
    clf_vale.fit(X, y_1)
    print(f"Best parameter for Valence (CV score={-clf_vale.best_score_:.3f}): {clf_vale.best_params_}")
    
    # Initialize model with best parameters and fit
    mlp_val_top = MLPRegressor(hidden_layer_sizes=clf_vale.best_params_['hidden_layer_sizes'],
                               max_iter=clf_vale.best_params_['max_iter'],
                               random_state=2)
    mlp_val_top.fit(X, y_1)

    # Grid search for Arousal
    clf_arou = GridSearchCV(mlp_arou, param_grid, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1, return_train_score=True)
    clf_arou.fit(X, y_2)
    print(f"Best parameter for Arousal (CV score={-clf_arou.best_score_:.3f}): {clf_arou.best_params_}")
    
    # Initialize model with best parameters and fit
    mlp_arou_top = MLPRegressor(hidden_layer_sizes=clf_arou.best_params_['hidden_layer_sizes'],
                                max_iter=clf_arou.best_params_['max_iter'],
                                random_state=2)
    mlp_arou_top.fit(X, y_2)

    # Predict for validation data
    predictions_val = mlp_val_top.predict(X_val)  # Predictions for Valence
    predictions_arou = mlp_arou_top.predict(X_val)  # Predictions for Arousal

    # Save predictions for validation data to CSV
    df_predictions = pd.DataFrame({
        'pred_valence': predictions_val,  # Valence predictions
        'pred_arousal': predictions_arou  # Arousal predictions
    })

    # Save predictions to a CSV file
    df_predictions.to_csv('predictions_mlp.csv', index=False)

    print("‚úÖ MLP training completed and predictions saved!")
    return mlp_val_top, mlp_arou_top


In [100]:
# Assuming X_train, y_train_valence, y_train_arousal, X_val are available
mlp_val_model, mlp_arou_model = train_mlp(X_train, y_train_valence, y_train_arousal, X_val)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameter for Valence (CV score=0.903): {'hidden_layer_sizes': (5, 5), 'max_iter': 500}
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameter for Arousal (CV score=0.769): {'hidden_layer_sizes': (5, 5, 5), 'max_iter': 500}
‚úÖ MLP training completed and predictions saved!


## Evaluation Results

In [101]:
eval_results = evaluate_model(X_val, y_val_valence, y_val_arousal, model_predictions_file='predictions_mlp.csv')

RMSE for Valence: 0.9324
RMSE for Arousal: 0.8468
Normalized RMSE for Valence: 0.2678
Normalized RMSE for Arousal: 0.1664
R¬≤ for Valence: 0.1909
R¬≤ for Arousal: 0.2069


## Test

In [107]:
### Test Set - MLP 
mlp_val = MLPRegressor(hidden_layer_sizes=mlp_val_model.hidden_layer_sizes, max_iter=mlp_val_model.max_iter, random_state=2)
mlp_arou = MLPRegressor(hidden_layer_sizes=mlp_arou_model.hidden_layer_sizes, max_iter=mlp_arou_model.max_iter, random_state=2)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_scaled.shape
print(mlp_val.fit(X_train_scaled, y_train_valence).score(X_test_scaled, y_test_valence))
print(mlp_arou.fit(X_train_scaled, y_train_arousal).score(X_test_scaled, y_test_arousal))

0.23934316834217395
0.20374419332813887


In [114]:
from sklearn.preprocessing import MinMaxScaler

# ËÆ≠ÁªÉÊó∂ÂÅöÁöÑÂΩí‰∏ÄÂåñ
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
mlp_val_model.fit(X_train_scaled, y_train_valence)
mlp_arou_model.fit(X_train_scaled, y_train_arousal)

# Âú®ËØÑ‰º∞ÂáΩÊï∞ÈáåÁõ¥Êé•‰º† scaler
results = evaluate_on_test(
    X_test,
    y_test_valence,
    y_test_arousal,
    mlp_val_model,
    mlp_arou_model,
    scaler_X=scaler  # ‚úÖ ‰º†ÂÖ•Áî®‰∫éËÆ≠ÁªÉÁöÑ scaler
)


üìä [Test Set Evaluation]
RMSE (Valence): 0.9186
RMSE (Arousal): 0.8495
Normalized RMSE (Valence): 0.2486
Normalized RMSE (Arousal): 0.1669
R¬≤ (Valence): 0.2393
R¬≤ (Arousal): 0.2037


# Model Random Forest

## Def Model

In [46]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import pandas as pd

def train_rf(X, y_1, y_2, X_val):
    """ Train Random Forest model and save predictions for validation data """

    # Initialize models for Valence and Arousal
    rf_val = RandomForestRegressor(random_state=0)
    rf_arou = RandomForestRegressor(random_state=0)

    # Hyperparameter grid for GridSearchCV
    param_grid = { 
        'n_estimators': [100, 500],
        'max_depth': [5, 10, 15]
    }

    # Grid search for Valence model
    clf_vale = GridSearchCV(rf_val, 
                            param_grid, 
                            scoring='neg_mean_squared_error', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)

    # Grid search for Arousal model
    clf_arou = GridSearchCV(rf_arou, 
                            param_grid, 
                            scoring='neg_mean_squared_error', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)

    # Fit models for Valence and Arousal
    clf_vale.fit(X, y_1)
    clf_arou.fit(X, y_2)

    # Print best parameters found by GridSearchCV
    print(f"Best parameter for Valence (CV score={-clf_vale.best_score_:.3f}):")
    print(clf_vale.best_params_)

    print(f"Best parameter for Arousal (CV score={-clf_arou.best_score_:.3f}):")
    print(clf_arou.best_params_)

    # Initialize models with best parameters
    rf_val_best = RandomForestRegressor(n_estimators=clf_vale.best_params_['n_estimators'],
                                        max_depth=clf_vale.best_params_['max_depth'],
                                        random_state=0)
    
    rf_arou_best = RandomForestRegressor(n_estimators=clf_arou.best_params_['n_estimators'],
                                         max_depth=clf_arou.best_params_['max_depth'],
                                         random_state=0)

    # Fit models with best parameters
    rf_val_best.fit(X, y_1)
    rf_arou_best.fit(X, y_2)

    # Predict for validation data
    predictions_val = rf_val_best.predict(X_val)  # Predictions for Valence
    predictions_arou = rf_arou_best.predict(X_val)  # Predictions for Arousal

    # Save predictions to a DataFrame
    df_predictions = pd.DataFrame({
        'pred_valence': predictions_val,  # Valence predictions
        'pred_arousal': predictions_arou  # Arousal predictions
    })

    # Save predictions to CSV
    df_predictions.to_csv('predictions_rf.csv', index=False)

    print("‚úÖ Random Forest training completed and predictions saved!")

    return rf_val_best, rf_arou_best


## Train RF

In [47]:
rf_val_model, rf_arou_model = train_rf(X_train, y_train_valence, y_train_arousal, X_val)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best parameter for Valence (CV score=0.924):
{'max_depth': 10, 'n_estimators': 500}
Best parameter for Arousal (CV score=0.751):
{'max_depth': 10, 'n_estimators': 500}
‚úÖ Random Forest training completed and predictions saved!


## Evaluation fro Rf

In [95]:
eval_results = evaluate_model(X_val, y_val_valence, y_val_arousal, model_predictions_file='predictions_rf.csv') 

RMSE for Valence: 0.9443
RMSE for Arousal: 0.8342
Normalized RMSE for Valence: 0.2712
Normalized RMSE for Arousal: 0.1639
R¬≤ for Valence: 0.1700
R¬≤ for Arousal: 0.2304


## TEST

In [118]:
#### Test Set - RF
rf_val = RandomForestRegressor(n_estimators = 100, 
                                        max_depth = 5, random_state=0)
rf_arou = RandomForestRegressor(n_estimators = 100, 
                                        max_depth = 5, random_state=0)

print(rf_val.fit(X_train, y_train_valence).score(X_test, y_test_valence))
print(rf_arou.fit(X_train, y_train_arousal).score(X_test, y_test_arousal))

0.21563447935080082
0.2031321009223268


In [117]:



# ÊµãËØïÈõÜÁâπÂæÅÂΩí‰∏ÄÂåñ‰ºöÂú®ÂáΩÊï∞ÈáåËá™Âä®ÂÆåÊàêÔºàÈÄöËøá‰º†ÂÖ• scalerÔºâ
results_rf = evaluate_on_test(
    X_test,
    y_test_valence,
    y_test_arousal,
    rf_val_model,
    rf_arou_model,
    scaler_X=None  # ‚úÖ ÂΩí‰∏ÄÂåñÊµãËØïÈõÜÁâπÂæÅ
)


üìä [Test Set Evaluation]
RMSE (Valence): 0.9280
RMSE (Arousal): 0.8478
Normalized RMSE (Valence): 0.2512
Normalized RMSE (Arousal): 0.1666
R¬≤ (Valence): 0.2238
R¬≤ (Arousal): 0.2069


# Model SVR

## def model

In [122]:
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
import pandas as pd

def train_svr_linear(X, y_1, y_2, X_val):
    """ Train SVR model (linear kernel) with GridSearch over C, and save predictions for validation data """

    # Normalize features
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    X_val_scaled = scaler.transform(X_val)

    # Define base model
    base_svr = SVR(kernel='linear')

    # Hyperparameter grid (only for C)
    param_grid = {
        'C': [0.1, 1, 5, 10, 100]
    }

    # Grid search for Valence model
    clf_vale = GridSearchCV(base_svr, 
                            param_grid, 
                            scoring='neg_mean_squared_error', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)

    # Grid search for Arousal model
    clf_arou = GridSearchCV(base_svr, 
                            param_grid, 
                            scoring='neg_mean_squared_error', 
                            verbose=1, 
                            n_jobs=-1,
                            return_train_score=True)

    # Fit models
    clf_vale.fit(X_scaled, y_1)
    clf_arou.fit(X_scaled, y_2)

    # Print best parameters
    print(f"Best C for Valence (CV score={-clf_vale.best_score_:.3f}): {clf_vale.best_params_['C']}")
    print(f"Best C for Arousal (CV score={-clf_arou.best_score_:.3f}): {clf_arou.best_params_['C']}")

    # Use best estimators to predict
    svr_val_best = clf_vale.best_estimator_
    svr_arou_best = clf_arou.best_estimator_

    predictions_val = svr_val_best.predict(X_val_scaled)
    predictions_arou = svr_arou_best.predict(X_val_scaled)

    # Save predictions
    df_predictions = pd.DataFrame({
        'pred_valence': predictions_val,
        'pred_arousal': predictions_arou
    })

    df_predictions.to_csv('predictions_svr_linear_gridC.csv', index=False)

    print("‚úÖ SVR (linear kernel with GridSearch on C) training completed and predictions saved!")

    return svr_val_best, svr_arou_best


In [123]:
svr_val_model, svr_arou_model = train_svr_linear(X_train, y_train_valence, y_train_arousal, X_val)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best C for Valence (CV score=0.931): 0.1
Best C for Arousal (CV score=0.780): 0.1
‚úÖ SVR (linear kernel with GridSearch on C) training completed and predictions saved!


## Evaluation

In [124]:
eval_results = evaluate_model(X_val, y_val_valence, y_val_arousal, model_predictions_file='predictions_svr_linear_gridC.csv')

RMSE for Valence: 0.9415
RMSE for Arousal: 0.8543
Normalized RMSE for Valence: 0.2704
Normalized RMSE for Arousal: 0.1679
R¬≤ for Valence: 0.1749
R¬≤ for Arousal: 0.1927


## Test

In [131]:
#### Test Score - SVR
svr_val = SVR(kernel=svr_val_model.kernel, C=svr_val_model.C)
svr_arou = SVR(kernel=svr_arou_model.kernel, C=svr_arou_model.C)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(svr_val.fit(X_train, y_train_valence).score(X_test, y_test_valence))
print(svr_arou.fit(X_train, y_train_arousal).score(X_test, y_test_arousal))


0.23012613432357842
0.18684068102087226


In [132]:
results_svr = evaluate_on_test(
    X_test,
    y_test_valence,
    y_test_arousal,
    svr_val_model,
    svr_arou_model,
    scaler_X=scaler  # ‚úÖ ‰º†ÂÖ•Áî®‰∫éËÆ≠ÁªÉÁöÑ scaler
)

üìä [Test Set Evaluation]
RMSE (Valence): 0.9242
RMSE (Arousal): 0.8585
Normalized RMSE (Valence): 0.2501
Normalized RMSE (Arousal): 0.1687
R¬≤ (Valence): 0.2301
R¬≤ (Arousal): 0.1868


# Normalization

## Match True and Prediction

In [83]:
import pandas as pd

# ËØªÂèñ Validation Êï∞ÊçÆÔºàÁúüÂÆûÂÄºÔºâ
df_val = pd.read_csv('data/merged/merged_cleaned_sentiment_validation.csv')  # Á°Æ‰øù‰ΩøÁî®È™åËØÅÈõÜÊñá‰ª∂

# ËØªÂèñÈ¢ÑÊµãÊï∞ÊçÆ
df_predictions = pd.read_csv('predictions1.csv')

# Áõ¥Êé•ÂåπÈÖçÁúüÂÆûÁöÑ Valence Âíå ArousalÔºàÂÅáËÆæÊï∞ÊçÆË°åÈ°∫Â∫èÁõ∏ÂêåÔºâ
df_predictions['y_valence'] = df_val['y_valence'].values[:len(df_predictions)]
df_predictions['y_arousal'] = df_val['y_arousal'].values[:len(df_predictions)]

# Êü•ÁúãÂâçÂá†Ë°åÔºåÁ°Æ‰øùÂåπÈÖçÊ≠£Á°Æ
print(df_predictions.head())


           track_id  pred_valence  pred_arousal  y_valence  y_arousal
0         Strangler      0.122002      0.006594  -1.176640  -0.314720
1        The Letter      0.300168      0.070162  -0.780962  -0.789480
2   Big Sky Country      0.542147      0.138120  -0.767318   0.911361
3    Baby's Romance     -0.467359     -0.365043  -1.487725  -0.360813
4  Over The Rainbow     -0.292903     -0.176600   1.071901   0.846830


## Normaliation T+P

In [87]:
from sklearn.preprocessing import MinMaxScaler

# ‰∏∫ Valence Âíå Arousal ÁúüÂÆûÂÄºÂàõÂª∫Áã¨Á´ãÁöÑÂΩí‰∏ÄÂåñÂô®
scaler_valence = MinMaxScaler(feature_range=(0, 1))
scaler_arousal = MinMaxScaler(feature_range=(0, 1))

# ÂΩí‰∏ÄÂåñÁúüÂÆûÂÄº
df_predictions['y_valence_scaled'] = scaler_valence.fit_transform(df_predictions[['y_valence']])
df_predictions['y_arousal_scaled'] = scaler_arousal.fit_transform(df_predictions[['y_arousal']])

# ‰∏∫È¢ÑÊµãÂÄºÂàõÂª∫Áã¨Á´ãÁöÑÂΩí‰∏ÄÂåñÂô®
scaler_pred_valence = MinMaxScaler(feature_range=(0, 1))
scaler_pred_arousal = MinMaxScaler(feature_range=(0, 1))

# ÂΩí‰∏ÄÂåñÈ¢ÑÊµãÂÄº
df_predictions['pred_valence_scaled'] = scaler_pred_valence.fit_transform(df_predictions[['pred_valence']])
df_predictions['pred_arousal_scaled'] = scaler_pred_arousal.fit_transform(df_predictions[['pred_arousal']])

# Êü•ÁúãÂΩí‰∏ÄÂåñÂêéÁöÑÊï∞ÊçÆ
print(df_predictions[['track_id', 'y_valence_scaled', 'pred_valence_scaled', 'y_arousal_scaled', 'pred_arousal_scaled']].head())


           track_id  y_valence_scaled  pred_valence_scaled  y_arousal_scaled  \
0         Strangler          0.217868             0.564546          0.396739   
1        The Letter          0.331505             0.623268          0.303442   
2   Big Sky Country          0.335423             0.703022          0.637681   
3    Baby's Romance          0.128527             0.370297          0.387681   
4  Over The Rainbow          0.863636             0.427796          0.625000   

   pred_arousal_scaled  
0             0.460284  
1             0.488848  
2             0.519384  
3             0.293290  
4             0.377966  


In [88]:
print(f"y_valence min: {df_predictions['y_valence'].min()}, max: {df_predictions['y_valence'].max()}")
print(f"pred_valence min: {df_predictions['pred_valence'].min()}, max: {df_predictions['pred_valence'].max()}")


y_valence min: -1.93524985679, max: 1.54671444505
pred_valence min: -1.5908582342796578, max: 1.4431918286540686


In [89]:
from sklearn.metrics import mean_squared_error
import numpy as np

rmse_valence = np.sqrt(mean_squared_error(df_predictions['y_valence_scaled'], df_predictions['pred_valence_scaled']))
rmse_arousal = np.sqrt(mean_squared_error(df_predictions['y_arousal_scaled'], df_predictions['pred_arousal_scaled']))

print(f"RMSE for Valence (normalized scale): {rmse_valence:.3f}")
print(f"RMSE for Arousal (normalized scale): {rmse_arousal:.3f}")


RMSE for Valence (normalized scale): 0.274
RMSE for Arousal (normalized scale): 0.204
