Metric Reliability Test: To ensure that we selected stable and
meaningful metrics for evaluating sentiment forecasting, we
conducted a reliability test inspired by the methodology of
St-Aubin and Agard (2022). This test evaluates how consistently
each metric behaves when forecasting errors are introduced
under controlled conditions.
We simulated sentiment forecasting outputs by adding
synthetic noise to real sentiment sequences. The noise included
varying bias levels (from −0.2 to 0.2) and variance levels
(from 0.01 to 0.5), reflecting realistic forecasting errors such as
shifts in opinion or fluctuations in engagement. Each metric
was then evaluated over multiple runs to compute:
• Variability: Standard deviation of the metric values
across noise levels. Lower variability indicates that the
metric remains stable when predictions are noisy.
• Confidence Interval Width (CI Width): The width of
the 95% confidence interval around the metric’s average
score. Narrower intervals indicate greater precision
in evaluation.

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt


def load_sentiment_data(file_path):
    """Read a sentiment CSV and bound its scores to the interval [-1, 1].

    Args:
        file_path: Anything ``pandas.read_csv`` accepts as its first
            argument.

    Returns:
        The loaded DataFrame, with the ``sentiment_score`` column clipped
        so that every value lies between -1 and 1 inclusive.
    """
    frame = pd.read_csv(file_path)
    # Out-of-range scores are clamped rather than dropped.
    bounded_scores = frame['sentiment_score'].clip(lower=-1, upper=1)
    frame['sentiment_score'] = bounded_scores
    return frame


def MSE(y_true, y_pred):
    """Mean squared error between observed and predicted values.

    Delegates to scikit-learn's ``mean_squared_error``.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return squared_error

def RMSE(y_true, y_pred):
    """Root mean squared error between observed and predicted values.

    Computed as the square root of scikit-learn's ``mean_squared_error``.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

def MAE(y_true, y_pred):
    """Mean absolute error between observed and predicted values.

    Args:
        y_true: Observed values (array-like).
        y_pred: Predicted values (array-like, same length as ``y_true``).

    Returns:
        The average of ``|y_true - y_pred|``.
    """
    # Coerce to arrays so plain lists/tuples work, as they already do for
    # the sklearn-backed MSE and RMSE helpers; list - list would raise.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(y_true - y_pred))

def sRMSE(y_true, y_pred, scale):
    """Root mean squared error divided by a caller-supplied scale.

    Args:
        y_true: Observed values.
        y_pred: Predicted values.
        scale: Normalising constant the RMSE is divided by.

    Returns:
        ``sqrt(mean_squared_error(y_true, y_pred)) / scale``.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred)) / scale

def MASE(y_true, y_pred, y_naive):
    """Mean absolute error of a forecast relative to a naive forecast.

    The numerator is the mean of ``|y_true - y_pred|``. The denominator is
    the mean absolute error of ``y_naive``, computed only over the points
    where the naive error is non-zero, with a 1e-10 offset added.

    Args:
        y_true: Observed values (array-like).
        y_pred: Forecast being evaluated (array-like).
        y_naive: Baseline forecast used for scaling (array-like).

    Returns:
        The ratio of the forecast's mean absolute error to the naive scale.
        When the naive forecast matches ``y_true`` everywhere, the scale
        falls back to 1e-10 instead of producing NaN.
    """
    # Coerce so plain Python sequences are accepted.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    y_naive = np.asarray(y_naive, dtype=float)

    errors = np.abs(y_true - y_pred)
    naive_errors = np.abs(y_true - y_naive)

    nonzero_naive = naive_errors[naive_errors != 0]
    if nonzero_naive.size == 0:
        # Averaging an empty selection yields NaN, so the epsilon inside
        # the mean cannot protect this case; use it directly as the scale.
        denominator = 1e-10
    else:
        denominator = np.mean(nonzero_naive + 1e-10)
    return np.mean(errors) / denominator

def sPIS(y_true, y_pred, scale):
    """Mean signed forecast error divided by a caller-supplied scale.

    As implemented here this is the average of ``y_pred - y_true`` over
    ``scale``: positive when predictions sit above the observations on
    average, negative when they sit below.

    Args:
        y_true: Observed values (array-like).
        y_pred: Predicted values (array-like).
        scale: Normalising constant the mean error is divided by.

    Returns:
        ``mean(y_pred - y_true) / scale``.
    """
    # Coerce so plain Python sequences are accepted, as they are by the
    # sklearn-backed metrics in this file.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    errors = y_pred - y_true
    return np.mean(errors) / scale

def sAPIS(y_true, y_pred, scale):
    """Scaled sum of mean absolute error and absolute mean error.

    Combines the error magnitude (``mean(|e|)``) with the size of the
    systematic bias (``|mean(e)|``), where ``e = y_pred - y_true``.

    Args:
        y_true: Observed values (array-like).
        y_pred: Predicted values (array-like).
        scale: Normalising constant the sum is divided by.

    Returns:
        ``(mean(|e|) + |mean(e)|) / scale``.
    """
    # Coerce so plain Python sequences are accepted, as they are by the
    # sklearn-backed metrics in this file.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    errors = y_pred - y_true
    return (np.mean(np.abs(errors)) + np.abs(np.mean(errors))) / scale

def SMAPE(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Args:
        y_true: Observed values (array-like).
        y_pred: Predicted values (array-like).

    Returns:
        ``100 * mean(2 * |y_pred - y_true| / (|y_true| + |y_pred| + 1e-10))``.
        The 1e-10 term keeps the ratio finite when both values are zero.
    """
    # Coerce so plain Python sequences are accepted, as they are by the
    # sklearn-backed metrics in this file.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    abs_error = np.abs(y_pred - y_true)
    magnitude = np.abs(y_true) + np.abs(y_pred) + 1e-10
    return 100 * np.mean(2 * abs_error / magnitude)

def test_metrics(file_path, n_samples=None, seed=None):
    """Measure how stable each forecasting metric is under synthetic noise.

    The sentiment scores are perturbed with Gaussian noise for every
    combination of three bias levels and three variances (one draw per
    combination). Each noisy series is scored with eight metrics, and each
    metric is then summarised by:

    * variability: standard deviation of its values across the nine
      noise settings;
    * CI width: width of the 2.5-97.5 percentile interval of 100 bootstrap
      means of those nine values.

    The summary is printed, and one ``<metric>_sensitivity.png`` scatter
    plot (metric value against bias) per metric is saved to the current
    working directory.

    Args:
        file_path: CSV location, passed to ``load_sentiment_data``.
        n_samples: If truthy, only the first ``n_samples`` scores are used.
        seed: Optional integer seed for the noise and bootstrap draws.
            With the default ``None`` the global NumPy random state is
            used, so results differ from run to run unless the caller has
            seeded it.

    Returns:
        ``(results_df, reliability)``: a DataFrame with one row per
        (bias, var) setting and one column per metric, and a dict mapping
        each metric name to ``{"variability": ..., "ci_width": ...}``.
    """
    metric_names = ["MSE", "RMSE", "MAE", "sRMSE", "MASE", "sPIS", "sAPIS", "SMAPE"]

    data = load_sentiment_data(file_path)
    y_true = data['sentiment_score'].values
    if n_samples:
        y_true = y_true[:n_samples]

    # Fall back to a unit scale for a constant (or empty) series so the
    # scaled metrics do not divide by zero.
    spread = np.std(y_true)
    scale = spread if spread > 0 else 1

    # Naive baseline: previous observation; the first point predicts itself.
    y_naive = np.roll(y_true, 1)
    y_naive[0] = y_true[0]

    # A private RandomState keeps seeded runs reproducible without touching
    # global state; unseeded runs keep using the global generator.
    rng = np.random if seed is None else np.random.RandomState(seed)
    sample_state = None if seed is None else rng

    bias_values = [-0.2, 0, 0.2]
    variance_values = [0.01, 0.1, 0.5]
    results = []

    for bias in bias_values:
        for var in variance_values:
            noise = rng.normal(bias, np.sqrt(var), len(y_true))
            # Keep noisy predictions inside the valid sentiment range.
            y_pred = np.clip(y_true + noise, -1, 1)

            results.append({
                "bias": bias,
                "var": var,
                "MSE": MSE(y_true, y_pred),
                "RMSE": RMSE(y_true, y_pred),
                "MAE": MAE(y_true, y_pred),
                "sRMSE": sRMSE(y_true, y_pred, scale),
                "MASE": MASE(y_true, y_pred, y_naive),
                "sPIS": sPIS(y_true, y_pred, scale),
                "sAPIS": sAPIS(y_true, y_pred, scale),
                "SMAPE": SMAPE(y_true, y_pred),
            })

    results_df = pd.DataFrame(results)
    reliability = {}
    for metric in metric_names:
        variability = results_df[metric].std()

        # Bootstrap the mean of the metric across noise settings.
        boot_values = [
            results_df[metric]
            .sample(frac=1, replace=True, random_state=sample_state)
            .mean()
            for _ in range(100)
        ]
        ci = np.percentile(boot_values, [2.5, 97.5])
        ci_width = ci[1] - ci[0]

        reliability[metric] = {"variability": variability, "ci_width": ci_width}

    print("Metric Reliability (Lower variability and CI width = more reliable):")
    for metric, stats in reliability.items():
        print(f"{metric}: Variability = {stats['variability']:.4f}, CI Width = {stats['ci_width']:.4f}")

    for metric in metric_names:
        plt.figure(figsize=(8, 6))
        plt.scatter(results_df["bias"], results_df[metric], alpha=0.5)
        plt.title(f"{metric} Sensitivity to Bias")
        plt.xlabel("Bias")
        plt.ylabel(metric)
        plt.savefig(f"{metric}_sensitivity.png")
        plt.close()

    return results_df, reliability

# Run the reliability test on the first 1000 scores of the exported CSV.
results_df, reliability = test_metrics(
    file_path='/content/Score_output_data.csv',
    n_samples=1000,
)

Metric Reliability (Lower variability and CI width = more reliable):
MSE: Variability = 0.1296, CI Width = 0.1544
RMSE: Variability = 0.1747, CI Width = 0.2173
MAE: Variability = 0.1333, CI Width = 0.1537
sRMSE: Variability = 0.3247, CI Width = 0.4015
MASE: Variability = 0.3112, CI Width = 0.3709
sPIS: Variability = 0.2676, CI Width = 0.3380
sAPIS: Variability = 0.3014, CI Width = 0.3389
SMAPE: Variability = 23.1660, CI Width = 26.1985


Results of the test (variability, CI width):
• MSE: most reliable, especially for penalizing large errors.

• MAE: nearly as reliable as MSE; intuitive and robust.

• RMSE: reliable and highlights large deviations.

• sPIS: slightly less stable but valuable for identifying bias.

This evaluation ensures that our chosen metrics are reliable, interpretable, and suitable for forecasting sentiment scores
over irregular time steps.