In [22]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_absolute_error, mean_squared_error


def calculate_mape(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Parameters
    ----------
    y_true : array-like
        Observed values. Used as the denominator, so zeros yield ``inf``/``nan``
        (numpy semantics) rather than raising.
    y_pred : array-like
        Predicted values, paired with ``y_true`` by position.

    Returns
    -------
    numpy.float64
        ``mean(|y_true - y_pred| / |y_true|) * 100``.
    """
    # Coerce to plain arrays so the pairing is positional, like the
    # sklearn-based RMSE/MAE helpers in this file. Without this, two pandas
    # Series with different index labels would be aligned by label and
    # produce NaN, and plain Python lists would fail on the subtraction.
    true_values = np.asarray(y_true, dtype=float)
    predicted_values = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((true_values - predicted_values) / true_values)) * 100

def direction_accuracy(actual, forecast):
    """Return the percentage of steps on which both series move the same way.

    The direction of each step is the sign of the first difference
    (-1, 0 or +1); a step counts as correct when the two signs are equal.
    """
    actual_moves = np.sign(np.diff(actual))
    forecast_moves = np.sign(np.diff(forecast))
    matches = actual_moves == forecast_moves
    hits = np.sum(matches)
    n_moves = len(actual_moves)
    return hits / n_moves * 100

def rmse_reduction(rmse_model, rmse_rw):
    """Return the percentage by which ``rmse_model`` undercuts ``rmse_rw``.

    Positive values mean the model's RMSE is lower than the reference RMSE;
    negative values mean it is higher.
    """
    ratio = rmse_model / rmse_rw
    return (1 - ratio) * 100


def calculate_rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    Computed as the square root of sklearn's ``mean_squared_error``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def calculate_mae(y_true, y_pred):
    """Return the mean absolute error between ``y_true`` and ``y_pred``.

    Thin wrapper around sklearn's ``mean_absolute_error`` so that all metric
    helpers in this file share the ``calculate_*`` naming.
    """
    mae = mean_absolute_error(y_true, y_pred)
    return mae

def calculate_corr_dir(actual_values, predicted_values, last_value):
    """Return 1 if the forecast gets the direction of change right, else 0.

    The direction is measured from ``last_value`` to the *final* element of
    each series: the call scores 1 when ``sign(actual[-1] - last_value)``
    equals ``sign(predicted[-1] - last_value)``.

    Parameters
    ----------
    actual_values : sequence (pandas Series, ndarray, list)
        Observed values; only the last element is used.
    predicted_values : sequence (pandas Series, ndarray, list)
        Forecast values; only the last element is used.
    last_value : float or None
        Reference level both end points are compared against. ``None`` means
        no reference is available, in which case 1 is returned.

    Returns
    -------
    int
        1 for a matching direction (or missing reference), 0 otherwise.
    """
    # Test for None explicitly: a genuine reference level of 0.0 is falsy and
    # must not be mistaken for "no reference available".
    if last_value is None:
        return 1
    final_actual = np.asarray(actual_values)[-1]
    final_predicted = np.asarray(predicted_values)[-1]
    sign_act = np.sign(final_actual - last_value)
    sign_pred = np.sign(final_predicted - last_value)
    return 1 if sign_act == sign_pred else 0

def calculate_avg_dir_accuracy(values):
    """Return the mean of ``values`` expressed as a percentage.

    Parameters
    ----------
    values : sequence of numbers
        Typically 0/1 direction hits, one per forecast.

    Returns
    -------
    float
        ``sum(values) / len(values) * 100``, or NaN when ``values`` is empty
        (no forecast could be scored) instead of raising ZeroDivisionError.
    """
    count = len(values)
    if count == 0:
        return float("nan")
    return sum(values) / count * 100


def calculate_rmse_reduction(baseline_rmse, model_rmse):
    """Return the model's RMSE improvement over the baseline, in percent.

    Positive when ``model_rmse`` is below ``baseline_rmse``, negative when it
    is above.
    """
    improvement = baseline_rmse - model_rmse
    return improvement / baseline_rmse * 100





# ---------------------------------------------------------------------------
# Forecast combination and evaluation.
#
# For every (segment, horizon) pair this script
#   1. reads the three ML prediction files and the "_r" prediction file from
#      ./pred/,
#   2. aligns them row by row on the "Actual" column,
#   3. builds four forecast combinations, each as a simple average and as a
#      leading-eigenvector weighted sum,
#   4. appends the combined frame to ./mcs/{seg}_{hor}.csv and the summary
#      metrics to model_metrics.csv.
# ---------------------------------------------------------------------------

# Full experiment grid, kept for reference.
ALL_SEGMENTS = ["csz", "pmx", "smx"]
ALL_HORIZONS = [1, 2, 5, 10, 20]

# Subset evaluated by this run. model_metrics.csv is opened in append mode, so
# the grid can be processed in several partial runs.
segments = ["smx"]
horizons = [10, 20]

# Absolute tolerance when deciding whether two "Actual" values are the same
# observation.
delta = 2e-3

# Forecast columns entering each combination.
ECONOMETRIC_COLUMNS = ['VAR_pred', 'VECM_pred', 'ARIMA_pred']
ML_COLUMNS = {
    "pmx": ['MLP_pred_[]', 'LSTM_pred_[]',
            'MLP_pred_[2]', 'LSTM_pred_[2]', 'MLP_pred_[2, 4]', 'LSTM_pred_[2, 4]'],
    "csz": ['MLP_pred_[]', 'LSTM_pred_[]',
            'MLP_pred_[4]', 'LSTM_pred_[4]', 'MLP_pred_[2, 4]', 'LSTM_pred_[2, 4]'],
    "smx": ['MLP_pred_[]', 'LSTM_pred_[]',
            'MLP_pred_[2]', 'LSTM_pred_[2]', 'MLP_pred_[3, 4]', 'LSTM_pred_[3, 4]'],
}
SELECTED_COLUMNS = {
    "pmx": ['VECM_pred', 'LSTM_pred_[2, 4]'],
    "csz": ['VECM_pred', 'LSTM_pred_[2, 4]'],
    "smx": ['VECM_pred', 'LSTM_pred_[3, 4]'],
}


def _combination_columns(seg):
    """Return ``[(label, forecast_columns), ...]`` for the four combinations.

    1: selected models, 2: all models, 3: econometric models, 4: ML models.
    Raises KeyError for a segment without a configured column set.
    """
    return [
        (1, SELECTED_COLUMNS[seg]),
        (2, ECONOMETRIC_COLUMNS + ML_COLUMNS[seg]),
        (3, ECONOMETRIC_COLUMNS),
        (4, ML_COLUMNS[seg]),
    ]


def _rows_matching_reference(candidate_actual, reference_actual, atol):
    """Return positions in ``candidate_actual`` that pair up, in order, with
    the successive entries of ``reference_actual``.

    Candidate rows whose value is not within tolerance of the next unmatched
    reference value are skipped. At most ``len(reference_actual)`` positions
    are returned.
    """
    keep = []
    ref_pos = 0
    n_ref = len(reference_actual)
    for cand_pos, value in enumerate(candidate_actual):
        if ref_pos >= n_ref:
            break
        if np.isclose(value, reference_actual[ref_pos], atol=atol):
            keep.append(cand_pos)
            ref_pos += 1
    return keep


def _leading_eigenvector_weights(forecast_data):
    """Return the leading eigenvector of the forecast covariance matrix,
    rescaled so that its entries sum to one.

    ``forecast_data`` has one column per model (``rowvar=False``).
    """
    cov_matrix = np.cov(forecast_data, rowvar=False)
    # The covariance matrix is symmetric, so use eigh: it returns real
    # eigenvalues/eigenvectors, whereas eig may return complex ones.
    eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
    leading = eigenvectors[:, np.argmax(eigenvalues)]
    # NOTE(review): the weights are unstable if the entries of the leading
    # eigenvector nearly cancel (sum close to zero).
    return leading / np.sum(leading)


def _add_combination(frame, label, forecast_columns):
    """Add ``avg_{label}_*`` and ``eigen_{label}_*`` prediction and residual
    columns to ``frame`` in place."""
    forecast_data = frame[forecast_columns].to_numpy()

    frame[f'avg_{label}_pred'] = frame[forecast_columns].mean(axis=1)
    frame[f'avg_{label}_res'] = frame['Actual'] - frame[f'avg_{label}_pred']

    weights = _leading_eigenvector_weights(forecast_data)
    frame[f'eigen_{label}_pred'] = np.dot(forecast_data, weights)
    frame[f'eigen_{label}_res'] = frame['Actual'] - frame[f'eigen_{label}_pred']


for seg in segments:
    for hor in horizons:
        # ------------------------------------------------------------------
        # Load
        # ------------------------------------------------------------------
        df_ml_0 = pd.read_csv(f"./pred/{seg}_{hor}_0.csv")
        df_ml_1 = pd.read_csv(f"./pred/{seg}_{hor}_1.csv")
        df_ml_2 = pd.read_csv(f"./pred/{seg}_{hor}_2.csv")

        df_r = pd.read_csv(f"./pred/{seg}_{hor}_r.csv")
        # Harmonise the "_r" file's column names with the ML files.
        df_r = df_r.rename(columns={"Actuals": "Actual"})
        df_r.columns = df_r.columns.str.replace('_fcs', '_pred', regex=False)

        # ------------------------------------------------------------------
        # Align the longer frame to the shorter one on "Actual"
        # ------------------------------------------------------------------
        ml_long = len(df_ml_0) > len(df_r)
        if ml_long:
            longer_df, shorter_df = df_ml_0, df_r
        else:
            longer_df, shorter_df = df_r, df_ml_0

        # Every row is compared. The previous version advanced two rows at a
        # time, which left every other row unchecked and could drop the wrong
        # row when the surplus row sat at an unchecked position.
        keep = _rows_matching_reference(
            longer_df['Actual'].to_numpy(),
            shorter_df['Actual'].to_numpy(),
            delta,
        )
        longer_df = longer_df.iloc[keep].reset_index(drop=True)
        if ml_long:
            # The other two ML frames must lose exactly the same rows as
            # df_ml_0, otherwise the index-based merge below misaligns them.
            # (The previous version stored the result under a different name,
            # so the rows were never actually removed.)
            df_ml_1 = df_ml_1.iloc[keep].reset_index(drop=True)
            df_ml_2 = df_ml_2.iloc[keep].reset_index(drop=True)

        print("\nCleaned Longer DataFrame:")
        print(longer_df)

        # Diagnostic: total signed difference between the aligned "Actual"
        # columns; should be close to zero when the alignment is correct.
        cumsum = float(np.sum(
            longer_df["Actual"].to_numpy()
            - shorter_df["Actual"].to_numpy()[:len(longer_df)]
        ))
        print(cumsum)

        # ------------------------------------------------------------------
        # Merge on the (positional) index and drop duplicated columns
        # ------------------------------------------------------------------
        df_merged = pd.merge(longer_df, shorter_df, left_index=True, right_index=True, how="left", suffixes=("", "_ml"))
        df_merged = pd.merge(df_merged, df_ml_1, left_index=True, right_index=True, how="left", suffixes=("", "_1_ml"))
        df_merged = pd.merge(df_merged, df_ml_2, left_index=True, right_index=True, how="left", suffixes=("", "_2_ml"))

        # Columns that clashed during the merges carry an "..._ml" suffix.
        columns_to_keep = [col for col in df_merged.columns if not col.endswith('_ml')]
        df_final = df_merged[columns_to_keep]
        # Drop "Unnamed: N" columns; copy so that adding columns below does not
        # write into a view of df_merged.
        df_final = df_final.loc[:, ~df_final.columns.str.contains('^Unnamed')].copy()
        print(df_final)

        # df is the same object as df_final, as before: the combination columns
        # added below are visible through both names.
        df = df_final

        # ------------------------------------------------------------------
        # Forecast combinations
        # ------------------------------------------------------------------
        combinations = _combination_columns(seg)
        for label, forecast_columns in combinations:
            _add_combination(df, label, forecast_columns)
            print(df)

        df_comb2 = df.copy()

        # NOTE(review): append mode writes a second header and a second copy of
        # the rows if this (seg, hor) pair is processed again.
        df.to_csv(f'./mcs/{seg}_{hor}.csv', index=False, mode="a")

        # ------------------------------------------------------------------
        # Per-forecast metrics
        # ------------------------------------------------------------------
        model_names = [
            f"{kind}_{label}"
            for label, _ in combinations
            for kind in ("avg", "eigen")
        ]
        rw_rmses = []
        scores = {
            name: {"rmse": [], "mae": [], "mape": [], "corr_dir": []}
            for name in model_names
        }

        # The frame is treated as consecutive, non-overlapping blocks of `hor`
        # rows, one block per forecast. The previous version sliced
        # iloc[i:i+hor] for i in range(num_fcs), i.e. overlapping windows that
        # covered only the first num_fcs + hor - 1 rows.
        num_fcs = len(df) // hor
        for block in range(num_fcs):
            start = block * hor
            stop = start + hor

            # Reference level for the direction metric: the row just before
            # the block. The first block has none.
            # NOTE(review): assumes blocks are contiguous in time - confirm.
            last_value = None if block == 0 else df["Actual"].iloc[start - 1]
            act = df["Actual"].iloc[start:stop]

            rw_rmses.append(calculate_rmse(act, df["RW_pred"].iloc[start:stop]))

            for name in model_names:
                pred = df[f"{name}_pred"].iloc[start:stop]
                model_scores = scores[name]
                model_scores["rmse"].append(calculate_rmse(act, pred))
                model_scores["mae"].append(calculate_mae(act, pred))
                model_scores["mape"].append(calculate_mape(act, pred))
                # Explicit None test so that a reference level of 0.0 is scored.
                if last_value is not None:
                    model_scores["corr_dir"].append(calculate_corr_dir(act, pred, last_value))

        rw_rmse = np.mean(rw_rmses)

        # ------------------------------------------------------------------
        # Aggregate and persist
        # ------------------------------------------------------------------
        results = []
        for name in model_names:
            model_scores = scores[name]
            model_rmse = np.mean(model_scores["rmse"])
            # With fewer than two blocks no direction could be scored.
            if model_scores["corr_dir"]:
                model_corr_dir = calculate_avg_dir_accuracy(model_scores["corr_dir"])
            else:
                model_corr_dir = float("nan")
            results.append([
                seg,
                hor,
                name,
                model_rmse,
                np.mean(model_scores["mae"]),
                np.mean(model_scores["mape"]),
                model_corr_dir,
                calculate_rmse_reduction(rw_rmse, model_rmse),
            ])

        columns = ['Segment', 'Horizon', 'Model', 'RMSE', 'MAE', 'MAPE', 'CorrDir', 'Reduction']
        results_df = pd.DataFrame(results, columns=columns)

        # Appended without a header so partial runs accumulate in one file.
        results_df.to_csv('model_metrics.csv', index=False, header=False, mode="a")


Cleaned Longer DataFrame:
        Actual   VAR_pred  VECM_pred  ARIMA_pred    RW_pred   VAR_res  \
0    10.039809  10.070260  10.069503   10.070627  10.062967  0.030451   
1    10.009063  10.074693  10.072534   10.073461  10.062967  0.065630   
2     9.990261  10.077886  10.073864   10.075713  10.062967  0.087624   
3     9.973760  10.080112  10.074034   10.077438  10.062967  0.106353   
4     9.955653  10.082094  10.073480   10.078701  10.062967  0.126441   
..         ...        ...        ...         ...        ...       ...   
695   9.345745   9.335370   9.337410    9.351065   9.364434 -0.010375   
696   9.351927   9.330167   9.332860    9.349267   9.364434 -0.021760   
697   9.354441   9.325024   9.329179    9.347683   9.364434 -0.029417   
698   9.357380   9.320579   9.326449    9.346288   9.364434 -0.036802   
699   9.352968   9.316854   9.324503    9.345059   9.364434 -0.036113   

     VECM_res  ARIMA_res    RW_res  
0    0.029694   0.030818  0.023158  
1    0.063471   0.0643

In [None]:

#df_comb1 = df.copy()