In [12]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

In [13]:
def _rmse(predicted, observed):
    """Return the root-mean-square error between two equal-length 1-D sequences."""
    residuals = np.asarray(predicted, dtype=float) - np.asarray(observed, dtype=float)
    return np.sqrt(np.mean(np.square(residuals)))


def calculate_rmse(df, start_row=None, end_row=None):
    """Compute the overall RMSE and the RMSE over the highest-'Raw' 10% of rows.

    Both errors compare the 'Predicted' column against the 'Raw' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Predicted', 'Actual' and 'Raw'.
    start_row, end_row : int or None, optional
        Positional bounds of the rows to score, with ``df.iloc[start_row:end_row]``
        semantics (``end_row`` is exclusive). Either bound may be given on its
        own; ``None`` leaves that side open.

    Returns
    -------
    (overall_rmse, top_10_rmse) : tuple of float
        RMSE over all retained rows, and RMSE over the rows whose 'Raw' value
        is at or above the 90th percentile of 'Raw'.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If no rows remain after slicing and dropping NaN values.
    """
    # A slice with None bounds is a no-op, so this covers "no range", "both
    # bounds" and "only one bound" uniformly. Previously a single bound was
    # silently ignored and the whole frame was scored.
    df = df.iloc[start_row:end_row]

    # Drop rows with a NaN in any of the three columns.
    # NOTE(review): 'Actual' is not used in the error computation below, yet
    # rows missing it are excluded; kept to preserve which rows are scored --
    # confirm this is intended.
    df = df.dropna(subset=['Predicted', 'Actual', 'Raw'])
    if df.empty:
        raise ValueError("No rows left to score after slicing and dropping NaN values")

    overall_rmse = _rmse(df['Predicted'], df['Raw'])

    # Top 10%: rows whose 'Raw' value is >= the 90th percentile of 'Raw'.
    # Ties at the threshold are all included, so the subset can exceed 10%.
    top_10_percent_threshold = np.percentile(df['Raw'], 90)
    top_10_percent_data = df[df['Raw'] >= top_10_percent_threshold]
    top_10_rmse = _rmse(top_10_percent_data['Predicted'], top_10_percent_data['Raw'])

    return overall_rmse, top_10_rmse

In [15]:
# Names of the Butterworth cutoff variants to evaluate: l3-cutoff-01 .. l3-cutoff-05
files = [f"l3-cutoff-0{idx}" for idx in range(1, 6)]

# Score each variant's prediction CSV and report both error figures
for file in files:
    file_path = '../../source/l3_prediction_dataset/butterworth/' + file + '.csv'
    df = pd.read_csv(file_path)

    # No row range is passed, so the whole file is scored
    overall_rmse, top_10_rmse = calculate_rmse(df)

    print(
        f"Overall RMSE for {file}: {overall_rmse}",
        f"Top 10% RMSE for {file}: {top_10_rmse}\n",
        sep="\n",
    )

Overall RMSE for l3-cutoff-01: 0.045828484298954676
Top 10% RMSE for l3-cutoff-01: 0.07823614568941256

Overall RMSE for l3-cutoff-02: 0.050754842387351146
Top 10% RMSE for l3-cutoff-02: 0.07074152513919785

Overall RMSE for l3-cutoff-03: 0.0538750874853876
Top 10% RMSE for l3-cutoff-03: 0.08498984196181643

Overall RMSE for l3-cutoff-04: 0.057998510173104374
Top 10% RMSE for l3-cutoff-04: 0.10820989422034707

Overall RMSE for l3-cutoff-05: 0.05951853410729308
Top 10% RMSE for l3-cutoff-05: 0.10760690979670996

