In [15]:
import pandas as pd 
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [16]:
df_preds = pd.read_csv("C:/Users/nicol/OneDrive/Master/Goldsmiths/Thesis/weather-prediction/data/df_with_all_preds.csv")

In [17]:
df_preds.head()

Unnamed: 0.1,Unnamed: 0,temp,dew,humidity,precip,precipprob,precipcover,snow,snowdepth,windgust,...,preds_gs,sktime_naive,naive_last,preds_autoarima,preds_sarimax,lstm_base,lstm_two_layers,lstm_two_layers_batch_size_1,lstm_features,sarimax_gs
0,2023-05-22,15.1,6.9,64.3,0.0,0.0,0.0,0.0,0.0,26.1,...,13.432552,12.189839,12.189839,13.984044,15.701231,15.777449,15.368996,13.841642,12.066167,1.085501
1,2023-05-23,14.0,6.1,60.6,0.0,0.0,0.0,0.0,0.0,36.4,...,14.786562,12.189839,12.189839,13.895126,15.439488,16.630573,15.308199,13.047831,12.066167,1.225803
2,2023-05-24,15.8,6.7,61.2,0.0,0.0,0.0,0.0,0.0,34.7,...,15.185232,12.189839,12.189839,13.759057,16.274314,17.326608,15.238746,12.342554,12.066167,4.254934
3,2023-05-25,15.0,6.7,57.9,0.0,0.0,0.0,0.0,0.0,41.5,...,13.373943,12.189839,12.189839,13.725513,16.052129,17.891799,15.16725,11.64898,12.066167,3.296934
4,2023-05-26,14.2,6.9,63.5,0.0,0.0,0.0,0.0,0.0,26.1,...,14.683416,12.189839,12.189839,13.632718,14.740708,18.356468,15.097821,10.958488,12.066167,0.700043


In [18]:
df_preds.columns

Index(['Unnamed: 0', 'temp', 'dew', 'humidity', 'precip', 'precipprob',
       'precipcover', 'snow', 'snowdepth', 'windgust', 'windspeed', 'winddir',
       'sealevelpressure', 'cloudcover', 'visibility', 'solarradiation',
       'solarenergy', 'uvindex', 'severerisk', 'moonphase',
       'clusters_description', 'year', 'month_1', 'month_2', 'month_3',
       'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9',
       'month_10', 'month_11', 'month_12', 'preds', 'preds_gs', 'sktime_naive',
       'naive_last', 'preds_autoarima', 'preds_sarimax', 'lstm_base',
       'lstm_two_layers', 'lstm_two_layers_batch_size_1', 'lstm_features',
       'sarimax_gs'],
      dtype='object')

In [19]:
columns_todrop = ['Unnamed: 0', 'dew', 'humidity', 'precip', 'precipprob',
       'precipcover', 'snow', 'snowdepth', 'windgust', 'windspeed', 'winddir',
       'sealevelpressure', 'cloudcover', 'visibility', 'solarradiation',
       'solarenergy', 'uvindex', 'severerisk', 'moonphase',
       'clusters_description', 'year', 'month_1', 'month_2', 'month_3',
       'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9',
       'month_10', 'month_11', 'month_12']
df_preds = df_preds.drop(columns_todrop, axis=1)
df_preds.head()

Unnamed: 0,temp,preds,preds_gs,sktime_naive,naive_last,preds_autoarima,preds_sarimax,lstm_base,lstm_two_layers,lstm_two_layers_batch_size_1,lstm_features,sarimax_gs
0,15.1,14.492051,13.432552,12.189839,12.189839,13.984044,15.701231,15.777449,15.368996,13.841642,12.066167,1.085501
1,14.0,17.870735,14.786562,12.189839,12.189839,13.895126,15.439488,16.630573,15.308199,13.047831,12.066167,1.225803
2,15.8,16.441523,15.185232,12.189839,12.189839,13.759057,16.274314,17.326608,15.238746,12.342554,12.066167,4.254934
3,15.0,14.017682,13.373943,12.189839,12.189839,13.725513,16.052129,17.891799,15.16725,11.64898,12.066167,3.296934
4,14.2,13.380795,14.683416,12.189839,12.189839,13.632718,14.740708,18.356468,15.097821,10.958488,12.066167,0.700043


In [20]:
df_preds.columns

Index(['temp', 'preds', 'preds_gs', 'sktime_naive', 'naive_last',
       'preds_autoarima', 'preds_sarimax', 'lstm_base', 'lstm_two_layers',
       'lstm_two_layers_batch_size_1', 'lstm_features', 'sarimax_gs'],
      dtype='object')

In [21]:
def calculate_metrics(actual, predicted):
    mae = mean_absolute_error(actual, predicted)
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(actual, predicted)
   #calculate MAPE
    mask = actual != 0  
    mape = np.mean(((actual[mask] - predicted[mask]) / actual[mask])) * 100
    
    return mae, mse, rmse, r2, mape

metrics_list = []

# calculate metrics for each model 
for col in df_preds.columns:
    if col != 'temp':
        mae, mse, rmse, r2, mape = calculate_metrics(df_preds['temp'], df_preds[col])
        metrics_list.append({
            'Model': col,
            'MAE': mae,
            'MSE': mse,
            'RMSE': rmse,
            'R2': r2,
            'MAPE': mape
        })

metrics_df = pd.DataFrame(metrics_list)


In [22]:
metrics_by_mae = metrics_df.sort_values(by='MAE', ascending=True)

In [23]:
metrics_by_mae

Unnamed: 0,Model,MAE,MSE,RMSE,R2,MAPE
1,preds_gs,2.103032,7.130397,2.67028,0.741301,-12.492555
0,preds,2.386807,8.827414,2.971096,0.679731,-7.501623
7,lstm_two_layers,4.194984,27.362728,5.230939,0.007249,-12.841021
2,sktime_naive,4.421406,28.217105,5.311978,-0.023749,-44.973731
3,naive_last,4.421406,28.217105,5.311978,-0.023749,-44.973731
9,lstm_features,4.429446,28.432498,5.332213,-0.031564,-43.50291
4,preds_autoarima,4.440994,27.79048,5.271668,-0.008271,-60.464816
8,lstm_two_layers_batch_size_1,6.326612,59.461907,7.711155,-1.157347,13.045202
10,sarimax_gs,10.032825,126.466961,11.245753,-3.588367,22.349657
5,preds_sarimax,28.006129,1045.717374,32.337554,-36.939833,-431.363625


In [24]:
#sorted by MAPE

In [25]:
metrics_by_mape = metrics_df.sort_values(by='MAPE')

In [26]:
metrics_by_mape

Unnamed: 0,Model,MAE,MSE,RMSE,R2,MAPE
6,lstm_base,42.981212,2096.584465,45.788475,-75.066503,-606.598101
5,preds_sarimax,28.006129,1045.717374,32.337554,-36.939833,-431.363625
4,preds_autoarima,4.440994,27.79048,5.271668,-0.008271,-60.464816
2,sktime_naive,4.421406,28.217105,5.311978,-0.023749,-44.973731
3,naive_last,4.421406,28.217105,5.311978,-0.023749,-44.973731
9,lstm_features,4.429446,28.432498,5.332213,-0.031564,-43.50291
7,lstm_two_layers,4.194984,27.362728,5.230939,0.007249,-12.841021
1,preds_gs,2.103032,7.130397,2.67028,0.741301,-12.492555
0,preds,2.386807,8.827414,2.971096,0.679731,-7.501623
8,lstm_two_layers_batch_size_1,6.326612,59.461907,7.711155,-1.157347,13.045202


In [27]:
# mae for xgb with gs and lstm with feature mape