In [3]:
import pandas as pd 
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [4]:
df_preds = pd.read_csv(r"C:\Users\nicol\OneDrive\Master\Goldsmiths\Thesis\weather-prediction\data\df_with_all_preds.csv")

In [5]:
df_preds.head()

Unnamed: 0,ds,temp,dew,humidity,precip,precipprob,precipcover,snow,snowdepth,windgust,...,preds_gs,sktime_naive,preds_autoarima,preds_sarimax,lstm_base,lstm_two_layers,lstm_two_layers_batch_size_1,lstm_features,prophet_baseline,sarimax_gs
0,2023-05-22,15.1,6.9,64.3,0.0,0.0,0.0,0.0,0.0,26.1,...,13.432552,14.401869,13.984044,15.701231,14.622599,15.16249,15.655559,9.15113,15.369008,1.085501
1,2023-05-23,14.0,6.1,60.6,0.0,0.0,0.0,0.0,0.0,36.4,...,14.786562,14.403738,13.895126,15.439488,14.595062,15.061334,16.471144,9.6914,15.581018,1.225803
2,2023-05-24,15.8,6.7,61.2,0.0,0.0,0.0,0.0,0.0,34.7,...,15.185232,14.405607,13.759057,16.274314,14.49039,14.948617,16.924369,9.946012,15.804391,4.254934
3,2023-05-25,15.0,6.7,57.9,0.0,0.0,0.0,0.0,0.0,41.5,...,13.373943,14.407477,13.725513,16.052129,14.415214,14.8397,17.228202,9.265878,15.929066,3.296934
4,2023-05-26,14.2,6.9,63.5,0.0,0.0,0.0,0.0,0.0,26.1,...,14.683416,14.409346,13.632718,14.740708,14.378917,14.734895,17.481015,9.883703,15.979924,0.700043


In [6]:
df_preds.columns

Index(['ds', 'temp', 'dew', 'humidity', 'precip', 'precipprob', 'precipcover',
       'snow', 'snowdepth', 'windgust', 'windspeed', 'winddir',
       'sealevelpressure', 'cloudcover', 'visibility', 'solarradiation',
       'solarenergy', 'uvindex', 'severerisk', 'moonphase',
       'clusters_description', 'year', 'month_1', 'month_2', 'month_3',
       'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9',
       'month_10', 'month_11', 'month_12', 'preds', 'preds_gs', 'sktime_naive',
       'preds_autoarima', 'preds_sarimax', 'lstm_base', 'lstm_two_layers',
       'lstm_two_layers_batch_size_1', 'lstm_features', 'prophet_baseline',
       'sarimax_gs'],
      dtype='object')

In [7]:
columns_todrop = ['ds', 'dew', 'humidity', 'precip', 'precipprob',
       'precipcover', 'snow', 'snowdepth', 'windgust', 'windspeed', 'winddir',
       'sealevelpressure', 'cloudcover', 'visibility', 'solarradiation',
       'solarenergy', 'uvindex', 'severerisk', 'moonphase',
       'clusters_description', 'year', 'month_1', 'month_2', 'month_3',
       'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9',
       'month_10', 'month_11', 'month_12']
df_preds = df_preds.drop(columns_todrop, axis=1)
df_preds.head()

Unnamed: 0,temp,preds,preds_gs,sktime_naive,preds_autoarima,preds_sarimax,lstm_base,lstm_two_layers,lstm_two_layers_batch_size_1,lstm_features,prophet_baseline,sarimax_gs
0,15.1,14.492051,13.432552,14.401869,13.984044,15.701231,14.622599,15.16249,15.655559,9.15113,15.369008,1.085501
1,14.0,17.870735,14.786562,14.403738,13.895126,15.439488,14.595062,15.061334,16.471144,9.6914,15.581018,1.225803
2,15.8,16.441523,15.185232,14.405607,13.759057,16.274314,14.49039,14.948617,16.924369,9.946012,15.804391,4.254934
3,15.0,14.017682,13.373943,14.407477,13.725513,16.052129,14.415214,14.8397,17.228202,9.265878,15.929066,3.296934
4,14.2,13.380795,14.683416,14.409346,13.632718,14.740708,14.378917,14.734895,17.481015,9.883703,15.979924,0.700043


In [8]:
def calculate_metrics(actual, predicted):
    mae = mean_absolute_error(actual, predicted)
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(actual, predicted)
   #calculate MAPE
    mask = actual != 0  
    mape = np.mean(((actual[mask] - predicted[mask]) / actual[mask])) * 100
    
    return mae, mse, rmse, r2, mape

metrics_list = []

# calculate metrics for each model 
for col in df_preds.columns:
    if col != 'temp':
        mae, mse, rmse, r2, mape = calculate_metrics(df_preds['temp'], df_preds[col])
        metrics_list.append({
            'Model': col,
            'MAE': mae,
            'MSE': mse,
            'RMSE': rmse,
            'R2': r2,
            'MAPE': mape
        })

metrics_df = pd.DataFrame(metrics_list)


In [15]:
metrics_by_mae = metrics_df.sort_values(by='MAE', ascending=True)

In [16]:
metrics_by_mae

Unnamed: 0,Model,MAE,MSE,RMSE,R2,MAPE
1,preds_gs,2.103032,7.130397,2.67028,0.741301,-12.492555
9,prophet_baseline,2.364277,8.068908,2.840582,0.707251,-21.040123
0,preds,2.386807,8.827414,2.971096,0.679731,-7.501623
8,lstm_features,2.860478,12.768812,3.573347,0.536733,-25.320188
5,lstm_base,4.076612,26.389814,5.137102,0.042547,-24.708019
6,lstm_two_layers,4.337684,29.8066,5.459542,-0.081418,-15.13724
3,preds_autoarima,4.440994,27.79048,5.271668,-0.008271,-60.464816
2,sktime_naive,4.736848,31.844905,5.643129,-0.15537,-76.035466
7,lstm_two_layers_batch_size_1,6.748019,67.374131,8.208175,-1.444411,-129.596221
10,sarimax_gs,10.032825,126.466961,11.245753,-3.588367,22.349657


In [11]:
#sorted by MAPE

In [12]:
metrics_by_mape = metrics_df.sort_values(by='MAPE', ascending=False)

In [13]:
metrics_by_mape

Unnamed: 0,Model,MAE,MSE,RMSE,R2,MAPE
10,sarimax_gs,10.032825,126.466961,11.245753,-3.588367,22.349657
0,preds,2.386807,8.827414,2.971096,0.679731,-7.501623
1,preds_gs,2.103032,7.130397,2.67028,0.741301,-12.492555
6,lstm_two_layers,4.337684,29.8066,5.459542,-0.081418,-15.13724
9,prophet_baseline,2.364277,8.068908,2.840582,0.707251,-21.040123
5,lstm_base,4.076612,26.389814,5.137102,0.042547,-24.708019
8,lstm_features,2.860478,12.768812,3.573347,0.536733,-25.320188
3,preds_autoarima,4.440994,27.79048,5.271668,-0.008271,-60.464816
2,sktime_naive,4.736848,31.844905,5.643129,-0.15537,-76.035466
7,lstm_two_layers_batch_size_1,6.748019,67.374131,8.208175,-1.444411,-129.596221


In [14]:
# mae for xgb with gs and lstm with feature mape