# Index

* [Strategy](#strategy)
* [Finding the best NWP variables](#best_NWP)
    - [Best velocities `U` and `V`](#u_v)
    - [Beste temperature `T`](#T)
    - [Trying the best features found](#best_feat)
    - [Trying the features obtained form the average of all NWP](#average_feat)
* [Modeling](#modeling)
    - [Multivariate Adaptative Regression Splines (MARS)](#mars)
    - [K-Nearest Neighbohrs (KNN)](#knn)
    - [Suppor Vector Machines (SVM)](#svm)
    - [Xtreme Gradient Boosting (XGBoost)](#xgb)
    - [Artificial Neural Network (ANN)](#ann)
    

In [80]:
%load_ext autoreload
%autoreload 2

# Common libraries
import numpy as np
import pandas as pd
import os
import datetime as dt

from src.functions import data_import as dimp
from src.functions import data_exploration as dexp
from src.functions import data_transformation as dtr
from src.functions import metric

# Graphics
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import plotly as pty
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
cf.set_config_file(offline=True)

import re
from collections import OrderedDict

# Save images 
DIR = "../../TFM/reports/figures/"
WF = "WF1"
IMAGES_PATH = os.path.join(DIR, WF)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore warnings (SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [137]:
# Load data
%store -r X_train_cpy
%store -r X_test_cpy
%store -r y_train
%store -r y_test

In [138]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Strategy
<a id="strategy"></a>

These are the steps we're going to take:
1. Find out which NWP performs the best for each weather variable (except for `CLCT` that is only provided by NWP4 provides).
    - Varaibles `U` and `V`: it implies to train models and compare them for all the four NWP, as all provide these variables.
    - Variable `T`: it implies to compare models between NWP1 and NWP3, the ones who provide tempetarue.


2. Train models with standard parameters using the best weather variables from step 1.
3. For each variable, take the mean of all the NWP.
4. Train models with the variables obtained in step 3.
5. Compare the models in order to decide which one will be the one used to re-train original X_train and predicts on original X_test, for each WF.
    
These are the algorithms to be analyzed, using standard hyper-parameters here, to be tunned in next steps:
* KNN
* SVM
* XGBoost
* ANN
* MARS 

In [83]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.pipeline import Pipeline
from src.functions import metric
from sklearn.model_selection import cross_val_score
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import xgboost as xgb
from pyearth import Earth
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score, median_absolute_error, mean_squared_error
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor 
np.warnings.filterwarnings('ignore')

## Finding the best NWP variables
<a id="best_NWP"></a>

### Best velocities  `U` and `V`
<a id="u_v"></a>

In [139]:
# selecting column
y_train = y_train['Production']
y_test = y_test['Production']

In [85]:
def train_models_by_nwp(nwp):
    
    r2_test_scores = []
    
    # selecting columns by nwp
    X_train = X_train_cpy[[nwp + '_wshear', nwp + '_wdir']]
    X_test = X_test_cpy[[nwp + '_wshear', nwp + '_wdir']]
    
    # scaling features 
    pt = MinMaxScaler()
    X_train_trn = pt.fit_transform(X_train)
    X_test_trn = pt.fit_transform(X_test)
    
    # KNN model
    knn_reg = KNeighborsRegressor()
    knn_reg.fit(X_train_trn, y_train)

    y_train_pred = knn_reg.predict(X_train_trn)
    y_test_pred = knn_reg.predict(X_test_trn)

    knn_mse_train = mean_squared_error(y_train, y_train_pred)
    knn_mse_test  = mean_squared_error(y_test, y_test_pred)
    knn_r2_train = r2_score(y_train, y_train_pred)
    knn_r2_test = r2_score(y_test, y_test_pred)
    
    # SVM model
    svm_reg = SVR()
    svm_reg.fit(X_train_trn, y_train)

    y_train_pred = svm_reg.predict(X_train_trn)
    y_test_pred = svm_reg.predict(X_test_trn)

    svm_mse_train = mean_squared_error(y_train, y_train_pred)
    svm_mse_test  = mean_squared_error(y_test, y_test_pred)
    svm_r2_train = r2_score(y_train, y_train_pred)
    svm_r2_test = r2_score(y_test, y_test_pred)
    
    # XGBoost model
    xgb_reg = xgb.XGBRegressor()
    xgb_reg.fit(X_train, y_train)

    y_train_pred = xgb_reg.predict(X_train)
    y_test_pred = xgb_reg.predict(X_test)

    xgb_mse_train = mean_squared_error(y_train, y_train_pred)
    xgb_mse_test  = mean_squared_error(y_test, y_test_pred)
    xgb_r2_train = r2_score(y_train, y_train_pred)
    xgb_r2_test = r2_score(y_test, y_test_pred)
    
    # MARS model
    reg = Earth()
    reg.fit(X_train, y_train)

    y_train_pred = reg.predict(X_train)
    y_test_pred = reg.predict(X_test)

    mars_mse_train = mean_squared_error(y_train, y_train_pred)
    mars_mse_test  = mean_squared_error(y_test, y_test_pred)
    mars_r2_train = r2_score(y_train, y_train_pred)
    mars_r2_test = r2_score(y_test, y_test_pred) 
    
    r2_test_scores.append(knn_r2_test)
    r2_test_scores.append(svm_r2_test)
    r2_test_scores.append(xgb_r2_test)
    r2_test_scores.append(mars_r2_test)
    
    print('Test accuracies for {}'.format(nwp))
    print('KNN:     R^2 = %.3f, MSE = %.3f' % (knn_r2_test, knn_mse_test))
    print('SVM:     R^2 = %.3f, MSE = %.3f' % (svm_r2_test, svm_mse_test))
    print('XGBoost: R^2 = %.3f, MSE = %.3f' % (xgb_r2_test, xgb_mse_test))
    print('MARS:    R^2 = %.3f, MSE = %.3f' % (mars_r2_test, mars_mse_test))
    print('Mean:    R^2 = %3f' % (np.mean(r2_test_scores)))
    print('\n')
    
    return np.mean(r2_test_scores)
    

In [86]:
test_mean_scores = {}
for nwp in ['NWP1','NWP2','NWP3','NWP4']:
    test_mean_scores[nwp] = train_models_by_nwp(nwp)

    

Test accuracies for NWP1
KNN:     R^2 = 0.650, MSE = 4.078
SVM:     R^2 = 0.671, MSE = 3.830
XGBoost: R^2 = 0.681, MSE = 3.719
MARS:    R^2 = 0.671, MSE = 3.826
Mean:    R^2 = 0.668282


Test accuracies for NWP2
KNN:     R^2 = 0.620, MSE = 4.430
SVM:     R^2 = 0.663, MSE = 3.928
XGBoost: R^2 = 0.663, MSE = 3.923
MARS:    R^2 = 0.687, MSE = 3.644
Mean:    R^2 = 0.658166


Test accuracies for NWP3
KNN:     R^2 = 0.601, MSE = 4.648
SVM:     R^2 = 0.595, MSE = 4.712
XGBoost: R^2 = 0.642, MSE = 4.171
MARS:    R^2 = 0.631, MSE = 4.300
Mean:    R^2 = 0.617245


Test accuracies for NWP4
KNN:     R^2 = 0.393, MSE = 7.066
SVM:     R^2 = 0.406, MSE = 6.915
XGBoost: R^2 = 0.512, MSE = 5.684
MARS:    R^2 = 0.586, MSE = 4.826
Mean:    R^2 = 0.474254




The best NWP for variables `U` and `V` is NWP1.

### Best temperature `T`
<a id="T"></a>

Now will find with NWP, 1 or 3, provides the best forecast of `T`. We're using `U` and `V` from NWP1, as we saw they are the best forecasts among all.

In [87]:
def train_models_by_nwp(nwp):
    
    r2_test_scores = []
    
    # selecting columns by nwp
    X_train = X_train_cpy[['NWP1_wvel', 'NWP1_wdir', nwp + '_T']]
    X_test = X_test_cpy[['NWP1_wvel', 'NWP1_wdir', nwp + '_T']]
    
    # scaling features 
    pt = MinMaxScaler()
    X_train_trn = pt.fit_transform(X_train)
    X_test_trn = pt.fit_transform(X_test)
    
    # KNN model
    knn_reg = KNeighborsRegressor()
    knn_reg.fit(X_train_trn, y_train)

    y_train_pred = knn_reg.predict(X_train_trn)
    y_test_pred = knn_reg.predict(X_test_trn)

    knn_mse_train = mean_squared_error(y_train, y_train_pred)
    knn_mse_test  = mean_squared_error(y_test, y_test_pred)
    knn_r2_train = r2_score(y_train, y_train_pred)
    knn_r2_test = r2_score(y_test, y_test_pred)
    
    # SVM model
    svm_reg = SVR()
    svm_reg.fit(X_train_trn, y_train)

    y_train_pred = svm_reg.predict(X_train_trn)
    y_test_pred = svm_reg.predict(X_test_trn)

    svm_mse_train = mean_squared_error(y_train, y_train_pred)
    svm_mse_test  = mean_squared_error(y_test, y_test_pred)
    svm_r2_train = r2_score(y_train, y_train_pred)
    svm_r2_test = r2_score(y_test, y_test_pred)
    
    # XGBoost model
    xgb_reg = xgb.XGBRegressor()
    xgb_reg.fit(X_train, y_train)

    y_train_pred = xgb_reg.predict(X_train)
    y_test_pred = xgb_reg.predict(X_test)

    xgb_mse_train = mean_squared_error(y_train, y_train_pred)
    xgb_mse_test  = mean_squared_error(y_test, y_test_pred)
    xgb_r2_train = r2_score(y_train, y_train_pred)
    xgb_r2_test = r2_score(y_test, y_test_pred)
    
    # MARS model
    reg = Earth()
    reg.fit(X_train, y_train)

    y_train_pred = reg.predict(X_train)
    y_test_pred = reg.predict(X_test)

    mars_mse_train = mean_squared_error(y_train, y_train_pred)
    mars_mse_test  = mean_squared_error(y_test, y_test_pred)
    mars_r2_train = r2_score(y_train, y_train_pred)
    mars_r2_test = r2_score(y_test, y_test_pred) 
    
    r2_test_scores.append(knn_r2_test)
    r2_test_scores.append(svm_r2_test)
    r2_test_scores.append(xgb_r2_test)
    r2_test_scores.append(mars_r2_test)
    
    print('Test accuracies for {}'.format(nwp))
    print('KNN:     R^2 = %.3f, MSE = %.3f' % (knn_r2_test, knn_mse_test))
    print('SVM:     R^2 = %.3f, MSE = %.3f' % (svm_r2_test, svm_mse_test))
    print('XGBoost: R^2 = %.3f, MSE = %.3f' % (xgb_r2_test, xgb_mse_test))
    print('MARS:    R^2 = %.3f, MSE = %.3f' % (mars_r2_test, mars_mse_test))
    print('Mean:    R^2 = %3f' % (np.mean(r2_test_scores)))
    print('\n')
    
    return np.mean(r2_test_scores)
    

In [88]:
test_mean_scores = {}
for nwp in ['NWP1','NWP3']:
    test_mean_scores[nwp] = train_models_by_nwp(nwp)

Test accuracies for NWP1
KNN:     R^2 = 0.623, MSE = 4.391
SVM:     R^2 = 0.663, MSE = 3.923
XGBoost: R^2 = 0.587, MSE = 4.811
MARS:    R^2 = 0.652, MSE = 4.055
Mean:    R^2 = 0.631225


Test accuracies for NWP3
KNN:     R^2 = 0.600, MSE = 4.664
SVM:     R^2 = 0.664, MSE = 3.909
XGBoost: R^2 = 0.645, MSE = 4.129
MARS:    R^2 = 0.647, MSE = 4.112
Mean:    R^2 = 0.639100




We get a better R$^2$ using tempetature forescasts from NWP3. So, finally we'll use the following weather variables:
* NWP1_U
* NWP1_V
* NWP3_T
* NWP4_CLCT

### Trying the best features found
<a id="best_feat"></a>

In [89]:
# Making the data set from best features previously found
X_train = X_train_cpy[['NWP1_wvel', 'NWP1_wdir', 'NWP3_T', 'NWP4_CLCT']]
X_test = X_test_cpy[['NWP1_wvel', 'NWP1_wdir', 'NWP3_T', 'NWP4_CLCT']]

In [90]:
X_train.rename(columns={'NWP1_wvel': 'wind_vel', 'NWP1_wdir': 'wind_dir', 
                'NWP3_T': 'temperature', 'NWP4_CLCT': 'CLCT'}, inplace=True)

In [91]:
X_test.rename(columns={'NWP1_wvel': 'wind_vel', 'NWP1_wdir': 'wind_dir', 
                'NWP3_T': 'temperature', 'NWP4_CLCT': 'CLCT'}, inplace=True)

In [92]:
X_train.head()

Unnamed: 0,wind_vel,wind_dir,temperature,CLCT
0,3.958163,34.607537,12.85,82.5625
1,2.830924,59.276909,12.85,100.0
2,4.548818,312.285273,12.85,98.375
3,7.239816,329.200597,12.516667,94.875
4,6.647299,325.002463,12.183333,95.875


In [93]:
def train_models(X_train, X_test, y_train, y_test):
    #scaling features 
    pt = MinMaxScaler()
    X_train_trn = pt.fit_transform(X_train)
    X_test_trn = pt.fit_transform(X_test)
    
    # KNN model
    knn_reg = KNeighborsRegressor()
    knn_reg.fit(X_train_trn, y_train)
    
    y_train_pred = knn_reg.predict(X_train_trn)
    y_test_pred = knn_reg.predict(X_test_trn)
    
    knn_mse_train = mean_squared_error(y_train, y_train_pred)
    knn_mse_test  = mean_squared_error(y_test, y_test_pred)
    knn_r2_train = r2_score(y_train, y_train_pred)
    knn_r2_test = r2_score(y_test, y_test_pred)
    
    # SVM model
    svm_reg = SVR()
    svm_reg.fit(X_train_trn, y_train)
    
    y_train_pred = svm_reg.predict(X_train_trn)
    y_test_pred = svm_reg.predict(X_test_trn)
    
    svm_mse_train = mean_squared_error(y_train, y_train_pred)
    svm_mse_test  = mean_squared_error(y_test, y_test_pred)
    svm_r2_train = r2_score(y_train, y_train_pred)
    svm_r2_test = r2_score(y_test, y_test_pred)
    
    # XGBoost model
    xgb_reg = xgb.XGBRegressor()
    xgb_reg.fit(X_train, y_train)
    
    y_train_pred = xgb_reg.predict(X_train)
    y_test_pred = xgb_reg.predict(X_test)
    
    xgb_mse_train = mean_squared_error(y_train, y_train_pred)
    xgb_mse_test  = mean_squared_error(y_test, y_test_pred)
    xgb_r2_train = r2_score(y_train, y_train_pred)
    xgb_r2_test = r2_score(y_test, y_test_pred)
    
    # MARS model
    reg = Earth()
    reg.fit(X_train, y_train)
    
    y_train_pred = reg.predict(X_train)
    y_test_pred = reg.predict(X_test)
    
    mars_mse_train = mean_squared_error(y_train, y_train_pred)
    mars_mse_test  = mean_squared_error(y_test, y_test_pred)
    mars_r2_train = r2_score(y_train, y_train_pred)
    mars_r2_test = r2_score(y_test, y_test_pred) 
    
    r2_test_scores = []
    r2_test_scores.append(knn_r2_test)
    r2_test_scores.append(svm_r2_test)
    r2_test_scores.append(xgb_r2_test)
    r2_test_scores.append(mars_r2_test)
    
    print('Test accuracies'.format(nwp))
    print('KNN:     R^2 = %.3f, MSE = %.3f' % (knn_r2_test, knn_mse_test))
    print('SVM:     R^2 = %.3f, MSE = %.3f' % (svm_r2_test, svm_mse_test))
    print('XGBoost: R^2 = %.3f, MSE = %.3f' % (xgb_r2_test, xgb_mse_test))
    print('MARS:    R^2 = %.3f, MSE = %.3f' % (mars_r2_test, mars_mse_test))
    print('Mean:    R^2 = %3f' % (np.mean(r2_test_scores)))
    print('\n')
    
    del X_train
    del X_test

In [94]:
train_models(X_train, X_test, y_train, y_test)

Test accuracies
KNN:     R^2 = 0.578, MSE = 4.914
SVM:     R^2 = 0.652, MSE = 4.052
XGBoost: R^2 = 0.647, MSE = 4.107
MARS:    R^2 = 0.647, MSE = 4.112
Mean:    R^2 = 0.631102




### Trying the features obtained form the average of all NWP
<a id="average_feat"></a>

In [95]:
X_train_cpy.head()

Unnamed: 0,Time,ID,WF,NWP1_00h_D-2_U,NWP1_00h_D-2_V,NWP1_00h_D-2_T,NWP1_06h_D-2_U,NWP1_06h_D-2_V,NWP1_06h_D-2_T,NWP1_12h_D-2_U,NWP1_12h_D-2_V,NWP1_12h_D-2_T,NWP1_18h_D-2_U,NWP1_18h_D-2_V,NWP1_18h_D-2_T,NWP1_00h_D-1_U,NWP1_00h_D-1_V,NWP1_00h_D-1_T,NWP1_06h_D-1_U,NWP1_06h_D-1_V,NWP1_06h_D-1_T,NWP1_12h_D-1_U,NWP1_12h_D-1_V,NWP1_12h_D-1_T,NWP1_18h_D-1_U,NWP1_18h_D-1_V,NWP1_18h_D-1_T,NWP1_00h_D_U,NWP1_00h_D_V,NWP1_00h_D_T,NWP1_06h_D_U,NWP1_06h_D_V,NWP1_06h_D_T,NWP1_12h_D_U,NWP1_12h_D_V,NWP1_12h_D_T,NWP1_18h_D_U,NWP1_18h_D_V,NWP1_18h_D_T,NWP2_00h_D-2_U,NWP2_00h_D-2_V,NWP2_12h_D-2_U,NWP2_12h_D-2_V,NWP2_00h_D-1_U,NWP2_00h_D-1_V,NWP2_12h_D-1_U,NWP2_12h_D-1_V,NWP2_00h_D_U,NWP2_00h_D_V,NWP2_12h_D_U,NWP2_12h_D_V,NWP3_00h_D-2_U,NWP3_00h_D-2_V,NWP3_00h_D-2_T,NWP3_06h_D-2_U,NWP3_06h_D-2_V,NWP3_06h_D-2_T,NWP3_12h_D-2_U,NWP3_12h_D-2_V,NWP3_12h_D-2_T,NWP3_18h_D-2_U,NWP3_18h_D-2_V,NWP3_18h_D-2_T,NWP3_00h_D-1_U,NWP3_00h_D-1_V,NWP3_00h_D-1_T,NWP3_06h_D-1_U,NWP3_06h_D-1_V,NWP3_06h_D-1_T,NWP3_12h_D-1_U,NWP3_12h_D-1_V,NWP3_12h_D-1_T,NWP3_18h_D-1_U,NWP3_18h_D-1_V,NWP3_18h_D-1_T,NWP3_00h_D_U,NWP3_00h_D_V,NWP3_00h_D_T,NWP3_06h_D_U,NWP3_06h_D_V,NWP3_06h_D_T,NWP3_12h_D_U,NWP3_12h_D_V,NWP3_12h_D_T,NWP3_18h_D_U,NWP3_18h_D_V,NWP3_18h_D_T,NWP4_00h_D-2_U,NWP4_00h_D-2_V,NWP4_00h_D-2_CLCT,NWP4_12h_D-2_U,NWP4_12h_D-2_V,NWP4_12h_D-2_CLCT,NWP4_00h_D-1_U,NWP4_00h_D-1_V,NWP4_00h_D-1_CLCT,NWP4_12h_D-1_U,NWP4_12h_D-1_V,NWP4_12h_D-1_CLCT,NWP4_00h_D_U,NWP4_00h_D_V,NWP4_00h_D_CLCT,NWP4_12h_D_U,NWP4_12h_D_V,NWP4_12h_D_CLCT,NWP1_U,NWP1_V,NWP1_T,NWP2_U,NWP2_V,NWP3_U,NWP3_V,NWP3_T,NWP4_U,NWP4_V,NWP4_CLCT,NWP1_wvel,NWP1_wdir,NWP1_wdir_sin,NWP1_wdir_cos,NWP1_wshear,NWP2_wvel,NWP2_wdir,NWP2_wdir_sin,NWP2_wdir_cos,NWP2_wshear,NWP3_wvel,NWP3_wdir,NWP3_wdir_sin,NWP3_wdir_cos,NWP3_wshear,NWP4_wvel,NWP4_wdir,NWP4_wdir_sin,NWP4_wdir_cos,NWP4_wshear,hour,month,hour_sin,hour_cos,month_sin,month_cos
0,2018-05-01 01:00:00,1,WF1,,,,,,,,,,,,,,,,,,,,,,,,,-2.248047,-3.257812,286.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.254883,-0.289795,82.5625,,,,-2.248047,-3.257812,13.35,2.611328,-2.341797,-1.149414,-2.275391,12.85,1.254883,-0.289795,82.5625,3.958163,34.607537,0.567952,0.823062,3.592109,3.50757,311.885242,-0.744484,0.667641,3.183187,2.549226,26.800633,0.450887,0.892581,2.313472,1.28791,283.003572,-0.974356,0.225012,1.613397,1,5,0.258819,0.965926,0.5,-0.866025
1,2018-05-01 02:00:00,2,WF1,,,,,,,,,,,,,,,,,,,,,,,,,-2.433594,-1.446289,286.25,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.490234,-0.41333,100.0,,,,-2.433594,-1.446289,13.1,2.611328,-2.341797,-1.149414,-2.275391,12.85,2.490234,-0.41333,100.0,2.830924,59.276909,0.859646,0.510889,2.569118,3.50757,311.885242,-0.744484,0.667641,3.183187,2.549226,26.800633,0.450887,0.892581,2.313472,2.524304,279.42406,-0.986503,0.16374,3.162259,2,5,0.5,0.866025,0.5,-0.866025
2,2018-05-01 03:00:00,3,WF1,,,,,,,,,,,,,,,,,,,,,,,,,3.365234,-3.060547,285.75,,,,,,,,,,,,,,,,,,2.611328,-2.341797,,,-2.953125,-2.277344,287.0,-0.098267,1.709961,287.0,3.826172,2.892578,286.0,-1.888672,0.326172,287.0,-5.820312,2.181641,287.0,-5.816406,2.222656,287.0,-3.796875,-0.853516,287.0,-1.707031,-1.460938,286.0,-1.149414,-2.275391,286.0,,,,,,,,,,,,,,,,,,,,,,0.99707,-1.415039,98.375,,,,3.365234,-3.060547,12.6,2.611328,-2.341797,-1.149414,-2.275391,12.85,0.99707,-1.415039,98.375,4.548818,312.285273,-0.739804,0.672822,4.12814,3.50757,311.885242,-0.744484,0.667641,3.183187,2.549226,26.800633,0.450887,0.892581,2.313472,1.731036,324.830561,-0.575996,0.817452,2.168512,3,5,0.707107,0.707107,0.5,-0.866025
3,2018-05-01 04:00:00,4,WF1,,,,,,,,,,,,,,,,,,,,,,,,,3.707031,-6.21875,284.75,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.689453,-0.961426,94.875,,,,3.707031,-6.21875,11.6,3.178385,-2.783203,-0.519206,-2.721354,12.516667,0.689453,-0.961426,94.875,7.239816,329.200597,-0.512034,0.858965,6.570271,4.224731,311.207521,-0.752328,0.658788,3.834024,2.770441,10.801618,0.187409,0.982282,2.514228,1.183083,324.355114,-0.58276,0.812644,1.482078,4,5,0.866025,0.5,0.5,-0.866025
4,2018-05-01 05:00:00,5,WF1,,,,,,,,,,,,,,,,,,,,,,,,,3.8125,-5.445312,284.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.291016,-0.294922,95.875,,,,3.8125,-5.445312,11.35,3.745443,-3.224609,0.111003,-3.167318,12.183333,0.291016,-0.294922,95.875,6.647299,325.002463,-0.573541,0.819177,6.032551,4.942312,310.726546,-0.757832,0.65245,4.485243,3.169262,357.992819,-0.035025,0.999386,2.876166,0.41433,315.381966,-0.702377,0.711805,0.519041,5,5,0.965926,0.258819,0.5,-0.866025


In [96]:
X_train = X_train_cpy[['NWP1_U', 'NWP1_V', 'NWP1_T',
                      'NWP2_U','NWP2_V',
                      'NWP3_U','NWP3_V','NWP3_T',
                      'NWP4_U','NWP4_V','NWP4_CLCT']]

X_test = X_test_cpy[['NWP1_U', 'NWP1_V', 'NWP1_T',
                      'NWP2_U','NWP2_V',
                      'NWP3_U','NWP3_V','NWP3_T',
                      'NWP4_U','NWP4_V','NWP4_CLCT']]

X_train['U'] = (X_train_cpy.NWP1_U + X_train_cpy.NWP2_U + X_train_cpy.NWP3_U + X_train_cpy.NWP4_U)/4
X_train['V'] = (X_train_cpy.NWP1_V + X_train_cpy.NWP2_V + X_train_cpy.NWP3_V + X_train_cpy.NWP4_V)/4
X_train['T'] = (X_train_cpy.NWP1_T + X_train_cpy.NWP3_T)/2
X_train['CLCT'] = X_train_cpy.NWP4_CLCT

X_test['U'] = (X_test_cpy.NWP1_U + X_test_cpy.NWP2_U + X_test_cpy.NWP3_U)/3
X_test['V'] = (X_test_cpy.NWP1_V + X_test_cpy.NWP2_V + X_test_cpy.NWP3_V)/3
X_test['T'] = (X_test_cpy.NWP1_T + X_test_cpy.NWP3_T)/2
X_test['CLCT'] = X_test_cpy.NWP4_CLCT

X_train = X_train[['U','V','T','CLCT']]
X_test = X_test[['U','V','T','CLCT']]

In [97]:
X_train.head()

Unnamed: 0,U,V,T,CLCT
0,0.117188,-2.041199,13.1,82.5625
1,0.379639,-1.619202,12.975,100.0
2,1.456055,-2.273193,12.725,98.375
3,1.763916,-3.171183,12.058333,94.875
4,1.98999,-3.03304,11.766667,95.875


In [98]:
pre_process = ColumnTransformer(remainder = 'passthrough',
                                transformers = [('drop_columns', 'drop', ['U','V'])
                                               ])

prepare_data_pipeline = Pipeline(steps=[
    ('attr_adder', dtr.NewFeaturesAdder(add_time_feat=False, add_cycl_feat=False, add_inv_T=False)),
    ('pre_process', pre_process)
])

In [99]:
X_train = prepare_data_pipeline.fit_transform(X_train)
X_test = prepare_data_pipeline.transform(X_test)

In [100]:
train_models(X_train, X_test, y_train, y_test)

Test accuracies
KNN:     R^2 = 0.607, MSE = 4.578
SVM:     R^2 = 0.687, MSE = 3.650
XGBoost: R^2 = 0.607, MSE = 4.577
MARS:    R^2 = 0.587, MSE = 4.807
Mean:    R^2 = 0.621954




Finally we got the best results for the following features, that are the ones we're using for finding the best algorithm for WF1:
* NWP1_U 
* NWP1_V
* NWP3_T
* NWP4_CLCT

In [101]:
# Definitive data sets
X_train = X_train_cpy[['ID','Time','NWP1_U', 'NWP1_V', 'NWP3_T', 'NWP4_CLCT']]
X_test = X_test_cpy[['ID','Time','NWP1_U', 'NWP1_V', 'NWP3_T', 'NWP4_CLCT']]

X_train.rename(columns={'NWP1_U': 'U', 'NWP1_V': 'V', 
                        'NWP3_T': 'T', 'NWP4_CLCT': 'CLCT'}, inplace=True)

X_test.rename(columns={'NWP1_U': 'U', 'NWP1_V': 'V', 
                        'NWP3_T': 'T', 'NWP4_CLCT': 'CLCT'}, inplace=True)

In [102]:
X_train.head()

Unnamed: 0,ID,Time,U,V,T,CLCT
0,1,2018-05-01 01:00:00,-2.248047,-3.257812,12.85,82.5625
1,2,2018-05-01 02:00:00,-2.433594,-1.446289,12.85,100.0
2,3,2018-05-01 03:00:00,3.365234,-3.060547,12.85,98.375
3,4,2018-05-01 04:00:00,3.707031,-6.21875,12.516667,94.875
4,5,2018-05-01 05:00:00,3.8125,-5.445312,12.183333,95.875


In [136]:
X_train.to_csv('../../data/interim/for_modeling_by_WF/X_train.csv', index=False)
X_test.to_csv('../../data/interim/for_modeling_by_WF/X_test.csv', index=False)
y_train.to_csv('../../data/interim/for_modeling_by_WF/y_train.csv', index=False)
y_test.to_csv('../../data/interim/for_modeling_by_WF/test.csv', index=False)

## Modeling
<a id="modeling"></a>

### Multiple Adaptative Regression Splines (MARS)
<a id="mars"></a>

In [160]:
# pre-process data
pre_process = ColumnTransformer(remainder = 'passthrough',
                                transformers = [('drop_columns', 'drop', ['ID','Time','U','V'])
                                               ])

prepare_data_pipeline = Pipeline(steps=[
    ('attr_adder', dtr.NewFeaturesAdder(add_time_feat=True, add_cycl_feat=False, add_inv_T=False)),
    ('pre_process', pre_process)
])

X_train_pped = prepare_data_pipeline.fit_transform(X_train)
X_test_pped = prepare_data_pipeline.transform(X_test)

In [161]:
# MARS model

from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.metrics.scorer import make_scorer
from sklearn import metrics
from sklearn.model_selection import cross_validate

reg = Earth()
tscv= TimeSeriesSplit(n_splits=10)

r2_scorer = make_scorer(r2_score, greater_is_better=True)
cape_scorer = make_scorer(metric.get_cape, greater_is_better=False)

scores = cross_val_score(reg, X_train_pped, y_train, scoring=cape_scorer, cv=tscv)
mars_scores = -scores

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
    
display_scores(mars_scores)

Scores: [83.67355962 36.12902762 45.16037828 54.1096935  62.77247554 45.67103855
 66.6259479  36.70843742 67.74725501 37.07475881]
Mean: 53.56725722541474
Standard deviation: 15.350444645780824
