In [7]:
%load_ext autoreload
%autoreload 2

# Common libraries
import numpy as np
import pandas as pd
import os
import datetime as dt

from src.functions import data_import as dimp
from src.functions import data_exploration as dexp
from src.functions import data_transformation as dtr
from src.functions import metric
from src.functions import utils

# Graphics
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import plotly as pty
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
cf.set_config_file(offline=True)

import re
from collections import OrderedDict

# Save images 
DIR = "../../TFM/reports/figures/"
WF = "WF1"
IMAGES_PATH = os.path.join(DIR, WF)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore warnings (SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Algunas pruebas con diferentes modelos

Para establecer un punto de referencia o *benchmark*, probaremos distintos modelos sin profundizar mucho en el tuning de los hiperparámetros. El objetivo es paulatinamente mejorar el rendimiento de los modelos, a base de modificar las fases del proceso (EDA, Feature Engineering, Hyperparameter Tuning, ...).

Elegiremos para conformar nuestro set de entrenamiento:
* Un solo NWP.
* Run de las 00h.
* Día D-1.
* Variables meteorológicas de partida `time`, `U`, `V` y `T`.

In [3]:
# Load data
DATA_PATH = 'C:/Users/Quark/Documents/Mis Cosas/Master/Asignaturas/TFM/Git/TFM/data/raw/'
X_train = dimp.import_data(os.path.join(DATA_PATH, 'X_train_v2.csv'))
X_test = dimp.import_data(os.path.join(DATA_PATH, 'X_test_v2.csv'))
Y_train = dimp.import_data(os.path.join(DATA_PATH, 'Y_train.csv'))

X_train['Time'] = pd.to_datetime(X_train['Time'], format='%d/%m/%Y %H:%M')
X_test['Time'] = pd.to_datetime(X_test['Time'], format='%d/%m/%Y %H:%M')

Memory usage of dataframe is 29.94 MB
Memory usage after optimization is: 7.72 MB
Decreased by 74.2%
Memory usage of dataframe is 29.26 MB



invalid value encountered in less


invalid value encountered in less



Memory usage after optimization is: 19.47 MB
Decreased by 33.5%
Memory usage of dataframe is 0.57 MB
Memory usage after optimization is: 0.21 MB
Decreased by 62.5%


In [4]:
def input_missing_values(df, cols):
    
    regex = 'NWP(?P<NWP>\d{1})_(?P<run>\d{2}h)_(?P<fc_day>D\W?\d?)_(?P<weather_var>\w{1,4})'
    p = re.compile(regex)  
    
    NWP_met_vars_dict = {
        '1': ['U','V','T'],
        '2': ['U','V'],
        '3': ['U','V','T'],
        '4': ['U','V','CLCT']
    }
    
    for col in reversed(cols):
        m = p.match(col)
        col_name = 'NWP' + m.group('NWP') + '_' +  m.group('run') + '_' + m.group('fc_day') + '_' + m.group('weather_var')

        for key, value in NWP_met_vars_dict.items():
            for i in value:
                if m.group('NWP') == key and m.group('weather_var') == i:
                    df['NWP'+ key + '_' + i] = df['NWP'+ key + '_' + i].fillna(df[col_name])
    
    return df

In [5]:
from metpy import calc
from metpy.units import units

# function to obtain the module of wind velocity
get_wind_velmod = lambda x : float(calc.wind_speed(
    x.U * units.meter/units.second, 
    x.V * units.meter/units.second
).magnitude)

# function to obtain the wind direction
get_wind_dir = lambda x : float(calc.wind_direction(
    x.U * units.meter/units.second, 
    x.V * units.meter/units.second, 
    convention="from"
).magnitude)

## Regresión con KNN

In [28]:
# Automatización de pre-procesado, modelización, validación y prediccion
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor 
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from src.functions import metric
from sklearn.model_selection import cross_val_score
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from sklearn.neighbors import KNeighborsRegressor
import re
from collections import OrderedDict
from sklearn.model_selection import TimeSeriesSplit
from sklearn.svm import OneClassSVM
from sklearn.metrics import r2_score, median_absolute_error
import hdbscan


# Lista de WFs
WF_lst = X_train['WF'].unique()

# lista para guardar los modelos y los scores de cada WF
models = []
scores_list = []

# Data frame para guardar las predicciones (ID, Production)
submission_df = pd.DataFrame([], columns=['ID','Production']) 

for WF in WF_lst:
    # Creamos copia de los datos para no perder su formato inicial
    X_train_cpy = X_train.copy()
    Y_train_cpy = Y_train.copy()
    X_test_cpy = X_test.copy()

    # Selección de filas por WF
    X_train_cpy = X_train_cpy[X_train_cpy['WF'] == WF]
    X_test_cpy = X_test_cpy[X_test_cpy['WF'] == WF]

    # Identificador de observaciones
    ID_train = X_train_cpy['ID']
    ID_test = X_test_cpy['ID']

    # Seleccion de las filas de Y_train
    Y_train_cpy = Y_train_cpy['Production']
    Y_train_cpy = Y_train_cpy.loc[ID_train.values - 1]

    # Pre-procesado de los datos en bruto

    # We'll add new columns NWPX_<met_var> without missing values
    # new cols: NWP1_U, NWP1_V, NWP1_T, NWP2_U, NWP2_V, NWP3_U, NWP3_V, NWP3_T, NWP4_U, NWP4_V, NWP4_CLCT
    
    new_cols = ['NWP1_U','NWP1_V','NWP1_T','NWP2_U',
                'NWP2_V','NWP3_U','NWP3_V','NWP3_T',
                'NWP4_U','NWP4_V','NWP4_CLCT']

    def add_new_cols(new_cols, df):
        for col in new_cols:
            df[col] = np.nan    
            
    add_new_cols(new_cols, X_train_cpy)
    add_new_cols(new_cols, X_test_cpy)
    
    cols_train = X_train.columns[3:]
    cols_test = X_test.columns[3:-9]
    X_train_cpy = input_missing_values(X_train_cpy, cols_train)
    X_test_cpy = input_missing_values(X_test_cpy, cols_test)
    
    col_list = ['NWP2_U','NWP2_V','NWP3_U','NWP3_V','NWP3_T']
    X_train_cpy.index = X_train_cpy['Time']
    
    del X_train_cpy['Time']
        
    for var in col_list:
        X_train_cpy[var].interpolate(
            method='time', 
            inplace=True,
            limit=2,
            limit_direction='both'
        )
        
    X_train_cpy.reset_index(inplace=True)
    
    X_test_cpy.index = X_test_cpy['Time']
    
    del X_test_cpy['Time']
        
    for var in col_list:
        X_test_cpy[var].interpolate(
            method='time', 
            inplace=True,
            limit=100,
            limit_direction='both'
        )
    
        
    X_test_cpy.reset_index(inplace=True) 
    

    X_train_cpy['U'] = X_train_cpy.NWP1_U
    X_train_cpy['V'] = X_train_cpy.NWP1_V
    X_train_cpy['T'] = X_train_cpy.NWP3_T
    X_train_cpy['CLCT'] = X_train_cpy.NWP4_CLCT
    
    X_test_cpy['U'] = X_test_cpy.NWP1_U
    X_test_cpy['V'] = X_test_cpy.NWP1_V
    X_test_cpy['T'] = X_test_cpy.NWP3_T
    X_test_cpy['CLCT'] = X_test_cpy.NWP4_CLCT
 
    X_train_cpy = X_train_cpy[['ID','Time','U','V','T','CLCT']]
    X_test_cpy = X_test_cpy[['ID','Time','U','V','T','CLCT']]
    
    # Hay 11 valores perdidos en la columna CLCT en X_test_cpy. Los imputamos
    X_test_cpy.fillna(method='bfill', limit=11, inplace=True)

    
    ####### Limpiar outliers y valores anómalos #######
    
    # valores negativos en CLCT
    X_train_cpy.loc[X_train_cpy['CLCT'] < 0, 'CLCT'] = 0.0
    X_test_cpy.loc[X_test_cpy['CLCT'] < 0, 'CLCT'] = 0.0

    # añadir columna Production
    X_train_cpy['Production'] = Y_train_cpy.to_list()
    
    # calcular módulo velocidad del viento
    X_train_cpy['vel'] = X_train_cpy.apply(get_wind_velmod, axis=1)
    
    # formar matriz de datos 
    X1 = X_train_cpy['vel'].values.reshape(-1,1)
    X2 = X_train_cpy['Production'].values.reshape(-1,1)
    X = np.concatenate((X1,X2), axis=1)
    
    # algoritmo para detección de outliers
    clusterer = hdbscan.HDBSCAN(min_cluster_size=20).fit(X)
    threshold = pd.Series(clusterer.outlier_scores_).quantile(0.96)
    outliers = np.where(clusterer.outlier_scores_ > threshold)[0]
    
    # Eliminamos los registros outliers 
    X_train_cpy.drop(X_train_cpy.index[list(outliers)], inplace=True)
    
    # Eliminamos las observaciones corresp#ondientes de Y_train
    Y_train_cpy = Y_train_cpy.loc[X_train_cpy['ID'].values - 1]
    
    # Eliminamos las columnas 'vel' y 'Production'
    del X_train_cpy['vel']
    del X_train_cpy['Production']
    
    ##################################

    # Pre-procesado de los datos
    pre_process = ColumnTransformer(remainder = 'passthrough',
                                    transformers = [('drop_columns', 'drop', ['ID','Time','U','V','T','month','hour'])]
    )
    

    # definir pipeline
    prepare_data_pipeline = Pipeline(steps=[
        ('attr_adder', dtr.NewFeaturesAdder()), 
        ('pre_processing', pre_process),
        ('power_transf', PowerTransformer())
    ])
    
    # aplicar pipeline
    X_train_pped = prepare_data_pipeline.fit_transform(X_train_cpy)
    

    print('Nº features antes de la seleccion: ', X_train_pped.shape[1])
    
    # seleccionar features más importantes mediante Random Forest
    rf = RandomForestRegressor(random_state=42)
    rf_reg = rf.fit(X_train_pped, Y_train_cpy)
    sel = SelectFromModel(rf_reg, prefit=True, threshold='1.25*median')
    X_train_pped = sel.transform(X_train_pped)
    
    print('Nº features después de la seleccion: ', X_train_pped.shape[1])

    # creamos nuestro scorer basado en el CAPE
    cape_scorer = make_scorer(metric.get_cape, greater_is_better=False)

    # Modelización KNN: implementación en GridSearchCV
    param_grid = [
        {
            'n_neighbors': list(range(1,50,2)),
            'algorithm':['auto', 'kd_tree'],
            'weights': ['uniform','distance'],
            'p': [1,2]
        }
    ]
    
    btscv = utils.BlockingTimeSeriesSplit(n_splits=5)
    knn_reg = KNeighborsRegressor()
    grid_search_knn = GridSearchCV(
        knn_reg, 
        param_grid, 
        cv= btscv,
        n_jobs=-1,
        scoring=cape_scorer
    )

    grid_search_knn.fit(X_train_pped, Y_train_cpy)
    
    # Reentrenamos sin validación cruzada utilizando los mejores 
    # parámetros obtenidos con la validación cruzada
    
    knn_reg2 = KNeighborsRegressor(algorithm=grid_search_knn.best_params_['algorithm'],
                                   n_neighbors=grid_search_knn.best_params_['n_neighbors'],
                                   p=grid_search_knn.best_params_['p'], 
                                   weights=grid_search_knn.best_params_['weights'])
    
    ttreg = TransformedTargetRegressor(regressor=knn_reg2, 
                                       transformer=StandardScaler(), 
                                       check_inverse=False)

    ttreg.fit(X_train_pped, Y_train_cpy)

    # evaluación sobre el conjunto de test
    X_test_pped = prepare_data_pipeline.transform(X_test_cpy)
    X_test_pped = sel.transform(X_test_pped)
    predictions = ttreg.predict(X_test_pped)
    
    # generamos matriz de predicciones (ID,Production)
    pred_matrix = np.stack((np.array(ID_test).astype(int), predictions), axis=-1)
    df_pred = pd.DataFrame(data=pred_matrix, columns=['ID','Production'])

    # añadimos las predicciones al dataframe final que contendrá las de todas las WF
    submission_df = submission_df.append(df_pred, ignore_index=True) 
    print('R2 for {} is {}'.format(WF, ttreg.score(X_train_pped, Y_train_cpy)))
    print('Predictions for {} has been added to submission_df'.format(WF))
    
    scores_list.append(ttreg.score(X_train_pped, Y_train_cpy))
    
global_score = np.mean(scores_list)

print('********************************************')
print('Global score: ', global_score)
print('********************************************')


# generamos fichero csv para el submission
submission_df.to_csv("../../TFM/models/submission_knn_time_only.csv", index=False, sep=",") 

Nº features antes de la seleccion:  10
Nº features después de la seleccion:  4
R2 for WF1 is 0.8674065749207006
Predictions for WF1 has been added to submission_df
Nº features antes de la seleccion:  10
Nº features después de la seleccion:  4
R2 for WF2 is 0.9999998968002413
Predictions for WF2 has been added to submission_df
Nº features antes de la seleccion:  10
Nº features después de la seleccion:  3
R2 for WF3 is 0.999999901860768
Predictions for WF3 has been added to submission_df
Nº features antes de la seleccion:  10
Nº features después de la seleccion:  4
R2 for WF4 is 0.9999999035646198
Predictions for WF4 has been added to submission_df
Nº features antes de la seleccion:  10
Nº features después de la seleccion:  4
R2 for WF5 is 0.9999998978820784
Predictions for WF5 has been added to submission_df
Nº features antes de la seleccion:  10
Nº features después de la seleccion:  3
R2 for WF6 is 0.9999999174813817
Predictions for WF6 has been added to submission_df
*****************

In [43]:
X_train_pped.shape

(5989, 7)

In [30]:
ttreg.score(X_train_pped, Y_train_cpy)

<bound method RegressorMixin.score of TransformedTargetRegressor(check_inverse=False, func=None, inverse_func=None,
                           regressor=KNeighborsRegressor(algorithm='auto',
                                                         leaf_size=30,
                                                         metric='minkowski',
                                                         metric_params=None,
                                                         n_jobs=None,
                                                         n_neighbors=28, p=1,
                                                         weights='uniform'),
                           transformer=PowerTransformer(copy=True,
                                                        method='yeo-johnson',
                                                        standardize=True))>

In [25]:
np.array(Y_test_cpy).shape

NameError: name 'Y_test_cpy' is not defined

In [15]:
ttreg = TransformedTargetRegressor(regressor=knn_reg(algorithm='auto',
                                                    metric='minkowski',
                                                    n_neighbors=28,
                                                    p=1, weights='uniform'), func=np.log, inverse_func=np.exp)

ttreg.fit()

Unnamed: 0,ID,Time,U,V,T,w_vel,w_dir
0,1,2018-05-01 01:00:00,-2.248047,-3.257812,286.50,3.958163,34.607537
1,2,2018-05-01 02:00:00,-2.433594,-1.446289,286.25,2.830924,59.276909
2,3,2018-05-01 03:00:00,3.365234,-3.060547,285.75,4.548818,312.285273
3,4,2018-05-01 04:00:00,3.707031,-6.218750,284.75,7.239816,329.200597
4,5,2018-05-01 05:00:00,3.812500,-5.445312,284.50,6.647299,325.002463
...,...,...,...,...,...,...,...
6234,6235,2019-01-15 20:00:00,-2.546875,-7.687500,280.25,8.098409,18.330076
6235,6236,2019-01-15 21:00:00,-3.097656,-6.777344,279.75,7.451702,24.563269
6236,6237,2019-01-15 22:00:00,-3.261719,-5.277344,279.50,6.203964,31.718582
6237,6238,2019-01-15 23:00:00,-3.345703,-3.085938,279.25,4.551565,47.312845


## Prueba 1: Regresión polinomial con regularización *ridge*

In [6]:
# Automatización de pre-procesado, modelización, validación y prediccion
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from src.functions import metric
from sklearn.model_selection import cross_val_score
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model as lm
from sklearn.linear_model import RidgeCV

# Lista de WFs
WF_lst = X_train['WF'].unique()

# lista para guardar los modelos, uno por WF
models = []

# Data frame para guardar las predicciones (ID, Production)
submission_df = pd.DataFrame([], columns=['ID','Production']) 

for WF in WF_lst:
    # Creamos copia de los datos para no perder su formato inicial
    X_train_cpy = X_train.copy()
    Y_train_cpy = Y_train.copy()
    X_test_cpy = X_test.copy()

    # Selección de filas por WF
    X_train_cpy = X_train_cpy[X_train_cpy['WF'] == WF]
    X_test_cpy = X_test_cpy[X_test_cpy['WF'] == WF]

    # Identificador de observaciones
    ID_train = X_train_cpy['ID']
    ID_test = X_test_cpy['ID']

    # Seleccion de las filas de Y_train
    Y_train_cpy = Y_train_cpy['Production']
    Y_train_cpy = Y_train_cpy.loc[ID_train.values - 1]

    # Pre-procesado de los datos en bruto

    # We'll add new columns NWPX_<met_var> without missing values
    # new cols: NWP1_U, NWP1_V, NWP1_T, NWP2_U, NWP2_V, NWP3_U, NWP3_V, NWP3_T, NWP4_U, NWP4_V, NWP4_CLCT
    
    new_cols = ['NWP1_U','NWP1_V','NWP1_T','NWP2_U',
                'NWP2_V','NWP3_U','NWP3_V','NWP3_T',
                'NWP4_U','NWP4_V','NWP4_CLCT']

    def add_new_cols(new_cols, df):
        for col in new_cols:
            df[col] = np.nan    
            
    add_new_cols(new_cols, X_train_cpy)
    add_new_cols(new_cols, X_test_cpy)
    
    cols_train = X_train.columns[3:]
    cols_test = X_test.columns[3:-9]
    X_train_cpy = input_missing_values(X_train_cpy, cols_train)
    X_test_cpy = input_missing_values(X_test_cpy, cols_test)
    
    col_list = ['NWP2_U','NWP2_V','NWP3_U','NWP3_V','NWP3_T']
    X_train_cpy.index = X_train_cpy['Time']
    
    del X_train_cpy['Time']
        
    for var in col_list:
        X_train_cpy[var].interpolate(
            method='time', 
            inplace=True,
            limit=2,
            limit_direction='both'
        )

    
    X_train_cpy.reset_index(inplace=True)

    X_test_cpy.index = X_test_cpy['Time']
    
    del X_test_cpy['Time']
        
    for var in col_list:
        X_test_cpy[var].interpolate(
            method='time', 
            inplace=True,
            limit=100,
            limit_direction='both'
        )
    
        
    X_test_cpy.reset_index(inplace=True) 

    X_train_cpy['U'] = (X_train_cpy.NWP1_U + X_train_cpy.NWP2_U + X_train_cpy.NWP3_U + X_train_cpy.NWP4_U)/4
    X_train_cpy['V'] = (X_train_cpy.NWP1_V + X_train_cpy.NWP2_V + X_train_cpy.NWP3_V + X_train_cpy.NWP4_V)/4
    X_train_cpy['T'] = (X_train_cpy.NWP1_T + X_train_cpy.NWP3_T)/2
    X_train_cpy['CLCT'] = (X_train_cpy.NWP4_CLCT)
    X_train_cpy['T2'] = np.sqrt(X_train_cpy['T'])
    
    X_test_cpy['U'] = (X_test_cpy.NWP1_U + X_test_cpy.NWP2_U + X_test_cpy.NWP3_U)/3
    X_test_cpy['V'] = (X_test_cpy.NWP1_V + X_test_cpy.NWP2_V + X_test_cpy.NWP3_V)/3
    X_test_cpy['T'] = (X_test_cpy.NWP1_T + X_test_cpy.NWP3_T)/2
    X_test_cpy['CLCT'] = (X_test_cpy.NWP4_CLCT)
    X_test_cpy['T2'] = np.sqrt(X_test_cpy['T'])

    
    
    X_train_cpy = X_train_cpy[['Time','U','V','T','CLCT','T2']]
    X_test_cpy = X_test_cpy[['Time','U','V','T','CLCT','T2']]
    
    # Hay 11 valores perdidos en la columna CLCT en X_test_cpy. Los imputamos con la moda
    X_test_cpy.fillna(method='bfill', limit=11, inplace=True)


    # Pre-procesado de los datos
    pre_process = ColumnTransformer(remainder = 'passthrough',
                                    transformers = [('drop_columns', 'drop',
                                                     ['Time','U','V','T','CLCT','T2'])]
    )

    # definir pipeline
    prepare_data_pipeline = Pipeline(steps=[
        ('attr_adder', dtr.DerivedAttributesAdder(add_time_feat=False, add_cyclic_feat=False, add_vel_pot=False)), 
        ('pre_processing', pre_process),
        ('std_scaler', StandardScaler())
    ])

    # aplicar pipeline
    X_train_pped = prepare_data_pipeline.fit_transform(X_train_cpy)

    # creamos nuestro scorer basado en el CAPE
    cape_scorer = make_scorer(metric.get_cape, greater_is_better=False)

    # Modelización: Regresión polinomial utilizando CV con regularización tipo Ride
    poly_features = PolynomialFeatures(degree=3, include_bias=False)
    X_train_poly = poly_features.fit_transform(X_train_pped)
    
    rreg = lm.RidgeCV(alphas=np.logspace(-4, -3, 3, 4), store_cv_values=True)
    rreg.fit(X_train_poly, Y_train_cpy)
    
    # guardamos modelo
    models.append(joblib.dump(rreg, WF + '_rreg'))

    # evaluación sobre el conjunto de test
    X_test_pped = prepare_data_pipeline.transform(X_test_cpy)
    X_test_poly = poly_features.fit_transform(X_test_pped)
    predictions = rreg.predict(X_test_poly)
    
    # generamos matriz de predicciones (ID,Production)
    pred_matrix = np.stack((np.array(ID_test).astype(int), predictions), axis=-1)
    df_pred = pd.DataFrame(data=pred_matrix, columns=['ID','Production'])

    # añadimos las predicciones al dataframe final que contendrá las de todas las WF
    submission_df = submission_df.append(df_pred, ignore_index=True) 
    print('Predictions for {} has been added to submission_df'.format(WF))
    print('RMSE for {} is {}'.format(WF, np.mean(np.sqrt(rreg.cv_values_))))

# generamos fichero csv para el submission
submission_df.to_csv("../../TFM/models/submission_rreg.csv", index=False, sep=",") 

AttributeError: module 'src.functions.data_transformation' has no attribute 'DerivedAttributesAdder'

In [None]:
X_train_cpy

In [None]:
nans = pd.DataFrame(data=np.isnan(X_test_pped).flatten())

## Prueba 2: Random Forest 

In [None]:
# Automatización de pre-procesado, modelización, validación y prediccion
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from src.functions import metric
from sklearn.model_selection import cross_val_score
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.externals import joblib
import re
from collections import OrderedDict


# Lista de WFs
WF_lst = X_train['WF'].unique()

# lista para guardar los modelos, uno por WF
models = []

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Data frame para guardar las predicciones (ID, Production)
submission_df = pd.DataFrame([], columns=['ID','Production']) 

for WF in WF_lst:
    # Creamos copia de los datos para no perder su formato inicial
    X_train_cpy = X_train.copy()
    Y_train_cpy = Y_train.copy()
    X_test_cpy = X_test.copy()

    # Selección de filas por WF
    X_train_cpy = X_train_cpy[X_train_cpy['WF'] == WF]
    X_test_cpy = X_test_cpy[X_test_cpy['WF'] == WF]

    # Identificador de observaciones
    ID_train = X_train_cpy['ID']
    ID_test = X_test_cpy['ID']

    # Seleccion de las filas de Y_train
    Y_train_cpy = Y_train_cpy['Production']
    Y_train_cpy = Y_train_cpy.loc[ID_train.values - 1]

    # Pre-procesado de los datos en bruto

    # We'll add new columns NWPX_<met_var> without missing values
    # new cols: NWP1_U, NWP1_V, NWP1_T, NWP2_U, NWP2_V, NWP3_U, NWP3_V, NWP3_T, NWP4_U, NWP4_V, NWP4_CLCT
    
    new_cols = ['NWP1_U','NWP1_V','NWP1_T','NWP2_U',
                'NWP2_V','NWP3_U','NWP3_V','NWP3_T',
                'NWP4_U','NWP4_V','NWP4_CLCT']

    def add_new_cols(new_cols, df):
        for col in new_cols:
            df[col] = np.nan    
            
    add_new_cols(new_cols, X_train_cpy)
    add_new_cols(new_cols, X_test_cpy)
    
    cols_train = X_train_cpy.columns[3:-11]
    cols_test = X_test_cpy.columns[3:-11]
    X_train_cpy = input_missing_values(X_train_cpy, cols_train)
    X_test_cpy = input_missing_values(X_test_cpy, cols_train)
    
    col_list = ['NWP2_U','NWP2_V','NWP3_U','NWP3_V','NWP3_T']
    X_train_cpy.index = X_train_cpy['Time']
    
    del X_train_cpy['Time']
        
    for var in col_list:
        X_train_cpy[var].interpolate(
            method='time', 
            inplace=True,
            limit=2,
            limit_direction='both'
        )

    
    X_train_cpy.reset_index(inplace=True)

    X_test_cpy.index = X_test_cpy['Time']
    
    del X_test_cpy['Time']
        
    for var in col_list:
        X_test_cpy[var].interpolate(
            method='time', 
            inplace=True,
            limit=100,
            limit_direction='both'
        )
    
        
    X_test_cpy.reset_index(inplace=True) 

    X_train_cpy['U'] = (X_train_cpy.NWP1_U + X_train_cpy.NWP2_U + X_train_cpy.NWP3_U + X_train_cpy.NWP4_U)/4
    X_train_cpy['V'] = (X_train_cpy.NWP1_V + X_train_cpy.NWP2_V + X_train_cpy.NWP3_V + X_train_cpy.NWP4_V)/4
    X_train_cpy['T'] = (X_train_cpy.NWP1_T + X_train_cpy.NWP3_T)/2
    X_train_cpy['CLCT'] = (X_train_cpy.NWP4_CLCT)
    
    X_test_cpy['U'] = (X_test_cpy.NWP1_U + X_test_cpy.NWP2_U + X_test_cpy.NWP3_U)/3
    X_test_cpy['V'] = (X_test_cpy.NWP1_V + X_test_cpy.NWP2_V + X_test_cpy.NWP3_V)/3
    X_test_cpy['T'] = (X_test_cpy.NWP1_T + X_test_cpy.NWP3_T)/2
    X_test_cpy['CLCT'] = (X_test_cpy.NWP4_CLCT)

    
    
    X_train_cpy = X_train_cpy[['Time','U','V','T','CLCT']]
    X_test_cpy = X_test_cpy[['Time','U','V','T','CLCT']]

    
    ## Limpieza de datos: imputar valores perdidos en X_
    #only_na = X_test_cpy[~X_test_cpy.ID.isin(X_test_cpy.dropna().ID)]
    #X_test_cpy.dropna(inplace=True)
    #Y_train_cpy.drop(labels=only_na.index, axis=0, inplace=True)
    #X_train_cpy.drop(labels=only_na.index, axis=0, inplace=True)

    # Pre-procesado de los datos
    pre_process = ColumnTransformer(remainder = 'passthrough',
                                    transformers = [('drop_columns', 'drop', 
                                                     ['Time','U','V','hour','month','day_of_week'])]
    )

    # definir pipeline
    prepare_data_pipeline = Pipeline(steps=[
        ('attr_adder', dtr.DerivedAttributesAdder(add_time_feat=True, add_cyclic_feat=False)), 
        ('pre_processing', pre_process),
        ('power_transf', PowerTransformer(method='yeo-johnson', standardize=True))
    ])

    # aplicar pipeline
    X_train_pped = prepare_data_pipeline.fit_transform(X_train_cpy)

    # creamos nuestro scorer basado en el CAPE
    cape_scorer = make_scorer(metric.get_cape, greater_is_better=False)


    # Modelización: Random Forest
    # Entrenamiento del modelo mediante k-fold cross validation
    # Búsqueda de hiperparámetros mediante Gridsearch 
    # param_grid = [
    #    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    #    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
    #]

    forest_reg = RandomForestRegressor()
    grid_search = GridSearchCV(
        forest_reg, 
        random_grid, 
        cv=5,
        scoring=cape_scorer,
        n_jobs=4
    )

    grid_search.fit(X_train_pped, Y_train_cpy)
    final_model = grid_search.best_estimator_

    # guardamos modelo
    #models.append(joblib.dump(final_model, WF))

    # evaluación sobre el conjunto de test
    X_test_pped = prepare_data_pipeline.transform(X_test_cpy)
    predictions = final_model.predict(X_test_pped)

    # generamos matriz de predicciones (ID,Production)
    pred_matrix = np.stack((np.array(ID_test).astype(int), predictions), axis=-1)
    df_pred = pd.DataFrame(data=pred_matrix, columns=['ID','Production'])

    # añadimos las predicciones al dataframe final que contendrá las de todas las WF
    submission_df = submission_df.append(df_pred, ignore_index=True)  
    print('Best score for {}: {}'.format(WF, -grid_search.best_score_))
    print('Predictions for {} has been added to submission_df'.format(WF))
    print('--------')

# generamos fichero csv para el submission
submission_df.to_csv("../../TFM/models/submission_RF.csv", index=False, sep=",") 

## Prueba 3: SVM 

In [23]:
# Automatización de pre-procesado, modelización, validación y prediccion
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from src.functions import metric
from sklearn.model_selection import cross_val_score
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.externals import joblib
import re
from collections import OrderedDict
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectFromModel


# Lista de WFs
WF_lst = X_train['WF'].unique()

# lista para guardar los modelos, uno por WF
models = []

# Data frame para guardar las predicciones (ID, Production)
submission_df = pd.DataFrame([], columns=['ID','Production']) 



for WF in WF_lst:
    # Creamos copia de los datos para no perder su formato inicial
    X_train_cpy = X_train.copy()
    Y_train_cpy = Y_train.copy()
    X_test_cpy = X_test.copy()

    # Selección de filas por WF
    X_train_cpy = X_train_cpy[X_train_cpy['WF'] == WF]
    X_test_cpy = X_test_cpy[X_test_cpy['WF'] == WF]

    # Identificador de observaciones
    ID_train = X_train_cpy['ID']
    ID_test = X_test_cpy['ID']

    # Seleccion de las filas de Y_train
    Y_train_cpy = Y_train_cpy['Production']
    Y_train_cpy = Y_train_cpy.loc[ID_train.values - 1]

    # Pre-procesado de los datos en bruto

    # We'll add new columns NWPX_<met_var> without missing values
    # new cols: NWP1_U, NWP1_V, NWP1_T, NWP2_U, NWP2_V, NWP3_U, NWP3_V, NWP3_T, NWP4_U, NWP4_V, NWP4_CLCT
    
    new_cols = ['NWP1_U','NWP1_V','NWP1_T','NWP2_U',
                'NWP2_V','NWP3_U','NWP3_V','NWP3_T',
                'NWP4_U','NWP4_V','NWP4_CLCT']

    def add_new_cols(new_cols, df):
        for col in new_cols:
            df[col] = np.nan    
            
    add_new_cols(new_cols, X_train_cpy)
    add_new_cols(new_cols, X_test_cpy)
    
    cols_train = X_train.columns[3:]
    cols_test = X_test.columns[3:-9]
    X_train_cpy = input_missing_values(X_train_cpy, cols_train)
    X_test_cpy = input_missing_values(X_test_cpy, cols_test)
    
    col_list = ['NWP2_U','NWP2_V','NWP3_U','NWP3_V','NWP3_T']
    X_train_cpy.index = X_train_cpy['Time']
    
    del X_train_cpy['Time']
        
    for var in col_list:
        X_train_cpy[var].interpolate(
            method='time', 
            inplace=True,
            limit=2,
            limit_direction='both'
        )

    
    X_train_cpy.reset_index(inplace=True)

    X_test_cpy.index = X_test_cpy['Time']
    
    del X_test_cpy['Time']
        
    for var in col_list:
        X_test_cpy[var].interpolate(
            method='time', 
            inplace=True,
            limit=100,
            limit_direction='both'
        )
    
        
    X_test_cpy.reset_index(inplace=True) 

    X_train_cpy['U'] = X_train_cpy.NWP1_U
    X_train_cpy['V'] = X_train_cpy.NWP1_V
    X_train_cpy['T'] = X_train_cpy.NWP3_T
    X_train_cpy['CLCT'] = X_train_cpy.NWP4_CLCT
    
    X_test_cpy['U'] = X_test_cpy.NWP1_U
    X_test_cpy['V'] = X_test_cpy.NWP1_V
    X_test_cpy['T'] = X_test_cpy.NWP3_T
    X_test_cpy['CLCT'] = X_test_cpy.NWP4_CLCT
 
    X_train_cpy = X_train_cpy[['ID','Time','U','V','T','CLCT']]
    X_test_cpy = X_test_cpy[['ID','Time','U','V','T','CLCT']]
    
    # Hay 11 valores perdidos en la columna CLCT en X_test_cpy. Los imputamos con la moda
    X_test_cpy.fillna(method='bfill', limit=11, inplace=True)
    
    
    ####### Limpiar outliers y valores anómalos #######
    
    # valores negativos en CLCT
    X_train_cpy.loc[X_train_cpy['CLCT'] < 0, 'CLCT'] = 0.0
    X_test_cpy.loc[X_test_cpy['CLCT'] < 0, 'CLCT'] = 0.0

    # añadir columna Production
    X_train_cpy['Production'] = Y_train_cpy.to_list()
    
    # calcular módulo velocidad del viento
    X_train_cpy['vel'] = X_train_cpy.apply(get_wind_velmod, axis=1)
    
    # formar matriz de datos 
    X1 = X_train_cpy['vel'].values.reshape(-1,1)
    X2 = X_train_cpy['Production'].values.reshape(-1,1)
    X = np.concatenate((X1,X2), axis=1)
    
    # algoritmo para detección de outliers
    clusterer = hdbscan.HDBSCAN(min_cluster_size=20).fit(X)
    threshold = pd.Series(clusterer.outlier_scores_).quantile(0.96)
    outliers = np.where(clusterer.outlier_scores_ > threshold)[0]
    
    # Eliminamos los registros outliers 
    X_train_cpy.drop(X_train_cpy.index[list(outliers)], inplace=True)
    
    # Eliminamos las observaciones corresp#ondientes de Y_train
    Y_train_cpy = Y_train_cpy.loc[X_train_cpy['ID'].values - 1]
    
    # Eliminamos las columnas 'vel' y 'Production'
    del X_train_cpy['vel']
    del X_train_cpy['Production']
    
    ###################################
    
    # Pre-procesado de los datos
    pre_process = ColumnTransformer(remainder = 'passthrough',
                                    transformers = [('drop_columns', 'drop', 
                                                     ['ID','Time','U','V','CLCT','month'])]
    )

    # definir pipeline
    prepare_data_pipeline = Pipeline(steps=[
        ('attr_adder', dtr.NewFeaturesAdder(add_time_feat=True, add_cycl_feat=False)), 
        ('pre_processing', pre_process),
        ('std_scaler', PowerTransformer())
    ])

    # aplicar pipeline
    X_train_pped = prepare_data_pipeline.fit_transform(X_train_cpy)
    
    
    
    '''
    print('Nº features antes de la seleccion: ', X_train_pped.shape[1])
    # seleccionar features más importantes mediante Random Forest
    rf = RandomForestRegressor(random_state=42)
    rf_reg = rf.fit(X_train_pped, Y_train_cpy)
    sel = SelectFromModel(rf_reg, prefit=True, threshold='1.75*median')
    X_train_pped = sel.transform(X_train_pped)
    
    print('Nº features después de la seleccion: ', X_train_pped.shape[1])
    '''
  

    # creamos nuestro scorer basado en el CAPE
    cape_scorer = make_scorer(metric.get_cape, greater_is_better=False)

    # Entrenamiento del modelo mediante k-fold cross validation
    # Búsqueda de hiperparámetros mediante Gridsearch 
    param_grid = {
        'kernel': ('linear', 'rbf','poly'), 
        'C':[0.01, 1, 0.1, 10],
        'gamma': [0.00001, 0.001, 1],
        'epsilon':[0.1,0.3,0.5]
    }

    svm_reg = SVR()
    grid_search_svm = GridSearchCV(
        svm_reg, 
        param_grid, 
        cv= utils.BlockingTimeSeriesSplit(n_splits=5),
        scoring=cape_scorer,
        n_jobs=-1
    )

    grid_search_svm.fit(X_train_pped, Y_train_cpy)

    # Reentrenamos sin validación cruzada utilizando los mejores 
    # parámetros obtenidos con la validación cruzada
    
    svm_reg2 = SVR(kernel = grid_search_svm.best_params_['kernel'],
                  C = grid_search_svm.best_params_['C'],
                  gamma = grid_search_svm.best_params_['gamma'],
                  epsilon = grid_search_svm.best_params_['epsilon'])
    
    ttreg = TransformedTargetRegressor(regressor=svm_reg2, 
                                       transformer=PowerTransformer(), 
                                       check_inverse=False)

    ttreg.fit(X_train_pped, Y_train_cpy)

    # evaluación sobre el conjunto de test
    X_test_pped = prepare_data_pipeline.transform(X_test_cpy)
    # X_test_pped = sel.transform(X_test_pped)
    predictions = ttreg.predict(X_test_pped)
    
    # generamos matriz de predicciones (ID,Production)
    pred_matrix = np.stack((np.array(ID_test).astype(int), predictions), axis=-1)
    df_pred = pd.DataFrame(data=pred_matrix, columns=['ID','Production'])

    # añadimos las predicciones al dataframe final que contendrá las de todas las WF
    submission_df = submission_df.append(df_pred, ignore_index=True) 
    print('R2 for {} is {}'.format(WF, ttreg.score(X_train_pped, Y_train_cpy)))
    print('Predictions for {} has been added to submission_df'.format(WF))
    
    scores_list.append(ttreg.score(X_train_pped, Y_train_cpy))
    
global_score = np.mean(scores_list)

print('********************************************')
print('Global score: ', global_score)
print('********************************************')


# generamos fichero csv para el submission
submission_df.to_csv("../../TFM/models/submission_SVR.csv", index=False, sep=",") 


overflow encountered in reduce



R2 for WF1 is 0.8605183607331424
Predictions for WF1 has been added to submission_df



overflow encountered in reduce



R2 for WF2 is 0.8699449287575118
Predictions for WF2 has been added to submission_df



overflow encountered in reduce



R2 for WF3 is 0.8672894353453126
Predictions for WF3 has been added to submission_df



overflow encountered in reduce



R2 for WF4 is 0.9051895906311048
Predictions for WF4 has been added to submission_df



overflow encountered in reduce



R2 for WF5 is 0.8893119917044603
Predictions for WF5 has been added to submission_df



overflow encountered in reduce



R2 for WF6 is 0.8691731791308887
Predictions for WF6 has been added to submission_df
********************************************
Global score:  0.7572046175247678
********************************************


## Prueba 4: XGBoost

In [51]:
# Automatización de pre-procesado, modelización, validación y prediccion
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from src.functions import metric
from sklearn.model_selection import cross_val_score
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.externals import joblib
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.compose import TransformedTargetRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestRegressor
import hdbscan
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression

# Lista de WFs
WF_lst = X_train['WF'].unique()

# lista para guardar los modelos, uno por WF
models = []
scores_list = []

# Data frame para guardar las predicciones (ID, Production)
submission_df = pd.DataFrame([], columns=['ID','Production']) 

for WF in WF_lst:
    # Creamos copia de los datos para no perder su formato inicial
    X_train_cpy = X_train.copy()
    Y_train_cpy = Y_train.copy()
    X_test_cpy = X_test.copy()

    # Selección de filas por WF
    X_train_cpy = X_train_cpy[X_train_cpy['WF'] == WF]
    X_test_cpy = X_test_cpy[X_test_cpy['WF'] == WF]

    # Identificador de observaciones
    ID_train = X_train_cpy['ID']
    ID_test = X_test_cpy['ID']

    # Seleccion de las filas de Y_train
    Y_train_cpy = Y_train_cpy['Production']
    Y_train_cpy = Y_train_cpy.loc[ID_train.values - 1]

    # Pre-procesado de los datos en bruto

    # We'll add new columns NWPX_<met_var> without missing values
    # new cols: NWP1_U, NWP1_V, NWP1_T, NWP2_U, NWP2_V, NWP3_U, NWP3_V, NWP3_T, NWP4_U, NWP4_V, NWP4_CLCT
    
    new_cols = ['NWP1_U','NWP1_V','NWP1_T','NWP2_U',
                'NWP2_V','NWP3_U','NWP3_V','NWP3_T',
                'NWP4_U','NWP4_V','NWP4_CLCT']

    def add_new_cols(new_cols, df):
        for col in new_cols:
            df[col] = np.nan    
            
    add_new_cols(new_cols, X_train_cpy)
    add_new_cols(new_cols, X_test_cpy)
    
    cols_train = X_train.columns[3:]
    cols_test = X_test.columns[3:-9]
    X_train_cpy = input_missing_values(X_train_cpy, cols_train)
    X_test_cpy = input_missing_values(X_test_cpy, cols_test)
    
    col_list = ['NWP2_U','NWP2_V','NWP3_U','NWP3_V','NWP3_T']
    X_train_cpy.index = X_train_cpy['Time']
    
    del X_train_cpy['Time']
        
    for var in col_list:
        X_train_cpy[var].interpolate(
            method='time', 
            inplace=True,
            limit=2,
            limit_direction='both'
        )

    
    X_train_cpy.reset_index(inplace=True)

    X_test_cpy.index = X_test_cpy['Time']
    
    del X_test_cpy['Time']
        
    for var in col_list:
        X_test_cpy[var].interpolate(
            method='time', 
            inplace=True,
            limit=100,
            limit_direction='both'
        )
    
        
    X_test_cpy.reset_index(inplace=True) 

    X_train_cpy['U'] = (X_train_cpy.NWP1_U + X_train_cpy.NWP2_U + X_train_cpy.NWP3_U + X_train_cpy.NWP4_U)/4
    X_train_cpy['V'] = (X_train_cpy.NWP1_V + X_train_cpy.NWP2_V + X_train_cpy.NWP3_V + X_train_cpy.NWP4_V)/4
    X_train_cpy['T'] = (X_train_cpy.NWP1_T + X_train_cpy.NWP3_T)/2
    X_train_cpy['CLCT'] = (X_train_cpy.NWP4_CLCT)
    
    X_test_cpy['U'] = (X_test_cpy.NWP1_U + X_test_cpy.NWP2_U + X_test_cpy.NWP3_U)/3
    X_test_cpy['V'] = (X_test_cpy.NWP1_V + X_test_cpy.NWP2_V + X_test_cpy.NWP3_V)/3
    X_test_cpy['T'] = (X_test_cpy.NWP1_T + X_test_cpy.NWP3_T)/2
    X_test_cpy['CLCT'] = (X_test_cpy.NWP4_CLCT)

    
    X_train_cpy = X_train_cpy[['ID','Time','U','V','T','CLCT']]
    X_test_cpy = X_test_cpy[['ID','Time','U','V','T','CLCT']]
    
    # Hay 11 valores perdidos en la columna CLCT en X_test_cpy. Los imputamos
    X_test_cpy['CLCT'].fillna(method='bfill', limit=11, inplace=True)
    
     ####### Limpiar outliers y valores anómalos #######
    
    # valores negativos en CLCT
    X_train_cpy.loc[X_train_cpy['CLCT'] < 0, 'CLCT'] = 0.0
    X_test_cpy.loc[X_test_cpy['CLCT'] < 0, 'CLCT'] = 0.0

    # añadir columna Production
    X_train_cpy['Production'] = Y_train_cpy.to_list()
    
    # calcular módulo velocidad del viento
    X_train_cpy['vel'] = X_train_cpy.apply(get_wind_velmod, axis=1)
    
    # formar matriz de datos 
    X1 = X_train_cpy['vel'].values.reshape(-1,1)
    X2 = X_train_cpy['Production'].values.reshape(-1,1)
    X = np.concatenate((X1,X2), axis=1)
    
    # algoritmo para detección de outliers
    clusterer = hdbscan.HDBSCAN(min_cluster_size=20).fit(X)
    threshold = pd.Series(clusterer.outlier_scores_).quantile(0.96)
    outliers = np.where(clusterer.outlier_scores_ > threshold)[0]
    
    # Eliminamos los registros outliers 
    X_train_cpy.drop(X_train_cpy.index[list(outliers)], inplace=True)
    
    # Eliminamos las observaciones corresp#ondientes de Y_train
    Y_train_cpy = Y_train_cpy.loc[X_train_cpy['ID'].values - 1]
    
    # Eliminamos las columnas 'vel' y 'Production'
    del X_train_cpy['vel']
    del X_train_cpy['Production']
    
    ###################################

    # Pre-procesado de los datos
    pre_process = ColumnTransformer(remainder = 'passthrough',
                                    transformers = [('drop_columns', 'drop', ['ID','Time','U','V'])]
    )

    # definir pipeline
    prepare_data_pipeline = Pipeline(steps=[
        ('attr_adder', dtr.NewFeaturesAdder()), 
        ('pre_processing', pre_process),
    ])


    # aplicar pipeline
    X_train_pped = prepare_data_pipeline.fit_transform(X_train_cpy, Y_train_cpy)
    print('Nº features antes de la seleccion: ', X_train_pped.shape[1])
    
    # seleccionar features más importantes mediante Random Forest
    rf = RandomForestRegressor(random_state=42)
    rf_reg = rf.fit(X_train_pped, Y_train_cpy)
    sel = SelectFromModel(rf_reg, prefit=True, threshold='1.75*median')
    X_train_pped = sel.transform(X_train_pped)
    
    print('Nº features después de la seleccion: ', X_train_pped.shape[1])               


    # creamos nuestro scorer basado en el CAPE
    cape_scorer = make_scorer(metric.get_cape, greater_is_better=False)

    # Modelización: Random Forest
    # Entrenamiento del modelo mediante k-fold cross validation
    # Búsqueda de hiperparámetros mediante Gridsearch 
    
    param_grid = {   
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
    }
    
    tscv = TimeSeriesSplit(n_splits=7)
    xgb_reg = xgb.XGBRegressor()
    grid_search_xgb = GridSearchCV(
        xgb_reg, 
        param_grid, 
        cv=tscv,
        scoring=cape_scorer,
        n_jobs=-1
    )
    
    grid_search_xgb.fit(X_train_pped, Y_train_cpy)
    
    reg = xgb.XGBRegressor(colsample_bytree = grid_search_xgb.best_params_['colsample_bytree'],
                           gamma = grid_search_xgb.best_params_['gamma'],
                           max_depth = grid_search_xgb.best_params_['max_depth'],
                           min_child_weight = grid_search_xgb.best_params_['min_child_weight'],
                           subsample = grid_search_xgb.best_params_['subsample'],
                           random_state=42)
                                       
    reg.fit(X_train_pped, Y_train_cpy)
    
    # evaluación sobre el conjunto de test
    X_test_pped = prepare_data_pipeline.transform(X_test_cpy)
    X_test_pped = sel.transform(X_test_pped)
    predictions = reg.predict(X_test_pped)
    
    # generamos matriz de predicciones (ID,Production)
    pred_matrix = np.stack((np.array(ID_test).astype(int), predictions), axis=-1)
    df_pred = pd.DataFrame(data=pred_matrix, columns=['ID','Production'])

    # añadimos las predicciones al dataframe final que contendrá las de todas las WF
    submission_df = submission_df.append(df_pred, ignore_index=True) 
    print('R2 for {} is {}'.format(WF, reg.score(X_train_pped, Y_train_cpy)))
    print('Predictions for {} has been added to submission_df'.format(WF))
    
    scores_list.append(reg.score(X_train_pped, Y_train_cpy))
    
global_score = np.mean(scores_list)

print('********************************************')
print('Global score: ', global_score)
print('********************************************')


# generamos fichero csv para el submission
submission_df.to_csv("../../TFM/models/submission_xgb2.csv", index=False, sep=",")

Nº features antes de la seleccion:  12
Nº features después de la seleccion:  4
R2 for WF1 is 0.8956100729989427
Predictions for WF1 has been added to submission_df
Nº features antes de la seleccion:  12
Nº features después de la seleccion:  1
R2 for WF2 is 0.8794353715432277
Predictions for WF2 has been added to submission_df
Nº features antes de la seleccion:  12
Nº features después de la seleccion:  4
R2 for WF3 is 0.8792417854797664
Predictions for WF3 has been added to submission_df
Nº features antes de la seleccion:  12
Nº features después de la seleccion:  1
R2 for WF4 is 0.8877146620651525
Predictions for WF4 has been added to submission_df
Nº features antes de la seleccion:  12
Nº features después de la seleccion:  4
R2 for WF5 is 0.9270561316995192
Predictions for WF5 has been added to submission_df
Nº features antes de la seleccion:  12
Nº features después de la seleccion:  3
R2 for WF6 is 0.8932442722107024
Predictions for WF6 has been added to submission_df
****************

In [37]:
rg.named_steps.regression.

<function ndarray.mean>

## Prueba 5: Random Forest con validación Randomized Search 

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Automatización de pre-procesado, modelización, validación y prediccion
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from src.functions import metric
from sklearn.model_selection import cross_val_score
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.externals import joblib

# Lista de WFs
WF_lst = X_train['WF'].unique()

# lista para guardar los modelos, uno por WF
models = []

# Data frame para guardar las predicciones (ID, Production)
submission_df = pd.DataFrame([], columns=['ID','Production']) 

for WF in WF_lst:
    # Creamos copia de los datos para no perder su formato inicial
    X_train_cpy = X_train.copy()
    Y_train_cpy = Y_train.copy()
    X_test_cpy = X_test.copy()

    # Selección de filas por WF
    X_train_cpy = X_train_cpy[X_train_cpy['WF'] == WF]
    X_test_cpy = X_test_cpy[X_test_cpy['WF'] == WF]

    # Identificador de observaciones
    ID_train = X_train_cpy['ID']
    ID_test = X_test_cpy['ID']

    # Seleccion de las filas de Y_train
    Y_train_cpy = Y_train_cpy['Production']
    Y_train_cpy = Y_train_cpy.loc[ID_train.values - 1]

    # Pre-procesado de los datos en bruto

    # We'll add new columns NWPX_<met_var> without missing values
    # new cols: NWP1_U, NWP1_V, NWP1_T, NWP2_U, NWP2_V, NWP3_U, NWP3_V, NWP3_T, NWP4_U, NWP4_V, NWP4_CLCT
    
    new_cols = ['NWP1_U','NWP1_V','NWP1_T','NWP2_U',
                'NWP2_V','NWP3_U','NWP3_V','NWP3_T',
                'NWP4_U','NWP4_V','NWP4_CLCT']

    def add_new_cols(new_cols, df):
        for col in new_cols:
            df[col] = np.nan    
            
    add_new_cols(new_cols, X_train_cpy)
    add_new_cols(new_cols, X_test_cpy)
    
    cols_train = X_train.columns[3:]
    cols_test = X_test.columns[3:-9]
    X_train_cpy = input_missing_values(X_train_cpy, cols_train)
    X_test_cpy = input_missing_values(X_test_cpy, cols_test)
    
    col_list = ['NWP2_U','NWP2_V','NWP3_U','NWP3_V','NWP3_T']
    X_train_cpy.index = X_train_cpy['Time']
    
    del X_train_cpy['Time']
        
    for var in col_list:
        X_train_cpy[var].interpolate(
            method='time', 
            inplace=True,
            limit=2,
            limit_direction='both'
        )

    
    X_train_cpy.reset_index(inplace=True)

    X_test_cpy.index = X_test_cpy['Time']
    
    del X_test_cpy['Time']
        
    for var in col_list:
        X_test_cpy[var].interpolate(
            method='time', 
            inplace=True,
            limit=100,
            limit_direction='both'
        )
    
        
    X_test_cpy.reset_index(inplace=True) 

    X_train_cpy['U'] = (X_train_cpy.NWP1_U + X_train_cpy.NWP2_U + X_train_cpy.NWP3_U + X_train_cpy.NWP4_U)/4
    X_train_cpy['V'] = (X_train_cpy.NWP1_V + X_train_cpy.NWP2_V + X_train_cpy.NWP3_V + X_train_cpy.NWP4_V)/4
    X_train_cpy['T'] = (X_train_cpy.NWP1_T + X_train_cpy.NWP3_T)/2
    X_train_cpy['CLCT'] = (X_train_cpy.NWP4_CLCT)
    X_train_cpy['T2'] = np.sqrt(X_train_cpy['T'])
    
    X_test_cpy['U'] = (X_test_cpy.NWP1_U + X_test_cpy.NWP2_U + X_test_cpy.NWP3_U)/3
    X_test_cpy['V'] = (X_test_cpy.NWP1_V + X_test_cpy.NWP2_V + X_test_cpy.NWP3_V)/3
    X_test_cpy['T'] = (X_test_cpy.NWP1_T + X_test_cpy.NWP3_T)/2
    X_test_cpy['CLCT'] = (X_test_cpy.NWP4_CLCT)
    X_test_cpy['T2'] = np.sqrt(X_test_cpy['T'])

    
    X_train_cpy = X_train_cpy[['ID','Time','U','V','T','CLCT','T2']]
    X_test_cpy = X_test_cpy[['ID','Time','U','V','T','CLCT','T2']]
    
    # Hay 11 valores perdidos en la columna CLCT en X_test_cpy. Los imputamos con la moda
    X_test_cpy.fillna(method='bfill', limit=11, inplace=True)
    
    ####### Limpiar outliers #######
    
    # añadir columna Production
    X_train_cpy['Production'] = Y_train_cpy.to_list()
    
    # calcular módulo velocidad del viento
    X_train_cpy['vel'] = X_train_cpy.apply(get_wind_velmod, axis=1)
    
    # formar matriz de datos 
    X1 = X_train_cpy['vel'].values.reshape(-1,1)
    X2 = X_train_cpy['Production'].values.reshape(-1,1)
    X = np.concatenate((X1,X2), axis=1)
    
    # Definir el clasificador de outliers
    clf = OneClassSVM(nu=0.17, gamma=0.06)
    clf.fit(X)
    
    # Predección de outlier o inlier para cada punto
    y_pred = clf.predict(X)
    
    # Añadirmos la columna 'oulier' con la predicción 
    X_train_cpy['outlier'] = y_pred.tolist()
    
    # Eliminamos los registros outliers 
    X_train_cpy = X_train_cpy[X_train_cpy['outlier'] != -1]
    
    # Eliminamos las observaciones correspondientes de Y_train
    Y_train_cpy = Y_train_cpy.loc[X_train_cpy['ID'].values - 1]
    
    # Eliminamos las columnas 'vel', 'outlier' y 'Production'
    del X_train_cpy['vel']
    del X_train_cpy['outlier']
    del X_train_cpy['Production']
    
    ###################################

    # Pre-procesado de los datos
    pre_process = ColumnTransformer(remainder = 'passthrough',
                                    transformers = [('drop_columns', 'drop', 
                                                     ['ID','Time','U','V','T2'])]
    )

    # definir pipeline
    prepare_data_pipeline = Pipeline(steps=[
        ('attr_adder', dtr.NewFeaturesAdder(add_w_shear=False, add_time_feat=False)), 
        ('pre_processing', pre_process),
        ('std_scaler', StandardScaler())
    ])
    # aplciar pipeline
    X_train_pped = prepare_data_pipeline.fit_transform(X_train_cpy)

    # creamos nuestro scorer basado en el CAPE
    cape_scorer = make_scorer(metric.get_cape, greater_is_better=False)

    # Modelización: Random Forest
    # Entrenamiento del modelo mediante k-fold cross validation
    # Búsqueda de hiperparámetros mediante Randomized Search 

    forest_reg = RandomForestRegressor()

    rf_random = RandomizedSearchCV(
        estimator = forest_reg, 
        param_distributions = random_grid, 
        n_iter = 100, 
        cv = 5, 
        random_state=42, 
        scoring = cape_scorer,
        n_jobs = -1
    )

    rf_random.fit(X_train_pped, Y_train_cpy)
    final_model = rf_random.best_estimator_

    # guardamos modelo
    models.append(joblib.dump(final_model, WF + 'rfrand'))

    # evaluación sobre el conjunto de test
    X_test_pped = prepare_data_pipeline.transform(X_test_cpy)
    predictions = final_model.predict(X_test_pped)

    # generamos matriz de predicciones (ID,Production)
    pred_matrix = np.stack((np.array(ID_test).astype(int), predictions), axis=-1)
    df_pred = pd.DataFrame(data=pred_matrix, columns=['ID','Production'])

    # añadimos las predicciones al dataframe final que contendrá las de todas las WF
    submission_df = submission_df.append(df_pred, ignore_index=True)  
    print('Best score for {}: {}'.format(WF, -rf_random.best_score_))
    print('Predictions for {} has been added to submission_df'.format(WF))
    print('--------')


# generamos fichero csv para el submission
submission_df.to_csv("./Data/submission_rfrand.csv", index=False, sep=",") 

## Prueba 6: Regresión con Elastic Net (Ridge + Lasso)

In [None]:
# Automatización de pre-procesado, modelización, validación y prediccion
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from src.functions import metric
from sklearn.model_selection import cross_val_score
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.externals import joblib
import re
from collections import OrderedDict


# Lista de WFs
WF_lst = X_train['WF'].unique()

# lista para guardar los modelos, uno por WF
# models = []

# Data frame para guardar las predicciones (ID, Production)
submission_df = pd.DataFrame([], columns=['ID','Production']) 

for WF in WF_lst:
    # Creamos copia de los datos para no perder su formato inicial
    X_train_cpy = X_train.copy()
    Y_train_cpy = Y_train.copy()
    X_test_cpy = X_test.copy()

    # Selección de filas por WF
    X_train_cpy = X_train_cpy[X_train_cpy['WF'] == WF]
    X_test_cpy = X_test_cpy[X_test_cpy['WF'] == WF]

    # Identificador de observaciones
    ID_train = X_train_cpy['ID']
    ID_test = X_test_cpy['ID']

    # Seleccion de las filas de Y_train
    Y_train_cpy = Y_train_cpy['Production']
    Y_train_cpy = Y_train_cpy.loc[ID_train.values - 1]

    # Pre-procesado de los datos en bruto

    # We'll add new columns NWPX_<met_var> without missing values
    # new cols: NWP1_U, NWP1_V, NWP1_T, NWP2_U, NWP2_V, NWP3_U, NWP3_V, NWP3_T, NWP4_U, NWP4_V, NWP4_CLCT
    
    new_cols = ['NWP1_U','NWP1_V','NWP1_T','NWP2_U',
                'NWP2_V','NWP3_U','NWP3_V','NWP3_T',
                'NWP4_U','NWP4_V','NWP4_CLCT']

    def add_new_cols(new_cols, df):
        for col in new_cols:
            df[col] = np.nan    
            
    add_new_cols(new_cols, X_train_cpy)
    add_new_cols(new_cols, X_test_cpy)
    
    cols_train = X_train.columns[3:]
    cols_test = X_test.columns[3:-9]
    X_train_cpy = input_missing_values(X_train_cpy, cols_train)
    X_test_cpy = input_missing_values(X_test_cpy, cols_test)
    
    col_list = ['NWP2_U','NWP2_V','NWP3_U','NWP3_V','NWP3_T']
    X_train_cpy.index = X_train_cpy['Time']
    
    del X_train_cpy['Time']
        
    for var in col_list:
        X_train_cpy[var].interpolate(
            method='time', 
            inplace=True,
            limit=2,
            limit_direction='both'
        )

    
    X_train_cpy.reset_index(inplace=True)

    X_test_cpy.index = X_test_cpy['Time']
    
    del X_test_cpy['Time']
        
    for var in col_list:
        X_test_cpy[var].interpolate(
            method='time', 
            inplace=True,
            limit=100,
            limit_direction='both'
        )
    
        
    X_test_cpy.reset_index(inplace=True) 

    X_train_cpy['U'] = (X_train_cpy.NWP1_U + X_train_cpy.NWP2_U + X_train_cpy.NWP3_U + X_train_cpy.NWP4_U)/4
    X_train_cpy['V'] = (X_train_cpy.NWP1_V + X_train_cpy.NWP2_V + X_train_cpy.NWP3_V + X_train_cpy.NWP4_V)/4
    X_train_cpy['T'] = (X_train_cpy.NWP1_T + X_train_cpy.NWP3_T)/2
    X_train_cpy['CLCT'] = (X_train_cpy.NWP4_CLCT)
    X_train_cpy['T2'] = np.sqrt(X_train_cpy['T'])
    
    X_test_cpy['U'] = (X_test_cpy.NWP1_U + X_test_cpy.NWP2_U + X_test_cpy.NWP3_U)/3
    X_test_cpy['V'] = (X_test_cpy.NWP1_V + X_test_cpy.NWP2_V + X_test_cpy.NWP3_V)/3
    X_test_cpy['T'] = (X_test_cpy.NWP1_T + X_test_cpy.NWP3_T)/2
    X_test_cpy['CLCT'] = (X_test_cpy.NWP4_CLCT)
    X_test_cpy['T2'] = np.sqrt(X_test_cpy['T'])

    
    X_train_cpy = X_train_cpy[['ID','Time','U','V','T','CLCT','T2']]
    X_test_cpy = X_test_cpy[['ID','Time','U','V','T','CLCT','T2']]
    
    # Hay 11 valores perdidos en la columna CLCT en X_test_cpy. Los imputamos con la moda
    X_test_cpy.fillna(method='bfill', limit=11, inplace=True)
    
    ####### Limpiar outliers #######
    
    # añadir columna Production
    X_train_cpy['Production'] = Y_train_cpy.to_list()
    
    # calcular módulo velocidad del viento
    X_train_cpy['vel'] = X_train_cpy.apply(get_wind_velmod, axis=1)
    
    # formar matriz de datos 
    X1 = X_train_cpy['vel'].values.reshape(-1,1)
    X2 = X_train_cpy['Production'].values.reshape(-1,1)
    X = np.concatenate((X1,X2), axis=1)
    
    # Definir el clasificador de outliers
    clf = OneClassSVM(nu=0.17, gamma=0.06)
    clf.fit(X)
    
    # Predección de outlier o inlier para cada punto
    y_pred = clf.predict(X)
    
    # Añadirmos la columna 'oulier' con la predicción 
    X_train_cpy['outlier'] = y_pred.tolist()
    
    # Eliminamos los registros outliers 
    X_train_cpy = X_train_cpy[X_train_cpy['outlier'] != -1]
    
    # Eliminamos las observaciones correspondientes de Y_train
    Y_train_cpy = Y_train_cpy.loc[X_train_cpy['ID'].values - 1]
    
    # Eliminamos las columnas 'vel', 'outlier' y 'Production'
    del X_train_cpy['vel']
    del X_train_cpy['outlier']
    del X_train_cpy['Production']
    
    ###################################

    # Pre-procesado de los datos
    pre_process = ColumnTransformer(remainder = 'passthrough',
                                    transformers = [('drop_columns', 'drop', 
                                                     ['ID','Time','U','V','T2'])]
    )

    # definir pipeline
    prepare_data_pipeline = Pipeline(steps=[
        ('attr_adder', dtr.NewFeaturesAdder(add_w_shear=False, add_time_feat=False)), 
        ('pre_processing', pre_process),
        ('std_scaler', StandardScaler())
    ])

    # aplicar pipeline
    X_train_pped = prepare_data_pipeline.fit_transform(X_train_cpy)
    # creamos nuestro scorer basado en el CAPE
    cape_scorer = make_scorer(metric.get_cape, greater_is_better=False)

    # Modelización: Regresión polinomial con regularización Elastic Net
    # Entrenamiento del modelo mediante k-fold cross validation
    # Búsqueda de hiperparámetros mediante Gridsearch 
    
    poly_features = PolynomialFeatures(degree=3, include_bias=False)
    X_train_poly = poly_features.fit_transform(X_train_pped)
    
    param_grid = [{
        'alpha'     : np.logspace(-3, -2, 1, 2, 3),
        'l1_ratio'  : [0.00, 0.25, 0.50, 0.75, 1.0],
        'tol'       : [0.00001, 0.0001, 0.001]
    }]
    
                                               
    eNet = ElasticNet(selection='random')
    grid_search = GridSearchCV(
        eNet, 
        param_grid, 
        cv=10,
        scoring=cape_scorer,
        n_jobs=-1,
        verbose=2
    )

    grid_search.fit(X_train_poly, Y_train_cpy)
    final_model = grid_search.best_estimator_

    # guardamos modelo
    # models.append(joblib.dump(final_model, WF + '_eNet'))import re
from collections import OrderedDict


# Lista de WFs
WF_lst = X_train['WF'].unique()

# lista para guardar los modelos, uno por WF
models = []

# Data frame para guardar las predicciones (ID, Production)
submission_df = pd.DataFrame([], columns=['ID','Production']) 

for WF in WF_lst:
    # Creamos copia de los datos para no perder su formato inicial
    X_train_cpy = X_train.copy()
    Y_train_cpy = Y_train.copy()
    X_test_cpy = X_test.copy()

    # Selección de filas por WF
    X_train_cpy = X_train_cpy[X_train_cpy['WF'] == WF]
    X_test_cpy = X_test_cpy[X_test_cpy['WF'] == WF]

    # Identificador de observaciones
    ID_train = X_train_cpy['ID']
    ID_test = X_test_cpy['ID']

    # Seleccion de las filas de Y_train
    Y_train_cpy = Y_train_cpy['Production']
    Y_train_cpy = Y_train_cpy.loc[ID_train.values - 1]

    # Pre-procesado de los datos en bruto

    # We'll add new columns NWPX_<met_var> without missing values
    # new cols: NWP1_U, NWP1_V, NWP1_T, NWP2_U, NWP2_V, NWP3_U, NWP3_V, NWP3_T, NWP4_U, NWP4_V, NWP4_CLCT
    
    new_cols = ['NWP1_U','NWP1_V','NWP1_T','NWP2_U',
                'NWP2_V','NWP3_U','NWP3_V','NWP3_T',
                'NWP4_U','NWP4_V','NWP4_CLCT']

    def add_new_cols(new_cols, df):
        for col in new_cols:
            df[col] = np.nan    
            
    add_new_cols(new_cols, X_train_cpy)
    add_new_cols(new_cols, X_test_cpy)
    
    cols_train = X_train.columns[3:]
    cols_test = X_test.columns[3:-9]
    X_train_cpy = input_missing_values(X_train_cpy, cols_train)
    X_test_cpy = input_missing_values(X_test_cpy, cols_test)
    
    col_list = ['NWP2_U','NWP2_V','NWP3_U','NWP3_V','NWP3_T']
    X_train_cpy.index = X_train_cpy['Time']
    
    del X_train_cpy['Time']
        
    for var in col_list:
        X_train_cpy[var].interpolate(
            method='time', 
            inplace=True,
            limit=2,
            limit_direction='both'
        )

    
    X_train_cpy.reset_index(inplace=True)

    X_test_cpy.index = X_test_cpy['Time']
    
    del X_test_cpy['Time']
        
    for var in col_list:
        X_test_cpy[var].interpolate(
            method='time', 
            inplace=True,
            limit=100,
            limit_direction='both'
        )
    
        
    X_test_cpy.reset_index(inplace=True) 

    X_train_cpy['U'] = (X_train_cpy.NWP1_U + X_train_cpy.NWP2_U + X_train_cpy.NWP3_U + X_train_cpy.NWP4_U)/4
    X_train_cpy['V'] = (X_train_cpy.NWP1_V + X_train_cpy.NWP2_V + X_train_cpy.NWP3_V + X_train_cpy.NWP4_V)/4
    X_train_cpy['T'] = (X_train_cpy.NWP1_T + X_train_cpy.NWP3_T)/2
    X_train_cpy['CLCT'] = (X_train_cpy.NWP4_CLCT)
    X_train_cpy['T2'] = np.sqrt(X_train_cpy['T'])
    
    X_test_cpy['U'] = (X_test_cpy.NWP1_U + X_test_cpy.NWP2_U + X_test_cpy.NWP3_U)/3
    X_test_cpy['V'] = (X_test_cpy.NWP1_V + X_test_cpy.NWP2_V + X_test_cpy.NWP3_V)/3
    X_test_cpy['T'] = (X_test_cpy.NWP1_T + X_test_cpy.NWP3_T)/2
    X_test_cpy['CLCT'] = (X_test_cpy.NWP4_CLCT)
    X_test_cpy['T2'] = np.sqrt(X_test_cpy['T'])

    
    
    X_train_cpy = X_train_cpy[['Time','U','V','T','CLCT','T2']]
    X_test_cpy = X_test_cpy[['Time','U','V','T','CLCT','T2']]
    
    # Hay 11 valores perdidos en la columna CLCT en X_test_cpy. Los imputamos con la moda
    X_test_cpy.fillna(method='bfill', limit=11, inplace=True)


    # Pre-procesado de los datos
    pre_process = ColumnTransformer(remainder = 'passthrough',
                                    transformers = [('drop_columns', 'drop',
                                                     ['Time','U','V','T','CLCT','T2'])]
    )
    # aplicar pipeline
    X_train_pped = prepare_data_pipeline.fit_transform(X_train_cpy)

    # evaluación sobre el conjunto de test
    X_test_pped = prepare_data_pipeline.transform(X_test_cpy)
    X_test_poly = poly_features.fit_transform(X_test_pped)
    predictions = final_model.predict(X_test_poly)

    # generamos matriz de predicciones (ID,Production)
    pred_matrix = np.stack((np.array(ID_test).astype(int), predictions), axis=-1)
    df_pred = pd.DataFrame(data=pred_matrix, columns=['ID','Production'])

    # añadimos las predicciones al dataframe final que contendrá las de todas las WF
    submission_df = submission_df.append(df_pred, ignore_index=True)  
    print('Best score for {}: {}'.format(WF, -grid_search.best_score_))
    print('Predictions for {} has been added to submission_df'.format(WF))
    print('--------')

# generamos fichero csv para el submission
submission_df.to_csv("../../TFM/submission_eNet.csv", index=False, sep=",") 

## Prueba 7: LassoCV

In [None]:
# Automatización de pre-procesado, modelización, validación y prediccion
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from src.functions import metric
from sklearn.model_selection import cross_val_score
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model as lm
from sklearn.linear_model import LassoCV

# Lista de WFs
WF_lst = X_train['WF'].unique()

# lista para guardar los modelos, uno por WF
models = []

# Data frame para guardar las predicciones (ID, Production)
submission_df = pd.DataFrame([], columns=['ID','Production']) 



for WF in WF_lst:
    # Creamos copia de los datos para no perder su formato inicial
    X_train_cpy = X_train.copy()
    Y_train_cpy = Y_train.copy()
    X_test_cpy = X_test.copy()

    # Selección de filas por WF
    X_train_cpy = X_train_cpy[X_train_cpy['WF'] == WF]
    X_test_cpy = X_test_cpy[X_test_cpy['WF'] == WF]

    # Identificador de observaciones
    ID_train = X_train_cpy['ID']
    ID_test = X_test_cpy['ID']

    # Seleccion de las filas de Y_train
    Y_train_cpy = Y_train_cpy['Production']
    Y_train_cpy = Y_train_cpy.loc[ID_train.values - 1]

    # Pre-procesado de los datos en bruto

    # We'll add new columns NWPX_<met_var> without missing values
    # new cols: NWP1_U, NWP1_V, NWP1_T, NWP2_U, NWP2_V, NWP3_U, NWP3_V, NWP3_T, NWP4_U, NWP4_V, NWP4_CLCT
    
    new_cols = ['NWP1_U','NWP1_V','NWP1_T','NWP2_U',
                'NWP2_V','NWP3_U','NWP3_V','NWP3_T',
                'NWP4_U','NWP4_V','NWP4_CLCT']

    def add_new_cols(new_cols, df):
        for col in new_cols:
            df[col] = np.nan    
            
    add_new_cols(new_cols, X_train_cpy)
    add_new_cols(new_cols, X_test_cpy)
    
    cols_train = X_train_cpy.columns[3:-11]
    cols_test = X_test_cpy.columns[3:-11]
    X_train_cpy = input_missing_values(X_train_cpy, cols_train)
    X_test_cpy = input_missing_values(X_test_cpy, cols_train)
    
    col_list = ['NWP2_U','NWP2_V','NWP3_U','NWP3_V','NWP3_T']
    X_train_cpy.index = X_train_cpy['Time']
    
    del X_train_cpy['Time']
        
    for var in col_list:
        X_train_cpy[var].interpolate(
            method='time', 
            inplace=True,
            limit=2,
            limit_direction='both'
        )

    
    X_train_cpy.reset_index(inplace=True)

    X_test_cpy.index = X_test_cpy['Time']
    
    del X_test_cpy['Time']
        
    for var in col_list:
        X_test_cpy[var].interpolate(
            method='time', 
            inplace=True,
            limit=100,
            limit_direction='both'
        )
    
        
    X_test_cpy.reset_index(inplace=True) 

    X_train_cpy['U'] = (X_train_cpy.NWP1_U + X_train_cpy.NWP2_U + X_train_cpy.NWP3_U + X_train_cpy.NWP4_U)/4
    X_train_cpy['V'] = (X_train_cpy.NWP1_V + X_train_cpy.NWP2_V + X_train_cpy.NWP3_V + X_train_cpy.NWP4_V)/4
    X_train_cpy['T'] = (X_train_cpy.NWP1_T + X_train_cpy.NWP3_T)/2
    X_train_cpy['CLCT'] = (X_train_cpy.NWP4_CLCT)
    X_train_cpy['inv_T'] = 1/(X_train_cpy['T'])
    
    X_test_cpy['U'] = (X_test_cpy.NWP1_U + X_test_cpy.NWP2_U + X_test_cpy.NWP3_U)/3
    X_test_cpy['V'] = (X_test_cpy.NWP1_V + X_test_cpy.NWP2_V + X_test_cpy.NWP3_V)/3
    X_test_cpy['T'] = (X_test_cpy.NWP1_T + X_test_cpy.NWP3_T)/2
    X_test_cpy['CLCT'] = (X_test_cpy.NWP4_CLCT)
    X_test_cpy['inv_T'] = 1/(X_test_cpy['T'])
    
    X_train_cpy = X_train_cpy[['Time','U','V','T','CLCT','inv_T']]
    X_test_cpy = X_test_cpy[['Time','U','V','T','CLCT', 'inv_T']]

    
    ## Limpieza de datos: imputar valores perdidos en X_
    #only_na = X_test_cpy[~X_test_cpy.ID.isin(X_test_cpy.dropna().ID)]
    #X_test_cpy.dropna(inplace=True)
    #Y_train_cpy.drop(labels=only_na.index, axis=0, inplace=True)
    #X_train_cpy.drop(labels=only_na.index, axis=0, inplace=True)

    # Pre-procesado de los datos
    pre_process = ColumnTransformer(remainder = 'passthrough',
                                    transformers = [('drop_columns', 'drop', 
                                                     ['Time','U','V','T'])]
    )

    # definir pipeline
    prepare_data_pipeline = Pipeline(steps=[
        ('attr_adder', dtr.DerivedAttributesAdder(add_time_feat=False, add_cyclic_feat=False)), 
        ('pre_processing', pre_process),
        ('std_scaler', StandardScaler())
    ])

    # aplicar pipeline
    X_train_pped = prepare_data_pipeline.fit_transform(X_train_cpy)


    # creamos nuestro scorer basado en el CAPE
    cape_scorer = make_scorer(metric.get_cape, greater_is_better=False)

    # Modelización: Regresión polinomial utilizando CV con regularización tipo Ride
    poly_features = PolynomialFeatures(degree=3, include_bias=False)
    X_train_poly = poly_features.fit_transform(X_train_pped)
    
    param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
    lasso_reg = lm.Lasso()

    grid_search = GridSearchCV(
        lasso_reg, 
        param_grid, 
        cv=7,
        scoring=cape_scorer,
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train_poly, Y_train_cpy)
    final_model = grid_search.best_estimator_
    
    # guardamos modelo
    models.append(joblib.dump(final_model, WF + '_lasso'))

    # evaluación sobre el conjunto de test
    X_test_pped = prepare_data_pipeline.transform(X_test_cpy)
    X_test_poly = poly_features.fit_transform(X_test_pped)
    predictions = final_model.predict(X_test_poly)
    
    # generamos matriz de predicciones (ID,Production)
    pred_matrix = np.stack((np.array(ID_test).astype(int), predictions), axis=-1)
    df_pred = pd.DataFrame(data=pred_matrix, columns=['ID','Production'])

    # añadimos las predicciones al dataframe final que contendrá las de todas las WF
    submission_df = submission_df.append(df_pred, ignore_index=True) 
    print('CAPE for {} is {}'.format(WF, -grid_search.best_score_))
    print('Predictions for {} has been added to submission_df'.format(WF))

# generamos fichero csv para el submission
submission_df.to_csv("../../TFM/models/submission_lasso.csv", index=False, sep=",") 

## Prueba 8: Regresión robusta con RANSAC

In [None]:
# Automatización de pre-procesado, modelización, validación y prediccion
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from src.functions import metric
from sklearn.model_selection import cross_val_score
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model as lm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RANSACRegressor

# Lista de WFs
WF_lst = X_train['WF'].unique()

# lista para guardar los modelos, uno por WF
models = []

# Data frame para guardar las predicciones (ID, Production)
submission_df = pd.DataFrame([], columns=['ID','Production']) 

for WF in WF_lst:
    # Creamos copia de los datos para no perder su formato inicial
    X_train_cpy = X_train.copy()
    Y_train_cpy = Y_train.copy()
    X_test_cpy = X_test.copy()

    # Selección de filas por WF
    X_train_cpy = X_train_cpy[X_train_cpy['WF'] == WF]
    X_test_cpy = X_test_cpy[X_test_cpy['WF'] == WF]

    # Identificador de observaciones
    ID_train = X_train_cpy['ID']
    ID_test = X_test_cpy['ID']

    # Seleccion de las filas de Y_train
    Y_train_cpy = Y_train_cpy['Production']
    Y_train_cpy = Y_train_cpy.loc[ID_train.values - 1]

    # Pre-procesado de los datos en bruto
    X_train_cpy = X_train_cpy[['Time','NWP1_00h_D-1_U','NWP1_00h_D-1_V','NWP1_00h_D-1_T']]
    X_train_cpy.rename(
        columns={"Time": "time", "NWP1_00h_D-1_U": "U", "NWP1_00h_D-1_V": "V", "NWP1_00h_D-1_T":"T"}, 
        inplace=True)

    X_test_cpy = X_test_cpy[['Time','NWP1_00h_D-1_U','NWP1_00h_D-1_V','NWP1_00h_D-1_T']]
    X_test_cpy.rename(
        columns={"Time": "time", "NWP1_00h_D-1_U": "U", "NWP1_00h_D-1_V": "V", "NWP1_00h_D-1_T":"T"}, 
        inplace=True)

    # Limpieza de datos: eliminar valores perdidos
    only_na = X_train_cpy[~X_train_cpy.index.isin(X_train_cpy.dropna().index)]
    X_train_cpy.dropna(inplace=True)
    Y_train_cpy.drop(labels=only_na.index, axis=0, inplace=True)

    X_test_cpy.dropna(inplace=True)

    # Pre-procesado de los datos
    pre_process = ColumnTransformer(remainder = 'passthrough',
                                    transformers = [('drop_columns', 'drop', ['time','U','V','T','w_dir'])]
    )

    # definir pipeline
    prepare_data_pipeline = Pipeline(steps=[
        ('attr_adder', dtr.DerivedAttributesAdder(add_time_feat=False)), 
        ('pre_processing', pre_process),
        ('std_scaler', StandardScaler())
    ])

    # aplicar pipeline
    X_train_pped = prepare_data_pipeline.fit_transform(X_train_cpy)

    # creamos nuestro scorer basado en el CAPE
    cape_scorer = make_scorer(metric.get_cape, greater_is_better=False)

    # Modelización: Regresión Robusta utilizando RANSAC
    poly_features = PolynomialFeatures(degree=3, include_bias=False)
    X_train_poly = poly_features.fit_transform(X_train_pped)
    
    ransac = RANSACRegressor(LinearRegression(), loss='absolute_loss')
    param_grid = [
        {
            'max_trials': [100, 1000, 10000],
            'min_samples': [10, 30, 50],
        }
    ]
    
    grid_search_ransac = GridSearchCV(
        ransac, 
        param_grid, 
        cv=5,
        scoring=cape_scorer
    )

    grid_search_ransac.fit(X_train_poly, Y_train_cpy)
    final_model = grid_search_ransac.best_estimator_
    
    # guardamos modelo
    models.append(joblib.dump(final_model, WF + '_ransac'))

    # evaluación sobre el conjunto de test
    X_test_pped = prepare_data_pipeline.transform(X_test_cpy)
    X_test_poly = poly_features.fit_transform(X_test_pped)
    predictions = final_model.predict(X_test_poly)
    
    # generamos matriz de predicciones (ID,Production)
    pred_matrix = np.stack((np.array(ID_test).astype(int), predictions), axis=-1)
    df_pred = pd.DataFrame(data=pred_matrix, columns=['ID','Production'])

    # añadimos las predicciones al dataframe final que contendrá las de todas las WF
    submission_df = submission_df.append(df_pred, ignore_index=True) 
    print('Predictions for {} has been added to submission_df'.format(WF))
    print('CAPE for {} is {}'.format(WF, -grid_search_ransac.best_score_))

# generamos fichero csv para el submission
submission_df.to_csv("./Data/submission_ransac.csv", index=False, sep=",") 

## Ejemplo 10: MARS

In [28]:
# Automatización de pre-procesado, modelización, validación y prediccion
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from src.functions import metric
from sklearn.model_selection import cross_val_score
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model as lm
from pyearth import Earth
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
np.warnings.filterwarnings('ignore')
import hdbscan

# Lista de WFs
WF_lst = X_train['WF'].unique()

# lista para guardar los modelos, uno por WF
models = []
scores_list = []

# Data frame para guardar las predicciones (ID, Production)
submission_df = pd.DataFrame([], columns=['ID','Production']) 

for WF in WF_lst:
    # Creamos copia de los datos para no perder su formato inicial
    X_train_cpy = X_train.copy()
    Y_train_cpy = Y_train.copy()
    X_test_cpy = X_test.copy()

    # Selección de filas por WF
    X_train_cpy = X_train_cpy[X_train_cpy['WF'] == WF]
    X_test_cpy = X_test_cpy[X_test_cpy['WF'] == WF]

    # Identificador de observaciones
    ID_train = X_train_cpy['ID']
    ID_test = X_test_cpy['ID']

    # Seleccion de las filas de Y_train
    Y_train_cpy = Y_train_cpy['Production']
    Y_train_cpy = Y_train_cpy.loc[ID_train.values - 1]

    # Pre-procesado de los datos en bruto

    # We'll add new columns NWPX_<met_var> without missing values
    # new cols: NWP1_U, NWP1_V, NWP1_T, NWP2_U, NWP2_V, NWP3_U, NWP3_V, NWP3_T, NWP4_U, NWP4_V, NWP4_CLCT
    
    new_cols = ['NWP1_U','NWP1_V','NWP1_T','NWP2_U',
                'NWP2_V','NWP3_U','NWP3_V','NWP3_T',
                'NWP4_U','NWP4_V','NWP4_CLCT']

    def add_new_cols(new_cols, df):
        for col in new_cols:
            df[col] = np.nan    
            
    add_new_cols(new_cols, X_train_cpy)
    add_new_cols(new_cols, X_test_cpy)
    
    cols_train = X_train.columns[3:]
    cols_test = X_test.columns[3:-9]
    X_train_cpy = input_missing_values(X_train_cpy, cols_train)
    X_test_cpy = input_missing_values(X_test_cpy, cols_test)
    
    col_list = ['NWP2_U','NWP2_V','NWP3_U','NWP3_V','NWP3_T']
    X_train_cpy.index = X_train_cpy['Time']
    
    del X_train_cpy['Time']
        
    for var in col_list:
        X_train_cpy[var].interpolate(
            method='time', 
            inplace=True,
            limit=2,
            limit_direction='both'
        )

    
    X_train_cpy.reset_index(inplace=True)

    X_test_cpy.index = X_test_cpy['Time']
    
    del X_test_cpy['Time']
        
    for var in col_list:
        X_test_cpy[var].interpolate(
            method='time', 
            inplace=True,
            limit=100,
            limit_direction='both'
        )
    
        
    X_test_cpy.reset_index(inplace=True) 

    X_train_cpy['U'] = (X_train_cpy.NWP1_U + X_train_cpy.NWP2_U + X_train_cpy.NWP3_U + X_train_cpy.NWP4_U)/4
    X_train_cpy['V'] = (X_train_cpy.NWP1_V + X_train_cpy.NWP2_V + X_train_cpy.NWP3_V + X_train_cpy.NWP4_V)/4
    X_train_cpy['T'] = (X_train_cpy.NWP1_T + X_train_cpy.NWP3_T)/2
    X_train_cpy['CLCT'] = (X_train_cpy.NWP4_CLCT)
    
    X_test_cpy['U'] = (X_test_cpy.NWP1_U + X_test_cpy.NWP2_U + X_test_cpy.NWP3_U)/3
    X_test_cpy['V'] = (X_test_cpy.NWP1_V + X_test_cpy.NWP2_V + X_test_cpy.NWP3_V)/3
    X_test_cpy['T'] = (X_test_cpy.NWP1_T + X_test_cpy.NWP3_T)/2
    X_test_cpy['CLCT'] = (X_test_cpy.NWP4_CLCT)

    
    X_train_cpy = X_train_cpy[['ID','Time','U','V','T','CLCT']]
    X_test_cpy = X_test_cpy[['ID','Time','U','V','T','CLCT']]
    
    # Hay 11 valores perdidos en la columna CLCT en X_test_cpy. Los imputamos con la moda
    X_test_cpy.fillna(method='bfill', limit=11, inplace=True)
    
     ####### Limpiar outliers y valores anómalos #######
    
    # valores negativos en CLCT
    X_train_cpy.loc[X_train_cpy['CLCT'] < 0, 'CLCT'] = 0.0
    X_test_cpy.loc[X_test_cpy['CLCT'] < 0, 'CLCT'] = 0.0

    # añadir columna Production
    X_train_cpy['Production'] = Y_train_cpy.to_list()
    
    # calcular módulo velocidad del viento
    X_train_cpy['vel'] = X_train_cpy.apply(get_wind_velmod, axis=1)
    
    # formar matriz de datos 
    X1 = X_train_cpy['vel'].values.reshape(-1,1)
    X2 = X_train_cpy['Production'].values.reshape(-1,1)
    X = np.concatenate((X1,X2), axis=1)
    
    # algoritmo para detección de outliers
    clusterer = hdbscan.HDBSCAN(min_cluster_size=20).fit(X)
    threshold = pd.Series(clusterer.outlier_scores_).quantile(0.96)
    outliers = np.where(clusterer.outlier_scores_ > threshold)[0]
    
    # Eliminamos los registros outliers 
    X_train_cpy.drop(X_train_cpy.index[list(outliers)], inplace=True)
    
    # Eliminamos las observaciones corresp#ondientes de Y_train
    Y_train_cpy = Y_train_cpy.loc[X_train_cpy['ID'].values - 1]
    
    # Eliminamos las columnas 'vel' y 'Production'
    del X_train_cpy['vel']
    del X_train_cpy['Production']
    
    ###################################

    # Pre-procesado de los datos
    pre_process = ColumnTransformer(remainder = 'passthrough',
                                    transformers = [('drop_columns', 'drop', ['ID','Time','U','V','month','CLCT'])]
    )

    # definir pipeline
    prepare_data_pipeline = Pipeline(steps=[
        ('attr_adder', dtr.NewFeaturesAdder(add_cycl_feat=False, add_time_feat=True)), 
        ('pre_processing', pre_process)
    ])

    # aplicar pipeline
    X_train_pped = prepare_data_pipeline.fit_transform(X_train_cpy, Y_train_cpy)
    # print('Nº features antes de la seleccion: ', X_train_pped.shape[1])
    
    # seleccionar features más importantes mediante Random Forest
    # rf = RandomForestRegressor(random_state=42)
    # rf_reg = rf.fit(X_train_pped, Y_train_cpy)
    # sel = SelectFromModel(rf_reg, prefit=True, threshold='1.75*median')
    # X_train_pped = sel.transform(X_train_pped)    
    # print('Nº features después de la seleccion: ', X_train_pped.shape[1])
    
    # creamos nuestro scorer basado en el CAPE
    cape_scorer = make_scorer(metric.get_cape, greater_is_better=False)

    # Modelización: MARS utilizando py-earth
    param_grid = [{'max_degree': [1,2,3], 
                   'allow_linear': [False, True], 
                   'penalty': [0.,1.,2.,3.,4.,5.,6.],
                   # 'endspan_alpha': list(np.arange(0,1,0.1)),
                   #'minspan_alpha': list(np.arange(0,1,0.1))
                  }]
    
    tscv = TimeSeriesSplit(n_splits=7)
    grid_search = GridSearchCV(
        Earth(), 
        param_grid, 
        cv=tscv,
        scoring=cape_scorer,
        n_jobs=-1,
    )

    grid_search.fit(X_train_pped, Y_train_cpy)
    
    # Reentrenamos sin validación cruzada utilizando los mejores 
    # parámetros obtenidos con la validación cruzada
    
    mars = Earth(max_degree=grid_search.best_params_['max_degree'],
                 allow_linear=grid_search.best_params_['allow_linear'],
                 penalty=grid_search.best_params_['penalty'],
                 # endspan=grid_search.best_params_['endspan'],
                 # minspan=grid_search.best_params_['minspan']
                )
    
    #ttreg = TransformedTargetRegressor(regressor=mars, transformer=StandardScaler(), check_inverse=False)
    
    mars.fit(X_train_pped, Y_train_cpy)

    # evaluación sobre el conjunto de test
    X_test_pped = prepare_data_pipeline.transform(X_test_cpy)
    # X_test_pped = sel.transform(X_test_pped)
    predictions = mars.predict(X_test_pped)
    
    # generamos matriz de predicciones (ID,Production)
    pred_matrix = np.stack((np.array(ID_test).astype(int), predictions), axis=-1)
    df_pred = pd.DataFrame(data=pred_matrix, columns=['ID','Production'])

    # añadimos las predicciones al dataframe final que contendrá las de todas las WF
    submission_df = submission_df.append(df_pred, ignore_index=True) 
    print('R2 for {} is {}'.format(WF, mars.rsq_))
    print('Predictions for {} has been added to submission_df'.format(WF))
    
    scores_list.append(mars.rsq_)
    
global_score = np.mean(scores_list)

print('********************************************')
print('Global score: ', global_score)
print('********************************************')


# generamos fichero csv para el submission
submission_df.to_csv("../../TFM/models/submission_mars.csv", index=False, sep=",") 

Nº features después de la seleccion:  4
R2 for WF1 is 0.8236895292320006
Predictions for WF1 has been added to submission_df
Nº features después de la seleccion:  4
R2 for WF2 is 0.8754899687597073
Predictions for WF2 has been added to submission_df
Nº features después de la seleccion:  4
R2 for WF3 is 0.841930576034218
Predictions for WF3 has been added to submission_df
Nº features después de la seleccion:  4
R2 for WF4 is 0.88094477423363
Predictions for WF4 has been added to submission_df
Nº features después de la seleccion:  4
R2 for WF5 is 0.8943742198669061
Predictions for WF5 has been added to submission_df
Nº features después de la seleccion:  4
R2 for WF6 is 0.8615119003145766
Predictions for WF6 has been added to submission_df
********************************************
Global score:  0.8629901614068398
********************************************
