In [1]:
%load_ext autoreload
%autoreload 2

# Librerias comunes
import numpy as np
import pandas as pd
import os
import datetime as dt

from src.functions import data_import as dimp
from src.functions import data_exploration as dexp
from src.functions import data_transformation as dtr

# gráficas
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import plotly as pty
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
cf.set_config_file(offline=True)

import re
from collections import OrderedDict

# Where figures are saved: <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# Create the folder up front; exist_ok makes re-running this cell harmless.
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under ``IMAGES_PATH``.

    Parameters
    ----------
    fig_id : str
        File name without extension; also echoed in the progress message.
    tight_layout : bool
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str
        Used both as the file extension and as the ``format`` given to savefig.
    resolution : int
        Passed to savefig as ``dpi``.
    """
    file_name = fig_id + "." + fig_extension
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

# Ignore unhelpful warnings (SciPy issue #5998): any warning whose message
# starts with "internal gelsd" is suppressed.
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")


# Algunas pruebas con diferentes modelos

Para establecer un punto de referencia o *benchmark*, probaremos distintos modelos sin profundizar mucho en el ajuste (*tuning*) de los hiperparámetros. El objetivo es mejorar paulatinamente el rendimiento de los modelos modificando las distintas fases del proceso (EDA, *feature engineering*, *hyperparameter tuning*, ...).

Elegiremos para conformar nuestro set de entrenamiento:
* Un solo NWP.
* Run de las 00h.
* Día D-1.
* Variables meteorológicas de partida `time`, `U`, `V` y `T`.

In [9]:
# Load data
# The raw-data folder can be overridden with the TFM_DATA_PATH environment
# variable so the notebook also runs outside the original author's machine;
# when the variable is not set the original location is used.
DATA_PATH = os.environ.get('TFM_DATA_PATH', 'D:/Master/Asignaturas/TFM/Git/TFM/data/raw/')
X_train = dimp.import_data(os.path.join(DATA_PATH, 'X_train_v2.csv'))
X_test = dimp.import_data(os.path.join(DATA_PATH, 'X_test_v2.csv'))
Y_train = dimp.import_data(os.path.join(DATA_PATH, 'Y_train.csv'))

# Parse the 'Time' column (day/month/year hour:minute) into real datetimes;
# the time-based interpolation used later needs a datetime index.
TIME_FORMAT = '%d/%m/%Y %H:%M'
X_train['Time'] = pd.to_datetime(X_train['Time'], format=TIME_FORMAT)
X_test['Time'] = pd.to_datetime(X_test['Time'], format=TIME_FORMAT)

Memory usage of dataframe is 29.94 MB
Memory usage after optimization is: 7.72 MB
Decreased by 74.2%
Memory usage of dataframe is 29.26 MB



invalid value encountered in less


invalid value encountered in less



Memory usage after optimization is: 19.47 MB
Decreased by 33.5%
Memory usage of dataframe is 0.57 MB
Memory usage after optimization is: 0.21 MB
Decreased by 62.5%


In [32]:
X_train.head()

Unnamed: 0,Time,ID,WF,NWP1_00h_D-2_U,NWP1_00h_D-2_V,NWP1_00h_D-2_T,NWP1_06h_D-2_U,NWP1_06h_D-2_V,NWP1_06h_D-2_T,NWP1_12h_D-2_U,...,NWP3_U,NWP3_V,NWP3_T,NWP4_U,NWP4_V,NWP4_CLCT,U,V,T,CLCT
0,2018-05-01 01:00:00,1,WF1,,,,,,,,...,-1.149414,-2.275391,286.0,1.254883,-0.289795,82.5625,0.117188,-2.041199,286.25,82.5625
1,2018-05-01 02:00:00,2,WF1,,,,,,,,...,-1.149414,-2.275391,286.0,2.490234,-0.41333,100.0,0.379639,-1.619202,286.125,100.0
2,2018-05-01 03:00:00,3,WF1,,,,,,,,...,-1.149414,-2.275391,286.0,0.99707,-1.415039,98.375,1.456055,-2.273193,285.875,98.375
3,2018-05-01 04:00:00,4,WF1,,,,,,,,...,-0.519206,-2.721354,285.666667,0.689453,-0.961426,94.875,1.763916,-3.171183,285.208333,94.875
4,2018-05-01 05:00:00,5,WF1,,,,,,,,...,0.111003,-3.167318,285.333333,0.291016,-0.294922,95.875,1.98999,-3.03304,284.916667,95.875


In [33]:
X_test.head()

Unnamed: 0,Time,ID,WF,NWP1_00h_D-2_U,NWP1_00h_D-2_V,NWP1_00h_D-2_T,NWP1_06h_D-2_U,NWP1_06h_D-2_V,NWP1_06h_D-2_T,NWP1_12h_D-2_U,...,NWP3_U,NWP3_V,NWP3_T,NWP4_U,NWP4_V,NWP4_CLCT,U,V,T,CLCT
0,2019-01-16 01:00:00,37376,WF1,-4.550781,-1.514648,279.0,-3.048828,-2.640625,278.0,-3.363281,...,-3.904297,-0.499268,281.25,-0.70166,-1.819336,-1.6e-05,-2.29895,-0.775757,280.25,-1.6e-05
1,2019-01-16 02:00:00,37377,WF1,-5.792969,0.418701,279.5,-3.115234,-0.492188,277.75,-3.640625,...,-3.904297,-0.499268,281.25,-1.011719,-1.740234,-1.6e-05,-2.509277,-0.339233,280.375,-1.6e-05
2,2019-01-16 03:00:00,37378,WF1,-5.980469,1.007812,280.25,-2.958984,1.196289,277.75,-3.666016,...,-3.904297,-0.499268,281.25,-1.192383,-1.71582,-1.6e-05,-2.786865,-0.178101,280.75,-1.6e-05
3,2019-01-16 04:00:00,37379,WF1,-6.167969,0.498291,280.75,-2.552734,2.375,278.0,-3.853516,...,-4.34375,-0.31189,281.25,-1.756836,-1.636719,7.855469,-3.447835,-0.23524,281.0,7.855469
4,2019-01-16 05:00:00,37380,WF1,-6.917969,0.222778,281.0,-2.736328,2.484375,278.5,-4.523438,...,-4.783203,-0.124512,281.25,-2.044922,-1.605469,-1.6e-05,-4.126628,-0.324346,281.0,-1.6e-05


In [10]:
def input_missing_values(df, cols):
    """Fill the consolidated ``NWP<k>_<var>`` columns from individual forecast columns.

    Every name in ``cols`` that looks like ``NWP<k>_<run>h_<day>_<var>``
    (for example ``NWP1_00h_D-1_U``) is used to fill the missing values of the
    already existing column ``NWP<k>_<var>``. Columns are visited from last to
    first, so values from columns later in ``cols`` take precedence and earlier
    columns only fill what is still missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the forecast columns and the ``NWP<k>_<var>``
        target columns. It is modified in place.
    cols : sequence of str
        Candidate source column names. Names that do not match the forecast
        pattern, or whose variable is not listed for that NWP, are skipped.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.

    Raises
    ------
    KeyError
        If a matching source column or its ``NWP<k>_<var>`` target is not in ``df``.
    """
    # Raw string: "\d", "\W" and "\w" are not valid escapes in a normal literal.
    pattern = re.compile(
        r'NWP(?P<NWP>\d{1})_(?P<run>\d{2}h)_(?P<fc_day>D\W?\d?)_(?P<weather_var>\w{1,4})'
    )

    # Weather variables available for each NWP model.
    nwp_met_vars = {
        '1': ['U', 'V', 'T'],
        '2': ['U', 'V'],
        '3': ['U', 'V', 'T'],
        '4': ['U', 'V', 'CLCT']
    }

    for col in reversed(cols):
        found = pattern.match(col) if isinstance(col, str) else None
        if found is None:
            # Not a forecast column (e.g. ID, WF, Time): nothing to contribute.
            continue

        nwp_id = found.group('NWP')
        weather_var = found.group('weather_var')
        if weather_var not in nwp_met_vars.get(nwp_id, ()):
            continue

        source = found.group(0)  # the matched "NWP<k>_<run>_<day>_<var>" text
        target = 'NWP' + nwp_id + '_' + weather_var
        df[target] = df[target].fillna(df[source])

    return df

## Regresión con KNN

In [21]:
# Automated pre-processing, modelling, validation and prediction: one KNN model per wind farm (WF)
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from src.functions import metric
from sklearn.model_selection import cross_val_score
# make_scorer is part of the public sklearn.metrics API; the private
# sklearn.metrics.scorer module is not available in newer scikit-learn.
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
try:
    import joblib
except ImportError:  # fall back to the copy bundled with old scikit-learn
    from sklearn.externals import joblib
from sklearn.neighbors import KNeighborsRegressor
import re
from collections import OrderedDict


# Consolidated per-NWP columns (no run / day in the name) that
# input_missing_values() fills from the individual forecast columns.
new_cols = ['NWP1_U','NWP1_V','NWP1_T','NWP2_U',
            'NWP2_V','NWP3_U','NWP3_V','NWP3_T',
            'NWP4_U','NWP4_V','NWP4_CLCT']

# Consolidated columns that additionally get time-based interpolation.
col_list = ['NWP2_U','NWP2_V','NWP3_U','NWP3_V','NWP3_T']


def build_wf_features(wf_rows, interp_limit, wind_models, add_inv_T=False):
    """Build the model input frame for the rows of a single wind farm.

    Steps: add the all-NaN ``new_cols``, fill them with
    ``input_missing_values``, interpolate ``col_list`` over time, then derive
    ``U``/``V`` (mean over ``wind_models``), ``T`` (mean of NWP1 and NWP3),
    ``CLCT`` (NWP4) and optionally ``inv_T`` (1 / T).

    Parameters
    ----------
    wf_rows : pandas.DataFrame
        Rows of X_train or X_test for one WF. Not modified (a copy is used).
    interp_limit : int
        ``limit`` passed to the time interpolation.
    wind_models : sequence of str
        NWP prefixes (e.g. ``'NWP1'``) whose U and V are averaged.
    add_inv_T : bool
        When true an ``inv_T`` column is appended.

    Returns
    -------
    pandas.DataFrame
        Columns ``Time, U, V, T, CLCT`` (plus ``inv_T`` when requested) with a
        fresh 0..n-1 index.
    """
    # Explicit copy: avoids writing new columns onto a filtered slice.
    df = wf_rows.copy()
    for col in new_cols:
        df[col] = np.nan

    # Forecast columns are everything between the first three columns and the
    # consolidated columns just appended at the end.
    forecast_cols = df.columns[3:-len(new_cols)]
    df = input_missing_values(df, forecast_cols)

    # method='time' interpolation needs the timestamps as index.
    df = df.set_index('Time')
    for var in col_list:
        # Assign the result back instead of a chained inplace=True call, which
        # is not guaranteed to write through to the frame.
        df[var] = df[var].interpolate(
            method='time',
            limit=interp_limit,
            limit_direction='both'
        )
    df = df.reset_index()

    for component in ('U', 'V'):
        total = df[wind_models[0] + '_' + component]
        for model in wind_models[1:]:
            total = total + df[model + '_' + component]
        df[component] = total / len(wind_models)
    df['T'] = (df['NWP1_T'] + df['NWP3_T']) / 2
    df['CLCT'] = df['NWP4_CLCT']

    selected = ['Time', 'U', 'V', 'T', 'CLCT']
    if add_inv_T:
        df['inv_T'] = 1 / df['T']
        selected.append('inv_T')
    return df[selected]


# List of WFs
WF_lst = X_train['WF'].unique()

# One entry per WF with what joblib.dump returns for the saved model.
# NOTE(review): joblib.dump returns the written file names, not the estimator,
# so this list holds file names - confirm that this is what is wanted.
models = []

# Data frame with the predictions (ID, Production) of every WF
submission_df = pd.DataFrame([], columns=['ID','Production']) 
wf_predictions = []

# Scorer based on the CAPE metric (lower is better, hence the sign flip)
cape_scorer = make_scorer(metric.get_cape, greater_is_better=False)

param_grid = [
    {
        'n_neighbors': list(range(1,30)),
        'algorithm':['auto', 'kd_tree'],
        'weights': ['uniform','distance'],
        'p': [1,2]
    }
]

for WF in WF_lst:
    # Rows of this WF only; X_train / X_test themselves are never modified
    train_rows = X_train[X_train['WF'] == WF]
    test_rows = X_test[X_test['WF'] == WF]

    # Observation identifiers
    ID_train = train_rows['ID']
    ID_test = test_rows['ID']

    # Matching targets. Assumes row i of Y_train holds ID i + 1 - TODO confirm.
    Y_train_cpy = Y_train['Production'].loc[ID_train.values - 1]

    # Raw-data pre-processing.
    # NOTE(review): train averages four NWPs for U/V while test averages three
    # (no NWP4) - kept as in the original, confirm this asymmetry is intended.
    X_train_cpy = build_wf_features(
        train_rows, interp_limit=2,
        wind_models=('NWP1', 'NWP2', 'NWP3', 'NWP4'), add_inv_T=True)
    X_test_cpy = build_wf_features(
        test_rows, interp_limit=100,
        wind_models=('NWP1', 'NWP2', 'NWP3'), add_inv_T=True)

    # Drop the raw columns after the derived attributes have been added
    pre_process = ColumnTransformer(remainder = 'passthrough',
                                    transformers = [('drop_columns', 'drop', 
                                                     ['Time','U','V','T','CLCT'])]
    )

    prepare_data_pipeline = Pipeline(steps=[
        ('attr_adder', dtr.DerivedAttributesAdder(add_time_feat=False, add_cyclic_feat=False)), 
        ('pre_processing', pre_process),
        ('std_scaler', StandardScaler())
    ])

    # Fit the pipeline on the training rows only
    X_train_pped = prepare_data_pipeline.fit_transform(X_train_cpy)

    # Modelling: KNN with 5-fold grid search
    knn_reg = KNeighborsRegressor()
    grid_search_knn = GridSearchCV(
        knn_reg, 
        param_grid, 
        cv=5,
        n_jobs=-1,
        scoring=cape_scorer
    )

    grid_search_knn.fit(X_train_pped, Y_train_cpy)
    final_model = grid_search_knn.best_estimator_
    
    # Persist the model for this WF
    models.append(joblib.dump(final_model, WF + '_knn'))

    # Predict on the test rows with the already fitted pipeline
    X_test_pped = prepare_data_pipeline.transform(X_test_cpy)
    predictions = final_model.predict(X_test_pped)
    
    # (ID, Production) matrix; stacking turns the IDs into floats, so the ID
    # column is cast back to int to keep the submission file free of "123.0".
    pred_matrix = np.stack((np.array(ID_test).astype(int), predictions), axis=-1)
    df_pred = pd.DataFrame(data=pred_matrix, columns=['ID','Production']).astype({'ID': int})

    # Collected here and concatenated once after the loop
    wf_predictions.append(df_pred)
    print('CAPE for {} is {}'.format(WF, -grid_search_knn.best_score_))
    print('Predictions for {} has been added to submission_df'.format(WF))


# Single concatenation instead of growing the frame inside the loop
if wf_predictions:
    submission_df = pd.concat(wf_predictions, ignore_index=True)

# Write the submission csv
submission_df.to_csv("../../TFM/models/submission_knn.csv", index=False, sep=",") 

CAPE for WF1 is 43.28328915661583
Predictions for WF1 has been added to submission_df
CAPE for WF2 is 35.6829080057921
Predictions for WF2 has been added to submission_df
CAPE for WF3 is 38.011376408521606
Predictions for WF3 has been added to submission_df
CAPE for WF4 is 33.62318687507031
Predictions for WF4 has been added to submission_df
CAPE for WF5 is 36.27550335648889
Predictions for WF5 has been added to submission_df
CAPE for WF6 is 33.99157114478972
Predictions for WF6 has been added to submission_df


In [12]:
grid_search_knn.best_params_

{'n_neighbors': 21, 'p': 1, 'weights': 'uniform'}

## Prueba 1: Regresión polinomial con regularización *ridge*

In [None]:
# Automated pre-processing, modelling, validation and prediction: polynomial ridge regression per wind farm (WF)
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from src.functions import metric
from sklearn.model_selection import cross_val_score
# make_scorer is part of the public sklearn.metrics API; the private
# sklearn.metrics.scorer module is not available in newer scikit-learn.
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
try:
    import joblib
except ImportError:  # fall back to the copy bundled with old scikit-learn
    from sklearn.externals import joblib
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model as lm
from sklearn.linear_model import RidgeCV

# Raw columns used by this experiment (NWP1, 00h run, day D-1) and the short
# names they are renamed to.
nwp1_cols = ['Time','NWP1_00h_D-1_U','NWP1_00h_D-1_V','NWP1_00h_D-1_T']
nwp1_renames = dict(zip(nwp1_cols, ['time', 'U', 'V', 'T']))

# List of WFs
WF_lst = X_train['WF'].unique()

# One entry per WF with what joblib.dump returns for the saved model.
# NOTE(review): joblib.dump returns the written file names, not the estimator.
models = []

# Data frame with the predictions (ID, Production) of every WF
submission_df = pd.DataFrame([], columns=['ID','Production']) 
wf_predictions = []

# Scorer based on the CAPE metric. It is not used by RidgeCV below, which
# selects alpha with its own default criterion.
cape_scorer = make_scorer(metric.get_cape, greater_is_better=False)

for WF in WF_lst:
    # Rows of this WF only; X_train / X_test themselves are never modified
    train_rows = X_train[X_train['WF'] == WF]
    test_rows = X_test[X_test['WF'] == WF]

    # Observation identifiers
    ID_train = train_rows['ID']
    ID_test = test_rows['ID']

    # Matching targets. Assumes row i of Y_train holds ID i + 1 - TODO confirm.
    Y_train_cpy = Y_train['Production'].loc[ID_train.values - 1]

    # Raw-data pre-processing: keep the four columns and rename them.
    # rename() without inplace returns a new frame, so no slice is mutated.
    X_train_cpy = train_rows[nwp1_cols].rename(columns=nwp1_renames)
    X_test_cpy = test_rows[nwp1_cols].rename(columns=nwp1_renames)

    # Data cleaning: drop rows with missing values. The masks are positional,
    # so features, targets and IDs stay aligned whatever their index labels.
    train_complete = X_train_cpy.notna().all(axis=1).values
    X_train_cpy = X_train_cpy[train_complete]
    Y_train_cpy = Y_train_cpy[train_complete]

    # The IDs must be filtered together with the test rows; otherwise the
    # prediction vector and the ID vector end up with different lengths.
    # NOTE(review): dropped test rows get no prediction in the submission.
    test_complete = X_test_cpy.notna().all(axis=1).values
    X_test_cpy = X_test_cpy[test_complete]
    ID_test = ID_test[test_complete]

    # Drop the raw columns after the derived attributes have been added
    pre_process = ColumnTransformer(remainder = 'passthrough',
                                    transformers = [('drop_columns', 'drop',
                                                     ['time','U','V','month','hour','day_of_week'])]
    )

    prepare_data_pipeline = Pipeline(steps=[
        ('attr_adder', dtr.DerivedAttributesAdder()), 
        ('pre_processing', pre_process),
        ('std_scaler', StandardScaler())
    ])

    # Fit the pipeline on the training rows only
    X_train_pped = prepare_data_pipeline.fit_transform(X_train_cpy)

    # Modelling: degree-3 polynomial regression with ridge regularisation
    poly_features = PolynomialFeatures(degree=3, include_bias=False)
    X_train_poly = poly_features.fit_transform(X_train_pped)
    
    # Three alphas between 1e-4 and 1e-3 (the original passed a stray fourth
    # positional argument that landed on `endpoint`).
    # NOTE(review): recent scikit-learn renames store_cv_values / cv_values_ to
    # store_cv_results / cv_results_ - confirm against the installed version.
    rreg = lm.RidgeCV(alphas=np.logspace(-4, -3, num=3), store_cv_values=True)
    rreg.fit(X_train_poly, Y_train_cpy)
    
    # Persist the model for this WF
    models.append(joblib.dump(rreg, WF + '_rreg'))

    # Predict on the test rows; the fitted transformers are only applied here,
    # never re-fitted on test data.
    X_test_pped = prepare_data_pipeline.transform(X_test_cpy)
    X_test_poly = poly_features.transform(X_test_pped)
    predictions = rreg.predict(X_test_poly)
    
    # (ID, Production) matrix; stacking turns the IDs into floats, so the ID
    # column is cast back to int to keep the submission file free of "123.0".
    pred_matrix = np.stack((np.array(ID_test).astype(int), predictions), axis=-1)
    df_pred = pd.DataFrame(data=pred_matrix, columns=['ID','Production']).astype({'ID': int})

    # Collected here and concatenated once after the loop
    wf_predictions.append(df_pred)
    print('Predictions for {} has been added to submission_df'.format(WF))

    # cv_values_ holds one squared error per sample and alpha; average per
    # alpha and report the root of the smallest mean as a single RMSE figure.
    cv_rmse = np.sqrt(rreg.cv_values_.mean(axis=0).min())
    print('RMSE for {} is {}'.format(WF, cv_rmse))

# Single concatenation instead of growing the frame inside the loop
if wf_predictions:
    submission_df = pd.concat(wf_predictions, ignore_index=True)

# Write the submission csv
submission_df.to_csv("./Data/submission_rreg.csv", index=False, sep=",") 

## Prueba 2: Random Forest 

In [None]:
# Automated pre-processing, modelling, validation and prediction: one Random Forest per wind farm (WF)
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from src.functions import metric
from sklearn.model_selection import cross_val_score
# make_scorer is part of the public sklearn.metrics API; the private
# sklearn.metrics.scorer module is not available in newer scikit-learn.
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
try:
    import joblib
except ImportError:  # fall back to the copy bundled with old scikit-learn
    from sklearn.externals import joblib
import re
from collections import OrderedDict


# Consolidated per-NWP columns (no run / day in the name) that
# input_missing_values() fills from the individual forecast columns.
new_cols = ['NWP1_U','NWP1_V','NWP1_T','NWP2_U',
            'NWP2_V','NWP3_U','NWP3_V','NWP3_T',
            'NWP4_U','NWP4_V','NWP4_CLCT']

# Consolidated columns that additionally get time-based interpolation.
col_list = ['NWP2_U','NWP2_V','NWP3_U','NWP3_V','NWP3_T']


def build_wf_features(wf_rows, interp_limit, wind_models, add_inv_T=False):
    """Build the model input frame for the rows of a single wind farm.

    Steps: add the all-NaN ``new_cols``, fill them with
    ``input_missing_values``, interpolate ``col_list`` over time, then derive
    ``U``/``V`` (mean over ``wind_models``), ``T`` (mean of NWP1 and NWP3),
    ``CLCT`` (NWP4) and optionally ``inv_T`` (1 / T).

    Parameters
    ----------
    wf_rows : pandas.DataFrame
        Rows of X_train or X_test for one WF. Not modified (a copy is used).
    interp_limit : int
        ``limit`` passed to the time interpolation.
    wind_models : sequence of str
        NWP prefixes (e.g. ``'NWP1'``) whose U and V are averaged.
    add_inv_T : bool
        When true an ``inv_T`` column is appended.

    Returns
    -------
    pandas.DataFrame
        Columns ``Time, U, V, T, CLCT`` (plus ``inv_T`` when requested) with a
        fresh 0..n-1 index.
    """
    # Explicit copy: avoids writing new columns onto a filtered slice.
    df = wf_rows.copy()
    for col in new_cols:
        df[col] = np.nan

    # Forecast columns are everything between the first three columns and the
    # consolidated columns just appended at the end.
    forecast_cols = df.columns[3:-len(new_cols)]
    df = input_missing_values(df, forecast_cols)

    # method='time' interpolation needs the timestamps as index.
    df = df.set_index('Time')
    for var in col_list:
        # Assign the result back instead of a chained inplace=True call, which
        # is not guaranteed to write through to the frame.
        df[var] = df[var].interpolate(
            method='time',
            limit=interp_limit,
            limit_direction='both'
        )
    df = df.reset_index()

    for component in ('U', 'V'):
        total = df[wind_models[0] + '_' + component]
        for model in wind_models[1:]:
            total = total + df[model + '_' + component]
        df[component] = total / len(wind_models)
    df['T'] = (df['NWP1_T'] + df['NWP3_T']) / 2
    df['CLCT'] = df['NWP4_CLCT']

    selected = ['Time', 'U', 'V', 'T', 'CLCT']
    if add_inv_T:
        df['inv_T'] = 1 / df['T']
        selected.append('inv_T')
    return df[selected]


# List of WFs
WF_lst = X_train['WF'].unique()

# Placeholder for the per-WF models (nothing is persisted in this experiment)
models = []

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 5)]
# Number of features to consider at every split. 1.0 means "all features",
# which is what 'auto' selected for a regressor; newer scikit-learn rejects
# the 'auto' spelling.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Hyper-parameter grid.
# NOTE(review): despite its name this grid is searched exhaustively below
# (5 * 2 * 6 * 3 * 3 * 2 = 1080 combinations, times 5 folds, per WF); a
# randomized search may have been intended - confirm.
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Data frame with the predictions (ID, Production) of every WF
submission_df = pd.DataFrame([], columns=['ID','Production']) 
wf_predictions = []

# Scorer based on the CAPE metric (lower is better, hence the sign flip)
cape_scorer = make_scorer(metric.get_cape, greater_is_better=False)

for WF in WF_lst:
    # Rows of this WF only; X_train / X_test themselves are never modified
    train_rows = X_train[X_train['WF'] == WF]
    test_rows = X_test[X_test['WF'] == WF]

    # Observation identifiers
    ID_train = train_rows['ID']
    ID_test = test_rows['ID']

    # Matching targets. Assumes row i of Y_train holds ID i + 1 - TODO confirm.
    Y_train_cpy = Y_train['Production'].loc[ID_train.values - 1]

    # Raw-data pre-processing.
    # NOTE(review): train averages four NWPs for U/V while test averages three
    # (no NWP4) - kept as in the original, confirm this asymmetry is intended.
    X_train_cpy = build_wf_features(
        train_rows, interp_limit=2,
        wind_models=('NWP1', 'NWP2', 'NWP3', 'NWP4'))
    X_test_cpy = build_wf_features(
        test_rows, interp_limit=100,
        wind_models=('NWP1', 'NWP2', 'NWP3'))

    # Drop the raw columns after the derived attributes have been added
    pre_process = ColumnTransformer(remainder = 'passthrough',
                                    transformers = [('drop_columns', 'drop', 
                                                     ['Time','U','V'])]
    )

    prepare_data_pipeline = Pipeline(steps=[
        ('attr_adder', dtr.DerivedAttributesAdder(add_time_feat=True, add_cyclic_feat=False)), 
        ('pre_processing', pre_process),
        ('std_scaler', StandardScaler())
    ])

    # Fit the pipeline on the training rows only
    X_train_pped = prepare_data_pipeline.fit_transform(X_train_cpy)

    # Modelling: Random Forest, 5-fold cross validation over random_grid.
    # A fixed random_state makes the fitted forests repeatable between runs.
    forest_reg = RandomForestRegressor(random_state=42)
    grid_search = GridSearchCV(
        forest_reg, 
        random_grid, 
        cv=5,
        scoring=cape_scorer,
        n_jobs=-1
    )

    grid_search.fit(X_train_pped, Y_train_cpy)
    final_model = grid_search.best_estimator_

    # Predict on the test rows with the already fitted pipeline
    X_test_pped = prepare_data_pipeline.transform(X_test_cpy)
    predictions = final_model.predict(X_test_pped)

    # (ID, Production) matrix; stacking turns the IDs into floats, so the ID
    # column is cast back to int to keep the submission file free of "123.0".
    pred_matrix = np.stack((np.array(ID_test).astype(int), predictions), axis=-1)
    df_pred = pd.DataFrame(data=pred_matrix, columns=['ID','Production']).astype({'ID': int})

    # Collected here and concatenated once after the loop
    wf_predictions.append(df_pred)
    print('Best score for {}: {}'.format(WF, -grid_search.best_score_))
    print('Predictions for {} has been added to submission_df'.format(WF))
    print('--------')

# Single concatenation instead of growing the frame inside the loop
if wf_predictions:
    submission_df = pd.concat(wf_predictions, ignore_index=True)

# Write the submission csv
submission_df.to_csv("../../TFM/models/submission_RF.csv", index=False, sep=",") 

In [131]:
pred_matrix

array([[6.77150000e+04, 8.24496733e-02],
       [6.77160000e+04, 9.10754914e-02],
       [6.77170000e+04, 2.03519919e-02],
       ...,
       [7.39020000e+04, 2.40697737e-01],
       [7.39030000e+04, 1.21634805e-01],
       [7.39040000e+04, 2.31507012e-01]])

## Prueba 3: SVM 

In [None]:
# Automated pre-processing, modelling, validation and prediction: one SVR model per wind farm (WF)
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from src.functions import metric
from sklearn.model_selection import cross_val_score
# make_scorer is part of the public sklearn.metrics API; the private
# sklearn.metrics.scorer module is not available in newer scikit-learn.
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
try:
    import joblib
except ImportError:  # fall back to the copy bundled with old scikit-learn
    from sklearn.externals import joblib
import re
from collections import OrderedDict


# Consolidated per-NWP columns (no run / day in the name) that
# input_missing_values() fills from the individual forecast columns.
new_cols = ['NWP1_U','NWP1_V','NWP1_T','NWP2_U',
            'NWP2_V','NWP3_U','NWP3_V','NWP3_T',
            'NWP4_U','NWP4_V','NWP4_CLCT']

# Consolidated columns that additionally get time-based interpolation.
col_list = ['NWP2_U','NWP2_V','NWP3_U','NWP3_V','NWP3_T']


def build_wf_features(wf_rows, interp_limit, wind_models, add_inv_T=False):
    """Build the model input frame for the rows of a single wind farm.

    Steps: add the all-NaN ``new_cols``, fill them with
    ``input_missing_values``, interpolate ``col_list`` over time, then derive
    ``U``/``V`` (mean over ``wind_models``), ``T`` (mean of NWP1 and NWP3),
    ``CLCT`` (NWP4) and optionally ``inv_T`` (1 / T).

    Parameters
    ----------
    wf_rows : pandas.DataFrame
        Rows of X_train or X_test for one WF. Not modified (a copy is used).
    interp_limit : int
        ``limit`` passed to the time interpolation.
    wind_models : sequence of str
        NWP prefixes (e.g. ``'NWP1'``) whose U and V are averaged.
    add_inv_T : bool
        When true an ``inv_T`` column is appended.

    Returns
    -------
    pandas.DataFrame
        Columns ``Time, U, V, T, CLCT`` (plus ``inv_T`` when requested) with a
        fresh 0..n-1 index.
    """
    # Explicit copy: avoids writing new columns onto a filtered slice.
    df = wf_rows.copy()
    for col in new_cols:
        df[col] = np.nan

    # Forecast columns are everything between the first three columns and the
    # consolidated columns just appended at the end.
    forecast_cols = df.columns[3:-len(new_cols)]
    df = input_missing_values(df, forecast_cols)

    # method='time' interpolation needs the timestamps as index.
    df = df.set_index('Time')
    for var in col_list:
        # Assign the result back instead of a chained inplace=True call, which
        # is not guaranteed to write through to the frame.
        df[var] = df[var].interpolate(
            method='time',
            limit=interp_limit,
            limit_direction='both'
        )
    df = df.reset_index()

    for component in ('U', 'V'):
        total = df[wind_models[0] + '_' + component]
        for model in wind_models[1:]:
            total = total + df[model + '_' + component]
        df[component] = total / len(wind_models)
    df['T'] = (df['NWP1_T'] + df['NWP3_T']) / 2
    df['CLCT'] = df['NWP4_CLCT']

    selected = ['Time', 'U', 'V', 'T', 'CLCT']
    if add_inv_T:
        df['inv_T'] = 1 / df['T']
        selected.append('inv_T')
    return df[selected]


# List of WFs
WF_lst = X_train['WF'].unique()

# Placeholder for the per-WF models (nothing is persisted in this experiment)
models = []

# Data frame with the predictions (ID, Production) of every WF
submission_df = pd.DataFrame([], columns=['ID','Production']) 
wf_predictions = []

# Scorer based on the CAPE metric (lower is better, hence the sign flip)
cape_scorer = make_scorer(metric.get_cape, greater_is_better=False)

# Hyper-parameter grid explored by the grid search
param_grid = [
    {
        'kernel': ('linear', 'rbf','poly'), 
        'C':[0.001, 0.1, 10],
        'gamma': [0.001, 0.1, 10]
    }
]

for WF in WF_lst:
    # Rows of this WF only; X_train / X_test themselves are never modified
    train_rows = X_train[X_train['WF'] == WF]
    test_rows = X_test[X_test['WF'] == WF]

    # Observation identifiers
    ID_train = train_rows['ID']
    ID_test = test_rows['ID']

    # Matching targets. Assumes row i of Y_train holds ID i + 1 - TODO confirm.
    Y_train_cpy = Y_train['Production'].loc[ID_train.values - 1]

    # Raw-data pre-processing. The original cell also computed inv_T but then
    # discarded it when selecting columns, so it is not requested here.
    # NOTE(review): train averages four NWPs for U/V while test averages three
    # (no NWP4) - kept as in the original, confirm this asymmetry is intended.
    X_train_cpy = build_wf_features(
        train_rows, interp_limit=2,
        wind_models=('NWP1', 'NWP2', 'NWP3', 'NWP4'))
    X_test_cpy = build_wf_features(
        test_rows, interp_limit=100,
        wind_models=('NWP1', 'NWP2', 'NWP3'))

    # Drop the raw columns after the derived attributes have been added
    pre_process = ColumnTransformer(remainder = 'passthrough',
                                    transformers = [('drop_columns', 'drop', 
                                                     ['Time','U','V','T'])]
    )

    prepare_data_pipeline = Pipeline(steps=[
        ('attr_adder', dtr.DerivedAttributesAdder(add_time_feat=False, add_cyclic_feat=False)), 
        ('pre_processing', pre_process),
        ('std_scaler', StandardScaler())
    ])

    # Fit the pipeline on the training rows only
    X_train_pped = prepare_data_pipeline.fit_transform(X_train_cpy)

    # Modelling: support vector regression, 5-fold grid search over param_grid
    svm_reg = SVR()
    grid_search_svm = GridSearchCV(
        svm_reg, 
        param_grid, 
        cv=5,
        scoring=cape_scorer,
        n_jobs=-1
    )

    grid_search_svm.fit(X_train_pped, Y_train_cpy)
    final_model = grid_search_svm.best_estimator_

    # Predict on the test rows with the already fitted pipeline
    X_test_pped = prepare_data_pipeline.transform(X_test_cpy)
    predictions = final_model.predict(X_test_pped)

    # (ID, Production) matrix; stacking turns the IDs into floats, so the ID
    # column is cast back to int to keep the submission file free of "123.0".
    pred_matrix = np.stack((np.array(ID_test).astype(int), predictions), axis=-1)
    df_pred = pd.DataFrame(data=pred_matrix, columns=['ID','Production']).astype({'ID': int})

    # Collected here and concatenated once after the loop
    wf_predictions.append(df_pred)
    print('Best score for {}: {}'.format(WF, -grid_search_svm.best_score_))
    print('Predictions for {} has been added to submission_df'.format(WF))
    print('--------')
        

# Single concatenation instead of growing the frame inside the loop
if wf_predictions:
    submission_df = pd.concat(wf_predictions, ignore_index=True)

# Write the submission csv
submission_df.to_csv("../../TFM/models/submission_SVM.csv", index=False, sep=",") 

In [None]:
# NOTE(review): this only displays the bound `score` method of the last fitted
# model; nothing is evaluated because the method is not called - presumably a
# leftover, confirm whether a call such as score(X, y) was intended.
final_model.score

## Prueba 4: XGBoost

In [None]:
# Automatización de pre-procesado, modelización, validación y prediccion
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from src.functions import metric
from sklearn.model_selection import cross_val_score
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.externals import joblib

# List of wind farms (WF); one model is fitted per farm
WF_lst = X_train['WF'].unique()

# collects, per WF, what joblib.dump returns when the model is saved
models = []

# Data frame that accumulates the predictions (ID, Production) of every WF
submission_df = pd.DataFrame([], columns=['ID','Production']) 

for WF in WF_lst:
    # Select the rows of this WF first and copy only that subset. The global
    # frames stay untouched, the full data set is not duplicated on every
    # iteration, and later column assignments act on an independent frame
    # rather than on a slice (no SettingWithCopyWarning).
    X_train_cpy = X_train[X_train['WF'] == WF].copy()
    X_test_cpy = X_test[X_test['WF'] == WF].copy()

    # Observation identifiers
    ID_train = X_train_cpy['ID']
    ID_test = X_test_cpy['ID']

    # Target values of this WF, in the same row order as X_train_cpy.
    # NOTE(review): assumes Y_train is labelled so that label == ID - 1 — confirm.
    Y_train_cpy = Y_train['Production'].loc[ID_train.values - 1]

    # Pre-processing of the raw data

    # We'll add new columns NWPX_<met_var> without missing values
    # new cols: NWP1_U, NWP1_V, NWP1_T, NWP2_U, NWP2_V, NWP3_U, NWP3_V, NWP3_T, NWP4_U, NWP4_V, NWP4_CLCT
    
    new_cols = ['NWP1_U','NWP1_V','NWP1_T','NWP2_U',
                'NWP2_V','NWP3_U','NWP3_V','NWP3_T',
                'NWP4_U','NWP4_V','NWP4_CLCT']

    # NOTE(review): this helper is re-defined on every loop iteration.
    def add_new_cols(new_cols, df):
        """Add every name in `new_cols` to `df` (in place) as an all-NaN column."""
        for col in new_cols:
            df[col] = np.nan    
            
    add_new_cols(new_cols, X_train_cpy)
    add_new_cols(new_cols, X_test_cpy)
    
    # Columns handed to the imputation helper: everything except the first three
    # columns and the 11 placeholder columns appended above.
    cols_train = X_train_cpy.columns[3:-11]
    # NOTE(review): cols_test is computed but not used in this cell; the test frame
    # below is processed with cols_train.
    cols_test = X_test_cpy.columns[3:-11]
    # input_missing_values is defined elsewhere in the notebook; presumably it fills
    # the NWPX_<met_var> placeholders from the raw forecast columns — confirm.
    X_train_cpy = input_missing_values(X_train_cpy, cols_train)
    X_test_cpy = input_missing_values(X_test_cpy, cols_train)
    
    # Columns whose remaining gaps are filled by time-based interpolation
    col_list = ['NWP2_U','NWP2_V','NWP3_U','NWP3_V','NWP3_T']

    # method='time' needs the timestamps as index. The interpolated columns are
    # assigned back explicitly: interpolate(inplace=True) on `df[col]` operates on
    # a temporary Series and is not guaranteed to write through to the frame.
    X_train_cpy = X_train_cpy.set_index('Time')
    X_train_cpy[col_list] = X_train_cpy[col_list].interpolate(
        method='time',
        limit=2,
        limit_direction='both'
    )
    # 'Time' goes back to being a regular (first) column
    X_train_cpy = X_train_cpy.reset_index()

    # Same treatment for the test set, but with a much larger gap limit (100 vs 2)
    X_test_cpy = X_test_cpy.set_index('Time')
    X_test_cpy[col_list] = X_test_cpy[col_list].interpolate(
        method='time',
        limit=100,
        limit_direction='both'
    )
    X_test_cpy = X_test_cpy.reset_index()

    # Consolidated features: one value per meteorological variable, averaged over NWPs.
    # NOTE(review): train averages U and V over four NWPs (NWP1-NWP4) while test averages
    # over three (NWP1-NWP3), so the same feature is built differently for train and
    # test — confirm this is intended.
    X_train_cpy['U'] = (X_train_cpy.NWP1_U + X_train_cpy.NWP2_U + X_train_cpy.NWP3_U + X_train_cpy.NWP4_U)/4
    X_train_cpy['V'] = (X_train_cpy.NWP1_V + X_train_cpy.NWP2_V + X_train_cpy.NWP3_V + X_train_cpy.NWP4_V)/4
    X_train_cpy['T'] = (X_train_cpy.NWP1_T + X_train_cpy.NWP3_T)/2
    X_train_cpy['CLCT'] = (X_train_cpy.NWP4_CLCT)
    # reciprocal of the averaged temperature (infinite wherever T == 0)
    X_train_cpy['inv_T'] = 1/(X_train_cpy['T'])
    
    X_test_cpy['U'] = (X_test_cpy.NWP1_U + X_test_cpy.NWP2_U + X_test_cpy.NWP3_U)/3
    X_test_cpy['V'] = (X_test_cpy.NWP1_V + X_test_cpy.NWP2_V + X_test_cpy.NWP3_V)/3
    X_test_cpy['T'] = (X_test_cpy.NWP1_T + X_test_cpy.NWP3_T)/2
    X_test_cpy['CLCT'] = (X_test_cpy.NWP4_CLCT)
    X_test_cpy['inv_T'] = 1/(X_test_cpy['T'])

    
    
    # keep only the timestamp and the consolidated features
    X_train_cpy = X_train_cpy[['Time','U','V','T','CLCT','inv_T']]
    X_test_cpy = X_test_cpy[['Time','U','V','T','CLCT','inv_T']]

    
    ## Data cleaning: handling of missing values in X_ (disabled, kept for reference)
    #only_na = X_test_cpy[~X_test_cpy.ID.isin(X_test_cpy.dropna().ID)]
    #X_test_cpy.dropna(inplace=True)
    #Y_train_cpy.drop(labels=only_na.index, axis=0, inplace=True)
    #X_train_cpy.drop(labels=only_na.index, axis=0, inplace=True)

    # Column selection: the listed columns are dropped, every other column is passed through
    pre_process = ColumnTransformer(remainder = 'passthrough',
                                    transformers = [('drop_columns', 'drop', 
                                                     ['Time','U','V','T','CLCT'])]
    )

    # Pipeline: derived attributes -> column selection -> standardisation.
    # DerivedAttributesAdder comes from src.functions.data_transformation; the columns it
    # adds are not visible from this cell.
    prepare_data_pipeline = Pipeline(steps=[
        ('attr_adder', dtr.DerivedAttributesAdder(add_time_feat=False, add_cyclic_feat=False)), 
        ('pre_processing', pre_process),
        ('std_scaler', StandardScaler())
    ])

    # fit the pipeline on the training data and transform it
    X_train_pped = prepare_data_pipeline.fit_transform(X_train_cpy)

    # CAPE-based scorer; greater_is_better=False makes the scorer return the negated metric
    cape_scorer = make_scorer(metric.get_cape, greater_is_better=False)

    # Modelling: XGBoost regressor
    # Model training with k-fold cross validation
    # Hyper-parameter search with GridSearchCV
    # (3*2*3*3*3*3 = 486 candidates, each fitted on 5 folds, for every WF)
    param_grid = [{
        'n_estimators': [400, 700, 1000],
        'colsample_bytree': [0.7, 0.8],
        'max_depth': [15,20,25],
        'reg_alpha': [1.1, 1.2, 1.3],
        'reg_lambda': [1.1, 1.2, 1.3],
        'subsample': [0.7, 0.8, 0.9]
    }]

    xgb_reg = xgb.XGBRegressor()
    grid_search_xgb = GridSearchCV(
        xgb_reg, 
        param_grid, 
        cv=5,
        scoring=cape_scorer,
        n_jobs=-1
    )

    grid_search_xgb.fit(X_train_pped, Y_train_cpy)
    # best candidate, refitted by GridSearchCV on the whole training set of this WF
    final_model = grid_search_xgb.best_estimator_

    # guardamos modelo
    models.append(joblib.dump(final_model, WF + '_xgb'))

    # evaluación sobre el conjunto de test
    X_test_pped = prepare_data_pipeline.transform(X_test_cpy)
    predictions = final_model.predict(X_test_pped)

    # generamos matriz de predicciones (ID,Production)
    pred_matrix = np.stack((np.array(ID_test).astype(int), predictions), axis=-1)
    df_pred = pd.DataFrame(data=pred_matrix, columns=['ID','Production'])

    # añadimos las predicciones al dataframe final que contendrá las de todas las WF
    submission_df = submission_df.append(df_pred, ignore_index=True) 
    print('Predictions for {} has been added to submission_df'.format(WF))
        

# generamos fichero csv para el submission
submission_df.to_csv("../../TFM/models/submission_xgb.csv", index=False, sep=",") 

## Prueba 5: Random Forest con validación Randomized Search 

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Search space for the randomized hyper-parameter search of the random forest.

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 1.0 (all features) is what 'auto' meant for a regressor; the string alias was
# deprecated and later removed from scikit-learn, the float works in old and new versions.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree (None = unlimited)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Automatización de pre-procesado, modelización, validación y prediccion
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from src.functions import metric
from sklearn.model_selection import cross_val_score
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.externals import joblib

# List of wind farms (WF); one model is fitted per farm
WF_lst = X_train['WF'].unique()

# collects, per WF, what joblib.dump returns when the model is saved
models = []

# Data frame that accumulates the predictions (ID, Production) of every WF
submission_df = pd.DataFrame([], columns=['ID','Production']) 

for WF in WF_lst:
    # Select the rows of this WF first and copy only that subset. The global
    # frames stay untouched and the full data set is not duplicated on every
    # iteration.
    X_train_cpy = X_train[X_train['WF'] == WF].copy()
    X_test_cpy = X_test[X_test['WF'] == WF].copy()

    # Observation identifiers
    ID_train = X_train_cpy['ID']
    ID_test = X_test_cpy['ID']

    # Target values of this WF, in the same row order as X_train_cpy.
    # NOTE(review): assumes Y_train is labelled so that label == ID - 1 — confirm.
    Y_train_cpy = Y_train['Production'].loc[ID_train.values - 1]

    # Raw pre-processing: keep a single NWP (NWP1, 00h run, day D-1) and give its
    # columns the short names used by the pipeline. Selecting and renaming in one
    # expression yields a new frame instead of renaming a slice in place.
    nwp_cols = {"Time": "time", "NWP1_00h_D-1_U": "U", "NWP1_00h_D-1_V": "V", "NWP1_00h_D-1_T":"T"}
    X_train_cpy = X_train_cpy[list(nwp_cols)].rename(columns=nwp_cols)
    X_test_cpy = X_test_cpy[list(nwp_cols)].rename(columns=nwp_cols)

    # Data cleaning: drop observations with missing values. The target is filtered
    # with the same positional mask, so features and target stay aligned whatever
    # their index labels are.
    train_complete = X_train_cpy.notna().all(axis=1)
    X_train_cpy = X_train_cpy[train_complete]
    Y_train_cpy = Y_train_cpy[train_complete.values]

    # The test identifiers must be filtered together with the test rows; otherwise
    # ID_test and the predictions have different lengths as soon as one test row
    # contains a missing value.
    # NOTE(review): dropped test rows get no prediction in the submission file.
    test_complete = X_test_cpy.notna().all(axis=1)
    X_test_cpy = X_test_cpy[test_complete]
    ID_test = ID_test[test_complete.values]

    # Column selection: the listed columns are dropped, every other column is passed through.
    # 'month', 'hour' and 'day_of_week' are not created in this cell; presumably they are
    # added by DerivedAttributesAdder — confirm.
    pre_process = ColumnTransformer(remainder = 'passthrough',
                                    transformers = [('drop_columns', 'drop', ['time','U','V','month','hour','day_of_week'])]
    )

    # Pipeline: derived attributes -> column selection -> standardisation
    prepare_data_pipeline = Pipeline(steps=[
        ('attr_adder', dtr.DerivedAttributesAdder()), 
        ('pre_processing', pre_process),
        ('std_scaler', StandardScaler())
    ])

    # fit the pipeline on the training data and transform it
    X_train_pped = prepare_data_pipeline.fit_transform(X_train_cpy)

    # CAPE-based scorer; greater_is_better=False makes the scorer return the negated metric
    cape_scorer = make_scorer(metric.get_cape, greater_is_better=False)

    # Modelling: Random Forest
    # Model training with k-fold cross validation
    # Hyper-parameter search with Randomized Search

    forest_reg = RandomForestRegressor()

    # 100 random candidates drawn from random_grid, each evaluated on 3 folds
    rf_random = RandomizedSearchCV(
        estimator = forest_reg, 
        param_distributions = random_grid, 
        n_iter = 100, 
        cv = 3, 
        random_state=42, 
        scoring = cape_scorer,
        n_jobs = -1
    )

    rf_random.fit(X_train_pped, Y_train_cpy)
    # best candidate, refitted by the search on the whole training set of this WF
    final_model = rf_random.best_estimator_

    # guardamos modelo
    models.append(joblib.dump(final_model, WF + 'rfrand'))

    # evaluación sobre el conjunto de test
    X_test_pped = prepare_data_pipeline.transform(X_test_cpy)
    predictions = final_model.predict(X_test_pped)

    # generamos matriz de predicciones (ID,Production)
    pred_matrix = np.stack((np.array(ID_test).astype(int), predictions), axis=-1)
    df_pred = pd.DataFrame(data=pred_matrix, columns=['ID','Production'])

    # añadimos las predicciones al dataframe final que contendrá las de todas las WF
    submission_df = submission_df.append(df_pred, ignore_index=True)  
    print('Best score for {}: {}'.format(WF, -rf_random.best_score_))
    print('Predictions for {} has been added to submission_df'.format(WF))
    print('--------')


# generamos fichero csv para el submission
submission_df.to_csv("./Data/submission_rfrand.csv", index=False, sep=",") 

## Prueba 6: Regresión con Elastic Net (Ridge + Lasso)

In [None]:
# Automatización de pre-procesado, modelización, validación y prediccion
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from src.functions import metric
from sklearn.model_selection import cross_val_score
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.externals import joblib
import re
from collections import OrderedDict


# List of wind farms (WF); one model is fitted per farm
WF_lst = X_train['WF'].unique()

# list of saved models, one per WF (disabled in this trial)
# models = []

# Data frame that accumulates the predictions (ID, Production) of every WF
submission_df = pd.DataFrame([], columns=['ID','Production']) 

for WF in WF_lst:
    # Select the rows of this WF first and copy only that subset. The global
    # frames stay untouched, the full data set is not duplicated on every
    # iteration, and later column assignments act on an independent frame
    # rather than on a slice (no SettingWithCopyWarning).
    X_train_cpy = X_train[X_train['WF'] == WF].copy()
    X_test_cpy = X_test[X_test['WF'] == WF].copy()

    # Observation identifiers
    ID_train = X_train_cpy['ID']
    ID_test = X_test_cpy['ID']

    # Target values of this WF, in the same row order as X_train_cpy.
    # NOTE(review): assumes Y_train is labelled so that label == ID - 1 — confirm.
    Y_train_cpy = Y_train['Production'].loc[ID_train.values - 1]

    # Pre-processing of the raw data

    # We'll add new columns NWPX_<met_var> without missing values
    # new cols: NWP1_U, NWP1_V, NWP1_T, NWP2_U, NWP2_V, NWP3_U, NWP3_V, NWP3_T, NWP4_U, NWP4_V, NWP4_CLCT
    
    new_cols = ['NWP1_U','NWP1_V','NWP1_T','NWP2_U',
                'NWP2_V','NWP3_U','NWP3_V','NWP3_T',
                'NWP4_U','NWP4_V','NWP4_CLCT']

    # NOTE(review): this helper is re-defined on every loop iteration.
    def add_new_cols(new_cols, df):
        """Add every name in `new_cols` to `df` (in place) as an all-NaN column."""
        for col in new_cols:
            df[col] = np.nan    
            
    add_new_cols(new_cols, X_train_cpy)
    add_new_cols(new_cols, X_test_cpy)
    
    # Columns handed to the imputation helper: everything except the first three
    # columns and the 11 placeholder columns appended above.
    cols_train = X_train_cpy.columns[3:-11]
    # NOTE(review): cols_test is computed but not used in this cell; the test frame
    # below is processed with cols_train.
    cols_test = X_test_cpy.columns[3:-11]
    # input_missing_values is defined elsewhere in the notebook; presumably it fills
    # the NWPX_<met_var> placeholders from the raw forecast columns — confirm.
    X_train_cpy = input_missing_values(X_train_cpy, cols_train)
    X_test_cpy = input_missing_values(X_test_cpy, cols_train)
    
    # Columns whose remaining gaps are filled by time-based interpolation
    col_list = ['NWP2_U','NWP2_V','NWP3_U','NWP3_V','NWP3_T']

    # method='time' needs the timestamps as index. The interpolated columns are
    # assigned back explicitly: interpolate(inplace=True) on `df[col]` operates on
    # a temporary Series and is not guaranteed to write through to the frame.
    X_train_cpy = X_train_cpy.set_index('Time')
    X_train_cpy[col_list] = X_train_cpy[col_list].interpolate(
        method='time',
        limit=2,
        limit_direction='both'
    )
    # 'Time' goes back to being a regular (first) column
    X_train_cpy = X_train_cpy.reset_index()

    # Same treatment for the test set, but with a much larger gap limit (100 vs 2)
    X_test_cpy = X_test_cpy.set_index('Time')
    X_test_cpy[col_list] = X_test_cpy[col_list].interpolate(
        method='time',
        limit=100,
        limit_direction='both'
    )
    X_test_cpy = X_test_cpy.reset_index()

    # Consolidated features: one value per meteorological variable, averaged over NWPs.
    # NOTE(review): train averages U and V over four NWPs (NWP1-NWP4) while test averages
    # over three (NWP1-NWP3), so the same feature is built differently for train and
    # test — confirm this is intended.
    X_train_cpy['U'] = (X_train_cpy.NWP1_U + X_train_cpy.NWP2_U + X_train_cpy.NWP3_U + X_train_cpy.NWP4_U)/4
    X_train_cpy['V'] = (X_train_cpy.NWP1_V + X_train_cpy.NWP2_V + X_train_cpy.NWP3_V + X_train_cpy.NWP4_V)/4
    X_train_cpy['T'] = (X_train_cpy.NWP1_T + X_train_cpy.NWP3_T)/2
    X_train_cpy['CLCT'] = (X_train_cpy.NWP4_CLCT)
    
    X_test_cpy['U'] = (X_test_cpy.NWP1_U + X_test_cpy.NWP2_U + X_test_cpy.NWP3_U)/3
    X_test_cpy['V'] = (X_test_cpy.NWP1_V + X_test_cpy.NWP2_V + X_test_cpy.NWP3_V)/3
    X_test_cpy['T'] = (X_test_cpy.NWP1_T + X_test_cpy.NWP3_T)/2
    X_test_cpy['CLCT'] = (X_test_cpy.NWP4_CLCT)

    
    
    # keep only the timestamp and the consolidated features
    X_train_cpy = X_train_cpy[['Time','U','V','T','CLCT']]
    X_test_cpy = X_test_cpy[['Time','U','V','T','CLCT']]

    
    ## Data cleaning: handling of missing values in X_ (disabled, kept for reference)
    #only_na = X_test_cpy[~X_test_cpy.ID.isin(X_test_cpy.dropna().ID)]
    #X_test_cpy.dropna(inplace=True)
    #Y_train_cpy.drop(labels=only_na.index, axis=0, inplace=True)
    #X_train_cpy.drop(labels=only_na.index, axis=0, inplace=True)

    # Column selection: 'Time', 'U' and 'V' are dropped, every other column is passed through
    pre_process = ColumnTransformer(remainder = 'passthrough',
                                    transformers = [('drop_columns', 'drop', 
                                                     ['Time','U','V'])]
    )

    # Pipeline: derived attributes -> column selection -> standardisation.
    # DerivedAttributesAdder comes from src.functions.data_transformation; the columns it
    # adds are not visible from this cell.
    prepare_data_pipeline = Pipeline(steps=[
        ('attr_adder', dtr.DerivedAttributesAdder(add_time_feat=False, add_cyclic_feat=False)), 
        ('pre_processing', pre_process),
        ('std_scaler', StandardScaler())
    ])

    # fit the pipeline on the training data and transform it
    X_train_pped = prepare_data_pipeline.fit_transform(X_train_cpy)
    # CAPE-based scorer; greater_is_better=False makes the scorer return the negated metric
    cape_scorer = make_scorer(metric.get_cape, greater_is_better=False)

    # Modelling: polynomial regression with Elastic Net regularisation
    # Model training with k-fold cross validation
    # Hyper-parameter search with GridSearchCV

    poly_features = PolynomialFeatures(degree=3, include_bias=False)
    X_train_poly = poly_features.fit_transform(X_train_pped)

    param_grid = [{
        # np.logspace(-3, -2, 1, 2, 3) was parsed as start=-3, stop=-2, num=1,
        # endpoint=2, base=3 and produced the single value 3**-3, so alpha was
        # never searched. The five numbers read as exponents of 10.
        # NOTE(review): confirm this is the intended alpha grid.
        'alpha'     : [1e-3, 1e-2, 1e1, 1e2, 1e3],
        'l1_ratio'  : [0.00, 0.25, 0.50, 0.75, 1.0],
        'tol'       : [0.00001, 0.0001, 0.001]
    }]

    eNet = ElasticNet(selection='random')
    grid_search = GridSearchCV(
        eNet,
        param_grid,
        cv=10,
        scoring=cape_scorer,
        n_jobs=-1,
        verbose=2
    )

    grid_search.fit(X_train_poly, Y_train_cpy)
    # best candidate, refitted by GridSearchCV on the whole training set of this WF
    final_model = grid_search.best_estimator_

    # save the model (disabled in this trial)
    # models.append(joblib.dump(final_model, WF + '_eNet'))

    # Predictions on the test set of THIS wind farm, inside the training loop.
    # A second loop used to rebuild the test features after training had finished
    # for all farms, so every farm was predicted with the model of the last farm
    # and with a pipeline that dropped a different set of columns. Reusing the
    # fitted pipeline and polynomial expansion guarantees matching features.
    X_test_pped = prepare_data_pipeline.transform(X_test_cpy)
    X_test_poly = poly_features.transform(X_test_pped)
    predictions = final_model.predict(X_test_poly)

    # generamos matriz de predicciones (ID,Production)
    pred_matrix = np.stack((np.array(ID_test).astype(int), predictions), axis=-1)
    df_pred = pd.DataFrame(data=pred_matrix, columns=['ID','Production'])

    # añadimos las predicciones al dataframe final que contendrá las de todas las WF
    submission_df = submission_df.append(df_pred, ignore_index=True)  
    print('Best score for {}: {}'.format(WF, -grid_search.best_score_))
    print('Predictions for {} has been added to submission_df'.format(WF))
    print('--------')

# generamos fichero csv para el submission
submission_df.to_csv("../../TFM/submission_eNet.csv", index=False, sep=",") 

## Prueba 7: Lasso (búsqueda de `alpha` con GridSearchCV)

In [16]:
# Automatización de pre-procesado, modelización, validación y prediccion
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from src.functions import metric
from sklearn.model_selection import cross_val_score
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model as lm
from sklearn.linear_model import LassoCV

# List of wind farms (WF); one model is fitted per farm
WF_lst = X_train['WF'].unique()

# collects, per WF, what joblib.dump returns when the model is saved
models = []

# Data frame that accumulates the predictions (ID, Production) of every WF
submission_df = pd.DataFrame([], columns=['ID','Production']) 



for WF in WF_lst:
    # Select the rows of this WF first and copy only that subset. The global
    # frames stay untouched, the full data set is not duplicated on every
    # iteration, and later column assignments act on an independent frame
    # rather than on a slice (no SettingWithCopyWarning).
    X_train_cpy = X_train[X_train['WF'] == WF].copy()
    X_test_cpy = X_test[X_test['WF'] == WF].copy()

    # Observation identifiers
    ID_train = X_train_cpy['ID']
    ID_test = X_test_cpy['ID']

    # Target values of this WF, in the same row order as X_train_cpy.
    # NOTE(review): assumes Y_train is labelled so that label == ID - 1 — confirm.
    Y_train_cpy = Y_train['Production'].loc[ID_train.values - 1]

    # Pre-processing of the raw data

    # We'll add new columns NWPX_<met_var> without missing values
    # new cols: NWP1_U, NWP1_V, NWP1_T, NWP2_U, NWP2_V, NWP3_U, NWP3_V, NWP3_T, NWP4_U, NWP4_V, NWP4_CLCT
    
    new_cols = ['NWP1_U','NWP1_V','NWP1_T','NWP2_U',
                'NWP2_V','NWP3_U','NWP3_V','NWP3_T',
                'NWP4_U','NWP4_V','NWP4_CLCT']

    # NOTE(review): this helper is re-defined on every loop iteration.
    def add_new_cols(new_cols, df):
        """Add every name in `new_cols` to `df` (in place) as an all-NaN column."""
        for col in new_cols:
            df[col] = np.nan    
            
    add_new_cols(new_cols, X_train_cpy)
    add_new_cols(new_cols, X_test_cpy)
    
    # Columns handed to the imputation helper: everything except the first three
    # columns and the 11 placeholder columns appended above.
    cols_train = X_train_cpy.columns[3:-11]
    # NOTE(review): cols_test is computed but not used in this cell; the test frame
    # below is processed with cols_train.
    cols_test = X_test_cpy.columns[3:-11]
    # input_missing_values is defined elsewhere in the notebook; presumably it fills
    # the NWPX_<met_var> placeholders from the raw forecast columns — confirm.
    X_train_cpy = input_missing_values(X_train_cpy, cols_train)
    X_test_cpy = input_missing_values(X_test_cpy, cols_train)
    
    # Columns whose remaining gaps are filled by time-based interpolation
    col_list = ['NWP2_U','NWP2_V','NWP3_U','NWP3_V','NWP3_T']

    # method='time' needs the timestamps as index. The interpolated columns are
    # assigned back explicitly: interpolate(inplace=True) on `df[col]` operates on
    # a temporary Series and is not guaranteed to write through to the frame.
    X_train_cpy = X_train_cpy.set_index('Time')
    X_train_cpy[col_list] = X_train_cpy[col_list].interpolate(
        method='time',
        limit=2,
        limit_direction='both'
    )
    # 'Time' goes back to being a regular (first) column
    X_train_cpy = X_train_cpy.reset_index()

    # Same treatment for the test set, but with a much larger gap limit (100 vs 2)
    X_test_cpy = X_test_cpy.set_index('Time')
    X_test_cpy[col_list] = X_test_cpy[col_list].interpolate(
        method='time',
        limit=100,
        limit_direction='both'
    )
    X_test_cpy = X_test_cpy.reset_index()

    # Consolidated features: one value per meteorological variable, averaged over NWPs.
    # NOTE(review): train averages U and V over four NWPs (NWP1-NWP4) while test averages
    # over three (NWP1-NWP3), so the same feature is built differently for train and
    # test — confirm this is intended.
    X_train_cpy['U'] = (X_train_cpy.NWP1_U + X_train_cpy.NWP2_U + X_train_cpy.NWP3_U + X_train_cpy.NWP4_U)/4
    X_train_cpy['V'] = (X_train_cpy.NWP1_V + X_train_cpy.NWP2_V + X_train_cpy.NWP3_V + X_train_cpy.NWP4_V)/4
    X_train_cpy['T'] = (X_train_cpy.NWP1_T + X_train_cpy.NWP3_T)/2
    X_train_cpy['CLCT'] = (X_train_cpy.NWP4_CLCT)
    # reciprocal of the averaged temperature (infinite wherever T == 0)
    X_train_cpy['inv_T'] = 1/(X_train_cpy['T'])
    
    X_test_cpy['U'] = (X_test_cpy.NWP1_U + X_test_cpy.NWP2_U + X_test_cpy.NWP3_U)/3
    X_test_cpy['V'] = (X_test_cpy.NWP1_V + X_test_cpy.NWP2_V + X_test_cpy.NWP3_V)/3
    X_test_cpy['T'] = (X_test_cpy.NWP1_T + X_test_cpy.NWP3_T)/2
    X_test_cpy['CLCT'] = (X_test_cpy.NWP4_CLCT)
    X_test_cpy['inv_T'] = 1/(X_test_cpy['T'])
    
    # keep only the timestamp and the consolidated features
    X_train_cpy = X_train_cpy[['Time','U','V','T','CLCT','inv_T']]
    X_test_cpy = X_test_cpy[['Time','U','V','T','CLCT', 'inv_T']]

    
    ## Data cleaning: handling of missing values in X_ (disabled, kept for reference)
    #only_na = X_test_cpy[~X_test_cpy.ID.isin(X_test_cpy.dropna().ID)]
    #X_test_cpy.dropna(inplace=True)
    #Y_train_cpy.drop(labels=only_na.index, axis=0, inplace=True)
    #X_train_cpy.drop(labels=only_na.index, axis=0, inplace=True)

    # Column selection: the listed columns are dropped, every other column is passed through
    pre_process = ColumnTransformer(remainder = 'passthrough',
                                    transformers = [('drop_columns', 'drop', 
                                                     ['Time','U','V','T'])]
    )

    # Pipeline: derived attributes -> column selection -> standardisation.
    # DerivedAttributesAdder comes from src.functions.data_transformation; the columns it
    # adds are not visible from this cell.
    prepare_data_pipeline = Pipeline(steps=[
        ('attr_adder', dtr.DerivedAttributesAdder(add_time_feat=False, add_cyclic_feat=False)), 
        ('pre_processing', pre_process),
        ('std_scaler', StandardScaler())
    ])

    # fit the pipeline on the training data and transform it
    X_train_pped = prepare_data_pipeline.fit_transform(X_train_cpy)


    # CAPE-based scorer; greater_is_better=False makes the scorer return the negated metric
    cape_scorer = make_scorer(metric.get_cape, greater_is_better=False)

    # Modelling: degree-3 polynomial regression with Lasso (L1) regularisation;
    # alpha is selected by cross-validated grid search
    poly_features = PolynomialFeatures(degree=3, include_bias=False)
    X_train_poly = poly_features.fit_transform(X_train_pped)
    
    param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
    lasso_reg = lm.Lasso()

    grid_search = GridSearchCV(
        lasso_reg, 
        param_grid, 
        cv=7,
        scoring=cape_scorer,
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train_poly, Y_train_cpy)
    # best candidate, refitted by GridSearchCV on the whole training set of this WF
    final_model = grid_search.best_estimator_
    
    # guardamos modelo
    models.append(joblib.dump(final_model, WF + '_lasso'))

    # evaluación sobre el conjunto de test
    X_test_pped = prepare_data_pipeline.transform(X_test_cpy)
    X_test_poly = poly_features.fit_transform(X_test_pped)
    predictions = final_model.predict(X_test_poly)
    
    # generamos matriz de predicciones (ID,Production)
    pred_matrix = np.stack((np.array(ID_test).astype(int), predictions), axis=-1)
    df_pred = pd.DataFrame(data=pred_matrix, columns=['ID','Production'])

    # añadimos las predicciones al dataframe final que contendrá las de todas las WF
    submission_df = submission_df.append(df_pred, ignore_index=True) 
    print('CAPE for {} is {}'.format(WF, -grid_search.best_score_))
    print('Predictions for {} has been added to submission_df'.format(WF))

# generamos fichero csv para el submission
submission_df.to_csv("../../TFM/models/submission_lasso.csv", index=False, sep=",") 

Fitting 7 folds for each of 6 candidates, totalling 42 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 out of  42 | elapsed:   13.9s finished


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

## Prueba 8: Regresión robusta con RANSAC

In [None]:
# Automatización de pre-procesado, modelización, validación y prediccion
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from src.functions import metric
from sklearn.model_selection import cross_val_score
from sklearn.metrics.scorer import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model as lm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RANSACRegressor

# List of wind farms (WF); one model is fitted per farm
WF_lst = X_train['WF'].unique()

# collects, per WF, what joblib.dump returns when the model is saved
models = []

# Data frame that accumulates the predictions (ID, Production) of every WF
submission_df = pd.DataFrame([], columns=['ID','Production']) 

for WF in WF_lst:
    # Select the rows of this WF first and copy only that subset. The global
    # frames stay untouched and the full data set is not duplicated on every
    # iteration.
    X_train_cpy = X_train[X_train['WF'] == WF].copy()
    X_test_cpy = X_test[X_test['WF'] == WF].copy()

    # Observation identifiers
    ID_train = X_train_cpy['ID']
    ID_test = X_test_cpy['ID']

    # Target values of this WF, in the same row order as X_train_cpy.
    # NOTE(review): assumes Y_train is labelled so that label == ID - 1 — confirm.
    Y_train_cpy = Y_train['Production'].loc[ID_train.values - 1]

    # Raw pre-processing: keep a single NWP (NWP1, 00h run, day D-1) and give its
    # columns the short names used by the pipeline. Selecting and renaming in one
    # expression yields a new frame instead of renaming a slice in place.
    nwp_cols = {"Time": "time", "NWP1_00h_D-1_U": "U", "NWP1_00h_D-1_V": "V", "NWP1_00h_D-1_T":"T"}
    X_train_cpy = X_train_cpy[list(nwp_cols)].rename(columns=nwp_cols)
    X_test_cpy = X_test_cpy[list(nwp_cols)].rename(columns=nwp_cols)

    # Data cleaning: drop observations with missing values. The target is filtered
    # with the same positional mask, so features and target stay aligned whatever
    # their index labels are.
    train_complete = X_train_cpy.notna().all(axis=1)
    X_train_cpy = X_train_cpy[train_complete]
    Y_train_cpy = Y_train_cpy[train_complete.values]

    # The test identifiers must be filtered together with the test rows; otherwise
    # ID_test and the predictions have different lengths as soon as one test row
    # contains a missing value.
    # NOTE(review): dropped test rows get no prediction in the submission file.
    test_complete = X_test_cpy.notna().all(axis=1)
    X_test_cpy = X_test_cpy[test_complete]
    ID_test = ID_test[test_complete.values]

    # Column selection: the listed columns are dropped, every other column is passed through.
    # 'w_dir' is not created in this cell; presumably it is added by
    # DerivedAttributesAdder — confirm.
    pre_process = ColumnTransformer(remainder = 'passthrough',
                                    transformers = [('drop_columns', 'drop', ['time','U','V','T','w_dir'])]
    )

    # Pipeline: derived attributes -> column selection -> standardisation
    prepare_data_pipeline = Pipeline(steps=[
        ('attr_adder', dtr.DerivedAttributesAdder(add_time_feat=False)), 
        ('pre_processing', pre_process),
        ('std_scaler', StandardScaler())
    ])

    # fit the pipeline on the training data and transform it
    X_train_pped = prepare_data_pipeline.fit_transform(X_train_cpy)

    # CAPE-based scorer; greater_is_better=False makes the scorer return the negated metric
    cape_scorer = make_scorer(metric.get_cape, greater_is_better=False)

    # Modelling: robust regression with RANSAC on degree-3 polynomial features
    poly_features = PolynomialFeatures(degree=3, include_bias=False)
    X_train_poly = poly_features.fit_transform(X_train_pped)

    # The loss is left at its default, which is the absolute error. The explicit
    # name 'absolute_loss' was renamed in later scikit-learn releases, so spelling
    # it out ties the cell to one library version without changing behaviour.
    # RANSAC samples random subsets: a fixed seed makes the search reproducible.
    ransac = RANSACRegressor(LinearRegression(), random_state=42)
    param_grid = [
        {
            'max_trials': [100, 1000, 10000],
            'min_samples': [10, 30, 50],
        }
    ]

    grid_search_ransac = GridSearchCV(
        ransac,
        param_grid,
        cv=5,
        scoring=cape_scorer
    )

    grid_search_ransac.fit(X_train_poly, Y_train_cpy)
    # best candidate, refitted by GridSearchCV on the whole training set of this WF
    final_model = grid_search_ransac.best_estimator_
    
    # guardamos modelo
    models.append(joblib.dump(final_model, WF + '_ransac'))

    # evaluación sobre el conjunto de test
    X_test_pped = prepare_data_pipeline.transform(X_test_cpy)
    X_test_poly = poly_features.fit_transform(X_test_pped)
    predictions = final_model.predict(X_test_poly)
    
    # generamos matriz de predicciones (ID,Production)
    pred_matrix = np.stack((np.array(ID_test).astype(int), predictions), axis=-1)
    df_pred = pd.DataFrame(data=pred_matrix, columns=['ID','Production'])

    # añadimos las predicciones al dataframe final que contendrá las de todas las WF
    submission_df = submission_df.append(df_pred, ignore_index=True) 
    print('Predictions for {} has been added to submission_df'.format(WF))
    print('CAPE for {} is {}'.format(WF, -grid_search_ransac.best_score_))

# generamos fichero csv para el submission
submission_df.to_csv("./Data/submission_ransac.csv", index=False, sep=",") 

## Prueba 9: MARS

In [38]:
# Automated pre-processing, modelling, validation and prediction.
# One MARS (py-earth) model is tuned and fitted per wind farm (WF); the
# predictions of all WFs are gathered into a single submission CSV.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from src.functions import metric
from sklearn.model_selection import cross_val_score
# make_scorer is public API of sklearn.metrics; the private
# sklearn.metrics.scorer module is deprecated.
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
# sklearn.externals.joblib is a deprecated shim; joblib itself is a hard
# dependency of scikit-learn, so it is imported directly.
import joblib
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model as lm
from pyearth import Earth

# Raw predictor columns (NWP1, 00h run, day D-1) and the short names the
# pre-processing pipeline refers to ('time', 'U', 'V' are dropped later by name).
RAW_FEATURE_COLS = ['Time','NWP1_00h_D-1_U','NWP1_00h_D-1_V','NWP1_00h_D-1_T']
FEATURE_RENAMES = {"Time": "time", "NWP1_00h_D-1_U": "U", "NWP1_00h_D-1_V": "V", "NWP1_00h_D-1_T":"T"}


def _select_nwp_features(frame):
    """Return a NEW frame with only the raw predictor columns, renamed to time/U/V/T.

    Column selection plus a non-inplace rename never touches `frame`, so the
    original X_train / X_test keep their initial format.
    """
    return frame[RAW_FEATURE_COLS].rename(columns=FEATURE_RENAMES)


# List of WFs
WF_lst = X_train['WF'].unique()

# Fitted best estimators, one per WF (each one is also persisted with joblib)
models = []

# Per-WF prediction frames (ID, Production); concatenated once after the loop
# instead of growing a DataFrame with repeated appends.
wf_predictions = []

# CAPE-based scorer. CAPE is an error (lower is better), so sklearn negates it
# internally; that is why best_score_ is sign-flipped when reported below.
cape_scorer = make_scorer(metric.get_cape, greater_is_better=False)

# Modelling: MARS using py-earth. 3 * 2 * 7 = 42 candidates per WF.
param_grid = [{'max_degree': [1,2,3],
               'allow_linear': [False, True],
               'penalty': [0.,1.,2.,3.,4.,5.,6.],
              }]

for WF in WF_lst:
    # Row selection by WF (boolean indexing already returns new objects)
    train_rows = X_train[X_train['WF'] == WF]
    test_rows = X_test[X_test['WF'] == WF]

    # Observation identifiers of the training rows
    ID_train = train_rows['ID']

    # Targets of this WF, in the same order as train_rows.
    # NOTE(review): assumes Y_train is indexed by ID - 1 -- confirm against the loader.
    Y_train_cpy = Y_train['Production'].loc[ID_train.values - 1]

    # Raw predictors restricted to the chosen NWP / run / day
    X_train_cpy = _select_nwp_features(train_rows)
    X_test_cpy = _select_nwp_features(test_rows)

    # Data cleaning: drop rows with missing values. The masks are applied
    # positionally so that X and Y (train) and X and ID (test) always stay aligned.
    train_complete = X_train_cpy.notna().all(axis=1).values
    X_train_cpy = X_train_cpy[train_complete]
    Y_train_cpy = Y_train_cpy[train_complete]

    test_complete = X_test_cpy.notna().all(axis=1).values
    X_test_cpy = X_test_cpy[test_complete]
    # IDs are taken AFTER filtering: one ID per row that actually gets a prediction
    ID_test = test_rows['ID'][test_complete]

    # Pre-processing: drop 'time', 'U' and 'V', pass every other column through
    pre_process = ColumnTransformer(remainder = 'passthrough',
                                    transformers = [('drop_columns', 'drop', ['time','U','V'])]
    )

    # Pipeline: derived attributes -> column drop -> standardisation.
    # What DerivedAttributesAdder adds is defined in src.functions.data_transformation.
    prepare_data_pipeline = Pipeline(steps=[
        ('attr_adder', dtr.DerivedAttributesAdder(add_time_feat=False, add_cyclic_feat=False)),
        ('pre_processing', pre_process),
        ('std_scaler', StandardScaler())
    ])

    # Fit the pipeline on this WF's training data only
    X_train_pped = prepare_data_pipeline.fit_transform(X_train_cpy)

    grid_search = GridSearchCV(
        Earth(),
        param_grid,
        cv=5,
        scoring=cape_scorer,
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train_pped, Y_train_cpy)
    final_model = grid_search.best_estimator_

    # Persist the model and keep the estimator itself in memory.
    # joblib.dump returns the list of written file names, not the model.
    joblib.dump(final_model, WF + '_mars2')
    models.append(final_model)

    # Prediction on the test set, reusing the already-fitted pipeline
    X_test_pped = prepare_data_pipeline.transform(X_test_cpy)
    predictions = final_model.predict(X_test_pped)

    # (ID, Production) frame. Built column-wise so ID keeps its integer dtype;
    # stacking both arrays into one matrix would upcast the IDs to float.
    df_pred = pd.DataFrame(
        {'ID': ID_test.values.astype(int), 'Production': predictions},
        columns=['ID','Production']
    )
    wf_predictions.append(df_pred)

    print('CAPE for {} is {}'.format(WF, -grid_search.best_score_))
    print('Predictions for {} has been added to submission_df'.format(WF))

# Data frame with the predictions (ID, Production) of all WFs
if wf_predictions:
    submission_df = pd.concat(wf_predictions, ignore_index=True)
else:
    submission_df = pd.DataFrame([], columns=['ID','Production'])

# Write the csv file for the submission
submission_df.to_csv("./Data/submission_mars2.csv", index=False, sep=",")

Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   60.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 210 out of 210 | elapsed: 15.0min finished

`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


Passing attributes to check_is_fitted is deprecated and will be removed in 0.23. The attributes argument is ignored.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


Passing attributes to check_is_fitted is deprecated and will be removed in 0.23. The attributes argument is ignored.



CAPE for WF1 is 54.45958588792473
Predictions for WF1 has been added to submission_df
Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   27.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 210 out of 210 | elapsed:  5.0min finished

`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


Passing attributes to check_is_fitted is deprecated and will be removed in 0.23. The attributes argument is ignored.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


Passing attributes to check_is_fitted is deprecated and will be removed in 0.23. The attributes argument is ignored.



CAPE for WF2 is 43.96392106085858
Predictions for WF2 has been added to submission_df
Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   58.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 12.0min
[Parallel(n_jobs=-1)]: Done 210 out of 210 | elapsed: 14.5min finished

`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


Passing attributes to check_is_fitted is deprecated and will be removed in 0.23. The attributes argument is ignored.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


Passing attributes to check_is_fitted is deprecated and will be removed in 0.23. The attributes argument is ignored.



CAPE for WF3 is 44.344418509673
Predictions for WF3 has been added to submission_df
Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   44.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 210 out of 210 | elapsed:  6.8min finished

`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


Passing attributes to check_is_fitted is deprecated and will be removed in 0.23. The attributes argument is ignored.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


Passing attributes to check_is_fitted is deprecated and will be removed in 0.23. The attributes argument is ignored.



CAPE for WF4 is 40.70388633516939
Predictions for WF4 has been added to submission_df
Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   40.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done 210 out of 210 | elapsed: 13.4min finished

`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


Passing attributes to check_is_fitted is deprecated and will be removed in 0.23. The attributes argument is ignored.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


Passing attributes to check_is_fitted is deprecated and will be removed in 0.23. The attributes argument is ignored.



CAPE for WF5 is 53.24633113853439
Predictions for WF5 has been added to submission_df
Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 210 out of 210 | elapsed:  1.4min finished

`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


Passing attributes to check_is_fitted is deprecated and will be removed in 0.23. The attributes argument is ignored.


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.


Passing attributes to check_is_fitted is deprecated and will be removed in 0.23. The attributes argument is ignored.



CAPE for WF6 is 41.21084026442138
Predictions for WF6 has been added to submission_df
