# RERF

In [1]:
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import time

In [2]:
tiempo = pd.read_parquet('dataframe_final_1970-2023_alicante_alicante.parquet')

In [3]:
# 70% del tiempo para el train, 30% validacion (hasta 31 diciembre 2022)
# el conjunto de test cogeremos 2023

fecha_test = '2007-12-31'

In [4]:
# las variables del día siguiente las eliminamos y también las correspondientes a la localización ya que estamos solo en Alicante

model_columns = list(set(tiempo.columns) - set(['tmed_mañana', 'prec_mañana', 'tmin_mañana', 'horatmin_mañana', 'tmax_mañana',
                                                                                    'horatmax_mañana', 'dir_mañana', 'velmedia_mañana', 'racha_mañana', 'horaracha_mañana', 
                                                                                    'sol_mañana', 'presMax_mañana', 'horaPresMax_mañana', 'presMin_mañana', 'horaPresMin_mañana',
                                                                                    'indicativo', 'nombre', 'provincia', 'altitud', 'fecha']))

In [5]:
tiempo.horatmin = tiempo.horatmin.apply(lambda x: pd.to_datetime(str(x), format='%H:%M:%S').round('H').hour if x is not None else np.nan)
tiempo.horatmax = tiempo.horatmax.apply(lambda x: pd.to_datetime(str(x), format='%H:%M:%S').round('H').hour if x is not None else np.nan)
tiempo.horaracha = tiempo.horaracha.apply(lambda x: pd.to_datetime(str(x), format='%H:%M:%S').round('H').hour if x is not None else np.nan)
tiempo.horaPresMax = tiempo.horaPresMax.apply(lambda x: x.hour if x is not None else np.nan)
tiempo.horaPresMin = tiempo.horaPresMin.apply(lambda x: x.hour if x is not None else np.nan)

tiempo.horatmin_1 = tiempo.horatmin_1.apply(lambda x: pd.to_datetime(str(x), format='%H:%M:%S').round('H').hour if x is not None else np.nan)
tiempo.horatmin_2 = tiempo.horatmin_2.apply(lambda x: pd.to_datetime(str(x), format='%H:%M:%S').round('H').hour if x is not None else np.nan)
tiempo.horatmin_3 = tiempo.horatmin_3.apply(lambda x: pd.to_datetime(str(x), format='%H:%M:%S').round('H').hour if x is not None else np.nan)
tiempo.horatmin_mañana = tiempo.horatmin_mañana.apply(lambda x: pd.to_datetime(str(x), format='%H:%M:%S').round('H').hour if x is not None else np.nan)

tiempo.horatmax_1 = tiempo.horatmax_1.apply(lambda x: pd.to_datetime(str(x), format='%H:%M:%S').round('H').hour if x is not None else np.nan)
tiempo.horatmax_2 = tiempo.horatmax_2.apply(lambda x: pd.to_datetime(str(x), format='%H:%M:%S').round('H').hour if x is not None else np.nan)
tiempo.horatmax_3 = tiempo.horatmax_3.apply(lambda x: pd.to_datetime(str(x), format='%H:%M:%S').round('H').hour if x is not None else np.nan)
tiempo.horatmax_mañana = tiempo.horatmax_mañana.apply(lambda x: pd.to_datetime(str(x), format='%H:%M:%S').round('H').hour if x is not None else np.nan)

tiempo.horaracha_1 = tiempo.horaracha_1.apply(lambda x: pd.to_datetime(str(x), format='%H:%M:%S').round('H').hour if x is not None else np.nan)
tiempo.horaracha_2 = tiempo.horaracha_2.apply(lambda x: pd.to_datetime(str(x), format='%H:%M:%S').round('H').hour if x is not None else np.nan)
tiempo.horaracha_3 = tiempo.horaracha_3.apply(lambda x: pd.to_datetime(str(x), format='%H:%M:%S').round('H').hour if x is not None else np.nan)
tiempo.horaracha_mañana = tiempo.horaracha_mañana.apply(lambda x: pd.to_datetime(str(x), format='%H:%M:%S').round('H').hour if x is not None else np.nan)

tiempo.horaPresMax_1 = tiempo.horaPresMax_1.apply(lambda x: x.hour if x is not None else np.nan)
tiempo.horaPresMax_2 = tiempo.horaPresMax_2.apply(lambda x: x.hour if x is not None else np.nan)
tiempo.horaPresMax_3 = tiempo.horaPresMax_3.apply(lambda x: x.hour if x is not None else np.nan)
tiempo.horaPresMax_mañana = tiempo.horaPresMax_mañana.apply(lambda x: x.hour if x is not None else np.nan)

tiempo.horaPresMin_1 = tiempo.horaPresMin_1.apply(lambda x: x.hour if x is not None else np.nan)
tiempo.horaPresMin_2 = tiempo.horaPresMin_2.apply(lambda x: x.hour if x is not None else np.nan)
tiempo.horaPresMin_3 = tiempo.horaPresMin_3.apply(lambda x: x.hour if x is not None else np.nan)
tiempo.horaPresMin_mañana = tiempo.horaPresMin_mañana.apply(lambda x: x.hour if x is not None else np.nan)

In [6]:
tiempo['año'] = tiempo['fecha'].dt.year
tiempo['mes'] = tiempo['fecha'].dt.month
tiempo['dia'] = tiempo['fecha'].dt.day

In [7]:
train = tiempo[tiempo.fecha <= fecha_test].fillna(-1)
test = tiempo[tiempo.fecha > fecha_test].fillna(-1)

## Target: tmax_mañana

In [8]:
train.tmax_mañana = train.tmax_mañana.astype(float)
test.tmax_mañana = test.tmax_mañana.astype(float)

In [9]:
metrics = {}
it = 1

for alpha in [0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 2]:
    start_time1 = time.time()
    lasso = Lasso(alpha=alpha)
    lasso.fit(train[model_columns], train.tmax_mañana);
    
    train_pred_lasso = lasso.predict(train[model_columns])
    test_pred_lasso = lasso.predict(test[model_columns])
    
    train_residuals = train.tmax_mañana - train_pred_lasso
    test_residuals = test.tmax_mañana - test_pred_lasso
    
    time1 = time.time() - start_time1
    
    for n_estimators in [5, 10, 20, 50, 100, 200, 500]:
        for max_depth in [5, 10, 20, 40]:
            for min_samples_leaf in [1, 5, 10]:
                start_time2 = time.time()
                rf = RandomForestRegressor(n_estimators = n_estimators, max_depth = max_depth, max_features = 'sqrt', min_samples_leaf=min_samples_leaf)
                rf.fit(train[model_columns], train_residuals);

                train_pred_rf = rf.predict(train[model_columns])
                test_pred_rf = rf.predict(test[model_columns])
                
                train_values = train_pred_lasso + train_pred_rf
                test_values = test_pred_lasso + test_pred_rf
                
                time2 = time.time() - start_time2
                
                metrics['RERF_'+ str(it)] = {
                    'Train_MAE': mean_absolute_error(train.tmax_mañana, train_values),
                    'Test_MAE': mean_absolute_error(test.tmax_mañana, test_values),
                    'Run_Time': time1 + time2,
                    'alpha': alpha,
                    'n_estimators': n_estimators, 
                    'max_depth': max_depth, 
                    'min_samples_leaf': min_samples_leaf
                }
                
                it += 1

metrics_RERF = pd.DataFrame.from_dict(metrics, orient='index',columns=['Run_Time', 'alpha', 'n_estimators', 'max_depth', 'min_samples_leaf', 
                                                                                                                                   'Train_MAE', 'Test_MAE',])

metrics_RERF['delta'] = metrics_RERF.Test_MAE - metrics_RERF.Train_MAE
metrics_RERF


Unnamed: 0,Run_Time,alpha,n_estimators,max_depth,min_samples_leaf,Train_MAE,Test_MAE,delta
RERF_1,0.983344,0.25,5,5,1,1.511506,1.687469,0.175964
RERF_2,0.980352,0.25,5,5,5,1.499663,1.696789,0.197126
RERF_3,0.977360,0.25,5,5,10,1.502819,1.683999,0.181180
RERF_4,1.048171,0.25,5,10,1,1.365839,1.710476,0.344637
RERF_5,1.036203,0.25,5,10,5,1.371571,1.701650,0.330079
...,...,...,...,...,...,...,...,...
RERF_584,20.680651,2.00,500,20,5,0.991513,1.600257,0.608744
RERF_585,17.487232,2.00,500,20,10,1.183412,1.602549,0.419137
RERF_586,31.025930,2.00,500,40,1,0.544179,1.600542,1.056364
RERF_587,20.762433,2.00,500,40,5,0.959810,1.600017,0.640207


In [10]:
metrics_RERF.to_parquet('rerf_tmax.parquet')

## Target: tmin_mañana

In [11]:
train.tmin_mañana = train.tmin_mañana.astype(float)
test.tmin_mañana = test.tmin_mañana.astype(float)

In [12]:
metrics = {}
it = 1

for alpha in [0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 2]:
    start_time1 = time.time()
    lasso = Lasso(alpha=alpha)
    lasso.fit(train[model_columns], train.tmin_mañana);
    
    train_pred_lasso = lasso.predict(train[model_columns])
    test_pred_lasso = lasso.predict(test[model_columns])
    
    train_residuals = train.tmin_mañana - train_pred_lasso
    test_residuals = test.tmin_mañana - test_pred_lasso
    
    time1 = time.time() - start_time1
    
    for n_estimators in [5, 10, 20, 50, 100, 200, 500]:
        for max_depth in [5, 10, 20, 40]:
            for min_samples_leaf in [1, 5, 10]:
                start_time2 = time.time()
                rf = RandomForestRegressor(n_estimators = n_estimators, max_depth = max_depth, max_features = 'sqrt', min_samples_leaf=min_samples_leaf)
                rf.fit(train[model_columns], train_residuals);

                train_pred_rf = rf.predict(train[model_columns])
                test_pred_rf = rf.predict(test[model_columns])
                
                train_values = train_pred_lasso + train_pred_rf
                test_values = test_pred_lasso + test_pred_rf
                
                time2 = time.time() - start_time2
                
                metrics['RERF_'+ str(it)] = {
                    'Train_MAE': mean_absolute_error(train.tmin_mañana, train_values),
                    'Test_MAE': mean_absolute_error(test.tmin_mañana, test_values),
                    'Run_Time': time1 + time2,
                    'alpha': alpha,
                    'n_estimators': n_estimators, 
                    'max_depth': max_depth, 
                    'min_samples_leaf': min_samples_leaf
                }
                
                it += 1

metrics_RERF = pd.DataFrame.from_dict(metrics, orient='index',columns=['Run_Time', 'alpha', 'n_estimators', 'max_depth', 'min_samples_leaf', 
                                                                                                                                   'Train_MAE', 'Test_MAE',])

metrics_RERF['delta'] = metrics_RERF.Test_MAE - metrics_RERF.Train_MAE
metrics_RERF


Unnamed: 0,Run_Time,alpha,n_estimators,max_depth,min_samples_leaf,Train_MAE,Test_MAE,delta
RERF_1,0.743011,0.25,5,5,1,1.299439,1.397221,0.097782
RERF_2,0.744979,0.25,5,5,5,1.294583,1.384720,0.090137
RERF_3,0.747002,0.25,5,5,10,1.298641,1.393208,0.094567
RERF_4,0.803874,0.25,5,10,1,1.168097,1.388234,0.220137
RERF_5,0.814792,0.25,5,10,5,1.177753,1.371188,0.193435
...,...,...,...,...,...,...,...,...
RERF_584,19.876832,2.00,500,20,5,0.838792,1.240277,0.401485
RERF_585,17.342644,2.00,500,20,10,1.004982,1.243360,0.238378
RERF_586,30.170276,2.00,500,40,1,0.459732,1.236395,0.776664
RERF_587,20.419384,2.00,500,40,5,0.814184,1.239263,0.425079


In [13]:
metrics_RERF.to_parquet('rerf_tmin.parquet')

## Target: prec_mañana

In [14]:
train.prec_mañana = train.prec_mañana.astype(float)
test.prec_mañana = test.prec_mañana.astype(float)

In [15]:
metrics = {}
it = 1

for alpha in [0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 2]:
    start_time1 = time.time()
    lasso = Lasso(alpha=alpha)
    lasso.fit(train[model_columns], train.prec_mañana);
    
    train_pred_lasso = lasso.predict(train[model_columns])
    test_pred_lasso = lasso.predict(test[model_columns])
    
    train_residuals = train.prec_mañana - train_pred_lasso
    test_residuals = test.prec_mañana - test_pred_lasso
    
    time1 = time.time() - start_time1
    
    for n_estimators in [5, 10, 20, 50, 100, 200, 500]:
        for max_depth in [5, 10, 20, 40]:
            for min_samples_leaf in [1, 5, 10]:
                start_time2 = time.time()
                rf = RandomForestRegressor(n_estimators = n_estimators, max_depth = max_depth, max_features = 'sqrt', min_samples_leaf=min_samples_leaf)
                rf.fit(train[model_columns], train_residuals);

                train_pred_rf = rf.predict(train[model_columns])
                test_pred_rf = rf.predict(test[model_columns])
                
                train_values = train_pred_lasso + train_pred_rf
                test_values = test_pred_lasso + test_pred_rf
                
                time2 = time.time() - start_time2
                
                metrics['RERF_'+ str(it)] = {
                    'Train_MAE': mean_absolute_error(train.prec_mañana, train_values),
                    'Test_MAE': mean_absolute_error(test.prec_mañana, test_values),
                    'Run_Time': time1 + time2,
                    'alpha': alpha,
                    'n_estimators': n_estimators, 
                    'max_depth': max_depth, 
                    'min_samples_leaf': min_samples_leaf
                }
                
                it += 1

metrics_RERF = pd.DataFrame.from_dict(metrics, orient='index',columns=['Run_Time', 'alpha', 'n_estimators', 'max_depth', 'min_samples_leaf', 
                                                                                                                                   'Train_MAE', 'Test_MAE',])

metrics_RERF['delta'] = metrics_RERF.Test_MAE - metrics_RERF.Train_MAE
metrics_RERF


Unnamed: 0,Run_Time,alpha,n_estimators,max_depth,min_samples_leaf,Train_MAE,Test_MAE,delta
RERF_1,0.694144,0.25,5,5,1,1.390887,1.542892,0.152005
RERF_2,0.704117,0.25,5,5,5,1.428612,1.480242,0.051630
RERF_3,0.686166,0.25,5,5,10,1.472787,1.492630,0.019844
RERF_4,0.746004,0.25,5,10,1,1.233588,1.747843,0.514255
RERF_5,0.745007,0.25,5,10,5,1.342433,1.589773,0.247340
...,...,...,...,...,...,...,...,...
RERF_584,20.693904,2.00,500,20,5,1.132918,1.449850,0.316932
RERF_585,18.233397,2.00,500,20,10,1.246521,1.401765,0.155244
RERF_586,28.020469,2.00,500,40,1,0.587250,1.674027,1.086777
RERF_587,21.965483,2.00,500,40,5,1.124149,1.450950,0.326800


In [16]:
metrics_RERF.to_parquet('rerf_prec.parquet')

## Target: sol_mañana

In [17]:
train.sol_mañana = train.sol_mañana.astype(float)
test.sol_mañana = test.sol_mañana.astype(float)

In [18]:
metrics = {}
it = 1

for alpha in [0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 2]:
    start_time1 = time.time()
    lasso = Lasso(alpha=alpha)
    lasso.fit(train[model_columns], train.sol_mañana);
    
    train_pred_lasso = lasso.predict(train[model_columns])
    test_pred_lasso = lasso.predict(test[model_columns])
    
    train_residuals = train.sol_mañana - train_pred_lasso
    test_residuals = test.sol_mañana - test_pred_lasso
    
    time1 = time.time() - start_time1
    
    for n_estimators in [5, 10, 20, 50, 100, 200, 500]:
        for max_depth in [5, 10, 20, 40]:
            for min_samples_leaf in [1, 5, 10]:
                start_time2 = time.time()
                rf = RandomForestRegressor(n_estimators = n_estimators, max_depth = max_depth, max_features = 'sqrt', min_samples_leaf=min_samples_leaf)
                rf.fit(train[model_columns], train_residuals);

                train_pred_rf = rf.predict(train[model_columns])
                test_pred_rf = rf.predict(test[model_columns])
                
                train_values = train_pred_lasso + train_pred_rf
                test_values = test_pred_lasso + test_pred_rf
                
                time2 = time.time() - start_time2
                
                metrics['RERF_'+ str(it)] = {
                    'Train_MAE': mean_absolute_error(train.sol_mañana, train_values),
                    'Test_MAE': mean_absolute_error(test.sol_mañana, test_values),
                    'Run_Time': time1 + time2,
                    'alpha': alpha,
                    'n_estimators': n_estimators, 
                    'max_depth': max_depth, 
                    'min_samples_leaf': min_samples_leaf
                }
                
                it += 1

metrics_RERF = pd.DataFrame.from_dict(metrics, orient='index',columns=['Run_Time', 'alpha', 'n_estimators', 'max_depth', 'min_samples_leaf', 
                                                                                                                                   'Train_MAE', 'Test_MAE',])

metrics_RERF['delta'] = metrics_RERF.Test_MAE - metrics_RERF.Train_MAE
metrics_RERF


Unnamed: 0,Run_Time,alpha,n_estimators,max_depth,min_samples_leaf,Train_MAE,Test_MAE,delta
RERF_1,0.629282,0.25,5,5,1,2.221058,2.348357,0.127299
RERF_2,0.641250,0.25,5,5,5,2.225923,2.351519,0.125596
RERF_3,0.622332,0.25,5,5,10,2.236047,2.351504,0.115457
RERF_4,0.683107,0.25,5,10,1,1.986661,2.379311,0.392651
RERF_5,0.683159,0.25,5,10,5,2.042551,2.356897,0.314346
...,...,...,...,...,...,...,...,...
RERF_584,19.069012,2.00,500,20,5,1.501931,2.308478,0.806547
RERF_585,16.651508,2.00,500,20,10,1.780580,2.305062,0.524482
RERF_586,29.904010,2.00,500,40,1,0.825498,2.373641,1.548143
RERF_587,19.871865,2.00,500,40,5,1.461053,2.309763,0.848709


In [19]:
metrics_RERF.to_parquet('rerf_sol.parquet')