# LightGBM

In [1]:
pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.3.5-py3-none-win_amd64.whl (1.0 MB)
     ---------------------------------------- 1.0/1.0 MB 3.4 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.5
Note: you may need to restart the kernel to use updated packages.


In [5]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
import time

In [6]:
# Load the source table from a parquet file located in the working directory.
ruta_datos = 'dataframe_final_1970-2023_alicante_alicante.parquet'
tiempo = pd.read_parquet(ruta_datos)

In [7]:
# 70% of the time span is used for training, 30% for validation (through 31 December 2022);
# the year 2023 will be taken as the test set.
# NOTE(review): despite its name, `fecha_test` is used further down as the last
# date of the training split; every later row goes into `test` with no upper
# bound, so that frame also contains 2023 — confirm this matches the plan above.

fecha_test = '2007-12-31'

In [8]:
# Feature columns for the models: every column of `tiempo` except the next-day
# ("_mañana") variables and the location/identifier columns (only Alicante is
# used, so the latter carry no information).
columnas_excluidas = {
    'tmed_mañana', 'prec_mañana', 'tmin_mañana', 'horatmin_mañana', 'tmax_mañana',
    'horatmax_mañana', 'dir_mañana', 'velmedia_mañana', 'racha_mañana', 'horaracha_mañana',
    'sol_mañana', 'presMax_mañana', 'horaPresMax_mañana', 'presMin_mañana', 'horaPresMin_mañana',
    'indicativo', 'nombre', 'provincia', 'altitud', 'fecha',
}

# Filter in the frame's own column order. Building the list from a set
# difference made the feature order depend on string hashing, which can change
# between kernel restarts and makes model runs harder to reproduce.
model_columns = [columna for columna in tiempo.columns if columna not in columnas_excluidas]

In [9]:
def _hora_redondeada(valor):
    """Parse `valor` as an '%H:%M:%S' string and return its hour after rounding
    to the nearest hour; `None` becomes NaN.

    Late-evening times that round up past midnight yield hour 0.
    """
    if valor is None:
        return np.nan
    return pd.to_datetime(str(valor), format='%H:%M:%S').round('H').hour


def _hora_directa(valor):
    """Return `valor.hour` (no rounding); `None` becomes NaN.

    Presumably the values are time-like objects exposing `.hour` — TODO confirm.
    """
    if valor is None:
        return np.nan
    return valor.hour


# Each base column exists for the current day (no suffix), for three
# additional suffixed variants (_1, _2, _3) and for the next day (_mañana).
_sufijos_hora = ('', '_1', '_2', '_3', '_mañana')

# NOTE(review): this cell is not idempotent — it appears it would fail if
# re-run on columns that already hold numeric hours.
for _base in ('horatmin', 'horatmax', 'horaracha'):
    for _sufijo in _sufijos_hora:
        _columna = _base + _sufijo
        tiempo[_columna] = tiempo[_columna].apply(_hora_redondeada)

for _base in ('horaPresMax', 'horaPresMin'):
    for _sufijo in _sufijos_hora:
        _columna = _base + _sufijo
        tiempo[_columna] = tiempo[_columna].apply(_hora_directa)

In [10]:
# Split the `fecha` timestamp into year / month / day columns.
# NOTE(review): `model_columns` is built in an earlier cell, before these
# columns exist, so they are not among the model features — confirm intended.
_partes_fecha = tiempo['fecha'].dt
for _nombre, _atributo in (('año', 'year'), ('mes', 'month'), ('dia', 'day')):
    tiempo[_nombre] = getattr(_partes_fecha, _atributo)

In [11]:
# Chronological split at `fecha_test`: rows up to and including that date form
# `train`, strictly later rows form `test`. Missing values are replaced by -1
# in both frames.
# NOTE(review): the fill is applied to every column, target columns included,
# so rows with a missing target are kept with the value -1 — confirm intended.
_hasta_corte = tiempo['fecha'] <= fecha_test
_tras_corte = tiempo['fecha'] > fecha_test

train = tiempo.loc[_hasta_corte].fillna(-1)
test = tiempo.loc[_tras_corte].fillna(-1)

In [13]:
# Cast every feature column to float in both splits; columns outside
# `model_columns` keep their current dtype.
_tipos_features = {columna: float for columna in model_columns}
train = train.astype(_tipos_features)
test = test.astype(_tipos_features)

## Target: tmax_mañana

In [15]:
# The target is excluded from `model_columns`, so the feature cast did not
# touch it; convert it to float in the training frame here.
train['tmax_mañana'] = train['tmax_mañana'].astype(float)

In [19]:
# Exhaustive grid search over LightGBM hyperparameters for the target column
# `tmax_mañana`. One model is fitted per combination; train/test MAE and the
# wall-clock time of fit + predict + scoring are recorded.
objetivo = 'tmax_mañana'
rejilla = [
    (n_estimators, max_depth, eta)
    for n_estimators in [5, 10, 20, 50, 100, 200, 500]
    for max_depth in [5, 10, 20, 40]
    for eta in [0.01, 0.05, 0.1]
]

metrics = {}
for n_estimators, max_depth, eta in rejilla:
    start_time = time.time()
    model = LGBMRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=eta)
    model.fit(train[model_columns], train[objetivo])

    train_pred = model.predict(train[model_columns])
    test_pred = model.predict(test[model_columns])
    train_mae = mean_absolute_error(train[objetivo], train_pred)
    test_mae = mean_absolute_error(test[objetivo], test_pred)

    # Key format: LGBM_<n_estimators>_<max_depth>_<learning_rate>
    metrics[f'LGBM_{n_estimators}_{max_depth}_{eta}'] = {
        'Train_MAE': train_mae,
        'Test_MAE': test_mae,
        'Run_Time': time.time() - start_time,
    }

# One row per configuration; `delta` is the gap between test and train error.
metrics_LGBM = pd.DataFrame.from_dict(metrics, orient='index', columns=['Run_Time', 'Train_MAE', 'Test_MAE'])
metrics_LGBM['delta'] = metrics_LGBM['Test_MAE'] - metrics_LGBM['Train_MAE']
metrics_LGBM

Unnamed: 0,Run_Time,Train_MAE,Test_MAE,delta
LGBM_5_5_0.01,0.101726,4.584386,4.778143,0.193757
LGBM_5_5_0.05,0.084774,3.844622,4.018452,0.173830
LGBM_5_5_0.1,0.080784,3.108704,3.269540,0.160836
LGBM_5_10_0.01,0.088762,4.584387,4.777846,0.193459
LGBM_5_10_0.05,0.090757,3.844185,4.017968,0.173784
...,...,...,...,...
LGBM_500_20_0.05,1.335430,0.940391,1.607047,0.666656
LGBM_500_20_0.1,1.256640,0.656619,1.628360,0.971741
LGBM_500_40_0.01,1.697462,1.321267,1.586812,0.265545
LGBM_500_40_0.05,1.356373,0.940391,1.607047,0.666656


In [20]:
# Persist the tmax metrics table as a parquet file in the working directory.
ruta_metricas = 'lgbm_tmax.parquet'
metrics_LGBM.to_parquet(ruta_metricas)

## Target: tmin_mañana

In [22]:
# The target is excluded from `model_columns`, so the feature cast did not
# touch it; convert it to float in the training frame here.
train['tmin_mañana'] = train['tmin_mañana'].astype(float)

In [23]:
# Exhaustive grid search over LightGBM hyperparameters for the target column
# `tmin_mañana`. One model is fitted per combination; train/test MAE and the
# wall-clock time of fit + predict + scoring are recorded.
objetivo = 'tmin_mañana'
rejilla = [
    (n_estimators, max_depth, eta)
    for n_estimators in [5, 10, 20, 50, 100, 200, 500]
    for max_depth in [5, 10, 20, 40]
    for eta in [0.01, 0.05, 0.1]
]

metrics = {}
for n_estimators, max_depth, eta in rejilla:
    start_time = time.time()
    model = LGBMRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=eta)
    model.fit(train[model_columns], train[objetivo])

    train_pred = model.predict(train[model_columns])
    test_pred = model.predict(test[model_columns])
    train_mae = mean_absolute_error(train[objetivo], train_pred)
    test_mae = mean_absolute_error(test[objetivo], test_pred)

    # Key format: LGBM_<n_estimators>_<max_depth>_<learning_rate>
    metrics[f'LGBM_{n_estimators}_{max_depth}_{eta}'] = {
        'Train_MAE': train_mae,
        'Test_MAE': test_mae,
        'Run_Time': time.time() - start_time,
    }

# One row per configuration; `delta` is the gap between test and train error.
metrics_LGBM = pd.DataFrame.from_dict(metrics, orient='index', columns=['Run_Time', 'Train_MAE', 'Test_MAE'])
metrics_LGBM['delta'] = metrics_LGBM['Test_MAE'] - metrics_LGBM['Train_MAE']
metrics_LGBM

Unnamed: 0,Run_Time,Train_MAE,Test_MAE,delta
LGBM_5_5_0.01,0.102693,4.659281,4.951013,0.291731
LGBM_5_5_0.05,0.087765,3.874162,4.126978,0.252816
LGBM_5_5_0.1,0.089760,3.084823,3.298051,0.213228
LGBM_5_10_0.01,0.087766,4.659390,4.951397,0.292006
LGBM_5_10_0.05,0.085770,3.874295,4.129529,0.255233
...,...,...,...,...
LGBM_500_20_0.05,1.350390,0.796213,1.226377,0.430165
LGBM_500_20_0.1,1.292545,0.551985,1.243489,0.691504
LGBM_500_40_0.01,2.291871,1.120315,1.244967,0.124653
LGBM_500_40_0.05,1.474060,0.796213,1.226377,0.430165


In [24]:
# Persist the tmin metrics table as a parquet file in the working directory.
ruta_metricas = 'lgbm_tmin.parquet'
metrics_LGBM.to_parquet(ruta_metricas)

## Target: prec_mañana

In [25]:
# The target is excluded from `model_columns`, so the feature cast did not
# touch it; convert it to float in the training frame here.
train['prec_mañana'] = train['prec_mañana'].astype(float)

In [31]:
# Exhaustive grid search over LightGBM hyperparameters for the target column
# `prec_mañana`. One model is fitted per combination; train/test MAE and the
# wall-clock time of fit + predict + scoring are recorded.
objetivo = 'prec_mañana'
rejilla = [
    (n_estimators, max_depth, eta)
    for n_estimators in [5, 10, 20, 50, 100, 200, 500]
    for max_depth in [5, 10, 20, 40]
    for eta in [0.01, 0.05, 0.1]
]

metrics = {}
for n_estimators, max_depth, eta in rejilla:
    start_time = time.time()
    model = LGBMRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=eta)
    model.fit(train[model_columns], train[objetivo])

    train_pred = model.predict(train[model_columns])
    test_pred = model.predict(test[model_columns])
    train_mae = mean_absolute_error(train[objetivo], train_pred)
    test_mae = mean_absolute_error(test[objetivo], test_pred)

    # Key format: LGBM_<n_estimators>_<max_depth>_<learning_rate>
    metrics[f'LGBM_{n_estimators}_{max_depth}_{eta}'] = {
        'Train_MAE': train_mae,
        'Test_MAE': test_mae,
        'Run_Time': time.time() - start_time,
    }

# One row per configuration; `delta` is the gap between test and train error.
metrics_LGBM = pd.DataFrame.from_dict(metrics, orient='index', columns=['Run_Time', 'Train_MAE', 'Test_MAE'])
metrics_LGBM['delta'] = metrics_LGBM['Test_MAE'] - metrics_LGBM['Train_MAE']
metrics_LGBM

Unnamed: 0,Run_Time,Train_MAE,Test_MAE,delta
LGBM_5_5_0.01,0.090757,1.553401,1.498560,-0.054841
LGBM_5_5_0.05,0.078789,1.517614,1.462242,-0.055372
LGBM_5_5_0.1,0.076795,1.483438,1.429151,-0.054287
LGBM_5_10_0.01,0.081782,1.551187,1.499526,-0.051661
LGBM_5_10_0.05,0.084772,1.505642,1.465889,-0.039753
...,...,...,...,...
LGBM_500_20_0.05,1.420203,0.881201,1.551412,0.670210
LGBM_500_20_0.1,1.363356,0.691169,1.707580,1.016411
LGBM_500_40_0.01,1.476053,1.145396,1.363916,0.218520
LGBM_500_40_0.05,1.402251,0.882876,1.551864,0.668988


In [33]:
# Persist the prec metrics table as a parquet file in the working directory.
ruta_metricas = 'lgbm_prec.parquet'
metrics_LGBM.to_parquet(ruta_metricas)

## Target: sol_mañana

In [34]:
# The target is excluded from `model_columns`, so the feature cast did not
# touch it; convert it to float in the training frame here.
train['sol_mañana'] = train['sol_mañana'].astype(float)

In [35]:
# Exhaustive grid search over LightGBM hyperparameters for the target column
# `sol_mañana`. One model is fitted per combination; train/test MAE and the
# wall-clock time of fit + predict + scoring are recorded.
objetivo = 'sol_mañana'
rejilla = [
    (n_estimators, max_depth, eta)
    for n_estimators in [5, 10, 20, 50, 100, 200, 500]
    for max_depth in [5, 10, 20, 40]
    for eta in [0.01, 0.05, 0.1]
]

metrics = {}
for n_estimators, max_depth, eta in rejilla:
    start_time = time.time()
    model = LGBMRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=eta)
    model.fit(train[model_columns], train[objetivo])

    train_pred = model.predict(train[model_columns])
    test_pred = model.predict(test[model_columns])
    train_mae = mean_absolute_error(train[objetivo], train_pred)
    test_mae = mean_absolute_error(test[objetivo], test_pred)

    # Key format: LGBM_<n_estimators>_<max_depth>_<learning_rate>
    metrics[f'LGBM_{n_estimators}_{max_depth}_{eta}'] = {
        'Train_MAE': train_mae,
        'Test_MAE': test_mae,
        'Run_Time': time.time() - start_time,
    }

# One row per configuration; `delta` is the gap between test and train error.
metrics_LGBM = pd.DataFrame.from_dict(metrics, orient='index', columns=['Run_Time', 'Train_MAE', 'Test_MAE'])
metrics_LGBM['delta'] = metrics_LGBM['Test_MAE'] - metrics_LGBM['Train_MAE']
metrics_LGBM

Unnamed: 0,Run_Time,Train_MAE,Test_MAE,delta
LGBM_5_5_0.01,0.091722,2.828194,3.077527,0.249333
LGBM_5_5_0.05,0.082777,2.675697,2.903704,0.228007
LGBM_5_5_0.1,0.083776,2.530951,2.740759,0.209808
LGBM_5_10_0.01,0.087765,2.825981,3.076162,0.250181
LGBM_5_10_0.05,0.093750,2.665609,2.901851,0.236242
...,...,...,...,...
LGBM_500_20_0.05,2.592070,1.375978,2.205322,0.829345
LGBM_500_20_0.1,2.295864,0.953584,2.221906,1.268322
LGBM_500_40_0.01,2.669860,1.941746,2.232943,0.291196
LGBM_500_40_0.05,2.054507,1.375978,2.205322,0.829345


In [36]:
# Persist the sol metrics table as a parquet file in the working directory.
ruta_metricas = 'lgbm_sol.parquet'
metrics_LGBM.to_parquet(ruta_metricas)