In [None]:
import pandas as pd
import numpy as np
import optuna
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

In [None]:
# Load the interpolated dataset from a Feather file into a DataFrame
# (the file name suggests it covers Jan 1980 - Mar 2020; not verified here).
wd = pd.read_feather('Dados_Jan1980_mar2020_interpolado.feather')

In [None]:
# Display the raw frame as loaded.
wd

In [None]:
# Reorder the rows by location (lat, lon) instead of by year, and add one more
# column to act as the regression target 'y': 'prgpcp' rotated up by one row,
# so every row carries the value of the row that follows it in the new order.
# NOTE(review): np.roll wraps around and does not stop at location boundaries;
# the affected rows are the last row of each location. Presumably those are
# the 2020 rows removed by the later `year < 2020` filter - confirm.
wdc = wd.sort_values(by=['lat', 'lon']).assign(
    prec_GPCP_roll=lambda sorted_frame: np.roll(sorted_frame['prgpcp'], -1)
)

In [None]:
# Display the location-sorted frame with the new 'prec_GPCP_roll' column.
wdc

In [None]:
# Keep only the rows from years before 2020.
# The explicit copy makes `wdc` an independent frame: later cells add columns
# to it, and writing to the bare result of a boolean `.loc` selection triggers
# SettingWithCopyWarning on pandas versions without copy-on-write.
wdc = wdc.loc[wdc['year'] < 2020].copy()

In [None]:
# Display the frame after dropping the rows from 2020 onward.
wdc

In [None]:
class PrecModel:
    """LightGBM regression model whose hyperparameters are tuned with Optuna.

    The model learns the column ``prec_GPCP_roll`` from the contiguous column
    range ``temp850`` .. ``prgpcp`` of ``data``. Features and target are
    min-max normalized, rows are split by year into a train/validation pool
    and a held-out test set, and the pool is split again at random into the
    parts used during tuning and final training.
    """

    def __init__(self, data, split_year, train_size, name, random_state=None):
        """Fit the scalers and build the normalized train / validation / test arrays.

        Parameters
        ----------
        data : pandas.DataFrame
            Must contain a ``year`` column, the feature columns ``temp850``
            through ``prgpcp`` (label slice, both ends included) and the
            target column ``prec_GPCP_roll``.
        split_year : int
            Rows with ``year < split_year`` form the train/validation pool;
            rows with ``year >= split_year`` form the test set.
        train_size : float or int
            Forwarded to ``train_test_split`` for the random split of the pool.
        name : str
            Suffix used in the Optuna study name and in several output files.
        random_state : int or None, optional
            Seed forwarded to ``train_test_split``. The default ``None`` keeps
            the split unseeded, exactly as before this parameter existed.
        """
        self.data = data
        self.split_year = split_year
        self.train_size = train_size
        self.name = name
        self.random_state = random_state

        # Fit normalization to current data.
        # NOTE(review): both scalers are fitted on the full frame, i.e. the
        # test years are included in the min/max estimates.
        self.x_scaler = MinMaxScaler()
        self.x_scaler.fit(data.loc[:, 'temp850':'prgpcp'])

        self.y_scaler = MinMaxScaler()
        self.y_scaler.fit(data.loc[:, 'prec_GPCP_roll'].to_numpy().reshape(-1, 1))

        # Split train and test set by year.
        self.trainval_data = data.loc[data['year'] < split_year]
        self.test_data = data.loc[data['year'] >= split_year]

        # Normalize training data.
        self.X = self.x_scaler.transform(self.trainval_data.loc[:, 'temp850':'prgpcp'].values)
        self.y = self.y_scaler.transform(self.trainval_data.loc[:, 'prec_GPCP_roll'].values.reshape(-1, 1))

        # Normalize test data.
        self.test_data_input = self.test_data.loc[:, 'temp850':'prgpcp']
        self.test_data_x = self.x_scaler.transform(self.test_data_input)
        self.test_data_output = self.test_data.loc[:, 'prec_GPCP_roll']
        self.test_data_y = self.y_scaler.transform(self.test_data_output.values.reshape(-1, 1))

        # Random split of the pre-split-year pool. Despite the names,
        # X_test / y_test are the validation part of that pool, not the
        # held-out years stored in test_data_x / test_data_y.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y,
            train_size=self.train_size,
            shuffle=True,
            random_state=random_state,
        )

    def _objective(self, trial):
        """Optuna objective: mean cross-validated MSE for one parameter draw.

        Once the trial number reaches 100 the study is asked to stop and the
        trial ends without returning a value.
        """
        if trial.number >= 100:
            self.study.stop()
            return

        # Hyperparameters to be optimized.
        params = {
            "objective": "regression",
            "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
            "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 2, 256),
            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 256),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        }

        cv = RepeatedKFold(n_splits=5, n_repeats=2, random_state=1)

        gbm = lgb.LGBMRegressor(**params, early_stopping_rounds=100)

        # NOTE(review): recent scikit-learn releases rename `fit_params` to
        # `params` - confirm against the installed version.
        n_scores = cross_val_score(gbm, self.X_train, self.y_train.ravel(),
                                   scoring='neg_mean_squared_error',
                                   cv=cv, n_jobs=-1, error_score='raise',
                                   fit_params={'eval_metric': 'l2', 'eval_set': [(self.X_test, self.y_test.flatten())]})

        # Scores are negated MSE values; report their magnitude so that the
        # study can minimize it.
        return np.mean(np.abs(n_scores))

    def fit(self):
        """Run the hyperparameter search and train the final booster.

        The study is stored in ``lgb-<name>.db`` (SQLite) and reused if it
        already exists. After the search, a booster is trained on ``X_train``
        with the best parameters and kept in ``self.bst``.
        """
        # Optimize
        study_name = 'lgb-{}'.format(self.name)
        storage_name = "sqlite:///lgb-{}.db".format(self.name)

        self.study = optuna.create_study(direction="minimize",
                                         pruner=optuna.pruners.HyperbandPruner(),
                                         study_name=study_name,
                                         storage=storage_name,
                                         load_if_exists=True)

        self.study.optimize(self._objective, n_trials=500)

        DX = lgb.Dataset(self.X_train, self.y_train.flatten())

        # best_params only holds the suggested values, so state the objective
        # used during tuning explicitly.
        final_params = {"objective": "regression", **self.study.best_params}
        self.bst = lgb.train(final_params, DX, num_boost_round=1000)

    def _denorm_calc(self, y_scaler, y, pred, interval=3):
        """Denormalize predictions and derive a rolling-variance uncertainty.

        Parameters
        ----------
        y_scaler : object with ``inverse_transform``
            Scaler used to map normalized targets back to original units.
        y : array of shape (n, 1)
            Normalized true targets.
        pred : array of shape (n,)
            Normalized predictions.
        interval : int, optional
            Size of the centered rolling window over the residuals.

        Returns
        -------
        pred_denorm : numpy.ndarray of shape (1, n)
            Denormalized predictions.
        unc : pandas.Series of length n
            Rolling variance of ``y - pred`` in original units; positions
            whose centered window would run past either end are set to 0.
        """
        y_denorm = y_scaler.inverse_transform(y)
        # Hand the scaler one sample per row (single feature column), then go
        # back to the (1, n) layout that callers index with [0].
        pred_column = np.asarray(pred).reshape(-1, 1)
        pred_denorm = y_scaler.inverse_transform(pred_column).reshape(1, -1)
        diff = pd.Series(np.subtract(np.ravel(y_denorm), np.ravel(pred_denorm)))

        unc = diff.rolling(interval, center=True).var()
        # A centered window of size w is incomplete for the first w // 2 and
        # the last (w - 1) // 2 positions; zero those instead of leaving NaN.
        lead = interval // 2
        trail = (interval - 1) // 2
        if lead:
            unc.iloc[:lead] = 0
        if trail:
            unc.iloc[-trail:] = 0
        return pred_denorm, unc

    def train_test_error(self):
        """Print the MSE (on normalized targets) for the pool and the test set."""
        train_error = np.mean((self.y.flatten() - self.bst.predict(self.X))**2)
        test_error = np.mean((self.test_data_y.flatten() - self.bst.predict(self.test_data_x))**2)
        print(f"Training error: {train_error}")
        print(f"Test error: {test_error}")

    def time_series(self):
        """Return the per-row residuals (normalized units) on the test set."""
        test_error = self.test_data_y.flatten() - self.bst.predict(self.test_data_x)
        return test_error

    def best_params(self):
        """Return the best parameter set found by the study."""
        return self.study.best_params

    def save_trials(self):
        """Write every trial of the study to ``best-params-lgb-<name>.xlsx``."""
        df = self.study.trials_dataframe()
        df.to_excel("best-params-lgb-{}.xlsx".format(self.name))

    def save_model(self):
        """Save the trained booster to ``lgb_<name>_model.txt``."""
        self.bst.save_model('lgb_{}_model.txt'.format(self.name), num_iteration=self.bst.best_iteration)

    def save_data(self):
        """Save denormalized predictions and uncertainties for pool and test rows.

        Writes four ``.npy`` files plus ``prec_lgb_denorm.feather`` holding
        the denormalized test predictions.
        """
        train_pred = self.bst.predict(self.X)
        train_pred_denorm, train_unc = self._denorm_calc(self.y_scaler, self.y, train_pred)
        np.save("lgb-train-pred.npy", train_pred_denorm)
        np.save("lgb-train-unc.npy", train_unc)

        test_pred = self.bst.predict(self.test_data_x)
        test_pred_denorm, test_unc = self._denorm_calc(self.y_scaler, self.test_data_y, test_pred)
        np.save("lgb-test-pred.npy", test_pred_denorm)
        np.save("lgb-test-unc.npy", test_unc)

        # test_pred_denorm has shape (1, n); take the single row.
        dfprec = pd.DataFrame(test_pred_denorm[0], columns=['prec_lgb_denorm'])
        dfprec.to_feather("prec_lgb_denorm.feather")



In [None]:
# Build the precipitation model: years before 2018 form the train/validation
# pool, 2018 onward is the test set, 0.75 is the train share of the random
# split, and "prec" is the suffix used for the study and output files.
Prec = PrecModel(wdc, 2018, 0.75, "prec")

In [None]:
# Run the Optuna search and train the final booster with the best parameters.
Prec.fit()

In [None]:
# Show the best hyperparameters found by the study.
Prec.best_params()

In [None]:
# Print the mean squared error (normalized units) on the pre-2018 pool and on the test rows.
Prec.train_test_error()

In [None]:
# Export every trial of the study to "best-params-lgb-prec.xlsx".
Prec.save_trials()

In [None]:
# Save the trained booster to 'lgb_prec_model.txt'.
Prec.save_model()

In [None]:
# Write the denormalized predictions and the rolling-variance uncertainties
# to disk (.npy files plus "prec_lgb_denorm.feather"); later cells read them back.
Prec.save_data()

In [None]:
class UncModel(PrecModel):
    """Model for the uncertainty target ``unc_roll``.

    Reuses the tuning, training and reporting methods of ``PrecModel`` but
    prepares its own arrays: the features are the columns ``temp850`` through
    ``unc`` and the target is ``unc_roll``.
    """

    def __init__(self, data, train_size, name, split_year=2018):
        """Fit the scalers and build the normalized train / validation / test arrays.

        Parameters
        ----------
        data : pandas.DataFrame
            Must contain a ``year`` column, the feature columns ``temp850``
            through ``unc`` (label slice, both ends included) and the target
            column ``unc_roll``.
        train_size : float or int
            Forwarded to ``train_test_split`` for the random split of the pool.
        name : str
            Suffix used in the Optuna study name and in output file names.
        split_year : int, optional
            Rows with ``year < split_year`` form the train/validation pool,
            the rest the test set. Defaults to 2018, the value that used to
            be hard-coded here.
        """
        self.data = data
        self.split_year = split_year
        self.train_size = train_size
        self.name = name

        # Fit normalization to current data.
        self.x_scaler = MinMaxScaler()
        self.x_scaler.fit(data.loc[:, 'temp850':'unc'])

        self.y_scaler = MinMaxScaler()
        self.y_scaler.fit(data.loc[:, 'unc_roll'].to_numpy().reshape(-1, 1))

        # Split train and test set by year.
        self.trainval_data = data.loc[data['year'] < split_year]
        self.test_data = data.loc[data['year'] >= split_year]

        # Normalize training data.
        self.X = self.x_scaler.transform(self.trainval_data.loc[:, 'temp850':'unc'].values)
        self.y = self.y_scaler.transform(self.trainval_data.loc[:, 'unc_roll'].values.reshape(-1, 1))

        # Normalize test data.
        self.test_data_input = self.test_data.loc[:, 'temp850':'unc']
        self.test_data_x = self.x_scaler.transform(self.test_data_input)
        self.test_data_output = self.test_data.loc[:, 'unc_roll']
        self.test_data_y = self.y_scaler.transform(self.test_data_output.values.reshape(-1, 1))

        # Random split of the pool; X_test / y_test are the validation part.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, train_size=self.train_size,
                                                               shuffle=True)

    def save_data(self):
        """Write test rows, both models' predictions and their errors to 'incerteza.xlsx'.

        Reads the precipitation predictions from ``prec_lgb_denorm.feather``
        and joins them to the test rows by position, so that file must hold
        one prediction per row of ``self.test_data`` in the same order.
        """
        test_pred_unc = self.bst.predict(self.test_data_x)
        # One sample per row for the single-feature scaler, then back to 1-D.
        pred_unc_denorm = self.y_scaler.inverse_transform(test_pred_unc.reshape(-1, 1)).ravel()
        dfunc = pd.DataFrame(pred_unc_denorm, columns=['lgb_unc_denorm'])
        dfprec = pd.read_feather("prec_lgb_denorm.feather")
        df_final = pd.concat([self.test_data.reset_index(drop=True), dfprec, dfunc], axis=1)
        # The precipitation model predicts 'prec_GPCP_roll', so the error is
        # measured against that target (mirrors 'error_unc' below).
        df_final['prec_error_denorm'] = df_final['prec_GPCP_roll'] - df_final['prec_lgb_denorm']
        df_final['error_unc'] = df_final['unc_roll'] - df_final['lgb_unc_denorm']
        df_final.to_excel('incerteza.xlsx')

In [None]:
# Read back the uncertainty arrays saved for the training pool and the test
# rows, then join them end to end (all training values first, then all test values).
train_unc, test_unc = np.load("lgb-train-unc.npy"), np.load("lgb-test-unc.npy")
traintest_unc = np.concatenate([train_unc, test_unc])

In [None]:
# Attach each saved uncertainty value to the row it was computed for.
# `wdc` is ordered by location, so when it holds more than one location the
# pre-2018 and 2018+ rows are interleaved; an array ordered "all training rows,
# then all test rows" cannot be assigned positionally. Place the values through
# the same year mask the models use to split the data (2018).
is_train_row = (wdc['year'] < 2018).to_numpy()
unc_by_row = np.empty(len(wdc), dtype=float)
unc_by_row[is_train_row] = train_unc   # raises if the sizes disagree
unc_by_row[~is_train_row] = test_unc
wdc.loc[:, 'unc'] = unc_by_row

In [None]:
# Adiciona mais uma coluna para representar o 'y'
wdc['unc_roll'] = np.roll(wdc['unc'], -1)

In [None]:
# Display the frame with the new 'unc' and 'unc_roll' columns.
wdc

In [None]:
# Build the uncertainty model: 0.75 is the train share of the random split and
# "unc" is the suffix used for the study and output files.
Unc = UncModel(wdc, 0.75, "unc")

In [None]:
# Run the Optuna search and train the final booster (method inherited from PrecModel).
Unc.fit()

In [None]:
# Show the best hyperparameters found by the study.
Unc.best_params()

In [None]:
# Print the mean squared error (normalized units) on the pre-2018 pool and on the test rows.
Unc.train_test_error()

In [None]:
# Export every trial of the study to "best-params-lgb-unc.xlsx".
Unc.save_trials()

In [None]:
# Save the trained booster to 'lgb_unc_model.txt'.
Unc.save_model()

In [None]:
# Combine the test rows with both models' denormalized predictions and their
# errors, and write the result to 'incerteza.xlsx'.
Unc.save_data()