In [None]:
# Bibliotecas necessárias
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

In [None]:
# Load the main matrix from CSV
df = pd.read_csv('Matriz_Meteorologia_FreqCID-J_lag.csv', sep=',', index_col=False)

# Leave out every row dated in 2020 (pandemic year)
df = df.loc[pd.to_datetime(df['data']).dt.year.ne(2020)]

# Independent variables: everything except the date, the target and the
# other columns listed below
X = df.drop(['data', 'freq_intern',
             'freq_intern_1d', 'freq_intern_2d', 'freq_intern_3d', 'freq_intern_4d', 'freq_intern_5d',
             'idade_mod', 'idade_med', 'dias_intern_mod', 'dias_intern_med'],
            axis=1)

# Dependent variable, flattened to a 1-D array.
# To predict a shifted-ahead frequency instead, change the column name here.
Y = df['freq_intern'].values.ravel()

# Show the resulting shapes, one per line
print(X.shape, Y.shape, sep='\n')

(1076, 23)
(1076,)


In [None]:
# Cross-validation using K-Fold with 5 folds.
#
# Both the fold shuffling and the random forest are seeded, so a fresh
# "Restart Kernel -> Run All" reproduces the same folds and the same metrics.
SEED = 42

model = RandomForestRegressor(n_estimators=500, oob_score=True, max_depth=None,
                              min_samples_leaf=1, min_samples_split=2,
                              n_jobs=-1, verbose=0, random_state=SEED)

# NOTE(review): the rows carry a 'data' (date) column and the file name mentions
# lags, so this looks like time-ordered data; shuffled folds may then let the
# model see days adjacent to the test days. Confirm whether a time-aware split
# is more appropriate.
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

# One entry per fold
rmse_scores = []
mae_scores = []
r2_scores = []

for train_index, test_index in kf.split(X):
    # X is a DataFrame (positional .iloc); Y is a NumPy array (plain indexing)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # fit() retrains the forest from scratch on this fold's training rows
    model.fit(X_train, Y_train)

    Y_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
    rmse_scores.append(rmse)

    mae = mean_absolute_error(Y_test, Y_pred)
    mae_scores.append(mae)

    r2 = r2_score(Y_test, Y_pred)
    r2_scores.append(r2)

# Metrics summary (one value per fold)
print('RMSE:', rmse_scores)
print('MAE:', mae_scores)
print('R2:', r2_scores)

RMSE: [7.278458745071219, 7.233002027207057, 7.175155884863619, 6.902327770481694, 7.308606864989848]
MAE: [5.904685185185186, 5.535627906976743, 5.542558139534884, 5.379506976744186, 5.858074418604652]
R2: [0.7233093322183523, 0.7197471089468506, 0.7055235789906906, 0.7629675707252452, 0.6447553257243323]


In [None]:
# Repeated random train/test validation (80/20 split) with N repetitions.
#
# The forest and every split are seeded, so a fresh
# "Restart Kernel -> Run All" reproduces the same metrics.
SEED = 42

model = RandomForestRegressor(n_estimators=500, oob_score=True, max_depth=None,
                              min_samples_leaf=1, min_samples_split=2,
                              n_jobs=-1, verbose=0, random_state=SEED)
# Number of repetitions
loops = 10

# One entry per repetition
rmse_scores = []
mae_scores = []
r2_scores = []

for i in range(loops):
    # A different but deterministic seed per repetition: a single fixed seed
    # would produce the exact same split (and scores) on every iteration.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=SEED + i)

    # fit() retrains the forest from scratch on this repetition's training rows
    model.fit(X_train, Y_train)

    Y_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
    rmse_scores.append(rmse)

    mae = mean_absolute_error(Y_test, Y_pred)
    mae_scores.append(mae)

    r2 = r2_score(Y_test, Y_pred)
    r2_scores.append(r2)

# Metrics summary (one value per repetition)
print('RMSE:', rmse_scores)
print('MAE:', mae_scores)
print('R2:', r2_scores)

RMSE: [7.365350597624388, 6.871434007164358, 6.96195846487658, 6.585715324346577, 7.714165800655311, 7.401685055895492, 6.9088534544854525, 6.940430648928969, 7.768019797482115, 7.4707028327202805]
MAE: [5.9894537037037034, 5.459083333333333, 5.4756481481481485, 5.392648148148147, 6.120499999999999, 5.768222222222223, 5.383935185185185, 5.397277777777777, 5.990212962962963, 5.83812962962963]
R2: [0.7142547158394433, 0.7557274069963282, 0.7628661264031881, 0.784776403908458, 0.6850358291937416, 0.711986449816068, 0.6896365984547025, 0.73820247430251, 0.6914509153849282, 0.7393785452069108]
