In [1]:
# Bibliotecas necessárias
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

In [2]:
# Baixar o arquivo diretamente do GitHub
!wget -q {'https://raw.githubusercontent.com/oukawa/ML_Salvador/f50a3ada2139c66a8c9b1b5048b61fc57d18b2e3/Dados/Matriz_Meteorologia_FreqCID-J_lag.csv'}

In [3]:
# Read the main data matrix from the CSV file in the working directory
df = pd.read_csv('Matriz_Meteorologia_FreqCID-J_lag.csv', sep=',', index_col=False)

# Discard every row whose 'data' value falls in 2020 (pandemic year)
ano = pd.to_datetime(df['data']).dt.year
df = df.loc[ano != 2020]

# Columns that must not be used as predictors: the 'data' column, the target
# itself, the freq_intern_<N>d variants and the age / length-of-stay summaries.
colunas_excluidas = [
    'data', 'freq_intern',
    'freq_intern_1d', 'freq_intern_2d', 'freq_intern_3d', 'freq_intern_4d', 'freq_intern_5d',
    'idade_mod', 'idade_med', 'dias_intern_mod', 'dias_intern_med',
]

# Dependent variable as a flat 1-D array.
# To model a shifted-ahead frequency instead, change the column name chosen here.
Y = df['freq_intern'].values.ravel()

# Independent variables: everything that is left after removing the columns above
X = df.drop(columns=colunas_excluidas)

# Report the dimensions of predictors and target, one per line
print(X.shape, Y.shape, sep='\n')

(1076, 23)
(1076,)


In [6]:
# 5-fold cross-validation of a random forest regressor on (X, Y).
# Each fold's RMSE, MAE and R2 on the held-out part are collected and printed at the end.

# Fixed seed shared by the fold splitter and the forest: without it both draw fresh
# random numbers on every run, so the printed metrics could not be reproduced.
RANDOM_STATE = 42

model = RandomForestRegressor(n_estimators = 500, oob_score = True, max_depth=None,
                            min_samples_leaf = 1, min_samples_split = 2,
                            n_jobs=-1, verbose=0, random_state=RANDOM_STATE)

# NOTE(review): the rows appear to be dated observations with lagged features; shuffled
# folds mix earlier and later dates between train and test. If temporal leakage matters
# for this study, consider sklearn.model_selection.TimeSeriesSplit -- confirm with the author.
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# One entry per fold, in split order
rmse_scores = []
mae_scores = []
r2_scores = []

for train_index, test_index in kf.split(X):
    # X is a DataFrame (positional .iloc); Y is a NumPy array (plain indexing)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # warm_start is left at its default, so fit() rebuilds the forest from scratch;
    # reusing the same estimator object across folds is therefore safe.
    model.fit(X_train, Y_train)

    Y_pred = model.predict(X_test)

    # Root mean squared error on the held-out fold
    rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
    rmse_scores.append(rmse)

    # Mean absolute error on the held-out fold
    mae = mean_absolute_error(Y_test, Y_pred)
    mae_scores.append(mae)

    # Coefficient of determination on the held-out fold
    r2 = r2_score(Y_test, Y_pred)
    r2_scores.append(r2)

# Summary of the per-fold metrics
print('RMSE:', rmse_scores)
print('MAE:', mae_scores)
print('R2:', r2_scores)

RMSE: [7.211330640811799, 7.200307550278992, 7.1036393568105565, 7.126078412716636, 7.009947796640708]
MAE: [5.648157407407407, 5.793106976744186, 5.449237209302325, 5.698446511627908, 5.545693023255814]
R2: [0.7229556823158645, 0.7093537985022043, 0.690204482520384, 0.7183719239312513, 0.7513189872893324]
