# SARIMAX

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from pmdarima import auto_arima
from statsmodels.tsa.arima.model import ARIMA

In [None]:
# Load the dataset into a pandas DataFrame
df = pd.read_csv('entrada_sarimax.csv')
# Rename to lowercase 'data' so the column name matches the lookups below
# (renaming to 'Data' made df['data'] raise KeyError).
df = df.rename(columns={'data_iniSE_sm': 'data', 'casos_sm': 'casos'})
df['data'] = pd.to_datetime(df['data'], format='%Y-%m-%d')
df = df.set_index(['data'])
df.head()

In [None]:
# Descriptive statistics
df.describe()

In [None]:
# Plot the 'casos' column on a wide, short figure
df['casos'].plot(figsize=(12,3))

In [None]:
## Columns selected as exogenous variables
#df = df[['p_inc100k_sm', 'casos_poa', 'p_inc100k_poa','casos_rj', 'p_inc100k_rj', 'tempmin', 'umidmin', 'tempmax', 'umidmax','vento', 'chuva']]

In [None]:
## Adaptado de: https://github.com/marcopeix/datasciencewithmarco/blob/master/sklearn_time_series.ipynb
def window_input(window_length: int, data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``data`` extended with shifted copies of 'casos'.

    Column ``x_k`` (for k = 1 .. window_length - 1) holds the 'casos' value
    found k rows further down; column ``y`` holds the value found
    ``window_length`` rows further down. ``y`` is only added when
    ``window_length`` is at least 1. The input frame is not modified.

    Every NaN in the resulting frame (including the rows at the end that the
    shifts leave empty) is replaced by 0 instead of being dropped.
    """
    windowed = data.copy()
    target = windowed['casos']

    for step in range(1, window_length):
        windowed[f'x_{step}'] = target.shift(-step)

    if window_length >= 1:
        windowed['y'] = target.shift(-window_length)

    # Zero-fill rather than drop, so the row count stays the same
    return windowed.fillna(0)

In [None]:
# Window length passed to window_input; also used below as the size of the held-out test split
input_length = 12

In [None]:
## Add the shifted 'casos' columns (x_1 .. x_{input_length-1} and 'y') to the dataset
df = window_input(input_length, df)

In [None]:
## Train/test split: the last `input_length` rows are held out for testing
X = df.drop(columns=['casos', 'y', 'SE'])
Y = df['casos']

Y_train = Y.iloc[:-input_length]
Y_test = Y.iloc[-input_length:]
X_train = X.iloc[:-input_length]
X_test = X.iloc[-input_length:]

In [None]:
## Train/test split: the last `input_length` rows are held out for testing
# NOTE(review): this cell rebinds the same names as the split cell before it,
# but keeps the 'y' column among the exogenous regressors. Confirm which of
# the two variants is the intended one.
X = df.drop(columns=['casos', 'SE'])
Y = df['casos']

Y_train = Y.iloc[:-input_length]
Y_test = Y.iloc[-input_length:]
X_train = X.iloc[:-input_length]
X_test = X.iloc[-input_length:]

In [None]:
# Inspect which columns are being used as exogenous regressors
X_train.columns

In [None]:
## Use 'auto_arima' to search for the best (p, d, q)(P, D, Q, m) orders
# The exogenous matrix is passed through `X`; the previous keyword was
# misspelled ('exougenous'), so the regressors never reached the search.
fit_arima = auto_arima(Y_train.to_numpy(),
                       X=X_train.to_numpy().astype(float),
                       start_p=0, start_q=0, max_p=4, max_q=4,
                       start_P=0, start_Q=0, max_P=4, max_Q=4,
                       seasonal=True,
                       m=52,
                       information_criterion='aic',
                       trace=True,
                       stepwise=True,
                       suppress_warnings=True)

In [None]:
## Diagnostic plots for the model selected by auto_arima
fit_arima.plot_diagnostics(figsize=(12, 8))
plt.show()

In [None]:
## Fit SARIMAX with the orders found by auto_arima

from statsmodels.tsa.statespace.sarimax import SARIMAX

# 'enforce_exog' is not a SARIMAX parameter; it was being forwarded as an
# unknown keyword argument, so it has been removed.
model = SARIMAX(endog=Y_train.to_numpy(),
                exog=X_train.to_numpy().astype(float),
                order=fit_arima.get_params()['order'],
                seasonal_order=fit_arima.get_params()['seasonal_order'])

sarimax_model = model.fit()
#sarimax_model.summary()

In [None]:
## Predictions
# Forecast the `input_length` steps that follow the training sample, using the
# test-period regressors. The previous start=1 / end=input_length range
# returned in-sample predictions for the first training rows, which were then
# compared against Y_test.
predictions = sarimax_model.get_forecast(steps=input_length,
                                         exog=X_test.to_numpy().astype(float))
predictions.predicted_mean

In [None]:
## Put the predictions in a DataFrame indexed like the test set, for plotting
pred_sarimax = pd.DataFrame(
    predictions.predicted_mean[:input_length],
    index=Y_test.index[-input_length:],
).rename(columns={0: 'previsto'})

## SARIMAX Plot

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def calculate_figsize(num_entries, base_length=10, max_length=20):
    """Return a ``(width, height)`` figure size for ``num_entries`` points.

    The height is always 4. Up to 10 entries the width is ``base_length``;
    beyond that it grows as ``base_length * num_entries / 10`` and is capped
    at ``max_length``.
    """
    height = 4
    width = (
        base_length
        if num_entries <= 10
        else min(max_length, base_length * (num_entries / 10))
    )
    return (width, height)

def my_plot(predicted_values, actual_values):
    """Plot actual and predicted values on the same axes and show the figure.

    Parameters
    ----------
    predicted_values
        Predicted values; plotted against ``actual_values.index``, so it must
        have one entry per entry of ``actual_values``.
    actual_values
        Observed values; its ``index`` provides the x axis and the tick
        positions.
    """
    figsize_ = calculate_figsize(len(predicted_values))
    plt.figure(figsize=figsize_)

    # Both curves share the index of the actual values as x axis
    plt.plot(actual_values.index, actual_values,
             label='Real', color='blue', marker='o')
    plt.plot(actual_values.index, predicted_values,
             label='Previsto', color='red', marker='x')

    plt.xlabel('Datas')
    plt.ylabel('Casos')
    plt.title('Valores reais vs valores previstos')
    plt.xlim(actual_values.index.min(), actual_values.index.max())

    # One tick per observation; a single call sets positions and styling
    ticks = list(actual_values.index)
    plt.xticks(ticks, fontsize=8, rotation=30)
    plt.legend()

    plt.show()

# Compare the SARIMAX predictions with the held-out test values
my_plot(pred_sarimax, Y_test)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Compute RMSE (square root of the mean squared error)
mse = mean_squared_error(Y_test,pred_sarimax)
rmse = np.sqrt(mse)
print(f'RMSE: {rmse}')

# Compute MAE
mae = mean_absolute_error(Y_test, pred_sarimax)
print(f'MAE: {mae}')

# Compute R2
r2 = r2_score(Y_test, pred_sarimax)
print(f'R2: {r2}')

In [None]:
import numpy as np
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt


def sarimax_loop(input_length, num_semanas, data):
    """Walk-forward SARIMAX evaluation over the last ``num_semanas`` rows.

    Starting ``num_semanas`` rows before the end of ``data``, the function
    repeatedly (1) rebuilds the shifted columns with ``window_input``,
    (2) searches the orders with ``auto_arima``, (3) fits a SARIMAX model on
    all rows before the current position, and (4) predicts the next
    ``input_length`` rows. The predictions are written back into the working
    copy of 'casos', so later iterations train on earlier predictions.
    Finally it plots predictions against actual values and prints RMSE, MAE
    and R2.

    Parameters
    ----------
    input_length : int
        Number of rows predicted per iteration (also the window length passed
        to ``window_input``).
    num_semanas : int
        Number of trailing rows of ``data`` to predict.
    data : pandas.DataFrame
        Input frame; must contain the 'casos' and 'SE' columns. It is not
        modified.

    Raises
    ------
    ValueError
        If ``input_length`` or ``num_semanas`` is not positive, or if
        ``num_semanas`` leaves no rows to train on.
    """
    if input_length < 1:
        raise ValueError('input_length must be a positive integer')
    start_index = len(data) - num_semanas
    if num_semanas < 1 or start_index < 1:
        raise ValueError('num_semanas must be between 1 and len(data) - 1')

    # Work on a copy of the argument (not of a global) so the caller's frame
    # is left untouched.
    data_prov = data.copy()
    # Predictions are floats; cast up front so writing them back does not
    # depend on an implicit dtype upcast.
    data_prov['casos'] = data_prov['casos'].astype(float)
    casos_col = data_prov.columns.get_loc('casos')

    predicted_chunks = []
    actual_chunks = []

    for train_size in range(start_index, len(data), input_length):
        # Rebuild the shifted columns from the (partly predicted) series
        seq_df = window_input(input_length, data_prov)

        X = seq_df.drop(columns=['casos', 'y', 'SE'])
        Y = seq_df[['casos']]

        test_end = train_size + input_length
        y_train, y_test = Y.iloc[:train_size], Y.iloc[train_size:test_end]
        X_train, X_test = X.iloc[:train_size], X.iloc[train_size:test_end]
        # The last window may be shorter than input_length
        horizon = len(y_test)

        # Order search on the current training slice, with its regressors
        fit_arima = auto_arima(y_train['casos'].to_numpy(),
                               X=X_train.to_numpy(),
                               start_p=0, start_q=0, max_p=5, max_q=5,
                               start_P=0, start_Q=0, max_P=5, max_Q=5,
                               seasonal=True,
                               m=52,
                               information_criterion='aic',
                               trace=True,
                               stepwise=True,
                               suppress_warnings=True)
        best_params = fit_arima.get_params()

        model = SARIMAX(y_train, exog=X_train,
                        order=best_params['order'],
                        seasonal_order=best_params['seasonal_order'])
        model_fit = model.fit(disp=False)

        predictions = np.asarray(
            model_fit.predict(start=train_size,
                              end=train_size + horizon - 1,
                              exog=X_test)
        )
        predicted_chunks.append(predictions)
        actual_chunks.append(y_test)

        # Feed the predictions back so the next iteration trains on them
        data_prov.iloc[train_size:train_size + horizon, casos_col] = predictions

    predicted_values = np.concatenate(predicted_chunks)
    actual_values = pd.concat(actual_chunks)

    print(predicted_values)
    print(len(predicted_values))
    my_plot(predicted_values, actual_values)

    # RMSE
    mse = mean_squared_error(actual_values, predicted_values)
    rmse = np.sqrt(mse)
    print(f'RMSE: {rmse}')

    # MAE
    mae = mean_absolute_error(actual_values, predicted_values)
    print(f'MAE: {mae}')

    # R2
    r2 = r2_score(actual_values, predicted_values)
    print(f'R2: {r2}')

# Run the function
# sarimax_loop(input_length, num_semanas, data)
# sarimax_loop(1, 12, data) = one step at a time, predicts 12 weeks
sarimax_loop(1, 12, df)