# CatBoost

In [None]:
!pip install catboost

In [None]:
import pandas as pd

In [None]:
# Load the input series; the path is resolved relative to the working directory.
data = pd.read_csv('entrada_modelos.CSV')
# Interpolate only the numeric columns: calling DataFrame.interpolate on a
# frame that still holds object columns (the 'data' date column is parsed
# with pd.to_datetime only later) is deprecated in recent pandas releases.
numeric_cols = data.select_dtypes(include='number').columns
data[numeric_cols] = data[numeric_cols].interpolate()
# Remaining missing values per column (shown as the cell output).
data.isnull().sum()

In [None]:
# Plain assignment: seq_df is a second name for the same DataFrame object as
# data, not a copy.
seq_df = data

In [None]:
## Adapted from: https://github.com/marcopeix/datasciencewithmarco/blob/master/sklearn_time_series.ipynb
def window_input(window_length: int, data: pd.DataFrame, target_col: str = 'casos_sm') -> pd.DataFrame:
    """Append shifted copies of the target column to a copy of ``data``.

    Columns ``lag_1`` .. ``lag_{window_length - 1}`` hold the target shifted
    by ``-i`` rows (row ``t`` receives the value found at row ``t + i``) and
    column ``y`` holds the target shifted by ``-window_length`` rows.  Every
    NaN in the result -- the tail rows created by shifting as well as any NaN
    already present in ``data`` -- is replaced with 0.

    Args:
        window_length: Offset, in rows, between a row and its ``y`` value.
            Must be at least 1.
        data: Frame containing ``target_col``.  It is copied, never modified.
        target_col: Name of the column to shift.  Defaults to ``'casos_sm'``.

    Returns:
        A new DataFrame with the original columns followed by the ``lag_*``
        columns and ``y``.

    Raises:
        ValueError: If ``window_length`` is smaller than 1 (no ``y`` column
            could be built, which callers rely on).
    """
    if window_length < 1:
        raise ValueError(f'window_length must be >= 1, got {window_length}')

    df = data.copy()
    target = df[target_col]
    for step in range(1, window_length):
        df[f'lag_{step}'] = target.shift(-step)
    df['y'] = target.shift(-window_length)
    # Zero-fill instead of dropping rows so the frame keeps its original length.
    return df.fillna(0)

In [None]:
# window_length=4 adds the columns lag_1, lag_2, lag_3 and y.
seq_df = window_input(4, seq_df)
# Last expression of the cell, so the notebook displays the frame.
seq_df

In [None]:
# prompt: Using the DataFrame seq_df: make the 'data' column the index, drop every other column except 'casos_sm' and the lags

# Index by 'data' and keep only the target and its three shifted columns;
# 'y' and every other column are discarded.
seq_df = seq_df.set_index('data')[['casos_sm', 'lag_1', 'lag_2', 'lag_3']]
seq_df

In [None]:
# prompt: Using the DataFrame seq_df: convert the printed table to LaTeX code, keeping only the last 6 rows

# Print the last six rows as LaTeX source.
print(seq_df.tail(6).to_latex())


# Função em loop para múltiplos treinamentos com CatBoost

In [None]:
def format_data(seq_df):
    """Return a date-indexed copy of ``seq_df`` without its first column.

    The first column (by position) is dropped, the ``'data'`` column is
    parsed as ``%Y-%m-%d`` dates and becomes the index.  The input frame is
    left untouched: the conversion happens on the frame returned by ``drop``,
    not on the caller's object.

    Args:
        seq_df: DataFrame with a ``'data'`` column of ``YYYY-MM-DD`` strings
            that is not the first column.

    Returns:
        A new DataFrame indexed by the parsed ``'data'`` values.
    """
    formatted = seq_df.drop(seq_df.columns[0], axis=1)
    formatted['data'] = pd.to_datetime(formatted['data'], format='%Y-%m-%d')
    return formatted.set_index('data')

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def my_plot2(df):
    """Plot the actual and predicted series stored in ``df`` against its index.

    Args:
        df: DataFrame with the columns ``'casos_sm'`` (actual values) and
            ``'predicted_values'``.  Its index supplies the x positions, the
            axis limits and one tick per row.

    Returns:
        None.  The figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(10, 5))

    plt.plot(df.index, df['casos_sm'], label='Actual', color='blue', marker='o')
    plt.plot(df.index, df['predicted_values'], label='Predicted', color='red', marker='x')

    plt.xlabel('Sample Index')
    plt.ylabel('Value')
    plt.title('Predicted vs Actual Values')
    plt.xlim(df.index.min(), df.index.max())

    # One tick per row; a single call sets both the positions and the label
    # styling (the earlier bare xticks call was redundant).
    plt.xticks(list(df.index), fontsize=8, rotation=30)
    plt.legend()

    plt.show()

def calculate_figsize(num_entries, base_length=10, max_length=20):
    """Return a ``(width, height)`` figure size for ``num_entries`` points.

    The height is always 4.  Up to 10 entries the width is ``base_length``;
    beyond that it grows proportionally to ``num_entries / 10`` and is capped
    at ``max_length``.
    """
    fixed_height = 4
    width = base_length
    if num_entries > 10:
        # Proportional width, never wider than max_length.
        scaled = base_length * (num_entries / 10)
        width = scaled if scaled < max_length else max_length
    return (width, fixed_height)

# SARIMAX forecasts that used to be hard-coded (and unused) inside my_plot.
# Pass them as ``sarimax_values`` to overlay them on the chart; the list has
# 12 entries, so it only fits a 12-point evaluation window.
SARIMAX_REFERENCE_FORECAST = [
    232.5902948, 261.07891422, 252.37894709, 285.02830806,
    312.36586635, 219.82140723, 246.05941762, 286.54788506,
    258.00177699, 172.03113754, 154.68495018, 158.8637181,
]


def my_plot(predicted_values, actual_values, sarimax_values=None):
    """Plot observed values against CatBoost predictions over the same dates.

    Args:
        predicted_values: Sequence of predictions, one per row of
            ``actual_values``.
        actual_values: Series or single-column DataFrame of observed values.
            Its index supplies the x positions, the axis limits and the ticks.
        sarimax_values: Optional sequence with one value per row of
            ``actual_values``; when given it is drawn as a third series
            labelled 'SARIMAX'.  Defaults to ``None`` (no overlay), which is
            what the function drew before this parameter existed.

    Returns:
        None.  The figure is shown with ``plt.show()``.
    """
    # Widen the figure with the number of points (see calculate_figsize).
    plt.figure(figsize=calculate_figsize(len(predicted_values)))

    dates = actual_values.index
    plt.plot(dates, actual_values, label='Real', color='blue', marker='o')
    plt.plot(dates, predicted_values, label='CatBoost', color='red', marker='x')
    if sarimax_values is not None:
        plt.plot(dates, sarimax_values, label='SARIMAX', color='black', marker='p')

    plt.xlabel('Datas')
    plt.ylabel('Casos')
    plt.xlim(dates.min(), dates.max())

    # One tick per date; a single call sets both positions and label styling.
    plt.xticks(list(dates), fontsize=8, rotation=30)
    plt.legend()

    plt.show()


Mostrar que a predição converge em 2023 com os valores reais atualizados passo a passo (sem atualização nos dados pelos valores preditos):
```
new_data = data.iloc[:-52]

#Defasagem de 1 lag, durante as próximas 12 semanas
#catboost_loop(input_length, num_semanas, dados)
catboost_loop(1, 12, new_data)

```

Para mostrar que não converge, descomentar a linha:
```
# data_prov.iloc[train_size:train_size + input_length, data_prov.columns.get_loc('casos_sm')] = predictions

```



In [None]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def catboost_loop(input_length, num_semanas, data, update_with_predictions=True):
    """Walk-forward evaluation of a CatBoost regressor on the last rows of ``data``.

    The last ``num_semanas`` rows are predicted in consecutive blocks of
    ``input_length`` rows.  For every block a new model is fitted on all rows
    that precede it.  The collected predictions and observed values are
    plotted with ``my_plot`` and RMSE, MAE and R2 are printed.

    Args:
        input_length: Block size of each prediction step; also passed to
            ``window_input`` as ``window_length``.  Must be >= 1.
        num_semanas: Number of trailing rows to predict.  Must satisfy
            ``0 < num_semanas < len(data)`` so at least one training row exists.
        data: DataFrame in the layout expected by ``window_input`` and
            ``format_data`` (a ``'casos_sm'`` target and a ``'data'`` date
            column).  It is copied, never modified.
        update_with_predictions: When True (the default, matching the previous
            hard-wired behaviour) each predicted block overwrites
            ``'casos_sm'`` in the working copy, so later iterations are
            trained on predicted rather than observed values.  When False the
            observed values are kept.

    Raises:
        ValueError: If ``input_length`` or ``num_semanas`` is out of range.
    """
    if input_length < 1:
        raise ValueError(f'input_length must be >= 1, got {input_length}')
    if not 0 < num_semanas < len(data):
        raise ValueError(
            f'num_semanas must be between 1 and {len(data) - 1}, got {num_semanas}'
        )

    data_prov = data.copy()
    # Loop-invariant position of the target column, used for the write-back.
    target_pos = data_prov.columns.get_loc('casos_sm')
    if update_with_predictions:
        # Predictions are floats; cast up front so the positional write-back
        # never has to change the column dtype.
        data_prov['casos_sm'] = data_prov['casos_sm'].astype(float)

    prediction_chunks = []
    actual_chunks = []

    # First row of the evaluation window.
    start_index = len(data) - num_semanas

    for train_size in range(start_index, len(data), input_length):
        print("Iteração: ", train_size)

        # Rebuild the shifted columns every iteration because data_prov may
        # have changed in the previous one.
        seq_df = format_data(window_input(input_length, data_prov))

        # Features are every column except the target and 'y'.
        X = seq_df.drop(columns=['casos_sm', 'y'])
        Y = seq_df[['casos_sm']]

        test_rows = slice(train_size, train_size + input_length)
        X_train, X_test = X.iloc[:train_size], X.iloc[test_rows]
        y_train, y_test = Y.iloc[:train_size], Y.iloc[test_rows]

        model = CatBoostRegressor(random_state=42, loss_function='RMSE', verbose=False)
        model.fit(X_train, y_train)

        predictions = model.predict(X_test)
        prediction_chunks.append(predictions)
        # y_test still holds observed values here: the write-back below only
        # happens after the block has been recorded.
        actual_chunks.append(y_test)

        if update_with_predictions:
            data_prov.iloc[test_rows, target_pos] = predictions

    # Assemble once instead of growing an array/frame inside the loop.
    predicted_values = np.concatenate(prediction_chunks)
    actual_values = pd.concat(actual_chunks)

    print(len(predicted_values))
    my_plot(predicted_values, actual_values)

    rmse = np.sqrt(mean_squared_error(actual_values, predicted_values))
    print(f'RMSE: {rmse}')

    mae = mean_absolute_error(actual_values, predicted_values)
    print(f'MAE: {mae}')

    r2 = r2_score(actual_values, predicted_values)
    print(f'R2: {r2}')

# catboost_loop(input_length, num_semanas, data)
# e.g. catboost_loop(1, 12, data): predict 12 weeks, one row at a time
# errors='ignore' keeps this cell re-runnable: once the column has been
# dropped, a second execution would otherwise raise KeyError.
data = data.drop(columns=['p_inc100k_sm'], errors='ignore')

catboost_loop(1, 52, data)


In [None]:
## Criação das defasagens no dataset
input_length = 12
seq_df = window_input(input_length, data)

In [None]:
# Same steps as format_data (parse 'data', drop the first column, index by
# date); call the helper instead of repeating its body.
seq_df = format_data(seq_df)
#seq_df

In [None]:
#seq_df = seq_df.drop('p_inc100k_sm', axis=1)

In [None]:
## Separação dos dados em treino e teste
X = seq_df.drop(columns=['casos_sm', 'y'], axis=1)
Y = seq_df[['casos_sm']]

train_size = len(X) - input_length

y_train, y_test = Y.iloc[:train_size], Y.iloc[train_size:]
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]

In [None]:
from catboost import CatBoostRegressor

# Fixed seed, RMSE objective, training log silenced.
model = CatBoostRegressor(random_state=42, loss_function='RMSE', verbose=False)
model.fit(X_train, y_train)

In [None]:
# Predict the held-out rows; the bare name below displays the result.
predictions = model.predict(X_test)
predictions

# Métricas de Erro

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Compute RMSE (square root of the mean squared error)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
print(f'RMSE: {rmse}')

# Compute MAE
mae = mean_absolute_error(y_test, predictions)
print(f'MAE: {mae}')

# Compute R2
r2 = r2_score(y_test, predictions)
print(f'R2: {r2}')

In [None]:
# Plot the predictions against (at most) the last 12 observed test rows.
my_plot(predictions,y_test[-12:])

In [None]:
# Plotar previsões vs. valores reais
plt.figure(figsize=(14, 7))

# Dados de treino
# plt.plot(X_train[-12:].index, y_train[-12:], label='Treino', color='blue')

# Dados de teste
plt.plot(X_test[-12:].index, y_test[-12:], label='Teste', color='green')

# Previsões
plt.plot(X_test[-12:].index, predictions, label='Previsão', color='red')

plt.legend()
plt.title('Previsões com CatBoost')
plt.xlabel('Data')
plt.ylabel('Casos')
plt.show()


In [None]:
## Helper DataFrames for the plots
# Empty frame that only carries the test-set index.
dates = pd.DataFrame(index=y_test.index)
feature_df = pd.DataFrame({'Feature': X_train.columns, "Importance": model.get_feature_importance()})
# Displayed as the cell output; feature_df itself stays unsorted because the
# sorted frame is not assigned.
feature_df.sort_values('Importance', ascending=False)

In [None]:
## Helper DataFrames for the plots
dates = pd.DataFrame(index=y_test.index)

# sort_values returns a new frame; assign it so the ranking is actually kept
# (the bare call in the middle of the cell had no effect).
feature_df = pd.DataFrame({'Feature': X_train.columns, "Imp": model.get_feature_importance()})
feature_df = feature_df.sort_values('Imp', ascending=False)

fig, ax = plt.subplots(figsize=(10, 4))

# Last input_length dates of the test index, shared by both series and the ticks.
plot_dates = dates.index[-input_length:]

ax.plot(plot_dates, y_test.values, marker='.', color='blue', label='Observados')
ax.plot(plot_dates, predictions, marker='P', color='black', label='CatBoost')

ax.set_xlabel('Datas')
ax.set_ylabel('Casos de dengue')

# One tick per plotted date, labelled with the date itself.
plt.xticks(plot_dates, plot_dates)
plt.legend(loc=1)

fig.autofmt_xdate()
plt.tight_layout()

In [None]:
!pip install shap

In [None]:
import shap

# Compute SHAP values for every row of X (train and test rows together)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# Plot the feature importance as a bar chart
shap.summary_plot(shap_values, X, plot_type="bar")

In [None]:
# Plot the detailed feature-importance summary (default plot type)
shap.summary_plot(shap_values, X)

In [None]:
# Plot the dependence of the SHAP values on one specific feature ('lag_1')
shap.dependence_plot("lag_1", shap_values, X)