# Import libraries

In [None]:
import numpy as np
import pandas as pd

import datetime as dt

import requests
import yfinance as yf
import pandas_datareader.data as web

# Extract data

In [None]:
# Extraction window: the last `years` years, approximated as 365-day years
# (leap days are ignored, so the window is a few days short of exact years).
years = 10

# Read the clock once so both bounds derive from the same instant; two
# separate now() calls could disagree if the cell runs across midnight.
dt_end = dt.datetime.now().date()
dt_start = dt_end - dt.timedelta(days=years * 365)

print(f'Extraindo informações de {dt_start} até {dt_end}')

### yfinance

| **Indicator**        | **Ticker (Yahoo Finance)** | **Description** |
|---------------------|--------------------------|-------------|
| **IBOVESPA**       | `^BVSP`                   | Brazil Stock Market Index |
| **Commodities**     | `GC=F`, `CL=F`, `SB=F`, `ZC=F` | Gold, Crude Oil, Sugar, Corn |
| **Stock Market Index (S&P 500)** | `^GSPC` | Standard & Poor’s 500 (S&P 500) Index |
| **Cryptocurrency (Bitcoin)** | `BTC-USD` | Bitcoin price in USD |

In [None]:
# Yahoo Finance symbols to download: IBOVESPA, S&P 500, Bitcoin, and the
# gold, crude-oil and sugar futures.
tickers = [
    "^BVSP",
    "^GSPC",
    "BTC-USD",
    "GC=F",
    "CL=F",
    "SB=F",
]

In [None]:
# Download historical data (default is daily interval) for every ticker
# between dt_start and dt_end, then forward-fill gaps (for example, dates on
# which only some of the tickers have quotes).
df_yf = yf.download(tickers, start=dt_start, end=dt_end).ffill()

# Make sure the frame ends up indexed by datetime.
# yfinance normally delivers the dates as the index, not as a 'Date' column,
# in which case the column branch below is skipped; instead of printing a
# misleading "column is not present" message, normalise the index directly.
if 'Date' in df_yf.columns:
    df_yf['Date'] = pd.to_datetime(df_yf['Date'])
    df_yf.set_index('Date', inplace=True)
else:
    df_yf.index = pd.to_datetime(df_yf.index)
    df_yf.index.name = 'Date'

# Display the last few rows
df_yf.tail()

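| **Indicator** | **Ticker (Yahoo Finance)** | **Description** |
|---------------|----------------------------|-----------------|
| **Exchange Rate (Forex)** | `USDBRL=X`, `EURBRL=X` | USD/BRL (Dollar to Real), EUR/BRL (Euro to Real) |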

In [None]:
# Collapse the multi-level column header: join the levels of each column
# with an underscore so every column has a single string name.
column_levels = df_yf.columns.values
df_yf.columns = list(
    map(lambda levels: '_'.join(levels).strip(), column_levels)
)

# Show the last rows of the frame with its flattened column names
df_yf.tail()

### bcb - Banco Central do Brasil

https://www3.bcb.gov.br/sgspub/localizarseries/localizarSeries.do?method=prepararTelaLocalizarSeries

In [None]:
# Friendly column name -> SGS series code at Banco Central do Brasil.
# NOTE(review): the labels were not checked against the SGS catalogue here;
# confirm each code really corresponds to its name before relying on it.
series_br = {
    'SELIC': 11,
    'CDI': 12,
    'SELIC_Anual': 1178,
    'SELIC_Meta_Anual': 432,
    'IPCA_Mensal': 433,
    'IGP_M_Mensal': 189,
    'INCC_Mensal': 192,
    'Indice_Condicoes_Econ_BR': 27574,
    'Indice_Condicoes_Econ_BR_USD': 29042,
    'Salario_Minimo': 1619,
    'IBC_BR': 24363,
    'Populacao_BR': 21774,
    'PIB_Trimestral_Real': 4380,
    'PIB_Anual_Corrente': 7326,
    'Deflator_Implicito_PIB': 1211,
}


In [None]:
# Function to fetch one series from the BCB SGS API
def get_bcb_series(sgs_code, start, end, timeout=30):
    """Download one SGS series from the Banco Central do Brasil API.

    Args:
        sgs_code: SGS series code, interpolated into the request URL.
        start: First date of the requested window; must provide ``strftime``.
        end: Last date of the requested window; must provide ``strftime``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON payload. An empty payload is returned unchanged
        after a warning is printed.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
        ValueError: If the response body is not valid JSON.
    """
    url = f'https://api.bcb.gov.br/dados/serie/bcdata.sgs.{sgs_code}/dados'

    # Build the query parameters in the format the API expects
    params = {
        'formato': 'json',
        'dataInicial': start.strftime('%d/%m/%Y'),  # dd/mm/yyyy
        'dataFinal': end.strftime('%d/%m/%Y'),      # dd/mm/yyyy
    }

    # Request; fail loudly on HTTP errors instead of trying to decode an
    # error page as if it were series data.
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    # Warn when the response carries no observations
    if not data:
        print(f"Warning: No data found for SGS code {sgs_code} between {start} and {end}.")

    return data

In [None]:
# Download every series and store the non-empty ones in a dictionary
br_dataframes = {}
for name, code in series_br.items():
    print(f'Baixando {name} (código {code})...')
    try:
        series_df = pd.DataFrame(get_bcb_series(code, start=dt_start, end=dt_end))
    except Exception as e:
        # Deliberately broad: one failing series must not abort the others.
        print(f"Erro ao baixar a série {name} (código {code}): {e}")
        continue

    # An empty payload produces a frame without the 'data'/'valor' columns,
    # which would make the later concat step fail, so leave it out.
    if series_df.empty:
        print(f"Série {name} (código {code}) sem dados; ignorada.")
        continue

    br_dataframes[name] = series_df

In [None]:
# Combine all DataFrames in the dictionary into a single DataFrame with one
# column per series, indexed by date.
# - 'data' is parsed from dd/mm/yyyy text into datetimes.
# - 'valor' is converted to numbers (unparseable entries become NaN) so the
#   columns are numeric right away and forward-fill treats blanks as gaps.
#   NOTE(review): the API presumably delivers 'valor' as text — confirm.
# - Frames without rows are skipped because they lack these columns.
df_br = pd.concat(
    {
        key: df.assign(
            data=pd.to_datetime(df['data'], format='%d/%m/%Y'),
            valor=pd.to_numeric(df['valor'], errors='coerce'),
        ).set_index('data')['valor']
        for key, df in br_dataframes.items()
        if not df.empty
    },
    axis=1,
)

# Sort the DataFrame by index (date)
df_br.sort_index(inplace=True)

# Display the last rows of a forward-filled copy (df_br itself is unchanged)
df_br.ffill().tail()

### pandas_datareader

In [None]:
# FRED series codes mapped to friendlier column names
series_usa = {
    'DEXBZUS': 'BRL_USD',
    'CPIAUCSL': 'CPI_USA',
}

In [None]:
# Fetch each FRED series, rename its column to the friendly name, and place
# all of them side by side in a single DataFrame.
fred_frames = []
for fred_code, column_name in series_usa.items():
    fred_frame = web.DataReader(fred_code, 'fred', dt_start, dt_end)
    fred_frames.append(fred_frame.rename(columns={fred_code: column_name}))

df_usa = pd.concat(fred_frames, axis=1)

In [None]:
# Preview the last rows of the FRED data
df_usa.tail()

## Load dataset

In [None]:
# Left-join the BCB and FRED frames onto the Yahoo Finance frame by index,
# so only the dates present in df_yf are kept.
dataset = df_yf.join([df_br, df_usa], how='left')

# Fill gaps with the last known value, then back-fill whatever is still
# missing at the start with the next known value.
dataset = dataset.ffill().bfill()

In [None]:
# Print the first rows of the combined dataset
print(dataset.head())

In [None]:
# Show the last rows of the combined dataset
dataset.tail()

In [None]:
# Spot-check a single date: show the row labelled 2025-01-02
dataset.loc['2025-01-02']

In [None]:
# Print the full per-column summary of the dataset
dataset.info(verbose=True)

In [None]:
# Inspect the column at position 4.
# NOTE(review): presumably the IBOVESPA close used as target later — confirm.
dataset.iloc[:, 4]

In [None]:
# Save to a CSV file, creating the target folder first so that a checkout
# without ../data/raw does not fail on write.
from pathlib import Path

raw_dir = Path('../data/raw')
raw_dir.mkdir(parents=True, exist_ok=True)
dataset.to_csv(raw_dir / 'dataset.csv')

## Data processing

In [None]:
# Reload the saved dataset from CSV, using the first column as the index and
# parsing it as dates. Relies on pandas being imported as pd.
dataset = pd.read_csv(
    '../data/raw/dataset.csv',
    index_col=0,
    parse_dates=True,
)


In [None]:
# Bind the reloaded dataset to df, the name used by the processing cells
df = pd.DataFrame(dataset)

In [None]:
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler, OneHotEncoder

# Standardise every numeric column (selected by dtype); any other column is
# passed through unchanged.
numeric_selector = make_column_selector(dtype_include=np.number)
scaling_step = ('scaler', StandardScaler(), numeric_selector)
preprocessor = ColumnTransformer(
    transformers=[scaling_step],
    remainder='passthrough',
)

# Single-step pipeline wrapping the column transformer
pipeline_steps = [('preprocessor', preprocessor)]
pipeline_process = Pipeline(steps=pipeline_steps)

# Fit the pipeline on the whole frame and transform it in the same call
X_all = pipeline_process.fit_transform(df)

In [None]:
# Print the integer position of every column whose name contains 'BVSP'
bvsp_mask = df.columns.str.contains('BVSP')
bvsp_columns = df.columns[bvsp_mask]
for index in bvsp_columns:
    position = df.columns.get_loc(index)
    print(position)

In [None]:
from scipy.sparse import issparse

# Densify the transformer output if it came back as a sparse matrix
if issparse(X_all):
    X_all = X_all.toarray()

# astype returns a new array, so the result has to be assigned back; the
# bare call only displayed a converted copy and left X_all untouched.
X_all = X_all.astype(np.float64)
X_all

In [None]:
# Dimensions of the transformed array
X_all.shape

In [None]:
# Build the (X, y) window pairs over the whole history
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

# Number of consecutive rows in each input window
sequence_length = 200

# Columns of X_all used as targets. Position 4 is noted as the IBOVESPA
# close; NOTE(review): the remaining positions are presumably the other
# ^BVSP columns — confirm against the printed column positions.
target_column_index = [4, 10, 16, 22, 28]

window_targets = X_all[:, target_column_index]
generator = TimeseriesGenerator(
    X_all,
    window_targets,
    length=sequence_length,
    batch_size=1,
)

In [None]:
# Inspect the array stored in the generator's `data` attribute
generator.data

In [None]:
# Get the first batch from the generator
X_batch, y_batch = generator[0]

# Print the shapes of the input (X) and target (y)
print("X shape:", X_batch.shape)
print("y shape:", y_batch.shape)

## Split data

## Modeling

In [None]:
# Shape of the targets array held by the generator
generator.targets.shape

In [None]:
# Second dimension of the first input batch (used as the LSTM time-step count)
generator[0][0].shape[1]

In [None]:
# Third dimension of the first input batch (used as the LSTM feature count)
generator[0][0].shape[2]

In [None]:

# 2. LSTM model
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense

# Each input window is (sequence_length, n_features). Reading these from the
# known sizes avoids materialising generator[0] twice just for its shape.
n_features = X_all.shape[1]
# One output unit per target column, so the head always matches the targets
# fed to the generator instead of relying on a separately hard-coded 5.
n_targets = len(target_column_index)

# Define and compile the LSTM model: three stacked LSTM layers with dropout
# after the first two, followed by a dense output layer.
model = Sequential([
    LSTM(70, input_shape=(sequence_length, n_features), return_sequences=True),
    Dropout(0.3),
    LSTM(50, return_sequences=True),
    Dropout(0.3),
    LSTM(30, return_sequences=False),
    Dense(n_targets)
])
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit(generator, epochs=30)

## Predict

In [None]:
# Take the most recent `sequence_length` rows of X_all and add a leading
# batch axis, giving shape (1, sequence_length, features).
n_window_features = X_all.shape[1]
last_window = X_all[-sequence_length:].reshape(
    (1, sequence_length, n_window_features)
)

# Predict the step that follows the window
next_prediction = model.predict(last_window)


In [None]:
# First value of the prediction array (still in scaled units)
next_prediction[0,0]

In [None]:
# Grab the 'preprocessor' step (the ColumnTransformer) from the pipeline
ct = pipeline_process.named_steps['preprocessor']

In [None]:
# The first fitted transformer entry is indexed as (name, transformer, columns)
first_transformer = ct.transformers_[0]
scaler = first_transformer[1]        # the StandardScaler of that entry
numeric_cols = first_transformer[2]  # the columns that scaler was applied to

In [None]:
# Entry at position 4 of numeric_cols
numeric_cols[4]

In [None]:
# Display the scaler object
scaler

In [None]:
# Entry at position 4 of numeric_cols
numeric_cols[4]

In [None]:
# Display the full prediction array
next_prediction

In [None]:
# Position of the target column among the scaler's columns (hard-coded)
target_in_scaler_index = 4

# inverse_transform works on full rows, so place the scaled prediction in an
# otherwise all-zero row and read back only the target position.
n_scaled_cols = len(numeric_cols)
dummy_input = np.zeros((1, n_scaled_cols))
dummy_input[:, target_in_scaler_index] = next_prediction[0, 0]

inv = scaler.inverse_transform(dummy_input)
next_prediction_real = inv[0][target_in_scaler_index]

In [None]:
# Flatten the window to 2D: (sequence_length, features)
last_window_2d = last_window.reshape(-1, X_all.shape[1])

# Keep only the leading columns, one per entry of numeric_cols.
# NOTE(review): assumes the scaled columns come first in X_all — confirm.
n_scaled_cols = len(numeric_cols)
last_window_2d_numeric = last_window_2d[:, :n_scaled_cols]

# Undo the scaling
last_window_real_numeric = scaler.inverse_transform(last_window_2d_numeric)

# Rebuild the full array with the unscaled values written over a copy
last_window_real = last_window.copy()
last_window_real[:, :, :n_scaled_cols] = last_window_real_numeric

# Back to 3D. The window holds sequence_length rows, so a literal 7 here
# raises ValueError whenever sequence_length differs from 7.
last_window_real = last_window_real.reshape(1, sequence_length, X_all.shape[1])


In [None]:
# Display the inverse-transformed window values
last_window_real_numeric

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# target_column_index may list several columns, but the chart and the
# unscaled prediction both refer to a single series: the first target.
# Indexing with the whole list would produce arrays that cannot be formatted
# as one number or plotted against the date axis.
plot_column = int(np.atleast_1d(target_column_index)[0])
real_series = last_window_real[0, :, plot_column]  # one value per window row

# X axis for the input window: sequence_length calendar days ending at dt_end.
# NOTE(review): these are generated dates, not the dataset's own index —
# confirm they line up with the actual rows of the window.
x_input_dates = pd.date_range(end=dt_end, periods=sequence_length).strftime('%Y-%m-%d')

# X axis for the prediction: the last window date and the day after it
next_day = (pd.to_datetime(x_input_dates[-1]) + pd.Timedelta(days=1)).strftime('%Y-%m-%d')
x_pred_dates = [x_input_dates[-1], next_day]

# Last real value + unscaled prediction, both as plain floats
y_pred_real = [float(real_series[-1]), float(next_prediction_real)]

# Percentage difference between the prediction and the last real value
diff_percent = ((y_pred_real[1] - y_pred_real[0]) / y_pred_real[0]) * 100

# Format the values for display (dot as thousands separator)
y_pred_real_formatted = [f"{int(y):,}".replace(",", ".") for y in y_pred_real]
diff_percent_formatted = f"{diff_percent:.2f}%"

# Plot the window of real data
plt.plot(x_input_dates, real_series, label='Últimos dados reais')

# Plot the prediction as a dashed line connected to the last real point
plt.plot(x_pred_dates, y_pred_real, 'r--', label='Predição real')

# Label the last real point
plt.annotate(f'{y_pred_real_formatted[0]}',
             (x_input_dates[-1], y_pred_real[0]),
             textcoords="offset points",
             xytext=(-10, 10),
             ha='center',
             fontsize=9,
             color='blue')

# Label the predicted point with the percentage difference
plt.annotate(f'{y_pred_real_formatted[1]} ({diff_percent_formatted})',
             (x_pred_dates[-1], y_pred_real[1]),
             textcoords="offset points",
             xytext=(-10, 10),
             ha='center',
             fontsize=9,
             color='red')

# Legend and axis labels
plt.xlabel('Data')
plt.ylabel('Valor Real')
plt.title('Predição do modelo vs Últimos dados reais')
plt.xticks(rotation=45)
plt.legend()

plt.show()
