In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from statsforecast import StatsForecast
from scipy.signal import periodogram
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf
from sklearn.model_selection import TimeSeriesSplit
from statsforecast.models import Naive, SeasonalNaive, SeasonalWindowAverage, AutoARIMA





In [None]:
def _volume_to_millions(raw):
    """Convert a volume value carrying a K/M/B suffix into millions.

    A decimal comma is accepted (it is replaced by a dot before parsing).
    'K' values are divided by 1,000, 'M' values are kept as they are and
    'B' values are multiplied by 1,000. A value with none of these suffixes
    is treated as a plain count and divided by 1,000,000.

    Returns NaN for missing input.
    """
    if pd.isna(raw):
        return np.nan
    text = str(raw).replace(',', '.')
    suffix, number = text[-1:], text[:-1]
    if suffix == 'K':
        return float(number) / 1_000
    if suffix == 'M':
        return float(number)
    if suffix == 'B':
        return float(number) * 1_000
    return float(text) / 1_000_000


# Load the history indexed by date, oldest row first.
# NOTE(review): parse_dates relies on pandas' format inference; if 'Data' is
# stored day-first (e.g. dd.mm.yyyy) pass dayfirst=True — confirm against the CSV.
historic_df = pd.read_csv(
    r'Base de Dados//Dados Históricos - Ibovespa.csv',
    parse_dates=['Data'],
    index_col='Data',
).sort_index()

# 'Var%' arrives as text: strip the percent sign and turn the decimal comma
# into a dot before casting to float.
historic_df['Var%'] = (
    historic_df['Var%']
    .str.replace('%', '')
    .str.replace(',', '.')
    .astype(float)
)

# Normalise the traded volume to millions and rename the column accordingly.
# rename() returns a new frame, so nothing is mutated in place.
historic_df['Vol.'] = historic_df['Vol.'].apply(_volume_to_millions)
historic_df = historic_df.rename(columns={'Vol.': 'Vol (M)'})

In [None]:
# Print the frame's column dtypes, non-null counts and memory usage.
historic_df.info()

In [None]:
# Display summary statistics for the frame's columns.
historic_df.describe()

In [None]:
fig, ax = plt.subplots(figsize=(12,6))

# 30-row moving average of the percentage change on the primary axis.
# Each series is computed and drawn exactly once; drawing it a second time
# only stacks an identical artist on the axes.
var_ma30 = historic_df['Var%'].rolling(window=30).mean()
line1, = ax.plot(historic_df.index, var_ma30, color='blue', label='Variação %')
ax.set_xlabel('Data')
ax.set_ylabel('Variação da Bolsa (%)')
ax.tick_params(axis='y')

# Secondary y-axis for the 30-row moving average of the 'Último' column.
ay = ax.twinx()
close_ma30 = historic_df['Último'].rolling(window=30).mean()
line2, = ay.plot(historic_df.index, close_ma30, color='red', label='Fechamento Valor')
ay.set_ylabel('Fechamento da Bolsa')
ay.tick_params(axis='y')

# One legend covering the lines of both axes.
lines = [line1, line2]
labels = [line.get_label() for line in lines]
ax.legend(lines, labels, loc='upper left')

# Title and layout
plt.title('Média Móvel 30 dias - Variação % e Fechamento')
fig.tight_layout()

# Show the figure
plt.show()

In [None]:
# Augmented Dickey-Fuller test on the 'Var%' column.
historic_var_df = historic_df[['Var%']]
X = historic_var_df.to_numpy()

result = adfuller(X)
adf_statistic, adf_p_value, adf_critical = result[0], result[1], result[4]

print("Teste ADF")
print(f"Teste Estatístico: {adf_statistic}")
print(f"P-Value: {adf_p_value}")
print("Valores críticos:")

# One tab-indented line per critical-value entry.
for level, threshold in adf_critical.items():
    print(f"\t{level}: {threshold}")


### CRIANDO O NAIVE PREDICT

In [None]:
# Naive (persistence) baseline: the forecast for each row is the previous
# row's 'Último' value.
naive_pred = historic_df['Último'].shift(1)

# Evaluation. The first forecast is NaN (nothing precedes the first row), so
# scoring starts at the second row. Note that this MSE covers every row of
# historic_df, not a held-out window.
mse_naive = mean_squared_error(historic_df['Último'].iloc[1:], naive_pred.iloc[1:])
print(f'Naive Baseline MSE: {mse_naive}')

In [None]:
fig, ax = plt.subplots(figsize=(12,6))

# 30-row moving average of the percentage change on the primary axis.
# Each series is computed and drawn exactly once; drawing it a second time
# only stacks an identical artist on the axes.
var_ma30 = historic_df['Var%'].rolling(window=30).mean()
line1, = ax.plot(historic_df.index, var_ma30, color='blue', label='Variação %')
ax.set_xlabel('Data')
ax.set_ylabel('Variação da Bolsa (%)')
ax.tick_params(axis='y')

# Secondary y-axis: 30-row moving averages of the 'Último' column and of the
# naive forecast.
ay = ax.twinx()
close_ma30 = historic_df['Último'].rolling(window=30).mean()
naive_ma30 = naive_pred.rolling(window=30).mean()
line2, = ay.plot(historic_df.index, close_ma30, color='red', label='Fechamento Valor')
line3, = ay.plot(historic_df.index, naive_ma30, color='green', label='Naive Var')
ay.set_ylabel('Fechamento da Bolsa')
ay.tick_params(axis='y')

# One legend covering the lines of both axes.
lines = [line1, line2, line3]
labels = [line.get_label() for line in lines]
ax.legend(lines, labels, loc='upper left')

# Title and layout
plt.title('Média Móvel 30 dias - Variação % e Fechamento')
fig.tight_layout()

# Show the figure
plt.show()

### CRIANDO XGBoost PREDICT

In [None]:
# Calendar features derived from the datetime index.
historic_df['day'] = historic_df.index.day
# ISO week number, computed in one vectorised call instead of a Python loop
# over every timestamp. isocalendar() yields a nullable UInt32 column, so it
# is cast to a plain int64 array before being stored.
historic_df['week_of_year'] = historic_df.index.isocalendar().week.astype('int64').to_numpy()
historic_df['month'] = historic_df.index.month
historic_df['year'] = historic_df.index.year


# Autocorrelation function (ACF) of the non-null 'Último' values, 40 lags.
acf_fig, acf_ax = plt.subplots(figsize=(14, 6))
plot_acf(historic_df["Último"].dropna(), ax=acf_ax, lags=40)
acf_ax.set_title("Função de Autocorrelação (ACF)")


n_lags = 40
close = historic_df['Último']

# One column per lag: lag_k holds the 'Último' value from k rows earlier.
for k in range(1, n_lags + 1):
    historic_df[f'lag_{k}'] = close.shift(k)

# Rolling mean / standard deviation over n_lags rows, shifted by one row so
# the current row's own value is not part of its feature.
rolling_window = close.rolling(window=n_lags)
historic_df[f'rolling_mean_{n_lags}'] = rolling_window.mean().shift(1)
historic_df[f'rolling_std_{n_lags}'] = rolling_window.std().shift(1)

# Exponentially weighted mean / standard deviation (span = n_lags), shifted
# by one row in the same way.
ewm_window = close.ewm(span=n_lags)
historic_df[f'ema_{n_lags}'] = ewm_window.mean().shift(1)
historic_df[f'ema_std_{n_lags}'] = ewm_window.std().shift(1)



# Dominant frequency of the 'Último' series, read off its periodogram
# (fs=1, i.e. one sample per row).
# NOTE(review): the periodogram uses every row of historic_df, including any
# rows later held out for testing — consider estimating it on the training
# window only.
frequency, power = periodogram(historic_df['Último'].dropna(), fs=1)

# Ignore the zero-frequency bin: if it held the maximum, 1 / freq below would
# be a division by zero.
dominant_bin = int(np.argmax(power[1:])) + 1
freq = frequency[dominant_bin]
period = 1 / freq

print(f'Period: {period} days')

# One sine / cosine pair at the dominant frequency, indexed by row position.
row_position = np.arange(len(historic_df))
historic_df['fourier_sin'] = np.sin(2 * np.pi * freq * row_position)
historic_df['fourier_cos'] = np.cos(2 * np.pi * freq * row_position)


# Predictor column names, grouped by family and concatenated in a fixed order:
# calendar fields, the n_lags lag columns, window statistics, Fourier pair.
calendar_cols = ['day', 'week_of_year', 'month', 'year']
lag_cols = [f'lag_{i}' for i in range(1, n_lags + 1)]
window_stat_cols = [
    f'{prefix}_{n_lags}'
    for prefix in ('rolling_mean', 'rolling_std', 'ema', 'ema_std')
]
fourier_cols = ['fourier_sin', 'fourier_cos']

predictors = calendar_cols + lag_cols + window_stat_cols + fourier_cols

In [None]:
# Chronological hold-out: rows dated strictly before train_end are used for
# training, rows on or after it for testing. Boolean masks are used because
# label slicing with .loc is inclusive at both ends, which would place a row
# dated exactly train_end in both sets.
train_end = '2025-01-01'
split_date = pd.Timestamp(train_end)
df_train = historic_df.loc[historic_df.index < split_date]
df_test = historic_df.loc[historic_df.index >= split_date]

# Define features and target
X_train = df_train[predictors]
y_train = df_train['Último']
X_test = df_test[predictors]
y_test = df_test['Último']

# TimeSeriesSplit for validation
tscv = TimeSeriesSplit(n_splits=100)

# Train with cross-validation.
# A fresh estimator is built for every fold. Re-fitting one shared instance
# and appending it each time would fill xgb_models with references to the
# same (last-fitted) object, so selecting the "best" index would have no effect.
xgb_models = []
mse = []
for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train)):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    fold_model = XGBRegressor(objective='reg:squarederror')
    fold_model.fit(X_tr, y_tr)
    val_pred = fold_model.predict(X_val)
    fold_mse = mean_squared_error(y_val, val_pred)
    print(f'{fold} Validation MSE: {fold_mse}')
    mse.append(fold_mse)
    xgb_models.append(fold_model)

# Keep the fold model with the lowest validation error.
# NOTE(review): folds differ in difficulty and in how much history they were
# trained on, so the lowest fold MSE is not necessarily the best model —
# consider refitting on all of X_train once the configuration is settled.
best_model_idx = int(np.argmin(mse))
xgb = xgb_models[best_model_idx]

# Predict on the test set
xgb_pred = xgb.predict(X_test)
mse_xgb = mean_squared_error(y_test, xgb_pred)
print(f'XGBoost MSE: {mse_xgb}')

# Same-window reference: 'lag_1' is the previous row's 'Último', i.e. the
# naive forecast, scored here on exactly the rows the model was scored on.
mse_naive_test = mean_squared_error(y_test, X_test['lag_1'])
print(f'Naive Baseline MSE (test set): {mse_naive_test}')

0 Validation MSE: 1.5477035451527166
1 Validation MSE: 9.1185464321921
2 Validation MSE: 31.33077259944875
3 Validation MSE: 18.253585431529437
4 Validation MSE: 0.984548212920491
5 Validation MSE: 1.9579458117210569
6 Validation MSE: 0.900673925091793
7 Validation MSE: 2.5068405161050964
8 Validation MSE: 0.6237711254931821
9 Validation MSE: 1.1229927083225928
10 Validation MSE: 6.9321077365592
11 Validation MSE: 3.8031878375349004
12 Validation MSE: 1.5139718482428146
13 Validation MSE: 2.479440903360805
14 Validation MSE: 4.618704924479808
15 Validation MSE: 2.4961086345403696
16 Validation MSE: 1.2871269261098681
17 Validation MSE: 1.7846704373710467
18 Validation MSE: 3.0017497232565313
19 Validation MSE: 0.9806416202355017
20 Validation MSE: 2.2446056867322013
21 Validation MSE: 2.281468537165702
22 Validation MSE: 2.5900730667081255
23 Validation MSE: 2.2095362295350593
24 Validation MSE: 0.40201037913719057
25 Validation MSE: 1.6545858720697206
26 Validation MSE: 1.728542862176

XGBoost MSE: 2.6483869542384295
