In [1]:
import pandas as pd

# Load the 2023 daily stock table (one row per ticker per date) and preview it.
# NOTE(review): the "Unnamed: 0" column in the preview looks like a row index
# that was written into the CSV — confirm, and consider index_col=0 if so.
csv_path = "../data/2023_stock_with_features.csv"
df = pd.read_csv(csv_path)
df.head(n=5)

Unnamed: 0,date,ticker,name,open,close,min,max,avg,quantity,volume,ibovespa_close,day_of_week,daily_return,price_range,volume_per_quantity
0,2023-01-02,ABEV3,AMBEVS/A,14.4,14.19,14.02,14.45,14.19,14995900,212856900.0,106376.0,Monday,-0.014583,0.43,14.194337
1,2023-01-02,BBDC4,BRADESCO,14.9,14.75,14.66,14.99,14.74,24748300,365003800.0,106376.0,Monday,-0.010067,0.33,14.74864
2,2023-01-02,ITUB4,ITAUUNIBANCO,24.43,24.49,24.04,24.53,24.26,23340700,566420800.0,106376.0,Monday,0.002456,0.49,24.267516
3,2023-01-02,PETR4,PETROBRAS,23.54,22.92,22.8,23.81,23.09,78424700,1811379000.0,106376.0,Monday,-0.026338,1.01,23.097042
4,2023-01-02,VALE3,VALE,88.68,89.4,88.53,89.9,89.42,12783800,1143138000.0,106376.0,Monday,0.008119,1.37,89.420798


In [2]:
# Target = the same ticker's close on its next row in date order, which aligns
# today's features with the next trading day's close.
# groupby().shift(-1) works on row position, so the shift is applied to a
# date-ordered view of the frame (stable sort: same-day rows keep their
# existing order). The result is assigned back by index label, so df's own row
# order is left untouched. Assumes the index labels are unique, which holds
# for the default index produced by read_csv.
# The last row of every ticker has no following close and therefore gets NaN.
chrono_index = pd.to_datetime(df['date']).sort_values(kind='mergesort').index
df['target'] = df.loc[chrono_index].groupby('ticker')['close'].shift(-1)

In [3]:
# Rolling mean of each ticker's previous 5 volumes.
# The shift(1) is critical — it keeps the current row's volume out of its own
# feature, which prevents future leakage.
# Both shift() and rolling() have to run inside the per-ticker group: chaining
# .rolling() after groupby(...).shift() operates on the combined Series, so the
# window would average volumes belonging to different tickers.
# Rows are visited in date order (stable sort) and the result is assigned back
# by index label, so df's row order is unchanged. Assumes unique index labels,
# which holds for the default index produced by read_csv.
# The first 5 rows of every ticker are NaN (1 from the shift + a full window).
chrono_index = pd.to_datetime(df['date']).sort_values(kind='mergesort').index
df['rolling_volume'] = (
    df.loc[chrono_index]
    .groupby('ticker')['volume']
    .transform(lambda vol: vol.shift(1).rolling(5).mean())
)

In [4]:
# Prepare for a chronological (month-based) train/test split instead of a
# randomly sampled one.

df['date'] = pd.to_datetime(df['date'])  # make sure date is datetime
# Sort chronologically just in case. Several tickers share each date and the
# default quicksort does not guarantee the order of those ties; a stable sort
# keeps same-day rows in their existing order, so positional operations on the
# sorted frame see a well-defined row order.
df = df.sort_values(by='date', kind='mergesort')

In [5]:
# Chronological cut-off: rows dated before it go to training, rows dated on or
# after it are held out for testing. Per the original note, this date sits at
# about 80% of the data.
split_date = "2023-09-13"

# One boolean mask per side of the cut-off, then slice the frame with each
is_before_split = df['date'] < split_date
is_from_split_on = df['date'] >= split_date

train_df = df.loc[is_before_split]
test_df = df.loc[is_from_split_on]


In [6]:
# Columns used as model inputs.
# NOTE(review): 'day_of_week' holds weekday names (e.g. "Monday"), so it will
# need encoding before a numeric estimator can consume it — confirm the plan.
# NOTE(review): 'rolling_volume' is computed earlier in the notebook but is not
# listed here — confirm whether leaving it out is intentional.
features = [
    'open', 'close', 'min', 'max', 'avg', 'quantity',
    'volume', 'ibovespa_close', 'day_of_week',
    'daily_return', 'price_range', 'volume_per_quantity'
]

# 'target' comes from a per-ticker shift(-1), so each ticker's final row has
# no label (NaN). Drop unlabeled rows before building X and y so the two stay
# row-aligned and y contains no missing values.
train_labeled = train_df.dropna(subset=['target'])
test_labeled = test_df.dropna(subset=['target'])

X_train = train_labeled[features]
y_train = train_labeled['target']

X_test = test_labeled[features]
y_test = test_labeled['target']


In [7]:
from sklearn.model_selection import TimeSeriesSplit

# Time-ordered cross-validation over the training rows: each of the 5 folds
# validates on rows positioned after the rows it trains on.
tscv = TimeSeriesSplit(n_splits=5)

for train_index, val_index in tscv.split(X_train):
    # Positional slices of the inputs for this fold
    X_t = X_train.iloc[train_index]
    X_val = X_train.iloc[val_index]

    # Matching slices of the labels
    y_t = y_train.iloc[train_index]
    y_val = y_train.iloc[val_index]

    # fit model here


In [9]:
# Preview the frame again to check the added 'target' and 'rolling_volume' columns
df.head(n=5)

Unnamed: 0,date,ticker,name,open,close,min,max,avg,quantity,volume,ibovespa_close,day_of_week,daily_return,price_range,volume_per_quantity,target,rolling_volume
0,2023-01-02,ABEV3,AMBEVS/A,14.4,14.19,14.02,14.45,14.19,14995900,212856900.0,106376.0,Monday,-0.014583,0.43,14.194337,14.17,
1,2023-01-02,BBDC4,BRADESCO,14.9,14.75,14.66,14.99,14.74,24748300,365003800.0,106376.0,Monday,-0.010067,0.33,14.74864,14.0,
2,2023-01-02,ITUB4,ITAUUNIBANCO,24.43,24.49,24.04,24.53,24.26,23340700,566420800.0,106376.0,Monday,0.002456,0.49,24.267516,23.98,
3,2023-01-02,PETR4,PETROBRAS,23.54,22.92,22.8,23.81,23.09,78424700,1811379000.0,106376.0,Monday,-0.026338,1.01,23.097042,22.34,
4,2023-01-02,VALE3,VALE,88.68,89.4,88.53,89.9,89.42,12783800,1143138000.0,106376.0,Monday,0.008119,1.37,89.420798,89.24,
