In [1]:
import pandas as pd
import numpy as np

ds = pd.read_csv('../data/2023_selected_stocks.csv', low_memory=False)
ds.head()

Unnamed: 0,date,ticker,name,open,close,min,max,avg,quantity,volume,currency,marketType,bdiCode,prazoT,paperSpecification,optionPrice,priceCorrection,paperDueDate,quoteFactor
0,2023-01-02,ABEV3,AMBEVS/A,14.4,14.19,14.02,14.45,14.19,14995900,212856900.0,R$,10,2,,ON,0.0,0,99991231,1
1,2023-01-02,BBDC4,BRADESCO,14.9,14.75,14.66,14.99,14.74,24748300,365003800.0,R$,10,2,,PN N1,0.0,0,99991231,1
2,2023-01-02,ITUB4,ITAUUNIBANCO,24.43,24.49,24.04,24.53,24.26,23340700,566420800.0,R$,10,2,,PN EJ N1,0.0,0,99991231,1
3,2023-01-02,PETR4,PETROBRAS,23.54,22.92,22.8,23.81,23.09,78424700,1811379000.0,R$,10,2,,PN N2,0.0,0,99991231,1
4,2023-01-02,VALE3,VALE,88.68,89.4,88.53,89.9,89.42,12783800,1143138000.0,R$,10,2,,ON NM,0.0,0,99991231,1


In [2]:
ds.drop(columns=['currency','marketType','bdiCode','prazoT','paperSpecification','optionPrice', 'priceCorrection','paperDueDate', 'quoteFactor'], inplace=True)
ds.head()

Unnamed: 0,date,ticker,name,open,close,min,max,avg,quantity,volume
0,2023-01-02,ABEV3,AMBEVS/A,14.4,14.19,14.02,14.45,14.19,14995900,212856900.0
1,2023-01-02,BBDC4,BRADESCO,14.9,14.75,14.66,14.99,14.74,24748300,365003800.0
2,2023-01-02,ITUB4,ITAUUNIBANCO,24.43,24.49,24.04,24.53,24.26,23340700,566420800.0
3,2023-01-02,PETR4,PETROBRAS,23.54,22.92,22.8,23.81,23.09,78424700,1811379000.0
4,2023-01-02,VALE3,VALE,88.68,89.4,88.53,89.9,89.42,12783800,1143138000.0


In [3]:
bovespaDf = pd.read_csv('../data/ibovespa_2023.csv', low_memory=False)

# changing uppercase "Date" to "date"
bovespaDf = bovespaDf.rename(columns={'Date': 'date'})

# renaming to avoid duplicates in "close" variable
bovespaDf = bovespaDf.rename(columns={'close': 'ibovespa_close'})
bovespaDf.head()

Unnamed: 0,date,ibovespa_close
0,,^BVSP
1,2023-01-02,106376.0
2,2023-01-03,104166.0
3,2023-01-04,105334.0
4,2023-01-05,107518.0


In [4]:
# making sure the date formats are the same
bovespaDf['date'] = pd.to_datetime(bovespaDf['date'])
ds['date'] = pd.to_datetime(ds['date'])

# merge on the date column
ds = ds.merge(bovespaDf[['date', 'ibovespa_close']], on='date', how='left')

ds.tail()


Unnamed: 0,date,ticker,name,open,close,min,max,avg,quantity,volume,ibovespa_close
1095,2023-11-17,ABEV3,AMBEVS/A,13.71,13.6,13.52,13.74,13.56,36788200,499146400.0,125062.0
1096,2023-11-17,BBDC4,BRADESCO,15.57,15.43,15.35,15.6,15.45,34493400,533187900.0,125062.0
1097,2023-11-17,VALE3,VALE,74.23,74.24,74.1,74.65,74.35,24001500,1784705000.0,125062.0
1098,2023-11-17,PETR4,PETROBRAS,35.83,36.71,35.82,36.92,36.53,85895400,3138326000.0,125062.0
1099,2023-11-17,ITUB4,ITAUUNIBANCO,30.29,30.45,30.12,30.59,30.45,24575400,748326800.0,125062.0


In [5]:
## Features
# - Day of the week : different days of the week may influence if a stock's price change, eg Monday & Fridays more agitated
# - daily_return = (close - open) / open  - Simple return rate %
# - price_range = max - min - Intraday volatility
# - volume_per_quantity = volume / quantity - Trade size indicator

import pandas as pd

# Convert date column to datetime if not already
ds["date"] = pd.to_datetime(ds["date"])

# --- Feature Engineering ---

# 1. Day of the week (Monday=0, Sunday=6)
ds["day_of_week"] = ds["date"].dt.day_name()  # if you prefer string names
# or: ds["day_of_week"] = ds["date"].dt.weekday  # if you prefer numeric (0â€“6)

# 2. Daily return = (close - open) / open
ds["daily_return"] = (ds["close"] - ds["open"]) / ds["open"]

# 3. Price range = max - min
ds["price_range"] = ds["max"] - ds["min"]

# 4. Volume per quantity = volume / quantity
ds["volume_per_quantity"] = ds["volume"] / ds["quantity"]

# --- Optional: handle infinities or NaNs ---
ds.replace([float("inf"), float("-inf")], pd.NA, inplace=True)
ds.dropna(subset=["daily_return", "price_range", "volume_per_quantity"], inplace=True)

# --- View results ---
print(ds.head())

# --- Save to CSV ---
output_path = "../data/2023_stock_with_features.csv"
ds.to_csv(output_path, index=False)


        date ticker          name   open  close    min    max    avg  \
0 2023-01-02  ABEV3      AMBEVS/A  14.40  14.19  14.02  14.45  14.19   
1 2023-01-02  BBDC4      BRADESCO  14.90  14.75  14.66  14.99  14.74   
2 2023-01-02  ITUB4  ITAUUNIBANCO  24.43  24.49  24.04  24.53  24.26   
3 2023-01-02  PETR4     PETROBRAS  23.54  22.92  22.80  23.81  23.09   
4 2023-01-02  VALE3          VALE  88.68  89.40  88.53  89.90  89.42   

   quantity        volume ibovespa_close day_of_week  daily_return  \
0  14995900  2.128569e+08       106376.0      Monday     -0.014583   
1  24748300  3.650038e+08       106376.0      Monday     -0.010067   
2  23340700  5.664208e+08       106376.0      Monday      0.002456   
3  78424700  1.811379e+09       106376.0      Monday     -0.026338   
4  12783800  1.143138e+09       106376.0      Monday      0.008119   

   price_range  volume_per_quantity  
0         0.43            14.194337  
1         0.33            14.748640  
2         0.49            24.267