# Dataset first commit

## Dependencies: [Pandas](https://pandas.pydata.org/)

In [1]:
import pandas as pd

## Dependencies: [yfinance](https://pypi.org/project/yfinance/)

```bash
poetry add yfinance
```

In [2]:
import yfinance as yf

In [3]:
# Parameters for the exploratory download (reused by the cells below).
ticker = "^BVSP"
start, end = "2025-10-01", "2025-11-01"
interval = "1d"

# Fetch the price history; the frame comes back with two-level column labels.
df = yf.download(ticker, start=start, end=end, interval=interval, auto_adjust=True)
df

[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Ticker,^BVSP,^BVSP,^BVSP,^BVSP,^BVSP
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2025-10-01,145517.0,146879.0,145193.0,146237.0,8340700
2025-10-02,143950.0,145621.0,143635.0,145517.0,7245600
2025-10-03,144201.0,144518.0,143676.0,143950.0,6177600
2025-10-06,143608.0,144532.0,143376.0,144202.0,5981500
2025-10-07,141356.0,143606.0,141035.0,143606.0,9272700
2025-10-08,142145.0,142385.0,141356.0,141356.0,7525200
2025-10-09,141708.0,143212.0,141603.0,142148.0,6821400
2025-10-10,140680.0,142274.0,140231.0,141725.0,8280800
2025-10-13,141783.0,142303.0,140682.0,140682.0,5531200
2025-10-14,141683.0,142589.0,141334.0,141788.0,7122300


In [4]:
# Check whether yf.download returned two-level (MultiIndex) column labels.
isinstance(df.columns, pd.MultiIndex)

True

In [5]:
# Keep only the first column level (Close/High/Low/Open/Volume) and drop the
# per-ticker level, so the columns can be addressed by plain names.
df.columns = df.columns.get_level_values(0)
df

Price,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-10-01,145517.0,146879.0,145193.0,146237.0,8340700
2025-10-02,143950.0,145621.0,143635.0,145517.0,7245600
2025-10-03,144201.0,144518.0,143676.0,143950.0,6177600
2025-10-06,143608.0,144532.0,143376.0,144202.0,5981500
2025-10-07,141356.0,143606.0,141035.0,143606.0,9272700
2025-10-08,142145.0,142385.0,141356.0,141356.0,7525200
2025-10-09,141708.0,143212.0,141603.0,142148.0,6821400
2025-10-10,140680.0,142274.0,140231.0,141725.0,8280800
2025-10-13,141783.0,142303.0,140682.0,140682.0,5531200
2025-10-14,141683.0,142589.0,141334.0,141788.0,7122300


In [6]:
# Record the symbol as a regular column so each row still identifies its ticker.
df["Ticker"] = ticker

df

Price,Close,High,Low,Open,Volume,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-10-01,145517.0,146879.0,145193.0,146237.0,8340700,^BVSP
2025-10-02,143950.0,145621.0,143635.0,145517.0,7245600,^BVSP
2025-10-03,144201.0,144518.0,143676.0,143950.0,6177600,^BVSP
2025-10-06,143608.0,144532.0,143376.0,144202.0,5981500,^BVSP
2025-10-07,141356.0,143606.0,141035.0,143606.0,9272700,^BVSP
2025-10-08,142145.0,142385.0,141356.0,141356.0,7525200,^BVSP
2025-10-09,141708.0,143212.0,141603.0,142148.0,6821400,^BVSP
2025-10-10,140680.0,142274.0,140231.0,141725.0,8280800,^BVSP
2025-10-13,141783.0,142303.0,140682.0,140682.0,5531200,^BVSP
2025-10-14,141683.0,142589.0,141334.0,141788.0,7122300,^BVSP


#### Add to infrastructure: wrap the steps above in a `Dataset` implementation

In [7]:
from src.dataset.domain.interfaces import Dataset
from src.dataset.domain.value_objects import Batch
from src.config import logger

[32m2025-11-01 15:39:40.195[0m | [1mINFO    [0m | [36msrc.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Repositories\omni-ml[0m


In [None]:

class YfinancePandasDataset(Dataset):
    """Dataset that downloads price history for one ticker through yfinance.

    The downloaded frame is flattened to single-level column names and gets an
    extra ``Ticker`` column holding the requested symbol.

    Args:
        ticker: Symbol passed to ``yf.download`` (a single symbol is expected,
            e.g. ``"^BVSP"``).
        start_date: Start of the requested period, forwarded as ``start``.
        end_date: End of the requested period, forwarded as ``end``.
        interval: Sampling interval forwarded to ``yf.download``.
    """

    def __init__(self, ticker: str, start_date: str, end_date: str, interval: str = "1d"):
        self.ticker = ticker
        self.start_date = start_date
        self.end_date = end_date
        self.interval = interval

    def load(self) -> Batch:
        """Download the configured price history and wrap it in a ``Batch``.

        Loading is best-effort: failures are logged rather than raised.

        Returns:
            A ``Batch`` whose ``X`` is the downloaded DataFrame with flat column
            names plus a ``Ticker`` column. When nothing is returned, ``X`` is
            an empty DataFrame and a warning is logged. When an exception
            occurs, ``X`` is a fresh empty DataFrame and the error is logged.
        """
        # Fallback result. It is only replaced once the download has been fully
        # processed, so a failure half-way never leaks None or a partially
        # transformed frame into the Batch.
        frame = pd.DataFrame()
        try:
            downloaded = yf.download(
                self.ticker,
                start=self.start_date,
                end=self.end_date,
                interval=self.interval,
                auto_adjust=True,
            )
            if downloaded is None:
                # Guard against a missing result so the checks below are safe.
                downloaded = pd.DataFrame()

            if downloaded.empty:
                logger.warning(f'No data returned for {self.ticker}')
            else:
                # Only flatten when the columns really are multi-level
                # (field name, ticker); keep the field names.
                if isinstance(downloaded.columns, pd.MultiIndex):
                    downloaded.columns = downloaded.columns.get_level_values(0)
                downloaded["Ticker"] = self.ticker
            frame = downloaded
        except Exception as e:
            logger.error(f'Error loading {self.ticker}: {e}')

        return Batch(X=frame)


#### Simple Test

```bash
poetry run pytest
```

In [None]:
# Smoke test: load one month of daily data and check the basic shape of the result.
DatasetTestYfinancePandas = YfinancePandasDataset(
    "^BVSP", start_date="2025-10-01", end_date="2025-11-01", interval="1d"
)
batch = DatasetTestYfinancePandas.load()

# The download must yield rows, and load() must have added the symbol column.
assert not batch.X.empty, "DataFrame should not be empty"
assert "Ticker" in batch.X.columns, "'Ticker' column should be present in DataFrame"

batch.X

[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume,Ticker
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-10-01,145517.0,146879.0,145193.0,146237.0,8340700,^BVSP
2025-10-02,143950.0,145621.0,143635.0,145517.0,7245600,^BVSP
2025-10-03,144201.0,144518.0,143676.0,143950.0,6177600,^BVSP
2025-10-06,143608.0,144532.0,143376.0,144202.0,5981500,^BVSP
2025-10-07,141356.0,143606.0,141035.0,143606.0,9272700,^BVSP
2025-10-08,142145.0,142385.0,141356.0,141356.0,7525200,^BVSP
2025-10-09,141708.0,143212.0,141603.0,142148.0,6821400,^BVSP
2025-10-10,140680.0,142274.0,140231.0,141725.0,8280800,^BVSP
2025-10-13,141783.0,142303.0,140682.0,140682.0,5531200,^BVSP
2025-10-14,141683.0,142589.0,141334.0,141788.0,7122300,^BVSP
