# Dataset construction

- Download CSV data for selected tickers.
- Rename columns.
- Concatenate all downloads into a single dataframe.

In [35]:
import urllib.request
import pandas as pd
import pickle
import io
import tempfile

In [36]:
def build_url(ticker: str) -> str:
    """Build a URL for CSV data download for given stock ticker.

    The ticker is percent-encoded before being placed in the query string,
    so symbols containing reserved characters (e.g. ``^spx``) or characters
    such as ``&`` / ``=`` cannot alter the structure of the query.

    Args:
        ticker: Stock ticker in format used by stooq.pl provider.

    Returns:
        URL to download CSV data for given stock ticker.
    """
    # Local import keeps the function self-contained; the file only imports
    # ``urllib.request`` at the top.
    from urllib.parse import urlencode

    # "i=d" is passed through unchanged from the original URL
    # (presumably the daily interval - confirm against stooq.pl).
    query = urlencode({"s": ticker, "i": "d"})
    return f"https://stooq.pl/q/d/l/?{query}"

In [45]:
def get_data(ticker: str) -> pd.DataFrame:
    """Download CSV data for given stock ticker.

    The response body is parsed from an in-memory buffer, its (Polish)
    column headers are translated to lowercase English OHLCV names and the
    index is renamed to ``date``.

    Args:
        ticker: Stock ticker in format used by stooq.pl provider.

    Returns:
        DataFrame with stock data & columns in OHLCV format, indexed by date.

    Raises:
        KeyError: If the CSV contains a column with no known OHLCV name.
    """
    # Both ASCII and diacritic spellings of high/low are accepted.
    column_map = {
        "otwarcie": "open",
        "najwyzszy": "high",
        "najnizszy": "low",
        "zamkniecie": "close",
        "wolumen": "volume",
        "najwyższy": "high",
        "najniższy": "low",
    }

    url = build_url(ticker)
    # A timeout prevents the call from blocking forever on a stalled
    # connection; the connection is closed before parsing starts.
    with urllib.request.urlopen(url, timeout=30) as response:
        # In-memory buffer instead of a temporary file: no disk round-trip
        # and no dependency on the Python 3.12-only ``delete_on_close``.
        payload = io.BytesIO(response.read())

    data = pd.read_csv(payload, index_col="Data", parse_dates=True)

    unknown = [col for col in data.columns if col.lower() not in column_map]
    if unknown:
        raise KeyError(
            f"unexpected column(s) {unknown} in data for ticker {ticker!r}"
        )

    data.columns = [column_map[col.lower()] for col in data.columns]
    data.index.name = "date"
    return data

In [48]:
# Instruments to download. Keys are the short names later used as column
# prefixes; values are the ticker symbols passed to get_data().
tickers = {
    "wig20": "wig20",
    "usdpln": "usdpln",
    "10y": "10yply.b",
    "spx": "^spx",
    "dax": "^dax",
    "swig80": "swig80",
}

In [54]:
# Download one dataframe per instrument, keyed by its short name.
raw_downloads = dict(zip(tickers.keys(), map(get_data, tickers.values())))

In [60]:
# Keep only rows dated on or before 2024-10-31 in every dataframe.
clipped = {}
for name, frame in raw_downloads.items():
    clipped[name] = frame.loc[:"2024-10-31"]
raw_downloads = clipped

In [67]:
# Align the start of all series: take the latest of the per-dataframe first
# dates (i.e. the earliest date present in every dataframe) and drop
# everything before it.
first_dates = [frame.index.min() for frame in raw_downloads.values()]
min_date = max(first_dates)
raw_downloads = {
    name: frame.loc[min_date:] for name, frame in raw_downloads.items()
}

print(f"data clipped to {min_date}")

data clipped to 1999-05-25 00:00:00


In [68]:
# Prefix each column with its instrument name (in place) so that column
# names stay unique once the frames are placed side by side.
for ticker, frame in raw_downloads.items():
    frame.columns = frame.columns.map(lambda column: f"{ticker}_{column}")

# Join all instruments column-wise on their date index.
df = pd.concat(list(raw_downloads.values()), axis=1)

In [69]:
# Preview the first two rows of the combined dataframe.
df.head(2)

Unnamed: 0_level_0,wig20_open,wig20_high,wig20_low,wig20_close,wig20_volume,usdpln_open,usdpln_high,usdpln_low,usdpln_close,10y_open,...,dax_open,dax_high,dax_low,dax_close,dax_volume,swig80_open,swig80_high,swig80_low,swig80_close,swig80_volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1999-05-25,1489.7,1507.8,1487.9,1502.1,4640264.0,3.929,3.96,3.929,3.94,7.79,...,5213.79,5218.29,5136.76,5143.1,0.0,1845.3,1845.3,1845.3,1845.3,627241.0
1999-05-26,1500.3,1511.6,1500.3,1507.7,5787051.0,3.945,3.977,3.936,3.965,7.937,...,5118.58,5194.36,5110.38,5160.44,0.0,1847.3,1847.3,1847.3,1847.3,486379.0


In [70]:
# Persist the combined dataframe to disk as a pickle file.
with open("raw_data.pkl", mode="wb") as sink:
    pickle.Pickler(sink).dump(df)