In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as plx
from pathlib import Path
from random import random

In [2]:
period = 'daily'
market = 'us'

exchange = 'nasdaq'
partition = 2
ticker = 'msft'

In [None]:
pd.read_csv(f'data/stooq/{period}/{market}/{exchange} stocks/{partition}/{ticker}.{market}.txt')

In [3]:
def load_table(path: Path) -> pd.DataFrame | None:
    try: 
        return pd.read_csv(path)
    except pd.errors.EmptyDataError:
        return None

In [4]:
pathes = Path(f'data/stooq/{period}/{market}').glob(f'* stocks/*/*.{market}.txt')
tables = [load_table(path) for path in pathes if path]

In [5]:
tables = pd.concat(tables)

In [6]:
tables.drop(columns=['<PER>', '<OPENINT>', '<TIME>'], inplace=True, errors='ignore')

In [7]:
tables['<DATE>'] = pd.to_datetime(tables['<DATE>'].astype(str))

In [8]:
tables.reset_index(drop=True, inplace=True)

In [9]:
tables

Unnamed: 0,<TICKER>,<DATE>,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<VOL>
0,SAH.US,2005-02-25,20.5791,20.6557,20.4444,20.5859,235572.034359
1,SAH.US,2005-02-28,20.6193,20.6193,19.5078,19.6406,459954.119780
2,SAH.US,2005-03-01,19.7802,20.0552,19.7359,19.9040,288780.261915
3,SAH.US,2005-03-02,19.9040,20.1800,19.4814,19.8706,176818.696604
4,SAH.US,2005-03-03,19.9040,20.0002,19.6140,19.6769,290593.693072
...,...,...,...,...,...,...,...
18453997,ANSS.US,2024-05-24,327.2500,333.6100,326.0700,331.1100,300299.000000
18453998,ANSS.US,2024-05-28,328.4100,329.8250,325.8000,328.0800,466658.000000
18453999,ANSS.US,2024-05-29,325.1500,326.9200,321.1000,322.4400,460218.000000
18454000,ANSS.US,2024-05-30,320.8400,323.4100,319.5300,319.9100,366715.000000


In [10]:
tables.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18454002 entries, 0 to 18454001
Data columns (total 7 columns):
 #   Column    Dtype         
---  ------    -----         
 0   <TICKER>  object        
 1   <DATE>    datetime64[ns]
 2   <OPEN>    float64       
 3   <HIGH>    float64       
 4   <LOW>     float64       
 5   <CLOSE>   float64       
 6   <VOL>     float64       
dtypes: datetime64[ns](1), float64(5), object(1)
memory usage: 985.6+ MB


In [11]:
tables.sort_values(['<TICKER>', '<DATE>'], inplace=True)

In [12]:
tables.to_parquet(Path(f'data/stooq/{period}/{market}/all_stocks.parquet'))

### Rerun from here

In [24]:
tables = pd.read_parquet(Path(f'data/stooq/{period}/{market}/all_stocks.parquet'))

In [25]:
tables['<CLOSE>'] = tables['<CLOSE>'].apply(lambda x: x + (random())*0.01 )

In [26]:
sample_companies = [
    'MSFT.US',
    'AVGO.US',
    'NVDA.US',
    'TSM.US',
    'TSLA.US',
    'META.US',
    'IBM.US',
    'AAPL.US',
    'ASML.US'
]

In [27]:
plx.line(
    data_frame=tables[tables['<TICKER>'].isin(sample_companies)], 
    x='<DATE>', 
    y='<CLOSE>', 
    color='<TICKER>',
    log_y=True,
    height=720,
    width=1280
)

In [28]:
tables['<RETURN>'] = tables['<CLOSE>'] / tables.groupby('<TICKER>')['<CLOSE>'].shift()

In [29]:
tables['<LOG_CLOSE>'] = np.log(tables['<CLOSE>'])

In [30]:
tables['<LOG_RETURN>'] = np.log(tables['<RETURN>'])

In [31]:
sample_table = tables[tables['<TICKER>'].isin(sample_companies)]

In [None]:
plx.histogram(
    data_frame=tables,#[sample_table['<RETURN>'] == 1.0],
    x='<LOG_CLOSE>',
    #color='<TICKER>',
    height=720,
    width=1280,
    nbins=1024*4,
)

In [None]:
plx.histogram(
    data_frame=tables,#[sample_table['<RETURN>'] == 1.0],
    x='<LOG_RETURN>',
    #color='<TICKER>',
    height=720,
    width=1280,
    nbins=1024*4,
)

In [35]:
tables.to_parquet(Path(f'data/stooq/{period}/{market}/all_stocks_processed.parquet'))

In [34]:
tables.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18454002 entries, 6436603 to 13848047
Data columns (total 10 columns):
 #   Column        Dtype         
---  ------        -----         
 0   <TICKER>      object        
 1   <DATE>        datetime64[ns]
 2   <OPEN>        float64       
 3   <HIGH>        float64       
 4   <LOW>         float64       
 5   <CLOSE>       float64       
 6   <VOL>         float64       
 7   <RETURN>      float64       
 8   <LOG_CLOSE>   float64       
 9   <LOG_RETURN>  float64       
dtypes: datetime64[ns](1), float64(8), object(1)
memory usage: 1.5+ GB
