In [2]:
import glob
from pathlib import Path

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

%matplotlib inline

# Data

Source: https://www.kaggle.com/jacksoncrow/stock-market-dataset

In [3]:
# dtype map for the five price columns: each one is stored as 32-bit float.
float32s = dict.fromkeys(('Open', 'High', 'Low', 'Close', 'Adj Close'), np.float32)

# Complete dtype map: the price columns above plus a 64-bit integer Volume.
astypes = {'Volume': np.int64, **float32s}

In [None]:
%%time

# Directory holding one CSV file per ticker symbol.
DATADIR = 'stock-market-dataset/stocks'

# Per-symbol frames are collected here and concatenated once after the loop.
full = list()

# glob.glob returns matches in arbitrary, filesystem-dependent order; sorting
# makes the row order of DATA.csv reproducible across runs and machines.
for i, f in enumerate(sorted(glob.glob(f'{DATADIR}/*.csv')), 1):

    # Lightweight progress indicator: one line every 1000 files.
    if i % 1000 == 0: print(i, f)
    df = pd.read_csv(f, parse_dates=['Date'], low_memory=False)

    # The ticker symbol is the file name without its extension. Path.stem does
    # not depend on the length of DATADIR or on the path separator glob emits.
    symbolname = Path(f).stem

    df['Symbol'] = symbolname
    df = df.set_index('Symbol')

    full.append(df)

# Fail with an explicit message instead of pandas' generic
# "No objects to concatenate" when the data directory is missing or empty.
if not full:
    raise FileNotFoundError(f'No CSV files found in {DATADIR!r}')

# Rows with any missing value are dropped before the cast: a column that still
# contains NaN cannot be converted to the integer dtype requested for Volume.
full_stocks = pd.concat(full).dropna()
full_stocks = full_stocks.astype(astypes)

# 'Symbol' is the index, so it is written out as the first CSV column.
full_stocks.to_csv('DATA.csv', encoding='ascii')

1000 stock-market-dataset/stocks\CETV.csv
2000 stock-market-dataset/stocks\FITBI.csv
3000 stock-market-dataset/stocks\KNSA.csv
4000 stock-market-dataset/stocks\OSIS.csv
5000 stock-market-dataset/stocks\STND.csv


# Work in Progress

In [None]:
%%time

# Load the consolidated CSV: 'Date' is parsed to datetimes and the columns
# named in float32s are read with the dtypes given there.
df = pd.read_csv(
    "DATA.csv",
    dtype=float32s,
    parse_dates=['Date'],
    low_memory=False,
)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Yearly resampler keyed on the 'Date' column.
# NOTE(review): newer pandas releases deprecate the 'Y' alias in favor of
# 'YE' - confirm against the installed pandas version before changing it.
Y = df.resample(rule='Y', on='Date')

In [None]:
# Number of distinct values in 'Symbol' within each yearly bin.
uniqs_per_year = Y.aggregate({'Symbol': 'nunique'})

# Bar chart of those yearly counts.
uniqs_per_year.plot(kind='bar', figsize=(16, 10))

In [None]:
# Mean of 'Volume' over all rows that fall in each yearly bin.
average_volume_per_year = Y.aggregate({'Volume': 'mean'})

# Bar chart of the yearly mean volume.
average_volume_per_year.plot(kind='bar', figsize=(16, 10))

In [None]:
# Yearly median of the 'Open', 'High' and 'Low' columns (one median per column
# per yearly bin, taken over every row in that bin).
average_max_min_delta_per_year = Y[['Open', 'High', 'Low']].agg(
    dict.fromkeys(['Open', 'High', 'Low'], 'median')
)

# Open question: why are the yearly MEDIAN opens so consistently low for
# 1962-1968 and 1974-1986?
#
# Thoughts so far:
# - The median open across all stocks does not HAVE to lie between the median
#   high and the median low, especially if there are few stocks in those years
#   or if the open is recorded as 0 for much of the year.
#   - But why would the open be 0 for any stock at any time?
# NOTE(review): counting rows with Open == 0 per year would confirm or rule
# out the zero-open explanation - not checked here.

# Line chart, one line per column: Open=green, High=orange, Low=red.
average_max_min_delta_per_year.plot.line(
    color=['green', 'orange', 'red'],
    figsize=(16, 10),
)