In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statistics as stat

import os
import logging
import math
import itertools

import scipy.stats


import tensorflow as tf

# Configure the root logger to emit DEBUG-and-above records.
# Per the stdlib docs, basicConfig() does nothing if the root logger already
# has handlers, so re-running this cell in a live kernel will not change the level.
logging.basicConfig(level=logging.DEBUG)

In [2]:
# Root folder of the dataset snapshot; the sub-folder paths below are derived from it.
DATASET_DIR = "dataset/dataset-2017-10-11"
# Folder scanned by extract_all_data() for per-symbol CSV files.
STOCK_DIR = f"{DATASET_DIR}/Stocks"
# Not referenced by the cells visible in this part of the notebook.
ETF_DIR = f"{DATASET_DIR}/ETFs"

# Set of the values found in the 'Ticker symbol' column of a tab-separated file.
sp500 = set(pd.read_csv('dataset/s&p500.tsv', sep='\t')['Ticker symbol'])

In [3]:
def millibel_normalize(values, references):
    """
    Lazily expresses each value relative to its paired reference on a
    logarithmic scale, in millibels: 1000 * log10(value / reference).

    The two iterables are walked in lockstep and iteration stops as soon as
    the shorter one is exhausted. Nothing is computed until the returned
    generator is consumed.
    """
    pairs = zip(values, references)
    for numerator, denominator in pairs:
        ratio = numerator/denominator
        yield 1000*math.log10(ratio)
    
def normalize_dataset(df):
    """
    Re-expresses the Open, Close, High and Low prices of `df` through
    `millibel_normalize`, using the previous row's Close as the reference.

    The first row has no predecessor and is therefore dropped; Date and
    Volume of the remaining rows are carried over unchanged. The result is a
    new DataFrame with columns Date, Volume, Open, Close, High, Low.
    """
    price_fields = ['Open', 'Close', 'High', 'Low']
    previous_close = df.Close[:-1]

    # One lazy generator per price field; each one iterates `previous_close`
    # independently, so sharing the same Series between them is safe.
    normalized_prices = [
        millibel_normalize(df[field][1:], previous_close)
        for field in price_fields
    ]

    rows = list(zip(df.Date[1:], df.Volume[1:], *normalized_prices))
    return pd.DataFrame(rows, columns=['Date', 'Volume'] + price_fields)


def dataset_to_timeseries(df, days):
    """
    Builds a sliding-window feature table in which every output row holds the
    Open/Close/High/Low/Volume values of `days` consecutive rows of `df`.

    Column names carry the lag as a suffix: `.0` is the last (newest) row of
    the window, `.1` the row before it, and so on up to `.{days-1}`. The
    `Date` column is taken from the `.0` row. When `df` has fewer than `days`
    rows the result has no rows.
    """
    total_rows = len(df)
    # Window for lag k starts `days-1-k` rows in; all windows of a
    # sufficiently long frame have total_rows+1-days rows.
    windows = [
        df[offset:total_rows + 1 - days + offset]
        for offset in range(days - 1, -1, -1)
    ]

    column_names = ['Date']
    column_values = [windows[0].Date]
    for lag, window in enumerate(windows):
        for field in ('Open', 'Close', 'High', 'Low', 'Volume'):
            column_names.append(f'{field}.{lag}')
            column_values.append(window[field])

    # zip(*...) transposes the per-column series into row tuples and stops at
    # the shortest one, which is what yields an empty table for short inputs.
    rows = list(zip(*column_values))
    return pd.DataFrame(rows, columns=column_names)


def concat_datasets(datasets):
    """
    Stacks the DataFrames of `datasets` (a mapping of symbol -> DataFrame)
    into a single DataFrame, adding a `symbol` column that records which
    entry each row came from. The row index of every input is kept as is,
    so index values may repeat across symbols.
    """
    tagged_frames = []
    for ticker, frame in datasets.items():
        tagged_frames.append(frame.assign(symbol=ticker))
    return pd.concat(tagged_frames)

def extract_all_data(symbols, feature_days=10, min_date='1990-01-01'):
    """
    Loads, normalizes and windows the per-symbol CSV files found in `STOCK_DIR`.

    A file is processed only when the upper-cased part of its name before the
    first '.' is contained in `symbols`. Rows whose `Date` is below `min_date`
    are dropped; the comparison is applied to the raw column values, so
    `min_date` should use the same format as the CSV (presumably ISO
    'YYYY-MM-DD' strings - verify against the dataset).

    Files that cannot be opened or parsed (e.g. empty CSVs) are skipped and
    reported in one warning at the end instead of aborting the whole run.

    Parameters:
        symbols: container of ticker symbols to keep.
        feature_days: window length passed to `dataset_to_timeseries`.
        min_date: lower bound (inclusive) for the `Date` column.

    Returns:
        (raw_data, normalized_data, timeseries_feature_data): three dicts
        keyed by symbol holding the filtered raw frame, the output of
        `normalize_dataset`, and the output of `dataset_to_timeseries`.
    """
    failed_csvs = []
    raw_data = {}
    normalized_data = {}
    timeseries_feature_data = {}
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # resulting dicts (and anything concatenated from them) reproducible.
    for filename in sorted(os.listdir(STOCK_DIR)):
        symbol = filename.split('.')[0].upper()
        if symbol not in symbols:
            continue

        # Only the read itself is guarded: errors in the later processing
        # steps indicate real problems and should stay loud.
        try:
            raw = pd.read_csv(f"{STOCK_DIR}/{filename}")
        except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
            logging.debug('Could not read %s: %s', filename, err)
            failed_csvs.append(filename)
            continue

        raw_data[symbol] = raw[raw.Date >= min_date]
        normalized_data[symbol] = normalize_dataset(raw_data[symbol])
        timeseries_feature_data[symbol] = dataset_to_timeseries(normalized_data[symbol], feature_days)

    if failed_csvs:
        logging.warning(f'Failed to read {len(failed_csvs)} CSV files: {failed_csvs}')

    return raw_data, normalized_data, timeseries_feature_data
  

In [10]:
# extract_all_data() returns (raw, normalized, windowed features); only the
# last element is needed here. The per-symbol tables are stacked into one
# frame with a `symbol` column.
sp500_features = concat_datasets(
    extract_all_data(symbols=sp500, feature_days=10)[2]
)

In [17]:
# Render the combined feature table as this cell's output.
sp500_features

Unnamed: 0,Date,Open.0,Close.0,High.0,Low.0,Volume.0,Open.1,Close.1,High.1,Low.1,...,Close.8,High.8,Low.8,Volume.8,Open.9,Close.9,High.9,Low.9,Volume.9,symbol
0,1999-12-03,7.946130,3.673286,15.130348,1.789708,3223074,8.104572,11.823897,20.327999,2.524758,...,37.307640,37.307640,-3.407127,6970266,-10.597406,-37.307640,-9.983541,-43.485703,16142920,A
1,1999-12-06,7.283094,12.034011,18.524367,6.685558,2385046,7.946130,3.673286,15.130348,1.789708,...,-41.389761,-3.759386,-41.389761,6332082,9.908169,37.307640,37.307640,-3.407127,6970266,A
2,1999-12-07,0.000000,-4.750917,2.370096,-13.917589,2348161,7.283094,12.034011,18.524367,6.685558,...,11.350289,20.528595,0.000000,5132147,-15.085422,-41.389761,-3.759386,-41.389761,6332082,A
3,1999-12-08,0.000000,-0.597536,3.511644,-4.257543,2000481,0.000000,-4.750917,2.370096,-13.917589,...,1.407915,4.597896,-3.271419,1832635,1.300837,11.350289,20.528595,0.000000,5132147,A
4,1999-12-09,0.597536,5.924635,7.158746,0.597536,2150096,0.000000,-0.597536,3.511644,-4.257543,...,9.653274,12.970489,-6.720693,4317826,-1.931562,1.407915,4.597896,-3.271419,1832635,A
5,1999-12-10,-1.153132,-10.159315,1.234111,-10.159315,1764043,0.597536,5.924635,7.158746,0.597536,...,0.732575,8.380877,-12.363657,4567146,-2.050815,9.653274,12.970489,-6.720693,4317826,A
6,1999-12-13,7.185891,7.185891,14.310460,-3.623533,4260349,-1.153132,-10.159315,1.234111,-10.159315,...,7.648303,12.683226,-3.168092,3133746,-1.254347,0.732575,8.380877,-12.363657,4567146,A
7,1999-12-14,-1.104355,-24.520087,-1.104355,-34.141550,2467856,7.185891,7.185891,14.310460,-3.623533,...,11.823897,20.327999,2.524758,3252997,0.000000,7.648303,12.683226,-3.168092,3133746,A
8,1999-12-15,-10.249090,-13.447194,-7.028744,-20.698831,3091820,-1.104355,-24.520087,-1.104355,-34.141550,...,3.673286,15.130348,1.789708,3223074,8.104572,11.823897,20.327999,2.524758,3252997,A
9,1999-12-16,3.198105,54.381264,54.381264,3.198105,2738063,-10.249090,-13.447194,-7.028744,-20.698831,...,12.034011,18.524367,6.685558,2385046,7.946130,3.673286,15.130348,1.789708,3223074,A
