In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statistics as stat

import os
import logging
import math
import itertools

import scipy.stats


import tensorflow as tf
from sklearn.cross_validation import train_test_split

# Configure the root logger at DEBUG level. Because this applies to the root
# logger, DEBUG records emitted by imported libraries (e.g. matplotlib's
# backend message) are shown in the notebook output as well.
logging.basicConfig(level=logging.DEBUG)

  from ._conv import register_converters as _register_converters
DEBUG:matplotlib.backends:backend module://ipykernel.pylab.backend_inline version unknown


In [2]:
# Root directory of the dataset snapshot; the sub-directories below derive from it.
DATASET_DIR = "dataset/dataset-2017-10-11"
# Directory whose CSV files are listed and read by `extract_all_data`.
STOCK_DIR = f"{DATASET_DIR}/Stocks"
# ETF directory; not referenced anywhere else in the visible part of the notebook.
ETF_DIR = f"{DATASET_DIR}/ETFs"

# Set of the values in the 'Ticker symbol' column of the tab-separated file
# (presumably the S&P 500 constituents, judging by the file name -- confirm).
sp500 = set(pd.read_csv('dataset/s&p500.tsv', sep='\t')['Ticker symbol'])

In [3]:
def millibel_normalize(values, references):
    """
    Lazily express each value relative to its reference on a log scale.

    The two iterables are walked in lockstep (stopping at the shorter one)
    and, for every pair, `100 * ln(value / reference)` is yielded. The
    notebook refers to this unit as "milliBells".
    """
    for current, baseline in zip(values, references):
        ratio = current / baseline
        yield 100 * math.log(ratio)
    
def normalize_dataset(df):
    """
    Re-express every row's prices relative to the previous row's Close.

    Open, Close, High and Low of row i are passed through
    `millibel_normalize` with the Close of row i-1 as the reference. The
    first row has no predecessor and is therefore dropped; Date and Volume
    are carried over unchanged.

    Returns a new DataFrame with columns
    Date, Volume, Open, Close, High, Low.
    """
    previous_close = df.Close[:-1]
    price_fields = ['Open', 'Close', 'High', 'Low']

    # One lazy generator per price column; zip consumes them row by row.
    normalized = [
        millibel_normalize(df[field][1:], previous_close)
        for field in price_fields
    ]
    rows = list(zip(df.Date[1:], df.Volume[1:], *normalized))

    return pd.DataFrame(rows, columns=['Date', 'Volume'] + price_fields)


def dataset_to_timeseries(df, days):
    """
    Flatten sliding windows of `days` consecutive rows into single wide rows.

    Output row k combines input rows k .. k+days-1. Columns are suffixed with
    the lag inside the window: `.0` is the newest row of the window (which
    also supplies `Date`) and `.{days-1}` is the oldest.
    """
    window_count = len(df) + 1 - days

    # Shifted views of the frame, newest offset first, so that the position
    # in this list equals the lag used as column suffix.
    shifted = [
        df[offset:offset + window_count]
        for offset in range(days - 1, -1, -1)
    ]

    columns = ['Date']
    series = [shifted[0].Date]
    for lag, frame in enumerate(shifted):
        for field in ('Open', 'Close', 'High', 'Low', 'Volume'):
            columns.append(f'{field}.{lag}')
            series.append(frame[field])

    return pd.DataFrame(list(zip(*series)), columns=columns)


def concat_datasets(datasets):
    """
    Stack the frames of a `{symbol: DataFrame}` mapping into one DataFrame.

    Each frame gets a `symbol` column holding its mapping key before the
    frames are concatenated in mapping order; original row indices are kept.
    """
    labelled = []
    for ticker, frame in datasets.items():
        labelled.append(frame.assign(symbol=ticker))
    return pd.concat(labelled)

def extract_all_data(symbols, feature_days=10, min_date='1990-01-01'):
    """
    Load, normalize and window the stock CSVs of the requested symbols.

    Every file in `STOCK_DIR` whose name (the part before the first '.',
    upper-cased) is contained in `symbols` is read and processed.

    Parameters:
        symbols: container of upper-case ticker symbols to keep.
        feature_days: window length passed to `dataset_to_timeseries`.
        min_date: rows whose `Date` compares lower than this value are dropped.

    Returns:
        A tuple `(raw_data, normalized_data, timeseries_feature_data)` of
        dicts keyed by symbol, holding respectively the date-filtered raw
        frame, the output of `normalize_dataset`, and the output of
        `dataset_to_timeseries`.

    Files that pandas cannot parse (empty or malformed CSVs) are skipped and
    reported in a single warning instead of aborting the whole extraction.
    """
    failed_csvs = []
    raw_data = {}
    normalized_data = {}
    timeseries_feature_data = {}
    for filename in os.listdir(STOCK_DIR):
        symbol = filename.split('.')[0].upper()
        if symbol not in symbols:
            continue

        try:
            raw = pd.read_csv(f"{STOCK_DIR}/{filename}")
        except (pd.errors.EmptyDataError, pd.errors.ParserError):
            # Remember the file so the warning below can report it, then
            # carry on with the remaining symbols.
            failed_csvs.append(filename)
            continue

        raw_data[symbol] = raw[raw.Date >= min_date]
        normalized_data[symbol] = normalize_dataset(raw_data[symbol])
        timeseries_feature_data[symbol] = dataset_to_timeseries(normalized_data[symbol], feature_days)

    if failed_csvs:
        logging.warning(f'Failed to read {len(failed_csvs)} CSV files: {failed_csvs}')

    return raw_data, normalized_data, timeseries_feature_data
  
def split_dataset_by_date(dataset, test_size = 0.2, random_state=None):
    """
    Split `dataset` into train and test frames along its `Date` column.

    The distinct dates are partitioned with `train_test_split`, so all rows
    sharing a date end up on the same side of the split. Each resulting frame
    is shuffled and re-indexed from 0.

    Parameters:
        dataset: DataFrame with a `Date` column.
        test_size: forwarded to `train_test_split` (share of dates for test).
        random_state: optional seed forwarded to both `train_test_split` and
            `DataFrame.sample`; the default `None` keeps the split and the
            shuffles unseeded, a fixed value makes them reproducible.

    Returns:
        `(train_dataset, test_dataset)`.
    """
    trading_dates = sorted(set(dataset.Date))
    train_dates, test_dates = train_test_split(
        trading_dates, test_size=test_size, random_state=random_state)

    def _rows_for(dates):
        # Select the rows of the given dates, shuffle them, and drop the old index.
        selected = dataset[dataset.Date.isin(set(dates))]
        return selected.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return (_rows_for(train_dates), _rows_for(test_dates))


In [4]:
# Extract the data for the symbols in `sp500` with 10-day windows, keep only
# element [2] of the returned tuple (the per-symbol windowed feature frames),
# and stack those frames into one DataFrame tagged with a `symbol` column.
sp500_data = concat_datasets(extract_all_data(sp500, 10)[2])
# Partition the rows by date into train and test frames (default test_size).
sp500_train_data, sp500_test_data = split_dataset_by_date(sp500_data)

In [7]:
# Only the last expression of a cell is displayed, so the sizes are printed
# explicitly instead of being computed and silently discarded.
print(len(sp500_train_data), len(sp500_test_data), len(sp500_data))
# Preview the first rows rather than dumping the whole training frame.
sp500_train_data.head(10)

Unnamed: 0,Date,Open.0,Close.0,High.0,Low.0,Volume.0,Open.1,Close.1,High.1,Low.1,...,Close.8,High.8,Low.8,Volume.8,Open.9,Close.9,High.9,Low.9,Volume.9,symbol
0,1997-06-24,1.092907,-0.264085,1.092907,-1.372006,1275300,-0.511947,-3.243528,0.000000,-3.243528,...,0.582569,1.117330,-0.585983,556900,0.000000,-2.222314,0.000000,-3.352269,897000,AZO
1,2012-06-22,0.750962,0.443354,0.861155,-0.421422,2038219,0.404324,-1.824093,0.450871,-2.046510,...,0.895505,0.895505,-0.871184,1235078,0.682918,-1.948604,0.714205,-2.093276,1722152,SPG
2,2001-11-23,0.828584,2.724598,2.784309,0.252018,1603153,0.000000,-0.591577,0.931244,-1.166095,...,-0.753016,0.530541,-1.997587,3105680,0.576566,-0.252018,0.968088,-0.826536,3582234,USB
3,2012-10-22,0.255877,-1.585766,1.926872,-2.263209,2060804,5.142995,4.923061,8.180088,2.696699,...,-0.475977,0.158157,-0.984467,1742044,-0.083399,-2.596891,-0.083399,-2.639679,1611430,RHI
4,1994-07-25,-0.192748,0.665945,0.797222,-0.465085,2531191,1.327248,0.329987,1.655182,0.329987,...,-0.259386,0.130586,-0.657806,2379263,0.000000,-0.130586,0.000000,-1.188406,1315770,CSX
5,2009-02-23,0.716778,-1.987856,1.418604,-2.355429,3882819,-1.230244,-2.914453,0.609493,-9.297327,...,-5.613571,1.574710,-7.539753,6286432,-0.222103,2.475273,3.194126,-1.481754,4042354,DISH
6,2009-06-12,-0.580292,-0.861642,-0.039912,-1.629323,608735,0.204040,2.341649,3.156358,0.183655,...,-0.885571,0.566574,-1.500226,1021705,0.558661,2.098671,2.196586,-1.147009,712622,SRCL
7,1997-03-13,0.000000,-1.472648,0.000000,-2.277332,1046836,0.491676,0.000000,0.725490,-1.018952,...,0.383175,1.031789,-0.603467,883122,-0.648614,-1.031789,0.000000,-1.031789,1345055,EMR
8,1997-08-19,-0.841196,-0.313219,0.827850,-1.160294,2952036,0.000000,-0.827850,0.000000,-1.141069,...,-2.966714,-2.141543,-3.483930,3802853,0.313219,3.797149,4.402439,0.313219,6045619,LB
9,2011-10-19,0.229130,-1.024446,2.324392,-1.310733,16713370,0.410587,1.522500,2.985624,-1.451184,...,-0.751899,0.599453,-0.933946,9240065,-0.142813,1.405651,1.405651,-0.995264,7446904,NI
