In [40]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
from __future__ import print_function

import datetime
import pandas as pd
import numpy as np
from functools import partial
from dateutil import parser as dtparser

from utils.datafetch import *
from utils.vectorized_funs import *
from utils.datapipe import *
from utils.kerasutil import *
from FintechCapstone import FinCapstone
from utils import baseline_model as baseline


from plotly import __version__
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import tools

init_notebook_mode(connected=True)


In [7]:
# Date bounds for the experiment. Only `date_from` is handed to FinCapstone
# in this cell; `date_to` is defined but not used here.
date_from = '2009-01-01'
date_to = '2017-03-01'

# Horizons grouped by term, passed to FinCapstone as `timespan`
# (presumably measured in trading days -- TODO confirm).
timespan = {
    "short_term": [1, 5],
    "medium_term": [40],
    "long_term": [90],
}

# Second set of horizons, passed to FinCapstone as `timespan_ab`.
timespan_ab = {
    "short_term": [],
    "medium_term": [30],
    "long_term": [],
}

trial = FinCapstone(
    scenario="baseline",
    ticker_list=[
        "ZHNE", "BSFT", "AERO", "MATR", "ITMSF", "FIT", "TIPT", "TSRE",
        "OLBK", "QADA", "ATTU", "LGF", "STX", "TPCS", "GSAT", "ZNGA",
        "GLW", "HIMX", "YHOO", "NWSA", "JAKK", "GLUU", "AMZN", "AAPL",
        "EBAY", "GOOG", "DIS", "NFLX", "EA", "TWTR", "FB", "TTWO",
        "PXLW", "UBI",
    ],
    ticker_list_samplesize=4,
    timespan=timespan,
    timespan_ab=timespan_ab,
    date_from=date_from,
    reset_status=False,
)
# Disabled alternative: build the trial without an explicit ticker list.
#trial = FinCapstone(ticker_list_samplesize=200, timespan=timespan, timespan_ab=timespan_ab, date_from=date_from)

# Disabled pipeline steps (presumably only needed when the cached data has
# to be rebuilt -- TODO confirm).
#trial.run_initial_dataload()
#trial.feature_engineering()

- ~~Open~~
- ~~Close~~
- ~~High~~
- ~~Low~~
- ~~Volume~~

In [8]:
# Ticker under inspection, and the slice start handed to the plotting helpers
# (they apply `.iloc[_slice_limit:]`, so -252 keeps the last 252 rows).
_ticker = "NFLX"
_slice_limit = -252

# Load the raw, feature and label frames and index each one by its "Date" column.
raw_df = load_raw_frame(_ticker, parseDate=True, dropAdjClose=True).set_index("Date")
features_df = trial.load_baseline_features(_ticker, parseDate=True).set_index("Date")
labels_df = trial.load_baseline_labels(_ticker, parseDate=True).set_index("Date")

# Split features/labels using the date boundaries stored on `trial`.
X_train, y_train, X_test, y_test = baseline.prepare_problemspace(
    features_df,
    labels_df,
    trial.train_from,
    trial.train_until,
    trial.test_from,
)


In [51]:
def plot_rawprices(raw_df, slice_limit, ticker):
    """Draw the "Close" line above a "Volume" bar chart in one figure.

    Args:
        raw_df: DataFrame providing "Close" and "Volume" columns; its index
            supplies the x values of both traces.
        slice_limit: Start position applied as ``raw_df.iloc[slice_limit:]``
            (e.g. ``-252`` keeps the last 252 rows). ``None`` plots every row.
        ticker: Text used as the figure title.
    """
    window = raw_df if slice_limit is None else raw_df.iloc[slice_limit:]
    x_values = window.index.tolist()
    trace_colour = '#0099ff'

    close_line = go.Scatter(
        x=x_values,
        y=window["Close"],
        mode='lines',
        name='Close',
        marker=dict(color=trace_colour),
    )
    volume_bars = go.Bar(
        x=x_values,
        y=window["Volume"],
        name='Volume',
        marker=dict(color=trace_colour),
    )

    # Four-row grid: the first cell spans rows 1-3, the last row is a
    # regular single cell.
    grid_spec = [[{'rowspan': 3}], [None], [None], [{}]]
    fig = tools.make_subplots(rows=4, cols=1, specs=grid_spec, print_grid=False)

    for trace, row in ((close_line, 1), (volume_bars, 4)):
        fig.append_trace(trace, row, 1)

    fig['layout'].update(title=ticker)

    iplot(fig)


# Close/Volume chart for the selected ticker, limited to the rows kept by _slice_limit.
plot_rawprices(raw_df, _slice_limit, _ticker)

In [49]:
def plot_raw_vs_baseline(raw_df, features_df, labels_df, slice_limit, ticker):
    """Compare raw prices with their derived baseline feature/label series.

    The figure is a 4x2 grid:
      * row 1: "Close" (left) and ``labels_df["RETURN_1"]`` (right)
      * row 2: "High" (left) and ``features_df["CHANGE_HIGH_1"]`` (right)
      * row 3 (full width): "Close" and "High" overlaid
      * row 4 (full width): "RETURN_1" and "CHANGE_HIGH_1" overlaid

    Args:
        raw_df: DataFrame with "Close" and "High" columns; its index supplies
            the x values for every trace.
        features_df: DataFrame with a "CHANGE_HIGH_1" column.
        labels_df: DataFrame with a "RETURN_1" column.
        slice_limit: Start position applied as ``raw_df.iloc[slice_limit:]``
            (e.g. ``-252`` keeps the last 252 rows). ``None`` keeps every row.
        ticker: Text used as the figure title.

    Note:
        ``features_df`` and ``labels_df`` are aligned to the (sliced)
        ``raw_df`` index by label, so all three frames are expected to share
        the same kind of index (the notebook indexes all of them by "Date").
        Index entries missing from a frame show up as gaps in its trace.
    """
    if slice_limit is not None:
        raw_df = raw_df.iloc[slice_limit:]

    # Previously only raw_df was sliced, so the derived series were plotted
    # from their first row against the x values of the sliced raw frame.
    # Aligning by index label keeps every y value on its own x value.
    features_df = features_df.reindex(raw_df.index)
    labels_df = labels_df.reindex(raw_df.index)

    x = raw_df.index.tolist()

    trace0 = go.Scatter(
        x=x,
        y=raw_df["Close"],
        mode='lines',
        name='Close',
        marker=dict(color='#0099ff')
    )
    trace1 = go.Scatter(
        x=x,
        y=labels_df["RETURN_1"],
        name='1 Day Return',
        marker=dict(color='#0099ff')
    )
    trace2 = go.Scatter(
        x=x,
        y=raw_df["High"],
        name='High',
        marker=dict(color='#00ff99')
    )
    trace3 = go.Scatter(
        x=x,
        y=features_df["CHANGE_HIGH_1"],
        name='CHANGE_HIGH_1',
        marker=dict(color='#00ff99')
    )

    # Rows 1-2 hold one panel per column; rows 3-4 each span both columns.
    fig = tools.make_subplots(
        rows=4,
        cols=2,
        specs=[
            [{}, {}],
            [{}, {}],
            [{"colspan": 2}, None],
            [{"colspan": 2}, None],
        ],
        print_grid=False,
    )

    # Individual panels.
    fig.append_trace(trace0, 1, 1)
    fig.append_trace(trace1, 1, 2)
    fig.append_trace(trace2, 2, 1)
    fig.append_trace(trace3, 2, 2)

    # Raw price overlay.
    fig.append_trace(trace0, 3, 1)
    fig.append_trace(trace2, 3, 1)

    # Derived series overlay.
    fig.append_trace(trace1, 4, 1)
    fig.append_trace(trace3, 4, 1)

    fig['layout'].update(height=1000, width=1000, title=ticker)

    iplot(fig, filename='scatter-mode')

# Raw Close/High next to RETURN_1 / CHANGE_HIGH_1 for the selected ticker.
plot_raw_vs_baseline(raw_df, features_df, labels_df, _slice_limit, _ticker)

In [11]:
def plot_paralel(raw_df, color_col):
    """Render a parallel-coordinates chart with one axis per column.

    Args:
        raw_df: DataFrame to plot; every column becomes one dimension whose
            axis range is that column's own min/max.
        color_col: Name of the column in ``raw_df`` whose values colour the
            lines.
    """
    color_values = raw_df[color_col]

    _dimensions = [
        dict(
            range=[raw_df[col].min(), raw_df[col].max()],
            label=col,
            values=raw_df[col],
            tickformat=".3r",
        )
        for col in raw_df.columns.tolist()
    ]

    data = [
        go.Parcoords(
            line=dict(
                color=color_values,
                colorscale='Jet',
                showscale=True,
                reversescale=True,
                # The colour bounds must describe the colouring column only.
                # Taking min/max over the whole frame lets a column on a very
                # different scale squash every line into a single colour.
                cmin=color_values.min(),
                cmax=color_values.max(),
            ),
            dimensions=_dimensions,
        )
    ]

    iplot(data, filename='parcoords-advanced')


In [12]:
# One parallel-coordinates chart per frame, each coloured by one of its own columns.
plot_paralel(raw_df, "Close")
plot_paralel(features_df, "CHANGE_OPEN_1")
plot_paralel(labels_df, "RETURN_1")

In [13]:
# Column-wise concatenation (axis=1) of the raw, feature and label frames.
all_df = pd.concat([raw_df, features_df, labels_df], axis=1)

## Running the line below will result in an "Entity too large" error and the notebook can no longer be saved
#plot_paralel(all_df, "RETURN_1")

In [41]:
def _flag_heatmap(flags, columns, index):
    """Build a two-colour heatmap trace from a matrix of 0.0/1.0 flags."""
    return go.Heatmap(
        z=flags,
        zmin=0.0,
        zmax=1.0,
        x=columns,
        y=index,
        colorscale=[
            [0.0, 'rgb(0,0,0)'],
            [1.0, '#0099ff']],
        colorbar=dict(
            tickmode="array",
            tickvals=[1.0, 0.0],
            ticktext=["Not Valid", "Valid"]
        )
    )


def plot_naninfzero_test(raw_df, plot_title):
    """Show where a frame holds NaN, infinite or exactly-zero values.

    Three heatmaps are drawn side by side (NaN, infinity, zero). In each one
    a cell is flagged with 1.0 when the test matches and 0.0 otherwise, and
    the subplot title carries the number of flagged cells.

    Args:
        raw_df: DataFrame to inspect. ``np.isnan`` / ``np.isinf`` are applied
            to it directly, so it is expected to hold numeric data.
        plot_title: Prefix of the figure title; the total count of non-NaN
            cells (``raw_df.count().sum()``) is appended to it.
    """
    # Builtin `float` replaces the `np.float` alias, which NumPy deprecated
    # in 1.20 and removed in 1.24.
    z_nan = np.isnan(raw_df).values.astype(float)
    z_inf = np.isinf(raw_df).values.astype(float)
    z_zero = (raw_df == 0).values.astype(float)

    columns = raw_df.columns.tolist()
    index = raw_df.index.tolist()

    nan_trace = _flag_heatmap(z_nan, columns, index)
    inf_trace = _flag_heatmap(z_inf, columns, index)
    zero_trace = _flag_heatmap(z_zero, columns, index)

    _subplot_titles = [
        "Nan Test ({})".format(int(z_nan.sum())),
        "Infinity Test ({})".format(int(z_inf.sum())),
        "Exactly Zero ({})".format(int(z_zero.sum()))
    ]

    fig = tools.make_subplots(rows=1, cols=3, print_grid=False, subplot_titles=_subplot_titles)

    fig.append_trace(nan_trace, 1, 1)
    fig.append_trace(inf_trace, 1, 2)
    fig.append_trace(zero_trace, 1, 3)

    fig['layout'].update(height=300, width=800, title='{} ({} obs.)'.format(plot_title, raw_df.count().sum()))

    iplot(fig, filename='scatter-mode')

In [47]:
# NaN / infinity / exactly-zero heatmaps for the feature and label frames.
plot_naninfzero_test(features_df,"Features")
plot_naninfzero_test(labels_df,"Labels")