In [1]:
import numpy as np
import datetime as dt
import tensorflow as tf
from pailab import MLRepo, MeasureConfiguration, MLObjectType, FIRST_VERSION, LAST_VERSION
import logging
from pailab.tools.tools import MLTree

logging.basicConfig(level=logging.ERROR)

In [2]:
# Open the repository located in the given workspace directory.
# NOTE(review): the workspace is a hard-coded absolute local path; the notebook
# only runs on a machine where this directory exists.
ml_repo = MLRepo( workspace = 'c:/ml_repos/sc')
import pailab.plot as plot
# Attach the MLTree helper to the repository; later cells address repository
# objects through ml_repo.tree.
MLTree.add_tree(ml_repo)

In [3]:
# Display the ObjectOverviewList widget for the repository.
from IPython.display import display, clear_output
from pailab.tools_jupyter import ObjectOverviewList
repo_overview = ObjectOverviewList(ml_repo)
display(repo_overview.get_widget())

HBox(children=(VBox(children=(SelectMultiple(options=('EVAL_DATA', 'RAW_DATA', 'TRAINING_DATA', 'TEST_DATA', '…

In [4]:
# Display the ObjectView widget for the repository.
from pailab.tools_jupyter import ObjectView
obj_view = ObjectView(ml_repo)
display(obj_view.get_widget())

HBox(children=(VBox(children=(SelectMultiple(index=(16,), options=('EVAL_DATA', 'RAW_DATA', 'TRAINING_DATA', '…

In [5]:
# Fetch the 'simple_dense/training_stat' object from the repository and print
# its loss attribute. The variable carries a descriptive name so that it is
# not silently rebound by the function depp() defined further down.
training_stat = ml_repo.get('simple_dense/training_stat', full_object=True)
print(training_stat.loss)

[  1.86895855e-02   1.91255816e-03   1.12504023e-03 ...,   8.43056997e-06
   8.57231530e-06   8.54969736e-06]


In [6]:
import h5py

# Consistency checks

In [7]:
# Display the ConsistencyChecker widget for the repository.
from pailab.tools_jupyter import ConsistencyChecker
consistency_checker = ConsistencyChecker(ml_repo)
display(consistency_checker.get_widget())

VBox(children=(Button(description='update', style=ButtonStyle()), Output()))

In [8]:
import pailab.tools.checker as checker
# Run the repository checker from code; the return value is shown as the cell output.
checker.Repo.run(ml_repo)

{}

# Plotting

## Plot data

In [9]:
# Histogram (100 bins) of the 'vol_of_variance' coordinate of the test data,
# requested for version 'last' and start_index=0, end_index=10000.
plot.histogram_data(ml_repo, {ml_repo.tree.test_data.sc_call_prices_test():'last'}, x_coordinate = 'vol_of_variance', start_index=0, end_index=10000, n_bins=100)

## Measure history

In [10]:
# Plot the history of the measure 'simple_dense/measure/sc_call_prices/mse'.
plot.measure_history(ml_repo, measure_name=['simple_dense/measure/sc_call_prices/mse'])

## Error vs training parameter

In [11]:
# Display the Plotter widget for the repository.
from pailab.tools_jupyter import Plotter
plotter = Plotter(ml_repo)
display(plotter.get_widget())

VBox(children=(HBox(children=(Button(description='plot', style=ButtonStyle()), SelectMultiple(options=('simple…

## Projection

In [12]:
import pailab.plot as plot
data_1 = ml_repo.get('sc_call_prices', full_object = True)

# Work on a copy of row 100: if x_data is a NumPy array, x_data[100, :] is a
# view and the assignments below would overwrite the loaded data itself.
left = np.copy(data_1.x_data[100, :])
# Both end points share index 6 and differ only in index 5 (according to the
# printed x_coord_names, index 5 is 'strike' and index 6 is 'expiry').
left[6] = 1.0
right = np.copy(left)
left[5] = 0.7
right[5] = 1.3
print(data_1.x_coord_names)
plot.projection(ml_repo, left = left, right=right, n_steps = 500, output_index = 0)

['initial_variance', 'speed_of_meanreversion', 'long_run_variance', 'vol_of_variance', 'correlation', 'strike', 'expiry']


## Histogram of errors

In [13]:
# Histogram (300 bins) of the error of the simple_dense model on the test data, with end_index = 5000.
plot.histogram_model_error(ml_repo, ml_repo.tree.models.simple_dense.model(), ml_repo.tree.test_data.sc_call_prices_test(), end_index = 5000, n_bins=300)

## Error vs input data

In [14]:
# Scatter plot of the simple_dense model error against the 'vol_of_variance'
# coordinate of the test data, with start_index = 0 and end_index = 5000.
plot.scatter_model_error(ml_repo, ml_repo.tree.models.simple_dense.model(), 
                         [ml_repo.tree.test_data.sc_call_prices_test()], 'vol_of_variance', start_index= 0 , end_index = 5000)

## Data distribution of largest errors

In [15]:
    # Histogram (100 bins) of the 'strike' coordinate of the test data, conditional
    # on the model error with percentile = 0.01 (presumably the data points with
    # the largest errors, as the section title suggests -- confirm in pailab.plot).
    plot.histogram_data_conditional_error(ml_repo,  ml_repo.tree.models.simple_dense.model(),
                                          ml_repo.tree.test_data.sc_call_prices_test(), x_coordinate = 'strike', 
                                          start_index = 0, end_index=10000, percentile = 0.01, n_bins=100)

# Test for interpretability

In [16]:
import numpy as np
from pailab.tools.tools import ModelAnalyzer

In [17]:
if False:
    # Disabled helper: the guard above keeps this definition from being executed.
    from joblib import dump, load

    def save_load_tmp(model_analyzer, version):
        """Exchange the analyzer's decision tree with a joblib file.

        If the analyzer already holds a decision tree it is written to the
        file belonging to ``version``; otherwise the tree is read from that
        file and stored on the analyzer.
        """
        tmp_file = 'C:/ml_repos/sc/tmp/'+ version +'.joblib'
        if model_analyzer._decision_tree is not None:
            dump(model_analyzer._decision_tree, tmp_file)
        else:
            model_analyzer._decision_tree = load(tmp_file)

## Local linear regression

In [18]:
# Local model analysis of simple_dense on the test data without an explicit
# version argument (presumably this selects the latest version -- confirm).
model_analyzer_latest = ModelAnalyzer(ml_repo)
result_latest = model_analyzer_latest.analyze_local_model(ml_repo.tree.models.simple_dense.model(), 
                                                          ml_repo.tree.test_data.sc_call_prices_test(),n_samples = 40, 
                                                          end_index = 22, max_depth=4)#, version='2f777012-2ea7-11e9-83cf-fc084a6691eb' , force_recalc=True)
#save_load_tmp(model_analyzer_latest, result_latest.repo_info.version)
# Same analysis with identical parameters, but for the explicitly given version,
# so that both results can be compared in the plots below.
model_analyzer = ModelAnalyzer(ml_repo)
result = model_analyzer.analyze_local_model(ml_repo.tree.models.simple_dense.model(), 
                                            ml_repo.tree.test_data.sc_call_prices_test(), n_samples = 40, 
                                            end_index = 22,  max_depth=4, version='2f777012-2ea7-11e9-83cf-fc084a6691eb') 

#save_load_tmp(model_analyzer, result.repo_info.version)

In [19]:
# Delete three hard-coded versions of the stored model analyzer result.
# NOTE(review): this modifies the repository and is tied to versions of one
# specific workspace; it is unclear from here how ml_repo.delete behaves when a
# version is already gone, so re-running this cell may fail -- confirm.
for v in ['f56f62e4-3a55-11e9-8e06-fc084a6691eb', '5e9d7380-3a56-11e9-9448-fc084a6691eb', '85d064f0-3ab7-11e9-969b-fc084a6691eb']:
    ml_repo.delete('model_analyzer_simple_dense/model_sc_call_prices_test', v)

In [20]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
#import plotly.plotly as py

def plot_local_model_coeff_by_coord(result):
    """Plot the local model coefficients as grouped bars, one trace per leaf.

    Args:
        result (dict): Needs the entry 'x_coord_names' (used as x values of
            every trace) and the entry 'node_statistics', a dict mapping a
            leaf id to a dict that provides 'model_coefficients' (used as the
            y values of the trace named 'leaf <id>').
    """
    x_names = result['x_coord_names']
    node_stat = result['node_statistics']
    data = [
        go.Bar(x=x_names, y=stats['model_coefficients'], name='leaf ' + str(leaf))
        for leaf, stats in node_stat.items()
    ]
    layout = go.Layout(barmode='group')
    fig = go.Figure(data=data, layout=layout)
    iplot(fig, filename='grouped-bar')
    
# Coefficients per coordinate: first for the explicitly versioned analysis,
# then for the analysis run without a version argument.
plot_local_model_coeff_by_coord(result.result)
plot_local_model_coeff_by_coord(result_latest.result)


In [21]:
def plot_local_model_coeff_by_leaf(result):
    """Plot the local model coefficients as stacked bars, one trace per coordinate.

    Args:
        result (dict): Needs the entry 'x_coord_names' (one trace is created
            per name) and the entry 'node_statistics', a dict mapping a leaf
            id to a dict that provides 'model_coefficients'. The i-th
            coefficient of every leaf forms the y values of the i-th trace;
            the x values are the labels 'leaf <id>'.
    """
    x_names = result['x_coord_names']
    node_stat = result['node_statistics']
    # The leaf labels are the same for every coordinate, so build them once.
    leaf_names = ['leaf ' + str(leaf) for leaf in node_stat]
    data = []
    for i, coord_name in enumerate(x_names):
        y_values = [stats['model_coefficients'][i] for stats in node_stat.values()]
        data.append(go.Bar(x=leaf_names, y=y_values, name=coord_name))

    layout = go.Layout(barmode='stack')
    fig = go.Figure(data=data, layout=layout)
    iplot(fig, filename='grouped-bar')
    
def depp(result=None, leaf_nodes=None):
    """Plot the number of data points and the mean mse per leaf as two bar charts.

    Args:
        result (dict, optional): Maps a leaf id to a dict that provides the
            entries 'num_data_points' and 'mse_mean'.
        leaf_nodes (iterable, optional): Leaf ids to plot, in plotting order.
            Defaults to all keys of ``result``.

    Raises:
        ValueError: If ``result`` is not given.
    """
    if result is None:
        # There is no sensible default: the statistics must be passed in.
        raise ValueError('result must be given')
    if leaf_nodes is None:
        leaf_nodes = list(result.keys())

    x_names = []
    num_values = []
    mse_values = []
    for leaf in leaf_nodes:
        x_names.append('leaf ' + str(leaf))
        num_values.append(result[leaf]['num_data_points'])
        mse_values.append(result[leaf]['mse_mean'])

    # 'group' is a valid plotly barmode; the same layout is used for both figures.
    layout = go.Layout(barmode='group')

    fig = go.Figure(data=[go.Bar(x=x_names, y=num_values)], layout=layout)
    iplot(fig, filename='grouped-bar')

    fig = go.Figure(data=[go.Bar(x=x_names, y=mse_values)], layout=layout)
    iplot(fig, filename='grouped-bar')


# Coefficients per leaf: first for the explicitly versioned analysis, then for
# the analysis run without a version argument.
plot_local_model_coeff_by_leaf(result.result)
plot_local_model_coeff_by_leaf(result_latest.result)


In [22]:
def plot_error_per_leaf(result):
    """Plot mean, minimum and maximum mse of the local models per leaf.

    Args:
        result (dict): Needs the entry 'node_statistics', a dict mapping a
            leaf id to a dict that provides 'mse_mean', 'mse_min' and
            'mse_max'. One line is drawn per statistic over the labels
            'leaf <id>'.
    """
    node_stat = result['node_statistics']
    x_coordnames = []
    mean_error = []
    min_error = []
    max_error = []
    for leaf, stats in node_stat.items():
        x_coordnames.append('leaf ' + str(leaf))
        mean_error.append(stats['mse_mean'])
        min_error.append(stats['mse_min'])
        max_error.append(stats['mse_max'])

    data = [
        go.Scatter(x=x_coordnames, y=mean_error, name='mean'),
        go.Scatter(x=x_coordnames, y=min_error, name='min'),
        go.Scatter(x=x_coordnames, y=max_error, name='max'),
    ]
    layout = go.Layout(
        title='local model mse',
        xaxis=dict(title='leaf'),
        yaxis=dict(title='')
    )
    fig = go.Figure(data=data, layout=layout)
    iplot(fig, filename='grouped-bar')

# Error per leaf: first for the explicitly versioned analysis, then for the
# analysis run without a version argument.
plot_error_per_leaf(result.result)
plot_error_per_leaf(result_latest.result)
#result.big_data['local_model_coeff']

In [23]:
import pailab.plot as pailab_plot
def plot_mse_quantile_data_dist(ml_repo, result, eval_data, x_coord, q = 50):
    """Histogram of one coordinate over the data points with a large mse.

    Args:
        ml_repo: Repository the evaluation data is loaded from via ``get``.
        result: Analysis result. Read are ``result.repo_info.modification_info``
            (version of ``eval_data``), ``result.big_data['mse']`` and
            ``result.result['parameter']`` with 'start_index' and 'end_index'.
        eval_data (str): Name of the data object in the repository.
        x_coord (str): Name of the coordinate, looked up in ``x_coord_names``
            of the loaded data.
        q (float): Percentile of the mse values; only data points whose mse is
            strictly larger than this percentile enter the histogram.
    """
    data = ml_repo.get(eval_data, result.repo_info.modification_info[eval_data], full_object=True)
    x_coord_index = data.x_coord_names.index(x_coord)
    mse = result.big_data['mse']
    threshold = np.percentile(mse, q)

    # Restrict the data to the index range the analysis was computed on.
    parameter = result.result['parameter']
    x_values = data.x_data[parameter['start_index']:parameter['end_index'], x_coord_index]
    selected = np.extract(mse > threshold, x_values)

    plot_data = {'title': 'histogram of ' + x_coord + ', mse > ' + str(q) + '% quantile',
                 'x0_name': x_coord,
                 'data': {'': {'info': {}, 'x0': selected}}}
    pailab_plot._histogram(plot_data)
       
# Distribution of 'expiry' over the test data points whose mse lies above the 90th percentile.
plot_mse_quantile_data_dist(ml_repo, result, ml_repo.tree.test_data.sc_call_prices_test(), 'expiry', q=90)
#result.big_data['mse']

In [24]:
def plot_leaf_node_data_dist(ml_repo, result, eval_data, x_coord, leaf_node):
    """Histogram of one coordinate over the data points assigned to given leaves.

    Args:
        ml_repo: Repository the evaluation data is loaded from via ``get``.
        result: Analysis result. Read are ``result.repo_info.modification_info``
            (version of ``eval_data``), ``result.big_data['data_to_leaf_index']``
            and ``result.result['parameter']`` with 'start_index' and 'end_index'.
        eval_data (str): Name of the data object in the repository.
        x_coord (str): Name of the coordinate, looked up in ``x_coord_names``
            of the loaded data.
        leaf_node: A single leaf id (str or int) or an iterable of leaf ids;
            one histogram series named 'leaf <id>' is created per id.
    """
    # A single id is wrapped so that the loop below works for both call styles.
    if isinstance(leaf_node, (str, int)):
        leaf_nodes = [leaf_node]
    else:
        leaf_nodes = leaf_node

    # None of this depends on the leaf, so load and slice the data only once.
    data = ml_repo.get(eval_data, result.repo_info.modification_info[eval_data], full_object=True)
    x_coord_index = data.x_coord_names.index(x_coord)
    data_to_leaf_index = result.big_data['data_to_leaf_index']
    parameter = result.result['parameter']
    x_values = data.x_data[parameter['start_index']:parameter['end_index'], x_coord_index]

    plt_data = {}
    for node in leaf_nodes:
        selected = np.extract(data_to_leaf_index == node, x_values)
        plt_data['leaf ' + str(node)] = {'info': {}, 'x0': selected}

    # The title names everything that was requested, not just the last leaf.
    plot_data = {'title': 'histogram of ' + x_coord + ', leafnode = ' + str(leaf_node),
                 'x0_name': x_coord,
                 'data': plt_data}
    pailab_plot._histogram(plot_data)
# Distribution of 'expiry' over the test data points assigned to the leaves 12 and 19.
plot_leaf_node_data_dist(ml_repo, result, ml_repo.tree.test_data.sc_call_prices_test(), 'expiry', leaf_node=[12,19])

## ICE

In [25]:
# ICE analysis of simple_dense on the test data for output coordinate 0.
# The direction vector is non-zero only in its last (7th) entry; presumably
# this is the 'expiry' coordinate, assuming the test data uses the coordinate
# order printed for 'sc_call_prices' above -- confirm.
ice_results = model_analyzer.analyze_ice(ml_repo.tree.models.simple_dense.model(), 
                                       ml_repo.tree.test_data.sc_call_prices_test(),
                                       direction = np.array([0.0,0.0,0.0,0.0,0.0,0.0,2.0]),
                                        y_coordinate=0,
                                       start_index = 0, end_index = 104, n_steps = 10, n_clusters=9)

In [26]:
def plot_ice_cluster_centers(results):
    """Plot every ICE cluster center as a line over an equidistant grid on [0, 1].

    Args:
        results: ICE analysis result; ``results.big_data['cluster_centers']``
            is read as a 2-d array with one row per cluster and one column
            per step. Each row becomes a trace named 'cluster <row index>'.
    """
    cluster_centers = results.big_data['cluster_centers']
    n_steps = cluster_centers.shape[1]
    # Guard the denominator so that a single step yields x = [0.0] instead of
    # a division by zero.
    denominator = float(max(n_steps - 1, 1))
    x_data = [0.0 + float(i) / denominator for i in range(n_steps)]

    data = [
        go.Scatter(x=x_data, y=cluster_centers[i, :], name='cluster ' + str(i))
        for i in range(cluster_centers.shape[0])
    ]
    layout = go.Layout(
        title='cluster centers, ice',
        xaxis=dict(title='steps'),
        yaxis=dict(title='value')
    )
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)
    
# Show the cluster centers of the ICE analysis computed above.
plot_ice_cluster_centers(ice_results)

In [31]:
# List the entries available in the big_data of the ICE result.
ice_results.big_data.keys()

dict_keys(['ice', 'steps', 'labels', 'cluster_centers', 'distance_to_center'])

In [28]:
def plot_cluster_data_dist(ml_repo, result, eval_data, x_coord, cluster):
    """Histogram of one coordinate over the data points of given ICE clusters.

    Args:
        ml_repo: Repository the evaluation data is loaded from via ``get``.
        result: ICE analysis result. Read are
            ``result.repo_info.modification_info`` (version of ``eval_data``),
            ``result.big_data['labels']`` and ``result.result['param']`` with
            'start_index' and 'end_index'.
        eval_data (str): Name of the data object in the repository.
        x_coord (str): Name of the coordinate, looked up in ``x_coord_names``
            of the loaded data.
        cluster: A single cluster id (str or int) or an iterable of ids; one
            histogram series named 'cluster <id>' is created per id.
    """
    clusters = [cluster] if isinstance(cluster, (str, int)) else cluster
    data_to_cluster = result.big_data['labels']

    # None of this depends on the cluster, so load and slice the data only once.
    data = ml_repo.get(eval_data, result.repo_info.modification_info[eval_data], full_object=True)
    x_coord_index = data.x_coord_names.index(x_coord)
    param = result.result['param']
    x_values = data.x_data[param['start_index']:param['end_index'], x_coord_index]

    plt_data = {}
    for c in clusters:
        selected = np.extract(data_to_cluster == c, x_values)
        plt_data['cluster ' + str(c)] = {'info': {}, 'x0': selected}

    plot_data = {'title': 'histogram of ' + x_coord + ', cluster ' + str(cluster),
                 'x0_name': x_coord,
                 'data': plt_data}
    pailab_plot._histogram(plot_data)
    
# Distribution of 'expiry' over the test data points labelled with cluster 6 or 5.
plot_cluster_data_dist(ml_repo, ice_results, ml_repo.tree.test_data.sc_call_prices_test(), 'expiry', cluster=[6,5])

In [29]:
def plot_cluster_distance_hist(ml_repo, result, cluster):
    """Histogram of the distance to the cluster center for given ICE clusters.

    Args:
        ml_repo: Not used by this function.
        result: ICE analysis result; ``result.big_data['labels']`` and
            ``result.big_data['distance_to_center']`` are read.
        cluster: A single cluster id (str or int) or an iterable of ids; one
            histogram series named 'cluster <id>' is created per id.
    """
    selected_clusters = [cluster] if isinstance(cluster, (str, int)) else cluster
    labels = result.big_data['labels']

    series = {
        'cluster ' + str(cluster_id): {
            'info': {},
            'x0': np.extract(labels == cluster_id, result.big_data['distance_to_center']),
        }
        for cluster_id in selected_clusters
    }

    pailab_plot._histogram({
        'title': 'histogram of distance to cluster center, cluster ' + str(cluster),
        'x0_name': 'distance',
        'data': series,
    })
    
# Distance-to-center histograms for the clusters 0, 1 and 2.
plot_cluster_distance_hist(ml_repo, ice_results,cluster=[0,1,2])

In [47]:
def plot_ice_quantiles(result, percentile=99):
    """Plot the ICE rows whose distance to the cluster center is unusually large.

    Args:
        result: ICE analysis result; ``result.big_data`` must provide
            'distance_to_center', 'ice' (one row per data point) and
            'cluster_centers' (its second dimension gives the number of steps).
        percentile (float): Only rows whose distance is strictly larger than
            this percentile of all distances are plotted.
    """
    distance_to_center = np.asarray(result.big_data['distance_to_center'])
    threshold = np.percentile(distance_to_center, percentile)
    # Indices of all rows above the threshold, in their original order.
    rows = np.flatnonzero(distance_to_center > threshold)
    percentile_ice = result.big_data['ice'][rows, :]

    n_steps = result.big_data['cluster_centers'].shape[1]
    # Guard the denominator so that a single step yields x = [0.0] instead of
    # a division by zero.
    denominator = float(max(n_steps - 1, 1))
    x_data = [0.0 + float(i) / denominator for i in range(n_steps)]

    data = [
        go.Scatter(x=x_data, y=percentile_ice[i, :])
        for i in range(percentile_ice.shape[0])
    ]
    layout = go.Layout(
        title='ice for distance > ' + str(percentile) + ' percentile',
        xaxis=dict(title='steps'),
        yaxis=dict(title='value')
    )
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)

# ICE rows beyond the default 99th percentile of the distance to the cluster center.
plot_ice_quantiles(ice_results)

In [42]:
# Inspect the rows 1, 2 and 3 of the 'ice' array of the ICE result.
ice_results.big_data['ice'][[1,2,3], :]

array([[ 0.        ,  0.        ,  0.0432354 ,  0.16987014,  0.18236308,
         0.22982632,  0.32866055,  0.42064863,  0.50324178,  0.58735251],
       [ 0.        ,  0.        ,  0.10344671,  0.24264109,  0.25220191,
         0.27506629,  0.33990154,  0.40842783,  0.47189429,  0.53490764],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.10748798,  0.30331087,  0.54610646,  0.77344465]])