In [1]:
import numpy as np
import datetime as dt
import tensorflow as tf
from pailab import MLRepo, MeasureConfiguration, MLObjectType, FIRST_VERSION, LAST_VERSION
from pailab.numpy_handler_hdf import NumpyHDFStorage
import logging
from pailab.tools.tools import MLTree

# Configure the root logger at ERROR level to keep log output in the notebook short.
logging.basicConfig(level=logging.ERROR)

In [2]:
# Storage handler for numpy data rooted at C:/temp/ (HDF-based, judging by the class name).
# NOTE(review): the exact effect of version_files=False is not visible in this notebook — confirm.
handler = NumpyHDFStorage('C:/temp/',version_files = False)


In [3]:
# Version '1' of the 'test' entry: x holds 20 zeros, y is a 2x3 block of ones.
v1 = dict(x=np.zeros(20), y=np.ones((2, 3)))
handler.add('test', '1', v1)

In [4]:
# Data appended on top of version '1' to form version '2': 5 ones for x, 4 zero rows for y.
v2 = dict(x=np.ones(5), y=np.zeros((4, 3)))
handler.append('test', '1', '2', v2)

In [5]:
# Read back version '1'; the displayed result matches the arrays stored with add() (20 zeros, 2x3 ones).
handler.get('test', '1')

{'x': array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.]), 'y': array([[ 1.,  1.,  1.],
        [ 1.,  1.,  1.]])}

In [6]:
# Data appended on top of version '2' to form version '3': one zero for x, one row of ones for y.
v3 = dict(x=np.zeros(1), y=np.ones((1, 3)))
handler.append('test', '2', '3', v3)

In [8]:
# Read back version '2'; the displayed result is version '1' followed by the appended v2 entries
# (x: 20 zeros then 5 ones, y: 2 rows of ones then 4 rows of zeros).
handler.get('test', '2')

{'x': array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  1.]),
 'y': array([[ 1.,  1.,  1.],
        [ 1.,  1.,  1.],
        [ 0.,  0.,  0.],
        [ 0.,  0.,  0.],
        [ 0.,  0.,  0.],
        [ 0.,  0.,  0.]])}

## Can be removed

In [None]:
# Open the ML repository whose workspace directory is c:/ml_repos/sc.
ml_repo = MLRepo( workspace = 'c:/ml_repos/sc')
import pailab.analysis.plot as plot
# Presumably this is what provides the ml_repo.tree accessor used by the plotting cells below — confirm.
MLTree.add_tree(ml_repo)

In [None]:
# Build the ObjectOverviewList widget for ml_repo and render it in the notebook.
from IPython.display import display, clear_output
from pailab.tools_jupyter import ObjectOverviewList
repo_overview = ObjectOverviewList(ml_repo)
display(repo_overview.get_widget())

In [None]:
# Build the ObjectView widget for ml_repo and render it in the notebook.
from pailab.tools_jupyter import ObjectView
obj_view = ObjectView(ml_repo)
display(obj_view.get_widget())

In [None]:
# Load the full 'simple_dense/training_stat' object and print its loss attribute.
# NOTE(review): the name `depp` is rebound to a function further down in this notebook.
depp = ml_repo.get('simple_dense/training_stat', full_object=True)
print(depp.loss)

In [None]:
import h5py

# Consistency checks

In [None]:
# Build the ConsistencyChecker widget for ml_repo and render it (uses display imported in an earlier cell).
from pailab.tools_jupyter import ConsistencyChecker
consistency_checker = ConsistencyChecker(ml_repo)
display(consistency_checker.get_widget())

In [None]:
import pailab.tools.checker as checker
# Run checker.Repo.run on the repository; its return value is shown as the cell output.
checker.Repo.run(ml_repo)

# Plotting

## Plot data

In [None]:
# Histogram (100 bins) of the 'vol_of_variance' coordinate of the test data, index range 0..10000.
# NOTE(review): 'last' presumably selects the latest version of the data set — confirm.
plot.histogram_data(ml_repo, {ml_repo.tree.test_data.sc_call_prices_test():'last'}, x_coordinate = 'vol_of_variance', start_index=0, end_index=10000, n_bins=100) #, y_coordinate='MEDV')

## Measure history

In [None]:
#import pailab.analysis.plot as plot
# Plot the history of the measure 'simple_dense/measure/sc_call_prices/mse'.
plot.measure_history(ml_repo, measure_name=['simple_dense/measure/sc_call_prices/mse'])

## Error vs training parameter

In [None]:
# Build the Plotter widget for ml_repo and render it (uses display imported in an earlier cell).
from pailab.tools_jupyter import Plotter
plotter = Plotter(ml_repo)
display(plotter.get_widget())

## Projection

In [None]:
import pailab.analysis.plot as plot
data_1 = ml_repo.get('sc_call_prices', full_object = True)

# Build both end points of the projection from sample 100 of the data.
# The row is copied first: a basic slice of a numpy array is a view, so writing
# into it directly would silently modify data_1.x_data itself.
base_point = np.copy(data_1.x_data[100, :])
base_point[6] = 1.0

# The two end points share every coordinate except index 5 (0.7 vs. 1.3).
left = np.copy(base_point)
right = np.copy(base_point)
left[5] = 0.7
right[5] = 1.3

# The printed names show which inputs indices 5 and 6 refer to.
print(data_1.x_coord_names)
plot.projection(ml_repo, left = left, right=right, n_steps = 500, output_index = 0)

## Histogram of errors

In [None]:
# Histogram (300 bins) of the simple_dense model's error on the test data, limited by end_index = 5000.
plot.histogram_model_error(ml_repo, ml_repo.tree.models.simple_dense.model(), ml_repo.tree.test_data.sc_call_prices_test(), end_index = 5000, n_bins=300)

## Error vs input data

In [None]:
# Scatter plot of the simple_dense model's error against the 'vol_of_variance' input, index range 0..5000.
plot.scatter_model_error(ml_repo, ml_repo.tree.models.simple_dense.model(), 
                         [ml_repo.tree.test_data.sc_call_prices_test()], 'vol_of_variance', start_index= 0 , end_index = 5000)

## Data distribution of largest errors

In [None]:
    # Histogram (100 bins) of the 'strike' coordinate, conditional on the model error, index range 0..10000.
    # NOTE(review): percentile = 0.01 presumably restricts to the data points with the largest errors,
    # as the section heading suggests — confirm. The leading indentation of this cell is unnecessary.
    plot.histogram_data_conditional_error(ml_repo,  ml_repo.tree.models.simple_dense.model(),
                                          ml_repo.tree.test_data.sc_call_prices_test(), x_coordinate = 'strike', 
                                          start_index = 0, end_index=10000, percentile = 0.01, n_bins=100)

# Test for interpretability

In [None]:
import numpy as np
from pailab.tools.tools import ModelAnalyzer

In [None]:
from joblib import dump, load
def save_load_tmp(model_analyzer, version, tmp_dir='C:/ml_repos/sc/tmp/'):
    """Restore the analyzer's decision tree from disk, or store it there.

    If ``model_analyzer._decision_tree`` is ``None`` the tree is loaded from
    ``<tmp_dir>/<version>.joblib``; otherwise the current tree is written to
    that file.

    Args:
        model_analyzer: Object exposing a ``_decision_tree`` attribute.
        version (str): Version id used as the file name (without extension).
        tmp_dir (str): Directory holding the ``.joblib`` files. Defaults to the
            location this notebook has always used.
    """
    import os  # local import so the cell does not depend on a notebook-level import

    cache_file = os.path.join(tmp_dir, version + '.joblib')
    if model_analyzer._decision_tree is None:
        # NOTE: joblib.load unpickles the file, so only load files this notebook wrote itself.
        model_analyzer._decision_tree = load(cache_file)
    else:
        dump(model_analyzer._decision_tree, cache_file)

In [None]:
# Analysis of the simple_dense model on the test data without an explicit version argument
# (presumably the latest version, as the variable name suggests — confirm).
model_analyzer_latest = ModelAnalyzer(ml_repo)
result_latest = model_analyzer_latest.analyze(ml_repo.tree.models.simple_dense.model(), ml_repo.tree.test_data.sc_call_prices_test(),n_samples = 40, end_index = 2000, max_depth=4)#, version='2f777012-2ea7-11e9-83cf-fc084a6691eb' , force_recalc=True)
# Store the analyzer's decision tree under the result's version, or load it if analyze() left it unset.
save_load_tmp(model_analyzer_latest, result_latest.repo_info.version)
# Same analysis with an explicitly pinned version id; both results are plotted side by side below.
model_analyzer = ModelAnalyzer(ml_repo)
result = model_analyzer.analyze(ml_repo.tree.models.simple_dense.model(), ml_repo.tree.test_data.sc_call_prices_test(), n_samples = 40, end_index = 2000,  max_depth=4, version='2f777012-2ea7-11e9-83cf-fc084a6691eb') 
save_load_tmp(model_analyzer, result.repo_info.version)

In [None]:
#print(result.repo_info.name)
# Delete three specific versions of 'model_analyzer_simple_dense/model_sc_call_prices_test' from the repository.
# NOTE(review): destructive, with hard-coded version ids — re-running this cell is presumably not intended.
for v in ['f56f62e4-3a55-11e9-8e06-fc084a6691eb', '5e9d7380-3a56-11e9-9448-fc084a6691eb', '85d064f0-3ab7-11e9-969b-fc084a6691eb']:
    ml_repo.delete('model_analyzer_simple_dense/model_sc_call_prices_test', v)

In [None]:
# NOTE(review): importing `plot` here rebinds the name `plot`, which the plotting cells earlier in
# this notebook use for the module pailab.analysis.plot. Re-running those cells after this one
# would look up the pailab functions on plotly's `plot` instead. The later cells work around this
# by importing the module as `pailab_plot`.
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
#import plotly.plotly as py

def plot_local_model_coeff_by_coord(result):
    """Show the local model coefficients as a grouped bar chart, one bar series per leaf.

    Args:
        result (dict): Must contain 'x_coord_names' (labels for the x axis) and
            'node_statistics', a mapping from leaf id to a dict that has a
            'model_coefficients' entry.
    """
    coord_names = result['x_coord_names']
    # One trace per leaf; the bars of a trace are that leaf's coefficients per coordinate.
    traces = [
        go.Bar(x=coord_names, y=stats['model_coefficients'], name='leaf ' + str(leaf_id))
        for leaf_id, stats in result['node_statistics'].items()
    ]
    figure = go.Figure(data=traces, layout=go.Layout(barmode='group'))
    iplot(figure, filename='grouped-bar')
    
# Draw the chart for the version-pinned analysis and for the latest analysis.
plot_local_model_coeff_by_coord(result.result)
plot_local_model_coeff_by_coord(result_latest.result)


In [None]:
def plot_local_model_coeff_by_leaf(result):
    """Show the local model coefficients as a stacked bar chart, one bar per leaf.

    Args:
        result (dict): Must contain 'x_coord_names' and 'node_statistics', a
            mapping from leaf id to a dict that has a 'model_coefficients'
            sequence indexable by coordinate position.
    """
    coord_names = result['x_coord_names']
    leaf_stats = result['node_statistics']

    traces = []
    # One trace per input coordinate; each trace holds that coordinate's coefficient in every leaf.
    for coord_idx, coord_name in enumerate(coord_names):
        leaf_labels = ['leaf ' + str(leaf_id) for leaf_id in leaf_stats]
        coefficients = [stats['model_coefficients'][coord_idx] for stats in leaf_stats.values()]
        traces.append(go.Bar(x=leaf_labels, y=coefficients, name=coord_name))

    figure = go.Figure(data=traces, layout=go.Layout(barmode='stack'))
    iplot(figure, filename='grouped-bar')
    
def depp():
    """Draw two bar charts per leaf: number of data points, then mean MSE.

    NOTE(review): this function cannot run as written. ``leaf_nodes`` is not
    defined at module level anywhere in the visible notebook, and ``result`` is
    read from the global namespace and indexed by leaf id directly. The function
    is never called in the visible notebook, and its name reuses that of the
    ``depp`` variable assigned in an earlier cell. Confirm whether it can be
    deleted.
    """
    data = []
    num_values = []
    mse_values = []
    x_names = []
    # Collect one label, one data-point count and one mean MSE per leaf.
    for x_ in leaf_nodes:
        x_names.append('leaf ' + str(x_))
        num_values.append(result[x_]['num_data_points'])
        mse_values.append(result[x_]['mse_mean'])
    #print(num_values)

    data.append(go.Bar(x=x_names,y=num_values))
    # NOTE(review): 'bar' does not appear among plotly's documented barmode values
    # ('stack', 'group', 'overlay', 'relative') — confirm and fix if this function is kept.
    layout = go.Layout(
        barmode='bar'
    )
    fig = go.Figure(data=data, layout=layout)
    iplot(fig, filename='grouped-bar')


    # Second chart: mean MSE per leaf, reusing the same layout.
    fig = go.Figure(data=[go.Bar(x=x_names,y=mse_values)], layout=layout)
    iplot(fig, filename='grouped-bar')


# Draw the chart for the version-pinned analysis and for the latest analysis.
plot_local_model_coeff_by_leaf(result.result)
plot_local_model_coeff_by_leaf(result_latest.result)


In [None]:
def plot_error_per_leaf(result):
    """Plot mean, minimum and maximum MSE of the local models as three lines over the leaves.

    Args:
        result (dict): Must contain 'node_statistics', a mapping from leaf id to
            a dict with the entries 'mse_mean', 'mse_min' and 'mse_max'.
    """
    # One row per leaf: (label, mean, min, max); transposed into columns below.
    rows = [
        ('leaf ' + str(leaf_id), stats['mse_mean'], stats['mse_min'], stats['mse_max'])
        for leaf_id, stats in result['node_statistics'].items()
    ]
    if rows:
        leaf_labels, mean_mse, min_mse, max_mse = map(list, zip(*rows))
    else:
        leaf_labels, mean_mse, min_mse, max_mse = [], [], [], []

    traces = [
        go.Scatter(x=leaf_labels, y=series, name=series_name)
        for series, series_name in ((mean_mse, 'mean'), (min_mse, 'min'), (max_mse, 'max'))
    ]
    layout = go.Layout(
        title='local model mse',
        xaxis=dict(title='leaf'),
        yaxis=dict(title=''),
    )
    iplot(go.Figure(data=traces, layout=layout), filename='grouped-bar')

# Draw the chart for the version-pinned analysis and for the latest analysis.
plot_error_per_leaf(result.result)
plot_error_per_leaf(result_latest.result)
#result.big_data['local_model_coeff']

In [None]:
import pailab.analysis.plot as pailab_plot
def plot_mse_quantile_data_dist(ml_repo, result, eval_data, x_coord, q = 50):
    """Plot the histogram of one input coordinate over the data points with a high MSE.

    Only data points whose entry in ``result.big_data['mse']`` is strictly above
    the ``q``-th percentile of all those entries are included.

    Args:
        ml_repo: Repository the evaluation data is loaded from.
        result: Analysis result; this function reads ``result.big_data['mse']``,
            ``result.result['parameter']`` ('start_index' / 'end_index') and
            ``result.repo_info.modification_info``.
        eval_data (str): Name of the evaluation data set; also the key into
            ``modification_info`` that yields the version to load.
        x_coord (str): Name of the input coordinate to plot.
        q (float): Percentile in the range accepted by ``np.percentile`` (0-100).

    Raises:
        ValueError: If ``x_coord`` is not one of the data's ``x_coord_names``.
    """
    eval_obj = ml_repo.get(eval_data, result.repo_info.modification_info[eval_data], full_object=True)
    coord_idx = eval_obj.x_coord_names.index(x_coord)

    mse = result.big_data['mse']
    threshold = np.percentile(mse, q)

    # Restrict the data to the index range the analysis was run on.
    # NOTE(review): assumes mse has one entry per row of that range — confirm.
    params = result.result['parameter']
    column = eval_obj.x_data[params['start_index']:params['end_index'], coord_idx]
    selected = np.extract(mse > threshold, column)

    plot_data = {
        'title': 'histogram of ' + x_coord + ', mse > ' + str(q) + '% quantile',
        'x0_name': x_coord,
        'data': {'': {'info': {}, 'x0': selected}},
    }
    pailab_plot._histogram(plot_data)
       
# Distribution of 'expiry' over the test data points whose MSE lies above the 90% percentile.
plot_mse_quantile_data_dist(ml_repo, result, ml_repo.tree.test_data.sc_call_prices_test(), 'expiry', q=90)
#result.big_data['mse']

In [None]:
def plot_leaf_node_data_dist(ml_repo, result, eval_data, x_coord, leaf_node):
    """Plot, for one or several leaf nodes, the histogram of one input coordinate.

    For every requested leaf, the values of ``x_coord`` are taken from those
    evaluation-data rows that ``result.big_data['data_to_leaf_index']`` assigns
    to that leaf. All leaves are drawn into one histogram plot.

    Args:
        ml_repo: Repository the evaluation data is loaded from.
        result: Analysis result; this function reads
            ``result.big_data['data_to_leaf_index']``,
            ``result.result['parameter']`` ('start_index' / 'end_index') and
            ``result.repo_info.modification_info``.
        eval_data (str): Name of the evaluation data set; also the key into
            ``modification_info`` that yields the version to load.
        x_coord (str): Name of the input coordinate to plot.
        leaf_node: A single leaf id (string or number) or an iterable of leaf ids.

    Raises:
        ValueError: If ``x_coord`` is not one of the data's ``x_coord_names``.
    """
    # Wrap any scalar id (str, int, numpy integer, ...) so a single leaf behaves
    # like a one-element list; previously only strings were handled.
    leaf_ids = [leaf_node] if np.isscalar(leaf_node) else list(leaf_node)

    # Everything below is independent of the leaf, so it is done once instead of
    # once per leaf (the repository load in particular).
    eval_obj = ml_repo.get(eval_data, result.repo_info.modification_info[eval_data], full_object=True)
    coord_idx = eval_obj.x_coord_names.index(x_coord)
    data_to_leaf_index = result.big_data['data_to_leaf_index']
    params = result.result['parameter']
    column = eval_obj.x_data[params['start_index']:params['end_index'], coord_idx]

    plt_data = {}
    for leaf_id in leaf_ids:
        values = np.extract(data_to_leaf_index == leaf_id, column)
        plt_data['leaf ' + str(leaf_id)] = {'info': {}, 'x0': values}

    # The title names every plotted leaf, not just the last one of the loop.
    plot_data = {
        'title': 'histogram of ' + x_coord + ', leafnode = ' + ', '.join(str(leaf_id) for leaf_id in leaf_ids),
        'x0_name': x_coord,
        'data': plt_data,
    }
    pailab_plot._histogram(plot_data)
# Distribution of 'expiry' over the test data points that fall into leaf nodes 12 and 19.
plot_leaf_node_data_dist(ml_repo, result, ml_repo.tree.test_data.sc_call_prices_test(), 'expiry', leaf_node=[12,19])

In [None]:
# Ad-hoc inspection: displays the complete big_data of the result (may be a large output).
result.big_data

In [93]:
import h5py
# Show the installed h5py version (the recorded output is '2.9.0').
h5py.__version__

'2.9.0'

In [95]:
# Leftover IPython help query for h5py.VirtualSource; has no effect on the analysis.
?h5py.VirtualSource