In [40]:
# Load IPython's autoreload extension.
%load_ext autoreload
# Mode 2: reload all imported modules before each cell is executed.
%autoreload 2
# Disable jedi in the IPython tab completer.
%config Completer.use_jedi = False
from incremental_learning.config import es_cloud_id, es_user, es_password
import numpy as np
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.express as px

from elasticsearch import Elasticsearch


import eland as ed

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Client for the Elastic Cloud deployment, authenticated with the configured user.
es = Elasticsearch(
    cloud_id=es_cloud_id,
    http_auth=(es_user, es_password),
)

# Fields of the experiment index that the baseline comparison reads.
experiment_fields = [
    'config.dataset_name',
    'run.meta.comment',
    'config.training_fraction',
    'config.seed',
    'run.result.baseline.train_error.value',
    'run.result.baseline.test_error.value',
]

# eland DataFrame over the experiment index, restricted to the fields above.
df = ed.DataFrame(
    es_client=es,
    es_index_pattern='experiment-multi-step-sampling',
    columns=experiment_fields,
)

# df1 = df.to_pandas().dropna()

# House dataset 
## Compare to the baseline

In [35]:
# Rows whose run comment equals "baseline estimation".
is_baseline_run = df['run.meta.comment'] == "baseline estimation"
df[is_baseline_run]

Unnamed: 0,config.dataset_name,run.meta.comment,config.training_fraction,config.seed,run.result.baseline.train_error.value,run.result.baseline.test_error.value


In [26]:
# Select with a list so the result is a one-column DataFrame rather than a Series:
# the following cell reads pdf['run.meta.comment'], which needs a column label
# (on a Series that lookup would be an index-label lookup and raise KeyError).
pdf = df[['run.meta.comment']].to_pandas()

In [32]:
# Distinct values found under the run comment field.
comment_values = pdf['run.meta.comment']
comment_values.unique()

array([nan, 'baseline estimation', 'deduplication test', 'deduplication',
       'enable retrained tree eta optimization',
       'weighted sampling with deduplication', 'metric storage test',
       'Test adding sources', 'Test using comments'], dtype=object)

In [65]:
# Columns requested for the analysis.
columns = [
    'config.dataset_name',
    'comment',
    'config_sampling_mode',
    'training_fraction',
    'updated_model.fraction_of_train',
    'updated_model.hyperparameters.retrained_tree_eta',
    'updated_model.test_error',
    'updated_model.train_error',
    'seed',
    'step',
]

# Field names to check the requested columns against.
display_names = [
    'comment',
    'config_sampling_mode',
    'dataset_name',
    'experiment_uid',
    'run.comment',
    'run.config.analysis.parameters.data_summarization_fraction',
    'run.config.analysis.parameters.early_stopping_enabled',
    'run.config.analysis.parameters.max_optimization_rounds_per_hyperparameter',
    'run.config.analysis.parameters.prediction_change_cost',
    'run.config.analysis.parameters.py/object',
    'run.config.analysis.parameters.tree_topology_change_penalty',
    'run.config.analysis.py/object',
    'run.config.analysis_parameters.parameters.data_summarization_fraction',
    'run.config.analysis_parameters.parameters.early_stopping_enabled',
    'run.config.analysis_parameters.parameters.max_optimization_rounds_per_hyperparameter',
    'run.config.analysis_parameters.parameters.prediction_change_cost',
    'run.config.analysis_parameters.parameters.tree_topology_change_penalty',
    'run.config.dataset_name',
    'run.config.force_update',
    'run.config.n_largest_multiplier',
    'run.config.sampling_mode',
    'run.config.seed',
    'run.config.test_fraction',
    'run.config.threads',
    'run.config.training_fraction',
    'run.config.update_fraction',
    'run.config.update_steps',
    'run.config.verbose',
    'seed',
    'step',
    'training_fraction',
    'updated_model.elapsed_time',
    'updated_model.forest_statistics.num_trees',
    'updated_model.forest_statistics.tree_depth_mean',
    'updated_model.forest_statistics.tree_depth_std',
    'updated_model.forest_statistics.tree_nodes_max',
    'updated_model.forest_statistics.tree_nodes_mean',
    'updated_model.forest_statistics.tree_nodes_std',
    'updated_model.fraction_of_train',
    'updated_model.hyperparameters.alpha',
    'updated_model.hyperparameters.downsample_factor',
    'updated_model.hyperparameters.eta',
    'updated_model.hyperparameters.eta_growth_rate_per_tree',
    'updated_model.hyperparameters.feature_bag_fraction',
    'updated_model.hyperparameters.gamma',
    'updated_model.hyperparameters.lambda',
    'updated_model.hyperparameters.max_trees',
    'updated_model.hyperparameters.prediction_change_cost',
    'updated_model.hyperparameters.previous_train_loss_gap',
    'updated_model.hyperparameters.previous_train_num_rows',
    'updated_model.hyperparameters.retrained_tree_eta',
    'updated_model.hyperparameters.soft_tree_depth_limit',
    'updated_model.hyperparameters.soft_tree_depth_tolerance',
    'updated_model.hyperparameters.tree_topology_change_penalty',
    'updated_model.test_error',
    'updated_model.train_error',
]

In [66]:
# Print each requested column that does not appear in display_names.
missing_columns = [name for name in columns if name not in display_names]
for name in missing_columns:
    print(name)

config.dataset_name


In [47]:
# Each filter clause is a bool/should wrapper that requires at least one match.

# Dataset name must be "house".
house_dataset_clause = {
    "bool": {
        "should": [
            {"match_phrase": {"config.dataset_name.keyword": "house"}}
        ],
        "minimum_should_match": 1
    }
}

# Run comment must be "baseline estimation".
baseline_comment_clause = {
    "bool": {
        "should": [
            {"match_phrase": {"run.meta.comment.keyword": "baseline estimation"}}
        ],
        "minimum_should_match": 1
    }
}

# The baseline train error must be present on the document.
train_error_exists_clause = {
    "bool": {
        "should": [
            {"exists": {"field": "run.result.baseline.train_error.value"}}
        ],
        "minimum_should_match": 1
    }
}

# All three clauses have to hold for a document to be selected.
baseline = df.es_query({
    "bool": {
        "filter": [
            house_dataset_clause,
            baseline_comment_clause,
            train_error_exists_clause,
        ]
    }
})

In [60]:
# Drop the dataset name and comment fields, then average the remaining fields
# for every (training fraction, seed) pair.
baseline_metrics = baseline.drop(columns=['config.dataset_name', 'run.meta.comment'])
baseline_groups = baseline_metrics.groupby(['config.training_fraction', 'config.seed'])
baseline_df = baseline_groups.agg('mean').reset_index()

In [116]:
# Baseline test error against training fraction, coloured by seed.
fig = px.line(
    baseline_df,
    x='config.training_fraction',
    y='run.result.baseline.test_error.value',
    color='config.seed',
)
fig.show()

In [77]:
# Fields of the metrics index used in the update-step analysis.
metric_fields = [
    'dataset_name',
    'comment',
    'config_sampling_mode',
    'training_fraction',
    'updated_model.fraction_of_train',
    'updated_model.hyperparameters.retrained_tree_eta',
    'updated_model.test_error',
    'updated_model.train_error',
    'seed',
    'step',
]

# eland DataFrame over the metrics index, restricted to the fields above.
data = ed.DataFrame(
    es_client=es,
    es_index_pattern='experiment-multi-step-sampling-metrics',
    columns=metric_fields,
)

In [78]:
# Each filter clause is a bool/should wrapper that requires at least one match.

# Comment must be "enable retrained tree eta optimization".
eta_optimization_clause = {
    "bool": {
        "should": [
            {"match_phrase": {"comment.keyword": "enable retrained tree eta optimization"}}
        ],
        "minimum_should_match": 1
    }
}

# Dataset name must be "house".
house_dataset_clause = {
    "bool": {
        "should": [
            {"match_phrase": {"dataset_name.keyword": "house"}}
        ],
        "minimum_should_match": 1
    }
}

# Both clauses have to hold for a document to be selected.
data = data.es_query({
    "bool": {
        "filter": [
            eta_optimization_clause,
            house_dataset_clause,
        ]
    }
})

In [79]:
# Remove the two fields that were only needed for filtering.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer fails because the columns
# have already been dropped.
data = data.drop(columns=['comment', 'dataset_name'], errors='ignore')

In [89]:
# Pull the selected metrics into pandas, then discard rows with missing values.
data_pandas = data.to_pandas()
data_df = data_pandas.dropna()

In [90]:
# Display the NaN-free pandas copy of the selected metrics.
data_df

Unnamed: 0,config_sampling_mode,training_fraction,updated_model.fraction_of_train,updated_model.hyperparameters.retrained_tree_eta,updated_model.test_error,updated_model.train_error,seed,step
k3e2y3wBUYUoev8F0hb5,nlargest,0.25,0.34998,0.452951,1087295000.0,836651700.0,991068298,0
lHe2y3wBUYUoev8F0xZZ,nlargest,0.25,0.449961,1.0,1109733000.0,790408600.0,991068298,1
lXe2y3wBUYUoev8F0xa6,nlargest,0.25,0.549941,0.398967,1109733000.0,790408600.0,991068298,2
lne2y3wBUYUoev8F1BYZ,nlargest,0.25,0.649922,0.398967,1109733000.0,790408600.0,991068298,3
mHe2y3wBUYUoev8F9xbb,nlargest,0.5,0.59998,0.355451,975588000.0,330261300.0,991068298,0
mXe2y3wBUYUoev8F-BZD,nlargest,0.5,0.699961,0.301159,975588000.0,330261300.0,991068298,1
mne2y3wBUYUoev8F-Ban,nlargest,0.5,0.799941,0.301159,975588000.0,330261300.0,991068298,2
m3e2y3wBUYUoev8F-RYM,nlargest,0.5,0.899922,0.301159,975588000.0,330261300.0,991068298,3
nXe4y3wBUYUoev8FIBbX,nlargest,0.1,0.199961,0.134388,1096357000.0,657942300.0,195341097,0
nne4y3wBUYUoev8FIRY4,nlargest,0.1,0.299941,0.372485,1001643000.0,439889700.0,195341097,1


In [126]:
# Test error averaged over seeds for the 'nlargest' sampling mode, one line per
# training fraction, with the averaged baseline test error overlaid.
fig = go.Figure()

# numeric_only=True leaves out the string column 'config_sampling_mode';
# recent pandas raises a TypeError instead of silently dropping it when
# a mean is requested over a non-numeric column.
data_select = (
    data_df[data_df['config_sampling_mode'] == 'nlargest']
    .groupby(['training_fraction', 'updated_model.fraction_of_train'])
    .mean(numeric_only=True)
    .reset_index()
)

for training_fraction in data_select['training_fraction'].unique():
    subsample = data_select[data_select['training_fraction'] == training_fraction]
    fig.add_trace(go.Scatter(
        x=subsample['updated_model.fraction_of_train'],
        y=subsample['updated_model.test_error'],
        mode='lines',
        # Trace names are strings; convert the numeric fraction explicitly.
        name=str(training_fraction),
    ))

# Baseline: mean over seeds for every training fraction.
baseline_select = (
    baseline_df
    .groupby(['config.training_fraction'])
    .mean(numeric_only=True)
    .reset_index()
)
fig.add_trace(go.Scatter(
    x=baseline_select['config.training_fraction'],
    y=baseline_select['run.result.baseline.test_error.value'],
    mode='lines',
    name='baseline',
))

fig.update_layout(
    title='Test MSE with mean over seeds',
    xaxis_title='Fraction of train data used',
    yaxis_title='MSE',
    legend_title='training fraction',
)

fig.show()

In [127]:
# Median test error over seeds for the 'nlargest' sampling mode, one line per
# training fraction, with the median baseline test error overlaid.
fig = go.Figure()

# numeric_only=True leaves out the string column 'config_sampling_mode';
# recent pandas raises a TypeError instead of silently dropping it when
# a median is requested over a non-numeric column.
data_select = (
    data_df[data_df['config_sampling_mode'] == 'nlargest']
    .groupby(['training_fraction', 'updated_model.fraction_of_train'])
    .median(numeric_only=True)
    .reset_index()
)

for training_fraction in data_select['training_fraction'].unique():
    subsample = data_select[data_select['training_fraction'] == training_fraction]
    fig.add_trace(go.Scatter(
        x=subsample['updated_model.fraction_of_train'],
        y=subsample['updated_model.test_error'],
        mode='lines',
        # Trace names are strings; convert the numeric fraction explicitly.
        name=str(training_fraction),
    ))

# Baseline: median over seeds for every training fraction.
baseline_select = (
    baseline_df
    .groupby(['config.training_fraction'])
    .median(numeric_only=True)
    .reset_index()
)
fig.add_trace(go.Scatter(
    x=baseline_select['config.training_fraction'],
    y=baseline_select['run.result.baseline.test_error.value'],
    mode='lines',
    name='baseline',
))

fig.update_layout(
    title='Test MSE with median over seeds',
    xaxis_title='Fraction of train data used',
    yaxis_title='MSE',
    legend_title='training fraction',
)

fig.show()

In [99]:
# Mean of the numeric metrics per (training fraction, fraction of train data used)
# for the 'nlargest' sampling mode. numeric_only=True leaves out the string column
# 'config_sampling_mode', which recent pandas refuses to average (TypeError).
nlargest_runs = data_df[data_df['config_sampling_mode'] == 'nlargest']
data_select = (
    nlargest_runs
    .groupby(['training_fraction', 'updated_model.fraction_of_train'])
    .mean(numeric_only=True)
    .reset_index()
)


In [107]:
# Display the per-group aggregate computed in the previous cell.
data_select

Unnamed: 0,training_fraction,updated_model.fraction_of_train,updated_model.hyperparameters.retrained_tree_eta,updated_model.test_error,updated_model.train_error,seed,step
0,0.1,0.199961,0.335571,1171535000.0,754429500.0,693328056.6,0.0
1,0.1,0.299941,0.38319,1041381000.0,543685200.0,693328056.6,1.0
2,0.1,0.399922,0.23265,1034137000.0,530511800.0,693328056.6,2.0
3,0.1,0.499902,0.285185,1023154000.0,521154800.0,693328056.6,3.0
4,0.1,0.599883,0.158893,1107394000.0,470072700.0,991068298.0,4.0
5,0.1,0.699863,0.072252,1097101000.0,476810400.0,991068298.0,5.0
6,0.1,0.799844,0.049522,1095774000.0,476654500.0,991068298.0,6.0
7,0.1,0.899824,0.398613,1094657000.0,474002500.0,991068298.0,7.0
8,0.1,0.999805,0.398613,1096877000.0,479790900.0,991068298.0,8.0
9,0.1,1.0,0.049522,1096936000.0,479857100.0,991068298.0,9.0
