In [1]:
import numpy as np
import pandas as pd
import nannyml as nml
from sklearn.datasets import fetch_california_housing

In [64]:
def evaluate_nannyml(aging_df, metric):
    """Compare NannyML DLE performance estimates against realized performance.

    For every ``simulation_id`` in ``aging_df`` a fresh ``nml.DLE`` estimator is
    fitted on the rows spanning that simulation's ``'test'`` partition and then
    used to estimate ``metric`` on the rows spanning its ``'prod'`` partition.

    This function reads the notebook-level ``data`` DataFrame (the model
    features); it must be defined before the function is called.

    Parameters
    ----------
    aging_df : pandas.DataFrame
        Aging results with at least the columns ``simulation_id``,
        ``partition``, ``y``, ``y_pred`` and ``timestamp``. Its index is joined
        against the index of ``data``.
    metric : str
        Metric name handed to ``nml.DLE`` (e.g. ``'mape'``). The same name is
        used to select the matching column group from the estimator results.

    Returns
    -------
    pandas.DataFrame
        The per-chunk results of all simulations stacked on top of each other,
        with two extra boolean columns: ``estimated_alert`` (the alert raised by
        NannyML) and ``realized_alert`` (realized metric above the upper
        threshold).
    """
    simulation_ids = aging_df['simulation_id'].unique()
    # attach the feature columns to every aging row (index-on-index join)
    nml_data = aging_df.merge(data, left_index=True, right_index=True, how='left')
    feature_columns = data.columns.tolist()

    pe_results = []
    for simulation_id in simulation_ids:
        simulation_df = nml_data[nml_data['simulation_id'] == simulation_id]

        # NOTE(review): both slices below are label slices over the merged frame
        # of *all* simulations, not over simulation_df -- confirm that rows of
        # other simulations sharing these index labels are meant to be included.

        # reference period: first to last index label of the test partition
        test_aging_results = simulation_df[simulation_df['partition'] == 'test']
        idx_test_start, idx_test_end = test_aging_results.index[0], test_aging_results.index[-1]
        reference_df = nml_data[idx_test_start:idx_test_end]

        # analysis period: first to last index label of the prod partition
        prod_aging_results = simulation_df[simulation_df['partition'] == 'prod']
        idx_prod_start, idx_prod_end = prod_aging_results.index[0], prod_aging_results.index[-1]
        analysis_df = nml_data[idx_prod_start:idx_prod_end]

        # a new estimator per simulation so no fitted state leaks between runs
        estimator = nml.DLE(
            feature_column_names=feature_columns,
            y_pred='y_pred',
            y_true='y',
            timestamp_column_name='timestamp',
            metrics=[metric],
            chunk_period='d',  # one chunk per day
            tune_hyperparameters=False
        )

        estimator.fit(reference_df)
        # performance estimation results
        pe_result = estimator.estimate(analysis_df)

        # Select the column group of the requested metric (previously hard-coded
        # to 'mape') and copy it so the new columns are added to an independent
        # frame rather than to a view of the results.
        pe_comparison = pe_result.filter(period='analysis').to_df()[metric].copy()
        pe_comparison['estimated_alert'] = pe_comparison['alert']
        # the realized metric breached the upper threshold
        pe_comparison['realized_alert'] = (
            pe_comparison['realized'] > pe_comparison['upper_threshold']
        )

        pe_results.append(pe_comparison)

    return pd.concat(pe_results)

In [65]:
# California housing features and target, both re-indexed onto one hourly timeline
data, target = fetch_california_housing(return_X_y=True, as_frame=True)
timestamp = pd.date_range(start='1/1/2018', freq='H', periods=len(data))
data.index = target.index = timestamp

# aging simulation results stored as a parquet file
aging_results = pd.read_parquet('../results/aging/cal_house/aging_cal_house_10_simulations_3000_prod.parquet')

In [66]:
# Display the aging results frame read from the parquet file.
aging_results

Unnamed: 0,y,y_pred,partition,timestamp,model_age,is_model_valid,simulation_id
2018-04-11 07:00:00,0.658,0.727544,train,2018-04-11 07:00:00,-313,True,0
2018-04-11 08:00:00,0.692,0.711161,train,2018-04-11 08:00:00,-313,True,0
2018-04-11 09:00:00,0.725,0.681597,train,2018-04-11 09:00:00,-313,True,0
2018-04-11 10:00:00,0.730,0.871190,train,2018-04-11 10:00:00,-313,True,0
2018-04-11 11:00:00,1.083,0.924631,train,2018-04-11 11:00:00,-313,True,0
...,...,...,...,...,...,...,...
2020-05-07 13:00:00,1.234,1.489927,prod,2020-05-07 13:00:00,539,True,9
2020-05-07 14:00:00,0.927,1.314623,prod,2020-05-07 14:00:00,539,True,9
2020-05-07 15:00:00,1.375,1.547603,prod,2020-05-07 15:00:00,539,True,9
2020-05-07 16:00:00,0.698,1.189947,prod,2020-05-07 16:00:00,539,True,9


In [67]:
# Run the NannyML evaluation for every simulation in the aging results, using MAPE.
pe_comparison = evaluate_nannyml(aging_df=aging_results, metric='mape')


Using categorical_feature in Dataset.


Using categorical_feature in Dataset.


Using categorical_feature in Dataset.


Using categorical_feature in Dataset.


Using categorical_feature in Dataset.


Using categorical_feature in Dataset.


Using categorical_feature in Dataset.


Using categorical_feature in Dataset.


Using categorical_feature in Dataset.


Using categorical_feature in Dataset.



In [68]:
# Show the first rows of the stacked per-chunk comparison results.
pe_comparison.head()

Unnamed: 0,sampling_error,realized,value,upper_confidence_boundary,lower_confidence_boundary,upper_threshold,lower_threshold,alert,estimated_alert,realized_alert
0,0.026024,0.392031,0.43786,0.515934,0.359787,0.354894,0.0,True,True,True
1,0.022538,0.333964,0.43786,0.505474,0.370247,0.354894,0.0,True,True,False
2,0.022538,0.197725,0.43786,0.505474,0.370247,0.354894,0.0,True,True,False
3,0.022538,0.214981,0.43786,0.505474,0.370247,0.354894,0.0,True,True,False
4,0.022538,0.233812,0.43786,0.505474,0.370247,0.354894,0.0,True,True,False


In [69]:
# Count how many chunks did / did not breach the upper threshold in reality.
pe_comparison['realized_alert'].value_counts()

False    662
True     598
Name: realized_alert, dtype: int64

In [70]:
# Split the chunks by whether the realized metric actually breached the threshold.
realized_mask = pe_comparison['realized_alert'].astype(bool)
positive_alerts = pe_comparison[realized_mask]
negative_alerts = pe_comparison[~realized_mask]

# A chunk counts as correct when NannyML's alert matches the realized alert.
tp_alerts = (positive_alerts['estimated_alert'] == positive_alerts['realized_alert']).sum()
tn_alerts = (negative_alerts['estimated_alert'] == negative_alerts['realized_alert']).sum()

# Percentages; NaN when there is nothing to divide by.
# The first denominator was the undefined name `true_alerts`; it is the number
# of realized degradations, i.e. len(positive_alerts).
degradation_pct = (
    np.round(tp_alerts / len(positive_alerts) * 100, 1) if len(positive_alerts) else float('nan')
)
overall_pct = (
    np.round((tp_alerts + tn_alerts) / len(pe_comparison) * 100, 1) if len(pe_comparison) else float('nan')
)

print(f"""NannyML estimated {tp_alerts} ({degradation_pct} %) of the degradations correctly""")
print(f"""NannyML estimated {tp_alerts + tn_alerts} ({overall_pct} %) of the alerts correctly""")

NannyML estimated 142 (23.7 %) of the degradations correctly
NannyML estimated 600 (47.6 %) of the alerts correctly


TODO: Fit NannyML on more than the test set. Use the test set plus all the data between the end of the test set and the start of the production period.