In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import time

# turn off FutureWarning
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
dataset_name = 'insects'
detector_name = 'KS'

In [4]:
data_df = pd.read_excel(f'{dataset_name}/{dataset_name}_monitored_{detector_name}.xlsx')

In [5]:
data_df

Unnamed: 0,feat_1,feat_2,ref_corr,analysis_corr,corr_delta,feat_1_cd_score,feat_2_cd_score,delta_cd_score,feat_1_drift,feat_2_drift,single_feat_drift,feat_pair_drift
0,feat_10,feat_20,0.537515,0.542033,0.004518,0.079808,0.173922,0.094113,0,1,1,0
1,feat_10,feat_117,0.088667,0.067368,0.021299,0.079808,0.315089,0.235280,0,1,1,0
2,feat_10,feat_114,0.098289,0.057338,0.040951,0.079808,0.311109,0.231300,0,1,1,0
3,feat_10,feat_1,0.659821,0.682370,0.022549,0.079808,0.124413,0.044604,0,0,0,0
4,feat_10,feat_105,0.163490,0.116649,0.046842,0.079808,0.298078,0.218269,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
19895,feat_121,feat_98,0.204743,0.315981,0.111238,0.310222,0.239485,0.070738,1,1,0,1
19896,feat_121,feat_187,0.006346,0.116439,0.110093,0.310222,0.073904,0.236318,1,0,1,0
19897,feat_181,feat_98,0.246091,0.104758,0.141333,0.079982,0.239485,0.159503,0,1,1,0
19898,feat_181,feat_187,0.723333,0.603044,0.120289,0.079982,0.073904,0.006078,0,0,0,0


In [6]:
# Create the reversed pairs
reversed_df = data_df.rename(columns={'feat_1': 'feat_2', 'feat_2': 'feat_1', 'feat_1_drift': 'feat_2_drift', 'feat_2_drift': 'feat_1_drift', 'feat_1_cd_score': 'feat_2_cd_score', 'feat_2_cd_score': 'feat_1_cd_score'})

# Concatenate the original dataframe with the reversed pairs
data_df = pd.concat([data_df, reversed_df], ignore_index=True)

In [7]:
data_df.feat_1.nunique()

200

In [13]:
# put all above into a function

def compute_results(data_df, dataset_name, fold, corr_threshold, num_monitored_features):
    m = data_df['feat_1'].nunique()
    num_non_monitored_features = m - num_monitored_features
    non_monitored_ratio = num_non_monitored_features / m

    monitored_features = np.random.choice(data_df['feat_1'].unique(), num_monitored_features, replace=False)
    non_monitored_features = np.setdiff1d(data_df['feat_1'].unique(), monitored_features)

    pred_features_drifting = data_df[(data_df['feat_1'].isin(monitored_features)) & (data_df['feat_2'].isin(non_monitored_features)) & (data_df['ref_corr']>=corr_threshold)].drop_duplicates(subset=['feat_2']).feat_2.unique()

    # num_monitored_features_drifting = data_df[data_df['feat_1'].isin(monitored_features)].drop_duplicates(subset=['feat_1']).feat_1_drift.sum()

    num_non_monitored_features_drifting = data_df[data_df['feat_1'].isin(non_monitored_features)].drop_duplicates(subset=['feat_1']).feat_1_drift.sum()

    all_features_drifting = data_df[data_df['feat_1_drift']==1].drop_duplicates(subset=['feat_1']).feat_1.unique()

    all_features_non_drifting = data_df[data_df['feat_1_drift']==0].drop_duplicates(subset=['feat_1']).feat_1.unique()

    tp_features_drifting = set(pred_features_drifting).intersection(all_features_drifting)

    pred_features_non_drifting = set(non_monitored_features) - set(pred_features_drifting)

    tn_features_drifting = set(pred_features_non_drifting).intersection(all_features_non_drifting)

    num_tn_features_drifting = len(tn_features_drifting)

    num_tp_features_drifting = len(tp_features_drifting)

    num_pred_features_drifting_non_monitored = len(set(pred_features_drifting).intersection(non_monitored_features))

    results_df_non_monitored = pd.DataFrame(columns=['dataset', 'corr_threshold', 'num_monitored_features', 'fold', 'num_non_monitored_features', 'non_monitored_ratio', 'num_tp_features_drifting', 'num_tn_features_drifting', 'num_non_monitored_features_drifting', 'num_pred_features_drifting_non_monitored'])

    results_df_non_monitored = results_df_non_monitored.append({'dataset': dataset_name,
                                                                'corr_threshold': corr_threshold,
                                                                'num_monitored_features': num_monitored_features,
                                                                'fold': fold,
                                                                'num_non_monitored_features': num_non_monitored_features,
                                                                'non_monitored_ratio': non_monitored_ratio,
                                                                'num_tp_features_drifting': num_tp_features_drifting,
                                                                'num_tn_features_drifting': num_tn_features_drifting,
                                                                'num_non_monitored_features_drifting': num_non_monitored_features_drifting,
                                                                'num_pred_features_drifting_non_monitored': num_pred_features_drifting_non_monitored}, ignore_index=True)
    
    try:
        results_df_non_monitored['accuracy'] = (results_df_non_monitored['num_tp_features_drifting'] + results_df_non_monitored['num_tn_features_drifting']) / results_df_non_monitored['num_non_monitored_features']
        results_df_non_monitored['recall'] = results_df_non_monitored['num_tp_features_drifting'] / results_df_non_monitored['num_non_monitored_features_drifting']
        results_df_non_monitored['precision'] = results_df_non_monitored['num_tp_features_drifting'] / num_pred_features_drifting_non_monitored
        results_df_non_monitored['f1_score'] = 2 * (results_df_non_monitored['precision'] * results_df_non_monitored['recall']) / (results_df_non_monitored['precision'] + results_df_non_monitored['recall'])
    except ZeroDivisionError:
        results_df_non_monitored['accuracy'] = 0
        results_df_non_monitored['recall'] = 0
        results_df_non_monitored['precision'] = 0
        results_df_non_monitored['f1_score'] = 0
    
    return results_df_non_monitored

In [15]:
compute_results(data_df, dataset_name, 0, 0.1, 10)

Unnamed: 0,dataset,corr_threshold,num_monitored_features,fold,num_non_monitored_features,non_monitored_ratio,num_tp_features_drifting,num_tn_features_drifting,num_non_monitored_features_drifting,num_pred_features_drifting_non_monitored,accuracy,recall,precision,f1_score
0,insects,0.1,10,0,190,0.95,65,0,65,190,0.342105,1.0,0.342105,0.509804


In [16]:
# run a 10 fold cross validation, append the results to a final_measurements dataframe
final_measurements = pd.DataFrame(columns=['dataset', 'corr_threshold', 'num_monitored_features', 'fold', 'num_non_monitored_features', 'non_monitored_ratio', 'num_tp_features_drifting', 'num_tn_features_drifting', 'num_non_monitored_features_drifting', 'num_pred_features_drifting_non_monitored', 'accuracy', 'recall', 'precision', 'f1_score'])

m=data_df['feat_1'].nunique()
for corr_threshold in np.arange(0.1, 1.0, 0.1):
    print(corr_threshold)
    for num_monitored_features in range(int(m).bit_length()):
        num_monitored_features_val = int(m / (2 ** num_monitored_features))
        if num_monitored_features_val < 1:
            break
        for fold in range(10):
            start_time = time.time()
            results_df_non_monitored = compute_results(data_df, dataset_name, fold, corr_threshold, num_monitored_features_val)
            runtime = time.time() - start_time
            print(f"Runtime: {runtime} seconds")
            final_measurements = final_measurements.append(results_df_non_monitored, ignore_index=True)

0.1
Runtime: 0.01504969596862793 seconds
Runtime: 0.013999223709106445 seconds
Runtime: 0.015001296997070312 seconds
Runtime: 0.014997005462646484 seconds
Runtime: 0.014002084732055664 seconds
Runtime: 0.012999534606933594 seconds
Runtime: 0.013998985290527344 seconds
Runtime: 0.014002323150634766 seconds
Runtime: 0.013998746871948242 seconds
Runtime: 0.013999700546264648 seconds
Runtime: 0.018000364303588867 seconds
Runtime: 0.01600050926208496 seconds
Runtime: 0.01699972152709961 seconds
Runtime: 0.019005537033081055 seconds
Runtime: 0.01759505271911621 seconds
Runtime: 0.01600027084350586 seconds
Runtime: 0.016000032424926758 seconds
Runtime: 0.01600027084350586 seconds
Runtime: 0.01600027084350586 seconds
Runtime: 0.01600027084350586 seconds
Runtime: 0.01699995994567871 seconds
Runtime: 0.01699995994567871 seconds
Runtime: 0.017998933792114258 seconds
Runtime: 0.018000364303588867 seconds
Runtime: 0.01799941062927246 seconds
Runtime: 0.01799774169921875 seconds
Runtime: 0.018997669

In [17]:
final_measurements

Unnamed: 0,dataset,corr_threshold,num_monitored_features,fold,num_non_monitored_features,non_monitored_ratio,num_tp_features_drifting,num_tn_features_drifting,num_non_monitored_features_drifting,num_pred_features_drifting_non_monitored,accuracy,recall,precision,f1_score
0,insects,0.1,200,0,0,0.000,0,0,0,0,0,0,0,0
1,insects,0.1,200,1,0,0.000,0,0,0,0,0,0,0,0
2,insects,0.1,200,2,0,0.000,0,0,0,0,0,0,0,0
3,insects,0.1,200,3,0,0.000,0,0,0,0,0,0,0,0
4,insects,0.1,200,4,0,0.000,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,insects,0.9,1,5,199,0.995,6,133,65,7,0.698492,0.092308,0.857143,0.166667
716,insects,0.9,1,6,199,0.995,7,134,65,7,0.708543,0.107692,1.0,0.194444
717,insects,0.9,1,7,199,0.995,0,128,66,5,0,0,0,0
718,insects,0.9,1,8,199,0.995,7,134,65,7,0.708543,0.107692,1.0,0.194444


In [18]:
final_measurements.to_excel(f'../{dataset_name}_{detector_name}_results_non_monitored_v2.xlsx', index=False)