In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# turn off FutureWarning
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
data_df = pd.read_excel('measurements/CD_delta_vs_ro_chp_tabs_insects_adwin.xlsx', sheet_name='corr_measurements')

In [4]:
# Create the reversed pairs
reversed_df = data_df.rename(columns={'feat_1': 'feat_2', 'feat_2': 'feat_1', 'feat_1_drift': 'feat_2_drift', 'feat_2_drift': 'feat_1_drift', 'feat_1_cd_score': 'feat_2_cd_score', 'feat_2_cd_score': 'feat_1_cd_score'})

# Concatenate the original dataframe with the reversed pairs
data_df = pd.concat([data_df, reversed_df], ignore_index=True)

In [5]:
data_df.feat_1.nunique()

200

In [6]:
dataset_name = 'INSECTS'

In [7]:
# put all above into a function

def compute_results(data_df, dataset_name, fold, corr_threshold, num_monitored_features):
    m = data_df['feat_1'].nunique()
    num_non_monitored_features = m - num_monitored_features
    non_monitored_ratio = num_monitored_features / m

    monitored_features = np.random.choice(data_df['feat_1'].unique(), num_monitored_features, replace=False)
    non_monitored_features = np.setdiff1d(data_df['feat_1'].unique(), monitored_features)

    num_monitored_features_drifting = data_df[data_df['feat_1'].isin(monitored_features)].drop_duplicates(subset=['feat_1']).feat_1_drift.sum()

    # num_non_monitored_features_drifting = data_df[data_df['feat_1'].isin(non_monitored_features)].drop_duplicates(subset=['feat_1']).feat_1_drift.sum()

    all_features_drifting = data_df[data_df['feat_1_drift']==1].drop_duplicates(subset=['feat_1']).feat_1.unique()

    all_features_non_drifting = data_df[data_df['feat_1_drift']==0].drop_duplicates(subset=['feat_1']).feat_1.unique()

    pred_features_drifting = data_df[(data_df['feat_1'].isin(monitored_features)) & (data_df['feat_2'].isin(non_monitored_features)) & (data_df['ref_corr']>=corr_threshold)].drop_duplicates(subset=['feat_2']).feat_2.unique()

    tp_features_drifting = set(pred_features_drifting).intersection(all_features_drifting)

    pred_features_non_drifting = set(non_monitored_features) - set(pred_features_drifting)

    tn_features_drifting = set(pred_features_non_drifting).intersection(all_features_non_drifting)

    num_tn_features_drifting = len(tn_features_drifting)

    num_tp_features_drifting = len(tp_features_drifting)

    num_pred_features_drifting_non_monitored = len(set(pred_features_drifting).intersection(non_monitored_features))

    num_pred_features_drifting_monitored = len(set(pred_features_drifting).intersection(monitored_features))

    results_df_non_monitored = pd.DataFrame(columns=['dataset', 'corr_threshold', 'num_monitored_features', 'fold', 'num_non_monitored_features', 'non_monitored_ratio', 'num_tp_features_drifting', 'num_tn_features_drifting', 'num_monitored_features_drifting', 'num_pred_features_drifting_monitored'])

    results_df_non_monitored = results_df_non_monitored.append({'dataset': dataset_name,
                                                                'corr_threshold': corr_threshold,
                                                                'num_monitored_features': num_monitored_features,
                                                                'fold': fold,
                                                                'num_non_monitored_features': num_non_monitored_features,
                                                                'non_monitored_ratio': non_monitored_ratio,
                                                                'num_tp_features_drifting': num_tp_features_drifting,
                                                                'num_tn_features_drifting': num_tn_features_drifting,
                                                                'num_monitored_features_drifting': num_monitored_features_drifting,
                                                                'num_pred_features_drifting_monitored': num_pred_features_drifting_monitored}, ignore_index=True)
    
    try:
        results_df_non_monitored['accuracy'] = (results_df_non_monitored['num_tp_features_drifting'] + results_df_non_monitored['num_tn_features_drifting']) / results_df_non_monitored['num_monitored_features']
        results_df_non_monitored['recall'] = results_df_non_monitored['num_tp_features_drifting'] / results_df_non_monitored['num_monitored_features_drifting']
        results_df_non_monitored['precision'] = results_df_non_monitored['num_tp_features_drifting'] / num_pred_features_drifting_monitored
        results_df_non_monitored['f1_score'] = 2 * (results_df_non_monitored['precision'] * results_df_non_monitored['recall']) / (results_df_non_monitored['precision'] + results_df_non_monitored['recall'])
    except ZeroDivisionError:
        results_df_non_monitored['accuracy'] = 0
        results_df_non_monitored['recall'] = 0
        results_df_non_monitored['precision'] = 0
        results_df_non_monitored['f1_score'] = 0
    
    return results_df_non_monitored

In [8]:
# run a 10 fold cross validation, append the results to a final_measurements dataframe
final_measurements = pd.DataFrame(columns=['dataset', 'corr_threshold', 'num_monitored_features', 'fold', 'num_non_monitored_features', 'non_monitored_ratio', 'num_tp_features_drifting', 'num_tn_features_drifting', 'num_monitored_features_drifting', 'num_pred_features_drifting_monitored', 'accuracy', 'recall', 'precision', 'f1_score'])

m=data_df['feat_1'].nunique()
for corr_threshold in np.arange(0.1, 1.0, 0.1):
    print(corr_threshold)
    for num_monitored_features in range(int(m).bit_length()):
        num_monitored_features_val = int(m / (2 ** num_monitored_features))
        if num_monitored_features_val < 1:
            break
        for fold in range(10):
            results_df_non_monitored = compute_results(data_df, dataset_name, fold, corr_threshold, num_monitored_features_val)
            final_measurements = final_measurements.append(results_df_non_monitored, ignore_index=True)

0.1
0.2
0.30000000000000004
0.4
0.5
0.6
0.7000000000000001
0.8
0.9


In [10]:
final_measurements

Unnamed: 0,dataset,corr_threshold,num_monitored_features,fold,num_non_monitored_features,non_monitored_ratio,num_tp_features_drifting,num_tn_features_drifting,num_monitored_features_drifting,num_pred_features_drifting_monitored,accuracy,recall,precision,f1_score
0,INSECTS,0.1,200,0,0,1.000,0,0,55,0,0,0,0,0
1,INSECTS,0.1,200,1,0,1.000,0,0,55,0,0,0,0,0
2,INSECTS,0.1,200,2,0,1.000,0,0,55,0,0,0,0,0
3,INSECTS,0.1,200,3,0,1.000,0,0,55,0,0,0,0,0
4,INSECTS,0.1,200,4,0,1.000,0,0,55,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,INSECTS,0.9,1,5,199,0.005,0,140,0,0,0,0,0,0
716,INSECTS,0.9,1,6,199,0.005,0,142,0,0,0,0,0,0
717,INSECTS,0.9,1,7,199,0.005,7,145,1,0,0,0,0,0
718,INSECTS,0.9,1,8,199,0.005,0,138,0,0,0,0,0,0


In [9]:
final_measurements.to_excel(f'measurements/final/{dataset_name}_results_monitored.xlsx', index=False)