# Tool Evaluation

In [59]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import itertools
import os
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from itertools import product
# import seaborn as sns


plt.rcParams['figure.figsize'] = [10, 4]
plt.rcParams['figure.dpi'] = 100 # 200 e.g. is really fine, but slower


In [5]:
EXPERIMENT_DESCRIPTORS = ['dataset_type', 'dataset', 'model']

DATASET_ESTIMATORS = [
    'model_type', # SimpleCNN, RESNET, ...
    'data_type', # Spectogram, SensorData, ...
    'n_parameters',
    'datapoint_w', 
    'datapoint_h', 
    'dimensions', 
    'num_classes', 
    'original_data_size'
]

RUN_DESCRIPTORS = [
    'iteration',
    'data_quality_dimension_percentage',
    'loss',
    'categorical_accuracy',
    'categorical_crossentropy',
    'top_3_accuracy',
    'top_5_accuracy',
    'precision',
    'recall',
    'auc',
    'f1_score',
    'effective_epochs',
    'used_data_size',
    'actual_data_percentage_used',
    'emissions_kg',
    'duration',
]


DATASET_TYPE_NAMES = ['images', 'time_series']
DATASET_NAMES = []
MODEL_NAMES = []

MODEL_TYPES = ['SIMPLE_CNN', 'SIMPLE_MLP', 'RESNET', 'Other']
DATA_TYPES = ['Sensor', 'Spectro', 'Image', 'Device', 'Motion', 'Other']
METRICS = ['f1_score', 'auc', 'categorical_accuracy', 'precision', 'recall']
REDUCING_METHODS = ['keep_distributions', 'balance_classes']



In [231]:
cols_to_drop = ['ready', 'loss', 'categorical_crossentropy', 'top_3_accuracy', 'top_5_accuracy', 'project_id', 'duration_per_epoch', 'emissions_per_epoch']

def load_results(dataset_type):
    total_df = pd.DataFrame()
    for _, _, files in os.walk(f"./data/{dataset_type}"):
        for file in files:
            if file.endswith('results.csv'):
                total_df = pd.concat([total_df, pd.read_csv(f"./data/{dataset_type}/{file}")])

    return total_df



time_series_df = load_results('time_series')
images_df = load_results('images')


time_series_df.columns


time_series_df.groupby(['model', 'dataset']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset_type,experiment,iteration,experiment_method,data_quality_dimension,data_quality_dimension_percentage,loss,categorical_accuracy,categorical_crossentropy,top_3_accuracy,...,num_classes,model_type,datapoint_w,datapoint_h,dimensions,emissions_kg,duration,n_parameters,project_id,ready
model,dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
FCN,ChlorineConcentration,45,45,45,45,45,45,45,45,45,45,...,45,45,45,45,45,45,45,45,45,45
FCN,Coffee,90,90,90,90,90,90,90,90,90,90,...,90,90,90,90,90,90,90,90,90,90
FCN,EthanolLevel,45,45,45,45,45,45,45,45,45,45,...,45,45,45,45,45,45,45,45,45,45
FCN,PhalangesOutlinesCorrect,45,45,45,45,45,45,45,45,45,45,...,45,45,45,45,45,45,45,45,45,45
FCN,StarLightCurves,45,45,45,45,45,45,45,45,45,45,...,45,45,45,45,45,45,45,45,45,45
FCN,UWaveGestureLibraryAll,45,45,45,45,45,45,45,45,45,45,...,45,45,45,45,45,45,45,45,45,45
MLP,ChlorineConcentration,45,45,45,45,45,45,45,45,45,45,...,45,45,45,45,45,45,45,45,45,45
MLP,Coffee,90,90,90,90,90,90,90,90,90,90,...,90,90,90,90,90,90,90,90,90,90
MLP,EthanolLevel,45,45,45,45,45,45,45,45,45,45,...,45,45,45,45,45,45,45,45,45,45
MLP,PhalangesOutlinesCorrect,45,45,45,45,45,45,45,45,45,45,...,45,45,45,45,45,45,45,45,45,45


In [232]:
def process_experiments(df):
    """
    Given the full dataframe of experiments on completeness (e11), calculates the regressions for the curve completeness vs metric
    """
    
    unique_experiments = df.groupby(EXPERIMENT_DESCRIPTORS + DATASET_ESTIMATORS, as_index=False).size()
    
    results = []
    
    for experiment in unique_experiments.itertuples():
        experiment_df = df.query(f"model == '{experiment.model}' and dataset == '{experiment.dataset}'")
        regression = LinearRegression()
        X = experiment_df[['data_quality_dimension_percentage']]
        
        for metric in metrics:
            y = experiment_df[[metric]]

            regression.fit(X, y)
            summary_data = {"metric": metric, "coefficient": regression.coef_[0][0], "intercept": regression.intercept_[0]}

            for attribute in DATASET_ESTIMATORS + EXPERIMENT_DESCRIPTORS:
                summary_data[attribute] = getattr(experiment, attribute)

            results.append(summary_data)


    return pd.DataFrame(results)




# Obtain the dataset on which we will be able to evaluate

In [233]:
time_series_dataset = process_experiments(time_series_df)

time_series_dataset.query("metric == 'f1_score'")

Unnamed: 0,metric,coefficient,intercept,model_type,data_type,n_parameters,datapoint_w,datapoint_h,dimensions,num_classes,original_data_size,dataset_type,dataset,model
4,f1_score,0.361155,0.407422,SIMPLE_CNN,Sensor,266115,166,1,1,3,467,time_series,ChlorineConcentration,FCN
9,f1_score,0.045877,0.500292,SIMPLE_MLP,Sensor,586003,166,1,1,3,467,time_series,ChlorineConcentration,MLP
14,f1_score,0.342986,0.435337,RESNET,Sensor,506947,166,1,1,3,467,time_series,ChlorineConcentration,RESNET
19,f1_score,-0.030952,0.495952,SIMPLE_CNN,Spectro,265986,286,1,1,2,28,time_series,Coffee,FCN
24,f1_score,0.310119,0.399246,SIMPLE_MLP,Spectro,645502,286,1,1,2,28,time_series,Coffee,MLP
29,f1_score,0.410714,0.47381,RESNET,Spectro,506818,286,1,1,2,28,time_series,Coffee,RESNET
34,f1_score,0.098467,0.268253,SIMPLE_CNN,Spectro,266244,1751,1,1,4,504,time_series,EthanolLevel,FCN
39,f1_score,0.018867,0.245036,SIMPLE_MLP,Spectro,1379004,1751,1,1,4,504,time_series,EthanolLevel,MLP
44,f1_score,0.299867,0.473236,RESNET,Spectro,507076,1751,1,1,4,504,time_series,EthanolLevel,RESNET
49,f1_score,0.097009,0.727135,SIMPLE_CNN,Image,265986,80,1,1,2,1800,time_series,PhalangesOutlinesCorrect,FCN


## Find the best model

In [234]:
def get_samples(total_df, n):
    sample = total_df.sample(n)
    
    return total_df.drop(sample.index), sample
    
    
def preprocess_data(dataset_df, metric):
    """
    Transforms data into sklearn format
    """
    data = dataset_df.query(f"metric == '{metric}'")
    X_cat = []
    X_num = []
    y = []
    for experiment in data.iterrows():
        x_cat = []
        x_num = []
        for estimator in DATASET_ESTIMATORS:
            if estimator in ['model_type', 'data_type']:
                x_cat.append(experiment[1][estimator])
            else:
                x_num.append(experiment[1][estimator])


        metric_coefficient = experiment[1]['coefficient']
        y.append(metric_coefficient)
        X_cat.append(x_cat)
        X_num.append(x_num)

    encoder = OneHotEncoder(handle_unknown='error', categories=[MODEL_TYPES, DATA_TYPES])
    encoder.fit(X_cat)
    X_cat = encoder.transform(X_cat).toarray()

    return np.concatenate([X_cat, X_num], axis=1), np.array(y), encoder

def train_reg(reg_name, data, metric):
    X, y, encoder = preprocess_data(data, metric)
    regressors = {'random_forest': RandomForestRegressor, 'svr': SVR }
    reg = regressors[reg_name]()
    reg.fit(X, y)
    
    return reg, encoder
    
    

In [235]:
for metric in METRICS:
    X, y, encoder = preprocess_data(time_series_dataset, 'f1_score')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    
    for regressor_name, regressor in regressors.items():

        reg = regressor()
        reg.fit(X_train, y_train)

        y_pred = reg.predict(X_test)
        score = mean_squared_error(y_pred, y_test)

        print(f"{metric} - {regressor_name} - {score}")

f1_score - random_forest - 0.025806658082464712
f1_score - svr - 0.028578057085795
auc - random_forest - 0.04600668605162473
auc - svr - 0.020803985413837548
categorical_accuracy - random_forest - 0.03022568274523517
categorical_accuracy - svr - 0.02017425373530466
precision - random_forest - 0.012648100528633308
precision - svr - 0.018729488256998272
recall - random_forest - 0.027756749868683494
recall - svr - 0.026616040742668712


## Evaluate with real data

In [288]:
train_df, test_df = get_samples(time_series_dataset, 5)

def show_percentage(number):
    return str(100*number)[:4] + "%"

def simulate_user_experiment(user_experiment_input, metric='f1_score', search_iterations=200000, base_metric_result_percentage=0.4):
    # Simulating a real scenario
    hyperparameter_search_iterations = search_iterations
    our_method_hyperparameter_search_iterations = 20
    sample = user_experiment_input
    full_data_df = time_series_df.query(f"dataset=='{sample['dataset']}' and model=='{sample['model']}' and data_quality_dimension_percentage=={1.0}")
    base_experiment_kg_emissions = full_data_df['emissions_kg'].mean()
    base_experiment_metric_result = full_data_df[metric].mean()
    
    # Training the regressor with only train data
    reg, encoder = train_reg('random_forest', train_df, metric)
    

    # Simulate real tool pipeline

    # Encode data
    X_cat = encoder.transform([[sample['model_type'], sample['data_type']]]).toarray()
    X_num = [[sample['n_parameters'], sample['datapoint_w'], sample['datapoint_h'], sample['dimensions'], sample['num_classes'], sample['original_data_size']]]
    X = np.concatenate([X_cat, X_num], axis=1)


    # Predict coefficient
    metric_coefficient = reg.predict(X)[0]


    # Use the user goal data to predict how much of the dataset is needed
    base_percentage_real_data_df = time_series_df.query(f"dataset=='{sample['dataset']}' and model=='{sample['model']}' and data_quality_dimension_percentage=={base_metric_result_percentage}")
    base_metric_result = base_percentage_real_data_df[metric].mean()
    goal_metric = min(base_metric_result*1.1, 1)
    intercept = base_metric_result - metric_coefficient*base_metric_result_percentage
    dataset_percent = (goal_metric - intercept)/metric_coefficient



    # Get the value of the metric with that percent
    rounded_dataset_percent = min(round(dataset_percent, 1), 1)
    rounded_dataset_percent_real_data_df = time_series_df.query(f"dataset=='{sample['dataset']}' and model=='{sample['model']}' and data_quality_dimension_percentage=={rounded_dataset_percent}")
    real_result = rounded_dataset_percent_real_data_df[metric].mean()


    classic_method_emissions = hyperparameter_search_iterations*base_experiment_kg_emissions
    our_method_emissions = hyperparameter_search_iterations*base_percentage_real_data_df['emissions_kg'].mean() + our_method_hyperparameter_search_iterations*rounded_dataset_percent_real_data_df['emissions_kg'].mean()

    emissions_percentage_decrease = 100*(our_method_emissions - classic_method_emissions)/classic_method_emissions


    print(
        f"""
            On dataset {sample['dataset']} model {sample['model']} metric {sample['metric']}
                ----- User goals ------

                User goal: {show_percentage(goal_metric)}
                User Inputs: {show_percentage(base_metric_result)} on {show_percentage(base_metric_result_percentage)} of the data

                ----- Prediction -----

                Our model predicted to use {show_percentage(rounded_dataset_percent)} of dataset
                
                ------ Results ----
                With that amount of data, we got {show_percentage(real_result)}

                ----- Emissions saved -----

                Emissions (kg) saved: {classic_method_emissions - our_method_emissions} (negative value is bad)
                Emissions percentage reduced: {show_percentage(emissions_percentage_decrease)} (Negative is a decrease, good)

                ------ Errors -----
                Error on goal: {show_percentage(goal_metric - real_result)} (values < 1% are good (and negative better))


                ------ Extra errors ----
                Error on classic method (100% of data): {show_percentage(base_experiment_metric_result - real_result)} (smaller is better) 
            \n\n\n
        """)


In [289]:
for sample in test_df.iterrows():
    simulate_user_experiment(sample[1])



            On dataset UWaveGestureLibraryAll model RESNET metric categorical_accuracy
                ----- User goals ------

                User goal: 82.4%
                User Inputs: 74.9% on 40.0% of the data

                ----- Prediction -----

                Our model predicted to use 80.0% of dataset
                
                ------ Results ----
                With that amount of data, we got 82.6%

                ----- Emissions saved -----

                Emissions (kg) saved: 561.9229930688853 (negative value is bad)
                Emissions percentage reduced: -584% (Negative is a decrease, good)

                ------ Errors -----
                Error on goal: -0.2% (values < 1% are good)


                ------ Extra errors ----
                Error on classic method (100% of data): 1.82% (smaller is better) 
            



        

            On dataset PhalangesOutlinesCorrect model MLP metric precision
                ----- User goals ------


In [271]:
test_df

Unnamed: 0,metric,coefficient,intercept,model_type,data_type,n_parameters,datapoint_w,datapoint_h,dimensions,num_classes,original_data_size,dataset_type,dataset,model
2,recall,0.423151,0.323262,SIMPLE_CNN,Sensor,266115,166,1,1,3,467,time_series,ChlorineConcentration,FCN
59,f1_score,0.092657,0.731896,RESNET,Image,506818,80,1,1,2,1800,time_series,PhalangesOutlinesCorrect,RESNET
9,f1_score,0.045877,0.500292,SIMPLE_MLP,Sensor,586003,166,1,1,3,467,time_series,ChlorineConcentration,MLP
68,auc,0.006327,0.966654,SIMPLE_MLP,Sensor,1015003,1024,1,1,3,1000,time_series,StarLightCurves,MLP
73,auc,0.007575,0.986302,RESNET,Sensor,506947,1024,1,1,3,1000,time_series,StarLightCurves,RESNET
