In [19]:
import pickle
import os
import datetime

In [20]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, scipy, time, warnings

import sys
from pathlib import Path

import sklearn
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

from evaluation_framework.pipeline import EvaluationPipeline
from evaluation_framework import steps as ef_steps
from evaluation_framework.datasets import get_datasets_for_pattern
from evaluation_framework.runners import run_on_many_datasets

In [21]:
from methods.EF_std import calculate_std_on_dataset
from methods.utils import calculate_targets

In [22]:
import warnings 
# Silence every warning (all categories, all modules) for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the
# libraries used below — consider narrowing the filter once the noisy source is known.
warnings.filterwarnings('ignore')

In [23]:
# Folder that holds the experiment results: they are written here and loaded
# back from here.
results_files_folder = 'sample_size_experiment_results'

# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of `if not os.path.exists(...): os.makedirs(...)`.
os.makedirs(results_files_folder, exist_ok=True)

In [24]:
from copy import deepcopy


def _clone_with_chunking(template, observations_in_chunk, step_size):
    """Return a deep copy of `template` with the two chunking attributes overridden."""
    clone = deepcopy(template)
    clone.observations_in_chunk = observations_in_chunk
    clone.step_size = step_size
    return clone


# Datasets matching the file pattern; only the first match is used as the template.
selected_dataset = get_datasets_for_pattern(
    pattern="employment-2015_2016_2017_2018-CA.pq",
    data_dir="datasets",
)

# Chunk sizes to sweep, and the matching step sizes (chunk size capped at 1000).
# NOTE(review): `step_size` presumably is the stride between consecutive chunks — confirm.
chunksizes = np.asarray([100,  200,  500, 1000, 2000, 5000])
steps = np.minimum(chunksizes, 1000)

# One independent copy of the template dataset per (chunk size, step size) pair.
datasets = [
    _clone_with_chunking(selected_dataset[0], size, stride)
    for size, stride in zip(chunksizes, steps)
]

In [25]:
# Row counts per data partition. Only the `partition` column is read from the
# parquet file, so the remaining columns are not loaded just to be discarded.
pd.read_parquet(selected_dataset[0].data_path, columns=['partition'])['partition'].value_counts()

production    756392
reference     376035
train         374943
Name: partition, dtype: int64

# EF Functions

In [26]:
# Identifiers of the monitored models to evaluate. Each string is used as the
# `monitored_model` value of one experiment and is passed unchanged to the
# pipeline step functions.
# NOTE(review): the trailing underscore presumably matches a naming/prefix
# convention expected by those functions — confirm before renaming.
monitored_models = [
    'LogisticRegression_',
    'LGBMClassifier_',
    'RandomForestClassifier_',
    'XGB_',
    'FT_Transformer_',
]

In [27]:
# One 'DATASET_STD' experiment description per monitored model.
experiments = [
    {'method': 'DATASET_STD', 'monitored_model': model_name}
    for model_name in monitored_models
]



In [28]:
def save_results_to_df(dat, method, client_model):
    """Collect the metrics of one pipeline run into a DataFrame.

    Despite the name, nothing is written to disk: the frame is only built
    and returned.

    Parameters
    ----------
    dat : sequence
        Only ``dat[0]`` is read. It must support lookup of the keys
        ``'dataset_name'``, ``'observations_in_chunk'``, ``'std_accuracy'``,
        ``'std_roc'`` and ``'std_f1'``.
    method : object
        Stored in the ``method`` column.
    client_model : object
        Stored in the ``monitored_model`` column.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``method``, ``dataset``, ``monitored_model``,
        ``chunksize``, ``std_accuracy``, ``std_roc_auc``, ``std_f1``.
        A single row when the looked-up values are scalars.

    Raises
    ------
    KeyError
        If ``dat[0]`` is a dict and one of the keys above is missing.
    """
    run_output = dat[0]

    # (column name, value) pairs in output order. Note the key/column rename
    # for 'dataset_name' -> 'dataset', 'observations_in_chunk' -> 'chunksize'
    # and 'std_roc' -> 'std_roc_auc'.
    column_values = (
        ('method', [method]),
        ('dataset', run_output['dataset_name']),
        ('monitored_model', [client_model]),
        ('chunksize', run_output['observations_in_chunk']),
        ('std_accuracy', run_output['std_accuracy']),
        ('std_roc_auc', run_output['std_roc']),
        ('std_f1', run_output['std_f1']),
    )

    # Assign column by column onto an empty frame: the first (list-valued)
    # column fixes the row index and later values are aligned to it.
    summary = pd.DataFrame()
    for column, value in column_values:
        summary[column] = value

    return summary
    

In [29]:
# Show the experiment list (one entry per monitored model) as a sanity check.
experiments

[{'method': 'DATASET_STD', 'monitored_model': 'LogisticRegression_'},
 {'method': 'DATASET_STD', 'monitored_model': 'LGBMClassifier_'},
 {'method': 'DATASET_STD', 'monitored_model': 'RandomForestClassifier_'},
 {'method': 'DATASET_STD', 'monitored_model': 'XGB_'},
 {'method': 'DATASET_STD', 'monitored_model': 'FT_Transformer_'}]

In [30]:
# Sanity check: there should be one dataset configuration per chunk size.
len(datasets)

6

In [31]:
%%time
# Run every (experiment, dataset) combination through the evaluation pipeline.
results = []               # one DataFrame per run that finished without a traceback
experiments_done = []      # the experiment dict, appended once per processed dataset
experiments_to_rerun = []  # (experiment, data_path, traceback) for runs that failed

n_experiments = len(experiments)
n_datasets = len(datasets)

for i, experiment in enumerate(experiments):

    print("experiment {} out of {}".format(i, n_experiments))

    # These depend only on the experiment, so they are looked up once per
    # experiment rather than once per dataset.
    method = experiment['method']
    monitored_model = experiment['monitored_model']

    for j, dataset in enumerate(datasets):
        # Fraction of (experiment, dataset) runs started so far.
        print("PROGRESS: " + str((i*n_datasets+j)/n_experiments/n_datasets))

        calculation_step = ef_steps.Step(
            description='Calculate STD',
            func=calculate_std_on_dataset,
            # NOTE(review): the meaning of the trailing 500 is defined by
            # calculate_std_on_dataset and is not visible here — confirm and
            # move it to a named constant.
            args=(monitored_model, dataset.observations_in_chunk, 500),
        )  # custom step

        pipeline = EvaluationPipeline(steps=[
            ef_steps.SplitDataStep(),  # split raw to train test (reference/analysis)
            ef_steps.CombineProcessedReferenceProductionWithRawStep(),  # combine after processing
            ef_steps.SplitIntoChunksStep(),  # split to chunks
            ef_steps.Step(description='Calculate targets',
                          func=calculate_targets, args=(monitored_model, )),  # custom step
            calculation_step,
        ])

        res, dat = run_on_many_datasets(datasets=[dataset], pipeline=pipeline, log_metrics=False,)

        # A 'traceback' key in the run output marks a failed run; keep enough
        # context to retry it later instead of aborting the whole sweep.
        if "traceback" in dat[0]:
            experiments_to_rerun.append((experiment, dataset.data_path, dat[0]['traceback']))
        else:
            results.append(save_results_to_df(dat, method, monitored_model))

        experiments_done.append(experiment)


experiment 0 out of 5
PROGRESS: 0.0

Evaluating on: datasets\employment-2015_2016_2017_2018-CA.pq
Chunk size: 100, all chunks: 11324, reference_chunks: 3760, transition chunks: 1, production_chunks (includes transition) :7564
PROGRESS: 0.03333333333333333

Evaluating on: datasets\employment-2015_2016_2017_2018-CA.pq
Chunk size: 200, all chunks: 5662, reference_chunks: 1880, transition chunks: 1, production_chunks (includes transition) :3782
PROGRESS: 0.06666666666666667

Evaluating on: datasets\employment-2015_2016_2017_2018-CA.pq
Chunk size: 500, all chunks: 2264, reference_chunks: 752, transition chunks: 1, production_chunks (includes transition) :1512
PROGRESS: 0.09999999999999999

Evaluating on: datasets\employment-2015_2016_2017_2018-CA.pq
Chunk size: 1000, all chunks: 1132, reference_chunks: 376, transition chunks: 1, production_chunks (includes transition) :756
PROGRESS: 0.13333333333333333

Evaluating on: datasets\employment-2015_2016_2017_2018-CA.pq
Chunk size: 2000, all chunk

In [32]:
# Stack the per-run frames into one table with a fresh 0..n-1 row index.
df = pd.concat(results, ignore_index=True)

In [33]:
# Display the combined results table (one row per run that completed without a traceback).
df

Unnamed: 0,method,dataset,monitored_model,chunksize,std_accuracy,std_roc_auc,std_f1
0,DATASET_STD,employment-2015_2016_2017_2018-CA.pq,LogisticRegression_,100,0.038775,0.032782,0.045941
1,DATASET_STD,employment-2015_2016_2017_2018-CA.pq,LogisticRegression_,200,0.028699,0.024903,0.032665
2,DATASET_STD,employment-2015_2016_2017_2018-CA.pq,LogisticRegression_,500,0.017593,0.015622,0.019808
3,DATASET_STD,employment-2015_2016_2017_2018-CA.pq,LogisticRegression_,1000,0.012887,0.011328,0.015079
4,DATASET_STD,employment-2015_2016_2017_2018-CA.pq,LogisticRegression_,2000,0.009353,0.008029,0.010864
5,DATASET_STD,employment-2015_2016_2017_2018-CA.pq,LogisticRegression_,5000,0.005681,0.004864,0.00647
6,DATASET_STD,employment-2015_2016_2017_2018-CA.pq,LGBMClassifier_,100,0.037842,0.030206,0.043526
7,DATASET_STD,employment-2015_2016_2017_2018-CA.pq,LGBMClassifier_,200,0.028895,0.022149,0.033803
8,DATASET_STD,employment-2015_2016_2017_2018-CA.pq,LGBMClassifier_,500,0.017623,0.013491,0.019956
9,DATASET_STD,employment-2015_2016_2017_2018-CA.pq,LGBMClassifier_,1000,0.011936,0.009819,0.014224


In [34]:
# Current local time as a filename-friendly stamp: YYYY_MM_DD_HH_MM_SS.
date = f"{datetime.datetime.now():%Y_%m_%d_%H_%M_%S}"
date

'2024_05_21_12_23_27'

In [35]:
# Show the folder the results file is about to be written to.
results_files_folder

'sample_size_experiment_results'

In [36]:
# Persist the combined results table as parquet inside the results folder.
# The file name is fixed, so the write targets the same path on every run.
std_results_path = os.path.join(results_files_folder, 'STD_results.pq')
df.to_parquet(std_results_path)
