### Helper for loading the datasets

In [1]:
import pandas as pd
from usp_stream_datasets import *

pd.set_option('mode.use_inf_as_na', True)

dataset = "Incremental-gradual (imbal.)"

attr_columns = ['Att1', 'Att2', 'Att3', 'Att4', 'Att5', 'Att6', 'Att7', 'Att8', 'Att9',
                'Att10', 'Att11', 'Att12', 'Att13', 'Att14', 'Att15', 'Att16', 'Att17',
                'Att18', 'Att19', 'Att20', 'Att21', 'Att22', 'Att23', 'Att24', 'Att25',
                'Att26', 'Att27', 'Att28', 'Att29', 'Att30', 'Att31', 'Att32', 'Att33']

main_df = load_insect_dataset(insects_datasets[dataset]["filename"])
classes = main_df['class'].unique().tolist()
minimal_class = main_df["class"].value_counts().min()
window_size = int(minimal_class * 0.25)

### Load a specific Dataset

In [2]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats


def slice_species_dataframes(main_df, species: str):
    df_species = main_df[main_df["class"] == species]
    df_baseline = df_species.iloc[:window_size,]
    df_stream = df_species.iloc[window_size:,]
    # print(f'Baseline shape {df_baseline.shape}')
    # print(f'Stream shape {df_stream.shape}')
    return df_baseline, df_stream


def test_class_attr(species: str, attr: str):
    df_baseline, df_stream = slice_species_dataframes(main_df, species)
    output_filename = f"results/{species}_ks_{window_size}.csv"
    results = []
    count = window_size
    windows = df_stream[attr].rolling(window=window_size)
    
    for window in windows:
        ks = stats.kstest(df_baseline[attr], window)
        if ks.pvalue < 0.05:
            action = "reject"
        else:
            action = "accept"
        results.append({"attr": attr, "start_index": count - window_size, "end_index": count, "p_value": ks.pvalue})
        count += 1

    df_results = pd.DataFrame(results)
    df_results.to_csv(output_filename, index=False)
    return

## Run KS tests for all classes with minimal window and store in respective file

In [6]:
import time

for species in classes[1:]:
    species_start_time = time.time()   
    for attr in attr_columns:
        attr_start_time = time.time()
        print(f"Testing KS windows on species: {species} - {attr}")
        test_class_attr(species, attr)
        attr_end_time = time.time()
        elapsed_attr_time = attr_end_time - attr_start_time
        print(f'Execution time for {species} - {attr}: {elapsed_attr_time} seconds.')
    
    species_end_time = time.time()
    elapsed_time = species_end_time - species_start_time
    print(f'Total execution time for {species}: {elapsed_time} seconds.')
    print()

Testing KS windows on species: b'ae-aegypti-male' - Att1
Execution time for b'ae-aegypti-male' - Att1: 17.89472770690918 seconds.
Testing KS windows on species: b'ae-aegypti-male' - Att2
Execution time for b'ae-aegypti-male' - Att2: 18.7893545627594 seconds.
Testing KS windows on species: b'ae-aegypti-male' - Att3
Execution time for b'ae-aegypti-male' - Att3: 19.14688014984131 seconds.
Testing KS windows on species: b'ae-aegypti-male' - Att4
Execution time for b'ae-aegypti-male' - Att4: 19.545159101486206 seconds.
Testing KS windows on species: b'ae-aegypti-male' - Att5
Execution time for b'ae-aegypti-male' - Att5: 18.782896518707275 seconds.
Testing KS windows on species: b'ae-aegypti-male' - Att6
Execution time for b'ae-aegypti-male' - Att6: 16.58902096748352 seconds.
Testing KS windows on species: b'ae-aegypti-male' - Att7
Execution time for b'ae-aegypti-male' - Att7: 16.306382179260254 seconds.
Testing KS windows on species: b'ae-aegypti-male' - Att8
Execution time for b'ae-aegypti

In [4]:
#20:43 comecou a rodar, o primeiro arquivo foi escrito as ... 21:01

In [5]:
# Comecei a rodar o resto as 22:15