# Imports

In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt

In [2]:
from config import *

In [3]:
# Create each data/output directory used by this notebook; existing directories are left untouched.
for _required_dir in (EXTERNAL_DATA_DIR, BASE_DATA_DIR, INTERIM_DATA_DIR, OUTPUT_DIR):
    os.makedirs(_required_dir, exist_ok=True)

# Convert to CSV format

In [4]:
# Convert INSECTS-abrupt_balanced_norm.arff into Insects_A.csv; skipped when the CSV already exists.
dataset_name = 'Insects_A'
save_path = os.path.join(BASE_DATA_DIR, dataset_name+'.csv')

if not os.path.exists(save_path):
    arff_path = os.path.join(EXTERNAL_DATA_DIR, 'Insects', 'INSECTS-abrupt_balanced_norm.arff')
    # Skip the first 38 lines (presumably the ARFF header -- confirm against the file)
    # and parse the remainder as headerless comma-separated rows.
    df = pd.read_csv(arff_path, skiprows=38, header=None)
    # All columns but the last become feat0, feat1, ...; the last column is named 'label'.
    n_features = df.shape[1] - 1
    df.columns = ['feat%d' % i for i in range(n_features)] + ['label']
    df.to_csv(save_path, index=False)
    print(df.shape)

In [5]:
# Convert INSECTS-incremental_balanced_norm.arff into Insects_I.csv; skipped when the CSV already exists.
dataset_name = 'Insects_I'
save_path = os.path.join(BASE_DATA_DIR, dataset_name+'.csv')

if not os.path.exists(save_path):
    arff_path = os.path.join(EXTERNAL_DATA_DIR, 'Insects', 'INSECTS-incremental_balanced_norm.arff')
    # Skip the first 38 lines (presumably the ARFF header -- confirm against the file)
    # and parse the remainder as headerless comma-separated rows.
    df = pd.read_csv(arff_path, skiprows=38, header=None)
    # All columns but the last become feat0, feat1, ...; the last column is named 'label'.
    n_features = df.shape[1] - 1
    df.columns = ['feat%d' % i for i in range(n_features)] + ['label']
    df.to_csv(save_path, index=False)
    print(df.shape)

In [6]:
# Convert INSECTS-gradual_balanced_norm.arff into Insects_IG.csv; skipped when the CSV already exists.
dataset_name = 'Insects_IG'
save_path = os.path.join(BASE_DATA_DIR, dataset_name+'.csv')

if not os.path.exists(save_path):
    arff_path = os.path.join(EXTERNAL_DATA_DIR, 'Insects', 'INSECTS-gradual_balanced_norm.arff')
    # Skip the first 38 lines (presumably the ARFF header -- confirm against the file)
    # and parse the remainder as headerless comma-separated rows.
    df = pd.read_csv(arff_path, skiprows=38, header=None)
    # All columns but the last become feat0, feat1, ...; the last column is named 'label'.
    n_features = df.shape[1] - 1
    df.columns = ['feat%d' % i for i in range(n_features)] + ['label']
    df.to_csv(save_path, index=False)
    print(df.shape)

In [7]:
# Convert INSECTS-incremental-reoccurring_balanced_norm.arff into Insects_IR.csv; skipped when the CSV already exists.
dataset_name = 'Insects_IR'
save_path = os.path.join(BASE_DATA_DIR, dataset_name+'.csv')

if not os.path.exists(save_path):
    arff_path = os.path.join(EXTERNAL_DATA_DIR, 'Insects', 'INSECTS-incremental-reoccurring_balanced_norm.arff')
    # Skip the first 38 lines (presumably the ARFF header -- confirm against the file)
    # and parse the remainder as headerless comma-separated rows.
    df = pd.read_csv(arff_path, skiprows=38, header=None)
    # All columns but the last become feat0, feat1, ...; the last column is named 'label'.
    n_features = df.shape[1] - 1
    df.columns = ['feat%d' % i for i in range(n_features)] + ['label']
    df.to_csv(save_path, index=False)
    print(df.shape)

In [8]:
# Convert INSECTS-incremental-abrupt_balanced_norm.arff into Insects_IAR.csv; skipped when the CSV already exists.
dataset_name = 'Insects_IAR'
save_path = os.path.join(BASE_DATA_DIR, dataset_name+'.csv')

if not os.path.exists(save_path):
    arff_path = os.path.join(EXTERNAL_DATA_DIR, 'Insects', 'INSECTS-incremental-abrupt_balanced_norm.arff')
    # Skip the first 38 lines (presumably the ARFF header -- confirm against the file)
    # and parse the remainder as headerless comma-separated rows.
    df = pd.read_csv(arff_path, skiprows=38, header=None)
    # All columns but the last become feat0, feat1, ...; the last column is named 'label'.
    n_features = df.shape[1] - 1
    df.columns = ['feat%d' % i for i in range(n_features)] + ['label']
    df.to_csv(save_path, index=False)
    print(df.shape)

In [9]:
# Convert INSECTS-abrupt_imbalanced_norm.arff into Insects_A_imb.csv; skipped when the CSV already exists.
dataset_name = 'Insects_A_imb'
save_path = os.path.join(BASE_DATA_DIR, dataset_name+'.csv')

if not os.path.exists(save_path):
    arff_path = os.path.join(EXTERNAL_DATA_DIR, 'Insects', 'INSECTS-abrupt_imbalanced_norm.arff')
    # Skip the first 38 lines (presumably the ARFF header -- confirm against the file)
    # and parse the remainder as headerless comma-separated rows.
    df = pd.read_csv(arff_path, skiprows=38, header=None)
    # All columns but the last become feat0, feat1, ...; the last column is named 'label'.
    n_features = df.shape[1] - 1
    df.columns = ['feat%d' % i for i in range(n_features)] + ['label']
    df.to_csv(save_path, index=False)
    print(df.shape)

In [10]:
# Convert INSECTS-incremental_imbalanced_norm.arff into Insects_I_imb.csv; skipped when the CSV already exists.
dataset_name = 'Insects_I_imb'
save_path = os.path.join(BASE_DATA_DIR, dataset_name+'.csv')

if not os.path.exists(save_path):
    arff_path = os.path.join(EXTERNAL_DATA_DIR, 'Insects', 'INSECTS-incremental_imbalanced_norm.arff')
    # Skip the first 38 lines (presumably the ARFF header -- confirm against the file)
    # and parse the remainder as headerless comma-separated rows.
    df = pd.read_csv(arff_path, skiprows=38, header=None)
    # All columns but the last become feat0, feat1, ...; the last column is named 'label'.
    n_features = df.shape[1] - 1
    df.columns = ['feat%d' % i for i in range(n_features)] + ['label']
    df.to_csv(save_path, index=False)
    print(df.shape)

In [11]:
# Convert INSECTS-gradual_imbalanced_norm.arff into Insects_IG_imb.csv; skipped when the CSV already exists.
dataset_name = 'Insects_IG_imb'
save_path = os.path.join(BASE_DATA_DIR, dataset_name+'.csv')

if not os.path.exists(save_path):
    arff_path = os.path.join(EXTERNAL_DATA_DIR, 'Insects', 'INSECTS-gradual_imbalanced_norm.arff')
    # Skip the first 38 lines (presumably the ARFF header -- confirm against the file)
    # and parse the remainder as headerless comma-separated rows.
    df = pd.read_csv(arff_path, skiprows=38, header=None)
    # All columns but the last become feat0, feat1, ...; the last column is named 'label'.
    n_features = df.shape[1] - 1
    df.columns = ['feat%d' % i for i in range(n_features)] + ['label']
    df.to_csv(save_path, index=False)
    print(df.shape)

In [12]:
# Convert INSECTS-incremental-reoccurring_imbalanced_norm.arff into Insects_IR_imb.csv; skipped when the CSV already exists.
dataset_name = 'Insects_IR_imb'
save_path = os.path.join(BASE_DATA_DIR, dataset_name+'.csv')

if not os.path.exists(save_path):
    arff_path = os.path.join(EXTERNAL_DATA_DIR, 'Insects', 'INSECTS-incremental-reoccurring_imbalanced_norm.arff')
    # Skip the first 38 lines (presumably the ARFF header -- confirm against the file)
    # and parse the remainder as headerless comma-separated rows.
    df = pd.read_csv(arff_path, skiprows=38, header=None)
    # All columns but the last become feat0, feat1, ...; the last column is named 'label'.
    n_features = df.shape[1] - 1
    df.columns = ['feat%d' % i for i in range(n_features)] + ['label']
    df.to_csv(save_path, index=False)
    print(df.shape)

In [13]:
# Convert INSECTS-incremental-abrupt_imbalanced_norm.arff into Insects_IAR_imb.csv; skipped when the CSV already exists.
dataset_name = 'Insects_IAR_imb'
save_path = os.path.join(BASE_DATA_DIR, dataset_name+'.csv')

if not os.path.exists(save_path):
    arff_path = os.path.join(EXTERNAL_DATA_DIR, 'Insects', 'INSECTS-incremental-abrupt_imbalanced_norm.arff')
    # Skip the first 38 lines (presumably the ARFF header -- confirm against the file)
    # and parse the remainder as headerless comma-separated rows.
    df = pd.read_csv(arff_path, skiprows=38, header=None)
    # All columns but the last become feat0, feat1, ...; the last column is named 'label'.
    n_features = df.shape[1] - 1
    df.columns = ['feat%d' % i for i in range(n_features)] + ['label']
    df.to_csv(save_path, index=False)
    print(df.shape)

In [14]:
# Convert INSECTS-out-of-control_norm.arff into Insects_OOC.csv; skipped when the CSV already exists.
dataset_name = 'Insects_OOC'
save_path = os.path.join(BASE_DATA_DIR, dataset_name+'.csv')

if not os.path.exists(save_path):
    arff_path = os.path.join(EXTERNAL_DATA_DIR, 'Insects', 'INSECTS-out-of-control_norm.arff')
    # Skip the first 38 lines (presumably the ARFF header -- confirm against the file)
    # and parse the remainder as headerless comma-separated rows.
    df = pd.read_csv(arff_path, skiprows=38, header=None)
    # All columns but the last become feat0, feat1, ...; the last column is named 'label'.
    n_features = df.shape[1] - 1
    df.columns = ['feat%d' % i for i in range(n_features)] + ['label']
    df.to_csv(save_path, index=False)
    print(df.shape)

# Encode labels

In [15]:
def make_dataset(dataset_name):
    """Write a label-encoded copy of a base dataset CSV.

    Reads ``<BASE_DATA_DIR>/<dataset_name>.csv``, replaces its ``label``
    column with integer category codes, and writes the result to
    ``<INTERIM_DATA_DIR>/<dataset_name>_cat.csv``. The call is a no-op when
    that output file already exists.

    Parameters
    ----------
    dataset_name : str
        Base name (without extension) of the CSV inside ``BASE_DATA_DIR``.

    Returns
    -------
    None
        The function works through side effects: it prints the category
        index of the original labels and writes the encoded CSV.
    """
    # Build the output path from the argument. The previous version read the
    # notebook-global `dataset`, so it only worked when the caller's loop
    # variable happened to carry that exact name.
    save_path = os.path.join(INTERIM_DATA_DIR, dataset_name + '_cat.csv')
    if os.path.exists(save_path):
        return

    path = os.path.join(BASE_DATA_DIR, dataset_name + '.csv')
    df = pd.read_csv(path)

    # Convert once and reuse: code i in the output corresponds to label_names[i].
    labels = df['label'].astype('category')
    label_names = labels.cat.categories
    print(label_names)
    df['label'] = labels.cat.codes

    df.to_csv(save_path, index=False)

In [16]:
# Run the label-encoding step for every entry of DATASETS, printing each name
# as it is handled; make_dataset returns early when its output file exists.
for dataset in DATASETS:
    print(dataset)
    make_dataset(dataset)

Insects_A
Insects_I
Insects_IG
Insects_IR
Insects_IAR
Insects_A_imb
Insects_I_imb
Insects_IG_imb
Insects_IR_imb
Insects_IAR_imb
Insects_OOC


# Summary of datasets

In [17]:
# Collect one row per dataset: name, row count, number of non-label columns,
# and number of distinct values in the 'label' column.
dfs = []
for dataset in DATASETS:
    encoded = pd.read_csv(os.path.join(INTERIM_DATA_DIR, dataset+'_cat.csv'))
    n_instances, n_columns = encoded.shape
    dfs.append([dataset, n_instances, n_columns - 1, len(set(encoded['label']))])

df = pd.DataFrame(
    dfs,
    columns=['Dataset', '# of instances', '# of features', '# of classes'],
).set_index('Dataset', drop=True)

# Render instance counts with thousands separators.
df['# of instances'] = df['# of instances'].map('{:,}'.format)

# DRIFT_POINTS entries are formatted the same way and joined with '; ';
# assignment aligns them to the summary rows by index label.
df['Change points'] = pd.Series(DRIFT_POINTS).map(
    lambda points: '; '.join(['{:,}'.format(p) for p in points])
)

df.to_latex(os.path.join(OUTPUT_DIR, 'stream_dataset.txt'))
df

Unnamed: 0_level_0,# of instances,# of features,# of classes,Change points
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Insects_A,52848,33,6,"14,352; 19,500; 33,240; 38,682; 39,510"
Insects_I,57018,33,6,
Insects_IG,24150,33,6,14028
Insects_IR,79986,33,6,"26,568; 53,364"
Insects_IAR,79986,33,6,"26,568; 53,364"
Insects_A_imb,355275,33,6,"83,859; 128,651; 182,320; 242,883; 268,380"
Insects_I_imb,452044,33,6,
Insects_IG_imb,143323,33,6,58159
Insects_IR_imb,452044,33,6,"150,683; 301,365"
Insects_IAR_imb,452044,33,6,"150,683; 301,365"
