# Initial

In [9]:
from sklearn.metrics.cluster import v_measure_score, silhouette_score
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import time
from sklearn.cluster import  MiniBatchKMeans, Birch, AgglomerativeClustering, DBSCAN, KMeans
from sklearn import neighbors
import warnings
warnings.filterwarnings("ignore")

In [2]:
# ---------------------------------------------------------------------------
# Notebook configuration.
# Everything a reader may want to tune lives in this cell. The descriptions are
# real comments (not bare string expressions) so they can never be echoed as
# cell output or mistaken for data.
# ---------------------------------------------------------------------------

# EXTERNAL KERNEL
# Select a hosted environment. With both flags False the notebook runs locally.
# If both are True, google_colab takes precedence in the branches below.
google_colab = False
kaggle = False

# CUDA
cuda = False

# DATA REPRESENTATION
# 1 => SINGLE READ | 2 => ADD FEATURES | 3 => WINDOW TO FEATURES
DATA_REPRESENTATION = 2

# DOWNSAMPLE FACTOR
# 1 => 10 Hz (original rate) | 2 => 5 Hz | 5 => 2 Hz | 10 => 1 Hz
DOWNSAMPLE_FACTOR = 5

# WINDOW LENGTH (variable name keeps its historical spelling)
# Number of consecutive downsampled rows merged into a single row.
# * must evenly divide the number of downsampled datapoints
# * the durations below assume DOWNSAMPLE_FACTOR = 5
# 1 => WINDOW DISABLED | 2 => 1 second | 4 => 2 seconds | 10 => 5 seconds
# 20 => 10 seconds | 200 => 100 seconds (full flight)
WINDOW_LENGHT = 1

# LIMITADOR (sample limit)
# Quantity of samples used in the execution of the tests.
LIMITADOR = 500

# LOSS FACTOR, in [0, 1]
# Ignores outliers when calculating the loss statistics of regenerated data.
LOSS_FACTOR = 1

# TRAIN_SIZE, in [0, 1]
# Fraction of the samples used for training.
TRAIN_SIZE = 0.8

# OUTPUT_FILE_NAME
# Results file; the name encodes the main settings above. The dot is stripped
# from LOSS_FACTOR so that e.g. 0.5 becomes "05".
OUTPUT_FILE_NAME = (
    'output_aggclus_final_ds_dr_' + str(DATA_REPRESENTATION)
    + '-ts_' + str(TRAIN_SIZE)
    + '-lf_' + str(LOSS_FACTOR).replace('.', '')
    + '-limit_' + str(LIMITADOR)
    + '-wl_' + str(WINDOW_LENGHT)
    + '.txt'
)

# PATH_OUTPUTS
# local        : ./outputs/
# google colab : /content/drive/My Drive/
# kaggle       : current working directory
if google_colab:
    PATH_OUTPUTS = '/content/drive/My Drive/'
elif kaggle:
    PATH_OUTPUTS = ''
else:
    PATH_OUTPUTS = './outputs/'

# PATH_DATASET
# Directory holding the original CSV recordings for local runs.
PATH_DATASET = '../dataset/original/'

# FLUSH FILE
# When True, results are appended to PATH_OUTPUTS + OUTPUT_FILE_NAME.
FLUSH_FILE = False

In [3]:
if google_colab:
    !pip install git+https://github.com/online-ml/river --upgrade

    from google.colab import drive

    drive.mount('/content/drive')
    path = '/content/drive/My Drive/ACADÊMICO/MESTRADO/DISSERTAÇÃO/CHAPTERS/5 EXPERIMENTO/dataset/data_representation_1'
    dict_ds_original = {
        'data_ds3_normal_t1_original' : pd.read_csv(path+'/F16_DS3_normal_t1.csv', header=None),
        'data_ds3_normal_t2_original' : pd.read_csv(path+'/F16_DS3_normal_t2.csv', header=None),
        'data_ds3_fault1_original' : pd.read_csv(path+'/F16_DS3_fault1_leakage.csv', header=None),
        'data_ds3_fault2_original' : pd.read_csv(path+'/F16_DS3_fault2_viscousfriction.csv', header=None),
        'data_ds3_fault3_original' : pd.read_csv(path+'/F16_DS3_fault3_compressibility.csv', header=None),
        'data_ds3_fault4_original' : pd.read_csv(path+'/F16_DS3_fault4_fixedposition.csv', header=None),
    }
elif kaggle:
    !conda install -y gdown
    !gdown https://drive.google.com/u/0/uc?id=1G88okIVmdcgLFlmd7rDRhHvHv98yK3UB
    !gdown https://drive.google.com/u/0/uc?id=1fX3utfHMjwKTt7IW4D01bnm-hv88yzrJ 
    !gdown https://drive.google.com/u/0/uc?id=1yUG3R5zK2AIxtS9Q4Fk-udkKBZeYShgb
    !gdown https://drive.google.com/u/0/uc?id=1OBRDtuqNEZ-3Z-q0helWh2xGiAxeLACH
    !gdown https://drive.google.com/u/0/uc?id=17oDi60sWYsWHHxzj2aA9m6ARm8zQ81m_
    !gdown https://drive.google.com/u/0/uc?id=1jKEK4s5sYJh8PHtpHeV8ABOsHjuB26RA

    dict_ds_original = {
        'data_ds3_normal_t1_original' : pd.read_csv('F16_DS3_normal_t1.csv', header=None),
        'data_ds3_normal_t2_original' : pd.read_csv('F16_DS3_normal_t2.csv', header=None),
        'data_ds3_fault1_original' : pd.read_csv('F16_DS3_fault1_leakage.csv', header=None),
        'data_ds3_fault2_original' : pd.read_csv('F16_DS3_fault2_viscousfriction.csv', header=None),
        'data_ds3_fault3_original' : pd.read_csv('F16_DS3_fault3_compressibility.csv', header=None),
        'data_ds3_fault4_original' : pd.read_csv('F16_DS3_fault4_fixedposition.csv', header=None),
    }
else:
    dict_ds_original = {
        'data_ds3_normal_t1_original' : pd.read_csv(PATH_DATASET+'F16_DS3_normal_t1.csv', header=None),
        'data_ds3_normal_t2_original' : pd.read_csv(PATH_DATASET+'F16_DS3_normal_t2.csv', header=None),
        'data_ds3_fault1_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault1_leakage.csv', header=None),
        'data_ds3_fault2_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault2_viscousfriction.csv', header=None),
        'data_ds3_fault3_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault3_compressibility.csv', header=None),
        'data_ds3_fault4_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault4_fixedposition.csv', header=None),
    }

# Dataset

In [4]:
# Downsample every recording and fold WINDOW_LENGHT consecutive rows into one.
# The result is rebuilt from dict_ds_original on every run, so this cell can be
# re-executed safely.

# Validate all recordings up front (not just a subset), before anything is
# produced, so a bad configuration fails with a clear message.
for dataset_name, original in dict_ds_original.items():
    n_rows = original.shape[0]
    if n_rows % DOWNSAMPLE_FACTOR != 0:
        raise ValueError(
            "Dataset '%s' has %d rows, which is not a multiple of DOWNSAMPLE_FACTOR=%d"
            % (dataset_name, n_rows, DOWNSAMPLE_FACTOR))
    if (n_rows // DOWNSAMPLE_FACTOR) % WINDOW_LENGHT != 0:
        raise ValueError(
            "Dataset '%s' has %d downsampled rows, which is not a multiple of WINDOW_LENGHT=%d"
            % (dataset_name, n_rows // DOWNSAMPLE_FACTOR, WINDOW_LENGHT))

dict_ds = {}

for dataset_name, original in dict_ds_original.items():
    # Keep every DOWNSAMPLE_FACTOR-th row.
    downsampled = original.to_numpy()[::DOWNSAMPLE_FACTOR]

    x, y = downsampled.shape

    # Row-major reshape: each output row is WINDOW_LENGHT consecutive input
    # rows laid side by side. Integer division keeps the shape exact.
    dict_ds[dataset_name] = pd.DataFrame(
        downsampled.reshape((x // WINDOW_LENGHT, y * WINDOW_LENGHT)))


In [5]:
# ADD COLUMNS WITH DIFF PREVIOUS VALUES
#
# For DATA_REPRESENTATION == 2 every dataset gets a second copy of its columns
# holding the row-to-row difference, computed independently inside each frame
# so that no difference is taken across a frame boundary.
#
# NOTE: this cell widens dict_ds in place and is therefore not idempotent;
# re-run the downsampling cell before re-running this one.

# Samples per frame at the original sampling rate.
# NOTE(review): presumably one flight = 1000 samples (100 s at 10 Hz) - confirm.
FLIGHT_SAMPLES = 1000

if (DATA_REPRESENTATION == 2):
    # Rows per frame after downsampling AND windowing. The window length has
    # to be part of the divisor, otherwise a frame would span several flights
    # as soon as WINDOW_LENGHT > 1. Unchanged for WINDOW_LENGHT == 1.
    frame_size = FLIGHT_SAMPLES // (DOWNSAMPLE_FACTOR * WINDOW_LENGHT)
    if frame_size < 1:
        raise ValueError(
            'DOWNSAMPLE_FACTOR * WINDOW_LENGHT must not exceed %d' % FLIGHT_SAMPLES)

    for dataset_name in dict_ds:
        dataset = dict_ds[dataset_name].to_numpy()

        dimension = dataset.shape[1]
        samples = dataset.shape[0]

        # GENERATE NEW DIMENSIONS (zero-filled; rows of a trailing incomplete
        # frame, if any, keep these zeros)
        dataset = np.concatenate((dataset, np.zeros((samples, dimension))), axis=1)

        for f in range(samples // frame_size):
            start, stop = f * frame_size, (f + 1) * frame_size

            # OBTAIN THE FRAME FLIGHT (original columns only)
            frame = dataset[start:stop, 0:dimension]

            # CALCULATE DIFFERENCE between consecutive rows of the frame
            chunk = np.diff(frame, axis=0)

            # The first row of a frame has no predecessor: its new columns
            # receive the row's own raw values instead of a difference.
            chunk = np.insert(chunk, 0, frame[0, 0:dimension], axis=0)

            # UPDATE DATASET WITH NEW FRAME INTO NEW DIMENSIONS
            dataset[start:stop, dimension:dimension * 2] = chunk

        dict_ds[dataset_name] = pd.DataFrame(dataset)

In [6]:
def get_scenario(scenario, dict_ds, idxs_n, idxs_f):
    """Assemble the feature matrix and the class labels of one named scenario.

    Parameters
    ----------
    scenario : str
        One of 'n1, n2, f2, f3', 'f1, f2, f3, f4', 'n1, f1, f2, f3',
        'n2, f2, f4' or 'n1, f1, f2, f4'.
    dict_ds : dict
        Maps dataset names (e.g. 'data_ds3_normal_t1_original') to DataFrames.
    idxs_n : positional row selector
        Passed to ``.iloc`` to pick rows from the normal datasets.
    idxs_f : positional row selector
        Passed to ``.iloc`` to pick rows from the fault datasets.

    Returns
    -------
    data_x : numpy.ndarray
        The selected rows of every dataset, stacked in the scenario's order.
    data_y : numpy.ndarray
        One integer label per row of ``data_x``: 0 for normal data and
        1..4 for fault1..fault4.

    Raises
    ------
    ValueError
        If ``scenario`` is not one of the known names.
    """
    # (dataset name, label) pairs in stacking order. Label 0 marks a normal
    # recording, which is sampled with idxs_n; faults are sampled with idxs_f.
    layouts = {
        'n1, n2, f2, f3': (  # scenario 1 - note that n2 is stacked last
            ('data_ds3_normal_t1_original', 0),
            ('data_ds3_fault2_original', 2),
            ('data_ds3_fault3_original', 3),
            ('data_ds3_normal_t2_original', 0)),
        'f1, f2, f3, f4': (  # scenario 2
            ('data_ds3_fault1_original', 1),
            ('data_ds3_fault2_original', 2),
            ('data_ds3_fault3_original', 3),
            ('data_ds3_fault4_original', 4)),
        'n1, f1, f2, f3': (  # scenario 3
            ('data_ds3_normal_t1_original', 0),
            ('data_ds3_fault1_original', 1),
            ('data_ds3_fault2_original', 2),
            ('data_ds3_fault3_original', 3)),
        'n2, f2, f4': (  # scenario 4
            ('data_ds3_normal_t2_original', 0),
            ('data_ds3_fault2_original', 2),
            ('data_ds3_fault4_original', 4)),
        'n1, f1, f2, f4': (  # scenario 5
            ('data_ds3_normal_t1_original', 0),
            ('data_ds3_fault1_original', 1),
            ('data_ds3_fault2_original', 2),
            ('data_ds3_fault4_original', 4)),
    }

    layout = layouts.get(scenario) if isinstance(scenario, str) else None
    if layout is None:
        # Fail loudly instead of printing and then crashing on unbound locals.
        raise ValueError('Scenario not found: %r' % (scenario,))

    blocks = []
    labels = []
    for dataset_name, label in layout:
        selector = idxs_n if label == 0 else idxs_f
        rows = dict_ds[dataset_name].iloc[selector, :].to_numpy()
        blocks.append(rows)
        # Size the labels from the rows actually selected, never from a
        # notebook-level variable, so x and y always line up.
        labels.append(np.full(len(rows), label))

    data_x = np.concatenate(blocks)
    data_y = np.concatenate(labels)

    return data_x, data_y

# Run

In [7]:
# KMeans experiment: for every scenario draw `n_folds` random samples, cluster
# the standardised rows and record [silhouette, V-measure(beta=0)] per fold.

# Each option is [dataset, 0 for normal / 1 for fault, one-element label list].
options = {'n1': [dict_ds['data_ds3_normal_t1_original'], 0, [0]],
           'n2': [dict_ds['data_ds3_normal_t2_original'], 0, [0]],
           'f1': [dict_ds['data_ds3_fault1_original'], 1, [1]],
           'f2': [dict_ds['data_ds3_fault2_original'], 1, [2]],
           'f3': [dict_ds['data_ds3_fault3_original'], 1, [3]],
           'f4': [dict_ds['data_ds3_fault4_original'], 1, [4]]
}

samples = 2000      # rows drawn from each dataset per fold
n_folds = 10
n_clusters = 300
# NOTE(review): newer scikit-learn releases deprecate/remove algorithm='auto'
# for KMeans ('lloyd' is the replacement) - confirm against the installed version.
alghorithm = 'auto' #def
scenarios = [
    ['n1', 'n2', 'f2', 'f3'], # scenario 1
    ['f1', 'f2', 'f3', 'f4'], # scenario 2
    ['n1', 'f1', 'f2', 'f3'], # scenario 3
    ['n2', 'f2', 'f4'], # scenario 4
    ['n1', 'f1', 'f2', 'f4'], # scenario 5
]
# Column 0: silhouette score, column 1: V-measure. Overwritten per scenario.
outputs = np.zeros((n_folds, 2))

# log stays None when FLUSH_FILE is off; print(..., file=None) writes to stdout.
log = None
if FLUSH_FILE:
    log = open(PATH_OUTPUTS+OUTPUT_FILE_NAME, "a", buffering=1)

try:
    n = 0
    for n, scenario in enumerate(scenarios, start=1):
        print ('Kmeans Batch', n, scenario, n_clusters, alghorithm, file=log)

        for fold in np.arange(0, n_folds):

            ss = StandardScaler()
            clf = KMeans(n_clusters=n_clusters, algorithm=alghorithm)

            # Random row positions, drawn with replacement and shared by all
            # datasets of the scenario (the datasets themselves stay ordered).
            # assumes every dataset has at least 100000 rows - TODO confirm
            idxs = np.random.randint(0,100000,samples)

            data_x = np.concatenate(
                [options[key][0].iloc[idxs, :] for key in scenario])
            data_y = np.concatenate(
                [options[key][2]*samples for key in scenario])

            # Scale once and reuse the result for fitting and for scoring.
            data_scaled = ss.fit_transform(data_x)
            y_pred = clf.fit_predict(data_scaled)

            # The silhouette score is undefined for a single cluster; use -1.
            if (len(np.unique(y_pred)) == 1):
                outputs[fold, 0] = -1
            else:
                outputs[fold, 0] = silhouette_score(data_scaled, y_pred)

            # beta=0 puts all the weight on homogeneity (larger beta favours
            # completeness).
            outputs[fold, 1] = v_measure_score(data_y, y_pred, beta=0)

        # Results go to the same destination as the scenario header.
        print(outputs, file=log)
finally:
    # Close the results file even if a fold raises.
    if log is not None:
        log.close()

Kmeans Batch 1 ['n1', 'n2', 'f2', 'f3'] 300 auto
[[0.34726511 0.99606134]
 [0.34310674 0.99212023]
 [0.34689502 0.99394834]
 [0.35143707 0.99277404]
 [0.34688398 0.99439462]
 [0.35228808 0.99479749]
 [0.33570812 0.98826859]
 [0.34148121 0.99527032]
 [0.34177009 0.98728445]
 [0.33955342 0.99114903]]
Kmeans Batch 2 ['f1', 'f2', 'f3', 'f4'] 300 auto
[[0.36386379 0.99478439]
 [0.36099577 0.99248156]
 [0.36355513 0.99208414]
 [0.36577337 0.99336401]
 [0.36996875 0.99441756]
 [0.35876431 0.99588683]
 [0.37451987 0.99640564]
 [0.36894059 0.99286935]
 [0.35917357 0.9930695 ]
 [0.36357638 0.99392378]]
Kmeans Batch 3 ['n1', 'f1', 'f2', 'f3'] 300 auto
[[0.3570164  0.96682433]
 [0.36162685 0.9658197 ]
 [0.35784882 0.97335937]
 [0.36564368 0.96614888]
 [0.35326607 0.97466491]
 [0.35376185 0.96950234]
 [0.36230272 0.96869577]
 [0.35435366 0.96291507]
 [0.35462746 0.96883828]
 [0.35000799 0.97860457]]
Kmeans Batch 4 ['n2', 'f2', 'f4'] 300 auto
[[0.35730166 0.99852783]
 [0.35020225 0.99797047]
 [0.380

In [10]:
# List the distinct cluster ids assigned by `clf`, the KMeans instance left over
# from the last fold of the run loop (this cell depends on that leftover state).
np.unique(clf.labels_)

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [9]:
# generate 3 scenarios
# Enumerate every ordered selection of 2 to 5 dataset keys plus the full set of
# six, then draw three of them at random (with replacement).
from itertools import permutations

caracteres = ['n1', 'n2', 'f1', 'f2', 'f3', 'f4']

ordered_subsets = [subset
                   for size in range(2, 6)
                   for subset in permutations(caracteres, size)]
ordered_subsets.append(tuple(caracteres))

# The tuples have different lengths, so fill a 1-D object array element by
# element; np.array() on a ragged list is rejected by recent NumPy versions.
sets = np.empty(len(ordered_subsets), dtype=object)
for position, subset in enumerate(ordered_subsets):
    sets[position] = subset

# Index with the integer array itself; wrapping it in a list is interpreted
# differently across NumPy versions and no longer yields a flat selection.
sampled_sets = sets[np.random.randint(0, len(sets), 3)]
sampled_sets

array([('n1', 'n2', 'f4', 'f1'), ('f4', 'f1', 'n2', 'f2'),
       ('f2', 'f3', 'n2', 'f4')], dtype=object)

In [12]:
# Min-max scale a fixed list of 20 hard-coded values to the range [0, 1].
# The same computation used to be pasted three times in this cell; it is kept
# once because only the last expression is displayed anyway.
raw_values = np.array([0.0002190936356,
                       0.0001962164166,
                       0.0002659634117,
                       0.000155592714,
                       0.0002077143736,
                       0.00003659935181,
                       0.0007629178845,
                       0.00003655510654,
                       0.00004553915078,
                       0.00005997954617,
                       0.00001035711518,
                       0.00001633708228,
                       0.00004472119254,
                       0.000006363675767,
                       0.00003057221736,
                       0.00002972695524,
                       0.00001447019433,
                       0.00004872239223,
                       0.000008192676174,
                       0.00002650983864, ])

# NOTE: `ss` is rebound here to a MinMaxScaler; the run cell uses the same name
# for a StandardScaler.
ss = MinMaxScaler()
# The scaler expects a 2-D input: one column holding the 20 samples.
ss.fit_transform(raw_values.reshape(-1, 1))



array([[0.28118271],
       [0.250944  ],
       [0.34313435],
       [0.19724831],
       [0.2661418 ],
       [0.03996498],
       [1.        ],
       [0.0399065 ],
       [0.05178145],
       [0.07086851],
       [0.00527846],
       [0.01318267],
       [0.05070029],
       [0.        ],
       [0.03199842],
       [0.03088117],
       [0.01071505],
       [0.055989  ],
       [0.00241754],
       [0.02662884]])