# Initial

In [1]:
from sklearn.metrics.cluster import v_measure_score, silhouette_score
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import time
from sklearn.cluster import  DBSCAN
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings("ignore")

In [2]:
# ---------------------------------------------------------------------------
# Experiment configuration. Every tunable of the notebook lives in this cell.
# ---------------------------------------------------------------------------

# EXTERNAL KERNEL
# Hosted environment flags; google_colab is tested before kaggle wherever
# both are inspected.
google_colab = False
kaggle = False

# CUDA
cuda = False

# DATA REPRESENTATION
# 1 => single read | 2 => add features | 3 => window to features
DATA_REPRESENTATION = 2

# DOWNSAMPLE FACTOR
# 1 => 10 Hz (original rate) | 2 => 5 Hz | 5 => 2 Hz | 10 => 1 Hz
DOWNSAMPLE_FACTOR = 5

# WINDOW LENGTH (the constant keeps its historical spelling, WINDOW_LENGHT)
# Must evenly divide the number of downsampled data points.
# Durations below assume DOWNSAMPLE_FACTOR = 5:
# 1 => window disabled | 2 => 1 second | 4 => 2 seconds | 10 => 5 seconds |
# 20 => 10 seconds | 200 => 100 seconds (full flight)
WINDOW_LENGHT = 1

# LIMITADOR (limiter)
# Quantity of samples in the execution of the tests.
LIMITADOR = 500

# LOSS FACTOR, in [0, 1]
# Ignores outliers in calculating the stats of losses in regenerated data.
LOSS_FACTOR = 1

# TRAIN_SIZE, in [0, 1]
# Percentage of samples to be trained.
TRAIN_SIZE = 0.8

# OUTPUT_FILE_NAME
# File with output results; the name encodes the settings above
# (the dot is stripped from the loss factor only).
OUTPUT_FILE_NAME = (
    f'output_dbscan_ds_dr_{DATA_REPRESENTATION}'
    f'-ts_{TRAIN_SIZE}'
    f'-lf_{str(LOSS_FACTOR).replace(".", "")}'
    f'-limit_{LIMITADOR}'
    f'-wl_{WINDOW_LENGHT}.txt'
)

# PATH_OUTPUTS
# local        : ./outputs/
# google colab : /content/drive/My Drive/
# kaggle       : current working directory (empty prefix)
PATH_OUTPUTS = (
    '/content/drive/My Drive/' if google_colab
    else '' if kaggle
    else './outputs/'
)

# PATH_DATASET
# Folder holding the original CSV files for a local run.
PATH_DATASET = '../dataset/original/'

# FLUSH FILE
# When True, results are appended to PATH_OUTPUTS + OUTPUT_FILE_NAME.
FLUSH_FILE = False

In [3]:
if google_colab:
    !pip install git+https://github.com/online-ml/river --upgrade

    from google.colab import drive

    drive.mount('/content/drive')
    path = '/content/drive/My Drive/ACADÊMICO/MESTRADO/DISSERTAÇÃO/CHAPTERS/5 EXPERIMENTO/dataset/data_representation_1'
    dict_ds_original = {
        'data_ds3_normal_t1_original' : pd.read_csv(path+'/F16_DS3_normal_t1.csv', header=None),
        'data_ds3_normal_t2_original' : pd.read_csv(path+'/F16_DS3_normal_t2.csv', header=None),
        'data_ds3_fault1_original' : pd.read_csv(path+'/F16_DS3_fault1_leakage.csv', header=None),
        'data_ds3_fault2_original' : pd.read_csv(path+'/F16_DS3_fault2_viscousfriction.csv', header=None),
        'data_ds3_fault3_original' : pd.read_csv(path+'/F16_DS3_fault3_compressibility.csv', header=None),
        'data_ds3_fault4_original' : pd.read_csv(path+'/F16_DS3_fault4_fixedposition.csv', header=None),
    }
elif kaggle:
    !conda install -y gdown
    !gdown https://drive.google.com/u/0/uc?id=1G88okIVmdcgLFlmd7rDRhHvHv98yK3UB
    !gdown https://drive.google.com/u/0/uc?id=1fX3utfHMjwKTt7IW4D01bnm-hv88yzrJ 
    !gdown https://drive.google.com/u/0/uc?id=1yUG3R5zK2AIxtS9Q4Fk-udkKBZeYShgb
    !gdown https://drive.google.com/u/0/uc?id=1OBRDtuqNEZ-3Z-q0helWh2xGiAxeLACH
    !gdown https://drive.google.com/u/0/uc?id=17oDi60sWYsWHHxzj2aA9m6ARm8zQ81m_
    !gdown https://drive.google.com/u/0/uc?id=1jKEK4s5sYJh8PHtpHeV8ABOsHjuB26RA

    dict_ds_original = {
        'data_ds3_normal_t1_original' : pd.read_csv('F16_DS3_normal_t1.csv', header=None),
        'data_ds3_normal_t2_original' : pd.read_csv('F16_DS3_normal_t2.csv', header=None),
        'data_ds3_fault1_original' : pd.read_csv('F16_DS3_fault1_leakage.csv', header=None),
        'data_ds3_fault2_original' : pd.read_csv('F16_DS3_fault2_viscousfriction.csv', header=None),
        'data_ds3_fault3_original' : pd.read_csv('F16_DS3_fault3_compressibility.csv', header=None),
        'data_ds3_fault4_original' : pd.read_csv('F16_DS3_fault4_fixedposition.csv', header=None),
    }
else:
    dict_ds_original = {
        'data_ds3_normal_t1_original' : pd.read_csv(PATH_DATASET+'F16_DS3_normal_t1.csv', header=None),
        'data_ds3_normal_t2_original' : pd.read_csv(PATH_DATASET+'F16_DS3_normal_t2.csv', header=None),
        'data_ds3_fault1_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault1_leakage.csv', header=None),
        'data_ds3_fault2_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault2_viscousfriction.csv', header=None),
        'data_ds3_fault3_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault3_compressibility.csv', header=None),
        'data_ds3_fault4_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault4_fixedposition.csv', header=None),
    }

# Dataset

In [4]:
# Downsample every dataset by DOWNSAMPLE_FACTOR and fold WINDOW_LENGHT
# consecutive readings into a single row (columns of the window are laid out
# one reading after another). The originals in dict_ds_original are untouched.

# Validate every dataset up front so nothing is half-processed on failure.
for dataset_name, original in dict_ds_original.items():
    n_original = original.shape[0]
    if n_original % DOWNSAMPLE_FACTOR != 0:
        raise ValueError(
            f'{dataset_name}: {n_original} rows is not a multiple of '
            f'DOWNSAMPLE_FACTOR={DOWNSAMPLE_FACTOR}'
        )
    n_downsampled = n_original // DOWNSAMPLE_FACTOR
    if n_downsampled % WINDOW_LENGHT != 0:
        raise ValueError(
            f'{dataset_name}: {n_downsampled} downsampled rows is not a '
            f'multiple of WINDOW_LENGHT={WINDOW_LENGHT}'
        )

dict_ds = {}

for dataset_name, original in dict_ds_original.items():
    # keep one reading out of every DOWNSAMPLE_FACTOR
    downsampled = original.to_numpy()[::DOWNSAMPLE_FACTOR]

    n_rows, n_cols = downsampled.shape

    # resample: each output row concatenates WINDOW_LENGHT consecutive readings
    dict_ds[dataset_name] = pd.DataFrame(
        downsampled.reshape((n_rows // WINDOW_LENGHT, n_cols * WINDOW_LENGHT))
    )


In [5]:
# DATA REPRESENTATION 2: for every existing column, append a second column
# holding the change with respect to the previous row of the same frame.

if DATA_REPRESENTATION == 2:
    # Rows per frame after downsampling; 1000 is presumably the number of
    # original readings in one flight (100 s at 10 Hz) - TODO confirm.
    frame_size = int(1000/DOWNSAMPLE_FACTOR)

    for dataset_name, table in dict_ds.items():
        values = table.to_numpy()
        samples, dimension = values.shape

        # Widen the matrix: original columns on the left, zero-filled
        # placeholders for the difference columns on the right.
        dataset = np.hstack((values, np.zeros((samples, dimension))))

        # Walk over the complete frames only; rows past the last complete
        # frame keep their zero placeholders.
        last_complete_row = int(samples/frame_size) * frame_size
        for start in range(0, last_complete_row, frame_size):
            stop = start + frame_size

            # original columns of the current frame
            frame = dataset[start:stop, :dimension]

            # The first row of a frame has no predecessor, so it carries the
            # raw values; the remaining rows carry row-to-row differences.
            chunk = np.vstack((frame[:1], np.diff(frame, axis=0)))

            # write the result into the right-hand (new) columns
            dataset[start:stop, dimension:] = chunk

        dict_ds[dataset_name] = pd.DataFrame(dataset)

In [6]:
def get_scenario(scenario, dict_ds, idxs_n, idxs_f):
    """Assemble the feature matrix and class labels of one test scenario.

    Parameters
    ----------
    scenario : str
        One of 'n1, n2, f2, f3', 'f1, f2, f3, f4', 'n1, f1, f2, f3',
        'n2, f2, f4' or 'n1, f1, f2, f4'.
    dict_ds : dict
        Maps the 'data_ds3_*_original' keys to pandas DataFrames.
    idxs_n, idxs_f
        Positional row selectors (anything ``DataFrame.iloc`` accepts)
        applied to the normal and to the fault datasets respectively.

    Returns
    -------
    data_x : numpy.ndarray
        Selected rows of each dataset of the scenario, stacked vertically.
    data_y : numpy.ndarray
        One label per row of ``data_x``: 0 for normal data, 1-4 for
        fault1-fault4.

    Raises
    ------
    ValueError
        If ``scenario`` is not one of the known names.
    """
    normal_t1 = ('data_ds3_normal_t1_original', 0, True)
    normal_t2 = ('data_ds3_normal_t2_original', 0, True)
    fault1 = ('data_ds3_fault1_original', 1, False)
    fault2 = ('data_ds3_fault2_original', 2, False)
    fault3 = ('data_ds3_fault3_original', 3, False)
    fault4 = ('data_ds3_fault4_original', 4, False)

    # scenario name -> ordered parts, each (dataset key, label, is normal)
    layouts = {
        # NOTE: the second normal set is stacked last, not second as the
        # scenario name would suggest; labels follow the same order.
        'n1, n2, f2, f3': (normal_t1, fault2, fault3, normal_t2),
        'f1, f2, f3, f4': (fault1, fault2, fault3, fault4),  # scenario 2
        'n1, f1, f2, f3': (normal_t1, fault1, fault2, fault3),  # scenario 3
        'n2, f2, f4': (normal_t2, fault2, fault4),  # scenario 4
        'n1, f1, f2, f4': (normal_t1, fault1, fault2, fault4),  # scenario 5
    }

    if scenario not in layouts:
        raise ValueError(f'Scenario not found: {scenario!r}')

    parts = []
    labels = []
    for key, label, is_normal in layouts[scenario]:
        selected = dict_ds[key].iloc[idxs_n if is_normal else idxs_f, :]
        parts.append(selected.to_numpy())
        # One label per row actually selected, so data_y always lines up
        # with data_x regardless of any global sample count.
        labels.append([label] * len(selected))

    data_x = np.concatenate(parts)
    data_y = np.concatenate(labels)

    return data_x, data_y

# Run

In [7]:
# Run DBSCAN on every scenario for n_folds random draws and report, per fold,
# [silhouette score, homogeneity]. With FLUSH_FILE on, both the scenario
# header and the score table go to the log file; otherwise to stdout.
log = None
if FLUSH_FILE:
    log = open(PATH_OUTPUTS+OUTPUT_FILE_NAME, "a", buffering=1)

# short name -> [dataset, normal(0)/fault(1) flag, class label as 1-item list]
options = {'n1': [dict_ds['data_ds3_normal_t1_original'], 0, [0]],
           'n2': [dict_ds['data_ds3_normal_t2_original'], 0, [0]],
           'f1': [dict_ds['data_ds3_fault1_original'], 1, [1]],
           'f2': [dict_ds['data_ds3_fault2_original'], 1, [2]],
           'f3': [dict_ds['data_ds3_fault3_original'], 1, [3]],
           'f4': [dict_ds['data_ds3_fault4_original'], 1, [4]]
}

samples = 2000  # rows drawn from each dataset per fold
n_folds = 10
with_pca = False
pca_variance = .95 # if with_pca is true
alg = 'auto'
metric = 'euclidean'
scenarios = [
    ['n1', 'n2', 'f2', 'f3'], # scenario 1
    ['f1', 'f2', 'f3', 'f4'], # scenario 2
    ['n1', 'f1', 'f2', 'f3'], # scenario 3
    ['n2', 'f2', 'f4'], # scenario 4
    ['n1', 'f1', 'f2', 'f4'], # scenario 5
]
# one row per fold: column 0 = silhouette, column 1 = homogeneity
outputs = np.zeros((n_folds, 2))

try:
    for n, scenario in enumerate(scenarios, start=1):
        print ('DBSCAN', n, scenario, alg, metric, file=log)

        # The same row positions are used for every dataset of the scenario,
        # so they must be valid for the shortest one. Deriving the bound from
        # the data keeps the cell working for any downsample/window setting.
        n_rows = min(len(options[name][0]) for name in scenario)

        for fold in range(n_folds):

            ss = StandardScaler()
            clf = DBSCAN(algorithm=alg, metric=metric)

            # random row positions (drawn with replacement, unseeded)
            idxs = np.random.randint(0, n_rows, samples)

            data_x = np.concatenate(
                [options[name][0].iloc[idxs, :] for name in scenario])
            data_y = np.concatenate(
                [options[name][2]*samples for name in scenario])

            if with_pca:
                # NOTE(review): PCA is fitted before standardisation here.
                pca = PCA(pca_variance)
                data_x = pca.fit_transform(data_x)

            # scale once and reuse for both clustering and scoring
            scaled_x = ss.fit_transform(data_x)
            y_pred = clf.fit_predict(scaled_x)

            # silhouette is undefined for a single label; report -1 instead
            if (len(np.unique(y_pred)) == 1):
                outputs[fold][0] = -1
            else:
                outputs[fold][0] = silhouette_score(scaled_x, y_pred)

            # beta=0 puts all the weight on homogeneity
            # (beta > 1 would favour completeness)
            outputs[fold][1] = v_measure_score(data_y, y_pred, beta=0)

        print(outputs, file=log)
finally:
    # close the log even when a fold fails
    if log is not None:
        log.close()

DBSCAN 1 ['n1', 'n2', 'f2', 'f3'] auto euclidean
[[0.33078944 0.90224893]
 [0.26353734 0.90760855]
 [0.30810225 0.90161086]
 [0.3127534  0.89988957]
 [0.31979464 0.90514703]
 [0.2748321  0.90418957]
 [0.36668693 0.91338351]
 [0.28733838 0.90608751]
 [0.35264798 0.90272767]
 [0.38312649 0.90113622]]
DBSCAN 2 ['f1', 'f2', 'f3', 'f4'] auto euclidean
[[0.2929942  0.90107289]
 [0.33316743 0.90059094]
 [0.27695605 0.89367565]
 [0.30727035 0.89585977]
 [0.32440899 0.90135511]
 [0.33422129 0.90583297]
 [0.3029698  0.90211592]
 [0.241151   0.90688038]
 [0.31697779 0.91957408]
 [0.27563183 0.91005007]]
DBSCAN 3 ['n1', 'f1', 'f2', 'f3'] auto euclidean
[[0.24934716 0.68245914]
 [0.25974832 0.68075894]
 [0.16687372 0.67953403]
 [0.34173969 0.68047376]
 [0.27400261 0.6779216 ]
 [0.25701102 0.68278799]
 [0.22574487 0.68225348]
 [0.32624575 0.68225669]
 [0.31849438 0.68685272]
 [0.2198764  0.67938885]]
DBSCAN 4 ['n2', 'f2', 'f4'] auto euclidean
[[0.42287476 0.91427362]
 [0.43786788 0.91021481]
 [0.470

In [8]:
# Distinct cluster labels of the most recent DBSCAN fit; `clf` is the
# estimator left over from the last fold of the run loop.
np.unique(clf.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
       33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
       50, 51, 52, 53, 54], dtype=int64)