# Initial

In [1]:
from sklearn.metrics.cluster import v_measure_score, silhouette_score
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import time
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings("ignore")

In [10]:
# ---------------------------------------------------------------------------
# EXTERNAL KERNEL
# Flags selecting a hosted environment. When both are True, google_colab
# takes precedence in the if/elif checks that read them.
# ---------------------------------------------------------------------------
google_colab = False
kaggle = False

# ---------------------------------------------------------------------------
# CUDA
# ---------------------------------------------------------------------------
cuda = False

# ---------------------------------------------------------------------------
# DATA REPRESENTATION
# 1 => single read | 2 => add features | 3 => window to features
# ---------------------------------------------------------------------------
DATA_REPRESENTATION = 2

# ---------------------------------------------------------------------------
# DOWNSAMPLE FACTOR
# 1 => 10 Hz (original rate) | 2 => 5 Hz | 5 => 2 Hz | 10 => 1 Hz
# ---------------------------------------------------------------------------
DOWNSAMPLE_FACTOR = 5

# ---------------------------------------------------------------------------
# WINDOW LENGTH (the constant keeps its historical spelling, WINDOW_LENGHT)
# Must evenly divide the number of downsampled rows of each dataset.
# The durations below assume DOWNSAMPLE_FACTOR = 5:
# 1 => window disabled | 2 => 1 second | 4 => 2 seconds | 10 => 5 seconds
# 20 => 10 seconds | 200 => 100 seconds (full flight)
# ---------------------------------------------------------------------------
WINDOW_LENGHT = 1

# ---------------------------------------------------------------------------
# LIMITADOR (limiter)
# Quantity of samples in the execution of the tests.
# ---------------------------------------------------------------------------
LIMITADOR = 500

# ---------------------------------------------------------------------------
# LOSS FACTOR, in [0, 1]
# Ignores outliers in calculating the stats of losses in regenerated data.
# ---------------------------------------------------------------------------
LOSS_FACTOR = 1

# ---------------------------------------------------------------------------
# TRAIN_SIZE, in [0, 1]
# Percentage of samples to be trained.
# ---------------------------------------------------------------------------
TRAIN_SIZE = 0.8

# ---------------------------------------------------------------------------
# OUTPUT_FILE_NAME
# File with output results; the name encodes the settings above. The dot is
# stripped from LOSS_FACTOR so that e.g. 0.5 becomes "05".
# ---------------------------------------------------------------------------
OUTPUT_FILE_NAME = 'output_aggclus_pca099_dr_{}-ts_{}-lf_{}-limit_{}-wl_{}.txt'.format(
    DATA_REPRESENTATION,
    TRAIN_SIZE,
    str(LOSS_FACTOR).replace('.', ''),
    LIMITADOR,
    WINDOW_LENGHT,
)

# ---------------------------------------------------------------------------
# PATH_OUTPUTS
# local        : ./outputs/
# google colab : /content/drive/My Drive/
# kaggle       : current working directory (empty prefix)
# ---------------------------------------------------------------------------
PATH_OUTPUTS = (
    '/content/drive/My Drive/' if google_colab
    else '' if kaggle
    else './outputs/'
)

# ---------------------------------------------------------------------------
# PATH_DATASET
# Directory holding the raw CSV files for a local run.
# ---------------------------------------------------------------------------
PATH_DATASET = '../dataset/original/'

# ---------------------------------------------------------------------------
# FLUSH FILE
# Whether results are written to the output file.
# ---------------------------------------------------------------------------
FLUSH_FILE = True

In [3]:
if google_colab:
    !pip install git+https://github.com/online-ml/river --upgrade

    from google.colab import drive

    drive.mount('/content/drive')
    path = '/content/drive/My Drive/ACADÊMICO/MESTRADO/DISSERTAÇÃO/CHAPTERS/5 EXPERIMENTO/dataset/data_representation_1'
    dict_ds_original = {
        'data_ds3_normal_t1_original' : pd.read_csv(path+'/F16_DS3_normal_t1.csv', header=None),
        'data_ds3_normal_t2_original' : pd.read_csv(path+'/F16_DS3_normal_t2.csv', header=None),
        'data_ds3_fault1_original' : pd.read_csv(path+'/F16_DS3_fault1_leakage.csv', header=None),
        'data_ds3_fault2_original' : pd.read_csv(path+'/F16_DS3_fault2_viscousfriction.csv', header=None),
        'data_ds3_fault3_original' : pd.read_csv(path+'/F16_DS3_fault3_compressibility.csv', header=None),
        'data_ds3_fault4_original' : pd.read_csv(path+'/F16_DS3_fault4_fixedposition.csv', header=None),
    }
elif kaggle:
    !conda install -y gdown
    !gdown https://drive.google.com/u/0/uc?id=1G88okIVmdcgLFlmd7rDRhHvHv98yK3UB
    !gdown https://drive.google.com/u/0/uc?id=1fX3utfHMjwKTt7IW4D01bnm-hv88yzrJ 
    !gdown https://drive.google.com/u/0/uc?id=1yUG3R5zK2AIxtS9Q4Fk-udkKBZeYShgb
    !gdown https://drive.google.com/u/0/uc?id=1OBRDtuqNEZ-3Z-q0helWh2xGiAxeLACH
    !gdown https://drive.google.com/u/0/uc?id=17oDi60sWYsWHHxzj2aA9m6ARm8zQ81m_
    !gdown https://drive.google.com/u/0/uc?id=1jKEK4s5sYJh8PHtpHeV8ABOsHjuB26RA

    dict_ds_original = {
        'data_ds3_normal_t1_original' : pd.read_csv('F16_DS3_normal_t1.csv', header=None),
        'data_ds3_normal_t2_original' : pd.read_csv('F16_DS3_normal_t2.csv', header=None),
        'data_ds3_fault1_original' : pd.read_csv('F16_DS3_fault1_leakage.csv', header=None),
        'data_ds3_fault2_original' : pd.read_csv('F16_DS3_fault2_viscousfriction.csv', header=None),
        'data_ds3_fault3_original' : pd.read_csv('F16_DS3_fault3_compressibility.csv', header=None),
        'data_ds3_fault4_original' : pd.read_csv('F16_DS3_fault4_fixedposition.csv', header=None),
    }
else:
    dict_ds_original = {
        'data_ds3_normal_t1_original' : pd.read_csv(PATH_DATASET+'F16_DS3_normal_t1.csv', header=None),
        'data_ds3_normal_t2_original' : pd.read_csv(PATH_DATASET+'F16_DS3_normal_t2.csv', header=None),
        'data_ds3_fault1_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault1_leakage.csv', header=None),
        'data_ds3_fault2_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault2_viscousfriction.csv', header=None),
        'data_ds3_fault3_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault3_compressibility.csv', header=None),
        'data_ds3_fault4_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault4_fixedposition.csv', header=None),
    }

# Dataset

In [4]:
# Build `dict_ds`: a downsampled (and optionally windowed) copy of every raw
# dataset. The shallow copy is enough because each entry is replaced by a new
# DataFrame below; the frames in `dict_ds_original` are never modified.
dict_ds = dict_ds_original.copy()

# Every dataset must contain a whole number of downsampling steps.
for dataset_name, raw_frame in dict_ds_original.items():
    if raw_frame.shape[0] % DOWNSAMPLE_FACTOR != 0:
        raise ValueError(
            "Row count of '{}' ({}) is not divisible by DOWNSAMPLE_FACTOR ({})".format(
                dataset_name, raw_frame.shape[0], DOWNSAMPLE_FACTOR))

for dataset_name in dict_ds:
    dataset = dict_ds[dataset_name].to_numpy()

    # Keep one row out of every DOWNSAMPLE_FACTOR rows.
    downsampled = dataset[::DOWNSAMPLE_FACTOR]

    x, y = downsampled.shape

    # The reshape below needs whole windows; fail with a readable message
    # instead of numpy's generic "cannot reshape" error.
    if x % WINDOW_LENGHT != 0:
        raise ValueError(
            "Downsampled row count of '{}' ({}) is not divisible by WINDOW_LENGHT ({})".format(
                dataset_name, x, WINDOW_LENGHT))

    # Merge each run of WINDOW_LENGHT consecutive rows into one wide row
    # (WINDOW_LENGHT == 1 leaves the shape unchanged).
    dict_ds[dataset_name] = pd.DataFrame(
        downsampled.reshape((x // WINDOW_LENGHT, y * WINDOW_LENGHT)))


In [5]:
# Data representation 2: widen every dataset with one extra column per
# original column, holding the change relative to the previous row inside the
# same frame of `frame_size` rows.

if DATA_REPRESENTATION == 2:
    # NOTE(review): 1000 looks like the number of raw rows per flight, and the
    # frame size does not account for WINDOW_LENGHT - confirm before combining
    # this representation with windowing.
    frame_size = int(1000/DOWNSAMPLE_FACTOR)

    for dataset_name in dict_ds:
        original_values = dict_ds[dataset_name].to_numpy()
        samples, dimension = original_values.shape

        # Right half starts as zeros; rows past the last complete frame keep
        # those zeros.
        dataset = np.hstack((original_values, np.zeros((samples, dimension))))

        last_full_row = int(samples/frame_size) * frame_size
        for start in range(0, last_full_row, frame_size):
            stop = start + frame_size
            flight = dataset[start:stop, :dimension]

            # The first row of a frame has no predecessor: its "difference" is
            # the row's own values. Every other row is row[i] - row[i-1].
            deltas = np.vstack((flight[:1], np.diff(flight, axis=0)))

            dataset[start:stop, dimension:] = deltas

        dict_ds[dataset_name] = pd.DataFrame(dataset)

In [6]:
def get_scenario(scenario, dict_ds, idxs_n, idxs_f):
    """Assemble the feature matrix and class labels for one test scenario.

    Each scenario is an ordered list of datasets. Rows are picked positionally
    (``.iloc``) from every dataset and stacked in that order; the label vector
    repeats each dataset's class label once per picked row.

    Parameters
    ----------
    scenario : str
        One of ``'n1, n2, f2, f3'``, ``'f1, f2, f3, f4'``,
        ``'n1, f1, f2, f3'``, ``'n2, f2, f4'``, ``'n1, f1, f2, f4'``.
    dict_ds : dict
        Maps dataset keys (``'data_ds3_..._original'``) to DataFrames.
    idxs_n : sequence of int
        Row positions taken from the normal datasets.
    idxs_f : sequence of int
        Row positions taken from the fault datasets.

    Returns
    -------
    data_x : numpy.ndarray
        Selected rows of all datasets in the scenario, concatenated row-wise.
    data_y : numpy.ndarray
        Class label per row of ``data_x`` (0 = normal, 1-4 = fault type).

    Raises
    ------
    ValueError
        If ``scenario`` is not one of the known names.
    """
    # (dataset key, class label, row positions to take)
    normal_t1 = ('data_ds3_normal_t1_original', 0, idxs_n)
    normal_t2 = ('data_ds3_normal_t2_original', 0, idxs_n)
    fault1 = ('data_ds3_fault1_original', 1, idxs_f)
    fault2 = ('data_ds3_fault2_original', 2, idxs_f)
    fault3 = ('data_ds3_fault3_original', 3, idxs_f)
    fault4 = ('data_ds3_fault4_original', 4, idxs_f)

    # Stacking order per scenario. In the first scenario the second normal
    # set goes last, after the two fault sets.
    layouts = {
        'n1, n2, f2, f3': (normal_t1, fault2, fault3, normal_t2),
        'f1, f2, f3, f4': (fault1, fault2, fault3, fault4),  # scenario 2
        'n1, f1, f2, f3': (normal_t1, fault1, fault2, fault3),  # scenario 3
        'n2, f2, f4': (normal_t2, fault2, fault4),  # scenario 4
        'n1, f1, f2, f4': (normal_t1, fault1, fault2, fault4),  # scenario 5
    }

    if scenario not in layouts:
        raise ValueError(
            'Scenario not found! Got {!r}; expected one of {}'.format(
                scenario, sorted(layouts)))

    parts = layouts[scenario]

    data_x = np.concatenate(
        [dict_ds[key].iloc[idxs, :] for key, _, idxs in parts])
    # One label per selected row, derived from the index arguments themselves
    # rather than from a notebook-level `samples` variable.
    data_y = np.concatenate(
        [[label] * len(idxs) for _, label, idxs in parts])

    return data_x, data_y

# Run

In [11]:
# Grid search of AgglomerativeClustering over every scenario.
# For each scenario the sampled rows are reduced with PCA (99% explained
# variance), standardised, and then clustered once per hyperparameter
# combination. Silhouette and homogeneity scores go to the log file (or to
# stdout when FLUSH_FILE is off, because print(file=None) writes to stdout).

log = None
if FLUSH_FILE:
    log = open(PATH_OUTPUTS+OUTPUT_FILE_NAME, "a", buffering=1)

# Rows drawn from each dataset. get_scenario may read this as a global.
samples = 2000

scenarios = ['n1, n2, f2, f3',
             'f1, f2, f3, f4',
             'n1, f1, f2, f3',
             'n2, f2, f4',
             'n1, f1, f2, f4'
             ]

p = {
    'n_clusters': [25, 50, 100, 300],
    'affinity': ['cosine', 'manhattan', 'euclidean'],
    'linkage': ['ward', 'complete', 'average', 'single'],
}

fold = 1

# Upper bounds for the random row positions come from the actual dataset
# sizes, so they stay valid when DOWNSAMPLE_FACTOR or WINDOW_LENGHT change.
# The same positions are used for every dataset of a kind, hence the min().
# NOTE(review): the previous hard-coded bounds were 180000 (normal) and
# 100000 (fault) - confirm they match the row counts at DOWNSAMPLE_FACTOR = 5.
normal_keys = ('data_ds3_normal_t1_original', 'data_ds3_normal_t2_original')
fault_keys = ('data_ds3_fault1_original', 'data_ds3_fault2_original',
              'data_ds3_fault3_original', 'data_ds3_fault4_original')
n_rows_normal = min(len(dict_ds[key]) for key in normal_keys)
n_rows_fault = min(len(dict_ds[key]) for key in fault_keys)

# NOTE(review): no random seed is set, so the sampled rows differ per run.
idxs_n = np.random.randint(0, n_rows_normal, samples)
idxs_f = np.random.randint(0, n_rows_fault, samples)

try:
    for scenario_name in scenarios:
        n = 0
        data_x, data_y = get_scenario(scenario_name, dict_ds, idxs_n, idxs_f)
        # Features with the label as last column; later cells read `data`.
        data = np.concatenate((data_x, data_y.reshape(-1,1)), axis=1)

        # PCA and scaling depend only on the scenario, not on the clustering
        # hyperparameters, so they are computed once per scenario. The
        # per-run timings printed below therefore cover clustering and
        # scoring only.
        pca = PCA(.99)
        x_nd = pca.fit_transform(data[:,:-1])
        ss = StandardScaler()
        x_scaled = ss.fit_transform(x_nd)

        for nc in p['n_clusters']:
            for a in p['affinity']:
                for l in p['linkage']:
                    # The counter also advances for skipped combinations, so
                    # run numbers identify a grid position.
                    n = n+1

                    # Ward linkage is only defined for euclidean distances.
                    if l == 'ward' and a != 'euclidean':
                        continue

                    ini = time.time()
                    print(n, end=' ')
                    print ('Agglomerative Clustering', n, scenario_name, nc, a, l, file=log)

                    # NOTE(review): newer scikit-learn releases rename
                    # `affinity` to `metric`; confirm against the installed
                    # version.
                    clf = AgglomerativeClustering(n_clusters=nc, affinity=a, linkage=l)
                    y_pred = clf.fit_predict(x_scaled)

                    print(silhouette_score(x_scaled, y_pred), file=log)
                    # beta=0 reduces the V-measure to homogeneity.
                    print(v_measure_score(data_y, y_pred, beta=0), file=log)

                    fim = time.time()
                    print(fim-ini, 'seconds')
finally:
    # Close the log even when a run fails part-way through the grid.
    if log is not None:
        log.close()

2 11.010993242263794 seconds
3 6.743004083633423 seconds
4 4.615996837615967 seconds
6 5.913997650146484 seconds
7 7.295003652572632 seconds
8 2.999999523162842 seconds
9 7.307996034622192 seconds
10 5.852995872497559 seconds
11 5.985999822616577 seconds
12 3.7550039291381836 seconds
14 6.161994218826294 seconds
15 7.760006666183472 seconds
16 5.714000940322876 seconds
18 12.548999309539795 seconds
19 9.00999665260315 seconds
20 3.293002128601074 seconds
21 6.885000705718994 seconds
22 7.860001802444458 seconds
23 6.1349992752075195 seconds
24 3.8879971504211426 seconds
26 7.976028919219971 seconds
27 8.785997152328491 seconds
28 4.569000244140625 seconds
30 6.278999328613281 seconds
31 7.040003299713135 seconds
32 2.8049983978271484 seconds
33 7.095001697540283 seconds
34 6.581004858016968 seconds
35 6.059997320175171 seconds
36 3.436004638671875 seconds
38 6.139997959136963 seconds
39 6.603002309799194 seconds
40 3.428001880645752 seconds
42 6.640004873275757 seconds
43 5.42600464820

In [24]:
# Project the feature columns of `data` (every column except the last, which
# holds the class label) onto 10 principal components and display the result.
# NOTE(review): `data` is not defined in this cell; it is whatever the run
# loop assigned last, so this cell depends on that cell having been executed
# first. The features are passed to PCA without any scaling.
pca = PCA(n_components=10)

x_nd = pca.fit_transform(data[:,:-1])

# Bare last expression: shows the projected array as the cell output.
x_nd

array([[-4.05579517e+06, -1.59708592e+05, -2.26837633e+06, ...,
         1.00678157e+05,  1.22457007e+04, -2.62862924e+03],
       [-3.67162502e+06, -3.02659208e+04, -2.11680793e+06, ...,
         8.31597601e+04, -1.80985137e+04, -3.82459698e+03],
       [-8.50461668e+05,  8.96838245e+05, -1.81391353e+06, ...,
        -5.88350403e+03, -1.05005285e+05,  4.67521176e+03],
       ...,
       [ 1.16320108e+07, -1.19363726e+06,  7.29425047e+04, ...,
        -3.27189096e+04, -8.77522092e+03,  1.79719056e+04],
       [ 1.16144586e+07, -1.19782733e+06,  1.76473965e+05, ...,
         1.68142617e+04,  4.46586247e+03,  3.99942780e+03],
       [ 1.16134685e+07, -1.19910395e+06,  1.84158370e+05, ...,
         2.09314558e+04,  5.41472705e+03,  3.46104417e+03]])