# Initial

In [17]:
from sklearn.metrics.cluster import v_measure_score, silhouette_score
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import time
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings("ignore")

In [18]:
'''
EXTERNAL KERNEL
'''
google_colab = False
kaggle = False

'''
CUDA
'''
cuda = False

'''
DATA REPRESENTATION

1 => SINGLE READ | 2 => ADD FEATURES | 3 => WINDOW TO FEATURES
'''
DATA_REPRESENTATION = 2

'''
DOWNSAMPLE FACTOR

1 => 10hz *original rate* | 2 => 5Hz | 5 => 2Hz | 10 => 1hz
'''
DOWNSAMPLE_FACTOR = 5

'''
WINDOWS LENGHT

* needs divisor by datapoints target
* considering downsample factor = 5

1 => WINDOW DISABLED | 2 => 1 second | 4 => 2 seconds | 10 => 5 seconds | 20 => 10 seconds | 200 => 100 seconds *full flight*
'''
WINDOW_LENGHT =  1


'''
LIMITADOR

Quantity of samples in the execution of the tests.
'''
LIMITADOR = 500

'''
LOSS FACTOR [0,1]

Ignores outliers in calculating the stats of losses in regenerated data.
'''
LOSS_FACTOR = 1

'''
TRAIN_SIZE [0,1]

Percentage of samples to be trained
'''
TRAIN_SIZE = 0.8

'''
OUTPUT_FILE_NAME

File with output results
'''
OUTPUT_FILE_NAME = 'output_kmeans_batch_pca099_dr_' + str(DATA_REPRESENTATION) + '-ts_' + str(TRAIN_SIZE) + '-lf_' + str.replace(str(LOSS_FACTOR), '.', '') + '-limit_' + str(LIMITADOR) + '-wl_' + str(WINDOW_LENGHT) + '.txt'

'''
PATH_OUTPUTS

local : ./outputs/
google colab : /content/drive/My Drive/
'''
if google_colab:
    PATH_OUTPUTS = '/content/drive/My Drive/'
elif kaggle:
    PATH_OUTPUTS = ''
else:
    PATH_OUTPUTS = './outputs/'


'''
PATH_DATASET

'''
PATH_DATASET = '../dataset/original/'

'''
FLUSH FILE

If output results file is ON
'''
FLUSH_FILE = True

In [19]:
if google_colab:
    !pip install git+https://github.com/online-ml/river --upgrade

    from google.colab import drive

    drive.mount('/content/drive')
    path = '/content/drive/My Drive/ACADÊMICO/MESTRADO/DISSERTAÇÃO/CHAPTERS/5 EXPERIMENTO/dataset/data_representation_1'
    dict_ds_original = {
        'data_ds3_normal_t1_original' : pd.read_csv(path+'/F16_DS3_normal_t1.csv', header=None),
        'data_ds3_normal_t2_original' : pd.read_csv(path+'/F16_DS3_normal_t2.csv', header=None),
        'data_ds3_fault1_original' : pd.read_csv(path+'/F16_DS3_fault1_leakage.csv', header=None),
        'data_ds3_fault2_original' : pd.read_csv(path+'/F16_DS3_fault2_viscousfriction.csv', header=None),
        'data_ds3_fault3_original' : pd.read_csv(path+'/F16_DS3_fault3_compressibility.csv', header=None),
        'data_ds3_fault4_original' : pd.read_csv(path+'/F16_DS3_fault4_fixedposition.csv', header=None),
    }
elif kaggle:
    !conda install -y gdown
    !gdown https://drive.google.com/u/0/uc?id=1G88okIVmdcgLFlmd7rDRhHvHv98yK3UB
    !gdown https://drive.google.com/u/0/uc?id=1fX3utfHMjwKTt7IW4D01bnm-hv88yzrJ 
    !gdown https://drive.google.com/u/0/uc?id=1yUG3R5zK2AIxtS9Q4Fk-udkKBZeYShgb
    !gdown https://drive.google.com/u/0/uc?id=1OBRDtuqNEZ-3Z-q0helWh2xGiAxeLACH
    !gdown https://drive.google.com/u/0/uc?id=17oDi60sWYsWHHxzj2aA9m6ARm8zQ81m_
    !gdown https://drive.google.com/u/0/uc?id=1jKEK4s5sYJh8PHtpHeV8ABOsHjuB26RA

    dict_ds_original = {
        'data_ds3_normal_t1_original' : pd.read_csv('F16_DS3_normal_t1.csv', header=None),
        'data_ds3_normal_t2_original' : pd.read_csv('F16_DS3_normal_t2.csv', header=None),
        'data_ds3_fault1_original' : pd.read_csv('F16_DS3_fault1_leakage.csv', header=None),
        'data_ds3_fault2_original' : pd.read_csv('F16_DS3_fault2_viscousfriction.csv', header=None),
        'data_ds3_fault3_original' : pd.read_csv('F16_DS3_fault3_compressibility.csv', header=None),
        'data_ds3_fault4_original' : pd.read_csv('F16_DS3_fault4_fixedposition.csv', header=None),
    }
else:
    dict_ds_original = {
        'data_ds3_normal_t1_original' : pd.read_csv(PATH_DATASET+'F16_DS3_normal_t1.csv', header=None),
        'data_ds3_normal_t2_original' : pd.read_csv(PATH_DATASET+'F16_DS3_normal_t2.csv', header=None),
        'data_ds3_fault1_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault1_leakage.csv', header=None),
        'data_ds3_fault2_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault2_viscousfriction.csv', header=None),
        'data_ds3_fault3_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault3_compressibility.csv', header=None),
        'data_ds3_fault4_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault4_fixedposition.csv', header=None),
    }

# Dataset

In [20]:
dict_ds = dict_ds_original.copy()

if dict_ds['data_ds3_normal_t1_original'].shape[0] % DOWNSAMPLE_FACTOR != 0 or dict_ds['data_ds3_fault1_original'].shape[0] % DOWNSAMPLE_FACTOR != 0:
    raise Exception('Needs to be ?shape? divisor')

for n, dataset_name in enumerate(dict_ds):
    dataset = dict_ds[dataset_name].to_numpy()

    downsampled = dataset[::DOWNSAMPLE_FACTOR]

    x, y = downsampled.shape

    # resample
    dict_ds[dataset_name] = pd.DataFrame(downsampled.reshape((int(x/WINDOW_LENGHT),y*WINDOW_LENGHT)))


In [21]:
# ADD COLUMNS WITH DIFF PREVIOUS VALUES

if (DATA_REPRESENTATION == 2):
    frame_size = int(1000/DOWNSAMPLE_FACTOR)

    for n, dataset_name in enumerate(dict_ds):
        dataset = dict_ds[dataset_name].to_numpy()

        dimension = dataset.shape[1]
        samples = dataset.shape[0]

        # GENERATE NEW DIMENSIONS
        dataset = np.concatenate((dataset, np.zeros((samples,dimension))), axis=1)

        for f in np.arange(0,int(samples/frame_size)):
            # OBTAIN THE FRAME FLIGHT
            frame = dataset[f*frame_size:(f+1)*frame_size, 0:dimension]

            # CALCULATE DIFFERENCE
            chunk = np.diff(frame, axis=0)

            # DONT CALCULATE THE DIFFERENCE FOR EACH FIRST TIMESTEP
            chunk = np.insert(chunk, 0, frame[0, 0:dimension], axis=0)

            # UPDATE DATASET WITH NEW FRAME INTO NEW DIMENSIONS
            dataset[f*frame_size:(f+1)*frame_size,dimension:dimension*2] = chunk

        dict_ds[dataset_name] = pd.DataFrame(dataset)

In [22]:
def get_scenario(scenario, dict_ds, idxs_n, idxs_f):
    if scenario == 'n1, n2, f2, f3':
        data_x = np.concatenate((
            dict_ds['data_ds3_normal_t1_original'].iloc[idxs_n, :],
            dict_ds['data_ds3_fault2_original'].iloc[idxs_f, :],
            dict_ds['data_ds3_fault3_original'].iloc[idxs_f, :],
            dict_ds['data_ds3_normal_t2_original'].iloc[idxs_n, :]))
        data_y = np.concatenate((
            [0]*samples,
            [2]*samples,
            [3]*samples,
            [0]*samples))
    elif scenario == 'f1, f2, f3, f4': # scenario 2
        data_x = np.concatenate((
            dict_ds['data_ds3_fault1_original'].iloc[idxs_f, :],
            dict_ds['data_ds3_fault2_original'].iloc[idxs_f, :],
            dict_ds['data_ds3_fault3_original'].iloc[idxs_f, :],
            dict_ds['data_ds3_fault4_original'].iloc[idxs_f, :]))
        data_y = np.concatenate((
            [1]*samples,
            [2]*samples,
            [3]*samples,
            [4]*samples))
    elif scenario == 'n1, f1, f2, f3': # scenario 3
        data_x = np.concatenate((
            dict_ds['data_ds3_normal_t1_original'].iloc[idxs_n, :],
            dict_ds['data_ds3_fault1_original'].iloc[idxs_f, :],
            dict_ds['data_ds3_fault2_original'].iloc[idxs_f, :],
            dict_ds['data_ds3_fault3_original'].iloc[idxs_f, :]))
        data_y = np.concatenate((
            [0]*samples,
            [1]*samples,
            [2]*samples,
            [3]*samples))
    elif scenario == 'n2, f2, f4': # scenario 4
        data_x = np.concatenate((
            dict_ds['data_ds3_normal_t2_original'].iloc[idxs_n, :],
            dict_ds['data_ds3_fault2_original'].iloc[idxs_f, :],
            dict_ds['data_ds3_fault4_original'].iloc[idxs_f, :]))
        data_y = np.concatenate((
            [0]*samples,
            [2]*samples,
            [4]*samples))
    elif  scenario == 'n1, f1, f2, f4': # scenario 5
        data_x = np.concatenate((
            dict_ds['data_ds3_normal_t1_original'].iloc[idxs_n, :],
            dict_ds['data_ds3_fault1_original'].iloc[idxs_f, :],
            dict_ds['data_ds3_fault2_original'].iloc[idxs_f, :],
            dict_ds['data_ds3_fault4_original'].iloc[idxs_f, :]))
        data_y = np.concatenate((
            [0]*samples,
            [1]*samples,
            [2]*samples,
            [4]*samples))
    else:
        print('Scenario not found!')

    return data_x, data_y

# Run

In [23]:
log = None
if FLUSH_FILE:
    log = open(PATH_OUTPUTS+OUTPUT_FILE_NAME, "a", buffering=1)

samples = 2000

scenarios = ['n1, n2, f2, f3',
             'f1, f2, f3, f4',
             'n1, f1, f2, f3',
             'n2, f2, f4',
             'n1, f1, f2, f4'
]

p = {
    #'n_clusters': [25, 50, 100, 300],
    'n_clusters': [10, 15],
    'alghorithm': ['auto', 'full', 'elkan'],
}

n = 0
fold = 1
scenario = 4 # 0 - 4

idxs_n = np.random.randint(0,180000,samples)
idxs_f = np.random.randint(0,100000,samples)

data_x, data_y = get_scenario(scenarios[scenario], dict_ds, idxs_n, idxs_f)
data = np.concatenate((data_x, data_y.reshape(-1,1)), axis=1)

for nc in p['n_clusters']:
    for a in p['alghorithm']:
        n = n+1
        ini = time.time()
        print(n, end=' ')
        print ('Kmeans Batch', n, scenarios[scenario], nc, a, file=log)

        ss = StandardScaler()
        ac = KMeans(n_clusters=nc, algorithm=a)
        y_pred = []

        x_nd = data[:,:-1]

        pca = PCA(.99)
        x_nd = pca.fit_transform(x_nd)

        for pred in ac.fit_predict(ss.fit_transform(x_nd)):
            y_pred.append(pred)

        if (len(np.unique(y_pred)) == 1):
            print(-1, file=log)
        else:
            print(silhouette_score(ss.transform(x_nd), y_pred), file=log)
        print(v_measure_score(data_y, y_pred, beta=0), file=log) # 0 plus homogeneity | 2 completeness

        fim = time.time()
        print(fim-ini, 'seconds')

if FLUSH_FILE:
    log.close()

1 3.291079044342041 seconds
2 3.054002523422241 seconds
3 2.372997522354126 seconds
4 2.5230019092559814 seconds
5 2.981001615524292 seconds
6 2.983006477355957 seconds


In [24]:
y_hat[0]

NameError: name 'y_hat' is not defined