# Initial

In [None]:
#!pip install -U git+https://github.com/scikit-multiflow/scikit-multiflow
import random
from itertools import product
from math import floor

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from skmultiflow.lazy import SAMKNNClassifier

In [None]:
# ---------------------------------------------------------------------------
# Notebook configuration. Every value below is a plain module-level constant
# read by the later cells.
# ---------------------------------------------------------------------------

# External kernel: where the notebook is running. `google_colab` is checked
# first by the later cells; with both flags off the local paths are used.
google_colab = False
kaggle = True

# CUDA switch.
cuda = False

# Data representation:
#   1 => single read | 2 => add features | 3 => window to features
DATA_REPRESENTATION = 2

# Downsample factor (keep one row out of every N):
#   1 => 10 Hz (original rate) | 2 => 5 Hz | 5 => 2 Hz | 10 => 1 Hz
DOWNSAMPLE_FACTOR = 5

# Window length: how many consecutive downsampled rows are merged into one
# row. It must evenly divide the number of downsampled rows.
# With a downsample factor of 5:
#   1 => window disabled | 2 => 1 second | 4 => 2 seconds | 10 => 5 seconds
#   20 => 10 seconds | 200 => 100 seconds (full flight)
# The spelling of the constant is kept because other cells refer to it.
WINDOW_LENGHT = 1

# Limiter: quantity of samples in the execution of the tests.
LIMITADOR = 500

# Loss factor, in [0, 1]: ignores outliers when calculating the statistics of
# the losses in regenerated data.
LOSS_FACTOR = 1

# Train size, in [0, 1]: fraction of the samples used for training.
TRAIN_SIZE = 0.8

# Name of the results file; it encodes the settings above. Dots are stripped
# from the loss factor so the value cannot be mistaken for a file extension.
OUTPUT_FILE_NAME = (
    f'output_samknn_VERSAONOVA_dr_{DATA_REPRESENTATION}'
    f'-ts_{TRAIN_SIZE}'
    f'-lf_{str(LOSS_FACTOR).replace(".", "")}'
    f'-limit_{LIMITADOR}'
    f'-wl_{WINDOW_LENGHT}_folds.txt'
)

# Directory the results file is written to:
#   local        : ./outputs/
#   google colab : /content/drive/My Drive/
PATH_OUTPUTS = '/content/drive/My Drive/' if google_colab else './outputs/'

# Directory holding the raw CSV recordings for a local run.
PATH_DATASET = '../../dataset/original/'

# Flush file: when True the results are written to the output file.
FLUSH_FILE = False

In [None]:
if google_colab:
    !pip install git+https://github.com/online-ml/river --upgrade

    from google.colab import drive

    drive.mount('/content/drive')
    path = '/content/drive/My Drive/ACADÊMICO/MESTRADO/DISSERTAÇÃO/CHAPTERS/5 EXPERIMENTO/dataset/data_representation_1'
    dict_ds_original = {
        'data_ds3_normal_t1_original' : pd.read_csv(path+'/F16_DS3_normal_t1.csv', header=None),
        'data_ds3_normal_t2_original' : pd.read_csv(path+'/F16_DS3_normal_t2.csv', header=None),
        'data_ds3_fault1_original' : pd.read_csv(path+'/F16_DS3_fault1_leakage.csv', header=None),
        'data_ds3_fault2_original' : pd.read_csv(path+'/F16_DS3_fault2_viscousfriction.csv', header=None),
        'data_ds3_fault3_original' : pd.read_csv(path+'/F16_DS3_fault3_compressibility.csv', header=None),
        'data_ds3_fault4_original' : pd.read_csv(path+'/F16_DS3_fault4_fixedposition.csv', header=None),
    }
elif kaggle:
    !conda install -y gdown
    !gdown --id 1G88okIVmdcgLFlmd7rDRhHvHv98yK3UB
    !gdown --id 1fX3utfHMjwKTt7IW4D01bnm-hv88yzrJ
    !gdown --id 1yUG3R5zK2AIxtS9Q4Fk-udkKBZeYShgb
    !gdown --id 1OBRDtuqNEZ-3Z-q0helWh2xGiAxeLACH
    !gdown --id 17oDi60sWYsWHHxzj2aA9m6ARm8zQ81m_
    !gdown --id 1jKEK4s5sYJh8PHtpHeV8ABOsHjuB26RA
    
    dict_ds_original = {
        'data_ds3_normal_t1_original' : pd.read_csv('F16_DS3_normal_t1.csv', header=None),
        'data_ds3_normal_t2_original' : pd.read_csv('F16_DS3_normal_t2.csv', header=None),
        'data_ds3_fault1_original' : pd.read_csv('F16_DS3_fault1_leakage.csv', header=None),
        'data_ds3_fault2_original' : pd.read_csv('F16_DS3_fault2_viscousfriction.csv', header=None),
        'data_ds3_fault3_original' : pd.read_csv('F16_DS3_fault3_compressibility.csv', header=None),
        'data_ds3_fault4_original' : pd.read_csv('F16_DS3_fault4_fixedposition.csv', header=None),
    }
else:
    dict_ds_original = {
        'data_ds3_normal_t1_original' : pd.read_csv(PATH_DATASET+'F16_DS3_normal_t1.csv', header=None),
        'data_ds3_normal_t2_original' : pd.read_csv(PATH_DATASET+'F16_DS3_normal_t2.csv', header=None),
        'data_ds3_fault1_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault1_leakage.csv', header=None),
        'data_ds3_fault2_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault2_viscousfriction.csv', header=None),
        'data_ds3_fault3_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault3_compressibility.csv', header=None),
        'data_ds3_fault4_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault4_fixedposition.csv', header=None),
    }

# Dataset

In [None]:
# Work on a shallow copy of the mapping: entries are re-bound below, so
# dict_ds_original keeps pointing at the raw recordings.
dict_ds = dict_ds_original.copy()

# Same guard as before (normal_t1 and fault1 only), with a readable message.
if any(dict_ds[key].shape[0] % DOWNSAMPLE_FACTOR != 0
       for key in ('data_ds3_normal_t1_original', 'data_ds3_fault1_original')):
    raise ValueError(
        f'The number of rows must be a multiple of DOWNSAMPLE_FACTOR ({DOWNSAMPLE_FACTOR})'
    )

for dataset_name, raw_frame in dict_ds_original.items():
    # Keep one row out of every DOWNSAMPLE_FACTOR.
    downsampled = raw_frame.to_numpy()[::DOWNSAMPLE_FACTOR]

    x, y = downsampled.shape

    # The reshape below merges WINDOW_LENGHT consecutive rows into one, which
    # only works when the rows split evenly; fail with a clear message instead
    # of numpy's generic reshape error.
    if x % WINDOW_LENGHT != 0:
        raise ValueError(
            f'{dataset_name}: {x} downsampled rows cannot be grouped into '
            f'windows of WINDOW_LENGHT ({WINDOW_LENGHT}) rows'
        )

    # Window to features: each output row is WINDOW_LENGHT input rows laid
    # side by side (a no-op when WINDOW_LENGHT == 1).
    dict_ds[dataset_name] = pd.DataFrame(
        downsampled.reshape((x // WINDOW_LENGHT, y * WINDOW_LENGHT))
    )


In [None]:
# Data representation 2: append, for every original column, a column with
# the difference to the previous row.

if DATA_REPRESENTATION == 2:
    # Rows per frame after downsampling, assuming 1000 raw rows per frame
    # (one flight, according to the original comments - TODO confirm; the
    # value also ignores WINDOW_LENGHT).
    frame_size = int(1000 / DOWNSAMPLE_FACTOR)

    for dataset_name in list(dict_ds):
        values = dict_ds[dataset_name].to_numpy()
        n_rows, n_cols = values.shape

        # Original columns on the left, zero-filled difference columns on the right.
        augmented = np.hstack((values, np.zeros((n_rows, n_cols))))

        # Only complete frames are processed; rows after the last complete
        # frame keep zeros in the difference columns.
        for start in range(0, (n_rows // frame_size) * frame_size, frame_size):
            stop = start + frame_size
            flight = augmented[start:stop, :n_cols]

            # Differences never cross a frame boundary. The first row of a
            # frame has no predecessor, so it receives its own raw values.
            deltas = np.vstack((flight[:1], np.diff(flight, axis=0)))

            augmented[start:stop, n_cols:] = deltas

        dict_ds[dataset_name] = pd.DataFrame(augmented)



# Functions

In [None]:
def predict(clf, classes, data, threshold_distance = 50, neighbors = 2):
    """Label each row of `data` by its distance to the classifier's long-term memory.

    For every class, the mean distance from the row to its `neighbors` closest
    long-term-memory (LTM) samples of that class is computed. A class is
    rejected for a row when that mean exceeds `threshold_distance`, or when
    the LTM holds no sample of the class at all. Among the classes that are
    not rejected, the one with the smallest mean distance wins (the first one
    on ties).

    Parameters
    ----------
    clf : object exposing `LTMSamples`, `LTMLabels` and
        `get_distances(sample, samples)`.
    classes : sequence of numeric class labels to consider.
    data : 2-D array. The last column of every row is dropped before the
        distance computation.
    threshold_distance : mean distance above which a class is rejected.
    neighbors : number of closest LTM samples averaged per class.

    Returns
    -------
    1-D float array with one entry per row: the winning label taken from
    `classes`, or -1 when every class was rejected.
    """
    n_samples = len(data)
    n_classes = len(classes)
    ltm_labels = np.asarray(clf.LTMLabels)

    # mu_dist[s, k]: mean distance of row s to its closest LTM samples of classes[k].
    mu_dist = np.full((n_samples, n_classes), np.inf)
    # has_memory[k]: whether the LTM holds at least one sample of classes[k].
    has_memory = np.zeros(n_classes, dtype=bool)

    for k, c in enumerate(classes):
        # Boolean mask instead of np.where(..., labels, 0): the latter can
        # never select label 0.
        class_samples = clf.LTMSamples[ltm_labels == c]
        if len(class_samples) == 0:
            # Nothing to compare against: averaging an empty distance list
            # would give NaN, which used to pass the threshold test silently.
            continue
        has_memory[k] = True

        for s in range(n_samples):
            # NOTE(review): `[:-1]` assumes the last column of `data` is not a
            # feature (e.g. a label). `tester` in this notebook is called with
            # scaled feature matrices - confirm that this slice is intended.
            dist_sample_sort = np.sort(clf.get_distances(data[s, :-1], class_samples))
            mu_dist[s, k] = np.mean(dist_sample_sort[:neighbors])

    rejected = (mu_dist > threshold_distance) | ~has_memory

    y_hat = np.full(n_samples, -1.0)
    for s in range(n_samples):
        candidates = np.flatnonzero(~rejected[s])
        if candidates.size == 0:
            continue  # every class is too far away: keep -1

        best = candidates[0]
        for k in candidates[1:]:
            if mu_dist[s, k] < mu_dist[s, best]:
                best = k

        # Report the real label rather than assuming labels are 1..len(classes).
        y_hat[s] = classes[best]

    return y_hat


def generate_y_hat(probas, threshold):
    """Turn per-class scores into labels, marking low-confidence rows as -1.

    Parameters
    ----------
    probas : iterable of per-row score sequences.
    threshold : a row whose highest score is below this value is undecided.

    Returns
    -------
    list with one entry per row: the index of the highest score, or -1 when
    the row is undecided.
    """
    return [
        -1 if np.max(row) < threshold else np.argmax(row)  # undecided / confident
        for row in probas
    ]

def tester(clf, nt, phi, classes, data_test, log):
    """Predict `data_test` and print the share of each outcome to `log`.

    `phi` is passed to `predict` as the distance threshold and `nt` as the
    number of neighbours. Five lines are printed, in this order: the
    percentage of rows predicted as -1, 1, 2, 3 and 4. The list of reported
    labels is fixed and does not depend on `classes`.
    """
    predicted = predict(clf, classes, data_test, phi, nt)
    total = len(predicted)

    for label in (-1, 1, 2, 3, 4):
        share = len(predicted[predicted == label]) / total * 100
        print(share, file=log)

def generate_fault_label(dataset, fault_label):
    """Build the label column for a dataset in which every row has the same fault.

    Parameters
    ----------
    dataset : any object with a `shape` whose first entry is the row count.
    fault_label : value repeated for every row.

    Returns
    -------
    Array of shape (rows, 1) filled with `fault_label`. The shape is (0, 1)
    for an empty dataset, so the result can always be appended as a column.
    """
    return np.full((dataset.shape[0], 1), fault_label)

def get_data_reinforce(clf, samples_retrain, fator):
    """Draw a class-balanced random replay set from the classifier's short-term memory.

    For every label present in `clf.STMLabels`, `int(samples_retrain * fator)`
    rows of that label are drawn with replacement from `clf.STMSamples`. The
    drawn rows are then shuffled.

    Returns
    -------
    2-D array holding the drawn samples with their label appended as the
    last column.
    """
    stm_samples = clf.STMSamples
    stm_labels = clf.STMLabels
    per_class = int(samples_retrain * fator)

    picked = []
    for label in np.unique(stm_labels):
        pool = np.flatnonzero(stm_labels == label)  # positions of this class
        draws = np.random.randint(0, len(pool), per_class)
        picked.extend(pool[draws])

    random.shuffle(picked)

    return np.column_stack((stm_samples[picked, :], stm_labels[picked]))

# Split

In [None]:
# Fault 1 and fault 2 rows stacked on top of each other, each with its class
# label appended as the last column.
faults = np.concatenate((
    np.append(dict_ds['data_ds3_fault1_original'], generate_fault_label(dict_ds['data_ds3_fault1_original'], 1), axis = 1),
    np.append(dict_ds['data_ds3_fault2_original'], generate_fault_label(dict_ds['data_ds3_fault2_original'], 2), axis = 1)
))

# Shuffle a copy so that `faults` keeps the original row order.
faults_shuffled = faults.copy()
np.random.shuffle(faults_shuffled)

# array_split (unlike split) also accepts a row count that is not a multiple
# of 10; the folds then differ in size by at most one row.
folds = np.array_split(faults_shuffled, 10)

# Train/test split of the first fold. Rows were shuffled above, so the split
# itself keeps the order (random_state has no effect with shuffle=False).
fold = folds[0]
X_train, X_test, y_train, y_test = train_test_split(fold[:, :-1], fold[:, -1], test_size=1-TRAIN_SIZE, random_state=42, shuffle=False)

# Faults 3 and 4 are held back for the incremental steps; they are split in
# their original row order.
X_train3, X_test3, y_train3, y_test3 = train_test_split(dict_ds['data_ds3_fault3_original'].copy(), generate_fault_label(dict_ds['data_ds3_fault3_original'], 3).reshape(-1), test_size=1-TRAIN_SIZE, random_state=42, shuffle=False)

X_train4, X_test4, y_train4, y_test4 = train_test_split(dict_ds['data_ds3_fault4_original'].copy(), generate_fault_label(dict_ds['data_ds3_fault4_original'], 4).reshape(-1), test_size=1-TRAIN_SIZE, random_state=42, shuffle=False)

# Run

In [None]:
# Results go to the output file when FLUSH_FILE is on; otherwise `file=None`
# makes print() write to stdout.
log = None
if FLUSH_FILE:
    log = open(PATH_OUTPUTS+OUTPUT_FILE_NAME, "a", buffering=1)

samples = 10000         # rows of a fold used for the initial fit
samples_test = 1600     # rows taken from the head of each evaluation set
samples_retrain = 1000  # rows of a new fault class used per incremental step

# Hyper-parameter grid. Every combination is run on every fold, in the order
# produced by itertools.product (last key varies fastest).
#   phis           -> distance threshold handed to predict() through tester()
#   neighbors_test -> number of neighbours handed to predict() through tester()
#   the remaining keys are SAMKNNClassifier constructor arguments
p = {'max_window_size': [10000],
     'phis': [2],
     'neighbors_test': [2],
     'n_neighbors': [5],
     'weighting': ['uniform'],
     'stm_size_option': ['maxACC']}

# Incremental steps that follow the initial fault 1/2 fit:
# (training rows of the new class, their labels, replay factor, classes known afterwards)
retrain_steps = [
    (X_train3, y_train3, 0.5, [1, 2, 3]),
    (X_train4, y_train4, 0.33, [1, 2, 3, 4]),
]

try:
    # The grid values used to be read from names that were never assigned
    # (mws, phi, nt, n, w, sso); bind them explicitly from `p`.
    for mws, phi, nt, n, w, sso in product(p['max_window_size'], p['phis'], p['neighbors_test'],
                                           p['n_neighbors'], p['weighting'], p['stm_size_option']):
        for f, fold in enumerate(folds):
            X_train, X_test, y_train, y_test = train_test_split(fold[:, :-1], fold[:, -1], test_size=1-TRAIN_SIZE, random_state=42, shuffle=False)

            ss = StandardScaler()

            clf = SAMKNNClassifier(max_window_size=mws, n_neighbors=n, weighting=w, stm_size_option=sso) # stm_size_option=None

            print('FOLD:', f, file=log)

            # MAIN TRAIN WITH FAULT 1 , 2
            ss.partial_fit(X_train[:samples])
            clf.partial_fit(ss.transform(X_train[:samples]), y_train[:samples])
            print('.', end='')

            # Raw evaluation sets, always reported in this order: fault 1 and
            # fault 2 rows of this fold, fault 3, fault 4, normal t1, normal t2.
            # They are re-scaled at every evaluation because the scaler keeps
            # being updated.
            eval_sets = [
                X_test[y_test == 1],
                X_test[y_test == 2],
                X_test3,
                X_test4,
                dict_ds['data_ds3_normal_t1_original'],
                dict_ds['data_ds3_normal_t2_original'],
            ]

            # TEST
            for raw in eval_sets:
                tester(clf, nt, phi, [1,2], ss.transform(raw[:samples_test]), log)
            print('.', end='')

            # NEW TRAIN WITH FAULT 3, THEN FAULT 4
            for X_new, y_new, replay_factor, known_classes in retrain_steps:
                ss.partial_fit(X_new[:samples_retrain])

                # reinforce: replay rows come from the classifier's short-term
                # memory, i.e. they are already standardized.
                data_reinforce = get_data_reinforce(clf, samples_retrain, replay_factor)

                # Only the new raw rows are scaled; passing the replay rows
                # through the scaler again would standardize them twice.
                data = np.column_stack((ss.transform(X_new[:samples_retrain]), y_new[:samples_retrain]))

                data_retrain = np.concatenate((data_reinforce, data))
                np.random.shuffle(data_retrain)

                # partial train
                clf.partial_fit(data_retrain[:, :-1], data_retrain[:, -1])

                for raw in eval_sets:
                    tester(clf, nt, phi, known_classes, ss.transform(raw[:samples_test]), log)
                print('.', end='')
finally:
    # Close the results file even when a fold fails half-way.
    if log is not None:
        log.close()