# Initial

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from math import floor
from sklearn.metrics import confusion_matrix
from skmultiflow.lazy import SAMKNNClassifier
from skmultiflow.meta import AdaptiveRandomForestClassifier, BatchIncrementalClassifier, ProbabilisticClassifierChain, LearnPPNSEClassifier, LearnPPClassifier, LeveragingBaggingClassifier, MultiOutputLearner, OnlineAdaC2Classifier, OnlineBoostingClassifier, OnlineCSB2Classifier, OnlineRUSBoostClassifier, OnlineSMOTEBaggingClassifier, OnlineUnderOverBaggingClassifier, OzaBaggingClassifier, OzaBaggingADWINClassifier


Bad key "text.kerning_factor" on line 4 in
C:\Users\MARCELO\anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


In [2]:
'''
EXTERNAL KERNEL
'''
google_colab = False
kaggle = False

'''
CUDA
'''
cuda = False

'''
DATA REPRESENTATION

1 => SINGLE READ | 2 => ADD FEATURES | 3 => WINDOW TO FEATURES
'''
DATA_REPRESENTATION = 3

'''
DOWNSAMPLE FACTOR

1 => 10hz *original rate* | 2 => 5Hz | 5 => 2Hz | 10 => 1hz
'''
DOWNSAMPLE_FACTOR = 5

'''
WINDOWS LENGHT

* needs divisor by datapoints target
* considering downsample factor = 5

1 => WINDOW DISABLED | 2 => 1 second | 4 => 2 seconds | 10 => 5 seconds | 20 => 10 seconds | 200 => 100 seconds *full flight*
'''
WINDOW_LENGHT =  10

'''
LIMITADOR

Quantity of samples in the execution of the tests.
'''
LIMITADOR = 500

'''
LOSS FACTOR [0,1]

Ignores outliers in calculating the stats of losses in regenerated data.
'''
LOSS_FACTOR = 1

'''
TRAIN_SIZE [0,1]

Percentage of samples to be trained
'''
TRAIN_SIZE = 0.8

'''
OUTPUT_FILE_NAME

File with output results
'''
OUTPUT_FILE_NAME = 'output_knn_skflow_dr_' + str(DATA_REPRESENTATION) + '-ts_' + str(TRAIN_SIZE) + '-lf_' + str.replace(str(LOSS_FACTOR), '.', '') + '-limit_' + str(LIMITADOR) + '-wl_' + str(WINDOW_LENGHT) + '.txt'

'''
PATH_OUTPUTS

local : ./outputs/
google colab : /content/drive/My Drive/
'''
PATH_OUTPUTS = '/content/drive/My Drive/'


'''
PATH_DATASET

'''
PATH_DATASET = '../../dataset/original/'

'''
FLUSH FILE

If output results file is ON
'''
FLUSH_FILE = False

In [3]:
if google_colab:
    !pip install git+https://github.com/online-ml/river --upgrade

    from google.colab import drive

    drive.mount('/content/drive')
    path = '/content/drive/My Drive/ACADÊMICO/MESTRADO/DISSERTAÇÃO/CHAPTERS/5 EXPERIMENTO/dataset/data_representation_1'
    dict_ds_original = {
        'data_ds3_normal_t1_original' : pd.read_csv(path+'/F16_DS3_normal_t1.csv', header=None),
        'data_ds3_normal_t2_original' : pd.read_csv(path+'/F16_DS3_normal_t2.csv', header=None),
        'data_ds3_fault1_original' : pd.read_csv(path+'/F16_DS3_fault1_leakage.csv', header=None),
        'data_ds3_fault2_original' : pd.read_csv(path+'/F16_DS3_fault2_viscousfriction.csv', header=None),
        'data_ds3_fault3_original' : pd.read_csv(path+'/F16_DS3_fault3_compressibility.csv', header=None),
        'data_ds3_fault4_original' : pd.read_csv(path+'/F16_DS3_fault4_fixedposition.csv', header=None),
    }
elif kaggle:
    !conda install -y gdown
    !gdown --id 1G88okIVmdcgLFlmd7rDRhHvHv98yK3UB
    !gdown --id 1fX3utfHMjwKTt7IW4D01bnm-hv88yzrJ
    !gdown --id 1yUG3R5zK2AIxtS9Q4Fk-udkKBZeYShgb
    !gdown --id 1OBRDtuqNEZ-3Z-q0helWh2xGiAxeLACH
    !gdown --id 17oDi60sWYsWHHxzj2aA9m6ARm8zQ81m_
    !gdown --id 1jKEK4s5sYJh8PHtpHeV8ABOsHjuB26RA
else:
    dict_ds_original = {
        'data_ds3_normal_t1_original' : pd.read_csv(PATH_DATASET+'F16_DS3_normal_t1.csv', header=None),
        'data_ds3_normal_t2_original' : pd.read_csv(PATH_DATASET+'F16_DS3_normal_t2.csv', header=None),
        'data_ds3_fault1_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault1_leakage.csv', header=None),
        'data_ds3_fault2_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault2_viscousfriction.csv', header=None),
        'data_ds3_fault3_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault3_compressibility.csv', header=None),
        'data_ds3_fault4_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault4_fixedposition.csv', header=None),
    }

# Dataset

In [4]:
dict_ds = dict_ds_original.copy()

if dict_ds['data_ds3_normal_t1_original'].shape[0] % DOWNSAMPLE_FACTOR != 0 or dict_ds['data_ds3_fault1_original'].shape[0] % DOWNSAMPLE_FACTOR != 0:
    raise Exception('Needs to be ?shape? divisor')

for n, dataset_name in enumerate(dict_ds):
    dataset = dict_ds[dataset_name].to_numpy()

    downsampled = dataset[::DOWNSAMPLE_FACTOR]

    x, y = downsampled.shape

    # resample
    dict_ds[dataset_name] = pd.DataFrame(downsampled.reshape((int(x/WINDOW_LENGHT),y*WINDOW_LENGHT)))


In [5]:
# ADD COLUMNS WITH DIFF PREVIOUS VALUES

if (DATA_REPRESENTATION == 2):
    frame_size = int(1000/DOWNSAMPLE_FACTOR)

    for n, dataset_name in enumerate(dict_ds):
        dataset = dict_ds[dataset_name].to_numpy()

        dimension = dataset.shape[1]
        samples = dataset.shape[0]

        # GENERATE NEW DIMENSIONS
        dataset = np.concatenate((dataset, np.zeros((samples,dimension))), axis=1)

        for f in np.arange(0,int(samples/frame_size)):
            # OBTAIN THE FRAME FLIGHT
            frame = dataset[f*frame_size:(f+1)*frame_size, 0:dimension]

            # CALCULATE DIFFERENCE
            chunk = np.diff(frame, axis=0)

            # DONT CALCULATE THE DIFFERENCE FOR EACH FIRST TIMESTEP
            chunk = np.insert(chunk, 0, frame[0, 0:dimension], axis=0)

            # UPDATE DATASET WITH NEW FRAME INTO NEW DIMENSIONS
            dataset[f*frame_size:(f+1)*frame_size,dimension:dimension*2] = chunk

        dict_ds[dataset_name] = pd.DataFrame(dataset)



In [6]:
ss = StandardScaler()
#ss = MinMaxScaler()

data_ds3_t1_normal = dict_ds['data_ds3_normal_t1_original'].copy()
data_ds3_t2_normal = dict_ds['data_ds3_normal_t2_original'].copy()
data_ds3_fault1 = dict_ds['data_ds3_fault1_original'].copy()
data_ds3_fault2 = dict_ds['data_ds3_fault2_original'].copy()
data_ds3_fault3 = dict_ds['data_ds3_fault3_original'].copy()
data_ds3_fault4 = dict_ds['data_ds3_fault4_original'].copy()

# fit values
ss.partial_fit(data_ds3_t1_normal)
ss.partial_fit(data_ds3_t2_normal)
ss.partial_fit(data_ds3_fault1)
ss.partial_fit(data_ds3_fault2)
ss.partial_fit(data_ds3_fault3)
ss.partial_fit(data_ds3_fault4)

# transform values
data_ds3_t1_normal = ss.transform(data_ds3_t1_normal)
data_ds3_t2_normal = ss.transform(data_ds3_t2_normal)
data_ds3_fault1 = ss.transform(data_ds3_fault1)
data_ds3_fault2 = ss.transform(data_ds3_fault2)
data_ds3_fault3 = ss.transform(data_ds3_fault3)
data_ds3_fault4 = ss.transform(data_ds3_fault4)

# append normal labels
data_ds3_t1_normal = np.append(data_ds3_t1_normal, np.zeros((data_ds3_t1_normal.shape[0],1)), axis = 1)
data_ds3_t2_normal = np.append(data_ds3_t2_normal, np.zeros((data_ds3_t2_normal.shape[0],1)), axis = 1)

# append fault labels
def generate_fault_label(dataset, fault_label):
    labels = np.array([[fault_label]]*dataset.shape[0])

    return labels

data_ds3_fault1 = np.append(data_ds3_fault1, generate_fault_label(data_ds3_fault1, 1), axis = 1)
data_ds3_fault2 = np.append(data_ds3_fault2, generate_fault_label(data_ds3_fault2, 2), axis = 1)
data_ds3_fault3 = np.append(data_ds3_fault3, generate_fault_label(data_ds3_fault3, 3), axis = 1)
data_ds3_fault4 = np.append(data_ds3_fault4, generate_fault_label(data_ds3_fault4, 4), axis = 1)

# Split

In [20]:
# STRATIFIED

'''X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(data_ds3_fault1[:LIMITADOR, :-1], data_ds3_fault1[:LIMITADOR, -1], test_size=1-TRAIN_SIZE, random_state=42, shuffle=False)

X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(data_ds3_fault2[:LIMITADOR, :-1], data_ds3_fault2[:LIMITADOR, -1], test_size=1-TRAIN_SIZE, random_state=42, shuffle=False)

X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(data_ds3_fault3[:LIMITADOR, :-1], data_ds3_fault3[:LIMITADOR, -1], test_size=1-TRAIN_SIZE, random_state=42, shuffle=False)

X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(data_ds3_fault4[:LIMITADOR, :-1], data_ds3_fault4[:LIMITADOR, -1], test_size=1-TRAIN_SIZE, random_state=42, shuffle=False)


X_train = np.concatenate((X_train_1, X_train_2, X_train_3, X_train_4))
X_test = np.concatenate((X_test_1, X_test_2, X_test_3, X_test_4))
y_train = np.concatenate((y_train_1, y_train_2, y_train_3, y_train_4))
y_test = np.concatenate((y_test_1, y_test_2, y_test_3, y_test_4))'''

faults = np.concatenate((data_ds3_fault1, data_ds3_fault2))
X_train, X_test, y_train, y_test = train_test_split(faults[:, :-1], faults[:, -1], test_size=1-TRAIN_SIZE, random_state=42, shuffle=True)


In [21]:
def predict(clf, classes, data, threshold_distance = 50, neighbors = 2):

    threshold_dist = threshold_distance
    neighbors = neighbors

    votes = np.zeros((len(data),len(classes)))
    mu_dist = np.zeros((len(data),len(classes)))

    y_hat = np.zeros((len(data)))

    for k, c in enumerate(classes):
        #print('TESTE', c)
        indexes = np.nonzero(np.where(clf.LTMLabels == c, clf.LTMLabels, 0))

        for s in np.arange(0, len(data)):
            dist_sample = clf.get_distances(data[s, :-1], clf.LTMSamples[indexes])
            dist_sample_sort = np.sort(dist_sample)

            if (np.mean(dist_sample_sort[:neighbors]) > threshold_dist):
                votes[s,k] = 1 # set high distance

            mu_dist[s,k] = np.mean(dist_sample_sort[:neighbors])

        mu_class = mu_dist[:,k]
        #print ('repulse:', mu_class[mu_class > threshold_dist].shape[0]/len(data)*100)

    for k, vote in enumerate(votes):
        inliers = np.argwhere((vote == [1.]) == False).reshape(-1).shape[0]

        if (inliers == 0): # (1,1,1)
            y_hat[k] = -1
        else:
            founds = np.argwhere(vote == 0).reshape(-1)

            if len(founds) == 1: # (1,0,1)
                y_hat[k] = founds[0]+1
            else: # (1,0,0)
                minor_value = -1
                minor_index = -1

                for f in founds:
                    if minor_index == -1:
                        minor_value = mu_dist[k,f]
                        minor_index = f+1
                    elif mu_dist[k,f] < minor_value:
                        minor_value = mu_dist[k,f]
                        minor_index = f+1

                y_hat[k] = minor_index

    return y_hat


def generate_y_hat(probas, threshold):
    y_hat = []

    for p in probas:
        if (np.max(p) < threshold): #indecisao
            y_hat.append(-1)
        else: #certeza
            y_hat.append(np.argmax(p))

    return y_hat

knn_p = {
    'n_neighbors': [5], # def 5
    'leaf_size': [30], # def 30
    'window_size': [1000], # def 1000
    'p': [2] # 1 Minkowski | def 2 euclidean
}

phis = [.5]


# Run

In [34]:
'''
Increment Parameters
'''
increments = 10
block_size = int(floor(len(X_train) / increments))
samples = 30000

log = None
if FLUSH_FILE:
    log = open(PATH_OUTPUTS+OUTPUT_FILE_NAME, "a", buffering=1)

for n in knn_p['n_neighbors']:
    for ls in knn_p['leaf_size']:
        for ws in knn_p['window_size']:
            for p in knn_p['p']:
                print('.', end='')

                clf = SAMKNNClassifier() # stm_size_option=None

                clf.partial_fit(X_train[:samples], y_train[:samples])

                print('-', end='')

                y_hat = predict(clf, [1,2], X_test[:samples], 20, 2)
                #cm = confusion_matrix(y_test[:samples], clf.predict(X_test[:samples]), normalize='true')
                cm = confusion_matrix(y_test[:samples], y_hat, normalize='true')
                print(cm)

if FLUSH_FILE:
    log.close()

.-[[0.         0.         0.        ]
 [0.17434373 0.82565627 0.        ]
 [0.19434629 0.         0.80565371]]


In [35]:
y_hat_4 = predict(clf, [1,2], data_ds3_fault4, 20, 2)

print('4 predito como -1')
print(len(y_hat_4[y_hat_4 == -1]) / len(y_hat_4) * 100)
print('4 predito como 1')
print(len(y_hat_4[y_hat_4 == 1]) / len(y_hat_4) * 100)
print('4 predito como 2')
print(len(y_hat_4[y_hat_4 == 2]) / len(y_hat_4) * 100)
print('4 predito como 3')
print(len(y_hat_4[y_hat_4 == 3]) / len(y_hat_4) * 100)

y_hat_3 = predict(clf, [1,2], data_ds3_fault3, 20, 2)

print('3 predito como -1')
print(len(y_hat_3[y_hat_3 == -1]) / len(y_hat_3) * 100)
print('3 predito como 1')
print(len(y_hat_3[y_hat_3 == 1]) / len(y_hat_3) * 100)
print('3 predito como 2')
print(len(y_hat_3[y_hat_3 == 2]) / len(y_hat_3) * 100)
print('3 predito como 3')
print(len(y_hat_3[y_hat_3 == 3]) / len(y_hat_3) * 100)

4 predito como -1
100.0
4 predito como 1
0.0
4 predito como 2
0.0
4 predito como 3
0.0
3 predito como -1
70.33
3 predito como 1
29.67
3 predito como 2
0.0
3 predito como 3
0.0


# New Concept Run

In [38]:
faults = data_ds3_fault3
X_train2, X_test2, y_train2, y_test2 = train_test_split(faults[:, :-1], faults[:, -1], test_size=1-TRAIN_SIZE, random_state=42, shuffle=True)

In [37]:
'''
Increment Parameters
'''
increments = 10
block_size = int(floor(len(X_train) / increments))
samples = 10000

log = None
if FLUSH_FILE:
    log = open(PATH_OUTPUTS+OUTPUT_FILE_NAME, "a", buffering=1)

for n in knn_p['n_neighbors']:
    for ls in knn_p['leaf_size']:
        for ws in knn_p['window_size']:
            for p in knn_p['p']:
                print('.', end='')

                clf.partial_fit(X_train[:samples], y_train[:samples])

                print('-', end='')

                y_hat = predict(clf, [1,2,3], np.concatenate((X_test, X_test2[:samples])), 20, 2)
                #cm = confusion_matrix(y_test[:samples], clf.predict(X_test[:samples]), normalize='true')
                cm = confusion_matrix(np.concatenate((y_test, y_test2[:samples])), y_hat, normalize='true')
                print(cm)

if FLUSH_FILE:
    log.close()

.-

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


[[0.84051511 0.         0.15948489]
 [0.         0.80565371 0.19434629]
 [0.176      0.         0.824     ]]


In [47]:
y_hat_4 = predict(clf, [1,2,3], data_ds3_t2_normal, 20, 2)

print('4 predito como -1')
print(len(y_hat_4[y_hat_4 == -1]) / len(y_hat_4) * 100)
print('4 predito como 1')
print(len(y_hat_4[y_hat_4 == 1]) / len(y_hat_4) * 100)
print('4 predito como 2')
print(len(y_hat_4[y_hat_4 == 2]) / len(y_hat_4) * 100)
print('4 predito como 3')
print(len(y_hat_4[y_hat_4 == 3]) / len(y_hat_4) * 100)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


4 predito como -1
0.0
4 predito como 1
80.12777777777778
4 predito como 2
0.0
4 predito como 3
19.87222222222222


In [11]:
clf.predict(data_ds3_fault4[10, :-1].reshape(1, 270))

array([3], dtype=int32)

In [25]:
clf.get_distances(data_ds3_fault3[4, :-1], clf.LTMSamples[:1])

array([30.18058421])

In [46]:
labels = np.array(clf.STMLabels)
import collections
collections.Counter(labels)

Counter({2.0: 1283, 1.0: 1326})

In [35]:
metrics = {}


for c in [1,2,3]:
    indexes = np.nonzero(np.where(clf.LTMLabels == c, clf.LTMLabels, 0))

    all_distances_class = []

    for n in np.arange(0, len(indexes[0])):
        sample = clf.LTMSamples[indexes][n]
        distances_sample = clf.get_distances(sample, clf.LTMSamples[indexes])
        all_distances_class = all_distances_class + distances_sample.tolist()

    all_distances_class_np = np.array(all_distances_class)
    metrics[c] = {'mean': np.mean(all_distances_class_np), 'std': np.std(all_distances_class_np)}

In [324]:
# check

data = np.concatenate((data_ds3_fault2, data_ds3_fault3))
#data = data_ds3_fault3

threshold_dist = 10

classes = [1,2,3]

votes = np.zeros((len(data),len(classes)))
mu_dist = np.zeros((len(data),len(classes)))

for k, c in enumerate(classes):
    print('TESTE', c)
    indexes = np.nonzero(np.where(clf.LTMLabels == c, clf.LTMLabels, 0))

    for s in np.arange(0, len(data)):
        dist_sample = clf.get_distances(data[s, :-1], clf.LTMSamples[indexes])
        dist_sample_sort = np.sort(dist_sample)

        if (np.mean(dist_sample_sort[:clf.n_neighbors]) > threshold_dist):
            votes[s,k] = 1 # set high distance

        mu_dist[s,k] = np.mean(dist_sample_sort[:clf.n_neighbors])
        #print(np.mean(dist_sample_sort[:clf.n_neighbors]), np.std(dist_sample_sort[:clf.n_neighbors]), np.min(dist_sample_sort[:clf.n_neighbors]), np.max(dist_sample_sort[:clf.n_neighbors]))

    mu_class = mu_dist[:,k]
    print ('repulse:', mu_class[mu_class > threshold_dist].shape[0]/len(data)*100)

TESTE 1
repulse: 100.0
TESTE 2
repulse: 57.06
TESTE 3
repulse: 60.575


In [293]:
for k, vote in enumerate(votes):
    inliers = np.argwhere((vote == [1.]) == False).reshape(-1).shape[0]

    if (inliers == 0): # (1,1,1)
        print('Indecision')
    else:
        founds = np.argwhere(vote == 0).reshape(-1)

        if len(founds) == 1: # (1,0,1)
            print('Predict', founds[0]+1)
        else: # (1,0,0)
            minor_value = -1
            minor_index = -1

            for f in founds:
                if minor_index == -1:
                    minor_value = mu_dist[k,f] # alterar aqui
                    minor_index = f+1
                elif mu_dist[k,f] < minor_value:
                    minor_value = mu_dist[k,f]
                    minor_index = f+1

            print('Minor is', minor_index)


Indecision
Indecision
Indecision
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Indecision
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Indecision
Predict 3
Predict 3
Indecision
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Indecision
Indecision
Indecision
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Indecision
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Indecision
Indecision
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Indecision
Indecision
Predict 3
Indecision
Predict 3
Indecision
Indecision
Predict 3
Indecision
Predict 3
Indecision
Predict 3
Indecision
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Indecision
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Predict 3
Indecision
Predict 

SyntaxError: invalid syntax (<ipython-input-292-2d8585fa78ef>, line 1)

In [142]:
indexes = np.nonzero(np.where(clf.LTMLabels == 1, clf.LTMLabels, 0))
dist_sample = clf.get_distances(data[1, :-1], clf.LTMSamples[indexes])
dist_sample_sort = np.sort(dist_sample)
np.mean(dist_sample_sort[:clf.n_neighbors])

187.62136618308045

In [143]:
clf.predict(data[2, :-1].reshape(1, 270))

array([3], dtype=int32)