# Initial

In [1]:
import itertools
import os
import random
from math import floor

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from skmultiflow.lazy import SAMKNNClassifier
from skmultiflow.meta import AdaptiveRandomForestClassifier, BatchIncrementalClassifier, ProbabilisticClassifierChain, LearnPPNSEClassifier, LearnPPClassifier, LeveragingBaggingClassifier, MultiOutputLearner, OnlineAdaC2Classifier, OnlineBoostingClassifier, OnlineCSB2Classifier, OnlineRUSBoostClassifier, OnlineSMOTEBaggingClassifier, OnlineUnderOverBaggingClassifier, OzaBaggingClassifier, OzaBaggingADWINClassifier
from skmultiflow.trees import HoeffdingAdaptiveTreeClassifier, HoeffdingTreeClassifier, ExtremelyFastDecisionTreeClassifier, LabelCombinationHoeffdingTreeClassifier


Bad key "text.kerning_factor" on line 4 in
C:\Users\MARCELO\anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


In [2]:
'''
EXTERNAL KERNEL
'''
google_colab = False
kaggle = False

'''
CUDA
'''
cuda = False

'''
DATA REPRESENTATION

1 => SINGLE READ | 2 => ADD FEATURES | 3 => WINDOW TO FEATURES
'''
DATA_REPRESENTATION = 2

'''
DOWNSAMPLE FACTOR

1 => 10hz *original rate* | 2 => 5Hz | 5 => 2Hz | 10 => 1hz
'''
DOWNSAMPLE_FACTOR = 5

'''
WINDOWS LENGHT

* needs divisor by datapoints target
* considering downsample factor = 5

1 => WINDOW DISABLED | 2 => 1 second | 4 => 2 seconds | 10 => 5 seconds | 20 => 10 seconds | 200 => 100 seconds *full flight*
'''
WINDOW_LENGHT =  1


'''
LIMITADOR

Quantity of samples in the execution of the tests.
'''
LIMITADOR = 500

'''
LOSS FACTOR [0,1]

Ignores outliers in calculating the stats of losses in regenerated data.
'''
LOSS_FACTOR = 1

'''
TRAIN_SIZE [0,1]

Percentage of samples to be trained
'''
TRAIN_SIZE = 0.8

'''
OUTPUT_FILE_NAME

File with output results
'''
OUTPUT_FILE_NAME = 'output_adaptive_random_forest_skflow_NOVO_log_dr_' + str(DATA_REPRESENTATION) + '-ts_' + str(TRAIN_SIZE) + '-lf_' + str.replace(str(LOSS_FACTOR), '.', '') + '-limit_' + str(LIMITADOR) + '-wl_' + str(WINDOW_LENGHT) + '.txt'

'''
PATH_OUTPUTS

local : ./outputs/
google colab : /content/drive/My Drive/
'''
if google_colab:
    PATH_OUTPUTS = '/content/drive/My Drive/'
else:
    PATH_OUTPUTS = './outputs/'


'''
PATH_DATASET

'''
PATH_DATASET = '../../dataset/original/'

'''
FLUSH FILE

If output results file is ON
'''
FLUSH_FILE = True

In [3]:
if google_colab:
    !pip install git+https://github.com/online-ml/river --upgrade

    from google.colab import drive

    drive.mount('/content/drive')
    path = '/content/drive/My Drive/ACADÊMICO/MESTRADO/DISSERTAÇÃO/CHAPTERS/5 EXPERIMENTO/dataset/data_representation_1'
    dict_ds_original = {
        'data_ds3_normal_t1_original' : pd.read_csv(path+'/F16_DS3_normal_t1.csv', header=None),
        'data_ds3_normal_t2_original' : pd.read_csv(path+'/F16_DS3_normal_t2.csv', header=None),
        'data_ds3_fault1_original' : pd.read_csv(path+'/F16_DS3_fault1_leakage.csv', header=None),
        'data_ds3_fault2_original' : pd.read_csv(path+'/F16_DS3_fault2_viscousfriction.csv', header=None),
        'data_ds3_fault3_original' : pd.read_csv(path+'/F16_DS3_fault3_compressibility.csv', header=None),
        'data_ds3_fault4_original' : pd.read_csv(path+'/F16_DS3_fault4_fixedposition.csv', header=None),
    }
elif kaggle:
    !conda install -y gdown
    !gdown --id 1G88okIVmdcgLFlmd7rDRhHvHv98yK3UB
    !gdown --id 1fX3utfHMjwKTt7IW4D01bnm-hv88yzrJ
    !gdown --id 1yUG3R5zK2AIxtS9Q4Fk-udkKBZeYShgb
    !gdown --id 1OBRDtuqNEZ-3Z-q0helWh2xGiAxeLACH
    !gdown --id 17oDi60sWYsWHHxzj2aA9m6ARm8zQ81m_
    !gdown --id 1jKEK4s5sYJh8PHtpHeV8ABOsHjuB26RA
else:
    dict_ds_original = {
        'data_ds3_normal_t1_original' : pd.read_csv(PATH_DATASET+'F16_DS3_normal_t1.csv', header=None),
        'data_ds3_normal_t2_original' : pd.read_csv(PATH_DATASET+'F16_DS3_normal_t2.csv', header=None),
        'data_ds3_fault1_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault1_leakage.csv', header=None),
        'data_ds3_fault2_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault2_viscousfriction.csv', header=None),
        'data_ds3_fault3_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault3_compressibility.csv', header=None),
        'data_ds3_fault4_original' : pd.read_csv(PATH_DATASET+'F16_DS3_fault4_fixedposition.csv', header=None),
    }

# Dataset

In [4]:
dict_ds = dict_ds_original.copy()

if dict_ds['data_ds3_normal_t1_original'].shape[0] % DOWNSAMPLE_FACTOR != 0 or dict_ds['data_ds3_fault1_original'].shape[0] % DOWNSAMPLE_FACTOR != 0:
    raise Exception('Needs to be ?shape? divisor')

for n, dataset_name in enumerate(dict_ds):
    dataset = dict_ds[dataset_name].to_numpy()

    downsampled = dataset[::DOWNSAMPLE_FACTOR]

    x, y = downsampled.shape

    # resample
    dict_ds[dataset_name] = pd.DataFrame(downsampled.reshape((int(x/WINDOW_LENGHT),y*WINDOW_LENGHT)))


In [5]:
# ADD COLUMNS WITH DIFF PREVIOUS VALUES

if (DATA_REPRESENTATION == 2):
    frame_size = int(1000/DOWNSAMPLE_FACTOR)

    for n, dataset_name in enumerate(dict_ds):
        dataset = dict_ds[dataset_name].to_numpy()

        dimension = dataset.shape[1]
        samples = dataset.shape[0]

        # GENERATE NEW DIMENSIONS
        dataset = np.concatenate((dataset, np.zeros((samples,dimension))), axis=1)

        for f in np.arange(0,int(samples/frame_size)):
            # OBTAIN THE FRAME FLIGHT
            frame = dataset[f*frame_size:(f+1)*frame_size, 0:dimension]

            # CALCULATE DIFFERENCE
            chunk = np.diff(frame, axis=0)

            # DONT CALCULATE THE DIFFERENCE FOR EACH FIRST TIMESTEP
            chunk = np.insert(chunk, 0, frame[0, 0:dimension], axis=0)

            # UPDATE DATASET WITH NEW FRAME INTO NEW DIMENSIONS
            dataset[f*frame_size:(f+1)*frame_size,dimension:dimension*2] = chunk

        dict_ds[dataset_name] = pd.DataFrame(dataset)



In [6]:
ss = StandardScaler()
#ss = MinMaxScaler()

data_ds3_t1_normal = dict_ds['data_ds3_normal_t1_original'].copy()
data_ds3_t2_normal = dict_ds['data_ds3_normal_t2_original'].copy()
data_ds3_fault1 = dict_ds['data_ds3_fault1_original'].copy()
data_ds3_fault2 = dict_ds['data_ds3_fault2_original'].copy()
data_ds3_fault3 = dict_ds['data_ds3_fault3_original'].copy()
data_ds3_fault4 = dict_ds['data_ds3_fault4_original'].copy()

# fit values
ss.partial_fit(data_ds3_t1_normal)
ss.partial_fit(data_ds3_t2_normal)
ss.partial_fit(data_ds3_fault1)
ss.partial_fit(data_ds3_fault2)
ss.partial_fit(data_ds3_fault3)
ss.partial_fit(data_ds3_fault4)

# transform values
data_ds3_t1_normal = ss.transform(data_ds3_t1_normal)
data_ds3_t2_normal = ss.transform(data_ds3_t2_normal)
data_ds3_fault1 = ss.transform(data_ds3_fault1)
data_ds3_fault2 = ss.transform(data_ds3_fault2)
data_ds3_fault3 = ss.transform(data_ds3_fault3)
data_ds3_fault4 = ss.transform(data_ds3_fault4)

# append normal labels
data_ds3_t1_normal = np.append(data_ds3_t1_normal, np.zeros((data_ds3_t1_normal.shape[0],1)), axis = 1)
data_ds3_t2_normal = np.append(data_ds3_t2_normal, np.zeros((data_ds3_t2_normal.shape[0],1)), axis = 1)

# append fault labels
def generate_fault_label(dataset, fault_label):
    labels = np.array([[fault_label]]*dataset.shape[0])

    return labels

data_ds3_fault1 = np.append(data_ds3_fault1, generate_fault_label(data_ds3_fault1, 1), axis = 1)
data_ds3_fault2 = np.append(data_ds3_fault2, generate_fault_label(data_ds3_fault2, 2), axis = 1)
data_ds3_fault3 = np.append(data_ds3_fault3, generate_fault_label(data_ds3_fault3, 3), axis = 1)
data_ds3_fault4 = np.append(data_ds3_fault4, generate_fault_label(data_ds3_fault4, 4), axis = 1)

# Split

In [7]:
faults = np.concatenate((
    np.append(dict_ds['data_ds3_fault1_original'].copy(), generate_fault_label(dict_ds['data_ds3_fault1_original'], 1), axis = 1),
    np.append(dict_ds['data_ds3_fault2_original'].copy(), generate_fault_label(dict_ds['data_ds3_fault2_original'], 2), axis = 1)
))

faults_shuffled = faults.copy()
np.random.shuffle(faults_shuffled)
folds = np.split(faults_shuffled, 10)

fold = folds[0]
X_train, X_test, y_train, y_test = train_test_split(fold[:, :-1], fold[:, -1], test_size=1-TRAIN_SIZE, random_state=42, shuffle=False)

X_train3, X_test3, y_train3, y_test3 = train_test_split(dict_ds['data_ds3_fault3_original'].copy(), generate_fault_label(dict_ds['data_ds3_fault3_original'], 3).reshape(-1), test_size=1-TRAIN_SIZE, random_state=42, shuffle=False)

X_train4, X_test4, y_train4, y_test4 = train_test_split(dict_ds['data_ds3_fault4_original'].copy(), generate_fault_label(dict_ds['data_ds3_fault4_original'], 4).reshape(-1), test_size=1-TRAIN_SIZE, random_state=42, shuffle=False)

In [8]:
def get_results(data_test, phi):
    output = np.zeros((len(data_test), 5)) # 7: none, f1, f2, f3, f4

    for k, dt in enumerate(data_test):
        a = clf.predict_proba(dt)

        for t in np.arange(0, a.shape[1]):
            output[k, t] = len(a[:,t][a[:,t] >= phi])
            #output[k, t] = len(a[:, t][a[:, t] == True])

    return output

def get_data_reinforce(clf, samples_retrain, fator):
    indexes_retrain = []

    samples = clf.STMSamples
    labels = clf.STMLabels

    for c in np.unique(labels):
        indexes = np.argwhere(labels == c) # retorna indices da classe
        idx = np.array(indexes[np.random.randint(0,len(indexes),int(samples_retrain*fator))])
        indexes_retrain.extend(idx.reshape(-1))

    random.shuffle(indexes_retrain)

    X_retrain = samples[indexes_retrain, :]
    y_retrain = labels[indexes_retrain]

    return np.append(X_retrain, y_retrain.reshape(-1,1), axis=1)

# Run

In [None]:
log = None
if FLUSH_FILE:
    log = open(PATH_OUTPUTS+OUTPUT_FILE_NAME, "a", buffering=1)

phi = 1
samples_train = 10000#500
samples_test = 800#100
samples_retrain = 1000#100

params = {'max_features': ['log2'], #, 0.5, None, 'sqrt'
          'n_estimatores': [100, 1000, 10],
          'leaf_prediction': ['nb', 'mc', 'nba'],
          'split_criterion': ['gini', 'info_gain'],
          'grace_period': [50, 25, 100, 200],
          'performance_metric': ['acc', 'kappa'],
          'phis': [0.1, 0.5, 1, 2]}

'''params = {'max_features': ['sqrt'],
          'n_estimatores': [100],
          'leaf_prediction': ['nb'],
          'split_criterion': ['gini'],
          'grace_period': [100],
          'performance_metric': ['acc'],
          'phis': [1]}'''

for mf in params['max_features']:
    for n in params['n_estimatores']:
        for lp in params['leaf_prediction']:
            for sc in params['split_criterion']:
                for gp in params['grace_period']:
                    for pm in params['performance_metric']:
                        for phi in params['phis']:
                            ss = StandardScaler()

                            clf = AdaptiveRandomForestClassifier(max_features=mf, n_estimators=n, leaf_prediction=lp, split_criterion=sc, grace_period=gp, performance_metric=pm)

                            ss.partial_fit(X_train[:samples_train])
                            clf.partial_fit(ss.transform(X_train[:samples_train]), y_train[:samples_train])

                            print(mf, n, lp, sc, gp, pm, file=log)

                            # TRAIN
                            data_test = np.append(X_test, y_test.reshape(-1, 1), axis=1)

                            X_test_f1 = data_test[data_test[:,-1] == 1][:,:-1]
                            X_test_f2 = data_test[data_test[:,-1] == 2][:,:-1]

                            data_test = [ss.transform(X_test_f1[:samples_test]),
                                          ss.transform(X_test_f2[:samples_test]),
                                          ss.transform(X_test3[:samples_test]),
                                          ss.transform(X_test4[:samples_test]),
                                          ss.transform(dict_ds['data_ds3_normal_t1_original'][:samples_test]),
                                          ss.transform(dict_ds['data_ds3_normal_t2_original'][:samples_test])
                            ]

                            output = get_results(data_test, phi)
                            #print(output)
                            output = output.reshape(30)
                            for o in output:
                                print(o, file=log)

                            print('.', end='')

                            # NEW TRAIN WITH FAULT 3
                            ss.partial_fit(X_train3[:samples_retrain])
                            clf.partial_fit(ss.transform(X_train3[:samples_retrain]), y_train3[:samples_retrain])

                            data_test = [ss.transform(X_test_f1[:samples_test]),
                                          ss.transform(X_test_f2[:samples_test]),
                                          ss.transform(X_test3[:samples_test]),
                                          ss.transform(X_test4[:samples_test]),
                                          ss.transform(dict_ds['data_ds3_normal_t1_original'][:samples_test]),
                                          ss.transform(dict_ds['data_ds3_normal_t2_original'][:samples_test])
                                          ]

                            output = get_results(data_test, phi)
                            #print(output)
                            output = output.reshape(30)
                            for o in output:
                                print(o, file=log)

                            print('.', end='')

                            # NEW TRAIN WITH FAULT 4
                            ss.partial_fit(X_train4[:samples_retrain])
                            clf.partial_fit(ss.transform(X_train4[:samples_retrain]), y_train4[:samples_retrain])

                            data_test = [ss.transform(X_test_f1[:samples_test]),
                                         ss.transform(X_test_f2[:samples_test]),
                                         ss.transform(X_test3[:samples_test]),
                                         ss.transform(X_test4[:samples_test]),
                                         ss.transform(dict_ds['data_ds3_normal_t1_original'][:samples_test]),
                                         ss.transform(dict_ds['data_ds3_normal_t2_original'][:samples_test])
                                         ]

                            output = get_results(data_test, phi)
                            #print(output)
                            output = output.reshape(30)
                            for o in output:
                                print(o, file=log)

                            print('.', end='')

if FLUSH_FILE:
    log.close()

....................................