In [None]:
!pip install -q scikit-multiflow

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from skmultiflow.drift_detection import ADWIN, DDM
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from matplotlib.colors import ListedColormap
from multiprocessing.pool import ThreadPool
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.utils import shuffle
from contextlib import suppress
from collections import Counter
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from random import shuffle
import seaborn as sns
from time import time
import pandas as pd
import numpy as np
import warnings
import scipy.io
import pickle
import sys
import gc
import os
import re

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
data_path = '/content/drive/My Drive/Colab Notebooks/Muawiya/Genetic Programming Combiner with DFS/data'
code_path = '/content/drive/My Drive/Colab Notebooks/Muawiya/Genetic Programming Combiner with DFS/Codes/Shared Codes'
results_path = '/content/drive/My Drive/Colab Notebooks/Muawiya/Genetic Programming Combiner with DFS/results'
feature_selection_results = '/content/drive/My Drive/Colab Notebooks/Muawiya/Genetic Programming Combiner with DFS/feature_selection_results'
feature_selection_results_evolving = '/content/drive/My Drive/Colab Notebooks/Muawiya/Genetic Programming Combiner with DFS/feature_selection_results_evolving'

In [None]:
sys.path.insert(0,code_path)
from genetic_programming import SymbolicRegressor
from binirizer import CustomLabelBinirizer
from ensemble import Ensemble, Classifier
from oselm import OSELMClassifier,set_use_know
from DynamicFeatureSelection import dynamic_feature_selection
from SharedFunctions import prepare_data,train_and_test,feature_evolving,save_pickle,load_pickle,save_object,load_object,generate_new_samples

In [None]:
def genetic_programming():
    return SymbolicRegressor(population_size=10,
            generations=5, stopping_criteria=0.85,
            p_crossover=0.7, p_subtree_mutation=0.1,
            p_hoist_mutation=0.05, p_point_mutation=0.1,
            max_samples=0.7, verbose=1,
            parsimony_coefficient=1e-4, random_state=42,
            function_set=['avg2', 'avg3', 'avg5',
                          'median3', 'median5', 'maximum2', 'maximum3', 'maximum5'],
            metric='f1-score')

In [None]:
def generate_oselm_models(number_of_hidden_neurons, apply_model_replacement=False):
    models= [OSELMClassifier(number_of_hidden_neurons, 'relu', binarizer=CustomLabelBinirizer(), random_state=42),
             OSELMClassifier(number_of_hidden_neurons, 'relu', binarizer=CustomLabelBinirizer(), random_state=42),
             OSELMClassifier(number_of_hidden_neurons, 'relu', binarizer=CustomLabelBinirizer(), random_state=42),
             OSELMClassifier(number_of_hidden_neurons, 'relu', binarizer=CustomLabelBinirizer(), random_state=42),
             ]

    ensemble = Ensemble(classifiers=models, program=genetic_programming(), apply_model_replacement=apply_model_replacement)
    return ensemble

In [None]:
def concept_drift_detection(drift_detection_obj, sample) -> bool:
    """
    Detect concept drift
    :param drift_detection_obj: sklearn drift detection object (ADWIN, DDM, )
    :param smaple : new instanece of data stream
    return True if concept drift was detected otherwise false
    """
    drift_detection_obj.add_element(sample)
    return drift_detection_obj.detected_change()

In [None]:
filenames = ['kddcup99_csv.csv','ISCX2012.csv','CSE-CIC2018.csv','CICIDS2017.csv','7recurrentDrift.csv', 'blip.csv', 'incrementalDrift.csv',
             '7gradualDrift.csv', '7suddenDrift.csv']
filenames = list(map(lambda x: os.path.join(data_path, x), filenames))
filenames

['/content/drive/My Drive/Colab Notebooks/Muawiya/Genetic Programming Combiner with DFS/data/kddcup99_csv.csv',
 '/content/drive/My Drive/Colab Notebooks/Muawiya/Genetic Programming Combiner with DFS/data/ISCX2012.csv',
 '/content/drive/My Drive/Colab Notebooks/Muawiya/Genetic Programming Combiner with DFS/data/CSE-CIC2018.csv',
 '/content/drive/My Drive/Colab Notebooks/Muawiya/Genetic Programming Combiner with DFS/data/CICIDS2017.csv',
 '/content/drive/My Drive/Colab Notebooks/Muawiya/Genetic Programming Combiner with DFS/data/7recurrentDrift.csv',
 '/content/drive/My Drive/Colab Notebooks/Muawiya/Genetic Programming Combiner with DFS/data/blip.csv',
 '/content/drive/My Drive/Colab Notebooks/Muawiya/Genetic Programming Combiner with DFS/data/incrementalDrift.csv',
 '/content/drive/My Drive/Colab Notebooks/Muawiya/Genetic Programming Combiner with DFS/data/7gradualDrift.csv',
 '/content/drive/My Drive/Colab Notebooks/Muawiya/Genetic Programming Combiner with DFS/data/7suddenDrift.csv']

In [None]:
f_name = filenames[0]
datasets = {}
first_chunk=True
data_frame = prepare_data(f_name)
data_frame = data_frame.sample(frac=1, random_state=42)
buffer = data_frame.sample(n=5000)
data_frame.reset_index(inplace=True)
data_frame.replace([np.inf], 0, inplace=True)
data_frame.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,30,31,32,33,34,35,36,37,38,39
0,5121,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.728571,0.0,...,0.636364,0.031373,1.0,0.07,1.0,0.0,0.0,0.0,0.0,1
1,20374,0.0,0.0,7.478457e-07,0.0,0.0,0.0,0.0,0.142857,0.0,...,0.090909,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
2,40016,0.0,0.0,3.768215e-07,0.0,0.0,0.0,0.0,0.142857,0.0,...,0.090909,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
3,5260,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.185714,0.0,...,0.727273,0.019608,0.0,0.03,0.0,0.0,0.0,0.0,0.0,1
4,644,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.185714,0.0,...,0.727273,0.062745,0.0,0.05,0.0,0.0,0.0,0.0,0.0,1


In [None]:
data = data_frame.values
X, Y = data[:, 0:-1], data[:, -1].astype('int')

In [None]:
print("Shape :",Y.shape)
print("Number of Ones :",np.count_nonzero(Y == 1))
print("Number of Zeros : ",len(Y) - np.count_nonzero(Y == 1))
print("Number of None :",np.count_nonzero(np.isnan(Y)))
print("Number of NaN :",np.isnan(Y).sum())

Shape : (48998,)
Number of Ones : 39270
Number of Zeros :  9728
Number of None : 0
Number of NaN : 0


In [20]:
classifiers = [0,1,2,3,4,5,6,7]
classifiers[3:]

[3, 4, 5, 6, 7]

In [18]:
ensemble = generate_oselm_models(number_of_hidden_neurons=X.shape[1]*3 // 2, apply_model_replacement=True)
X, Y = data[:, 0:-1], data[:, -1].astype('int')
chunks_features = np.array_split(X, 10)
chunks_labels = np.array_split(Y, 10)
train_size = 0.80
drift_detection_obj_ddm = DDM()
drift_detection_obj_adwin = ADWIN()
for CN,chunk_X, chunk_Y in tqdm(zip([*range(len(chunks_labels))],chunks_features, chunks_labels)):
  print(75*"=",CN,75*"=")
  print("chunk_X: {} chunk_Y: {}".format(chunk_X.shape,chunk_Y.shape))
  print("Number of Ones :",np.count_nonzero(chunk_Y == 1))
  print("Number of Zeros : ",len(chunk_Y) - np.count_nonzero(chunk_Y == 1))
  print("Number of None :",np.count_nonzero(np.isnan(chunk_Y)))
  print("Number of NaN :",np.isnan(chunk_Y).sum())
  print(100*"*")
  try:
    chunk_X, chunk_Y = SMOTE().fit_resample(chunk_X, chunk_Y)
  except:
    if chunk_Y.sum() in [0, 1]:
      new_samples, new_labels = generate_new_samples(buffer, chunk_Y)
      chunk_X = np.concatenate((chunk_X, new_samples))
      chunk_Y = np.concatenate((chunk_Y, new_labels))
  gc.collect()
  print("chunk_X: {} chunk_Y: {}".format(chunk_X.shape,chunk_Y.shape))
  print("Number of Ones :",np.count_nonzero(chunk_Y == 1))
  print("Number of Zeros : ",len(chunk_Y) - np.count_nonzero(chunk_Y == 1))
  print("Number of None :",np.count_nonzero(np.isnan(chunk_Y)))
  print("Number of NaN :",np.isnan(chunk_Y).sum())
  print(100*"*")
  X_train, X_test, y_train, y_test = chunk_X[:int(chunk_X.shape[0]*train_size)], chunk_X[int(chunk_X.shape[0]*train_size):],chunk_Y[:int(chunk_X.shape[0]*train_size)], chunk_Y[int(chunk_X.shape[0]*train_size):]
  print("X_train: {} X_test: {}".format(X_train.shape,X_test.shape))
  if not ensemble.fitted:
    y_train[0] = 0 if y_train[0] == 1 else 1
    ensemble.fit(X_train[:200], y_train[:200])
  for i in tqdm(range(200, len(X_train))):
        sample, y_true = X_train[i], y_train[i]
        y_pred = ensemble.global_support_degree(sample)
        actual_driftddm = concept_drift_detection(drift_detection_obj_ddm, int(y_true!=y_pred))
        actual_driftadwin = concept_drift_detection(drift_detection_obj_ddm, int(y_true!=y_pred))
        if actual_driftddm:
          print(i,"DDM",actual_driftddm)
        if actual_driftadwin:
          print(i,"ADWIN",actual_driftadwin)

0it [00:00, ?it/s]

chunk_X: (4900, 40) chunk_Y: (4900,)
Number of Ones : 3896
Number of Zeros :  1004
Number of None : 0
Number of NaN : 0
****************************************************************************************************
chunk_X: (7792, 40) chunk_Y: (7792,)
Number of Ones : 3896
Number of Zeros :  3896
Number of None : 0
Number of NaN : 0
****************************************************************************************************
X_train: (6233, 40) X_test: (1559, 40)
    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0   128.70                1       81                1                1      0.63s


  0%|          | 0/6033 [00:00<?, ?it/s]

229 DDM True
273 ADWIN True
361 DDM True
387 ADWIN True
458 DDM True
484 ADWIN True
2269 ADWIN True
2292 DDM True
2337 ADWIN True
2376 DDM True
2408 ADWIN True
2438 DDM True
2607 ADWIN True
2739 ADWIN True
2870 DDM True
2997 ADWIN True
3021 DDM True
3038 ADWIN True
3064 DDM True
3844 DDM True
3992 ADWIN True
5051 DDM True
5097 ADWIN True
5127 DDM True
5142 ADWIN True
5172 DDM True
chunk_X: (4900, 40) chunk_Y: (4900,)
Number of Ones : 3916
Number of Zeros :  984
Number of None : 0
Number of NaN : 0
****************************************************************************************************
chunk_X: (7832, 40) chunk_Y: (7832,)
Number of Ones : 3916
Number of Zeros :  3916
Number of None : 0
Number of NaN : 0
****************************************************************************************************
X_train: (6265, 40) X_test: (1567, 40)


  0%|          | 0/6065 [00:00<?, ?it/s]

5925 ADWIN True
5949 DDM True
chunk_X: (4900, 40) chunk_Y: (4900,)
Number of Ones : 3918
Number of Zeros :  982
Number of None : 0
Number of NaN : 0
****************************************************************************************************
chunk_X: (7836, 40) chunk_Y: (7836,)
Number of Ones : 3918
Number of Zeros :  3918
Number of None : 0
Number of NaN : 0
****************************************************************************************************
X_train: (6268, 40) X_test: (1568, 40)


  0%|          | 0/6068 [00:00<?, ?it/s]

5268 ADWIN True
chunk_X: (4900, 40) chunk_Y: (4900,)
Number of Ones : 3938
Number of Zeros :  962
Number of None : 0
Number of NaN : 0
****************************************************************************************************
chunk_X: (7876, 40) chunk_Y: (7876,)
Number of Ones : 3938
Number of Zeros :  3938
Number of None : 0
Number of NaN : 0
****************************************************************************************************
X_train: (6300, 40) X_test: (1576, 40)


  0%|          | 0/6100 [00:00<?, ?it/s]

chunk_X: (4900, 40) chunk_Y: (4900,)
Number of Ones : 3927
Number of Zeros :  973
Number of None : 0
Number of NaN : 0
****************************************************************************************************
chunk_X: (7854, 40) chunk_Y: (7854,)
Number of Ones : 3927
Number of Zeros :  3927
Number of None : 0
Number of NaN : 0
****************************************************************************************************
X_train: (6283, 40) X_test: (1571, 40)


  0%|          | 0/6083 [00:00<?, ?it/s]

520 DDM True
582 ADWIN True
608 DDM True
5191 DDM True
5229 ADWIN True
5252 DDM True
5354 ADWIN True
5391 DDM True
5616 DDM True
5633 ADWIN True
5707 DDM True
5742 ADWIN True
chunk_X: (4900, 40) chunk_Y: (4900,)
Number of Ones : 3958
Number of Zeros :  942
Number of None : 0
Number of NaN : 0
****************************************************************************************************
chunk_X: (7916, 40) chunk_Y: (7916,)
Number of Ones : 3958
Number of Zeros :  3958
Number of None : 0
Number of NaN : 0
****************************************************************************************************
X_train: (6332, 40) X_test: (1584, 40)


  0%|          | 0/6132 [00:00<?, ?it/s]

5125 DDM True
5183 ADWIN True
5206 DDM True
5225 ADWIN True
5245 DDM True
5307 ADWIN True
5382 DDM True
5411 ADWIN True
5940 DDM True
6053 ADWIN True
6070 DDM True
6105 ADWIN True
6127 DDM True
6146 ADWIN True
6228 DDM True
6282 ADWIN True
chunk_X: (4900, 40) chunk_Y: (4900,)
Number of Ones : 3928
Number of Zeros :  972
Number of None : 0
Number of NaN : 0
****************************************************************************************************
chunk_X: (7856, 40) chunk_Y: (7856,)
Number of Ones : 3928
Number of Zeros :  3928
Number of None : 0
Number of NaN : 0
****************************************************************************************************
X_train: (6284, 40) X_test: (1572, 40)


  0%|          | 0/6084 [00:00<?, ?it/s]

210 DDM True
229 ADWIN True
246 DDM True
281 ADWIN True
4093 ADWIN True
4459 DDM True
4559 ADWIN True
4586 DDM True
5138 ADWIN True
5182 DDM True
5222 ADWIN True
5247 DDM True
5265 ADWIN True
5328 DDM True
5400 ADWIN True
5471 ADWIN True
5519 DDM True
5538 ADWIN True
chunk_X: (4900, 40) chunk_Y: (4900,)
Number of Ones : 3948
Number of Zeros :  952
Number of None : 0
Number of NaN : 0
****************************************************************************************************
chunk_X: (7896, 40) chunk_Y: (7896,)
Number of Ones : 3948
Number of Zeros :  3948
Number of None : 0
Number of NaN : 0
****************************************************************************************************
X_train: (6316, 40) X_test: (1580, 40)


  0%|          | 0/6116 [00:00<?, ?it/s]

5886 DDM True
5916 ADWIN True
5939 DDM True
5993 ADWIN True
6094 ADWIN True
chunk_X: (4899, 40) chunk_Y: (4899,)
Number of Ones : 3946
Number of Zeros :  953
Number of None : 0
Number of NaN : 0
****************************************************************************************************
chunk_X: (7892, 40) chunk_Y: (7892,)
Number of Ones : 3946
Number of Zeros :  3946
Number of None : 0
Number of NaN : 0
****************************************************************************************************
X_train: (6313, 40) X_test: (1579, 40)


  0%|          | 0/6113 [00:00<?, ?it/s]

5587 ADWIN True
chunk_X: (4899, 40) chunk_Y: (4899,)
Number of Ones : 3895
Number of Zeros :  1004
Number of None : 0
Number of NaN : 0
****************************************************************************************************
chunk_X: (7790, 40) chunk_Y: (7790,)
Number of Ones : 3895
Number of Zeros :  3895
Number of None : 0
Number of NaN : 0
****************************************************************************************************
X_train: (6232, 40) X_test: (1558, 40)


  0%|          | 0/6032 [00:00<?, ?it/s]

5312 DDM True
5539 DDM True
5574 ADWIN True
