In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
from scipy.sparse import csr_matrix
import random as rnd
import pickle
import sys


from datetime import datetime

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.ensemble import IsolationForest, RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, auc, confusion_matrix, f1_score, mean_squared_error, plot_confusion_matrix
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.multiclass import OneVsRestClassifier, OutputCodeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import label_binarize
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

### CONSTANTS
# Seed passed as `random_state` to the tree-based classifiers so their fits are repeatable.
SEED = 123
# How many top-ranked features selectFeatures() takes from each model's importance ranking.
NUM_FEATURES = 10
# NOTE(review): not referenced in the visible cells; presumably the train fraction handed
# to the `train_size_pct` parameters -- confirm against the calling cells.
TRAIN_PCT = 0.8

# Depth limit for the RandomForest / ExtraTrees / DecisionTree models (also evaluated at 2x).
MAX_DEPTH = 4
# Iteration cap for the MLP classifiers (also evaluated at 2x).
MAX_ITER = 300
# Neighbour count for the KNN classifiers (also evaluated at 2x and at half, via integer division).
N_NEIGHBORS = 5



In [None]:
# Mount Google Drive at /content/drive/ so the dataset stored there can be read.
# This import only resolves inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
def generate_random_array(median,std,size):
    """
    Build a list of `size` random values arranged symmetrically around `median`.

    Values are produced in mirrored pairs (median + d, median - d) with d drawn
    uniformly from [-std, std]. When `size` is odd, the last slot is filled with
    one extra independent draw from [median - std, median + std].

    median : centre value of the generated numbers
    std    : half-width of the uniform interval (used as a bound, not as a standard deviation)
    size   : number of values to generate
    @return (list) the generated values
    """
    result = [0] * size
    # Walk over the starting index of every complete pair
    for pair_start in range(0, size - 1, 2):
        offset = rnd.uniform(-std, std)
        result[pair_start] = median + offset
        result[pair_start + 1] = median - offset
    if size % 2:
        # Odd length: the unpaired last element gets its own draw
        result[size - 1] = rnd.uniform(median - std, median + std)
    return result



In [None]:
def selectFeatures(x, y, train_size_pct=0.75):
    """
    selectFeatures
        Rank the features with three tree-based models and return the union of
        each model's NUM_FEATURES most important features.

        x : The features of the dataset to be used for predictions (must expose `.columns`, e.g. a pandas DataFrame)
        y : The target class for each row in "x"
        train_size_pct : (default = 0.75) In the range (0.0, 1.0), the ratio by which to split the data for training and testing
        @return (list) The names of the selected features, without duplicates, in order of first
                appearance (RandomForest picks first, then ExtraTrees, then DecisionTree)
    """

    # Create classifiers
    classifier_mapping = {
        "RandomForest" : RandomForestClassifier(max_depth=MAX_DEPTH, criterion='entropy', random_state=SEED),
        "ExtraTrees" : ExtraTreesClassifier(max_depth=MAX_DEPTH, criterion='entropy', random_state=SEED),
        "DecisionTree" : DecisionTreeClassifier(max_depth=MAX_DEPTH, random_state=SEED)
    }

    ### Split the dataset (seeded, so the resulting ranking is reproducible between runs)
    X_train_fs, X_test_fs, Y_train_fs, Y_test_fs = train_test_split(
        x, y, train_size=train_size_pct, random_state=SEED
    )

    # One row per feature; each model contributes one column of importance scores
    feature_df = pd.DataFrame({'features': X_train_fs.columns.values})

    for model_name, model in classifier_mapping.items():
        print(f'[Training] {model_name}')
        start_train = datetime.now()
        model.fit(X_train_fs, Y_train_fs)
        print(">>> Training Time: {}".format(datetime.now() - start_train))
        feature_df[model_name] = model.feature_importances_
        # The score is computed on the held-out split, so it is reported as test accuracy
        model_score = model.score(X_test_fs, Y_test_fs)
        print(f'>>> Test Accuracy : {model_score*100.0}')
        print("")

    ### Grab the nlargest features (by score) from each model and concatenate them into one list
    result = []
    for model_name in classifier_mapping:
        result.extend(feature_df.nlargest(NUM_FEATURES, model_name)['features'].to_list())

    # selected_features contains the ensemble results for best features.
    # dict.fromkeys drops duplicates while keeping first-seen order, so the returned
    # list is the same on every run (a plain set() would iterate in hash order).
    selected_features = list(dict.fromkeys(result))

    return selected_features


    

In [None]:
def evaluateIndividualClassifiers(x, y, train_size_pct):
    """
    evaluateIndividualClassifiers
        Train and evaluate a fixed line-up of classifiers, one after another, via train_test_model.

        x : The features of the dataset to be used for predictions
        y : The target class for each row in "x"
        train_size_pct : {float in the range(0.0, 1.0)} the percentage of the dataset that should be used for training
    """

    # Variants of the base hyper-parameters
    doubled_depth = MAX_DEPTH * 2
    doubled_iter = MAX_ITER * 2
    doubled_neighbors = N_NEIGHBORS * 2
    halved_neighbors = N_NEIGHBORS // 2

    # (display name, estimator) pairs, listed in the order in which they are evaluated
    candidates = [
        (f'RandomForest-{MAX_DEPTH}', RandomForestClassifier(max_depth=MAX_DEPTH, criterion='entropy', random_state=SEED)),
        (f'RandomForest-{doubled_depth}', RandomForestClassifier(max_depth=doubled_depth, criterion='entropy', random_state=SEED)),
        (f'ExtraTrees-{MAX_DEPTH}', ExtraTreesClassifier(max_depth=MAX_DEPTH, criterion='entropy', random_state=SEED)),
        (f'DecisionTree-{MAX_DEPTH}', DecisionTreeClassifier(max_depth=MAX_DEPTH, random_state=SEED)),
        (f'KNeighbors-{N_NEIGHBORS}', KNeighborsClassifier(n_neighbors=N_NEIGHBORS)),
        (f'KNeighbors-{doubled_neighbors}', KNeighborsClassifier(n_neighbors=doubled_neighbors)),
        (f'KNeighbors-{halved_neighbors}', KNeighborsClassifier(n_neighbors=halved_neighbors)),
        (f'MLP-{MAX_ITER}', MLPClassifier(max_iter=MAX_ITER)),
        (f'MLP-{MAX_ITER}-early', MLPClassifier(max_iter=MAX_ITER, early_stopping=True)),
        (f'MLP-{doubled_iter}', MLPClassifier(max_iter=doubled_iter)),
        (f'MLP-{doubled_iter}-early', MLPClassifier(max_iter=doubled_iter, early_stopping=True)),
    ]

    for model_name, model in dict(candidates).items():
        train_test_model(model_name, model, x, y, train_size_pct)


In [None]:
def train_test_model(model_name, model, x, y, train_size_pct):
    """
    train_test_model
        Fit one classifier on a train split, print timing and training accuracy,
        then predict on the held-out split and report the evaluation metrics.

        model_name : Label used in the printed log lines
        model : An unfitted estimator exposing fit / score / predict
        x : The features of the dataset to be used for predictions
        y : The target class for each row in "x"
        train_size_pct : {float in the range(0.0, 1.0)} the fraction of rows used for training
        @return (list) The rounded metrics produced by calculateMetrics for the held-out split
    """

    # Split the data. The split is seeded so that every model is evaluated on the
    # same partition and repeated runs give the same numbers.
    X_train, X_test, Y_train, Y_test = train_test_split(
        x, y, train_size=train_size_pct, random_state=SEED
    )

    # Training
    print(f'\n[Training] {model_name}')
    start_train = datetime.now()
    model.fit(X_train, Y_train)
    print(f'>>> Training time: {datetime.now() - start_train}')

    ### Analyze Training
    train_acc = model.score(X_train, Y_train)
    print(f'>>> Training accuracy: {train_acc}')

    ### Testing
    start_predict = datetime.now()
    y_pred = model.predict(X_test)
    print(f'>>> Testing time: {datetime.now() - start_predict}')

    ### Analyze Testing
    # Hand the metrics back to the caller instead of discarding them
    return calculateMetrics(Y_test, y_pred)



In [None]:
def calculateMetrics(y_test, y_pred):
    """
    calculateMetrics
        Print and return the evaluation metrics for one set of predictions.

        y_test : The true class of each evaluated row
        y_pred : The predicted class of each evaluated row
        @return (list) [accuracy, recall, precision, mse, f1], each rounded to 6 decimals

        Recall and precision are macro-averaged; F1 is support-weighted.
        The MSE is computed directly on the label values, so it needs numeric class codes.
    """
    acc = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average="macro", zero_division=0)
    precision = precision_score(y_test, y_pred, average="macro", zero_division=0)
    mse = mean_squared_error(y_test, y_pred)
    # scikit-learn expects (y_true, y_pred); with the arguments swapped the "weighted"
    # average would be weighted by the predicted class counts instead of the true ones
    f1score = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    print(">>> Metrics")
    print(f'- Accuracy  : {acc}')
    print(f'- Recall    : {recall}')
    print(f'- Precision : {precision}')
    print(f'- MSE       : {mse}')
    print(f'- F1 Score  : {f1score}')

    return [round(acc, 6), round(recall, 6), round(precision, 6), round(mse, 6), round(f1score, 6)]



In [None]:
def label_attacks(x):
  """Keep 'BENIGN' and 'DDoS' as they are; map every other label to 'DoS'."""
  if x == 'BENIGN' or x == 'DDoS':
    return x
  return 'DoS'

def class_selection(x):
  """
  Reduce the label codes of a group of rows to a single class code.

  x : iterable of label codes (0, 1, 2, or 3 when the rows were themselves produced by this function)
  @return (int) 3 if both code 1 and code 2 occur in the group, 1 or 2 if only that
          code occurs, 0 otherwise

  Code 3 ("both seen") is treated as containing 1 and 2, so re-aggregating already
  aggregated rows keeps the attack information instead of falling through to 0.
  """
  # Build the set once: cheaper than rebuilding it per test, and correct for one-shot iterators
  seen = set(x)
  has_one = 1 in seen or 3 in seen
  has_two = 2 in seen or 3 in seen
  if has_one and has_two:
    return 3
  if has_one:
    return 1
  if has_two:
    return 2
  return 0


In [None]:
# Load the combined flow CSV from the mounted Drive and report its dimensions.
traffic = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/TrafficLabelling/combined.csv")
print(f'Number of Rows: {traffic.shape[0]}')
print(f'Number of Columns: {traffic.shape[1]}')
# Do not truncate long cell values when frames are displayed.
pd.set_option('display.max_colwidth', None)
traffic.head()

Number of Rows: 2660377
Number of Columns: 85


Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,...,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Fwd Avg Bytes/Bulk,Fwd Avg Packets/Bulk,Fwd Avg Bulk Rate,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,192.168.10.5-104.16.207.165-54865-443-6,104.16.207.165,443,192.168.10.5,54865,6,7/7/2017 3:30,3,2,0,12.0,0.0,6.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,4000000.0,666666.6667,3.0,0.0,3.0,3.0,3.0,3.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,...,6.0,6.0,0.0,0.0,0,0,0,0,1,0,0,0,0.0,9.0,6.0,0.0,40,0,0,0,0,0,0,2,12,0,0,33,-1,1,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
1,192.168.10.5-104.16.28.216-55054-80-6,104.16.28.216,80,192.168.10.5,55054,6,7/7/2017 3:30,109,1,1,6.0,6.0,6.0,6.0,6.0,0.0,6.0,6.0,6.0,0.0,110091.7,18348.62385,109.0,0.0,109.0,109.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,...,6.0,6.0,0.0,0.0,0,0,0,0,1,1,0,0,1.0,9.0,6.0,6.0,20,0,0,0,0,0,0,1,6,1,6,29,256,0,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
2,192.168.10.5-104.16.28.216-55055-80-6,104.16.28.216,80,192.168.10.5,55055,6,7/7/2017 3:30,52,1,1,6.0,6.0,6.0,6.0,6.0,0.0,6.0,6.0,6.0,0.0,230769.2,38461.53846,52.0,0.0,52.0,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,...,6.0,6.0,0.0,0.0,0,0,0,0,1,1,0,0,1.0,9.0,6.0,6.0,20,0,0,0,0,0,0,1,6,1,6,29,256,0,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
3,192.168.10.16-104.17.241.25-46236-443-6,104.17.241.25,443,192.168.10.16,46236,6,7/7/2017 3:30,34,1,1,6.0,6.0,6.0,6.0,6.0,0.0,6.0,6.0,6.0,0.0,352941.2,58823.52941,34.0,0.0,34.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,...,6.0,6.0,0.0,0.0,0,0,0,0,1,1,0,0,1.0,9.0,6.0,6.0,20,0,0,0,0,0,0,1,6,1,6,31,329,0,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
4,192.168.10.5-104.19.196.102-54863-443-6,104.19.196.102,443,192.168.10.5,54863,6,7/7/2017 3:30,3,2,0,12.0,0.0,6.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,4000000.0,666666.6667,3.0,0.0,3.0,3.0,3.0,3.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,...,6.0,6.0,0.0,0.0,0,0,0,0,1,0,0,0,0.0,9.0,6.0,0.0,40,0,0,0,0,0,0,2,12,0,0,32,-1,1,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN


In [None]:
# Keep only the first occurrence of every fully identical row, then discard the
# timestamp column (it is dropped here rather than converted to Unix time)
traffic = traffic.drop_duplicates().drop(columns=[' Timestamp'])

# Drop constant columns: keep a column unless it has exactly one distinct non-null value
traffic = traffic.loc[:, traffic.nunique() != 1]



In [None]:

# Remove every row that contains NaN or +/-Inf in any column.
# The axis is passed by keyword because pandas 2.x made the arguments of
# DataFrame.any keyword-only, so the positional form `.any(1)` raises a TypeError there.
traffic = traffic[~traffic.isin([np.nan, np.inf, -np.inf]).any(axis=1)]

# remove Flow ID
traffic = traffic.drop(columns=['Flow ID'])




In [None]:
# reformat Label
# The distinct labels are captured BEFORE the filter below, so on a fresh run this
# set still contains the classes that are about to be removed
raw_labels = set(traffic[' Label'])

# Drop every row that carries one of these labels
traffic = traffic[~traffic[' Label'].isin([
    "SSH-Patator",
    "PortScan",
    "Heartbleed",
    "FTP-Patator",
    "Infiltration",
    "Bot",
])]
raw_labels


{'BENIGN',
 'DDoS',
 'DoS GoldenEye',
 'DoS Hulk',
 'DoS Slowhttptest',
 'DoS slowloris'}

In [None]:

# Collapse every label other than BENIGN / DDoS into the single class 'DoS'
traffic[' Label'] = traffic[' Label'].apply(label_attacks)

# Kept as a plain list because the name is notebook-level state that other cells may read
raw_labels = list(traffic[' Label'])

# Per-class row counts in a single vectorised pass (largest class first),
# instead of one full list scan per class
for raw_label, n_rows in traffic[' Label'].value_counts().items():
  print(str(raw_label) + ": " + str(n_rows))

# Encode with an explicit category order so that the integer codes tested by
# class_selection (BENIGN=0, DDoS=1, DoS=2) cannot shift when a class is absent from the data
traffic[' Label'] = traffic[' Label'].astype(
    pd.CategoricalDtype(categories=['BENIGN', 'DDoS', 'DoS'])
).cat.codes

traffic[' Label']

DDoS: 128025
BENIGN: 2103072
DoS: 251711


In [None]:
# Snapshot of the column names, taken before `group_size` is added further down,
# so the aggregation spec built from this list does not include `group_size`.
columns = list(traffic.columns)


In [None]:
# create ip address dict {ip_addr: node_id}
src_ips = set(traffic[' Source IP'])
dst_ips = set(traffic[' Destination IP'])
# Sorted so that every address gets the same node id on every run; iterating a set of
# strings directly follows hash order, which differs between interpreter sessions
ips = sorted(src_ips | dst_ips)

ips_dict = {ip_addr: node_id for node_id, ip_addr in enumerate(ips)}   # dictionary of ip addresses { ip_addr: node_id }
print("Number of nodes: " + str(len(ips_dict)))


Number of nodes: 18598


In [None]:
# adding a group_size column to the original dataframe:
# for each row, the number of rows that share its source IP and have a non-null destination IP
traffic['group_size'] = traffic.groupby([' Source IP'])[' Destination IP'].transform("count")
# NOTE(review): print() renders the frame as truncated plain text rather than the rich table display
print(traffic)

              Source IP   Source Port  ...  Label  group_size
0        104.16.207.165           443  ...      0          29
1         104.16.28.216            80  ...      0         114
2         104.16.28.216            80  ...      0         114
3         104.17.241.25           443  ...      0           2
4        104.19.196.102           443  ...      0         120
...                 ...           ...  ...    ...         ...
2660372   192.168.10.14         51114  ...      0      129266
2660373   192.168.10.16         24054  ...      0      114034
2660374  23.208.163.130           443  ...      0          51
2660375   192.168.10.14         51694  ...      0      129266
2660376   192.168.10.14         57949  ...      0      129266

[2482808 rows x 76 columns]


In [None]:
# define aggregation functions for all columns:
# every column is averaged, except the flow identifiers (which are left out entirely)
# and the label, which is appended last and reduced with class_selection
aggregations = {
    col_name: "mean"
    for col_name in columns
    if col_name not in {' Label', ' Source IP', ' Source Port', ' Destination IP', ' Destination Port', ' Protocol'}
}
aggregations[' Label'] = lambda x: class_selection(x)
print(aggregations)


{' Flow Duration': 'mean', ' Total Fwd Packets': 'mean', ' Total Backward Packets': 'mean', 'Total Length of Fwd Packets': 'mean', ' Total Length of Bwd Packets': 'mean', ' Fwd Packet Length Max': 'mean', ' Fwd Packet Length Min': 'mean', ' Fwd Packet Length Mean': 'mean', ' Fwd Packet Length Std': 'mean', 'Bwd Packet Length Max': 'mean', ' Bwd Packet Length Min': 'mean', ' Bwd Packet Length Mean': 'mean', ' Bwd Packet Length Std': 'mean', 'Flow Bytes/s': 'mean', ' Flow Packets/s': 'mean', ' Flow IAT Mean': 'mean', ' Flow IAT Std': 'mean', ' Flow IAT Max': 'mean', ' Flow IAT Min': 'mean', 'Fwd IAT Total': 'mean', ' Fwd IAT Mean': 'mean', ' Fwd IAT Std': 'mean', ' Fwd IAT Max': 'mean', ' Fwd IAT Min': 'mean', 'Bwd IAT Total': 'mean', ' Bwd IAT Mean': 'mean', ' Bwd IAT Std': 'mean', ' Bwd IAT Max': 'mean', ' Bwd IAT Min': 'mean', 'Fwd PSH Flags': 'mean', ' Fwd URG Flags': 'mean', ' Fwd Header Length': 'mean', ' Bwd Header Length': 'mean', 'Fwd Packets/s': 'mean', ' Bwd Packets/s': 'mean'

In [None]:
# low activity grouping:
# rows with label code 0 whose source IP has a group_size below 10, aggregated per source IP
low_activities = (
    traffic[(traffic['group_size'] < 10) & (traffic[' Label'] == 0)]
    .groupby(' Source IP', as_index=False)
    .agg(aggregations)
)
low_activities



Unnamed: 0,Source IP,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Fwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0.0.0.0,3.668571e+07,2.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000e+00,0.054517,3.670000e+07,0.000000,3.670000e+07,3.670000e+07,3.670000e+07,3.670000e+07,0.000000,3.670000e+07,3.670000e+07,0.0,0.0,0.00000,0.0,0.0,0.0,0,0.000000,0.000000,0.054517,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0,0.0,0.0,0.00,0,0,0.0,0.000000,0.0,0.0,0.000000,2.000000,0.0,0.000000,0.0,-1.00,-1.00,0.000000,0.0,0.0,0.0,0.0,0.0,36700000.0,0.0,36700000.0,36700000.0,0
1,1.1.70.73,4.511675e+04,2.000000,0.250000,3.0,0.0,1.5,1.5,1.5,0.000000,0.0,0.0,0.0,0.0,8.720930e+03,4244.011538,2.269288e+04,31693.763375,4.510375e+04,2.820000e+02,4.511675e+04,4.511675e+04,0.000000,4.511675e+04,4.511675e+04,0.0,0.0,0.00000,0.0,0.0,0.0,0,58.000000,8.000000,4242.617935,1.393604,1.5,1.5,1.5,0.000000,0.0,0.0,0.0,0,0.0,1.0,0.25,0,0,0.0,2.250000,1.5,0.0,58.000000,2.000000,3.0,0.250000,0.0,4454.25,59.25,0.250000,29.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,1.193.219.21,6.480000e+02,3.000000,0.000000,258.0,0.0,258.0,0.0,86.0,148.956369,0.0,0.0,0.0,0.0,3.981481e+05,4629.629630,3.240000e+02,452.548340,6.440000e+02,4.000000e+00,6.480000e+02,3.240000e+02,452.548340,6.440000e+02,4.000000e+00,0.0,0.0,0.00000,0.0,0.0,0.0,0,96.000000,0.000000,4629.629630,0.000000,0.0,258.0,64.5,129.000000,16641.0,0.0,0.0,0,0.0,1.0,0.00,0,0,0.0,86.000000,86.0,0.0,96.000000,3.000000,258.0,0.000000,0.0,61.00,-1.00,1.000000,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,1.193.219.24,1.661294e+05,2.000000,0.800000,258.0,0.0,258.0,0.0,129.0,182.433549,0.0,0.0,0.0,0.0,4.301911e+07,333556.025118,4.166945e+04,83140.944944,1.660928e+05,1.680000e+01,1.661180e+05,1.661180e+05,0.000000,1.661180e+05,1.661180e+05,165574.8,82787.4,117078.21534,165574.2,0.6,0.8,0,64.000000,16.000000,333481.472436,74.552682,0.0,258.0,137.6,140.230707,19969.2,0.0,0.8,0,0.0,1.0,0.00,0,0,0.2,199.520000,129.0,0.0,64.000000,2.000000,258.0,0.800000,0.0,65.00,-0.60,0.200000,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,1.226.51.14,4.217540e+04,1.400000,0.600000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000e+00,15641.023359,4.217540e+04,0.000000,4.217540e+04,4.217540e+04,4.212860e+04,4.212860e+04,0.000000,4.212860e+04,4.212860e+04,0.0,0.0,0.00000,0.0,0.0,0.0,0,44.800000,19.200000,7824.395543,7816.627816,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0,0.0,1.0,0.60,0,0,0.6,0.000000,0.0,0.0,44.800000,1.400000,0.0,0.600000,0.0,155.20,866.00,0.000000,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11181,98.158.54.105,2.570000e+02,3.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000e+00,11673.151750,1.285000e+02,176.069589,2.530000e+02,4.000000e+00,2.570000e+02,1.285000e+02,176.069589,2.530000e+02,4.000000e+00,0.0,0.0,0.00000,0.0,0.0,0.0,0,96.000000,0.000000,11673.151750,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0,0.0,1.0,0.00,0,0,0.0,0.000000,0.0,0.0,96.000000,3.000000,0.0,0.000000,0.0,62512.00,-1.00,0.000000,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
11182,98.158.58.110,4.627100e+04,2.000000,0.500000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000e+00,333349.542732,2.313625e+04,32679.293445,4.624400e+04,2.850000e+01,4.627100e+04,4.627100e+04,0.000000,4.627100e+04,4.627100e+04,0.0,0.0,0.00000,0.0,0.0,0.0,0,64.000000,16.000000,333344.139605,5.403127,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0,0.0,1.0,0.50,0,0,0.0,0.000000,0.0,0.0,64.000000,2.000000,0.0,0.500000,0.0,32806.00,636.00,0.000000,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
11183,98.158.62.102,4.732533e+04,2.333333,0.000000,14.0,0.0,6.0,6.0,6.0,0.000000,0.0,0.0,0.0,0.0,1.351724e+06,225287.380623,2.369950e+04,33410.559709,4.732433e+04,7.466667e+01,4.732533e+04,2.369950e+04,33410.559709,4.732433e+04,7.466667e+01,0.0,0.0,0.00000,0.0,0.0,0.0,0,46.666667,0.000000,225287.380623,0.000000,6.0,6.0,6.0,0.000000,0.0,0.0,0.0,0,0.0,1.0,0.00,0,0,0.0,8.666667,6.0,0.0,46.666667,2.333333,14.0,0.000000,0.0,62506.00,-1.00,1.333333,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
11184,98.158.62.103,1.156000e+03,2.333333,0.333333,14.0,2.0,6.0,6.0,6.0,0.000000,2.0,2.0,2.0,0.0,1.361991e+06,226998.538384,1.091111e+03,42.673611,1.140000e+03,1.061333e+03,1.156000e+03,1.107333e+03,46.197643,1.140000e+03,1.074667e+03,0.0,0.0,0.00000,0.0,0.0,0.0,0,46.666667,6.666667,225856.985870,1141.552511,6.0,6.0,6.0,0.000000,0.0,0.0,0.0,0,0.0,1.0,0.00,0,0,0.0,8.500000,6.0,2.0,46.666667,2.333333,14.0,0.333333,2.0,62506.00,84.00,1.333333,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [None]:
# attackers grouping
# Keeps every row whose label code is non-zero. The rows stay un-aggregated because
# the per-source-IP aggregation on the next line is commented out.
atk_activities = traffic[traffic[' Label'] != 0]
# atk_activities = atk_activities.groupby(' Source IP', as_index=False).agg(aggregations)
atk_activities


Unnamed: 0,Source IP,Source Port,Destination IP,Destination Port,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Fwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,group_size
18883,172.16.0.1,49650,192.168.10.50,80,6,1293792,3,7,26.0,11607.0,20.0,0.0,8.666667,10.263203,5840.0,0.0,1658.142857,2137.297080,8991.398927,7.729218,1.437547e+05,4.308658e+05,1292730.0,2.0,747.0,3.735000e+02,5.239661e+02,744.0,3.0,1293746.0,2.156243e+05,5.276719e+05,1292730.0,2.0,0,0,72,152,2.318765,5.410452,0.0,5840.0,1057.545455,1853.437529,3.435231e+06,0,0,0,1,0,0,0,0,2.0,1163.300000,8.666667,1658.142857,72,3,26,7,11607,8192,229,2,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,383301
18884,172.16.0.1,49650,192.168.10.50,80,6,4421382,4,0,24.0,0.0,6.0,6.0,6.000000,0.000000,0.0,0.0,0.000000,0.000000,5.428167,0.904695,1.473794e+06,2.552043e+06,4420639.0,340.0,4421382.0,1.473794e+06,2.552043e+06,4420639.0,340.0,0.0,0.000000e+00,0.000000e+00,0.0,0.0,0,0,80,0,0.904695,0.000000,6.0,6.0,6.000000,0.000000,0.000000e+00,0,0,0,0,1,0,0,0,0.0,7.500000,6.000000,0.000000,80,4,24,0,0,256,-1,3,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,383301
18885,172.16.0.1,51684,192.168.10.50,80,6,1083538,3,6,26.0,11601.0,20.0,0.0,8.666667,10.263203,4380.0,0.0,1933.500000,1757.789948,10730.588130,8.306123,1.354422e+05,3.777253e+05,1070206.0,41.0,12982.0,6.491000e+03,8.165669e+03,12265.0,717.0,1083407.0,2.166814e+05,4.771671e+05,1070206.0,41.0,0,0,72,132,2.768708,5.537415,0.0,4380.0,1162.700000,1645.241762,2.706820e+06,0,0,0,1,0,0,0,0,2.0,1291.888889,8.666667,1933.500000,72,3,26,6,11601,8192,229,2,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,383301
18886,172.16.0.1,51684,192.168.10.50,80,6,80034360,8,4,56.0,11601.0,20.0,0.0,7.000000,5.656854,8760.0,0.0,2900.250000,4128.319301,145.649943,0.149936,7.275851e+06,2.110000e+07,70600000.0,6.0,78700000.0,1.120000e+07,2.630000e+07,70600000.0,247.0,1354806.0,4.516020e+05,7.775753e+05,1349464.0,190.0,0,0,172,92,0.099957,0.049979,0.0,8760.0,897.153846,2488.507044,6.192667e+06,0,0,0,0,1,0,0,0,0.0,971.916667,7.000000,2900.250000,172,8,56,4,11601,256,229,6,20,939.0,0.0,939.0,939.0,39300000.0,44200000.0,70600000.0,8072664.0,1,383301
18887,172.16.0.1,51686,192.168.10.50,80,6,642654,3,6,26.0,11607.0,20.0,0.0,8.666667,10.263203,5840.0,0.0,1934.500000,2538.919278,18101.497850,14.004425,8.033175e+04,2.268247e+05,641694.0,3.0,801.0,4.005000e+02,5.409367e+02,783.0,18.0,642493.0,1.284986e+05,2.868851e+05,641694.0,3.0,0,0,72,132,4.668142,9.336284,0.0,5840.0,1163.300000,2138.329153,4.572452e+06,0,0,0,1,0,0,0,0,2.0,1292.555556,8.666667,1934.500000,72,3,26,6,11607,8192,229,2,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,383301
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2660280,172.16.0.1,45746,192.168.10.50,80,6,11512204,8,5,326.0,11632.0,326.0,0.0,40.750000,115.258405,10184.0,0.0,2326.400000,4437.056321,1038.723775,1.129236,9.593503e+05,2.262616e+06,6507197.0,1.0,6510770.0,9.301100e+05,2.460477e+06,6509948.0,1.0,11500000.0,2.878040e+06,3.378874e+06,6507197.0,196.0,0,0,264,168,0.694915,0.434322,0.0,10184.0,854.142857,2713.257986,7.361769e+06,0,0,0,1,0,0,0,0,0.0,919.846154,40.750000,2326.400000,264,8,326,5,11632,29200,235,1,32,892.0,0.0,892.0,892.0,6507197.0,0.0,6507197.0,6507197.0,2,383301
2660287,172.16.0.1,45744,192.168.10.50,80,6,11513325,5,5,471.0,3525.0,471.0,0.0,94.200000,210.637604,2077.0,0.0,705.000000,990.644740,347.076105,0.868559,1.279258e+06,2.565153e+06,6508582.0,53.0,6512631.0,1.628158e+06,3.255639e+06,6511616.0,176.0,11500000.0,2.878318e+06,3.379306e+06,6508582.0,192.0,0,0,168,168,0.434279,0.434279,0.0,2077.0,363.272727,719.780257,5.180836e+05,0,0,0,1,0,0,0,0,1.0,399.600000,94.200000,705.000000,168,5,471,5,3525,29200,235,1,32,918.0,0.0,918.0,918.0,6508582.0,0.0,6508582.0,6508582.0,2,383301
2660326,172.16.0.1,45762,192.168.10.50,80,6,11509201,7,6,314.0,11632.0,314.0,0.0,44.857143,118.680845,5792.0,0.0,1938.666667,2544.918912,1037.952157,1.129531,9.591001e+05,2.261842e+06,6503248.0,46.0,6507056.0,1.084509e+06,2.655701e+06,6505437.0,255.0,11500000.0,2.301831e+06,3.195689e+06,6503248.0,47.0,0,0,232,200,0.608209,0.521322,0.0,5792.0,853.285714,1857.187221,3.449144e+06,0,0,0,1,0,0,0,0,0.0,918.923077,44.857143,1938.666667,232,7,314,6,11632,29200,235,1,32,899.0,0.0,899.0,899.0,6503248.0,0.0,6503248.0,6503248.0,2,383301
2660355,172.16.0.1,45758,192.168.10.50,80,6,11509095,8,5,369.0,11632.0,369.0,0.0,46.125000,130.461201,10184.0,0.0,2326.400000,4437.056321,1042.740546,1.129541,9.590912e+05,2.262122e+06,6504954.0,47.0,6507794.0,9.296849e+05,2.458891e+06,6505926.0,168.0,11500000.0,2.877262e+06,3.377879e+06,6504954.0,196.0,0,0,264,168,0.695102,0.434439,0.0,10184.0,857.214286,2712.638403,7.358407e+06,0,0,0,1,0,0,0,0,0.0,923.153846,46.125000,2326.400000,264,8,369,5,11632,29200,235,1,32,914.0,0.0,914.0,914.0,6504954.0,0.0,6504954.0,6504954.0,2,383301


In [None]:
# One output row per 5-tuple (source IP/port, destination IP/port, protocol),
# with the remaining columns reduced according to `aggregations`
basic_flows = (
    traffic
    .groupby([' Source IP', ' Source Port', ' Destination IP', ' Destination Port', ' Protocol'], as_index=False)
    .agg(aggregations)
)
basic_flows

Unnamed: 0,Source IP,Source Port,Destination IP,Destination Port,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Fwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,is Attacker?
0,0.0.0.0,0,224.0.0.22,0,0,36685713.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.054517,36700000.0,0.0000,36700000.0,36700000.0,36700000.0,36700000.0,0.0,36700000.0,36700000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.054517,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,36700000.0,0.0,36700000.0,36700000.0,0
1,1.1.70.73,80,192.168.10.15,52772,6,344.0,2.0,0.0,12.0,0.0,6.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,34883.720930,5813.953488,344.0,0.0000,344.0,344.0,344.0,344.0,0.0,344.0,344.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,5813.953488,0.000000,6.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,9.0,6.0,0.0,40.0,2.0,12.0,0.0,0.0,4380.0,-1.0,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,1.1.70.73,80,192.168.10.17,48301,6,315.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,6349.206349,315.0,0.0000,315.0,315.0,315.0,315.0,0.0,315.0,315.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,0.0,6349.206349,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,2.0,0.0,0.0,0.0,4380.0,-1.0,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,1.1.70.73,80,192.168.10.19,35264,6,179391.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,16.723247,89695.5,126775.0535,179339.0,52.0,179391.0,179391.0,0.0,179391.0,179391.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,32.0,11.148831,5.574416,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,2.0,0.0,1.0,0.0,4677.0,240.0,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,1.1.70.73,80,192.168.10.19,35275,6,417.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,4796.163070,417.0,0.0000,417.0,417.0,417.0,417.0,0.0,417.0,417.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,0.0,4796.163070,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,2.0,0.0,0.0,0.0,4380.0,-1.0,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1382955,99.192.248.32,80,192.168.10.5,49372,6,55.0,1.0,1.0,6.0,6.0,6.0,6.0,6.0,0.0,6.0,6.0,6.0,0.0,218181.818182,36363.636364,55.0,0.0000,55.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,20.0,18181.818182,18181.818182,6.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,9.0,6.0,6.0,20.0,1.0,6.0,1.0,6.0,46.0,256.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1382956,99.192.248.32,80,192.168.10.5,49419,6,45.0,1.0,1.0,6.0,6.0,6.0,6.0,6.0,0.0,6.0,6.0,6.0,0.0,266666.666667,44444.444444,45.0,0.0000,45.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,20.0,22222.222222,22222.222222,6.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,9.0,6.0,6.0,20.0,1.0,6.0,1.0,6.0,46.0,256.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1382957,99.192.248.32,80,192.168.10.8,2244,6,80.0,1.0,1.0,6.0,6.0,6.0,6.0,6.0,0.0,6.0,6.0,6.0,0.0,150000.000000,25000.000000,80.0,0.0000,80.0,80.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,20.0,12500.000000,12500.000000,6.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,9.0,6.0,6.0,20.0,1.0,6.0,1.0,6.0,46.0,16425.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1382958,99.192.248.32,80,192.168.10.9,4041,6,66.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,30303.030300,66.0,0.0000,66.0,66.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,20.0,15151.515150,15151.515150,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,20.0,1.0,0.0,1.0,0.0,46.0,256.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [None]:
# Collapse the flow-level records into one aggregated row per source address,
# using the per-column reducers listed in `aggregations`.
activity_flows = (
    basic_flows
    .groupby([' Source IP'], as_index=False)
    .agg(aggregations)
)

# reformat Source IP: substitute each address with the value ips_dict holds for it
activity_flows = activity_flows.replace({" Source IP": ips_dict})
print(activity_flows)

# Boolean masks select the same rows (with the same index) as
# groupby([...]).get_group(k), but yield an empty frame instead of raising
# KeyError when a class is absent, and do not depend on how get_group
# interprets a scalar key for a length-1 list grouper.
atkers = activity_flows[activity_flows[' is Attacker?'] == 1]
non_atkers = activity_flows[activity_flows[' is Attacker?'] == 0]
print("Number of attackers: " + str(len(atkers)))
print("Number of non_atkers " + str(len(non_atkers)))



        Source IP   Flow Duration  ...    Idle Min   is Attacker?
0           13147    3.668571e+07  ...  36700000.0              0
1             696    4.511675e+04  ...         0.0              0
2           15131    6.480000e+02  ...         0.0              0
3            1817    1.038322e+05  ...         0.0              0
4           13681    4.217540e+04  ...         0.0              0
...           ...             ...  ...         ...            ...
16508       13431    4.627100e+04  ...         0.0              0
16509        9872    4.732533e+04  ...         0.0              0
16510        2737    1.156000e+03  ...         0.0              0
16511       10465    8.496417e+03  ...         0.0              0
16512       10578    3.000000e+00  ...         0.0              0

[16513 rows x 71 columns]
Number of attackers: 10
Number of non_atkers 16503


In [None]:
# create low profiles dataframe
# Builds synthetic, non-attacker rows for the ids counted in ips_dict that
# have no aggregated row in activity_flows.
LOW_PROFILE_RATIO = 0.98   #### adjust benign/attackers ratio here

# generate_random_array draws from the stdlib `random` module; seeding it here
# makes the synthetic profiles reproducible on Restart & Run All.
rnd.seed(SEED)

num_ips = len(ips_dict)
num_src_ips = len(activity_flows[' Source IP'])
num_noflow_ips = num_ips - num_src_ips

# NOTE(review): the displayed result of this cell has no ' Idle Min' column
# although activity_flows does, so the combined table ends up with empty
# ' Idle Min' cells for synthetic rows -- confirm the -3 bound is intended.
no_flow_prof_columns = columns[5:-3]
print(no_flow_prof_columns)
num_low_ips = round(LOW_PROFILE_RATIO * num_noflow_ips)

# One column of samples per feature, centred on that feature's median in
# low_activities and spread by its standard deviation.
# NOTE(review): the samples can be negative even for count/length features
# (see the cell output) -- confirm downstream code tolerates that.
low_activities_prof = np.zeros((num_low_ips, len(no_flow_prof_columns)))
for col, attr in enumerate(no_flow_prof_columns):
  median = low_activities[attr].median()
  std = np.std(low_activities[attr])
  low_activities_prof[:, col] = generate_random_array(median, std, num_low_ips)


low_profile_df = pd.DataFrame(low_activities_prof, columns = no_flow_prof_columns)
low_profile_df[' is Attacker?'] = [0] * num_low_ips
# Synthetic ids continue right after num_src_ips.
# NOTE(review): assumes ids 0..num_src_ips-1 are exactly the ones already
# present in activity_flows -- TODO confirm against how ips_dict is built.
low_profile_df[' Source IP'] = list(range(num_src_ips, num_src_ips + num_low_ips))
low_profile_df



[' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets', 'Total Length of Fwd Packets', ' Total Length of Bwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', 'Bwd Packet Length Max', ' Bwd Packet Length Min', ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Fwd URG Flags', ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', 'FIN Flag Count', ' SYN Flag Count', ' RST Flag Count', ' PSH Flag Count', ' ACK Flag Count', ' URG Flag Count', ' CWE Flag Count', ' ECE Flag Count', ' Down/Up R

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Fwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,is Attacker?,Source IP
0,1.047934e+06,0.811198,0.812650,1257.732190,-7.056445,25.470382,-1.410163,71.427614,-57.417110,15.868582,17.088212,-7.611809,-0.624601,-4.357612e+06,172554.807577,4578.762898,-610657.561377,9.760254e+05,-159422.567213,1.251354e+06,-437694.401235,425387.195188,-8.714976e+05,245353.207220,-87730.494981,-5094.169730,-33459.600727,7505.672936,-83325.958679,0.153809,0.0,58.979283,20.623185,170619.267238,-11740.012343,3.537292,-30.366054,19.622352,-34.586457,-5205.164619,0.032888,-0.294953,0.0,-0.004518,0.961313,0.641360,0.0,0.0,0.618562,76.069043,83.459744,-5.606146,36.328680,0.616589,88.415473,0.807195,-5.203535,-514.860883,5576.247938,0.600494,22.769483,-10909.657533,2228.925655,60244.540085,-28024.279072,1.085312e+06,43671.917274,-113000.204283,0,16513
1,-1.047780e+06,2.188802,0.687350,-1239.732190,10.056445,-13.470382,7.410163,-59.427614,57.417110,-12.868582,-14.088212,10.611809,0.624601,4.957612e+06,-18708.653737,-4444.762898,610657.561377,-9.758774e+05,159513.900547,-1.251346e+06,437701.401235,-425387.195188,8.715056e+05,-245348.064363,87730.494981,5094.169730,33459.600727,-7505.672936,83325.958679,-0.153809,0.0,17.020717,16.376815,-87141.006368,31457.274248,2.462708,42.366054,-7.622352,34.586457,5205.164619,-0.032888,0.294953,0.0,0.004518,1.038687,0.358640,0.0,0.0,0.381438,-58.069043,-71.459744,8.606146,39.671320,2.383411,-70.415473,0.692805,8.203535,764.860883,-5149.914605,-0.267161,17.230517,10909.657533,-2228.925655,-60244.540085,28024.279072,-1.085312e+06,-43671.917274,113000.204283,0,16514
2,7.455062e+05,2.098958,0.937870,830.096544,8.362910,87.083975,14.512443,45.311947,-43.206415,2.139036,3.605555,17.831160,0.015433,-2.719519e+06,172804.398712,-288516.072842,-177665.804494,-4.412109e+05,-307333.333367,-9.323553e+05,-439450.253629,470964.922539,-1.123753e+06,-106847.881680,13517.015568,35565.227255,14243.921040,-150454.302131,83279.747126,0.234037,0.0,47.359604,19.465540,-85642.219232,7146.704924,7.611590,-75.313887,40.486392,38.685601,1342.814112,-0.028464,0.207758,0.0,0.013354,1.047014,0.334821,0.0,0.0,0.686078,62.056559,10.037213,2.902945,63.911927,1.400432,674.085759,1.354667,14.789364,5161.157988,-10401.232926,0.062588,19.190766,14702.749946,5766.975036,-24478.858914,34884.430071,-5.841360e+04,-11118.689205,729924.675353,0,16515
3,-7.453522e+05,0.901042,0.562130,-812.096544,-5.362910,-75.083975,-8.512443,-33.311947,43.206415,0.860964,-0.605555,-14.831160,-0.015433,3.319519e+06,-18958.244872,288650.072842,177665.804494,4.413589e+05,307424.666700,9.323633e+05,439457.253629,-470964.922539,1.123761e+06,106853.024537,-13517.015568,-35565.227255,-14243.921040,150454.302131,-83279.747126,-0.234037,0.0,28.640396,17.534460,169120.480102,12570.556981,-1.611590,87.313887,-28.486392,-38.685601,-1342.814112,0.028464,-0.207758,0.0,-0.013354,0.952986,0.665179,0.0,0.0,0.313922,-44.056559,1.962787,0.097055,12.088073,1.599568,-656.085759,0.145333,-11.789364,-4911.157988,10827.566259,0.270746,20.809234,-14702.749946,-5766.975036,24478.858914,-34884.430071,5.841360e+04,11118.689205,-729924.675353,0,16516
4,1.136241e+06,2.436276,0.469239,1158.061108,-14.660164,-45.919767,16.928277,86.277329,51.818259,-13.473473,-16.745438,-8.764432,1.383190,1.241052e+06,347490.531266,277181.799053,450907.779030,-6.147699e+05,-219708.310529,-4.962041e+05,25315.436730,611092.865775,-4.511355e+05,-154545.689172,-140080.955378,8732.102978,-46576.593649,-157322.134331,83582.161844,0.034430,0.0,55.626333,28.156013,-72746.632510,2282.638848,1.090487,-143.431285,-31.809378,-11.990878,-14551.461563,-0.005607,-0.214896,0.0,-0.003952,0.944099,0.663149,0.0,0.0,0.145856,4.431797,6.773807,12.657856,35.513138,2.491877,-478.806722,1.037180,-2.950358,-3340.319158,9981.408356,-0.168578,20.809289,-51312.179057,-9018.123230,-53556.386544,-24821.746341,-5.077393e+05,-7828.419230,-841850.614345,0,16517
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2038,-3.473220e+05,2.013481,1.071478,-798.567085,-8.569743,124.413114,1.029593,-40.308428,-59.034205,13.540668,9.794739,-9.342402,-0.122242,1.015219e+06,-99906.916736,294236.789129,-643633.629208,-4.389345e+05,58938.449055,1.019095e+05,353190.998375,-109430.657815,4.666177e+05,112621.129096,-105788.703898,48925.506434,5964.636748,-10539.853036,23829.551526,0.294685,0.0,34.357834,23.833520,116744.806110,-6049.281665,6.892084,-51.412373,-41.400115,-50.691681,-81546.207663,0.029709,-0.322645,0.0,0.020237,1.005680,0.752287,0.0,0.0,0.457982,68.976011,-38.925743,-16.943876,52.663421,1.189264,-1058.303408,0.211456,5.733156,-504.838692,8304.343972,-0.742270,23.422314,-25630.259693,19764.563085,53866.379538,-9094.088427,6.013175e+05,23520.893158,83852.746521,0,18551
2039,3.474760e+05,0.986519,0.428522,816.567085,11.569743,-112.413114,4.970407,52.308428,59.034205,-10.540668,-6.794739,12.342402,0.122242,-4.152189e+05,253753.070576,-294102.789129,643633.629208,4.390825e+05,-58847.115722,-1.019015e+05,-353183.998375,109430.657815,-4.666097e+05,-112615.986239,105788.703898,-48925.506434,-5964.636748,10539.853036,-23829.551526,-0.294685,0.0,41.642166,13.166480,-33266.545240,25766.543570,-0.892084,63.412373,53.400115,50.691681,81546.207663,-0.029709,0.322645,0.0,-0.020237,0.994320,0.247713,0.0,0.0,0.542018,-50.976011,50.925743,19.943876,23.336579,1.810736,1076.303408,1.288544,-2.733156,754.838692,-7878.010639,1.075604,16.577686,25630.259693,-19764.563085,-53866.379538,9094.088427,-6.013175e+05,-23520.893158,-83852.746521,0,18552
2040,1.178443e+06,1.497497,0.271436,-954.587601,20.280300,65.289982,-19.152492,86.380936,-28.223519,8.926013,15.824601,3.031855,-0.590278,-5.528236e+06,-108512.347084,231693.155584,534385.585859,1.098600e+06,302827.831651,1.106954e+06,153467.338305,552778.457595,5.694051e+05,-327087.008071,229443.790857,21094.643038,20850.332359,-41149.711886,21678.181194,0.256254,0.0,59.072170,4.151898,42146.223787,-1036.661578,3.573578,6.512300,65.119174,-48.759047,-89166.349213,-0.047308,0.073789,0.0,-0.007605,0.958751,0.413940,0.0,0.0,0.503679,1.022615,43.241741,9.299101,26.752381,1.579481,76.671245,1.294994,18.977477,-1470.041303,1207.071467,-0.588345,17.025246,-43849.505673,-3468.232817,-12012.054277,32337.550172,3.990604e+05,46615.638592,472659.345509,0,18553
2041,-1.178289e+06,1.502503,1.228564,972.587601,-17.280300,-53.289982,25.152492,-74.380936,28.223519,-5.926013,-12.824601,-0.031855,0.590278,6.128236e+06,262358.500924,-231559.155584,-534385.585859,-1.098452e+06,-302736.498317,-1.106946e+06,-153460.338305,-552778.457595,-5.693971e+05,327092.150928,-229443.790857,-21094.643038,-20850.332359,41149.711886,-21678.181194,-0.256254,0.0,16.927830,32.848102,41332.037083,20753.923483,2.426422,5.487700,-53.119174,48.759047,89166.349213,0.047308,-0.073789,0.0,0.007605,1.041249,0.586060,0.0,0.0,0.496321,16.977385,-31.241741,-6.299101,49.247619,1.420519,-58.671245,0.205006,-15.977477,1720.041303,-780.738133,0.921678,22.974754,43849.505673,3468.232817,12012.054277,-32337.550172,-3.990604e+05,-46615.638592,-472659.345509,0,18554


In [None]:
# create attacker profiles dataframe

# Whatever is left of the no-flow ids after the low-activity profiles were
# allotted receives an attacker profile; their ids follow the low-profile ids.
num_atk_ips = num_noflow_ips-num_low_ips
first_atk_id = num_src_ips+num_low_ips

# Fill the matrix feature by feature with samples drawn around the median of
# that feature in atk_activities, spread by its standard deviation.
atker_prof = np.zeros((num_atk_ips, len(no_flow_prof_columns)))
for position, feature in enumerate(no_flow_prof_columns):
    observed = atk_activities[feature]
    atker_prof[:, position] = generate_random_array(
        observed.median(), np.std(observed), num_atk_ips
    )


atker_profile_df = pd.DataFrame(atker_prof, columns = no_flow_prof_columns)
atker_profile_df[' is Attacker?'] = [1] * num_atk_ips
atker_profile_df[' Source IP'] = list(range(first_atk_id, first_atk_id + num_atk_ips))
atker_profile_df


Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Fwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,is Attacker?,Source IP
0,8432509.0,-6.398289,-17.189685,-9831.822621,-81.278364,-16.019222,1.838416,9.130216,74.942009,827.204914,1.836736,91.901567,-215.939747,159563.299005,-26656.713013,-248925.3,1899078.0,-2326964.0,33256.343404,-6983735.0,1049380.0,-837660.5,4396967.0,140317.372531,-8199910.0,414131.8,547393.7,3010813.0,64748.766608,0.019387,0.0,-350.812162,105.600314,30176.604882,-2623.181185,1.853658,231.9063,23.161956,198.504444,239783.608382,0.005297,-0.004626,0.0,0.793671,0.427644,0.365756,0.0,0.0,-0.06324,-1.751432,123.254812,140.280053,636.933052,1.220045,-3835.942086,-6.021032,333.547473,9911.703421,257.314349,-4.593144,24.614253,-112679.866037,55082.672756,-80146.448542,-64186.223486,-1091989.0,-262231.182322,1344019.0,1,18556
1,-2533988.0,15.925219,27.580434,13631.972894,300.64795,645.861242,-1.607312,139.621119,123.767464,-659.683116,2.519203,-44.740677,298.182226,-41951.487962,35185.678392,670142.8,-857621.5,7266263.0,-32849.623481,7891642.0,-620362.5,908524.2,-3882375.0,184969.710629,14088540.0,316114.0,406221.5,1700459.0,261276.734345,-0.019387,0.0,582.919642,117.986973,-25747.204539,6722.74622,-1.622554,720.400272,111.641756,49.40992,178042.842565,-0.005297,0.004626,0.0,0.563007,0.150027,-0.355332,0.0,0.0,1.274278,144.913826,25.496523,-93.119163,-404.825572,8.306885,7636.092359,16.411782,-114.177887,6166.622997,134.568773,8.586613,21.574348,127698.19937,-55082.672756,95164.781875,79204.556819,4213261.0,262231.182322,2730309.0,1,18557
2,-1048650.0,30.122722,-4.118905,14988.159331,181.484365,168.587529,-2.127155,121.383957,190.084289,-686.928484,0.469005,82.716522,-85.495226,119305.970929,-390.425609,-361380.8,-1256147.0,3522618.0,-53303.874319,8330704.0,2037544.0,-2636649.0,-4998087.0,287147.42549,6567639.0,-128688.2,664955.5,2100004.0,95107.668112,0.001876,0.0,-270.541374,204.782629,-12096.23291,-84.929011,2.083972,-236.531499,-19.707053,187.109288,-236698.999733,0.00853,0.009829,0.0,0.87707,0.499166,0.118228,0.0,0.0,-0.661925,36.904776,76.230354,269.023171,82.784428,33.172198,6456.020249,-1.465952,706.473031,4871.222317,564.383449,-21.521701,19.938708,112296.350991,-56490.572791,-99302.841267,73572.208991,4046744.0,-799.084456,5746373.0,1,18558
3,6947170.0,-20.595792,14.509654,-11188.009058,37.885221,461.25449,2.358259,27.367378,8.625185,854.450283,3.886933,-35.555632,167.737705,-1694.159886,8919.390988,782598.2,2297603.0,1416681.0,53710.594241,-7422796.0,-1608526.0,2707512.0,5512679.0,38139.65767,-679009.1,858934.0,288659.8,2611269.0,230917.832841,-0.001876,0.0,502.648854,18.804658,16525.633253,4184.494046,-1.852868,1188.838071,154.510766,60.805077,654525.450679,-0.00853,-0.009829,0.0,0.479608,0.078504,-0.107804,0.0,0.0,1.872963,106.257619,72.520981,-221.862281,149.323052,-23.645268,-2655.869975,11.856702,-487.103446,11207.104101,-172.500327,25.51517,26.249893,-97278.017658,56490.572791,114321.1746,-58553.875658,-925472.9,799.084456,-1672046.0,1,18559
4,-7826064.0,25.801102,-9.887407,6038.416443,741.285951,318.241991,-1.285439,162.135969,48.870213,-27.508839,2.304061,-48.224396,-303.924744,196909.763934,965.031431,-477954.6,1815307.0,-5613838.0,54171.907837,2059818.0,-1001773.0,167907.5,6963321.0,331504.793137,1525523.0,628125.1,-282415.2,271379.2,114364.107208,-0.015925,0.0,-289.78559,-238.875737,26336.885811,11439.02797,1.093988,939.506495,6.5408,277.591319,424519.83536,0.017695,-0.007822,0.0,0.676009,0.146919,0.293489,0.0,0.0,1.881416,125.155683,38.904161,101.467116,490.0906,-9.630942,-9710.864355,10.788012,1276.241303,8487.184428,-286.040817,29.141339,22.377035,113732.286794,-6115.012208,-161848.684522,-54410.184736,-1117399.0,122075.769471,8764173.0,1,18560
5,13724580.0,-16.274172,20.278157,-2238.26617,-521.916366,311.600029,1.516543,-13.384634,149.839261,195.030637,2.051877,95.385286,386.167223,-79297.95289,7563.933948,899172.0,-773851.0,10553140.0,-53765.187915,-1151911.0,1430791.0,-97043.81,-6448729.0,-6217.709977,4363107.0,102120.7,1236030.0,4439893.0,211661.393745,0.015925,0.0,521.89307,462.463024,-21907.485468,-7339.462935,-0.862884,12.800077,128.262913,-29.676954,-6693.384413,-0.017695,0.007822,0.0,0.680669,0.430751,-0.283065,0.0,0.0,-0.670379,18.006712,109.847174,-54.306225,-257.98312,19.157872,13511.014628,-0.397262,-1056.871718,7591.141991,677.923939,-25.14787,23.811566,-98713.953461,6115.012208,176867.017855,69428.51807,4238670.0,-122075.769471,-4689846.0,1,18561
6,-4687920.0,5.323248,13.064153,-5768.423878,554.427128,172.665608,-0.116879,124.820703,97.017436,656.529544,1.137238,97.442677,-77.441191,-27392.85791,25343.454479,-713068.7,-950191.4,-1438556.0,21305.438499,5959428.0,1045390.0,1919550.0,3284630.0,108894.764519,-4726384.0,678279.1,609068.8,4795611.0,245683.152537,-0.000169,0.0,134.548282,-11.947981,-9160.233271,1078.451328,1.311261,956.872284,-51.12167,183.873768,88986.484408,-0.002088,-0.009855,0.0,0.470026,0.225834,-0.048924,0.0,0.0,0.26487,171.941766,126.371908,-39.007578,-326.233249,-23.999187,-11189.844619,-3.610607,-145.606325,7870.634736,-268.826131,1.451944,26.200613,-90302.408756,53498.028164,-113356.9609,-13570.066687,2970156.0,-464238.623973,6414177.0,1,18562
7,10586440.0,4.203683,-2.673403,9568.574151,-335.057542,457.176412,0.347983,23.930632,101.692038,-489.007746,3.2187,-50.281787,159.68367,145004.668953,-16814.4891,1134286.0,1991648.0,6377856.0,-20898.718576,-5051520.0,-616372.4,-1848686.0,-2770039.0,216392.318641,10615010.0,51966.7,344546.5,-84338.65,80342.348416,0.000169,0.0,97.559198,235.535268,13589.633614,3021.113707,-1.080157,-4.565712,185.925383,64.040597,328839.966538,0.002088,0.009855,0.0,0.886652,0.351836,0.059349,0.0,0.0,0.946167,-28.779371,22.379427,86.168469,558.340729,33.526118,14989.994893,14.001356,364.97591,8207.691682,660.709253,2.541525,19.987988,105320.742089,-53498.028164,128375.294234,28588.40002,151114.9,464238.623973,-2339850.0,1,18563
8,-35795.3,-15.595333,3.991412,-3275.940361,-980.431332,602.147351,-0.345148,114.835861,183.83376,783.477707,3.168451,236.37158,376.086568,23774.554322,39173.76836,807216.4,598271.0,7039995.0,-46106.837402,5548803.0,712755.9,-1306217.0,-6545911.0,88.693907,12276620.0,496476.3,-110239.1,992791.5,180395.33324,-0.001184,0.0,47.954822,96.538064,5702.784781,-624.181477,-0.6645,-31.048384,92.856555,87.144471,-232242.135769,0.00904,0.018353,0.0,0.596391,0.546551,-0.1873,0.0,0.0,0.88487,-19.465923,39.803586,-108.813125,-305.516548,-4.466174,4621.878706,-13.811501,-1181.146287,12330.598791,-289.191439,-3.907339,19.290293,14343.254528,30636.790157,-78818.47,91048.554969,-1254664.0,59911.814754,1696034.0,1,18564
9,5934316.0,25.122264,6.399337,7076.090634,1199.800918,27.694669,0.576252,33.915474,14.875714,-615.955909,1.187487,-189.21069,-293.844089,93837.256722,-30644.802982,-385999.0,443185.3,-2100695.0,46513.557324,-4640896.0,-283738.0,1377081.0,7060503.0,325198.389254,-6387987.0,233769.5,1063854.0,3718481.0,145630.167712,0.001184,0.0,184.152659,127.049223,-1273.384438,4723.746512,0.895604,983.354956,41.947157,160.769894,650068.586716,-0.00904,-0.018353,0.0,0.760287,0.031119,0.197725,0.0,0.0,0.326167,162.628318,108.947749,155.974015,537.624029,13.993105,-821.728433,24.20225,1400.515873,3747.727627,681.074561,7.900808,26.898308,675.078806,-30636.790157,93836.803333,-76030.221635,4375936.0,-59911.814754,2378293.0,1,18565


In [None]:
# combines all the dataframes
# Stack the aggregated profiles and both synthetic sets into one table with a
# fresh 0..n-1 index. pd.concat is used because DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0; columns missing from a frame
# are filled with NaN, exactly as append did.
activity_flows_all = pd.concat(
    [activity_flows, low_profile_df, atker_profile_df],
    ignore_index=True,
)



In [None]:

# Split the combined table by label. Boolean masks return the same rows and
# index as groupby([...]).get_group(k), but give an empty frame rather than a
# KeyError if one label is missing, and avoid passing a scalar key to a
# length-1 list grouper.
atkers = activity_flows_all[activity_flows_all[' is Attacker?'] == 1]
non_atkers = activity_flows_all[activity_flows_all[' is Attacker?'] == 0]
print("Number of attackers: " + str(len(atkers)))
print("Number of non_atkers " + str(len(non_atkers)))


Number of attackers: 52
Number of non_atkers 18546


In [None]:
# Order the rows by their ' Source IP' id and renumber the index 0..n-1.
activity_flows_all = (
    activity_flows_all
    .sort_values(by=' Source IP')
    .reset_index(drop=True)
)
activity_flows_all

Unnamed: 0,Source IP,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Fwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,is Attacker?
0,0,2.958276e+02,1.655172,0.551724,9.241379,2.275862,6.344828,4.137931,4.873563,1.274152,2.275862,2.275862,2.275862,0.000000,7.443734e+05,173200.060082,254.298851,3.268862e+01,2.907586e+02,233.827586,2.620000e+02,2.324828e+02,3.940292e+01,2.603448e+02,204.620690,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.0,33.103448,11.034483,154810.571641,18389.488437,4.137931,6.344828,4.579310,0.986954,14.124138,0.000000,0.000000,0.0,0.000000,1.000000,0.482759,0.0,0.0,0.413793,6.620690,4.873563,2.275862,33.103448,1.655172,9.241379,0.551724,2.275862,31.586207,174.034483,0.482759,20.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.0,0
1,1,2.968569e+04,1.122807,0.877193,5.842105,4.789474,5.315789,5.315789,5.315789,0.000000,4.789474,4.789474,4.789474,0.000000,3.480764e+05,60856.512853,29685.692982,0.000000e+00,2.968569e+04,29685.692982,2.962772e+04,2.962772e+04,0.000000e+00,2.962772e+04,29627.719298,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.0,22.456140,17.543860,35588.410486,25268.102366,5.315789,5.315789,5.315789,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,1.000000,0.877193,0.0,0.0,0.877193,7.973684,5.315789,4.789474,22.456140,1.122807,5.842105,0.877193,4.789474,29.008772,2768.675439,0.087719,20.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.0,0
2,2,1.920000e+02,1.500000,0.500000,9.000000,3.000000,6.000000,6.000000,6.000000,0.000000,3.000000,3.000000,3.000000,0.000000,1.936134e+05,32268.907562,192.000000,0.000000e+00,1.920000e+02,192.000000,1.750000e+02,1.750000e+02,0.000000e+00,1.750000e+02,175.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.0,30.000000,10.000000,17563.025212,14705.882355,6.000000,6.000000,6.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,1.000000,0.500000,0.0,0.0,0.500000,9.000000,6.000000,3.000000,30.000000,1.500000,9.000000,0.500000,3.000000,31.000000,164.000000,0.500000,20.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.0,0
3,3,1.322275e+03,2.050847,0.279661,26.500000,1.449153,19.483051,4.576271,9.133333,7.515555,1.423729,1.423729,1.423729,0.000000,1.011438e+06,204326.933754,579.338983,5.730325e+02,1.303220e+03,255.038136,1.307114e+03,7.785996e+02,5.956608e+02,1.296072e+03,380.025424,1.271186e-02,1.271186e-02,0.000000e+00,1.271186e-02,0.012712,0.025424,0.0,41.016949,5.593220,201936.127350,2390.806404,4.576271,19.483051,8.742857,7.009958,605.918402,0.000000,0.025424,0.0,0.000000,1.000000,0.152542,0.0,0.0,0.152542,11.812853,9.133333,1.423729,41.016949,2.050847,26.500000,0.279661,1.449153,31.533898,1701.580508,0.800847,20.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.0,0
4,4,1.114520e+03,1.510000,0.650000,64.720000,3.660000,39.880000,5.640000,20.296667,17.468492,3.540000,3.540000,3.540000,0.000000,6.492904e+05,105575.627689,825.445000,3.047410e+02,1.108860e+03,626.020000,1.066520e+03,8.133233e+02,3.348182e+02,1.062060e+03,578.020000,8.000000e-02,8.000000e-02,0.000000e+00,8.000000e-02,0.080000,0.020000,0.0,30.200000,13.000000,87630.411660,17945.216028,5.640000,39.880000,15.896667,15.687622,9024.800667,0.000000,0.020000,0.0,0.000000,1.000000,0.580000,0.0,0.0,0.590000,20.692000,20.296667,3.540000,30.200000,1.510000,64.720000,0.650000,3.660000,35.400000,197.030000,0.470000,20.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18593,18593,-7.976650e+06,-14.470026,26.530102,13825.695347,-788.069866,633.005564,0.760715,98.131541,106.801259,167.265611,2.105396,-155.868707,275.596571,1.878074e+05,13138.191628,173134.344849,-7.967597e+05,4.398538e+06,64846.840956,1.004606e+07,1.336392e+06,-5.758492e+04,-7.296595e+06,195294.876819,6.191766e+06,1.673591e+06,-2.382216e+05,1.517107e+06,194873.384710,-0.009106,0.0,647.428040,526.638454,11260.384322,-5257.838008,-1.178859,177.823899,-50.399784,333.577085,522495.866530,-0.006471,-0.009618,0.0,0.424649,0.127991,0.257778,0.0,0.0,1.507523,109.586229,-11.321317,-53.062532,117.297810,25.231231,-1776.091899,2.463417,718.218371,8171.294466,824.295130,-13.860721,24.295256,-40495.521743,-6549.563029,14921.561401,50868.665383,-2.639456e+06,252451.330404,-2.562797e+06,,1
18594,18594,5.696992e+06,0.080672,-17.175439,80.721756,-501.785557,267.002900,0.381691,53.873552,131.746004,-714.908112,0.443577,-24.096570,-35.747604,-2.148275e+04,-22553.837879,794727.534432,9.011195e+04,7.687284e+06,-57762.301574,-2.639839e+06,-8.648871e+05,7.794759e+04,4.515221e+06,133002.087169,-3.071470e+06,6.462190e+05,1.072505e+06,6.659683e+05,273085.877744,-0.002182,0.0,466.211933,-75.155757,17055.597430,14730.577389,-0.205656,113.661114,-28.037653,294.710238,244455.482291,0.010334,-0.006641,0.0,0.370457,0.101446,0.093149,0.0,0.0,0.350726,-2.646588,92.144983,146.544510,628.929636,18.773755,64.862836,-9.536403,-838.757459,9135.695200,741.581968,30.172140,19.959878,-42888.306981,35575.666896,152960.521946,-30610.468307,8.767414e+06,-522139.682371,9.268334e+06,,1
18595,18595,2.015280e+05,9.446258,27.566189,3719.428518,721.155142,362.839120,-0.150587,94.877783,66.963470,882.429910,3.912361,71.257460,117.990083,1.390946e+05,31082.803258,-373510.102438,9.513444e+05,-2.747984e+06,58169.021497,3.547746e+06,1.293905e+06,-7.083886e+03,-4.000630e+06,192284.995991,8.960100e+06,8.402679e+04,-1.188898e+05,4.045304e+06,52939.623209,0.002182,0.0,-234.104453,298.743044,-12626.197087,-10631.012353,0.436760,838.645457,162.841365,-46.795873,173370.968655,-0.010334,0.006641,0.0,0.986220,0.476224,-0.082724,0.0,0.0,0.860311,145.808983,56.606353,-99.383620,-396.822156,-9.246825,3735.287437,19.927153,1058.127044,6942.631218,-349.698846,-26.178671,26.228723,57906.640314,-35575.666896,-137942.188612,45628.801641,-5.646143e+06,522139.682371,-5.194007e+06,,1
18596,18596,-1.360453e+06,26.806953,26.625242,-8143.644817,641.209914,148.624841,1.662568,117.862386,29.152801,712.901634,3.648794,155.615981,278.909368,8.591492e+04,-29961.678903,666460.753131,2.943965e+06,5.025893e+06,-21881.763436,4.588835e+06,3.636868e+05,3.084962e+06,3.499059e+06,310496.146390,-5.887742e+06,2.887920e+05,9.815072e+05,3.630735e+06,194851.782813,-0.003997,0.0,87.496176,-17.852975,13774.558154,-6804.009438,0.305598,1050.553610,143.240555,218.157263,-181997.068665,-0.031731,-0.013557,0.0,0.346586,0.199230,0.274598,0.0,0.0,1.299373,18.708842,67.569520,-160.860399,157.866130,10.299381,-7099.077208,-22.661269,754.322056,11256.513368,65.886926,-9.656186,24.305350,117622.913809,16851.171561,81156.300785,-59457.512877,-5.666094e+06,-188144.654247,8.475532e+06,,1


In [None]:
# (1) SELECT | ALL features
# Every column except the first and the last is a feature; the last column is
# the target. (In the table displayed by the sorting cell these are
# ' Source IP' and ' is Attacker?' respectively.)
X = activity_flows_all.iloc[:, 1:-1]
Y = activity_flows_all.iloc[:, -1]


In [None]:
# create hypergraph
# One row per distinct (source, destination) pair found in `traffic`; the
# count of ' Flow Duration' is computed but only the pair itself is used below.
connection = (
    traffic
    .groupby([' Source IP', ' Destination IP'], as_index=False)
    .agg({' Flow Duration' : 'count'})
)

# Look both endpoints up in ips_dict (a missing address raises KeyError).
src_ip_ids = list(map(ips_dict.__getitem__, connection[' Source IP']))
dst_ip_ids = list(map(ips_dict.__getitem__, connection[' Destination IP']))

# Each pair becomes one edge, keyed by its running number.
edges = list(zip(src_ip_ids, dst_ip_ids))
hypergraph = dict(enumerate(edges))

In [None]:
# # generate features
# Pipeline, innermost call first: np.nan_to_num on X -> sklearn's
# preprocessing.normalize with default arguments -> SciPy CSR matrix.
features = csr_matrix(preprocessing.normalize(np.nan_to_num(X)))

# generate features from feature selection set
# X = X.fillna(0)
# selected_features = selectFeatures(X, Y)
# print(f'Selected Features "from All": {selected_features}')
# features = X[selected_features]
# print("Shape of features: ", features.shape)

# features = preprocessing.normalize(features)
# features = csr_matrix(features)


In [None]:
# generate labels
# `labels` is another name for Y (the last column of activity_flows_all);
# no copy is made, so both names refer to the same object.
labels = Y

In [None]:
# train/test split
# Partition the node ids 0..V-1 into train and test lists. Both the shuffle
# and the split are driven by SEED, and the train fraction comes from the
# TRAIN_PCT constant, so a fresh run reproduces the same partition.

splits = {}
attackers_ids = list(atkers.index)
V = len(ips)
node_sorted = list(range(V))
# A dedicated Random instance keeps the shuffle deterministic without
# resetting the state of the module-level generator.
rnd.Random(SEED).shuffle(node_sorted)
train_size = round(TRAIN_PCT*V)
test_size = V - train_size
train_set, test_set = train_test_split(
    node_sorted,
    train_size=train_size,
    test_size=test_size,
    random_state=SEED,
)



splits['train'] = train_set
splits['test'] = test_set


In [None]:
# Tally how many ids on each side of the split carry label 1.
train_atk_count = sum(1 for node_id in train_set if labels[node_id] == 1)
test_atk_count = sum(1 for node_id in test_set if labels[node_id] == 1)

print("Number of atker in train set: ", train_atk_count)
print("Number of atker in test set: ", test_atk_count)

Number of atker in train set:  39
Number of atker in test set:  13


In [None]:
# generate files
# Serialise each artefact to its own pickle file in the current working
# directory (file name -> object to dump).
output_files = {
    "hypergraph.pickle": hypergraph,
    "features.pickle": features,
    "labels.pickle": labels,
    "1.pickle": splits,
}
for file_name, payload in output_files.items():
    # The with-block closes the handle on exit, so no explicit close() is needed.
    with open(file_name, "wb") as out_file:
        pickle.dump(payload, out_file)