In [15]:
import pandas as pd
import numpy as np

csv_files = [
    #'Monday-WorkingHours.pcap_ISCX.csv',
    'Tuesday-WorkingHours.pcap_ISCX.csv',
    #'Wednesday-workingHours.pcap_ISCX.csv',
    #'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
    # 'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
    #'Friday-WorkingHours-Morning.pcap_ISCX.csv',
    # 'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
    # 'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
]

days = [
    # 'Monday',
    'Tuesday',
    # 'Wednesday',
    # 'Thursday',
    #'Friday'
]

In [16]:
from math import floor
import numpy as np

def balancing(rawDataFrame):
    # Preprocessing
    rawDataFrame['Label'] = rawDataFrame['Label'].apply(lambda x: 'BENIGN' if x == 'BENIGN' else 'Abnormal')
    rawDataFrame = rawDataFrame.drop(rawDataFrame[pd.isnull(rawDataFrame['Flow ID'])].index)
    rawDataFrame.replace('Infinity', -1, inplace=True)
    rawDataFrame.replace([np.inf, -np.inf, np.nan], -1, inplace=True)
    rawDataFrame[['Flow Bytes/s', 'Flow Packets/s']] = rawDataFrame[['Flow Bytes/s','Flow Packets/s']].apply(pd.to_numeric)
    
    rawDataFrame = rawDataFrame.loc[rawDataFrame['Flow Bytes/s'] > 0]
    rawDataFrame = rawDataFrame.loc[rawDataFrame['Flow Packets/s'] > 0]
    rawDataFrame = rawDataFrame.loc[rawDataFrame['Flow Duration'] > 0]

    rawDataFrame.to_csv('./beforelog.csv')

    # Log transformation of each variable
    rawDataFrame['Flow Bytes/s'] = np.log(rawDataFrame['Flow Bytes/s']) 
    rawDataFrame['Flow Packets/s'] = np.log(rawDataFrame['Flow Packets/s'])
    rawDataFrame['Flow Duration'] = np.log(rawDataFrame['Flow Duration'])

    rawDataFrame.to_csv('./afterlogTue.csv')
    

    attack_df = rawDataFrame.loc[rawDataFrame['Label'] != 'BENIGN']
    attack_df.to_csv('./attack.csv')
    attack_count = len(attack_df.index)
    print(attack_count)
    raw_normal_df = rawDataFrame.loc[rawDataFrame['Label'] == 'BENIGN']
    raw_normal_df.to_csv('./benign.csv')
    log_raw_df = pd.concat([attack_df, raw_normal_df])
    log_raw_df.to_csv('./logRawTue.csv')
  
    normal_count = int(floor(attack_count / 30 * 70))
    normal_df = rawDataFrame.loc[rawDataFrame['Label'] == 'BENIGN'].sample(normal_count)
    normal_df.to_csv('./normal.csv')
    normal_count = len(normal_df.index)
    print(f'Normal: {normal_count}')
    log_final_df = pd.concat([attack_df, normal_df])
    log_final_df.to_csv('./logTue.csv')
    return pd.concat([attack_df, normal_df])



In [17]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def hyperparameter_selection(balancedDataFrame):
    excluded = ['Flow ID', 'Source IP', 'Source Port',
                'Destination IP', 'Destination Port', 'Protocol', 'Timestamp']
    balancedDataFrame = balancedDataFrame.drop(columns=excluded, errors='ignore')
    balancedDataFrame['Label'] = balancedDataFrame['Label'].apply(lambda x: 0 if x == 'BENIGN' else 1)
    
    
    # Find best estimator
    y = balancedDataFrame['Label'].values
    X = balancedDataFrame.drop(columns=['Label'])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=np.random.seed(42))
    X_train.shape, X_test.shape
    rfc = RandomForestClassifier(random_state=np.random.seed(42))

    rfc.fit(X_train, y_train)
    parameters = {
        'n_estimators': [20, 40, 60],
        'min_samples_leaf': [2, 4, 6],
        'max_features': [4, 6, 8],
        'max_depth': [13, 15, 17]
    }
    gcv = GridSearchCV(rfc, parameters, scoring=['f1', 'accuracy'],
                      refit= 'f1', cv=3, return_train_score=True)
    gcv.fit(X_train, y_train)
    
    # The best score produced on the test folds from your training data
    best_score = gcv.best_score_
    print (f'Best score: {best_score}')
    # The accuracy on the test set
    accuracy = gcv.score(X_test, y_test) 
    print (f'Accuracy: {accuracy}')


    return [gcv.best_estimator_, gcv.best_params_, gcv.best_score_]
    


In [18]:

import sklearn.metrics as metrics
import pickle

def score(givenDataFrame, model_config):
    givenDataFrame.to_csv('./actual.csv')

    
    # Prepare training data & testing data
    excluded = ['Flow ID', 'Source IP', 'Source Port',
                'Destination IP', 'Destination Port', 'Protocol', 'Timestamp']
    excluded_df = givenDataFrame[excluded]
    excluded_df.to_csv('./excluded.csv')
    givenDataFrame = givenDataFrame.drop(columns=excluded, errors='ignore')
    givenDataFrame['Label'] = givenDataFrame['Label'].apply(lambda x: 0 if x == 'BENIGN' else 1)
    
    y = givenDataFrame['Label'].values
    X = givenDataFrame.drop(columns=['Label'])
    beforepredict_df = X
    beforepredict_df.to_csv('./beforepredict.csv')
    print(f'Given DataFrame shape: {X.shape}')
    print(f'Label shape: {y.shape}')
    print(f'Labels: {givenDataFrame["Label"].unique()}')

    
    # Training Data into model
    rfc = model_config[0]

    # Predict
    y_pred = rfc.predict(X)
    pd.DataFrame(y_pred).to_csv('./randompredict.csv')
    print ({X.shape})
    print ({y_pred.shape})

    predict_df = excluded_df.join(X).join(pd.DataFrame(y_pred, index = X.index, columns=['Label']))
    predict_df['Label'] = predict_df['Label'].apply(lambda x: 'BENIGN' if x == 0 else 'Abnormal')
    predict_df.to_csv('./predicted.csv')

    print(len(predict_df))
    print(len(givenDataFrame))


In [19]:

total = 0
for day in days:
     # Filter & merge dataset by day
    paths = list(filter(lambda s: s.startswith(day), csv_files))
    rawDataFrame = None
    for path in paths:
        absolute_path = f'/Users/emmap/Downloads/TrafficLabelling/{path}'
        if rawDataFrame is None:
             rawDataFrame = pd.read_csv(absolute_path)
        else:
             rawDataFrame = pd.concat([rawDataFrame, pd.read_csv(absolute_path)])
             
    total += len(rawDataFrame.index)
    print(f'{day}: {len(rawDataFrame.index)} rows')
    rawDataFrame = rawDataFrame.rename(columns=lambda s: s.strip())
    balance_df = balancing(rawDataFrame)
    print(balance_df.shape)
    model_config = hyperparameter_selection(balance_df)
    # Random sampling of the dataframe to pass to "score"
    sample_balance = balance_df.sample(frac = 0.002)
    score(sample_balance, model_config)
    sample_balance.to_csv('./balanceTue.csv', index=False)

    


print(f'Total: {total}')
print(model_config[0])
print(model_config[1])
print(model_config[2])

Tuesday: 445909 rows
10896
Normal: 25424
(36320, 85)
Best score: 0.9994721562365821
Accuracy: 0.9995477159656264
Given DataFrame shape: (73, 77)
Label shape: (73,)
Labels: [1 0]
{(73, 77)}
{(73,)}
73
73
Total: 445909
RandomForestClassifier(max_depth=13, max_features=6, min_samples_leaf=2,
                       n_estimators=20)
{'max_depth': 13, 'max_features': 6, 'min_samples_leaf': 2, 'n_estimators': 20}
0.9994721562365821


numpy.float64