In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb
import time
import pickle

In [2]:
# Relative prefix prepended to the dataset path and the pickle output path used
# later in this notebook (presumably the repository root -- confirm against the
# directory this notebook is run from).
root = "../../../../../"

In [3]:
# Read the processed CICDDoS multiclass CSV; the first CSV column is used as the index.
dataset_path = root + "datasets/multiclass/processed/CICDDoS_pre.csv"
df = pd.read_csv(dataset_path, index_col=[0])

In [4]:
# Preview the first rows of the loaded frame to sanity-check columns and labels.
df.head()

Unnamed: 0_level_0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10431,1,2,0,2736.0,0.0,1368.0,1368.0,1368.0,0.0,0.0,...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_DNS
13090,1,2,0,2650.0,0.0,1325.0,1325.0,1325.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_DNS
12990,2,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,0.0,...,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_DNS
19624,46,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,0.0,...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_DNS
20691,1,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,0.0,...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_DNS


In [5]:
# Separate the target column from the feature columns.
# Note: the label column name in this CSV carries a leading space.
target_column = ' Label'
y = df[target_column]
X = df.drop(columns=[target_column])

In [6]:
# Stratified k-fold cross-validation of an XGBoost classifier.
#
# For every fold this cell:
#   1. fits a VarianceThreshold selector on the training split only and applies
#      it to both splits,
#   2. trains an XGBClassifier on the selected features,
#   3. predicts on the held-out split and computes accuracy plus macro-averaged
#      precision / recall / F1,
# recording the duration of each stage. The per-fold results end up in the
# NumPy arrays defined at the bottom of the cell (one entry per fold).
splits = 10
fs_times = []
train_times = []
fit_times = []
number_features = []
predict_times = []
test_sizes = []
score_times = []
test_accuracies = []
test_precisions = []
test_recalls = []
test_f1_scores = []

skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)

# Recent XGBoost releases reject non-integer class labels, so train and score on
# integer codes. sort=True assigns codes in sorted label order; accuracy and
# macro-averaged metrics do not depend on how the classes are named.
y_codes, _ = pd.factorize(y, sort=True)
y_encoded = pd.Series(y_codes, index=y.index)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y_encoded.iloc[train_index], y_encoded.iloc[test_index]

    # perf_counter() is monotonic and high-resolution, unlike time.time(),
    # which can jump when the system clock is adjusted.
    fs_start = time.perf_counter()

    # Feature selection: drop near-constant features (variance <= 0.01),
    # fitted on the training split only.
    selector = VarianceThreshold(threshold=0.01)
    selector.fit(X_train)
    features_to_keep = X_train.columns[selector.get_support()]

    X_train = pd.DataFrame(selector.transform(X_train), columns=features_to_keep)
    X_test = pd.DataFrame(selector.transform(X_test), columns=features_to_keep)

    # ' Fwd Header Length.1' is presumably a duplicate of ' Fwd Header Length'
    # (TODO confirm). errors='ignore' keeps the fold from crashing with a
    # KeyError if the variance filter has already removed the column.
    X_train = X_train.drop(columns=' Fwd Header Length.1', errors='ignore')
    X_test = X_test.drop(columns=' Fwd Header Length.1', errors='ignore')
    fs_end = time.perf_counter()

    # Training the model
    train_start = time.perf_counter()
    clf_xgb = xgb.XGBClassifier(random_state=42)
    clf_xgb.fit(X_train, y_train)
    train_end = time.perf_counter()

    fs_times.append(fs_end - fs_start)
    train_times.append(train_end - train_start)
    # Fit time spans feature selection through the end of training.
    fit_times.append(train_end - fs_start)

    number_features.append(len(X_train.columns))

    predict_start = time.perf_counter()
    y_pred = clf_xgb.predict(X_test)
    predict_end = time.perf_counter()
    predict_times.append(predict_end - predict_start)

    test_sizes.append(len(y_pred))

    score_start = time.perf_counter()
    test_accuracies.append(accuracy_score(y_test, y_pred))
    test_precisions.append(precision_score(y_test, y_pred, average="macro"))
    test_recalls.append(recall_score(y_test, y_pred, average='macro'))
    test_f1_scores.append(f1_score(y_test, y_pred, average='macro'))
    score_end = time.perf_counter()
    score_times.append(score_end - score_start)

# Convert the per-fold lists to arrays for the summary table built afterwards.
fs_times = np.array(fs_times)
train_times = np.array(train_times)
fit_times = np.array(fit_times)
number_features = np.array(number_features)
predict_times = np.array(predict_times)
test_sizes = np.array(test_sizes)
test_accuracies = np.array(test_accuracies)
test_precisions = np.array(test_precisions)
test_recalls = np.array(test_recalls)
test_f1_scores = np.array(test_f1_scores)
score_times = np.array(score_times)



In [7]:
# Build the per-fold performance table: one row per fold, one column per metric.
metric_names = ["Accuracy", "Precision", "Recall", "F1_Score",
                "Fit_Time", "FS_Time", "Train_Time", "Predict_Time", "Score_Time",
                "Number_Features", "Test_Size"]
metric_values = [test_accuracies, test_precisions, test_recalls, test_f1_scores,
                 fit_times, fs_times, train_times, predict_times, score_times,
                 number_features, test_sizes]

# Each array is one row here (labelled with its metric name); transposing turns
# the metrics into columns and the folds into rows.
pfm = pd.DataFrame(metric_values, index=metric_names).transpose()

In [8]:
# Persist the per-fold performance table as a pickle.
# The context manager guarantees the file handle is closed even if
# pickle.dump raises part-way through.
filename = root + "pickles/multiclass_categorical/cross_validation/label_independent/basic.pkl"
with open(filename, 'wb') as outfile:
    pickle.dump(pfm, outfile)