# Preparing the environment


## Libraries

In [237]:
#%pip install --upgrade pip
#%pip install pandas
#%pip install scipy 
#%pip install scikit-learn 
#%pip install tqdm 
#%pip install plotly 
#%pip install matplotlib
#%pip install nbformat
#%pip install fastparquet
#%pip install pyarrow

In [238]:
# requirements
import pandas as pd
import numpy as np
from scipy.stats import chi2, poisson, chisquare
from scipy.stats import entropy  # for KL (use small-smoothing)
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, confusion_matrix
from tqdm import tqdm
import math, json
import plotly.express as px
import matplotlib.pyplot as plt
from datetime import datetime
import plotly.graph_objects as go

# Dataset - The APA DDoS Dataset

The APA DDoS Dataset, available at [Kaggle](https://www.kaggle.com/datasets/yashwanthkumbam/apaddos-dataset), is a HTTP request report containing IP source address, destinatination ip addres, request time frame and the attack label.



## Preparing the dataset
To make it useful for this project test, it was necessary to convert the columns name to the equivalent words used by the modules. The project assumes that the request information includes a HTTP method and an API endpoint, that is why the column ip.dst was converted to URI column.

In [239]:
import pandas as pd
ddos_data=pd.read_csv('../0-datasets/APA-DDoS/APA-DDoS-Dataset.csv')
ddos_data = ddos_data[['ip.src', 'ip.dst', 'frame.time', 'Label']] # original names
ddos_data.columns = ['address', 'uri', 'time_local', 'label'] # converted names

The time_local columns had to be formatted to discard miliseconds and location.

In [240]:
import re

ddos_data['time_local'] = ddos_data['time_local'].astype(str)
ddos_data['time_local'] = ddos_data['time_local'].apply(lambda x: re.findall('\d{2}-\w{3} \d{4} \d{2}:\d{2}:\d{2}', x)[0])
ddos_data['time_local'] = pd.to_datetime(ddos_data['time_local'])


invalid escape sequence '\d'


invalid escape sequence '\d'


invalid escape sequence '\d'



Below it is shown the number of reports benign traffic for each 5min-window. As can be seen, it was only possible to acquire 7 windows, and their reports are imbalanced.


In [241]:
ddos_data_benign = ddos_data.loc[ddos_data['label'] == 'Benign']
ddos_data_benign.groupby(pd.Grouper(key='time_local', freq='5min')).count()

Unnamed: 0_level_0,address,uri,label
time_local,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-06-16 20:15:00,6122,6122,6122
2020-06-16 20:20:00,19140,19140,19140
2020-06-16 20:25:00,12538,12538,12538
2020-06-16 20:30:00,0,0,0
2020-06-16 20:35:00,0,0,0
2020-06-16 20:40:00,0,0,0
2020-06-16 20:45:00,0,0,0
2020-06-16 20:50:00,0,0,0
2020-06-16 20:55:00,0,0,0
2020-06-16 21:00:00,0,0,0


In this experiment, a 5 minute window was fixed. Below it is shown the number of attack traffic reports for each 5min-window. It was also only possible to acquire 7 windows, and their reports are imbalanced.

In [242]:
ddos_data_attack = ddos_data.loc[~(ddos_data['label'] == 'Benign')].sort_values(by='time_local', ascending=True)
ddos_data_attack.groupby(pd.Grouper(key='time_local', freq='1min')).count()

Unnamed: 0_level_0,address,uri,label
time_local,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-06-16 20:18:00,5400,5400,5400
2020-06-16 20:19:00,5400,5400,5400
2020-06-16 20:20:00,0,0,0
2020-06-16 20:21:00,5400,5400,5400
2020-06-16 20:22:00,5400,5400,5400
...,...,...,...
2020-06-16 22:04:00,0,0,0
2020-06-16 22:05:00,5400,5400,5400
2020-06-16 22:06:00,0,0,0
2020-06-16 22:07:00,5400,5400,5400


In [243]:
groups = ddos_data.loc[ddos_data['time_local'] < datetime.strptime('2020-06-16 22:10:00', '%Y-%m-%d %H:%M:%S')].groupby(pd.Grouper(key='time_local', freq='5min')).count()
groups.shape

(23, 3)

In [244]:
from datetime import timedelta

groups = ddos_data.loc[ddos_data['time_local'] < datetime.strptime('2020-06-16 22:10:00', '%Y-%m-%d %H:%M:%S')].groupby(pd.Grouper(key='time_local', freq='5min')).count()

for index, group in groups.iterrows():
  end_interval = index + timedelta(minutes=5)
  dataset = ddos_data.loc[(ddos_data['time_local'] >= index) & (ddos_data['time_local'] < end_interval)]
  if(dataset.size > 0):
    display(dataset)
  #print(f"time: {index} | end: {end_interval}")

Unnamed: 0,address,uri,time_local,label
0,192.168.1.1,192.168.23.2,2020-06-16 20:18:15,DDoS-PSH-ACK
1,192.168.1.1,192.168.23.2,2020-06-16 20:18:15,DDoS-PSH-ACK
2,192.168.1.1,192.168.23.2,2020-06-16 20:18:15,DDoS-PSH-ACK
3,192.168.1.1,192.168.23.2,2020-06-16 20:18:15,DDoS-PSH-ACK
4,192.168.1.1,192.168.23.2,2020-06-16 20:18:15,DDoS-PSH-ACK
...,...,...,...,...
43917,192.168.5.1,192.168.23.2,2020-06-16 20:19:59,Benign
43918,192.168.5.1,192.168.23.2,2020-06-16 20:19:59,Benign
43919,192.168.5.1,192.168.23.2,2020-06-16 20:19:59,Benign
43920,192.168.5.1,192.168.23.2,2020-06-16 20:19:59,Benign


Unnamed: 0,address,uri,time_local,label
10800,192.168.7.1,192.168.23.2,2020-06-16 20:21:14,DDoS-PSH-ACK
10801,192.168.7.1,192.168.23.2,2020-06-16 20:21:14,DDoS-PSH-ACK
10802,192.168.7.1,192.168.23.2,2020-06-16 20:21:14,DDoS-PSH-ACK
10803,192.168.7.1,192.168.23.2,2020-06-16 20:21:14,DDoS-PSH-ACK
10804,192.168.7.1,192.168.23.2,2020-06-16 20:21:14,DDoS-PSH-ACK
...,...,...,...,...
63057,192.168.13.1,192.168.23.2,2020-06-16 20:24:59,Benign
63058,192.168.13.1,192.168.23.2,2020-06-16 20:24:59,Benign
63059,192.168.13.1,192.168.23.2,2020-06-16 20:24:59,Benign
63060,192.168.13.1,192.168.23.2,2020-06-16 20:24:59,Benign


Unnamed: 0,address,uri,time_local,label
27000,192.168.11.1,192.168.23.2,2020-06-16 20:25:27,DDoS-PSH-ACK
27001,192.168.11.1,192.168.23.2,2020-06-16 20:25:27,DDoS-PSH-ACK
27002,192.168.11.1,192.168.23.2,2020-06-16 20:25:27,DDoS-PSH-ACK
27003,192.168.11.1,192.168.23.2,2020-06-16 20:25:27,DDoS-PSH-ACK
27004,192.168.11.1,192.168.23.2,2020-06-16 20:25:27,DDoS-PSH-ACK
...,...,...,...,...
75595,192.168.19.1,192.168.23.2,2020-06-16 20:28:21,Benign
75596,192.168.19.1,192.168.23.2,2020-06-16 20:28:21,Benign
75597,192.168.19.1,192.168.23.2,2020-06-16 20:28:21,Benign
75598,192.168.19.1,192.168.23.2,2020-06-16 20:28:21,Benign


Unnamed: 0,address,uri,time_local,label
75600,192.168.1.1,192.168.23.2,2020-06-16 21:57:57,DDoS-ACK
75601,192.168.1.1,192.168.23.2,2020-06-16 21:57:57,DDoS-ACK
75602,192.168.1.1,192.168.23.2,2020-06-16 21:57:57,DDoS-ACK
75603,192.168.1.1,192.168.23.2,2020-06-16 21:57:57,DDoS-ACK
75604,192.168.1.1,192.168.23.2,2020-06-16 21:57:57,DDoS-ACK
...,...,...,...,...
120224,192.168.5.1,192.168.23.2,2020-06-16 21:59:59,Benign
120225,192.168.5.1,192.168.23.2,2020-06-16 21:59:59,Benign
120226,192.168.5.1,192.168.23.2,2020-06-16 21:59:59,Benign
120227,192.168.5.1,192.168.23.2,2020-06-16 21:59:59,Benign


Unnamed: 0,address,uri,time_local,label
86400,192.168.7.1,192.168.23.2,2020-06-16 22:01:09,DDoS-ACK
86401,192.168.7.1,192.168.23.2,2020-06-16 22:01:09,DDoS-ACK
86402,192.168.7.1,192.168.23.2,2020-06-16 22:01:09,DDoS-ACK
86403,192.168.7.1,192.168.23.2,2020-06-16 22:01:09,DDoS-ACK
86404,192.168.7.1,192.168.23.2,2020-06-16 22:01:09,DDoS-ACK
...,...,...,...,...
134650,192.168.16.1,192.168.23.2,2020-06-16 22:04:59,Benign
134651,192.168.16.1,192.168.23.2,2020-06-16 22:04:59,Benign
134652,192.168.16.1,192.168.23.2,2020-06-16 22:04:59,Benign
134653,192.168.16.1,192.168.23.2,2020-06-16 22:04:59,Benign


Unnamed: 0,address,uri,time_local,label
97200,192.168.14.1,192.168.23.2,2020-06-16 22:05:19,DDoS-ACK
97201,192.168.14.1,192.168.23.2,2020-06-16 22:05:19,DDoS-ACK
97202,192.168.14.1,192.168.23.2,2020-06-16 22:05:19,DDoS-ACK
97203,192.168.14.1,192.168.23.2,2020-06-16 22:05:19,DDoS-ACK
97204,192.168.14.1,192.168.23.2,2020-06-16 22:05:19,DDoS-ACK
...,...,...,...,...
148521,192.168.19.1,192.168.23.2,2020-06-16 22:09:57,Benign
148522,192.168.19.1,192.168.23.2,2020-06-16 22:09:57,Benign
148523,192.168.19.1,192.168.23.2,2020-06-16 22:09:57,Benign
148524,192.168.19.1,192.168.23.2,2020-06-16 22:09:57,Benign
