# Preparing the environment


## Libraries

In [237]:
#%pip install --upgrade pip
#%pip install pandas
#%pip install scipy 
#%pip install scikit-learn 
#%pip install tqdm 
#%pip install plotly 
#%pip install matplotlib
#%pip install nbformat
#%pip install fastparquet
#%pip install pyarrow

In [238]:
# requirements
import pandas as pd
import numpy as np
from scipy.stats import chi2, poisson, chisquare
from scipy.stats import entropy  # for KL (use small-smoothing)
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, confusion_matrix
from tqdm import tqdm
import math, json
import plotly.express as px
import matplotlib.pyplot as plt
from datetime import datetime
import plotly.graph_objects as go

# Dataset 1 - The APA DDoS Dataset

The APA DDoS Dataset, available at [Kaggle](https://www.kaggle.com/datasets/yashwanthkumbam/apaddos-dataset), is a HTTP request report containing IP source address, destinatination ip addres, request time frame and the attack label.



## Preparing the dataset
To make it useful for this project test, it was necessary to convert the columns name to the equivalent words used by the modules. The project assumes that the request information includes a HTTP method and an API endpoint, that is why the column ip.dst was converted to URI column.

In [239]:
import pandas as pd
ddos_data=pd.read_csv('./datasets/APA-DDoS-Dataset.csv')
ddos_data = ddos_data[['ip.src', 'ip.dst', 'frame.time', 'Label']] # original names
ddos_data.columns = ['address', 'uri', 'time_local', 'label'] # converted names

The time_local columns had to be formatted to discard miliseconds and location.

In [240]:
import re

ddos_data['time_local'] = ddos_data['time_local'].astype(str)
ddos_data['time_local'] = ddos_data['time_local'].apply(lambda x: re.findall('\d{2}-\w{3} \d{4} \d{2}:\d{2}:\d{2}', x)[0])
ddos_data['time_local'] = pd.to_datetime(ddos_data['time_local'])


invalid escape sequence '\d'


invalid escape sequence '\d'


invalid escape sequence '\d'



Below it is shown the number of reports benign traffic for each 5min-window. As can be seen, it was only possible to acquire 7 windows, and their reports are imbalanced.


In [241]:
ddos_data_benign = ddos_data.loc[ddos_data['label'] == 'Benign']
ddos_data_benign.groupby(pd.Grouper(key='time_local', freq='5min')).count()

Unnamed: 0_level_0,address,uri,label
time_local,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-06-16 20:15:00,6122,6122,6122
2020-06-16 20:20:00,19140,19140,19140
2020-06-16 20:25:00,12538,12538,12538
2020-06-16 20:30:00,0,0,0
2020-06-16 20:35:00,0,0,0
2020-06-16 20:40:00,0,0,0
2020-06-16 20:45:00,0,0,0
2020-06-16 20:50:00,0,0,0
2020-06-16 20:55:00,0,0,0
2020-06-16 21:00:00,0,0,0


In this experiment, a 5 minute window was fixed. Below it is shown the number of attack traffic reports for each 5min-window. It was also only possible to acquire 7 windows, and their reports are imbalanced.

In [242]:
ddos_data_attack = ddos_data.loc[~(ddos_data['label'] == 'Benign')].sort_values(by='time_local', ascending=True)
ddos_data_attack.groupby(pd.Grouper(key='time_local', freq='1min')).count()

Unnamed: 0_level_0,address,uri,label
time_local,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-06-16 20:18:00,5400,5400,5400
2020-06-16 20:19:00,5400,5400,5400
2020-06-16 20:20:00,0,0,0
2020-06-16 20:21:00,5400,5400,5400
2020-06-16 20:22:00,5400,5400,5400
...,...,...,...
2020-06-16 22:04:00,0,0,0
2020-06-16 22:05:00,5400,5400,5400
2020-06-16 22:06:00,0,0,0
2020-06-16 22:07:00,5400,5400,5400


In [243]:
groups = ddos_data.loc[ddos_data['time_local'] < datetime.strptime('2020-06-16 22:10:00', '%Y-%m-%d %H:%M:%S')].groupby(pd.Grouper(key='time_local', freq='5min')).count()
groups.shape

(23, 3)

In [244]:
from datetime import timedelta

groups = ddos_data.loc[ddos_data['time_local'] < datetime.strptime('2020-06-16 22:10:00', '%Y-%m-%d %H:%M:%S')].groupby(pd.Grouper(key='time_local', freq='5min')).count()

for index, group in groups.iterrows():
  end_interval = index + timedelta(minutes=5)
  dataset = ddos_data.loc[(ddos_data['time_local'] >= index) & (ddos_data['time_local'] < end_interval)]
  if(dataset.size > 0):
    display(dataset)
  #print(f"time: {index} | end: {end_interval}")

Unnamed: 0,address,uri,time_local,label
0,192.168.1.1,192.168.23.2,2020-06-16 20:18:15,DDoS-PSH-ACK
1,192.168.1.1,192.168.23.2,2020-06-16 20:18:15,DDoS-PSH-ACK
2,192.168.1.1,192.168.23.2,2020-06-16 20:18:15,DDoS-PSH-ACK
3,192.168.1.1,192.168.23.2,2020-06-16 20:18:15,DDoS-PSH-ACK
4,192.168.1.1,192.168.23.2,2020-06-16 20:18:15,DDoS-PSH-ACK
...,...,...,...,...
43917,192.168.5.1,192.168.23.2,2020-06-16 20:19:59,Benign
43918,192.168.5.1,192.168.23.2,2020-06-16 20:19:59,Benign
43919,192.168.5.1,192.168.23.2,2020-06-16 20:19:59,Benign
43920,192.168.5.1,192.168.23.2,2020-06-16 20:19:59,Benign


Unnamed: 0,address,uri,time_local,label
10800,192.168.7.1,192.168.23.2,2020-06-16 20:21:14,DDoS-PSH-ACK
10801,192.168.7.1,192.168.23.2,2020-06-16 20:21:14,DDoS-PSH-ACK
10802,192.168.7.1,192.168.23.2,2020-06-16 20:21:14,DDoS-PSH-ACK
10803,192.168.7.1,192.168.23.2,2020-06-16 20:21:14,DDoS-PSH-ACK
10804,192.168.7.1,192.168.23.2,2020-06-16 20:21:14,DDoS-PSH-ACK
...,...,...,...,...
63057,192.168.13.1,192.168.23.2,2020-06-16 20:24:59,Benign
63058,192.168.13.1,192.168.23.2,2020-06-16 20:24:59,Benign
63059,192.168.13.1,192.168.23.2,2020-06-16 20:24:59,Benign
63060,192.168.13.1,192.168.23.2,2020-06-16 20:24:59,Benign


Unnamed: 0,address,uri,time_local,label
27000,192.168.11.1,192.168.23.2,2020-06-16 20:25:27,DDoS-PSH-ACK
27001,192.168.11.1,192.168.23.2,2020-06-16 20:25:27,DDoS-PSH-ACK
27002,192.168.11.1,192.168.23.2,2020-06-16 20:25:27,DDoS-PSH-ACK
27003,192.168.11.1,192.168.23.2,2020-06-16 20:25:27,DDoS-PSH-ACK
27004,192.168.11.1,192.168.23.2,2020-06-16 20:25:27,DDoS-PSH-ACK
...,...,...,...,...
75595,192.168.19.1,192.168.23.2,2020-06-16 20:28:21,Benign
75596,192.168.19.1,192.168.23.2,2020-06-16 20:28:21,Benign
75597,192.168.19.1,192.168.23.2,2020-06-16 20:28:21,Benign
75598,192.168.19.1,192.168.23.2,2020-06-16 20:28:21,Benign


Unnamed: 0,address,uri,time_local,label
75600,192.168.1.1,192.168.23.2,2020-06-16 21:57:57,DDoS-ACK
75601,192.168.1.1,192.168.23.2,2020-06-16 21:57:57,DDoS-ACK
75602,192.168.1.1,192.168.23.2,2020-06-16 21:57:57,DDoS-ACK
75603,192.168.1.1,192.168.23.2,2020-06-16 21:57:57,DDoS-ACK
75604,192.168.1.1,192.168.23.2,2020-06-16 21:57:57,DDoS-ACK
...,...,...,...,...
120224,192.168.5.1,192.168.23.2,2020-06-16 21:59:59,Benign
120225,192.168.5.1,192.168.23.2,2020-06-16 21:59:59,Benign
120226,192.168.5.1,192.168.23.2,2020-06-16 21:59:59,Benign
120227,192.168.5.1,192.168.23.2,2020-06-16 21:59:59,Benign


Unnamed: 0,address,uri,time_local,label
86400,192.168.7.1,192.168.23.2,2020-06-16 22:01:09,DDoS-ACK
86401,192.168.7.1,192.168.23.2,2020-06-16 22:01:09,DDoS-ACK
86402,192.168.7.1,192.168.23.2,2020-06-16 22:01:09,DDoS-ACK
86403,192.168.7.1,192.168.23.2,2020-06-16 22:01:09,DDoS-ACK
86404,192.168.7.1,192.168.23.2,2020-06-16 22:01:09,DDoS-ACK
...,...,...,...,...
134650,192.168.16.1,192.168.23.2,2020-06-16 22:04:59,Benign
134651,192.168.16.1,192.168.23.2,2020-06-16 22:04:59,Benign
134652,192.168.16.1,192.168.23.2,2020-06-16 22:04:59,Benign
134653,192.168.16.1,192.168.23.2,2020-06-16 22:04:59,Benign


Unnamed: 0,address,uri,time_local,label
97200,192.168.14.1,192.168.23.2,2020-06-16 22:05:19,DDoS-ACK
97201,192.168.14.1,192.168.23.2,2020-06-16 22:05:19,DDoS-ACK
97202,192.168.14.1,192.168.23.2,2020-06-16 22:05:19,DDoS-ACK
97203,192.168.14.1,192.168.23.2,2020-06-16 22:05:19,DDoS-ACK
97204,192.168.14.1,192.168.23.2,2020-06-16 22:05:19,DDoS-ACK
...,...,...,...,...
148521,192.168.19.1,192.168.23.2,2020-06-16 22:09:57,Benign
148522,192.168.19.1,192.168.23.2,2020-06-16 22:09:57,Benign
148523,192.168.19.1,192.168.23.2,2020-06-16 22:09:57,Benign
148524,192.168.19.1,192.168.23.2,2020-06-16 22:09:57,Benign


# Dataset 2 - CICDDoS2019

The CICDDoS2019, available at [University of New Brunswick website](https://www.unb.ca/cic/datasets/ddos-2019.html), contains different kinds of DDoS attacks and realistic traffic profiles. It is a HTTP request report containing IP source address, destinatination ip addres, request time frame and the attack label.


## Preparing the dataset

Analysing csv columns to convert it in HTTP equivalent data.

In [245]:
file_ext='csv'
cic_03_11_path=f"./datasets/DrDos_UDP.{file_ext}"
cic_01_12_path=f"./datasets/UDP.{file_ext}"

In [246]:
cic_03_11_df=pd.read_csv(cic_03_11_path)
cic_03_11_df.columns

KeyboardInterrupt: 

In [None]:
cic_01_12_df=pd.read_csv(cic_01_12_path)
cic_01_12_df.columns

  cic_01_12_df=pd.read_csv(cic_01_12_path)


Index(['Unnamed: 0', 'Flow ID', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
       ' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets',
       'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
       ' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
       'Bwd Packet Length Max', ' Bwd Packet Length Min',
       ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s',
       ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max',
       ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std',
       ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean',
       ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags',
       ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags',
       ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s',
       ' Bwd Packets/s', ' Min Packet Len

In [None]:
def prepare_columns(df):
	"""Converting columns to represent HTTP endpoints"""
	df = df[[' Source IP', ' Destination IP', ' Destination Port', ' Timestamp', ' Label']].copy()
	
	df[' Destination IP'] = df[' Destination IP'].astype(str)
	df[' Destination Port'] = df[' Destination Port'].astype(str) 
	df['uri'] = df[' Destination IP'] + ':' + df[' Destination Port']
	df = df[[ ' Source IP', 'uri', ' Timestamp', ' Label']] # original names
	df.columns = ['address', 'uri', 'time_local', 'label'] # converted names

	# parse timestamp
	df['time_local'] = pd.to_datetime(df['time_local'], errors='coerce')
	df = df.sort_values('time_local')
	df = df.dropna(subset=['time_local'])

	# create normalized columns
	df['time_local'] = df['time_local'].dt.floor('s')
	df['uri'] = df['uri'].astype(str)
	df['address'] = df['address'].astype(str)
	df['label'] = df['label'].astype(str)

	return df

In [None]:
df_1 = prepare_columns(cic_03_11_df)
df_1.head()

Unnamed: 0,address,uri,time_local,label
0,172.16.0.5,192.168.50.1:6652,2018-12-01 12:36:57,DrDoS_UDP
1,172.16.0.5,192.168.50.1:9712,2018-12-01 12:36:57,DrDoS_UDP
2,172.16.0.5,192.168.50.1:4680,2018-12-01 12:36:57,DrDoS_UDP
3,172.16.0.5,192.168.50.1:2644,2018-12-01 12:36:57,DrDoS_UDP
4,172.16.0.5,192.168.50.1:16901,2018-12-01 12:36:57,DrDoS_UDP


In [None]:
df_1["time_local"].head(5).map(repr).tolist()

["Timestamp('2018-12-01 12:36:57')",
 "Timestamp('2018-12-01 12:36:57')",
 "Timestamp('2018-12-01 12:36:57')",
 "Timestamp('2018-12-01 12:36:57')",
 "Timestamp('2018-12-01 12:36:57')"]

In [None]:
df_2 = prepare_columns(cic_01_12_df)
df_2.head()

Unnamed: 0,address,uri,time_local,label
0,172.16.0.5,192.168.50.4:15931,2018-11-03 10:42:57,MSSQL
1,172.16.0.5,192.168.50.4:29997,2018-11-03 10:42:57,MSSQL
2,172.16.0.5,192.168.50.4:29887,2018-11-03 10:42:57,MSSQL
3,172.16.0.5,192.168.50.4:7393,2018-11-03 10:42:57,MSSQL
4,172.16.0.5,192.168.50.4:57957,2018-11-03 10:42:57,MSSQL


In [None]:
df_1.value_counts('label')

label
DrDoS_UDP    3134645
BENIGN          2157
Name: count, dtype: int64

In [None]:
df_2.value_counts('label')

label
UDP       3754680
MSSQL       24392
BENIGN       3134
Name: count, dtype: int64

In [None]:
def eda_label(df):
	time_col = 'time_local'
	label_col = 'label'

	df[time_col] = pd.to_datetime(df[time_col])

	# Resample (e.g. per minute, per second, etc.)
	# Choose the frequency depending on your dataset ('1T' = 1 minute, '1S' = 1 second, etc.)
	counts = df.groupby([label_col, pd.Grouper(key=time_col, freq='min')]).size().reset_index(name='count')

	# Plot evolution over time
	fig = px.line(counts, x=time_col, y='count', color=label_col,
				title='Event frequency over time',
				labels={time_col: 'Time', 'count': 'Count'})
	fig.show()

In [None]:
def eda_label(df):
	time_col = 'time_local'
	label_col = 'label'

	df[time_col] = pd.to_datetime(df[time_col])

	# Resample (e.g. per minute, per second, etc.)
	# Choose the frequency depending on your dataset ('1T' = 1 minute, '1S' = 1 second, etc.)
	counts = df.groupby([label_col, pd.Grouper(key=time_col, freq='min')]).size().reset_index(name='count')

		# Plot evolution over time
	fig = px.line(counts, x=time_col, y='count', color=label_col,
				title='Event frequency over time',
				labels={time_col: 'Time', 'count': 'Count'})

	fig.show()



In [None]:
df_benign_1 = df_1[df_1['label'] == 'BENIGN'].copy()

eda_label(df_1)

In [None]:
df_benign_2 = df_2[df_2['label'] == 'BENIGN'].copy()

eda_label(df_2)

Checking first time entry.

In [None]:
df_sorted = df_1.sort_values('time_local')
df_sorted.head(1)

Unnamed: 0,address,uri,time_local,label
0,172.16.0.5,192.168.50.1:6652,2018-12-01 12:36:57,DrDoS_UDP


In [None]:
df_agg = df_1.sort_values('time_local').groupby(['time_local','uri']).size()

df_agg

time_local           uri               
2018-12-01 12:36:57  172.16.0.5:21497      1
                     192.168.50.1:0        1
                     192.168.50.1:10012    2
                     192.168.50.1:10018    1
                     192.168.50.1:10028    2
                                          ..
2018-12-01 13:04:45  192.168.50.1:9907     6
                     192.168.50.1:9914     5
                     192.168.50.1:9938     1
                     192.168.50.1:9968     5
                     192.168.50.1:9970     5
Length: 1084960, dtype: int64

In [None]:
df_1['time_local'].value_counts()

time_local
2018-12-01 13:03:11    10776
2018-12-01 13:03:16    10094
2018-12-01 12:59:01    10077
2018-12-01 13:04:32     9963
2018-12-01 13:03:10     9930
                       ...  
2018-12-01 12:46:20        1
2018-12-01 12:46:26        1
2018-12-01 12:40:12        1
2018-12-01 12:48:58        1
2018-12-01 12:48:59        1
Name: count, Length: 1521, dtype: int64

In [None]:
display(df_1['uri'].nunique())
display(df_2['uri'].nunique())

set1 = set(df_1['uri'].unique())
set2 = set(df_2['uri'].unique())


common_values = set1.intersection(set2)
print(f"Common unique values: {len(common_values)}")

67029

67033

Common unique values: 51


In [None]:
df_t1 = df_1[df_1['uri'].isin(common_values)].copy()
df_t2 = df_2[df_2['uri'].isin(common_values)].copy()

display(df_t1.shape)
display(df_t1.shape)

(1295, 4)

(1295, 4)

In [None]:
df_t1['is_attack'] = ~df_t1['label'].astype(str).str.upper().eq('BENIGN')
df_t2['is_attack'] = ~df_t2['label'].astype(str).str.upper().eq('BENIGN')

In [None]:
display(df_t1['is_attack'].head())
display(df_t2['is_attack'].head())

29378    False
34281    False
36759    False
45152     True
48566    False
Name: is_attack, dtype: bool

5681     False
6320     False
14572    False
14575    False
14688    False
Name: is_attack, dtype: bool

In [None]:
counts_t1 = df_t1.groupby(['uri', 'is_attack']).size()
counts_t1 = counts_t1.unstack(level='is_attack')
counts_t1.head()

is_attack,False,True
uri,Unnamed: 1_level_1,Unnamed: 2_level_1
104.36.115.113:443,24.0,
172.16.0.5:10015,,1.0
172.16.0.5:15674,,1.0
172.16.0.5:18930,,1.0
172.16.0.5:20192,,1.0


In [None]:
counts_t2 = df_t2.groupby(['uri', 'is_attack']).size()
counts_t2 = counts_t2.unstack(level='is_attack')
counts_t2.head()

is_attack,False,True
uri,Unnamed: 1_level_1,Unnamed: 2_level_1
104.36.115.113:443,31.0,
172.16.0.5:10015,,1.0
172.16.0.5:15674,,1.0
172.16.0.5:18930,,1.0
172.16.0.5:20192,,1.0


In [None]:
display(counts_t1[True].value_counts())
display(counts_t1[True].isna().sum())
display(counts_t1[False].value_counts())
display(counts_t1[False].isna().sum())

True
0.0    36
1.0    14
3.0     1
Name: count, dtype: int64

np.int64(0)

False
0.0      15
3.0       5
2.0       5
6.0       4
1.0       3
5.0       2
15.0      2
11.0      2
14.0      2
24.0      1
4.0       1
36.0      1
30.0      1
10.0      1
66.0      1
541.0     1
74.0      1
132.0     1
25.0      1
194.0     1
Name: count, dtype: int64

np.int64(0)

In [None]:
display(counts_t2[True].value_counts())
display(counts_t2[True].isna().sum())
display(counts_t2[False].value_counts())
display(counts_t2[False].isna().sum())

True
1.0    14
2.0     1
Name: count, dtype: int64

np.int64(36)

False
2.0      4
11.0     3
4.0      3
5.0      3
7.0      2
1.0      2
31.0     1
13.0     1
9.0      1
55.0     1
14.0     1
21.0     1
8.0      1
17.0     1
6.0      1
54.0     1
49.0     1
3.0      1
70.0     1
47.0     1
630.0    1
65.0     1
115.0    1
15.0     1
97.0     1
Name: count, dtype: int64

np.int64(15)

In [None]:
counts_t1.fillna(value=0, inplace=True)
counts_t2.fillna(value=0, inplace=True)

In [None]:
counts_t1[counts_t1[True]==0].head()

is_attack,False,True
uri,Unnamed: 1_level_1,Unnamed: 2_level_1
104.36.115.113:443,24.0,0.0
172.217.1.162:443,5.0,0.0
172.217.1.2:443,6.0,0.0
172.217.10.130:443,36.0,0.0
172.217.10.34:443,11.0,0.0


In [None]:
display(counts_t1[counts_t1[True]==0].shape)
display(counts_t2[counts_t2[True]==0].shape)

(36, 2)

(36, 2)

In [None]:
display(counts_t1[counts_t1[False]==0].shape)
display(counts_t2[counts_t2[False]==0].shape)

(15, 2)

(15, 2)

In [None]:
counts_t1

is_attack,False,True
uri,Unnamed: 1_level_1,Unnamed: 2_level_1
104.36.115.113:443,24.0,0.0
172.16.0.5:10015,0.0,1.0
172.16.0.5:15674,0.0,1.0
172.16.0.5:18930,0.0,1.0
172.16.0.5:20192,0.0,1.0
172.16.0.5:31864,0.0,1.0
172.16.0.5:35571,0.0,1.0
172.16.0.5:36339,0.0,1.0
172.16.0.5:36357,0.0,1.0
172.16.0.5:41344,0.0,1.0


In [None]:
attack_end = counts_t1[counts_t1[True] > 0].reset_index()
attack_end

is_attack,uri,False,True
0,172.16.0.5:10015,0.0,1.0
1,172.16.0.5:15674,0.0,1.0
2,172.16.0.5:18930,0.0,1.0
3,172.16.0.5:20192,0.0,1.0
4,172.16.0.5:31864,0.0,1.0
5,172.16.0.5:35571,0.0,1.0
6,172.16.0.5:36339,0.0,1.0
7,172.16.0.5:36357,0.0,1.0
8,172.16.0.5:41344,0.0,1.0
9,172.16.0.5:45444,0.0,3.0


In [None]:
counts_t2_r = counts_t2.reset_index()
attack_d2 = counts_t2_r[counts_t2_r['uri'].isin(attack_end['uri'])]
attack_d2.shape

(15, 3)

In [None]:
df_t1[df_t1['uri']=='104.36.115.113:443']

Unnamed: 0,address,uri,time_local,label,is_attack
71579,192.168.50.8,104.36.115.113:443,2018-12-01 12:54:42,BENIGN,False
126938,192.168.50.8,104.36.115.113:443,2018-12-01 12:55:12,BENIGN,False
259890,192.168.50.8,104.36.115.113:443,2018-12-01 12:55:43,BENIGN,False
410515,192.168.50.8,104.36.115.113:443,2018-12-01 12:56:03,BENIGN,False
521272,192.168.50.8,104.36.115.113:443,2018-12-01 12:56:23,BENIGN,False
584621,192.168.50.8,104.36.115.113:443,2018-12-01 12:56:38,BENIGN,False
1336925,192.168.50.6,104.36.115.113:443,2018-12-01 12:59:14,BENIGN,False
1337538,192.168.50.6,104.36.115.113:443,2018-12-01 12:59:15,BENIGN,False
1337652,192.168.50.6,104.36.115.113:443,2018-12-01 12:59:15,BENIGN,False
1384004,192.168.50.6,104.36.115.113:443,2018-12-01 12:59:25,BENIGN,False


In [None]:
eda_label(df_t1[df_t1['uri']=='172.16.0.5:15674'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
attack_end['uri']

0     172.16.0.5:10015
1     172.16.0.5:15674
2     172.16.0.5:18930
3     172.16.0.5:20192
4     172.16.0.5:31864
5     172.16.0.5:35571
6     172.16.0.5:36339
7     172.16.0.5:36357
8     172.16.0.5:41344
9     172.16.0.5:45444
10    172.16.0.5:48462
11    172.16.0.5:55727
12    172.16.0.5:57107
13    172.16.0.5:59011
14     172.16.0.5:7469
Name: uri, dtype: object

In [None]:
df_eda = df_t1[df_t1['uri'].isin(attack_end['uri'])]

result = (
    df_eda.groupby('uri')
      .agg(initial_time=('time_local', 'min'),
           final_time=('time_local', 'max'))
      .reset_index()
)

result

Unnamed: 0,uri,initial_time,final_time
0,172.16.0.5:10015,2018-12-01 12:37:05,2018-12-01 12:37:05
1,172.16.0.5:15674,2018-12-01 13:03:20,2018-12-01 13:03:20
2,172.16.0.5:18930,2018-12-01 12:59:49,2018-12-01 12:59:49
3,172.16.0.5:20192,2018-12-01 12:55:48,2018-12-01 12:55:48
4,172.16.0.5:31864,2018-12-01 12:50:47,2018-12-01 12:50:47
5,172.16.0.5:35571,2018-12-01 12:56:45,2018-12-01 12:56:45
6,172.16.0.5:36339,2018-12-01 12:56:27,2018-12-01 12:56:27
7,172.16.0.5:36357,2018-12-01 12:48:32,2018-12-01 12:48:32
8,172.16.0.5:41344,2018-12-01 12:55:10,2018-12-01 12:55:10
9,172.16.0.5:45444,2018-12-01 12:52:28,2018-12-01 12:52:28


In [None]:
import datetime as dt
import datetime

result["window_60s"] = (
    (result["final_time"] - result["initial_time"]) 
    / datetime.timedelta(seconds=60)
)

result.sort_values('window_60s')

wd_df = result['window_60s'].value_counts().reset_index().sort_values(['window_60s'])
wd_df[wd_df['window_60s'] >= 20]

Unnamed: 0,window_60s,count


In [None]:
wd_df

Unnamed: 0,window_60s,count
0,0.0,15


In [None]:
uris_gt_20 = result[result['window_60s'] >=20]['uri']
uris_gt_20

31        192.168.10.50:22
36             224.0.0.5:0
48               8.0.6.4:0
40      255.255.255.255:67
42              4.2.2.4:53
47       74.208.236.171:80
49              8.8.8.8:53
39    239.255.255.250:1900
45          72.21.91.29:80
Name: uri, dtype: object

In [None]:
df_t1 = df_t1[df_t1['uri'].isin(uris_gt_20)]
df_t1.shape

(1063, 5)

In [None]:
df_t1[['uri', 'is_attack']].groupby(['uri', 'is_attack']).size().unstack(level='is_attack').reset_index()

is_attack,uri,False
0,192.168.10.50:22,14
1,224.0.0.5:0,66
2,239.255.255.250:1900,2
3,255.255.255.255:67,15
4,4.2.2.4:53,541
5,72.21.91.29:80,74
6,74.208.236.171:80,132
7,8.0.6.4:0,25
8,8.8.8.8:53,194


## Training phase: identifying typical traffic behavior

In [None]:
### TRAINING MODULE: detects typical traffic flow

def compute_lambda_by_api_endpoint(df, label_benign='BENIGN', window_seconds=60):
	train_df = df[df['label'].astype(str).str.upper() == label_benign.upper()]

	lambdas = {}

	# time range for entire trace (use global range to create aligned second index)
	tmin = df['time_local'].min()
	tmax = df['time_local'].max()

	# for each uri compute per-second counts and windowed mean
	groups = train_df.groupby(['uri'])

	for uri, g in groups:
		# for each timestamp calculates the total of requests
		counts = g.groupby('time_local').size() 

		# reindex full second range for stability
		# to avoid missing seconds
		all_seconds = pd.date_range(tmin, tmax, freq='s')
		counts = counts.reindex(all_seconds, fill_value=0)
		arr = counts.values
		
		# filtra os segundos do 
		if len(arr) < window_seconds:
			lam = arr.mean()
		else:
			# calculates how many windows are in the interval
			# // returns floor round, so the result can be lower
			# e.g. 7 secs // 3 secs -> 2 windows (only considers 6 seconds)
			n_win = len(arr) // window_seconds

			# filters interval to the exact windows size
			arr = arr[:n_win*window_seconds]
			
			# fragments the interval in the number of windows
			arr = arr.reshape(n_win, window_seconds)

			# calculates each window mean
			window_means = arr.sum(axis=1) / window_seconds

			# computes the mean of the means as lambda
			lam = max(window_means.mean(), 0.0)
		lambdas[uri] = lam
	return lambdas

## Monitoring phase: observes traffic for atypical behavior

In [None]:
### MONITORING MODULE: Observes the traffic and compare with the registered lambdas
def kl_divergence(p, q):
	''' Calculates KL divergence between p and q'''
	SIG_EPS = 1e-10 # avoids division by zero
	p = np.asarray(p, dtype=float) + SIG_EPS
	q = np.asarray(q, dtype=float) + SIG_EPS
	return entropy(p, q)

def gaussian_membership(u, mu=0.0, sigma=1.0):
    return math.exp(-((u-mu)**2) / (2*(sigma**2)))

def calculate_D(obs_counts, expected, alpha, pmf_y, dX):
	# checks for anomaly
	SIG_EPS = 1e-10 # avoids division by zero
	try:
		if np.mean(expected) == 0:
			print("ZERROOOOU")
			expected = np.asarray(expected, dtype=float) + SIG_EPS
			_, pval = np.nan, 1.0
		_, pval = chisquare(f_obs=obs_counts, f_exp=expected)
	except Exception:
		_, pval = np.nan, 1.0
	if pval < alpha:
		D = kl_divergence(pmf_y, dX)
	else:
		D = 0.0
	return D

def extract_traffic_changes(obs_counts, expected, alpha, pmf_y, dX, current_window, lambda_ep, seconds_in_window):
	D = calculate_D(obs_counts, expected, alpha, pmf_y, dX)
	Xbar = current_window.mean()
	Delta = (Xbar - lambda_ep) / max(lambda_ep, 1)
	Z = (Xbar - lambda_ep) / math.sqrt(max(lambda_ep, 1) / seconds_in_window)
	return D, Delta, Z

def fuzzification(current_window, D, Delta, Z):
	sigma_u = max(1.0, np.std(current_window))  # adaptive width
	fD = gaussian_membership(D, mu=0.0, sigma=sigma_u)
	fDelta = gaussian_membership(Delta, mu=0.0, sigma=sigma_u)
	fZ = gaussian_membership(Z, mu=0.0, sigma=sigma_u)
	return fD, fDelta, fZ

def anomaly_score(fD, fDelta, fZ, wDelta, wD, wZ):
	fDprime = 1 - fD
	fDelprime = 1 - fDelta
	fZprime = 1 - fZ
	eta = fDelprime + fDprime + fZprime
	return eta/3

def sample_expected_traffic(max_count, current_window, lambda_ep):
	# samples a poisson pmf for the estimated lambda
	bins = np.arange(0, max_count+1)
	dY = poisson.pmf(bins, mu=lambda_ep)
	if dY.sum() == 0:
		# if pmf invalid, uses a uniform
		dY = np.ones_like(dY) / len(dY)
	else:
		dY = dY / dY.sum()
	expected_traffic = dY * current_window.size
	return expected_traffic, dY

def get_observed_traffic(max_count, current_window):
	obs_traffic, _ = np.histogram(current_window, bins=np.arange(0,max_count+2))
	if obs_traffic.sum() == 0:
		dX = np.zeros_like(obs_traffic, dtype=float)
	else:
		dX = obs_traffic / obs_traffic.sum()
	return obs_traffic, dX


def analyze_window(current_window, lambda_endpoint, seconds_in_window=60, alpha=0.05, beta=1.0, zeta=0.5, wD=0.5, wDelta=0.3, wZ=0.2):
	''''''
	max_count = max(int(current_window.max()), int(lambda_endpoint*2)+5)
	
	obs_traffic, dX = get_observed_traffic(max_count, current_window)
	exp_traffic, dY = sample_expected_traffic(max_count, current_window, lambda_endpoint)

	D, Delta, Z = extract_traffic_changes(obs_traffic, exp_traffic, alpha, dY, dX, current_window, lambda_endpoint, seconds_in_window)
	fD, fDelta, fZ = fuzzification(current_window, D, Delta, Z)
	eta = anomaly_score(fD, fDelta, fZ, wDelta, wD, wZ)	

	#C2 = (1 - zeta) * math.tanh(beta - D) + zeta * (1 - eta)
	#C2 = math.tanh(1.764 * (-eta + 1.5))
	#C2 = beta - eta/3
	#C2 = map_interval(eta/3)
	C2= -math.tanh((eta - beta)*2)
	C22 = math.tanh(beta - D)

	return {'D': D, 'Delta': Delta, 'Z': Z, 'eta': eta, 'C2': C2, 'C22': C22, 'fD': fD, 'fDelta': fDelta, 'fZ': fZ}

In [None]:
import time

def scaled_sigmoid(x, alpha=3.063, beta=0.5):
    return 1 / (1 + np.exp(-alpha * (x - beta)))

def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def update_Ra2(old_Ra, C2):
	new_Ra = old_Ra + C2

	return new_Ra

def update_Ra4(old_Ra, C2):
	new_Ra = 0.5*old_Ra + 0.5*C2

	return new_Ra

def update_Ra3(old_Ra, C2, P, Q):
	P_pred = P + Q
	K = P_pred / (P_pred + 1e-2)

	new_Ra = old_Ra + K * (C2 - old_Ra)
	P = (1-K) * P_pred	

	return new_Ra, P, Q	

def update_Ra(old_Ra, C2):
	new_Ra = scaled_sigmoid(old_Ra + C2)
	return new_Ra	

def update_Ra_old(old_Ra, C2):
	new_Ra = sigmoid(old_Ra + C2)
	return new_Ra	

# Main sweep function. It builds windowed results per uri.
def run_detection(df, endpoints_lambdas, seconds_in_window=60, stride_seconds=None,
                  alpha=0.05, beta=0.5, zeta=0.5, init_Ra=1.0, init_Rc=1.0, rc_clients=None):
	# window sliding
	if stride_seconds is None:
		stride_seconds = seconds_in_window

	# creates per uri second-aligned series (global time index)
	tmin = df['time_local'].min()
	tmax = df['time_local'].max()
	full_index = pd.date_range(tmin, tmax, freq='s')

	results = []
	Ra = {}
	Ra_2 = {}
	
	# creates a mapping from per-second windows to whether any attack occurred in that window (groundtruth)
	df['is_attack'] = ~df['label'].astype(str).str.upper().eq('BENIGN')
	grouped = df.groupby('uri')

	# for every endpoint, shows progress
	for uri, g in tqdm(grouped, desc='APIs'):
		# builds per-second request series for this endpoint
		requests_per_sec = g.groupby('time_local').size().reindex(full_index, fill_value=0).values
		lam = endpoints_lambdas.get(uri, requests_per_sec.mean())
		Ra[uri] = init_Ra
		Ra_2[uri] = init_Ra
		secs = len(requests_per_sec)

		# iterates all windows start
		for start in range(0, secs - seconds_in_window + 1, stride_seconds):
			start_time = time.perf_counter()
			current_window = requests_per_sec[start:start+seconds_in_window]
			out = analyze_window(current_window, lam, seconds_in_window, alpha, beta, zeta)
			Ra[uri] = update_Ra(Ra[uri], out['C2'])
			Ra_2[uri] = update_Ra_old(Ra_2[uri], out['C22'])
			
			# map window start time to actual timestamp for groundtruth labeling
			win_start_ts = full_index[start]
			win_end_ts = full_index[start+seconds_in_window-1]
			
			# groundtruth: if any flow to this API-endpoint in window is attack -> attack window for this pair
			mask = (g['time_local'] >= win_start_ts) & (g['time_local'] <= win_end_ts)
			label_attack = g.loc[mask, 'is_attack'].any()

			end_time = time.perf_counter()
			results.append({
				'endpoint': uri, 'win_start': win_start_ts,
				'win_end': win_end_ts, 'lambda': lam, 'C2': out['C2'], 'D': out['D'],
				'Delta': out['Delta'], 'Z': out['Z'], 'eta': out['eta'],
				'Ra': Ra[uri], 'label_attack': bool(label_attack),
				'Ra_2': Ra_2[uri], 'fDP': 1-out['fD'], 'fDeltaP': 1-out['fDelta'], 'fZP': 1-out['fZ'],
				'time_computation': end_time - start_time
			})
	res_df = pd.DataFrame(results)
	return res_df

# Evaluation

In [None]:
def plot_results(plot_examples, res_df,df):
	# plot Ra timeseries for some example api-endpoints
	unique_pairs = res_df['endpoint'].drop_duplicates().sample(min(plot_examples, len(res_df)))

	for endpoint in unique_pairs:
		df_flow = df[df['uri'] == endpoint].sort_values('time_local').copy()
		df_flow = df_flow.groupby(['time_local']).size()

		time_col = 'win_end'
		sub = res_df[res_df['endpoint'] == endpoint].sort_values(time_col).copy()
		sub['label_attack'] = sub['label_attack'].map({True: 'Attack', False: 'Benign'})

		fig = go.Figure()

		# --- Ra dots (colored by attack label) ---
		color_map = {'Benign': 'green', 'Attack': 'red'}
		fig.add_trace(go.Scatter(
			x=sub[time_col],
			y=sub['Ra'],
			mode='markers',
			marker=dict(
				color=sub['label_attack'].map(color_map),
				size=7,
				line=dict(width=0.5, color='black')
			),
			name='Ra (colored by Attack/Benign)',
			hovertemplate='Time: %{x}<br>Ra: %{y:.3f}<br>Status: %{marker.color}<extra></extra>')
		)

		#fig.add_trace(go.Scatter(
		#	x=df_flow.index,
		#	y=df_flow.values,
		#	mode='lines',
		#	name='Flow by time',
		#	hovertemplate='Time: %{x}<br>Ra: %{y:.3f}<br>Status: %{marker.color}<extra></extra>')
		#)

		# --- Other metrics (D, Z, Delta) as simple lines ---
		metric_colors = {
			'D': '#1f77b4', 
			'Z': '#ff7f0e', 
			'Delta': '#2ca02c', 
			'eta': "#fde861", 
			#'Ra_2': "#fd61bc", 
			'C2': "#ff0000", 
			#'fDP': "#ff00ff", 
			#'fDeltaP': "#00c3ff", 
			#'fZP': "#bbff00"
		}  # distinct line colors

		for metric, color in metric_colors.items():
			fig.add_trace(go.Scatter(
				x=sub[time_col],
				y=sub[metric],
				mode='lines',
				line=dict(color=color, width=2),
				name=metric
			))

		# --- Layout ---
		fig.update_layout(
			title=f'Metrics evolution for the {endpoint}',
			xaxis_title='Time',
			yaxis_title='Metric value',
			template='plotly_white',
			legend_title_text='Metric',
			margin=dict(l=50, r=30, t=60, b=40)
		)

		fig.show()

def evaluate_results(res_df, ra_threshold=0.5):
	if res_df is None or len(res_df)==0:
		print('No results to evaluate.')
		return
	# make binary predictions using Ra threshold (lower Ra -> more anomalous)
	y_true = res_df['label_attack'].astype(int).values
	y_score = res_df['C2'].astype(float).values
	y_pred = (y_score < 0).astype(int)  # equivalent to C2 < 0
	precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary', zero_division=0)
	roc = roc_auc_score(y_true, y_score) if len(np.unique(y_true))>1 else float('nan')
	print(f'Precision={precision:.4f}  Recall={recall:.4f}  F1={f1:.4f}  ROC_AUC={roc:.4f}')
	# simple confusion table
	tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
	print('TP',tp,'FP',fp,'FN',fn,'TN',tn, 'Execution time: ', np.mean(res_df['time_computation']))

def evaluate_results2(res_df, final, ra_threshold=0.5):
	if final is None or len(res_df)==0:
		print('No results to evaluate.')
		return
	# make binary predictions using Ra threshold (lower Ra -> more anomalous)
	y_true = final['ground_truth'].astype(int).values
	y_pred = final['Class'].astype(int).values
	precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary', zero_division=0)
	roc = roc_auc_score(y_true, y_pred) if len(np.unique(y_true))>1 else float('nan')
	print(f'Sample: Precision={precision:.4f}  Recall={recall:.4f}  F1={f1:.4f}  ROC_AUC={roc:.4f}')
	# simple confusion table
	tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
	print('TP',tp,'FP',fp,'FN',fn,'TN',tn, 'Execution time: ', np.mean(res_df['time_computation']))


def evaluate_and_plot(res_df, df, ra_threshold=0.5, plot_examples=5):
	evaluate_results(res_df, ra_threshold)
	plot_results(plot_examples, res_df, df)

def evaluate_and_plot2(res_df, final, df, ra_threshold=0.5, plot_examples=5):
	evaluate_results2(res_df, final, ra_threshold)
	plot_results(plot_examples, res_df, df)
	

In [None]:
def simulate(df_train, df_test, window_size, beta):
	endpoints_lambdas = compute_lambda_by_api_endpoint(df_train, window_seconds=window_size)
	res_df = run_detection(df_test, endpoints_lambdas, seconds_in_window=window_size, beta=beta)
	evaluate_and_plot(res_df, df_test, ra_threshold=beta, plot_examples=1)
	return res_df

In [None]:
#[df_1['uri']=='192.168.50.1:16478'] #192.168.50.1:10229
res_df = simulate(df_train=df_1, df_test=df_2, window_size=60, beta=0.1)


divide by zero encountered in divide


invalid value encountered in divide

APIs: 100%|██████████| 67033/67033 [19:18<00:00, 57.84it/s]


Precision=0.9682  Recall=0.5505  F1=0.7019  ROC_AUC=0.2013
TP 203731 FP 6693 FN 166340 TN 829830 Execution time:  0.000903188532023396


In [None]:
gt_ = df_2[['uri', 'is_attack']].groupby(['uri', 'is_attack']).size().unstack(level='is_attack').reset_index()
gt = gt_.fillna(0)
gt

is_attack,uri,False,True
0,104.19.195.151:443,11.0,0.0
1,104.20.11.130:443,3.0,0.0
2,104.20.12.130:443,13.0,0.0
3,104.20.12.130:80,5.0,0.0
4,104.36.115.113:443,31.0,0.0
...,...,...,...
67028,8.43.72.98:443,2.0,0.0
67029,8.8.8.8:53,97.0,0.0
67030,87.250.250.119:443,6.0,0.0
67031,91.189.89.199:123,1.0,0.0


Precision=0.9998  Recall=0.7537  F1=0.8595  ROC_AUC=0.8705
TP 50045 FP 8 FN 16358 TN 622 Execution time:  0.000903188532023396


In [None]:
evaluate_and_plot(res_df, df_2, ra_threshold=0.1, plot_examples=1)

Precision=0.9682  Recall=0.5505  F1=0.7019  ROC_AUC=0.2013
TP 203731 FP 6693 FN 166340 TN 829830 Execution time:  0.000903188532023396


In [None]:
final_report = res_df[['endpoint', 'label_attack']].copy()
final_report = final_report.groupby(['endpoint','label_attack']).size()
counts_df = final_report.unstack(level='label_attack')

counts_df.head()

label_attack,False,True
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1
104.19.195.151:443,18.0,
104.20.11.130:443,18.0,
104.20.12.130:443,18.0,
104.20.12.130:80,18.0,
104.36.115.113:443,18.0,


In [None]:
df_true = counts_df[counts_df[False] > 0]
df_true.shape[0]

67033

In [None]:
df_not_true = counts_df[(counts_df[False] == 0) | (counts_df[False].isna())]
df_not_true.head()


label_attack,False,True
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1


In [None]:
evaluate_and_plot(res_df, df_2, ra_threshold=0.1, plot_examples=3)

Precision=0.9682  Recall=0.5505  F1=0.7019  ROC_AUC=0.2013
TP 203731 FP 6693 FN 166340 TN 829830 Execution time:  0.000903188532023396


In [None]:
# Run evaluation if res_df exists
t_minutes = [1*60, 5*60, 10*60]

#param_combos = [(t, b) for t in t_minutes for b in beta]
for t in t_minutes:
	print(f"Testing parameters: t={t}s window")
	simulate(df_train=df_1, df_test=df_2, window_size=t, beta=0.1)
	print()

Testing parameters: t=60s window



divide by zero encountered in divide


invalid value encountered in divide

APIs: 100%|██████████| 67033/67033 [20:25<00:00, 54.68it/s]


Precision=0.9682  Recall=0.5505  F1=0.7019  ROC_AUC=0.2013
TP 203731 FP 6693 FN 166340 TN 829830 Execution time:  0.000957789205982206



Testing parameters: t=300s window


APIs: 100%|██████████| 67033/67033 [04:32<00:00, 246.29it/s]


Precision=0.3558  Recall=0.5011  F1=0.4161  ROC_AUC=0.6775
TP 44271 FP 80173 FN 44070 TN 32585 Execution time:  0.0011443668233113637



Testing parameters: t=600s window


APIs: 100%|██████████| 67033/67033 [02:00<00:00, 557.39it/s]


Precision=0.3111  Recall=0.7819  F1=0.4451  ROC_AUC=0.6684
TP 17406 FP 38544 FN 4855 TN 6228 Execution time:  0.0012428893112046847





In [247]:
# Run evaluation if res_df exists
t_minutes = [1*60]
b = [0.1, 0.3, 0.5, 0.7, 0.9, 1]

#param_combos = [(t, b) for t in t_minutes for b in beta]
for t in t_minutes:
	for beta in b:
		print(f"Testing parameters: t={t}s window, beta={beta} treshold")
		res_df = simulate(df_train=df_1, df_test=df_2, window_size=t, beta=beta)
		result_g = res_df.groupby(['endpoint']).agg({'Ra':'min'}).reset_index()
		result_g['Class'] = result_g['Ra'] <= 0.5
		result_g = result_g.rename(columns={'endpoint':'uri'})
		merged_r = pd.merge(result_g, gt, on='uri', how='inner')
		merged_r['ground_truth'] = merged_r[True] > 0
		final_report = merged_r[['uri', 'Class', 'ground_truth']]
		final_report

		evaluate_and_plot2(res_df, final_report, df_2, ra_threshold=0.1, plot_examples=1)
		print()

Testing parameters: t=60s window, beta=0.1 treshold



divide by zero encountered in divide


invalid value encountered in divide

APIs: 100%|██████████| 67033/67033 [20:55<00:00, 53.41it/s]


Precision=0.9682  Recall=0.5505  F1=0.7019  ROC_AUC=0.2013
TP 203731 FP 6693 FN 166340 TN 829830 Execution time:  0.0009762021910140102


Sample: Precision=0.9998  Recall=0.7537  F1=0.8595  ROC_AUC=0.8705
TP 50045 FP 8 FN 16358 TN 622 Execution time:  0.0009762021910140102



Testing parameters: t=60s window, beta=0.3 treshold



divide by zero encountered in divide


invalid value encountered in divide

APIs: 100%|██████████| 67033/67033 [21:23<00:00, 52.23it/s]


Precision=0.9989  Recall=0.0852  F1=0.1570  ROC_AUC=0.2013
TP 31520 FP 34 FN 338551 TN 836489 Execution time:  0.0010014476825691452


Sample: Precision=0.9983  Recall=0.0181  F1=0.0356  ROC_AUC=0.5075
TP 1202 FP 2 FN 65201 TN 628 Execution time:  0.0010014476825691452



Testing parameters: t=60s window, beta=0.5 treshold



divide by zero encountered in divide


invalid value encountered in divide

APIs: 100%|██████████| 67033/67033 [21:08<00:00, 52.85it/s]


Precision=0.9661  Recall=0.0022  F1=0.0045  ROC_AUC=0.2013
TP 826 FP 29 FN 369245 TN 836494 Execution time:  0.000991500077440328


Sample: Precision=0.3333  Recall=0.0000  F1=0.0000  ROC_AUC=0.4984
TP 1 FP 2 FN 66402 TN 628 Execution time:  0.000991500077440328



Testing parameters: t=60s window, beta=0.7 treshold



divide by zero encountered in divide


invalid value encountered in divide

APIs: 100%|██████████| 67033/67033 [21:28<00:00, 52.04it/s]


Precision=0.0000  Recall=0.0000  F1=0.0000  ROC_AUC=0.2013
TP 0 FP 4 FN 370071 TN 836519 Execution time:  0.0010063910871482879


Sample: Precision=0.0000  Recall=0.0000  F1=0.0000  ROC_AUC=0.5000
TP 0 FP 0 FN 66403 TN 630 Execution time:  0.0010063910871482879



Testing parameters: t=60s window, beta=0.9 treshold



divide by zero encountered in divide


invalid value encountered in divide

APIs: 100%|██████████| 67033/67033 [20:54<00:00, 53.44it/s]


Precision=0.0000  Recall=0.0000  F1=0.0000  ROC_AUC=0.2013
TP 0 FP 0 FN 370071 TN 836523 Execution time:  0.000983242722595752


Sample: Precision=0.0000  Recall=0.0000  F1=0.0000  ROC_AUC=0.5000
TP 0 FP 0 FN 66403 TN 630 Execution time:  0.000983242722595752



Testing parameters: t=60s window, beta=1 treshold



divide by zero encountered in divide


invalid value encountered in divide

APIs: 100%|██████████| 67033/67033 [19:40<00:00, 56.80it/s]


Precision=0.0000  Recall=0.0000  F1=0.0000  ROC_AUC=0.2013
TP 0 FP 0 FN 370071 TN 836523 Execution time:  0.0009268785197301725


Sample: Precision=0.0000  Recall=0.0000  F1=0.0000  ROC_AUC=0.5000
TP 0 FP 0 FN 66403 TN 630 Execution time:  0.0009268785197301725





# Drafts

In [None]:
# @title Mocked Storage

import pandas as pd
import datetime

# ecossistem com reputação negativa
clients_reputation = {
  'id': ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29'],
  'indicator':[0.5, 0.5, 0.3, 0.2, 0.3, 0.4, 0.2, 0.3, 0.4, 0.1],
  'lastUpdate':['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
  'firstUpdate':[datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now(), datetime.now()],
  'clientIP':['10.0.10.0', '10.0.10.1', '10.0.10.2', '10.0.10.3', '10.0.10.4', '10.0.10.5', '10.0.10.6', '10.0.10.7', '10.0.10.8', '10.0.10.9', '10.0.10.10', '10.0.10.11', '10.0.10.12', '10.0.10.13', '10.0.10.14', '10.0.10.15', '10.0.10.16', '10.0.10.17', '10.0.10.18', '10.0.10.19', '10.0.10.20', '10.0.10.21', '10.0.10.22', '10.0.10.23', '10.0.10.24', '10.0.10.25', '10.0.10.26', '10.0.10.27', '10.0.10.28', '10.0.10.29'],
  'isValid':['1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1']
}

endpoints_reputation = {
  'id': [],
  'indicator':[],
  'lastUpdate':[],
  'firstUpdate':[],
  'uri':[],
  'isValid':[]
  }

requests_history = {
  'id': [],
  'time_local':[],
  'resp_body_size':[],
  'address':[],
  'request_length':[],
  'method':[],
  'uri': [],
  'status':[],
  'user_agent':[],
  'resp_time':[],
  'upstream_addr':[]
  }

# Estados inciais
def get_clean_storage(reputations):
    clients_reputation_copy=clients_reputation.copy()
    scenario_reputations = list(np.round(reputations, 2))
    clients_reputation_copy['indicator'] = scenario_reputations

    db = create_data_storage(clients_reputation=clients_reputation_copy, endpoints_reputation=endpoints_reputation,
                             requests_history=requests_history)
    return db

def create_data_storage(clients_reputation, endpoints_reputation, requests_history):
    df_clients_reputation = pd.DataFrame(clients_reputation)
    df_endpoints_reputation = pd.DataFrame(endpoints_reputation)
    df_requests_history = pd.DataFrame(requests_history)
    return {
      "clients_reputation": df_clients_reputation,
      "endpoints_reputation": df_endpoints_reputation,
      "requests_history": df_requests_history
    }


AttributeError: module 'datetime' has no attribute 'now'

In [None]:
xs = np.linspace(1, 3, num=50, endpoint=True, retstep=False, dtype=None, axis=0)
y = [math.tanh(1.764 * (x - 1.5)) for x in xs]

plt.plot(xs,y)
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def map_interval(x):
	"""
	Maps x in [0, 1] to [1, -1], with:
		[0, 0.33] -> [1, 0]
		(0.33, 1] -> (0, -1]
	"""
	threshold = 0.2
	x = np.clip(x, 0, 1)
	threshold = np.clip(threshold, 1e-9, 1 - 1e-9)  # avoid division by zero

	y = np.where(
		x <= threshold,
		1 - x / threshold,                     # [0, threshold] → [1, 0]
		-(x - threshold) / (1 - threshold)     # (threshold, 1] → (0, -1]
	)
	return y

# Generate points
x = np.linspace(0, 1, 500)
y = map_interval(x)

# Plot
plt.figure(figsize=(6, 4))
plt.plot(x, y, label='mapped interval', color='blue', linewidth=2)
plt.axvline(0.33, color='gray', linestyle='--', alpha=0.6, label='x = 0.33')
plt.title('Piecewise Mapping: [0,1] → [1,-1]')
plt.xlabel('Input x')
plt.ylabel('Mapped value f(x)')
plt.grid(True, linestyle=':', alpha=0.7)
plt.legend()
plt.show()


## Referências


https://math.stackexchange.com/questions/2282342/central-limit-theorem-poisson-equals-normal-tell-me-where-im-wrong

https://stats.stackexchange.com/questions/438060/does-this-code-demonstrate-the-central-limit-theorem/438117#438117


https://www.statology.org/normality-test-python/

https://stackoverflow.com/questions/73407488/how-to-check-the-normality-of-data-on-a-column-grouped-by-an-index


https://stats.stackexchange.com/questions/194182/beta-as-distribution-of-proportions-or-as-continuous-binomial

https://stats.stackexchange.com/questions/316086/distribution-that-has-a-range-from-0-to-1-and-with-peak-between-them


### Videos

https://www.youtube.com/watch?v=pYRG5X7110M

https://www.youtube.com/watch?v=RawXxYCOaig