In [1]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
import plotly.express as px
# Register tqdm's `progress_apply` methods on pandas objects.
tqdm.pandas()

# Dataset variant; interpolated into the input/output file names used below.
LEVEL = "small"

# Ask the inline backend to emit figures in SVG format.
%config InlineBackend.figure_formats = ['svg']

In [2]:
# Load the extracted ugr16 CSV for the selected LEVEL; CSV column 0 is used as the row index.
DATASET_PATH = f"../../data/extract/ugr16-{LEVEL}.csv"
data_df = pd.read_csv(DATASET_PATH, index_col=0)
data_df.head()

Unnamed: 0,datetime_end,duration,source_ip,destination_ip,source_port,destination_port,protocol,flag,forwarding,type,packet,byte,label
0,2016-08-07 21:08:25,0.484,253.139.127.229,42.219.156.215,25,48324,TCP,.AP.SF,0,0,5,564,anomaly-spam
1,2016-08-08 14:10:17,0.468,253.139.127.230,42.219.156.213,25,51871,TCP,.AP.SF,0,0,5,564,anomaly-spam
2,2016-03-24 11:35:33,0.296,42.219.156.223,108.66.255.194,40484,25,TCP,.A.RS.,0,0,4,216,anomaly-spam
3,2016-08-07 00:10:32,2.616,42.219.156.214,194.192.119.38,44877,25,TCP,.APRS.,0,0,21,7486,anomaly-spam
4,2016-08-06 22:40:05,0.364,194.192.119.34,42.219.156.215,25,35728,TCP,.AP.SF,0,0,5,564,anomaly-spam


## Convertir les colonnes dans les bons types

In [90]:
# Column dtypes as inferred by read_csv, before any conversion.
data_df.dtypes

datetime_end         object
duration            float64
source_ip            object
destination_ip       object
source_port           int64
destination_port      int64
protocol             object
flag                 object
forwarding            int64
type                  int64
packet                int64
byte                  int64
label                object
dtype: object

In [91]:
# Switch every column to the best-fitting nullable pandas dtype
# (here: object -> string, int64 -> Int64, float64 -> Float64).
data_df = data_df.convert_dtypes()

In [92]:
# Parse the timestamp strings of `datetime_end` into datetime values.
data_df["datetime_end"] = pd.to_datetime(data_df["datetime_end"])

In [93]:
# Column dtypes after the conversions above.
data_df.dtypes

datetime_end        datetime64[ns]
duration                   Float64
source_ip                   string
destination_ip              string
source_port                  Int64
destination_port             Int64
protocol                    string
flag                        string
forwarding                   Int64
type                         Int64
packet                       Int64
byte                         Int64
label                       string
dtype: object

## Statistique

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the frame's columns.
data_df.describe()

Unnamed: 0,duration,source_port,destination_port,forwarding,type,packet,byte
count,14400000.0,14400000.0,14400000.0,14400000.0,14400000.0,14400000.0,14400000.0
mean,3.44652,22883.46,18231.33,0.0,9.813184,14.64368,9800.866
std,28.33413,23744.14,22813.86,0.0,21.36845,1926.603,2518562.0
min,0.0,0.0,0.0,0.0,0.0,1.0,20.0
25%,0.0,80.0,80.0,0.0,0.0,1.0,69.0
50%,0.0,8525.0,5011.0,0.0,0.0,2.0,196.0
75%,1.036,47199.0,41618.0,0.0,0.0,6.0,683.0
max,2691.891,65535.0,65535.0,0.0,252.0,1816918.0,2658149000.0


In [7]:
# Same dtype conversion and timestamp parsing as in the "Convertir les colonnes" section,
# repeated here; presumably so the weekly grouping below gets a datetime column
# even when those earlier cells were not run in this session.
# Earlier attempt, kept for reference:
#data_df.set_index("datetime_end").groupby(pd.Grouper(freq="W-MON"))
data_df = data_df.convert_dtypes()
data_df.datetime_end = pd.to_datetime(data_df.datetime_end)

In [8]:
# Count rows per (time bin, label); bins use the 'W-MON' weekly frequency on `datetime_end`.
weekly_bins = pd.Grouper(key="datetime_end", freq="W-MON")
hist_df = (
    data_df
    .groupby([weekly_bins, "label"])
    .size()
    .reset_index(name="count")
)

In [9]:
# Bar chart of the per-bin counts, coloured by label.
bar_options = dict(
    x="datetime_end",
    y="count",
    color="label",
    title="Histogramme des labels en fonction de la date de capture",
    color_discrete_sequence=px.colors.qualitative.G10_r,
)
fig = px.bar(hist_df, **bar_options)
# Centre the title horizontally.
fig.update_layout(title_x=0.5)
fig.show()
# Also save the figure as an SVG file in the current working directory.
fig.write_image(f"count-extract-ugr16-{LEVEL}.svg")

## Preprocess
Cette partie consiste à :
1. Normaliser/Encoder les variables
2. Diviser le jeu de données en deux : TRAIN/TEST

### One Hot Encoding
Cela concerne les variables suivantes :
* protocol : OHE
* label : OHE
* flag : split (0 ou 1)

#### Statistique

In [96]:
# Row count per distinct `protocol` value.
data_df.protocol.value_counts()

TCP     100971
UDP      42085
ICMP       857
GRE         47
ESP         33
IPIP         5
IPv6         2
Name: protocol, dtype: Int64

In [97]:
# Row count per distinct `label` value.
data_df.label.value_counts()

background         72000
anomaly-spam        9000
anomaly-sshscan     9000
anomaly-udpscan     9000
blacklist           9000
dos                 9000
nerisbotnet         9000
scan11              9000
scan44              9000
Name: label, dtype: Int64

In [98]:
# Row count per distinct `flag` string.
data_df.flag.value_counts()

.A....    44407
.AP.SF    32043
....S.    23191
.A..S.     9822
...RS.     5988
.APRS.     5409
.AP.S.     3949
.APRSF     3822
.AP...     3347
.A.R..     2538
...R..     2494
.A...F     2136
.A..SF     1417
.A.RS.     1222
.AP..F      948
......      843
.A.R.F      175
.APR..      156
.APR.F       69
.A.RSF       24
Name: flag, dtype: Int64

#### `protocol` & `label`

In [99]:
# One-hot encode the two categorical columns: `protocol` and `label` are replaced
# by `protocol_*` / `label_*` indicator columns.
# dtype=int pins the indicators to 0/1 integers. The default indicator dtype is
# version-dependent (bool in pandas >= 2.0), which would otherwise write
# True/False into the exported CSV while the flag columns hold 0/1.
data_df = pd.get_dummies(data_df, columns=["protocol", "label"], dtype=int)

In [100]:
# Names of the columns whose name starts with "label" (the one-hot label indicators).
is_label_column = data_df.columns.str.startswith("label")
LABEL_COLUMN_LIST = data_df.columns[is_label_column]
LABEL_COLUMN_LIST

Index(['label_anomaly-spam', 'label_anomaly-sshscan', 'label_anomaly-udpscan',
       'label_background', 'label_blacklist', 'label_dos', 'label_nerisbotnet',
       'label_scan11', 'label_scan44'],
      dtype='object')

In [101]:
# Names of the columns whose name starts with "protocol" (the one-hot protocol indicators).
is_protocol_column = data_df.columns.str.startswith("protocol")
PROTOCOL_COLUMN_LIST = data_df.columns[is_protocol_column]
PROTOCOL_COLUMN_LIST

Index(['protocol_ESP', 'protocol_GRE', 'protocol_ICMP', 'protocol_IPIP',
       'protocol_IPv6', 'protocol_TCP', 'protocol_UDP'],
      dtype='object')

#### `flag`

In [102]:
# Prefix and letters used to name the per-flag indicator columns.
FLAG_ROOT = "flag"
FLAG_VALUE_LIST = ["A", "P", "R", "S", "F"]
FLAG_COLUMN_LIST = [f"{FLAG_ROOT}_{flag_value}" for flag_value in FLAG_VALUE_LIST]


def split_flag(flag_str):
    """Expand a flag string such as ".AP.S." into 0/1 indicators.

    The first character is dropped, whatever it is. Each remaining character
    maps, in order, to one entry of FLAG_COLUMN_LIST: 1 when the character is
    not ".", 0 otherwise. The remainder must therefore hold exactly
    len(FLAG_COLUMN_LIST) characters to match the index.

    Returns a pandas Series of integers indexed by FLAG_COLUMN_LIST.
    """
    indicators = []
    # Skip position 0, then record whether each position carries a letter.
    for symbol in flag_str[1:]:
        indicators.append(symbol != ".")

    return pd.Series(indicators, index=FLAG_COLUMN_LIST, dtype=int)

# split_flag(".AP.S.")

In [103]:
# Vectorised split of the flag string into one 0/1 column per flag letter.
# Position 0 is skipped and positions 1..5 map to FLAG_COLUMN_LIST in order, i.e.
# the same rule `split_flag` applies row by row. Using the `.str` accessor avoids
# building one Series per row, which is prohibitively slow on large extracts.
flag_series = data_df.flag.astype("string")
expected_flag_length = len(FLAG_COLUMN_LIST) + 1
if flag_series.isna().any() or not flag_series.str.len().eq(expected_flag_length).all():
    # Fail loudly on missing or malformed flags instead of encoding them silently.
    raise ValueError(
        f"every flag must be a string of exactly {expected_flag_length} characters"
    )
flag_df = pd.DataFrame(
    {
        column: flag_series.str[position].ne(".").astype(int)
        for position, column in enumerate(FLAG_COLUMN_LIST, start=1)
    },
    index=data_df.index,
)

  0%|          | 0/144000 [00:00<?, ?it/s]

In [104]:
# Swap the raw `flag` string column for its per-flag indicator columns
# (indicators are appended after the remaining columns).
data_df = pd.concat([data_df.drop(columns=["flag"]), flag_df], axis=1)

#### Export CSV

In [105]:
# Write the encoded frame to the normalize directory; the "Split TRAIN/TEST" section reads this file back.
data_df.to_csv(f"../../data/normalize/one-hot_ugr16-{LEVEL}.csv")

In [87]:
# Show the three groups of generated column names, one group per print call.
for column_list in (LABEL_COLUMN_LIST, PROTOCOL_COLUMN_LIST, FLAG_COLUMN_LIST):
    print(column_list)

Index(['label_anomaly-spam', 'label_anomaly-sshscan', 'label_anomaly-udpscan',
       'label_background', 'label_blacklist', 'label_dos', 'label_nerisbotnet',
       'label_scan11', 'label_scan44'],
      dtype='object')
Index(['protocol_ESP', 'protocol_GRE', 'protocol_ICMP', 'protocol_IPIP',
       'protocol_IPv6', 'protocol_TCP', 'protocol_UDP'],
      dtype='object')
['flag_A', 'flag_P', 'flag_R', 'flag_S', 'flag_F']


### Split TRAIN/TEST

In [3]:
from sklearn.model_selection import train_test_split

In [9]:
# Dataset variant; interpolated into the file names of this section.
LEVEL = "small"

# One-hot label columns, listed explicitly so this section can run on its own.
LABEL_COLUMN_LIST = [
    "label_anomaly-spam",
    "label_anomaly-sshscan",
    "label_anomaly-udpscan",
    "label_background",
    "label_blacklist",
    "label_dos",
    "label_nerisbotnet",
    "label_scan11",
    "label_scan44",
]

In [10]:
# Reload the raw (non-encoded) extract; it still carries the single `label` column.
RAW_DATASET_PATH = f"../../data/extract/ugr16-{LEVEL}.csv"
data_df = pd.read_csv(RAW_DATASET_PATH, index_col=0)
data_df.head()

Unnamed: 0,datetime_end,duration,source_ip,destination_ip,source_port,destination_port,protocol,flag,forwarding,type,packet,byte,label
0,2016-08-07 21:08:25,0.484,253.139.127.229,42.219.156.215,25,48324,TCP,.AP.SF,0,0,5,564,anomaly-spam
1,2016-08-08 14:10:17,0.468,253.139.127.230,42.219.156.213,25,51871,TCP,.AP.SF,0,0,5,564,anomaly-spam
2,2016-03-24 11:35:33,0.296,42.219.156.223,108.66.255.194,40484,25,TCP,.A.RS.,0,0,4,216,anomaly-spam
3,2016-08-07 00:10:32,2.616,42.219.156.214,194.192.119.38,44877,25,TCP,.APRS.,0,0,21,7486,anomaly-spam
4,2016-08-06 22:40:05,0.364,194.192.119.34,42.219.156.215,25,35728,TCP,.AP.SF,0,0,5,564,anomaly-spam


In [11]:
# Load the one-hot encoded dataset written by the "Export CSV" step; CSV column 0 is the row index.
ONE_HOT_DATASET_PATH = f"../../data/normalize/one-hot_ugr16-{LEVEL}.csv"
one_hot_data_df = pd.read_csv(ONE_HOT_DATASET_PATH, index_col=0)
one_hot_data_df.head()

Unnamed: 0,datetime_end,duration,source_ip,destination_ip,source_port,destination_port,forwarding,type,packet,byte,...,label_blacklist,label_dos,label_nerisbotnet,label_scan11,label_scan44,flag_A,flag_P,flag_R,flag_S,flag_F
0,2016-08-07 21:08:25,0.484,253.139.127.229,42.219.156.215,25,48324,0,0,5,564,...,0,0,0,0,0,1,1,0,1,1
1,2016-08-08 14:10:17,0.468,253.139.127.230,42.219.156.213,25,51871,0,0,5,564,...,0,0,0,0,0,1,1,0,1,1
2,2016-03-24 11:35:33,0.296,42.219.156.223,108.66.255.194,40484,25,0,0,4,216,...,0,0,0,0,0,1,0,1,1,0
3,2016-08-07 00:10:32,2.616,42.219.156.214,194.192.119.38,44877,25,0,0,21,7486,...,0,0,0,0,0,1,1,1,1,0
4,2016-08-06 22:40:05,0.364,194.192.119.34,42.219.156.215,25,35728,0,0,5,564,...,0,0,0,0,0,1,1,0,1,1


In [12]:
# Stratify on the class encoded in the very frame being split. `stratify` is
# matched to rows by position, so taking the labels from a separately loaded
# file would silently mis-stratify if the two files ever differed in row order.
# idxmax(axis=1) returns, per row, the name of the one-hot label column set to 1.
stratify_labels = one_hot_data_df[LABEL_COLUMN_LIST].idxmax(axis=1)
train_df, test_df = train_test_split(
    one_hot_data_df,
    test_size=0.1,      # 10 % of the rows go to the test split
    shuffle=True,
    random_state=42,    # fixed seed for a reproducible split
    stratify=stratify_labels,
)

In [13]:
# Per-class row counts in the test split (sum of each one-hot label column).
test_df[LABEL_COLUMN_LIST].sum()

label_anomaly-spam        900
label_anomaly-sshscan     900
label_anomaly-udpscan     900
label_background         7200
label_blacklist           900
label_dos                 900
label_nerisbotnet         900
label_scan11              900
label_scan44              900
dtype: int64

In [141]:
# Persist both splits in the normalize directory, alongside the full one-hot CSV.
train_df.to_csv(f"../../data/normalize/one-hot_train-ugr16-{LEVEL}.csv")
test_df.to_csv(f"../../data/normalize/one-hot_test-ugr16-{LEVEL}.csv")

### Standardisation
Parfois, certains parlent de "normalisation". Ici, on centre (espérance = 0) et on réduit (écart-type = 1) un certain nombre de variables.

Cela concerne les variables suivantes :
* duration : Standardiser
* type : Standardiser
* packet : Standardiser
* byte : Standardiser

### Normalisation
Cela concerne les variables suivantes :
* datetime_end : jour / seconde
* source_ip : split / 255
* destination_ip : split / 255
* source_port : /65535
* destination_port : /65535

---

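* forwarding ??? — d'après le `describe()` de la section « Statistique », la colonne vaut toujours 0 (min = max = 0, écart-type = 0) ; elle n'apporte donc aucune information et est candidate à la suppression.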