In [1]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
import plotly.express as px
from sklearn.model_selection import train_test_split
# Register tqdm's progress-bar helpers on pandas objects (e.g. `progress_apply`).
tqdm.pandas()

# Dataset variant; it is interpolated into every input/output file name of this notebook.
LEVEL = "base"

%config InlineBackend.figure_formats = ['svg']

In [2]:
# Path of the ugr16 extract for the selected LEVEL.
DATASET_PATH = f"../../data/extract/ugr16-{LEVEL}.csv"
# The first CSV column holds the row index.
data_df = pd.read_csv(DATASET_PATH, index_col=0)
data_df.head()

Unnamed: 0,datetime_end,duration,source_ip,destination_ip,source_port,destination_port,protocol,flag,forwarding,type,packet,byte,label
0,2016-08-07 04:48:34,0.616,42.219.156.212,253.139.127.227,54594,25,TCP,.APRS.,0,0,6,433,anomaly-spam
1,2016-08-08 23:39:59,0.584,177.235.191.18,42.219.156.223,25,33008,TCP,.AP.SF,0,0,5,564,anomaly-spam
2,2016-08-08 19:20:10,0.572,42.219.156.213,194.192.119.230,38880,25,TCP,.APRS.,0,0,6,443,anomaly-spam
3,2016-08-07 00:11:24,0.616,42.219.156.223,253.139.127.227,55611,25,TCP,.APRS.,0,0,6,441,anomaly-spam
4,2016-08-09 04:48:51,0.72,42.219.156.223,253.136.161.201,40400,25,TCP,.APRS.,0,0,6,442,anomaly-spam


## Convertir les colonnes dans les bons types

In [3]:
# Inspect the dtypes inferred by read_csv before any conversion.
data_df.dtypes

datetime_end         object
duration            float64
source_ip            object
destination_ip       object
source_port           int64
destination_port      int64
protocol             object
flag                 object
forwarding            int64
type                  int64
packet                int64
byte                  int64
label                object
dtype: object

In [4]:
# Let pandas pick its nullable extension dtypes wherever it can
# (object -> string, int64 -> Int64, float64 -> Float64).
data_df = data_df.convert_dtypes()
# Parse the flow end timestamp into a real datetime column.
data_df["datetime_end"] = pd.to_datetime(data_df["datetime_end"])

In [5]:
# Verify the conversion: extension dtypes and a datetime64 `datetime_end`.
data_df.dtypes

datetime_end        datetime64[ns]
duration                   Float64
source_ip                   string
destination_ip              string
source_port                  Int64
destination_port             Int64
protocol                    string
flag                        string
forwarding                   Int64
type                         Int64
packet                       Int64
byte                         Int64
label                       string
dtype: object

## Statistique

In [6]:
# Summary statistics of the numeric columns of the full dataset.
data_df.describe()

Unnamed: 0,duration,source_port,destination_port,forwarding,type,packet,byte
count,1440000.0,1440000.0,1440000.0,1440000.0,1440000.0,1440000.0,1440000.0
mean,3.437614,22894.88,18223.06,0.0,9.822794,20.52746,17156.97
std,27.82284,23743.02,22813.83,0.0,21.41562,3416.196,4806331.0
min,0.0,0.0,0.0,0.0,0.0,1.0,27.0
25%,0.0,80.0,80.0,0.0,0.0,1.0,69.0
50%,0.0,8600.0,5010.0,0.0,0.0,2.0,196.0
75%,1.036,47199.0,41605.25,0.0,0.0,6.0,683.0
max,2691.891,65535.0,65535.0,0.0,252.0,1816918.0,2658149000.0


In [7]:
# Count the flows per (week, label) pair; weeks are anchored on Monday.
weekly_bins = pd.Grouper(key='datetime_end', freq='W-MON')
hist_df = (
    data_df.groupby([weekly_bins, "label"])
    .size()
    .to_frame('count')
    .reset_index()
)

In [8]:
# Bar chart of the weekly flow counts, coloured by label.
fig = px.bar(hist_df, x="datetime_end", y="count", color="label",
            title="Histogramme des labels en fonction de la date de capture",
            color_discrete_sequence=px.colors.qualitative.G10_r)
# Centre the title horizontally.
fig.update_layout(title_x=0.5)
fig.show()
# Also save the figure as an SVG file in the working directory.
# NOTE(review): static export presumably needs an image engine such as kaleido installed — confirm.
fig.write_image(f"count-extract-ugr16-{LEVEL}.svg")

## Preprocess
Cette partie consiste à :
1. Normaliser/encoder les variables
2. Diviser le jeu de données en deux : TRAIN/TEST

### One Hot Encoding
Cela concerne les variables suivantes :
* protocol : OHE
* label : OHE
* flag : découpage en six indicateurs binaires (0 ou 1), un par position de la chaîne (U, A, P, R, S, F)

#### Statistique

In [9]:
# Distribution of `protocol`: one one-hot column will be created per distinct value.
data_df.protocol.value_counts()

TCP     1010206
UDP      419961
ICMP       8829
GRE         589
ESP         322
IPIP         57
IPv6         35
SCTP          1
Name: protocol, dtype: Int64

In [10]:
# Distribution of `label`: shows the class balance ahead of the stratified split.
data_df.label.value_counts()

background         720000
anomaly-spam        90000
anomaly-sshscan     90000
anomaly-udpscan     90000
blacklist           90000
dos                 90000
nerisbotnet         90000
scan11              90000
scan44              90000
Name: label, dtype: Int64

In [11]:
# Distinct flag strings; the expansion further down treats "." as "position not set".
data_df.flag.value_counts()

.A....    444881
.AP.SF    318593
....S.    232384
.A..S.     97415
...RS.     59555
.APRS.     52810
.AP.S.     41287
.APRSF     38643
.AP...     32645
.A.R..     25652
...R..     25134
.A...F     22025
.A..SF     14280
.A.RS.     12917
.AP..F      9252
......      8062
.A.R.F      1801
.APR..      1669
.APR.F       749
.A.RSF       238
..P.S.         6
UA...F         1
UA..SF         1
Name: flag, dtype: Int64

#### `protocol` & `label`

In [12]:
# One-hot encode `protocol` and `label`: both columns are replaced by
# `protocol_*` / `label_*` indicator columns.
data_df = pd.get_dummies(data_df, columns=["protocol", "label"])

In [13]:
# Names of the one-hot columns generated from `label` (selected by their prefix).
LABEL_COLUMN_LIST = data_df.columns[data_df.columns.str.startswith("label")]
LABEL_COLUMN_LIST

Index(['label_anomaly-spam', 'label_anomaly-sshscan', 'label_anomaly-udpscan',
       'label_background', 'label_blacklist', 'label_dos', 'label_nerisbotnet',
       'label_scan11', 'label_scan44'],
      dtype='object')

In [14]:
# Names of the one-hot columns generated from `protocol` (selected by their prefix).
PROTOCOL_COLUMN_LIST = data_df.columns[data_df.columns.str.startswith("protocol")]
PROTOCOL_COLUMN_LIST

Index(['protocol_ESP', 'protocol_GRE', 'protocol_ICMP', 'protocol_IPIP',
       'protocol_IPv6', 'protocol_SCTP', 'protocol_TCP', 'protocol_UDP'],
      dtype='object')

#### `flag`

In [15]:
# One letter per character position of the flag string, in positional order.
FLAG_VALUE_LIST = ["U", "A", "P", "R", "S", "F"]
# Common prefix of the generated indicator columns.
FLAG_ROOT = "flag"
# Resulting column names: flag_U, flag_A, flag_P, flag_R, flag_S, flag_F.
FLAG_COLUMN_LIST = [f"{FLAG_ROOT}_{flag_value}" for flag_value in FLAG_VALUE_LIST]

In [16]:
# Expand the flag string into one 0/1 indicator column per character position.
# Splitting on the empty pattern yields one column per character plus an empty
# column at each end; `.iloc[:, 1:-1]` throws those two away.
flag_df = (
    data_df.flag.str.split("", expand=True)
    .iloc[:, 1:-1]
    .ne(".")  # True wherever the position holds something other than "."
    .astype(int)
)
flag_df.columns = FLAG_COLUMN_LIST
# Append the indicators and drop the original string column.
data_df = pd.concat([data_df, flag_df], axis=1).drop(columns=["flag"])
data_df

Unnamed: 0,datetime_end,duration,source_ip,destination_ip,source_port,destination_port,forwarding,type,packet,byte,...,label_dos,label_nerisbotnet,label_scan11,label_scan44,flag_U,flag_A,flag_P,flag_R,flag_S,flag_F
0,2016-08-07 04:48:34,0.616,42.219.156.212,253.139.127.227,54594,25,0,0,6,433,...,0,0,0,0,0,1,1,1,1,0
1,2016-08-08 23:39:59,0.584,177.235.191.18,42.219.156.223,25,33008,0,0,5,564,...,0,0,0,0,0,1,1,0,1,1
2,2016-08-08 19:20:10,0.572,42.219.156.213,194.192.119.230,38880,25,0,0,6,443,...,0,0,0,0,0,1,1,1,1,0
3,2016-08-07 00:11:24,0.616,42.219.156.223,253.139.127.227,55611,25,0,0,6,441,...,0,0,0,0,0,1,1,1,1,0
4,2016-08-09 04:48:51,0.72,42.219.156.223,253.136.161.201,40400,25,0,0,6,442,...,0,0,0,0,0,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1439995,2016-08-03 12:50:42,0.0,42.219.150.246,42.219.158.16,46660,22939,0,0,1,44,...,0,0,0,1,0,0,0,0,1,0
1439996,2016-08-07 20:51:26,0.0,42.219.156.30,42.219.150.247,6009,63808,0,0,1,40,...,0,0,0,1,0,1,0,1,0,0
1439997,2016-08-04 00:13:08,0.0,42.219.156.30,42.219.150.247,8888,42314,0,0,1,40,...,0,0,0,1,0,1,0,1,0,0
1439998,2016-07-28 12:31:35,0.0,42.219.156.30,42.219.150.247,52673,42464,0,0,1,40,...,0,0,0,1,0,1,0,1,0,0


### Split TRAIN/TEST

In [17]:
# Reload the untouched CSV: `label` has been replaced by one-hot columns in
# `data_df`, and the split below stratifies on the original `raw_data_df.label`.
RAW_DATASET_PATH = f"../../data/extract/ugr16-{LEVEL}.csv"
raw_data_df = pd.read_csv(RAW_DATASET_PATH, index_col=0)
raw_data_df.head()

Unnamed: 0,datetime_end,duration,source_ip,destination_ip,source_port,destination_port,protocol,flag,forwarding,type,packet,byte,label
0,2016-08-07 04:48:34,0.616,42.219.156.212,253.139.127.227,54594,25,TCP,.APRS.,0,0,6,433,anomaly-spam
1,2016-08-08 23:39:59,0.584,177.235.191.18,42.219.156.223,25,33008,TCP,.AP.SF,0,0,5,564,anomaly-spam
2,2016-08-08 19:20:10,0.572,42.219.156.213,194.192.119.230,38880,25,TCP,.APRS.,0,0,6,443,anomaly-spam
3,2016-08-07 00:11:24,0.616,42.219.156.223,253.139.127.227,55611,25,TCP,.APRS.,0,0,6,441,anomaly-spam
4,2016-08-09 04:48:51,0.72,42.219.156.223,253.136.161.201,40400,25,TCP,.APRS.,0,0,6,442,anomaly-spam


In [18]:
# 90 % / 10 % split with a fixed seed, stratified on the original label column.
# The stratification labels come from raw_data_df, so this relies on
# raw_data_df and data_df having the same rows in the same order.
train_df, test_df = train_test_split(
    data_df,
    test_size=0.1,
    shuffle=True,
    random_state=42,
    stratify=raw_data_df.label,
)

#### Statistiques

In [19]:
# Rebuild the three column groups from the column-name prefixes.
# Note: FLAG_COLUMN_LIST was a plain list when first defined; here it becomes a pandas Index.
LABEL_COLUMN_LIST = data_df.columns[data_df.columns.str.startswith("label")]
PROTOCOL_COLUMN_LIST = data_df.columns[data_df.columns.str.startswith("protocol")]
FLAG_COLUMN_LIST = data_df.columns[data_df.columns.str.startswith("flag")]
print(LABEL_COLUMN_LIST, PROTOCOL_COLUMN_LIST, FLAG_COLUMN_LIST)

Index(['label_anomaly-spam', 'label_anomaly-sshscan', 'label_anomaly-udpscan',
       'label_background', 'label_blacklist', 'label_dos', 'label_nerisbotnet',
       'label_scan11', 'label_scan44'],
      dtype='object') Index(['protocol_ESP', 'protocol_GRE', 'protocol_ICMP', 'protocol_IPIP',
       'protocol_IPv6', 'protocol_SCTP', 'protocol_TCP', 'protocol_UDP'],
      dtype='object') Index(['flag_U', 'flag_A', 'flag_P', 'flag_R', 'flag_S', 'flag_F'], dtype='object')


In [20]:
# Summary statistics of the training split, indicator columns included.
train_df.describe()

Unnamed: 0,duration,source_port,destination_port,forwarding,type,packet,byte,protocol_ESP,protocol_GRE,protocol_ICMP,...,label_dos,label_nerisbotnet,label_scan11,label_scan44,flag_U,flag_A,flag_P,flag_R,flag_S,flag_F
count,1296000.0,1296000.0,1296000.0,1296000.0,1296000.0,1296000.0,1296000.0,1296000.0,1296000.0,1296000.0,...,1296000.0,1296000.0,1296000.0,1296000.0,1296000.0,1296000.0,1296000.0,1296000.0,1296000.0,1296000.0
mean,3.430902,22886.66,18236.57,0.0,9.81911,19.80375,16395.47,0.0002121914,0.0004050926,0.006101852,...,0.0625,0.0625,0.0625,0.0625,1.54321e-06,0.7741968,0.3442948,0.1523302,0.6028804,0.2817515
std,27.8219,23741.0,22817.93,0.0,21.41329,3191.559,4495346.0,0.01456525,0.02012284,0.0778757,...,0.2420616,0.2420616,0.2420616,0.2420616,0.00124226,0.4181104,0.4751379,0.3593408,0.4893013,0.4498531
min,0.0,0.0,0.0,0.0,0.0,1.0,27.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,80.0,80.0,0.0,0.0,1.0,69.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,0.0,8572.5,5011.0,0.0,0.0,2.0,196.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
75%,1.036,47198.0,41642.0,0.0,0.0,6.0,683.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
max,2691.891,65535.0,65535.0,0.0,252.0,1791858.0,2621445000.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
# Rows per class in the training split (column sums of the one-hot label columns).
train_df[LABEL_COLUMN_LIST].sum()

label_anomaly-spam        81000
label_anomaly-sshscan     81000
label_anomaly-udpscan     81000
label_background         648000
label_blacklist           81000
label_dos                 81000
label_nerisbotnet         81000
label_scan11              81000
label_scan44              81000
dtype: int64

In [22]:
# Rows per protocol in the training split (column sums of the one-hot protocol columns).
train_df[PROTOCOL_COLUMN_LIST].sum()

protocol_ESP        275
protocol_GRE        525
protocol_ICMP      7908
protocol_IPIP        53
protocol_IPv6        31
protocol_SCTP         1
protocol_TCP     909211
protocol_UDP     377996
dtype: int64

In [23]:
# Number of training rows in which each flag indicator is set.
train_df[FLAG_COLUMN_LIST].sum()

flag_U          2
flag_A    1003359
flag_P     446206
flag_R     197420
flag_S     781333
flag_F     365150
dtype: int64

### Standardisation
Parfois, certains parlent de « Normalisation ». Ici, on centre (espérance = 0) et on réduit (écart-type = 1) un certain nombre de variables.

Cela concerne les variables suivantes :
* duration : Standardiser
* type : Standardiser
* packet : Standardiser
* byte : Standardiser

In [24]:
# Columns that get centred and scaled.
column_to_standardize = ["duration", "type", "packet", "byte"]
# Statistics are computed on the training split only; the same table is later
# passed to standardize_df for the test split.
describe_df = train_df[column_to_standardize].describe()
# Persist the statistics as a CSV file in the working directory.
describe_df.to_csv(f"describe-train-ugr16-{LEVEL}.csv")

In [25]:
def standardize_df(data_df, describe_df, column_to_standardize=["duration", "type", "packet", "byte"]) -> pd.DataFrame:
    """Centre and scale the selected columns of ``data_df`` in place.

    Each listed column becomes ``(value - mean) / std``, where the mean and the
    standard deviation are read from ``describe_df``.

    Args:
        data_df: Frame containing the columns to transform. It is modified in
            place.
        describe_df: Statistics table with ``"mean"`` and ``"std"`` rows and one
            column per standardized column (the layout returned by
            ``DataFrame.describe()``).
        column_to_standardize: Names of the columns to transform. The default
            list is only read, never mutated.

    Returns:
        The same ``data_df`` object, after the update.
    """
    means = describe_df.loc["mean", column_to_standardize]
    stds = describe_df.loc["std", column_to_standardize]
    data_df[column_to_standardize] = (data_df[column_to_standardize] - means) / stds
    return data_df

In [26]:
# standardize_df modifies train_df in place; the returned frame is only displayed.
# Re-running this cell would apply the transformation a second time.
standardize_df(train_df, describe_df)

Unnamed: 0,datetime_end,duration,source_ip,destination_ip,source_port,destination_port,forwarding,type,packet,byte,...,label_dos,label_nerisbotnet,label_scan11,label_scan44,flag_U,flag_A,flag_P,flag_R,flag_S,flag_F
364372,2016-05-29 03:01:32,-0.123317,143.72.8.137,42.219.156.231,53,58807,0,-0.458552,-0.005892,-0.003618,...,0,0,0,0,0,1,0,0,0,0
771335,2016-06-25 15:16:29,-0.061351,42.219.159.82,133.18.60.180,10021,41155,0,-0.458552,-0.003072,-0.00351,...,0,0,0,0,0,1,1,0,1,1
473445,2016-06-12 03:59:04,-0.123173,42.219.155.90,37.112.225.194,443,44719,0,1.409447,-0.005265,-0.003621,...,0,0,0,0,0,0,0,1,0,0
381856,2016-03-22 08:20:26,-0.123317,42.219.154.122,143.72.8.137,47684,53,0,-0.458552,-0.005892,-0.003629,...,0,0,0,0,0,1,0,0,0,0
987253,2016-05-31 01:00:01,-0.123317,42.219.156.198,132.5.60.62,41275,53,0,-0.458552,-0.005892,-0.003629,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
457952,2016-03-24 05:10:30,-0.00758,208.67.136.77,42.219.156.211,54774,80,0,-0.458552,-0.004325,-0.003499,...,0,0,0,0,0,1,1,0,1,1
78810,2016-08-15 19:14:35,-0.107214,194.192.119.38,42.219.156.215,25,49727,0,-0.458552,-0.004638,-0.003522,...,0,0,0,0,0,1,1,0,1,1
1023496,2016-06-24 16:34:40,-0.123317,189.82.76.9,42.219.152.14,80,49147,0,-0.458552,-0.005892,-0.003637,...,0,0,0,0,0,1,0,0,1,0
358116,2016-05-03 03:38:37,0.026062,42.219.156.211,37.147.194.64,80,50443,0,-0.458552,-0.004952,-0.003442,...,0,0,0,0,0,1,1,0,1,1


### Normalisation
Cela concerne les variables suivantes :
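* datetime_end : progression dans la journée (secondes écoulées depuis minuit / 86 400)
* source_ip : découpage sur « . », chaque partie divisée par 255
* destination_ip : découpage sur « . », chaque partie divisée par 255
* source_port : / 65535
* destination_port : / 65535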

#### Date

In [27]:
def get_total_seconds_the_day(datetime_series):
    """Return, for each timestamp, the seconds elapsed since midnight.

    Args:
        datetime_series: Series exposing the ``.dt`` datetime accessor.

    Returns:
        Series equal to ``hour * 3600 + minute * 60 + second``; only the hour,
        minute and second fields are used.
    """
    clock = datetime_series.dt
    return clock.hour * 3600 + clock.minute * 60 + clock.second

def day_progress(datetime_series):
    """Return the fraction of the day elapsed at each timestamp.

    Args:
        datetime_series: Series exposing the ``.dt`` datetime accessor.

    Returns:
        Series of floats: seconds since midnight divided by the 86 400 seconds
        of a day (0.0 at midnight, 0.5 at noon).
    """
    return get_total_seconds_the_day(datetime_series) / 86_400

In [28]:
# Keep only the time of day, as a fraction of the day; the calendar date is
# discarded together with the original column.
train_df["day_progress"] = day_progress(train_df.datetime_end)
train_df.drop(columns=["datetime_end"], inplace=True)

In [29]:
# Display the training split after the date transformation.
train_df

Unnamed: 0,duration,source_ip,destination_ip,source_port,destination_port,forwarding,type,packet,byte,protocol_ESP,...,label_nerisbotnet,label_scan11,label_scan44,flag_U,flag_A,flag_P,flag_R,flag_S,flag_F,day_progress
364372,-0.123317,143.72.8.137,42.219.156.231,53,58807,0,-0.458552,-0.005892,-0.003618,0,...,0,0,0,0,1,0,0,0,0,0.126065
771335,-0.061351,42.219.159.82,133.18.60.180,10021,41155,0,-0.458552,-0.003072,-0.00351,0,...,0,0,0,0,1,1,0,1,1,0.636447
473445,-0.123173,42.219.155.90,37.112.225.194,443,44719,0,1.409447,-0.005265,-0.003621,0,...,0,0,0,0,0,0,1,0,0,0.166019
381856,-0.123317,42.219.154.122,143.72.8.137,47684,53,0,-0.458552,-0.005892,-0.003629,0,...,0,0,0,0,1,0,0,0,0,0.347523
987253,-0.123317,42.219.156.198,132.5.60.62,41275,53,0,-0.458552,-0.005892,-0.003629,0,...,0,0,0,0,1,0,0,0,0,0.041678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
457952,-0.00758,208.67.136.77,42.219.156.211,54774,80,0,-0.458552,-0.004325,-0.003499,0,...,0,0,0,0,1,1,0,1,1,0.215625
78810,-0.107214,194.192.119.38,42.219.156.215,25,49727,0,-0.458552,-0.004638,-0.003522,0,...,0,0,0,0,1,1,0,1,1,0.801794
1023496,-0.123317,189.82.76.9,42.219.152.14,80,49147,0,-0.458552,-0.005892,-0.003637,0,...,0,0,0,0,1,0,0,1,0,0.690741
358116,0.026062,42.219.156.211,37.147.194.64,80,50443,0,-0.458552,-0.004952,-0.003442,0,...,0,0,0,0,1,1,0,1,1,0.151817


#### IP

In [30]:
def normalize_ip(data_df, column_name):
    """Replace a dotted address column by one scaled column per address part.

    Every value of ``data_df[column_name]`` is split on ``"."``; each part is
    cast to ``int`` and divided by 255. The parts are stored in new columns
    named ``{column_name}_0``, ``{column_name}_1``, ... and the original column
    is removed.

    Args:
        data_df: Frame containing the address column.
        column_name: Name of the string column to expand.

    Returns:
        A new DataFrame; ``data_df`` itself is left unchanged.
    """
    parts_df = data_df[column_name].str.split(".", expand=True).astype(int)
    parts_df.columns = [f"{column_name}_{i}" for i in parts_df.columns]
    scaled_df = parts_df / 255
    remaining_df = data_df.drop(columns=[column_name])
    return pd.concat([remaining_df, scaled_df], axis=1)

In [31]:
# Expand and scale both address columns of the training split.
for ip_column in ("source_ip", "destination_ip"):
    train_df = normalize_ip(train_df, ip_column)

In [32]:
# Display the training split after the address expansion.
train_df

Unnamed: 0,duration,source_port,destination_port,forwarding,type,packet,byte,protocol_ESP,protocol_GRE,protocol_ICMP,...,flag_F,day_progress,source_ip_0,source_ip_1,source_ip_2,source_ip_3,destination_ip_0,destination_ip_1,destination_ip_2,destination_ip_3
364372,-0.123317,53,58807,0,-0.458552,-0.005892,-0.003618,0,0,0,...,0,0.126065,0.560784,0.282353,0.031373,0.537255,0.164706,0.858824,0.611765,0.905882
771335,-0.061351,10021,41155,0,-0.458552,-0.003072,-0.00351,0,0,0,...,1,0.636447,0.164706,0.858824,0.623529,0.321569,0.521569,0.070588,0.235294,0.705882
473445,-0.123173,443,44719,0,1.409447,-0.005265,-0.003621,0,0,0,...,0,0.166019,0.164706,0.858824,0.607843,0.352941,0.145098,0.439216,0.882353,0.760784
381856,-0.123317,47684,53,0,-0.458552,-0.005892,-0.003629,0,0,0,...,0,0.347523,0.164706,0.858824,0.603922,0.478431,0.560784,0.282353,0.031373,0.537255
987253,-0.123317,41275,53,0,-0.458552,-0.005892,-0.003629,0,0,0,...,0,0.041678,0.164706,0.858824,0.611765,0.776471,0.517647,0.019608,0.235294,0.243137
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
457952,-0.00758,54774,80,0,-0.458552,-0.004325,-0.003499,0,0,0,...,1,0.215625,0.815686,0.262745,0.533333,0.301961,0.164706,0.858824,0.611765,0.827451
78810,-0.107214,25,49727,0,-0.458552,-0.004638,-0.003522,0,0,0,...,1,0.801794,0.760784,0.752941,0.466667,0.149020,0.164706,0.858824,0.611765,0.843137
1023496,-0.123317,80,49147,0,-0.458552,-0.005892,-0.003637,0,0,0,...,0,0.690741,0.741176,0.321569,0.298039,0.035294,0.164706,0.858824,0.596078,0.054902
358116,0.026062,80,50443,0,-0.458552,-0.004952,-0.003442,0,0,0,...,1,0.151817,0.164706,0.858824,0.611765,0.827451,0.145098,0.576471,0.760784,0.250980


#### Port

In [33]:
# Scale factor for the port columns; the same constant is reused for the test split.
MAX_PORT_VALUE = 65_535
port_columns = ["source_port", "destination_port"]
train_df[port_columns] = train_df[port_columns] / MAX_PORT_VALUE

### Exporter la version transformée

In [34]:
# Check the value ranges of the transformed training split before exporting it.
train_df.describe()

Unnamed: 0,duration,source_port,destination_port,forwarding,type,packet,byte,protocol_ESP,protocol_GRE,protocol_ICMP,...,flag_F,day_progress,source_ip_0,source_ip_1,source_ip_2,source_ip_3,destination_ip_0,destination_ip_1,destination_ip_2,destination_ip_3
count,1296000.0,1296000.0,1296000.0,1296000.0,1296000.0,1296000.0,1296000.0,1296000.0,1296000.0,1296000.0,...,1296000.0,1296000.0,1296000.0,1296000.0,1296000.0,1296000.0,1296000.0,1296000.0,1296000.0,1296000.0
mean,-4.261063e-17,0.349228,0.2782722,0.0,-2.6066940000000003e-17,-5.948601999999999e-19,7.045119e-19,0.0002121914,0.0004050926,0.006101852,...,0.2817515,0.4937189,0.3597511,0.6808599,0.5114209,0.5878986,0.3120764,0.6919081,0.5379622,0.5138796
std,1.0,0.3622644,0.3481793,0.0,1.0,1.0,1.0,0.01456525,0.02012284,0.0778757,...,0.4498531,0.2899832,0.2777161,0.258609,0.220343,0.3189019,0.2448845,0.26663,0.2124511,0.3074991
min,-0.1233166,0.0,0.0,0.0,-0.4585522,-0.005891713,-0.003641204,0.0,0.0,0.0,...,0.0,0.0,0.04705882,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.1233166,0.001220722,0.001220722,0.0,-0.4585522,-0.005891713,-0.003631861,0.0,0.0,0.0,...,0.0,0.2077894,0.1647059,0.545098,0.3686275,0.3215686,0.1647059,0.5372549,0.5647059,0.2705882
50%,-0.1233166,0.130808,0.07646296,0.0,-0.4585522,-0.005578386,-0.00360361,0.0,0.0,0.0,...,0.0,0.4982639,0.1647059,0.8588235,0.6,0.6666667,0.1647059,0.8588235,0.6039216,0.5333333
75%,-0.08607972,0.7201953,0.6354162,0.0,-0.4585522,-0.00432508,-0.003495275,0.0,0.0,0.0,...,1.0,0.7488889,0.5607843,0.8588235,0.6117647,0.8745098,0.3764706,0.8588235,0.6196078,0.827451
max,96.63106,1.0,1.0,0.0,11.30984,561.4305,583.1429,1.0,1.0,1.0,...,1.0,0.9999884,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [35]:
# Export the transformed training split to CSV.
train_df.to_csv(f"../../data/normalize/normalized-train-ugr16-{LEVEL}.csv")

In [36]:
# Print a simple completion marker.
print("ok.")

ok.


---

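* **TODO** : `forwarding` — que faire de cette colonne ? D'après les `describe()` ci-dessus, elle vaut toujours 0 (écart-type nul) ; à conserver ou à supprimer ?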

### Normalisation/Standardisation sur le jeu de Test

In [37]:
# Drop the references to the training frame and to the full frame; the cells
# that follow only work on test_df.
del train_df, data_df

In [38]:
# Apply the training-set mean/std (describe_df) to the test split, in place;
# no statistic is computed from the test data. The returned frame is only displayed.
standardize_df(test_df, describe_df)

Unnamed: 0,datetime_end,duration,source_ip,destination_ip,source_port,destination_port,forwarding,type,packet,byte,...,label_dos,label_nerisbotnet,label_scan11,label_scan44,flag_U,flag_A,flag_P,flag_R,flag_S,flag_F
1025746,2016-08-14 00:09:42,1.579658,42.219.158.188,71.235.12.249,80,20738,0,-0.458552,-0.004638,-0.003483,...,0,0,0,0,0,1,1,0,1,0
479251,2016-04-21 19:41:39,-0.123317,204.97.44.210,42.219.155.95,62151,53,0,-0.458552,-0.005892,-0.003633,...,0,0,0,0,0,1,0,0,0,0
1211790,2016-08-03 13:55:57,-0.123317,143.72.8.137,42.219.152.23,53,44290,0,-0.458552,-0.005892,-0.003628,...,0,1,0,0,0,1,0,0,0,0
483656,2016-06-26 17:33:05,0.240426,42.219.159.82,216.50.119.56,80,41125,0,-0.458552,-0.004952,-0.003528,...,0,0,0,0,0,1,1,0,1,1
847407,2016-03-26 08:18:17,-0.123317,242.101.129.130,42.219.148.113,44708,23,0,-0.458552,-0.005892,-0.003638,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
583516,2016-04-08 18:46:02,-0.123317,204.97.46.119,42.219.153.62,49258,53,0,-0.458552,-0.005892,-0.003633,...,0,0,0,0,0,1,0,0,0,0
977910,2016-06-08 20:08:43,0.96791,42.219.159.82,201.174.205.72,8000,49693,0,-0.458552,-0.003698,-0.003526,...,0,0,0,0,0,1,1,0,0,0
472709,2016-06-20 01:12:48,0.024193,42.219.156.211,61.150.21.99,80,43582,0,-0.458552,-0.004952,-0.003445,...,0,0,0,0,0,1,1,0,1,1
520598,2016-06-18 04:07:56,-0.123317,42.219.154.172,143.72.4.250,48821,53,0,-0.458552,-0.005892,-0.003631,...,0,0,0,0,0,1,0,0,0,0


In [39]:
# Same date handling as for the training split: keep the fraction of the day,
# then drop the original timestamp column.
test_df["day_progress"] = day_progress(test_df.datetime_end)
test_df.drop(columns=["datetime_end"], inplace=True)

In [40]:
# Expand and scale both address columns of the test split.
for ip_column in ("source_ip", "destination_ip"):
    test_df = normalize_ip(test_df, ip_column)

In [41]:
# Scale the port columns of the test split with the same constant as the training split.
test_df[["source_port", "destination_port"]] = test_df[["source_port", "destination_port"]]/MAX_PORT_VALUE

In [42]:
# Check the value ranges of the transformed test split; the standardized columns are
# only approximately centred and scaled because the training statistics were used.
test_df.describe()

Unnamed: 0,duration,source_port,destination_port,forwarding,type,packet,byte,protocol_ESP,protocol_GRE,protocol_ICMP,...,flag_F,day_progress,source_ip_0,source_ip_1,source_ip_2,source_ip_3,destination_ip_0,destination_ip_1,destination_ip_2,destination_ip_3
count,144000.0,144000.0,144000.0,144000.0,144000.0,144000.0,144000.0,144000.0,144000.0,144000.0,...,144000.0,144000.0,144000.0,144000.0,144000.0,144000.0,144000.0,144000.0,144000.0,144000.0
mean,0.002413,0.350483,0.276211,0.0,0.001721,0.002268,0.001694,0.000326,0.000444,0.006396,...,0.280785,0.495747,0.360488,0.680701,0.511704,0.586575,0.311524,0.692569,0.537794,0.51444
std,1.000337,0.362572,0.347548,0.0,1.001093,1.567562,1.559313,0.018063,0.021077,0.079718,...,0.449384,0.290349,0.27839,0.258621,0.22027,0.319031,0.244114,0.266463,0.212592,0.306972
min,-0.123317,0.0,0.0,0.0,-0.458552,-0.005892,-0.003641,0.0,0.0,0.0,...,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.123317,0.001221,0.001221,0.0,-0.458552,-0.005892,-0.003632,0.0,0.0,0.0,...,0.0,0.209965,0.164706,0.545098,0.368627,0.321569,0.164706,0.545098,0.564706,0.270588
50%,-0.123317,0.135103,0.076326,0.0,-0.458552,-0.005578,-0.003604,0.0,0.0,0.0,...,0.0,0.501985,0.164706,0.858824,0.6,0.658824,0.164706,0.858824,0.603922,0.537255
75%,-0.085648,0.721805,0.631037,0.0,-0.458552,-0.004325,-0.003495,0.0,0.0,0.0,...,1.0,0.752954,0.560784,0.858824,0.611765,0.87451,0.372549,0.858824,0.619608,0.827451
max,73.303003,0.999985,1.0,0.0,10.002242,569.282434,591.307613,1.0,1.0,1.0,...,1.0,0.999988,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [43]:
# Export the transformed test split to CSV.
test_df.to_csv(f"../../data/normalize/normalized-test-ugr16-{LEVEL}.csv")