# Packet analysis of a TCP Network

Using the SUEE1 dataset, we will analyse TCP packet data from an 'eduroam' network and build three machine learning classifiers to identify IPs running Slowloris attacks inside the network.

In [1]:
# Importing the required libraries

import os


import pandas as pd





In [2]:
# Global configuration shared by the cells below

# Directory that holds the dataset CSV files. It is a relative path, so it is
# resolved against the working directory the notebook kernel runs in.
DATASETS_PATH = "datasets/"

In [15]:
# Load the untouched CSV export of the capture; each row is one captured frame
# and the frame number is used as the row index.

original_csv_path = os.path.join(DATASETS_PATH, "original_dataset.csv")
original_df = pd.read_csv(
    original_csv_path,
    sep=",",
    index_col="frame.number",
)

In [16]:
# Report how many rows (captured frames) the dataset contains

print("Dataset number of lines: {}".format(original_df.shape[0]))

Dataset number of lines: 2089436


In [17]:
# Preview the first rows of the dataset (head() shows 5 rows by default)
# to get a feel for the available columns and the values they hold.

original_df.head()

Unnamed: 0_level_0,frame.len,frame.protocols,eth.src,eth.dst,ip.proto,ip.len,ip.src,ip.dst,ip.ttl,tcp.flags,tcp.seq_raw,tcp.seq,tcp.stream,tcp.window_size_value,tcp.time_relative,tcp.time_delta
frame.number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,66,eth:ethertype:ip:tcp,ff:ff:ff:ff:00:01,ff:ff:ff:ff:00:02,6,52,192.168.0.1,192.168.0.2,64,0x00000011,1202351615,1,0,235,0.0,0.0
2,66,eth:ethertype:ip:tcp,ff:ff:ff:ff:00:03,ff:ff:ff:ff:00:01,6,52,192.168.0.2,192.168.0.1,62,0x00000010,2705104314,1,0,245,0.038679,0.038679
3,54,eth:ethertype:ip:tcp,ff:ff:ff:ff:00:01,ff:ff:ff:ff:00:02,6,40,192.168.0.1,192.168.0.3,64,0x00000011,3098131932,1,1,237,0.0,0.0
4,54,eth:ethertype:ip:tcp,ff:ff:ff:ff:00:03,ff:ff:ff:ff:00:01,6,40,192.168.0.3,192.168.0.1,126,0x00000010,1558776769,1,1,254,0.001364,0.001364
5,66,eth:ethertype:ip:tcp,ff:ff:ff:ff:00:01,ff:ff:ff:ff:00:02,6,52,192.168.0.1,192.168.0.4,64,0x00000011,1199048573,1,2,235,0.0,0.0


In [11]:
# List every column together with the dtype pandas inferred for it.
# NOTE(review): the saved output below lists `frame.number` as a column, but the
# loading cell passes index_col="frame.number", which makes it the index (and the
# index is not reported by .dtypes). This cell's execution count is also lower than
# the loading cell's, so the output is presumably stale -- re-run to confirm.

original_df.dtypes

frame.number               int64
frame.len                  int64
frame.protocols           object
eth.src                   object
eth.dst                   object
ip.proto                   int64
ip.len                     int64
ip.src                    object
ip.dst                    object
ip.ttl                     int64
tcp.flags                 object
tcp.seq_raw                int64
tcp.seq                    int64
tcp.stream                 int64
tcp.window_size_value      int64
tcp.time_relative        float64
tcp.time_delta           float64
dtype: object

In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# quantitative packet features only.

original_df.loc[:, [
    "frame.len",
    "ip.len",
    "ip.ttl",
    "tcp.window_size_value",
    "tcp.time_relative",
    "tcp.time_delta",
]].describe()

Unnamed: 0,frame.len,ip.len,ip.ttl,tcp.window_size_value,tcp.time_relative,tcp.time_delta
count,2089436.0,2089436.0,2089436.0,2089436.0,2089436.0,2089436.0
mean,62.34917,790.1674,66.43968,10117.51,-29.29557,-0.5852291
std,9.250634,2815.735,18.59239,18626.29,370.8226,65.31709
min,54.0,40.0,37.0,0.0,-4293.922,-4293.949
25%,54.0,44.0,62.0,235.0,0.010678,7e-06
50%,66.0,52.0,64.0,305.0,0.235441,4.3e-05
75%,66.0,557.0,64.0,8192.0,1.545133,0.001274
max,537.0,65212.0,254.0,65535.0,2689.156,2684.0


In [32]:
# Count how many distinct source and destination IP addresses appear in the dataset

print(
    "IP sources: {}\n"
    "IP destinations: {}".format(
        original_df["ip.src"].nunique(),
        original_df["ip.dst"].nunique(),
    )
)

IP sources: 1859
IP destinations: 1858


## Data cleaning

In [None]:
# Discarding useless columns



