# Packet analysis of a TCP Network

Using the SUEE1 dataset, we will analyse TCP packet data from an 'eduroam' network and build three machine learning classifiers with the objective of identifying IPs running slowloris attacks inside the network.

In [1]:
# Lib importing
import os
import re

import pandas as pd





In [2]:
# Notebook-wide configuration
DATASETS_PATH = 'datasets/'  # directory the CSV files are read from and written to

In [3]:
# Load the unmodified CSV export of the dataset; frame.number becomes the row index
original_csv_path = os.path.join(DATASETS_PATH, "original_dataset.csv")
original_df = pd.read_csv(original_csv_path, sep=",", index_col="frame.number")

In [4]:
# Report how many rows the raw dataset contains
row_count = len(original_df.index)
print("Dataset number of lines: {}".format(row_count))

Dataset number of lines: 2089436


In [5]:
# Preview the first five rows of the raw dataset
original_df.head(n=5)

Unnamed: 0_level_0,frame.len,frame.protocols,eth.src,eth.dst,ip.proto,ip.len,ip.src,ip.dst,ip.ttl,tcp.flags.res,...,tcp.flags.push,tcp.flags.reset,tcp.flags.syn,tcp.flags.fin,tcp.stream,tcp.seq_raw,tcp.seq,tcp.window_size_value,tcp.time_relative,tcp.time_delta
frame.number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,66,eth:ethertype:ip:tcp,ff:ff:ff:ff:00:01,ff:ff:ff:ff:00:02,6,52,192.168.0.1,192.168.0.2,64,0,...,0,0,0,1,0,1202351615,1,235,0.0,0.0
2,66,eth:ethertype:ip:tcp,ff:ff:ff:ff:00:03,ff:ff:ff:ff:00:01,6,52,192.168.0.2,192.168.0.1,62,0,...,0,0,0,0,0,2705104314,1,245,0.038679,0.038679
3,54,eth:ethertype:ip:tcp,ff:ff:ff:ff:00:01,ff:ff:ff:ff:00:02,6,40,192.168.0.1,192.168.0.3,64,0,...,0,0,0,1,1,3098131932,1,237,0.0,0.0
4,54,eth:ethertype:ip:tcp,ff:ff:ff:ff:00:03,ff:ff:ff:ff:00:01,6,40,192.168.0.3,192.168.0.1,126,0,...,0,0,0,0,1,1558776769,1,254,0.001364,0.001364
5,66,eth:ethertype:ip:tcp,ff:ff:ff:ff:00:01,ff:ff:ff:ff:00:02,6,52,192.168.0.1,192.168.0.4,64,0,...,0,0,0,1,2,1199048573,1,235,0.0,0.0


In [6]:
# List every column of the raw dataset together with the dtype pandas inferred for it
original_df.dtypes

frame.len                  int64
frame.protocols           object
eth.src                   object
eth.dst                   object
ip.proto                   int64
ip.len                     int64
ip.src                    object
ip.dst                    object
ip.ttl                     int64
tcp.flags.res              int64
tcp.flags.ns               int64
tcp.flags.urg              int64
tcp.flags.ack              int64
tcp.flags.push             int64
tcp.flags.reset            int64
tcp.flags.syn              int64
tcp.flags.fin              int64
tcp.stream                 int64
tcp.seq_raw                int64
tcp.seq                    int64
tcp.window_size_value      int64
tcp.time_relative        float64
tcp.time_delta           float64
dtype: object

In [7]:
# Summary statistics (count, mean, std, quartiles, extremes) of the quantitative columns
quantitative_columns = [
    'frame.len',
    'ip.len',
    'ip.ttl',
    'tcp.window_size_value',
    'tcp.time_relative',
    'tcp.time_delta',
]
original_df[quantitative_columns].describe()

Unnamed: 0,frame.len,ip.len,ip.ttl,tcp.window_size_value,tcp.time_relative,tcp.time_delta
count,2089436.0,2089436.0,2089436.0,2089436.0,2089436.0,2089436.0
mean,62.34917,790.1674,66.43968,10117.51,-29.29557,-0.5852291
std,9.250634,2815.735,18.59239,18626.29,370.8226,65.31709
min,54.0,40.0,37.0,0.0,-4293.922,-4293.949
25%,54.0,44.0,62.0,235.0,0.010678,7e-06
50%,66.0,52.0,64.0,305.0,0.235441,4.3e-05
75%,66.0,557.0,64.0,8192.0,1.545133,0.001274
max,537.0,65212.0,254.0,65535.0,2689.156,2684.0


In [8]:
# Count the distinct source and destination IP addresses present in the dataset
unique_sources = original_df['ip.src'].nunique()
unique_destinations = original_df['ip.dst'].nunique()
print("IP sources: {}\n"
      "IP destinations: {}".format(unique_sources, unique_destinations))

IP sources: 1859
IP destinations: 1858


## Data cleaning

In [9]:
# Remove the columns considered unnecessary for the rest of the analysis
columns_to_discard = [
    'frame.len',        # Not needed
    'frame.protocols',  # Not needed
    'eth.src',          # Redundant w/ ip.src
    'eth.dst',          # Redundant w/ ip.dst
    'ip.proto',         # Always 6 (TCP)
    'ip.len',           # Not needed
    'ip.dst',           # Doesn't matter who's being attacked
    'tcp.seq_raw',      # Not needed
]
new_df = original_df.drop(columns=columns_to_discard)

In [10]:
# Grouping and generating aggregates
# One output row per (tcp.stream, ip.src) pair. Each column is listed exactly once:
# repeating a key inside a dict literal silently discards the earlier entry, which
# previously hid a shorter 'ip.ttl' statistics list behind a later one.
aggregations = {
    'ip.ttl': ['mean', 'median', 'max', 'min'],
    'tcp.time_relative': ['max', 'min', 'mean', 'median'],
    'tcp.flags.res': ['sum'],
    'tcp.flags.ns': ['sum'],
    'tcp.flags.urg': ['sum'],
    'tcp.flags.ack': ['sum'],
    'tcp.flags.push': ['sum'],
    'tcp.flags.reset': ['sum'],
    'tcp.flags.syn': ['sum'],
    'tcp.flags.fin': ['sum'],
    'tcp.seq': ['max'],
    'tcp.window_size_value': ['max', 'min', 'mean', 'median'],
    'tcp.time_delta': ['max', 'min', 'mean', 'median'],
}
new_df = new_df.groupby(['tcp.stream', 'ip.src']).agg(aggregations)

# Flatten the two-level (column, statistic) header into "<column>_<statistic>" names
new_df.columns = ["_".join(pair) for pair in new_df.columns.to_flat_index()]

# Flatten the (tcp.stream, ip.src) MultiIndex into "<row position>_<ip.src>" labels.
# The prefix is the running row number from enumerate(), not the tcp.stream value.
new_df.index = [f"{position}_{key[1]}" for position, key in enumerate(new_df.index.to_flat_index())]

In [11]:
# Generating labels
# A row is flagged as an attacker (1) when the IP part of its "<row>_<ip.src>" index
# lies in 10.128.0.0 - 10.128.0.255; every other row gets 0.
# The pattern is a raw string (so `\.` is a regex escape, not a string escape), accepts
# one- to three-digit last octets in 0-255, and ends with `$` because `str.match`
# only anchors at the start of the string.
ATTACKER_INDEX_PATTERN = r'.+_10\.128\.0\.(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9])$'
new_df['attacker'] = new_df.index.str.match(ATTACKER_INDEX_PATTERN).astype(int)

In [12]:
# Write the cleaned, labeled dataset to disk, then preview its first rows
labeled_csv_path = os.path.join(DATASETS_PATH, 'labeled_dataset.csv')
new_df.to_csv(labeled_csv_path, sep=',')
new_df.head()

Unnamed: 0,ip.ttl_mean,ip.ttl_median,ip.ttl_max,ip.ttl_min,tcp.time_relative_max,tcp.time_relative_min,tcp.time_relative_mean,tcp.time_relative_median,tcp.flags.res_sum,tcp.flags.ns_sum,...,tcp.seq_max,tcp.window_size_value_max,tcp.window_size_value_min,tcp.window_size_value_mean,tcp.window_size_value_median,tcp.time_delta_max,tcp.time_delta_min,tcp.time_delta_mean,tcp.time_delta_median,attacker
0_192.168.0.1,64.0,64.0,64,64,0.0,-2792.830719,-1396.41536,-1396.41536,0,0,...,2,235,0,117.5,117.5,3.4e-05,0.0,1.7e-05,1.7e-05,0
1_192.168.0.2,62.0,62.0,62,62,0.038679,-2792.830753,-1396.396037,-1396.396037,0,0,...,1,245,245,245.0,245.0,0.038679,-2792.869432,-1396.415377,-1396.415377,0
2_192.168.0.1,64.0,64.0,64,64,2.544811,0.0,1.272406,1.272406,0,0,...,2,237,237,237.0,237.0,1.3e-05,0.0,6e-06,6e-06,0
3_192.168.0.3,126.0,126.0,126,126,2.544837,0.001364,1.697,2.544798,0,0,...,2,254,0,169.333333,254.0,2.543434,2.6e-05,0.848275,0.001364,0
4_192.168.0.1,64.0,64.0,64,64,6.003576,0.0,3.001788,3.001788,0,0,...,2,235,235,235.0,235.0,2.1e-05,0.0,1e-05,1e-05,0
