In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('archive/Dataset-Unicauca-Version2-87Atts.csv')

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df.head(5)

# Explorative Data Analysis

In [3]:
names = df['ProtocolName'].value_counts()

In [4]:
# Show distribution of protocolName. I want to see what protocols are barely present in the dataset

for name, count in names.items():
    print("{:<30} {:<10}".format(name, count))
    

GOOGLE                         959110    
HTTP                           683734    
HTTP_PROXY                     623210    
SSL                            404883    
HTTP_CONNECT                   317526    
YOUTUBE                        170781    
AMAZON                         86875     
MICROSOFT                      54710     
GMAIL                          40260     
WINDOWS_UPDATE                 34471     
SKYPE                          30657     
FACEBOOK                       29033     
DROPBOX                        25102     
YAHOO                          21268     
TWITTER                        18259     
CLOUDFLARE                     14737     
MSN                            14478     
CONTENT_FLASH                  8589      
APPLE                          7615      
OFFICE_365                     5941      
WHATSAPP                       4593      
INSTAGRAM                      2415      
WIKIPEDIA                      2025      
MS_ONE_DRIVE                   174

The target labels in the dataset are very unbalanced, ranging from single digit sample sizes to almost 1M. Because of this, it may be useful to reduce the number of labels to only those with a significant number of samples, e.g. > 10000.

# Preprocessing

In [5]:
# remove rows for labels that have less than 10000 samples. Don't groupby, just filter
valid_names = names[names > 10000]

dataset = df[df['ProtocolName'].isin(valid_names.index)]

In [6]:
del df

# PCA

In [7]:
from sklearn.decomposition import PCA

pca = PCA()

X = dataset.drop(columns=['ProtocolName', 'L7Protocol'])
y = dataset['L7Protocol']

X = X.drop(columns=['Flow.ID', 'Source.IP', 'Source.Port', 'Destination.IP', 'Destination.Port', 'Timestamp', 'Label'])

In [8]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3529094 entries, 0 to 3577295
Data columns (total 78 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Protocol                     int64  
 1   Flow.Duration                int64  
 2   Total.Fwd.Packets            int64  
 3   Total.Backward.Packets       int64  
 4   Total.Length.of.Fwd.Packets  int64  
 5   Total.Length.of.Bwd.Packets  float64
 6   Fwd.Packet.Length.Max        int64  
 7   Fwd.Packet.Length.Min        int64  
 8   Fwd.Packet.Length.Mean       float64
 9   Fwd.Packet.Length.Std        float64
 10  Bwd.Packet.Length.Max        int64  
 11  Bwd.Packet.Length.Min        int64  
 12  Bwd.Packet.Length.Mean       float64
 13  Bwd.Packet.Length.Std        float64
 14  Flow.Bytes.s                 float64
 15  Flow.Packets.s               float64
 16  Flow.IAT.Mean                float64
 17  Flow.IAT.Std                 float64
 18  Flow.IAT.Max                 float64
 19  Flow.

In [9]:
del dataset

In [10]:
pca.fit(X)

In [11]:
X_pca = pca.transform(X)
X_pca_df = pd.DataFrame(X_pca)

# Training a Linear Classification Model

In [12]:
# make the dataset using the first 5 principle components.
NUM_COMPONENTS = 5

X_pca_5 = X_pca_df.iloc[:, :NUM_COMPONENTS].reset_index(drop=True)
y = y.reset_index(drop=True)

dataset = pd.concat([X_pca_5, y], axis=1, ignore_index=True)

In [15]:
dataset

Unnamed: 0,0,1,2,3,4,5
0,4.159893e+07,-2.525354e+07,-1.140621e+06,-2.507773e+06,-1.920897e+05,131
1,4.634924e+07,-1.693855e+07,-1.094968e+06,-2.501737e+06,-1.842778e+05,131
2,3.703998e+08,5.603264e+08,-5.145097e+05,-1.263781e+06,-9.228787e+04,7
3,4.047045e+07,-2.740550e+07,-1.104863e+06,-2.521422e+06,-1.854397e+05,7
4,4.038291e+07,-2.734076e+07,-1.095320e+06,-2.435154e+06,-1.697449e+05,131
...,...,...,...,...,...,...
3529089,3.594183e+07,-2.486588e+07,-4.943444e+05,-1.869375e+06,2.093018e+05,91
3529090,7.000412e+07,2.520565e+07,-1.053326e+06,-2.407583e+06,-1.808110e+05,91
3529091,3.539059e+07,-2.455580e+07,-4.880300e+05,-1.753801e+06,1.013372e+05,91
3529092,3.564846e+07,-2.470003e+07,-9.897191e+05,-1.994083e+06,9.045579e+05,91


## SVM

In [16]:
EVAL_TRAIN_RATIO = 0.2
TEST_VAL_RATIO = 0.5

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(dataset.iloc[:, :-1], dataset.iloc[:, -1], test_size=EVAL_TRAIN_RATIO, random_state=42)

# Split test into validation and test
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=TEST_VAL_RATIO, random_state=42)

In [None]:
del dataset

In [17]:
# I have not tested if this works

from sklearn import svm

classifier = svm.LinearSVC()

classifier.fit(X_train, y_train)


