In [1]:
import pandas as pd
import numpy as np

import socket, struct

In [2]:
# Path to the input CSV, relative to the notebook's working directory.
PATH_DF = '../Data/balanced_data.csv'

# Column labels handed to read_csv(names=...); they are applied positionally.
COLUMNS = [
    'pkSeqID', 'stime', 'flgs', 'proto', 'saddr', 'sport', 'daddr', 'dport',
    'pkts', 'bytes', 'state', 'ltime', 'seq', 'dur', 'mean', 'stddev',
    'smac', 'dmac', 'sum', 'min', 'max', 'soui', 'doui', 'sco', 'dco',
    'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'srate', 'drate',
    'attack', 'category', 'subcategory',
]

# Name of the label column.
TARGET = 'attack'

# Explicit dtypes for a subset of the columns; anything not listed here is
# left to pandas' own type inference.
DTYPES = dict(
    proto="string",
    saddr="string",
    sport="string",
    daddr="string",
    dport="string",
    pkts="int64",
    state="string",
    dur="float64",
    spkts="int64",
    dpkts="int64",
    sbytes="int64",
    dbytes="int64",
    srate="float64",
    drate="float64",
    attack="int64",
)

In [3]:
# Load the CSV. header=0 treats the file's first row as a header to be
# replaced by the labels in COLUMNS; DTYPES fixes the listed column types.
df = pd.read_csv(PATH_DF, names=COLUMNS, header=0, dtype=DTYPES)

In [4]:
# Display the freshly loaded frame for a visual check.
df

Unnamed: 0,pkSeqID,stime,flgs,proto,saddr,sport,daddr,dport,pkts,bytes,...,spkts,dpkts,sbytes,dbytes,rate,srate,drate,attack,category,subcategory
0,1,1.526344e+09,e,arp,3232261121,0,3232261123,0,4,240,...,2,2,120,120,0.002508,0.000836,0.000836,0,Normal,Normal
1,2,1.526344e+09,e,tcp,3232261127,139,3232261124,36390,10,680,...,5,5,350,330,0.006190,0.002751,0.002751,0,Normal,Normal
2,3,1.526344e+09,e,udp,3232261269,51838,461143546,123,2,180,...,1,1,90,90,20.590960,0.000000,0.000000,0,Normal,Normal
3,4,1.526344e+09,e,arp,3232261124,0,3232261127,0,10,510,...,5,5,210,300,0.006189,0.002751,0.002751,0,Normal,Normal
4,5,1.526344e+09,e,udp,3232261147,58999,3232261121,53,4,630,...,2,2,174,456,0.005264,0.001755,0.001755,0,Normal,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28624,62019089,1.528100e+09,e,udp,3232261268,30275,3232261123,80,7,420,...,7,0,420,0,0.547367,0.547367,0.000000,1,DDoS,UDP
28625,62019090,1.528100e+09,e,udp,3232261268,30276,3232261123,80,7,420,...,7,0,420,0,0.547367,0.547367,0.000000,1,DDoS,UDP
28626,62019091,1.528100e+09,e,udp,3232261268,30277,3232261123,80,7,420,...,7,0,420,0,0.547367,0.547367,0.000000,1,DDoS,UDP
28627,62019092,1.528100e+09,e,udp,3232261268,30278,3232261123,80,7,420,...,7,0,420,0,0.547367,0.547367,0.000000,1,DDoS,UDP


In [5]:
# Remove every column that is not used further on, in a single pass.
# The groups mirror the original one-drop-per-group layout.
df = df.drop(
    columns=[
        'category', 'subcategory',
        'sco', 'dco',
        'soui', 'doui',
        'flgs', 'stime',
        'smac', 'dmac',
        'pkSeqID',
        'srate', 'drate', 'rate',
        'sbytes', 'dbytes',
        'sum', 'max', 'min',
        'state',
    ]
)

In [6]:
def ip2long(ip):
    """Convert an IPv4 address string to its unsigned 32-bit integer value.

    The string is parsed with ``socket.inet_aton`` and the four packed bytes
    are read as one big-endian (network order) unsigned integer.

    Best-effort behaviour: any value that cannot be parsed is replaced by the
    address ``192.168.0.1``. This covers malformed strings (``OSError``),
    non-string / missing values such as ``None`` or ``pd.NA`` (``TypeError``)
    and strings with an embedded NUL (``ValueError``), so a single bad cell
    cannot abort a whole ``Series.apply``.

    Parameters
    ----------
    ip : str
        Address text accepted by ``socket.inet_aton``.

    Returns
    -------
    int
        The address as an integer in ``[0, 2**32 - 1]``.
    """
    fallback_ip = '192.168.0.1'

    try:
        packed_ip = socket.inet_aton(ip)
    except (OSError, TypeError, ValueError):
        # Unparseable or missing address: substitute the fixed fallback.
        packed_ip = socket.inet_aton(fallback_ip)

    # "!L" = network byte order, unsigned 32-bit.
    return struct.unpack("!L", packed_ip)[0]

In [7]:
def str2int(s):
    """Parse a port-like string into an integer.

    Accepts plain decimal text (``'80'``) and hexadecimal text carrying a
    ``0x`` / ``0X`` prefix (``'0x50'``).

    Parameters
    ----------
    s : str or missing value
        Text to parse. Missing values (``pd.NA``, ``None``, ``nan``) are
        accepted.

    Returns
    -------
    int or float
        The parsed integer, or ``np.nan`` when ``s`` is missing or is not a
        valid number.
    """
    # Public API check for missing values (covers pd.NA, None and nan).
    if pd.isna(s):
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan

    try:
        if s[:2].lower() == '0x':
            return int(s, base=16)
        return int(s)
    except ValueError:
        return np.nan

In [8]:
# Turn the textual port columns into numbers (unparseable entries become NaN).
df['sport'] = df['sport'].apply(str2int)
df['dport'] = df['dport'].apply(str2int)

In [9]:
# Replace the textual source/destination addresses with their integer form.
df['saddr'] = df['saddr'].apply(ip2long)
df['daddr'] = df['daddr'].apply(ip2long)

In [10]:
# Display the frame after the port and address columns were converted.
df

Unnamed: 0,proto,saddr,sport,daddr,dport,pkts,bytes,ltime,seq,dur,mean,stddev,spkts,dpkts,attack
0,arp,3232261121,0,3232261123,0,4,240,1.526345e+09,9,1195.996582,0.000006,0.000002,2,2,0
1,tcp,3232261127,139,3232261124,36390,10,680,1.526346e+09,10,1453.945923,0.000028,0.000008,5,5,0
2,udp,3232261269,51838,461143546,123,2,180,1.526344e+09,11,0.048565,0.048565,0.000000,1,1,0
3,arp,3232261124,0,3232261127,0,10,510,1.526346e+09,12,1454.080322,0.000238,0.000022,5,5,0
4,udp,3232261147,58999,3232261121,53,4,630,1.526345e+09,14,569.933960,0.098505,0.080150,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28624,udp,3232261268,30275,3232261123,80,7,420,1.528100e+09,11736,10.961563,4.646587,0.243034,7,0,1
28625,udp,3232261268,30276,3232261123,80,7,420,1.528100e+09,11737,10.961563,4.646587,0.243033,7,0,1
28626,udp,3232261268,30277,3232261123,80,7,420,1.528100e+09,11738,10.961563,4.646586,0.243033,7,0,1
28627,udp,3232261268,30278,3232261123,80,7,420,1.528100e+09,11739,10.961563,4.646587,0.243033,7,0,1


In [11]:
# One-hot encode the protocol column and keep the indicator columns aside;
# they are concatenated back onto the frame after the numeric features are
# scaled.
# dtype is pinned to 'uint8' (the default before pandas 2.0): newer pandas
# returns bool columns, which would be written to CSV as True/False instead
# of the 0/1 values shown in the outputs.
dummies_proto = pd.get_dummies(df.proto, prefix='proto', dtype='uint8')
df = df.drop(['proto'], axis=1)

In [12]:
# Display the frame now that the 'proto' column has been removed.
df

Unnamed: 0,saddr,sport,daddr,dport,pkts,bytes,ltime,seq,dur,mean,stddev,spkts,dpkts,attack
0,3232261121,0,3232261123,0,4,240,1.526345e+09,9,1195.996582,0.000006,0.000002,2,2,0
1,3232261127,139,3232261124,36390,10,680,1.526346e+09,10,1453.945923,0.000028,0.000008,5,5,0
2,3232261269,51838,461143546,123,2,180,1.526344e+09,11,0.048565,0.048565,0.000000,1,1,0
3,3232261124,0,3232261127,0,10,510,1.526346e+09,12,1454.080322,0.000238,0.000022,5,5,0
4,3232261147,58999,3232261121,53,4,630,1.526345e+09,14,569.933960,0.098505,0.080150,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28624,3232261268,30275,3232261123,80,7,420,1.528100e+09,11736,10.961563,4.646587,0.243034,7,0,1
28625,3232261268,30276,3232261123,80,7,420,1.528100e+09,11737,10.961563,4.646587,0.243033,7,0,1
28626,3232261268,30277,3232261123,80,7,420,1.528100e+09,11738,10.961563,4.646586,0.243033,7,0,1
28627,3232261268,30278,3232261123,80,7,420,1.528100e+09,11739,10.961563,4.646587,0.243033,7,0,1


In [13]:
# Separate the label Series from the feature matrix (a plain NumPy array).
y = df[TARGET]
X = df.drop(columns=[TARGET]).values

In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
# Scaler with default settings; it is fitted on X in the next cell.
scaler = StandardScaler()

In [16]:
# Fit the scaler on the whole feature matrix and overwrite X with the result.
# NOTE(review): X is both input and output, so re-running this cell on its own
# scales already-scaled data; re-run the cell that builds X first.
X = scaler.fit_transform(X)

In [17]:
# Reassemble the frame: scaled numeric features, one-hot protocol columns,
# then the label column last.
# The scaled array is given df's own index so that the column-wise concat,
# which aligns rows by index label, lines up with dummies_proto and y even
# if df's index is ever something other than the default 0..n-1 range.
df = pd.concat(
    [
        pd.DataFrame(X, columns=df.drop([TARGET], axis=1).columns, index=df.index),
        dummies_proto,
        y,
    ],
    axis=1,
)

In [18]:
# Display the reassembled frame (scaled features + protocol dummies + label).
df

Unnamed: 0,saddr,sport,daddr,dport,pkts,bytes,ltime,seq,dur,mean,...,spkts,dpkts,proto_arp,proto_icmp,proto_igmp,proto_ipv6-icmp,proto_rarp,proto_tcp,proto_udp,attack
0,0.017292,-1.925358,0.259410,-0.188136,-0.099729,-0.090844,-2.611974,-0.703900,4.932378,-1.724670,...,-0.102623,-0.065129,1,0,0,0,0,0,0,0
1,0.017292,-1.916259,0.259410,22.266787,-0.098887,-0.090778,-2.611333,-0.703891,6.033519,-1.724658,...,-0.102035,-0.064089,0,0,0,0,0,1,0,0
2,0.017295,1.467936,-3.470081,-0.112237,-0.100009,-0.090853,-2.613917,-0.703882,-0.172914,-1.698443,...,-0.102819,-0.065476,0,0,0,0,0,0,1,0
3,0.017292,-1.925358,0.259410,-0.188136,-0.098887,-0.090804,-2.611323,-0.703874,6.034093,-1.724545,...,-0.102035,-0.064089,1,0,0,0,0,0,0,0
4,0.017293,1.936692,0.259410,-0.155432,-0.099729,-0.090786,-2.612766,-0.703856,2.259827,-1.671471,...,-0.102623,-0.065129,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28624,0.017295,0.056431,0.259410,-0.138771,-0.099308,-0.090817,0.514722,-0.599336,-0.126328,0.784921,...,-0.101642,-0.065822,0,0,0,0,0,0,1,1
28625,0.017295,0.056496,0.259410,-0.138771,-0.099308,-0.090817,0.514722,-0.599327,-0.126328,0.784921,...,-0.101642,-0.065822,0,0,0,0,0,0,1,1
28626,0.017295,0.056562,0.259410,-0.138771,-0.099308,-0.090817,0.514722,-0.599318,-0.126328,0.784920,...,-0.101642,-0.065822,0,0,0,0,0,0,1,1
28627,0.017295,0.056627,0.259410,-0.138771,-0.099308,-0.090817,0.514722,-0.599309,-0.126328,0.784921,...,-0.101642,-0.065822,0,0,0,0,0,0,1,1


In [19]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [20]:
# Standardize every column of df and run a full PCA on the result.
# NOTE(review): df here still contains the one-hot columns and the TARGET
# column, so the label takes part in this decomposition.
x = StandardScaler().fit_transform(df)

pca = PCA()
x_pca = pca.fit_transform(x)

# explained_variance_ratio_ has one entry per principal component, ordered by
# decreasing variance. Each component is a combination of all input columns,
# so the values are labelled by component number rather than by column name.
for i, ratio in enumerate(pca.explained_variance_ratio_, start=1):
    print('PC{} - {:.3f}%'.format(i, ratio * 100))

Feature 'saddr' - 20.405%
Feature 'sport' - 15.028%
Feature 'daddr' - 10.986%
Feature 'dport' - 7.139%
Feature 'pkts' - 6.099%
Feature 'bytes' - 5.512%
Feature 'ltime' - 5.107%
Feature 'seq' - 4.836%
Feature 'dur' - 4.760%
Feature 'mean' - 3.784%
Feature 'stddev' - 3.395%
Feature 'spkts' - 3.072%
Feature 'dpkts' - 2.926%
Feature 'proto_arp' - 2.061%
Feature 'proto_icmp' - 1.951%
Feature 'proto_igmp' - 1.555%
Feature 'proto_ipv6-icmp' - 0.847%
Feature 'proto_rarp' - 0.427%
Feature 'proto_tcp' - 0.109%
Feature 'proto_udp' - 0.000%
Feature 'attack' - 0.000%


In [21]:
# Display df again; the PCA cell above reads df but does not reassign it.
df

Unnamed: 0,saddr,sport,daddr,dport,pkts,bytes,ltime,seq,dur,mean,...,spkts,dpkts,proto_arp,proto_icmp,proto_igmp,proto_ipv6-icmp,proto_rarp,proto_tcp,proto_udp,attack
0,0.017292,-1.925358,0.259410,-0.188136,-0.099729,-0.090844,-2.611974,-0.703900,4.932378,-1.724670,...,-0.102623,-0.065129,1,0,0,0,0,0,0,0
1,0.017292,-1.916259,0.259410,22.266787,-0.098887,-0.090778,-2.611333,-0.703891,6.033519,-1.724658,...,-0.102035,-0.064089,0,0,0,0,0,1,0,0
2,0.017295,1.467936,-3.470081,-0.112237,-0.100009,-0.090853,-2.613917,-0.703882,-0.172914,-1.698443,...,-0.102819,-0.065476,0,0,0,0,0,0,1,0
3,0.017292,-1.925358,0.259410,-0.188136,-0.098887,-0.090804,-2.611323,-0.703874,6.034093,-1.724545,...,-0.102035,-0.064089,1,0,0,0,0,0,0,0
4,0.017293,1.936692,0.259410,-0.155432,-0.099729,-0.090786,-2.612766,-0.703856,2.259827,-1.671471,...,-0.102623,-0.065129,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28624,0.017295,0.056431,0.259410,-0.138771,-0.099308,-0.090817,0.514722,-0.599336,-0.126328,0.784921,...,-0.101642,-0.065822,0,0,0,0,0,0,1,1
28625,0.017295,0.056496,0.259410,-0.138771,-0.099308,-0.090817,0.514722,-0.599327,-0.126328,0.784921,...,-0.101642,-0.065822,0,0,0,0,0,0,1,1
28626,0.017295,0.056562,0.259410,-0.138771,-0.099308,-0.090817,0.514722,-0.599318,-0.126328,0.784920,...,-0.101642,-0.065822,0,0,0,0,0,0,1,1
28627,0.017295,0.056627,0.259410,-0.138771,-0.099308,-0.090817,0.514722,-0.599309,-0.126328,0.784921,...,-0.101642,-0.065822,0,0,0,0,0,0,1,1


In [22]:
# Row counts per class: label 1 first, then label 0.
print(len(df.loc[df['attack'] == 1]))
print(len(df.loc[df['attack'] == 0]))

19086
9543


In [25]:
# Sum of the two class counts printed above (the numbers are typed in by hand,
# not computed from df).
19086+9543

28629

In [26]:
# Write the prepared frame to CSV without the index column.
# NOTE(review): absolute, machine-specific path; it is unlikely to exist on
# another machine. Consider a path relative to the notebook, as PATH_DF is.
df.to_csv('/home/maxim/IBKS/4year/ОИИ/data/prepared_full.csv', index=False)

In [23]:
from sklearn.decomposition import PCA

In [32]:
# Split the prepared frame again: label Series and NumPy feature matrix.
y = df[TARGET]
X = df.drop(columns=[TARGET]).values

In [33]:
# Project the feature matrix onto its first three principal components.
# random_state is fixed so that a randomized SVD solver, if PCA selects one,
# produces the same projection on every run.
# X is overwritten, so re-run the cell that builds X before re-running this one.
X = PCA(n_components=3, random_state=42).fit_transform(X)

In [34]:
# Attach the labels to the PCA projection. The projection is given y's index
# because the column-wise concat aligns rows by index label; this keeps each
# projected row paired with its own label whatever index y carries.
df1 = pd.concat([pd.DataFrame(X, index=y.index), y], axis=1)

In [35]:
# Shuffle the samples and save them.
# random_state pins the permutation so the written file is identical on
# every run.
# NOTE(review): absolute, machine-specific output path.
df1 = df1.sample(frac=1, random_state=42)
df1.to_csv('/home/maxim/IBKS/4year/ОИИ/data/prepared_3.csv', index=False)