In [1]:
import pandas as pd
import numpy as np

duration 	    length (number of seconds) of the connection 
protocol_type 	type of the protocol, e.g. tcp, udp, etc. 	
service 	    network service on the destination, e.g., http, telnet, etc. 	
src_bytes 	    number of data bytes from source to destination
dst_bytes 	    number of data bytes from destination to source 	
flag 	        normal or error status of the connection 	 
land 	        1 if connection is from/to the same host/port; 0 otherwise 	
wrong_fragment 	number of wrong fragments 
urgent       	number of urgent packets 	continuous
 
Table 1: Basic features of individual TCP connections.
 
hot 	            number of hot indicators
num_failed_logins 	number of failed login attempts 	
logged_in 	        1 if successfully logged in; 0 otherwise 	
num_compromised 	number of compromised conditions 	
root_shell 	        1 if root shell is obtained; 0 otherwise 	
su_attempted 	    1 if su root command attempted; 0 otherwise 
num_root 	        number of root accesses 	
num_file_creations 	number of file creation operations 	
num_shells 	        number of shell prompts 	
num_access_files 	number of operations on access control files 	
num_outbound_cmds	number of outbound commands in an ftp session 	
is_hot_login 	    1 if the login belongs to the hot list; 0 otherwise 
is_guest_login 	    1 if the login is a guest login; 0 otherwise 
  
Table 2: Content features within a connection suggested by domain knowledge.
 

count number of connections to the same host as the current connection in the past two seconds 	
Note: The following  features refer to these same-host connections.	
serror_rate 	% of connections that have SYN errors 	
rerror_rate 	% of connections that have REJ errors 	
same_srv_rate 	% of connections to the same service 	
diff_srv_rate 	% of connections to different services

srv_count 	   number of connections to the same service as the current connection in the past two seconds
Note: The following features refer to these same-service connections.	
srv_serror_rate 	% of connections that have SYN errors 
srv_rerror_rate 	% of connections that have REJ errors 
srv_diff_host_rate 	% of connections to different hosts 

In [2]:
kdd = pd.read_csv("data/kddcup_data_10_percent")
kdd

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,classification
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1,0,0.11,0.00,0.00,0.00,0.00,0,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1,0,0.05,0.00,0.00,0.00,0.00,0,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1,0,0.03,0.00,0.00,0.00,0.00,0,normal.
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1,0,0.03,0.00,0.00,0.00,0.00,0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1,0,0.02,0.00,0.00,0.00,0.00,0,normal.
5,0,tcp,http,SF,217,2032,0,0,0,0,...,59,1,0,0.02,0.00,0.00,0.00,0.00,0,normal.
6,0,tcp,http,SF,212,1940,0,0,0,0,...,69,1,0,1.00,0.04,0.00,0.00,0.00,0,normal.
7,0,tcp,http,SF,159,4087,0,0,0,0,...,79,1,0,0.09,0.04,0.00,0.00,0.00,0,normal.
8,0,tcp,http,SF,210,151,0,0,0,0,...,89,1,0,0.12,0.04,0.00,0.00,0.00,0,normal.
9,0,tcp,http,SF,212,786,0,0,0,1,...,99,1,0,0.12,0.05,0.00,0.00,0.00,0,normal.


DOS: denial-of-service, e.g. syn flood;
R2L: unauthorized access from a remote machine, e.g. guessing password;
U2R:  unauthorized access to local superuser (root) privileges, e.g., various buffer overflow attacks;
probing: surveillance and other probing, e.g., port scanning.

In [3]:
pd.unique(kdd.classification.ravel())

array(['normal.', 'buffer_overflow.', 'loadmodule.', 'perl.', 'neptune.',
       'smurf.', 'guess_passwd.', 'pod.', 'teardrop.', 'portsweep.',
       'ipsweep.', 'land.', 'ftp_write.', 'back.', 'imap.', 'satan.',
       'phf.', 'nmap.', 'multihop.', 'warezmaster.', 'warezclient.',
       'spy.', 'rootkit.'], dtype=object)

In [4]:
normal = kdd[kdd.classification == "normal."]

In [5]:
#dos
back= kdd[kdd.classification == "back."]
land = kdd[kdd.classification == "land."]
neptune = kdd[kdd.classification == "neptune."]
pod = kdd[kdd.classification == "pod."]
smurf = kdd[kdd.classification == "smurf."]
teardrop = kdd[kdd.classification == "teardrop."]

frames = [back,land,neptune,pod,smurf,teardrop]
dos = pd.concat(frames)

In [6]:
#u2r
loadmodule = kdd[kdd.classification == "loadmodule."]
buffer_overflow = kdd[kdd.classification == "buffer_overflow."]
perl = kdd[kdd.classification == "perl."]
rootkit = kdd[kdd.classification == "rootkit."]

frames = [loadmodule,buffer_overflow,perl,rootkit]
u2r = pd.concat(frames)

In [7]:
#r2l
ftp_write = kdd[kdd.classification == "ftp_write."]
guess_passwd = kdd[kdd.classification == "guess_passwd."]
imap = kdd[kdd.classification == "imap."]
multihop = kdd[kdd.classification == "multihop."]
phf = kdd[kdd.classification == "phf."]
spy = kdd[kdd.classification == "spy."]
warezclient = kdd[kdd.classification == "warezclient."]
warezmaster = kdd[kdd.classification == "warezmaster."]

frames = [ftp_write,guess_passwd,imap,multihop,phf,spy,warezclient,warezmaster]
rl2 = pd.concat(frames)

In [8]:
#probe
ipsweep = kdd[kdd.classification == "ipsweep."]
nmap = kdd[kdd.classification == "nmap."]
portsweep = kdd[kdd.classification == "portsweep."]
satan = kdd[kdd.classification == "satan."]

frames = [ipsweep,nmap,portsweep,satan]
probe = pd.concat(frames)

What do we consider items that go in the basket.
Each of our basket has 42 items but many of them are floats or ints
so they have multiple values. If we look at the min, max and mean of those
values we can break them up into distinct parts so each one will have it's own
designation in the basket. This way we are checking for absence of some attributes
as well as presence since that is also important.



In [9]:
print kdd.duration.mean()
print kdd.duration.median()
print kdd.duration.max()
print kdd.duration.min()
# 0 = "low_duration"
# 0-48 = "medium_duration"
# >48 = "high_duration"

47.9793024993
0.0
58329
0


In [10]:
print kdd.src_bytes.mean()
print kdd.src_bytes.median()
print kdd.src_bytes.max()
print kdd.src_bytes.min()
# 0-520 = low_src_bytes
# 520 - 3025 = medium_src_bytes
# >3025 = high_src_bytes

3025.61029592
520.0
693375640
0


In [11]:
print kdd.dst_bytes.mean()
print kdd.dst_bytes.median()
print kdd.dst_bytes.max()
print kdd.dst_bytes.min()
# 0 = low_dst_bytes
# 0-869 = medium_dst_bytes
# > 869 = high_dst_bytes

868.532424735
0.0
5155468
0


In [12]:
print kdd.wrong_fragment.mean()
print kdd.wrong_fragment.max()
print kdd.wrong_fragment.min()
# 0 = low_wrong_fragment
# 0-1 = medium_wrong_fragment
# >1 = high_wrong_fragment

0.00643292491615
3
0


In [13]:
print kdd.urgent.mean()
print kdd.urgent.max()
print kdd.urgent.min()
# 0 = low_urgent
# 0-1 = medium_urgent
#>1 = high_urgent

1.41694381413e-05
3
0


In [14]:
print kdd.hot.mean()
print kdd.hot.max()
print kdd.hot.min()
# 0 = low_hot
# 0-1 = medium_hot
# >1 = high_hot

0.0345187755176
30
0


In [15]:
print kdd.num_failed_logins.mean()
print kdd.num_failed_logins.max()
print kdd.num_failed_logins.min()
# 0 = low_num_failed_logins
# 0-1 = medium_num_failed_logins
# >1 = high_num_failed_logins

0.000151815408657
5
0


In [16]:
print kdd.num_compromised.mean()
print kdd.num_compromised.max()
print kdd.num_compromised.min()
# 0 = low_num_compromised
# 0-1 = medium_num_compromised
# >1 = high_num_compromised

0.010212116489
884
0


In [17]:
print kdd.root_shell.mean()
print kdd.root_shell.max()
print kdd.root_shell.min()
# 0 = low_root_shell
# >0 = high_root_shell

0.000111331299682
1
0


In [18]:
print kdd.su_attempted.mean()
print kdd.su_attempted.max()
print kdd.su_attempted.min()
#0 = low_su_attempted
#>1 = high_su_attempted

3.64356980776e-05
2
0


In [19]:
print kdd.num_root.mean()
print kdd.num_root.median()
print kdd.num_root.max()
print kdd.num_root.min()
#0 = low_num_root
# 0-1 = medium_num_root
# >1 = high_num_root

0.0113517441566
0.0
993
0


In [20]:
print kdd.num_file_creations.mean()
print kdd.num_file_creations.max()
print kdd.num_file_creations.min()

# 0 = low_file_creations
# 0-1 = medium_file_creations
#>1 = high_file_creations

0.00108294991508
28
0


In [21]:
print kdd.num_shells.mean()
print kdd.num_shells.max()
print kdd.num_shells.min()
# 0 = low_num_shells
# >1 = high_num_shells

0.000109307094233
2
0


In [22]:
print kdd.num_access_files.mean()
print kdd.num_access_files.max()
print kdd.num_access_files.min()
# 0 = low_num_access_files
#>1 = high_num_access_files

0.00100805431348
8
0


In [23]:
print kdd.num_outbound_cmds.mean()
print kdd.num_outbound_cmds.min()
print kdd.num_outbound_cmds.max()
#0 = low_outbound_cmds
#>0 = high_outbound_cmds

0.0
0
0


In [24]:
print kdd['count'].mean()
print kdd['count'].median()
print kdd['count'].min()
print kdd['count'].max()
#0-332 low_count
#332-510
#510-511

332.285690284
510.0
0
511


In [25]:
print kdd.srv_count.mean()
print kdd.srv_count.median()
print kdd.srv_count.max()
print kdd.srv_count.min()
#0-293 = low_srv_count
#293-510 = medium_srv_count
#>=510 = high_srv_count

292.906556604
510.0
511
0


In [26]:
print kdd.serror_rate.mean()
print kdd.serror_rate.median()
# 0 = low_serror_rate
# 0 - 0.17 = medium_serror_rate
# > 0.17 = high_serror_rate

0.176686659069
0.0


In [27]:
print kdd.srv_serror_rate.mean()
print kdd.srv_serror_rate.median()
# 0 = low_srv_serror_rate
# 0 - 0.17 = medium_srv_serror_rate
# > 0.17 = high_srv_serror_rate

0.176608808128
0.0


In [28]:
print kdd.rerror_rate.mean()
print kdd.rerror_rate.median()
# 0 = low rerror_rate
# 0 - 0.05 = medium_rerror_rate
#>.05 = high_rerror_rate


0.0574334087012
0.0


In [29]:
print kdd.srv_rerror_rate.mean()
print kdd.srv_rerror_rate.median()
# 0 = low rerror_rate
# 0 - 0.05 = medium_rerror_rate
#>.05 = high_rerror_rate

0.0577189431219
0.0


In [30]:
print kdd.same_srv_rate.mean()
print kdd.same_srv_rate.median()
# 0- .79 = low_same_srv_rate
#.79-1 = medium_same_srv_rate
# 1 = high_same_srv_rate

0.791547343129
1.0


In [31]:
print kdd.diff_srv_rate.mean()
print kdd.diff_srv_rate.median()
# 0 = low_diff_srv_rate
# 0-.02 = medium_diff_srv_rate
# > .02 = high_diff_srv_rate

0.0209823873884
0.0


In [32]:
print kdd.srv_diff_host_rate.mean()
print kdd.srv_diff_host_rate.median()
# 0 = low_srv_diff_host_rate
# 0-.02 = medium_srv_diff_host_rate
# > .02 = high_srv_diff_host_rate

0.0289968037796
0.0


In [33]:
print kdd.dst_host_count.mean()
print kdd.dst_host_count.median()
print kdd.dst_host_count.max()
print kdd.dst_host_count.min()
#255 = high_dst_host_count
#255-232 = medium_dst_host_count
# 0-232 = low_dst_host_count

232.470777558
255.0
255
0


In [34]:
print kdd.dst_host_srv_count.mean()
print kdd.dst_host_srv_count.median()
print kdd.dst_host_srv_count.min()
print kdd.dst_host_srv_count.max()
#255 = high_dst_host_srv_count
#255-188 = medium_dst_host_srv_count
# 0-188 = low_dst_host_srv_count

188.665670083
255.0
0
255


In [35]:
print kdd.dst_host_same_srv_rate.mean()
print kdd.dst_host_same_srv_rate.median()
# 0-.75 low_dst_host_same_srv_rate
# .75 -1 high_dst_host_sam_srv_rate

0.753779697624
1.0


In [36]:
print kdd.dst_host_diff_srv_rate.mean()
print kdd.dst_host_diff_srv_rate.median()
# 0 low_dst_host_diff_srv_rate
# >0 high_dst_host_diff_srv_rate

0.030905730728
0.0


In [37]:
print kdd.dst_host_same_src_port_rate.mean()
print kdd.dst_host_same_src_port_rate.median()
#<.6 low_dst_host_same_src_port_rate
#>=.6 high_dst_host_same_src_port_rate

0.60193475581
1.0


In [38]:
print kdd.dst_host_srv_diff_host_rate.mean()
print kdd.dst_host_srv_diff_host_rate.median()
# <.01 low_dst_host_srv_diff_host_rate
# >.01 high_dst_host_srv_diff_host_rate

0.00668350130865
0.0


In [39]:
print kdd.dst_host_serror_rate.mean()
print kdd.dst_host_serror_rate.median()
# <.17 low_dst_host_serror_rate
# >.17 high_dst_host_serror_rate

0.1767539639
0.0


In [40]:
print kdd.dst_host_srv_serror_rate.mean()
print kdd.dst_host_srv_serror_rate.median()
# <.17 low_dst_host_srv_serror_rate
#>.17 high_dst_host_srv_serror_rate

0.17644262086
0.0


In [41]:
print kdd.dst_host_rerror_rate.mean()
print kdd.dst_host_rerror_rate.median()
# <.05 low_dst_host_rerror_rate
# >.05 high_dst_host_rerror_rate

0.058117610385
0.0


In [42]:
print kdd.dst_host_srv_rerror_rate.mean()
print kdd.dst_host_srv_rerror_rate.median()
#<.05 low_dst_host_srv_rerror_rate
#>.05 high_dst_host_srv_rerror_rate

0.0574116687347
0.0


The following are the symbolic elements of the signature
Each of these will have it's own presence in the bucket

In [43]:
pd.unique(kdd.protocol_type.ravel())

array(['tcp', 'udp', 'icmp'], dtype=object)

In [44]:
pd.unique(kdd.service.ravel())

array(['http', 'smtp', 'finger', 'domain_u', 'auth', 'telnet', 'ftp',
       'eco_i', 'ntp_u', 'ecr_i', 'other', 'private', 'pop_3', 'ftp_data',
       'rje', 'time', 'mtp', 'link', 'remote_job', 'gopher', 'ssh', 'name',
       'whois', 'domain', 'login', 'imap4', 'daytime', 'ctf', 'nntp',
       'shell', 'IRC', 'nnsp', 'http_443', 'exec', 'printer', 'efs',
       'courier', 'uucp', 'klogin', 'kshell', 'echo', 'discard', 'systat',
       'supdup', 'iso_tsap', 'hostnames', 'csnet_ns', 'pop_2', 'sunrpc',
       'uucp_path', 'netbios_ns', 'netbios_ssn', 'netbios_dgm', 'sql_net',
       'vmnet', 'bgp', 'Z39_50', 'ldap', 'netstat', 'urh_i', 'X11',
       'urp_i', 'pm_dump', 'tftp_u', 'tim_i', 'red_i'], dtype=object)

In [45]:
pd.unique(kdd.flag.ravel())

array(['SF', 'S1', 'REJ', 'S2', 'S0', 'S3', 'RSTO', 'RSTR', 'RSTOS0',
       'OTH', 'SH'], dtype=object)

In [46]:
pd.unique(kdd.land.ravel())
# no_land
# yes_land

array([0, 1])

In [47]:
pd.unique(kdd.logged_in.ravel())
#no_logged_in
#yes_logged_in

array([1, 0])

In [48]:
pd.unique(kdd.is_host_login.ravel())
#no_is_host_login
#yes_is_host_login

array([0])

In [49]:
pd.unique(kdd.is_guest_login.ravel())
#no_is_guest_login
#yes_is_guest_login

array([0, 1])

In [50]:
normal_basket_data = normal.values
dos_basket_data = dos.values
u2r_basket_data = u2r.values
rl2_basket_data = rl2.values
probe_basket_data = probe.values

Take the array and make it and transform all of its elements into identifiers to be used in the Apriori Association algorithm.

In [52]:
def makeBaskets(array):
    tempArray = []
    for x in range(0, len(array)):
        tempArray.append([])
        # 0 = "low_duration"
        # 0-48 = "medium_duration"
        # >48 = "high_duration"
        if array[x][0]== 0:
            tempArray[x].append("low_duration")
        elif array[x][0] > 0 and array[x][0]<= 100:
            tempArray[x].append("medium_duration")
        elif array[x][0] > 100:
            tempArray[x].append("high_duration")
            
        # 0-520 = low_src_bytes
        # 520 - 3025 = medium_src_bytes
        # >3025 = high_src_bytes
        if array[x][4] <=520:
            tempArray[x].append("low_src_bytes")
        elif array[x][4] > 520 and array[x][4] <= 5000:
            tempArray[x].append("medium_src_bytes")
        elif array[x][4] > 5000:
            tempArray[x].append("high_src_bytes")
            
        # 0 = low_dst_bytes
        # 0-869 = medium_dst_bytes
        # > 869 = high_dst_bytes
        if array[x][5] == 0:
            tempArray[x].append("low_dst_bytes")
        elif array[x][5]> 0 & array[x][5]<= 1000:
            tempArray[x].append("medium_dst_bytes")
        elif array[x][5]>1000:
            tempArray[x].append("high_dst_bytes")
            
        # yes_land or no_land
        if array[x][6] == 0:
            tempArray[x].append("no_land")
        elif array[x][6] == 1:
            tempArray[x].append("yes_land")
            
        # 0 = low_wrong_fragment
        # 0-1 = medium_wrong_fragment
        # >1 = high_wrong_fragment
        if array[x][7] > 0:
            tempArray[x].append("wrong_fragment")
            
        # 0 = low_urgent
        #>0 = high_urgent
        
        if array[x][8] > 0:
            tempArray[x].append("urgent")
            
        # 0 = low_hot
        # 0-1 = medium_hot
        # >0 = high_hot
        if array[x][9] >0:
            tempArray[x].append("hot")
            
        # 0 = low_num_failed_logins
        # 0-1 = medium_num_failed_logins
        # >1 = high_num_failed_logins
        if array[x][10] > 0:
            tempArray[x].append("failed_logins")
    
            
        #no_logged_in
        #yes_logged_in
        if array[x][11] == 0:
            tempArray[x].append("no_logged_in")
        elif array[x][11] == 1:
            tempArray[x].append("yes_logged_in")
            
        # 0 = low_num_compromised
        # >0 = high_num_compromised
        if array[x][12] > 0:
            tempArray[x].append("num_compromised")

        # 0 = low_root_shell
        # >0 = high_root_shell
        if array[x][13] >0:
            tempArray[x].append("root_shell")
            
        #0 = low_su_attempted
        #>1 = high_su_attempted
        if array[x][14] >0:
            tempArray[x].append("su_attempted")
            
        #0 = low_num_root
        # >0 = high_num_root
        if array[x][15] >0:
            tempArray[x].append("high_num_root")
            
        # 0 = low_file_creations
        #>1 = high_file_creations
        if array[x][16] >0:
            tempArray[x].append("file_creations")
            
        # 0 = low_num_shells
        # >1 = high_num_shells
        if array[x][17] >0:
            tempArray[x].append("high_num_shells")
            
        # 0 = low_num_access_files
        #>1 = high_num_access_files
        if array[x][18] >0:
            tempArray[x].append("high_num_access_files")
            
        # 0 = low_num_outbound_cmds
        #>1 = high_num_outbound_cmds
        if array[x][19] >0:
            tempArray[x].append("high_num_outbound_cmds")
             
        #'is_hot_login',
        if array[x][20] == 0:
            tempArray[x].append("no_hot_login")
        elif array[x][20] ==1:
            tempArray[x].append("yes_hot_login")
 
        #'is_guest_login',
        if array[x][21] == 0:
            tempArray[x].append("no_guest_login")
        elif array[x][21] ==1:
            tempArray[x].append("yes_guest_login")
            
        #0-332 low_count
        #332-510 medium_count
        #510-511 high_count
        if array[x][22] < 332:
            tempArray[x].append("low_count")
        elif array[x][22]>= 332:
            tempArray[x].append("high_count")
            
        #0-293 = low_srv_count
        #293-510 = medium_srv_count
        #>=510 = high_srv_count
        if array[x][23] < 292:
            tempArray[x].append("low_srv_count")
        elif array[x][23]>= 292:
            tempArray[x].append("high_srv_count")

            
        # 0 = low_serror_rate
        # 0 - 0.17 = medium_serror_rate
        # > 0.17 = high_serror_rate
        
        if array[x][24] >0:
            tempArray[x].append("high_serror_rate")
            
        # 0 = low_srv_serror_rate
        # 0 - 0.17 = medium_srv_serror_rate
        # > 0.17 = high_srv_serror_rate
        if array[x][25] >0:
            tempArray[x].append("high_srv_serror_rate")
            
        # 0 = low rerror_rate
        # 0 - 0.05 = medium_rerror_rate
        #>.05 = high_rerror_rate
        if array[x][26] >0:
            tempArray[x].append("high_rerror_rate")
            
        # 0 = low_srv_rerror_rate
        # 0 - 0.05 = medium_srv_rerror_rate
        #>.05 = high_srv_rerror_rate
        if array[x][27] >0:
            tempArray[x].append("high_srv_rerror_rate")

        # 0- .79 = low_same_srv_rate
        #.79-1 = medium_same_srv_rate
        # 1 = high_same_srv_rate
        if array[x][28] <1:
            tempArray[x].append("low_same_srv_rate")
            
        # 0 = low_diff_srv_rate
        # 0-.02 = medium_diff_srv_rate
        # > .02 = high_diff_srv_rate
        if array[x][29] >0:
            tempArray[x].append("high_diff_srv_rate")

            
        # 0 = low_srv_diff_host_rate
        # 0-.02 = medium_srv_diff_host_rate
        # > .02 = high_srv_diff_host_rate
        if array[x][30] >0:
            tempArray[x].append("high_srv_diff_host_rate")
    
        #255 = high_dst_host_count
        #255-232 = medium_dst_host_count
        # 0-232 = low_dst_host_count
        if array[x][31] <232:
            tempArray[x].append("low_dst_host_count")
        elif array[x][31] >=232: 
            tempArray[x].append("high_dst_host_count")
    
        #255 = high_dst_host_srv_count
        #255-188 = medium_dst_host_srv_count
        # 0-188 = low_dst_host_srv_count
        if array[x][32] <188:
            tempArray[x].append("low_dst_host_srv_count")
        elif array[x][32] >=188:
            tempArray[x].append("high_dst_host_srv_count")
        
        
        # 0-.75 low_dst_host_same_srv_rate
        # .75 -1 high_dst_host_sam_srv_rate
        
        if array[x][33]<.75:
            tempArray[x].append("low_dst_host_same_srv_rate")
        elif array[x][33] >= .75:
            tempArray[x].append("high_dst_host_sam_srv_rate")
            
        # 0 low_dst_host_diff_srv_rate
        # >0 high_dst_host_diff_srv_rate
        if array[x][34]>0:
            tempArray[x].append("high_dst_host_diff_srv_rate")
    
        #<.6 low_dst_host_same_src_port_rate
        #>=.6 high_dst_host_same_src_port_rate
        if array[x][35]<.6:
            tempArray[x].append("low_dst_host_same_src_port_rate")
        elif array[x][35] >=.6:
            tempArray[x].append("high_dst_host_same_src_port_rate")
        
        # <.01 low_dst_host_srv_diff_host_rate
        # >.01 high_dst_host_srv_diff_host_rate
        if array[x][36]>0:
            tempArray[x].append("high_dst_host_srv_diff_host_rate")

            
        # <.17 low_dst_host_serror_rate
        # >.17 high_dst_host_serror_rate
        if array[x][37]<.17:
            tempArray[x].append("low_dst_host_serror_rate")
        elif array[x][37] >=.17:
            tempArray[x].append("high_dst_host_serror_rate")
            
        # <.17 low_dst_host_srv_serror_rate
        #>.17 high_dst_host_srv_serror_rate
        if array[x][38]<.17:
            tempArray[x].append("low_dst_host_srv_serror_rate")
        elif array[x][38] >=.17:
            tempArray[x].append("high_dst_host_srv_serror_rate")
        
        # <.05 low_dst_host_rerror_rate
        # >.05 high_dst_host_rerror_rate
        if array[x][39]>0:
            tempArray[x].append("low_dst_host_srv_serror_rate")
            
        #<.05 low_dst_host_srv_rerror_rate
        #>.05 high_dst_host_srv_rerror_rate
        if array[x][40]>0:
            tempArray[x].append("low_dst_host_srv_serror_rate")
    return tempArray

        

Apply the transformation to have all of the proper characteristics in the basket

In [53]:
normal_baskets = makeBaskets(normal_basket_data)
u2r_baskets = makeBaskets(u2r_basket_data)
dos_baskets = makeBaskets(dos_basket_data)
rl2_baskets = makeBaskets(rl2_basket_data)
probe_baskets = makeBaskets(probe_basket_data)

The following is the apriori with association algorithm taken from 
https://github.com/asaini/Apriori

In [54]:
def load_dataset():
    "Load the sample dataset."
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]


def createC1(dataset):
    "Create a list of candidate item sets of size one."
    c1 = []
    for transaction in dataset:
        for item in transaction:
            if not [item] in c1:
                c1.append([item])
    c1.sort()
    #frozenset because it will be a ket of a dictionary.
    return map(frozenset, c1)


def scanD(dataset, candidates, min_support):
    "Returns all candidates that meets a minimum support level"
    sscnt = {}
    for tid in dataset:
        for can in candidates:
            if can.issubset(tid):
                sscnt.setdefault(can, 0)
                sscnt[can] += 1

    num_items = float(len(dataset))
    retlist = []
    support_data = {}
    for key in sscnt:
        support = sscnt[key] / num_items
        if support >= min_support:
            retlist.insert(0, key)
        support_data[key] = support
    return retlist, support_data


def aprioriGen(freq_sets, k):
    "Generate the joint transactions from candidate sets"
    retList = []
    lenLk = len(freq_sets)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            L1 = list(freq_sets[i])[:k - 2]
            L2 = list(freq_sets[j])[:k - 2]
            L1.sort()
            L2.sort()
            if L1 == L2:
                retList.append(freq_sets[i] | freq_sets[j])
    return retList


def apriori(dataset, minsupport=0.5):
    "Generate a list of candidate item sets"
    C1 = createC1(dataset)
    D = map(set, dataset)
    L1, support_data = scanD(D, C1, minsupport)
    L = [L1]
    k = 2
    while (len(L[k - 2]) > 0):
        Ck = aprioriGen(L[k - 2], k)
        Lk, supK = scanD(D, Ck, minsupport)
        support_data.update(supK)
        L.append(Lk)
        k += 1

    return L, support_data

In [55]:
C1 = createC1(u2r_baskets)
C1

[frozenset({'failed_logins'}),
 frozenset({'file_creations'}),
 frozenset({'high_diff_srv_rate'}),
 frozenset({'high_dst_host_count'}),
 frozenset({'high_dst_host_diff_srv_rate'}),
 frozenset({'high_dst_host_sam_srv_rate'}),
 frozenset({'high_dst_host_same_src_port_rate'}),
 frozenset({'high_dst_host_srv_diff_host_rate'}),
 frozenset({'high_dst_host_srv_serror_rate'}),
 frozenset({'high_duration'}),
 frozenset({'high_num_access_files'}),
 frozenset({'high_num_root'}),
 frozenset({'high_num_shells'}),
 frozenset({'high_rerror_rate'}),
 frozenset({'high_serror_rate'}),
 frozenset({'high_src_bytes'}),
 frozenset({'high_srv_rerror_rate'}),
 frozenset({'hot'}),
 frozenset({'low_count'}),
 frozenset({'low_dst_bytes'}),
 frozenset({'low_dst_host_count'}),
 frozenset({'low_dst_host_same_src_port_rate'}),
 frozenset({'low_dst_host_same_srv_rate'}),
 frozenset({'low_dst_host_serror_rate'}),
 frozenset({'low_dst_host_srv_count'}),
 frozenset({'low_dst_host_srv_serror_rate'}),
 frozenset({'low_dur

In [56]:
C2 = createC1(normal_baskets)
C2

[frozenset({'failed_logins'}),
 frozenset({'file_creations'}),
 frozenset({'high_count'}),
 frozenset({'high_diff_srv_rate'}),
 frozenset({'high_dst_host_count'}),
 frozenset({'high_dst_host_diff_srv_rate'}),
 frozenset({'high_dst_host_sam_srv_rate'}),
 frozenset({'high_dst_host_same_src_port_rate'}),
 frozenset({'high_dst_host_serror_rate'}),
 frozenset({'high_dst_host_srv_count'}),
 frozenset({'high_dst_host_srv_diff_host_rate'}),
 frozenset({'high_dst_host_srv_serror_rate'}),
 frozenset({'high_duration'}),
 frozenset({'high_num_access_files'}),
 frozenset({'high_num_root'}),
 frozenset({'high_num_shells'}),
 frozenset({'high_rerror_rate'}),
 frozenset({'high_serror_rate'}),
 frozenset({'high_src_bytes'}),
 frozenset({'high_srv_count'}),
 frozenset({'high_srv_diff_host_rate'}),
 frozenset({'high_srv_rerror_rate'}),
 frozenset({'high_srv_serror_rate'}),
 frozenset({'hot'}),
 frozenset({'low_count'}),
 frozenset({'low_dst_bytes'}),
 frozenset({'low_dst_host_count'}),
 frozenset({'low_d

In [57]:
C3 = createC1(dos_baskets)
C3

[frozenset({'high_count'}),
 frozenset({'high_diff_srv_rate'}),
 frozenset({'high_dst_host_count'}),
 frozenset({'high_dst_host_diff_srv_rate'}),
 frozenset({'high_dst_host_sam_srv_rate'}),
 frozenset({'high_dst_host_same_src_port_rate'}),
 frozenset({'high_dst_host_serror_rate'}),
 frozenset({'high_dst_host_srv_count'}),
 frozenset({'high_dst_host_srv_diff_host_rate'}),
 frozenset({'high_dst_host_srv_serror_rate'}),
 frozenset({'high_rerror_rate'}),
 frozenset({'high_serror_rate'}),
 frozenset({'high_src_bytes'}),
 frozenset({'high_srv_count'}),
 frozenset({'high_srv_diff_host_rate'}),
 frozenset({'high_srv_rerror_rate'}),
 frozenset({'high_srv_serror_rate'}),
 frozenset({'hot'}),
 frozenset({'low_count'}),
 frozenset({'low_dst_bytes'}),
 frozenset({'low_dst_host_count'}),
 frozenset({'low_dst_host_same_src_port_rate'}),
 frozenset({'low_dst_host_same_srv_rate'}),
 frozenset({'low_dst_host_serror_rate'}),
 frozenset({'low_dst_host_srv_count'}),
 frozenset({'low_dst_host_srv_serror_rat

In [58]:
C4 = createC1(rl2_baskets)
C4

[frozenset({'failed_logins'}),
 frozenset({'file_creations'}),
 frozenset({'high_diff_srv_rate'}),
 frozenset({'high_dst_host_count'}),
 frozenset({'high_dst_host_diff_srv_rate'}),
 frozenset({'high_dst_host_sam_srv_rate'}),
 frozenset({'high_dst_host_same_src_port_rate'}),
 frozenset({'high_dst_host_serror_rate'}),
 frozenset({'high_dst_host_srv_count'}),
 frozenset({'high_dst_host_srv_diff_host_rate'}),
 frozenset({'high_dst_host_srv_serror_rate'}),
 frozenset({'high_duration'}),
 frozenset({'high_num_access_files'}),
 frozenset({'high_num_root'}),
 frozenset({'high_num_shells'}),
 frozenset({'high_rerror_rate'}),
 frozenset({'high_serror_rate'}),
 frozenset({'high_src_bytes'}),
 frozenset({'high_srv_diff_host_rate'}),
 frozenset({'high_srv_rerror_rate'}),
 frozenset({'high_srv_serror_rate'}),
 frozenset({'hot'}),
 frozenset({'low_count'}),
 frozenset({'low_dst_bytes'}),
 frozenset({'low_dst_host_count'}),
 frozenset({'low_dst_host_same_src_port_rate'}),
 frozenset({'low_dst_host_sam

In [59]:
C5 = createC1(probe_baskets)
C5

[frozenset({'high_count'}),
 frozenset({'high_diff_srv_rate'}),
 frozenset({'high_dst_host_count'}),
 frozenset({'high_dst_host_diff_srv_rate'}),
 frozenset({'high_dst_host_sam_srv_rate'}),
 frozenset({'high_dst_host_same_src_port_rate'}),
 frozenset({'high_dst_host_serror_rate'}),
 frozenset({'high_dst_host_srv_count'}),
 frozenset({'high_dst_host_srv_diff_host_rate'}),
 frozenset({'high_dst_host_srv_serror_rate'}),
 frozenset({'high_duration'}),
 frozenset({'high_rerror_rate'}),
 frozenset({'high_serror_rate'}),
 frozenset({'high_src_bytes'}),
 frozenset({'high_srv_diff_host_rate'}),
 frozenset({'high_srv_rerror_rate'}),
 frozenset({'high_srv_serror_rate'}),
 frozenset({'hot'}),
 frozenset({'low_count'}),
 frozenset({'low_dst_bytes'}),
 frozenset({'low_dst_host_count'}),
 frozenset({'low_dst_host_same_src_port_rate'}),
 frozenset({'low_dst_host_same_srv_rate'}),
 frozenset({'low_dst_host_serror_rate'}),
 frozenset({'low_dst_host_srv_count'}),
 frozenset({'low_dst_host_srv_serror_rate

In [60]:
D1 = map(set, u2r_baskets)

In [61]:
D2 = map(set, normal_baskets)

In [62]:
D3 = map(set, dos_baskets)

In [63]:
D4 = map(set, rl2_baskets)

In [64]:
D5 = map(set, probe_baskets)

In [65]:
L1, support_data1 = scanD(D1, C1, .5)

In [66]:
#u2r
L1

[frozenset({'low_dst_host_same_src_port_rate'}),
 frozenset({'low_src_bytes'}),
 frozenset({'yes_logged_in'}),
 frozenset({'no_guest_login'}),
 frozenset({'low_dst_host_srv_serror_rate'}),
 frozenset({'low_dst_host_serror_rate'}),
 frozenset({'no_land'}),
 frozenset({'low_dst_host_count'}),
 frozenset({'low_count'}),
 frozenset({'medium_dst_bytes'}),
 frozenset({'root_shell'}),
 frozenset({'low_srv_count'}),
 frozenset({'no_hot_login'}),
 frozenset({'low_dst_host_srv_count'}),
 frozenset({'hot'}),
 frozenset({'high_dst_host_sam_srv_rate'})]

In [67]:
L2, support_data2 = scanD(D2, C2, .5)

In [68]:
#normal
L2

[frozenset({'low_dst_host_same_src_port_rate'}),
 frozenset({'low_duration'}),
 frozenset({'high_dst_host_srv_diff_host_rate'}),
 frozenset({'low_src_bytes'}),
 frozenset({'yes_logged_in'}),
 frozenset({'no_guest_login'}),
 frozenset({'low_dst_host_srv_serror_rate'}),
 frozenset({'low_dst_host_serror_rate'}),
 frozenset({'no_land'}),
 frozenset({'low_dst_host_count'}),
 frozenset({'low_count'}),
 frozenset({'medium_dst_bytes'}),
 frozenset({'high_dst_host_srv_count'}),
 frozenset({'low_srv_count'}),
 frozenset({'no_hot_login'}),
 frozenset({'high_dst_host_sam_srv_rate'})]

In [69]:
L3, support_data3 = scanD(D3, C3, .5)

In [70]:
#dos
L3

[frozenset({'low_duration'}),
 frozenset({'no_guest_login'}),
 frozenset({'low_dst_host_srv_serror_rate'}),
 frozenset({'low_dst_host_serror_rate'}),
 frozenset({'no_logged_in'}),
 frozenset({'no_land'}),
 frozenset({'medium_src_bytes'}),
 frozenset({'high_dst_host_same_src_port_rate'}),
 frozenset({'high_dst_host_srv_count'}),
 frozenset({'low_dst_bytes'}),
 frozenset({'high_dst_host_count'}),
 frozenset({'high_srv_count'}),
 frozenset({'no_hot_login'}),
 frozenset({'high_count'}),
 frozenset({'high_dst_host_sam_srv_rate'})]

In [71]:
L4, support_data4 = scanD(D4, C4, .5)

In [72]:
#rl2
L4

[frozenset({'high_dst_host_srv_diff_host_rate'}),
 frozenset({'low_src_bytes'}),
 frozenset({'yes_logged_in'}),
 frozenset({'no_guest_login'}),
 frozenset({'low_duration'}),
 frozenset({'low_dst_host_srv_serror_rate'}),
 frozenset({'low_dst_host_serror_rate'}),
 frozenset({'no_land'}),
 frozenset({'low_dst_host_count'}),
 frozenset({'low_count'}),
 frozenset({'high_dst_host_same_src_port_rate'}),
 frozenset({'low_dst_bytes'}),
 frozenset({'low_srv_count'}),
 frozenset({'no_hot_login'}),
 frozenset({'low_dst_host_srv_count'}),
 frozenset({'high_dst_host_sam_srv_rate'})]

In [73]:
L5, support_data5 = scanD(D5, C5, .5)

In [74]:
#probe
L5

[frozenset({'low_duration'}),
 frozenset({'low_src_bytes'}),
 frozenset({'no_guest_login'}),
 frozenset({'low_dst_host_srv_serror_rate'}),
 frozenset({'low_dst_host_serror_rate'}),
 frozenset({'no_logged_in'}),
 frozenset({'no_land'}),
 frozenset({'high_rerror_rate'}),
 frozenset({'high_dst_host_diff_srv_rate'}),
 frozenset({'low_count'}),
 frozenset({'low_dst_host_same_srv_rate'}),
 frozenset({'high_dst_host_same_src_port_rate'}),
 frozenset({'low_dst_bytes'}),
 frozenset({'high_dst_host_count'}),
 frozenset({'low_srv_count'}),
 frozenset({'no_hot_login'}),
 frozenset({'low_dst_host_srv_count'}),
 frozenset({'high_srv_rerror_rate'})]

In [75]:
# The following are common to all sets so it represents a "don't care" for the groups

In [76]:
set(L1) & set(L2) & set(L3) & set(L4) & set(L5)

{frozenset({'low_dst_host_serror_rate'}),
 frozenset({'no_hot_login'}),
 frozenset({'low_dst_host_srv_serror_rate'}),
 frozenset({'no_land'}),
 frozenset({'no_guest_login'})}

In [77]:
L1 = [x for x in L1 if x != frozenset({'low_dst_host_serror_rate'})]
L1 = [x for x in L1 if x != frozenset({'no_hot_login'})]
L1 = [x for x in L1 if x != frozenset({'low_dst_host_srv_serror_rate'})]
L1 = [x for x in L1 if x != frozenset({'no_land'})]
L1 = [x for x in L1 if x != frozenset({'no_guest_login'})]

L2 = [x for x in L2 if x != frozenset({'low_dst_host_serror_rate'})]
L2 = [x for x in L2 if x != frozenset({'no_hot_login'})]
L2 = [x for x in L2 if x != frozenset({'low_dst_host_srv_serror_rate'})]
L2 = [x for x in L2 if x != frozenset({'no_land'})]
L2 = [x for x in L2 if x != frozenset({'no_guest_login'})]

L3 = [x for x in L3 if x != frozenset({'low_dst_host_serror_rate'})]
L3 = [x for x in L3 if x != frozenset({'no_hot_login'})]
L3 = [x for x in L3 if x != frozenset({'low_dst_host_srv_serror_rate'})]
L3 = [x for x in L3 if x != frozenset({'no_land'})]
L3 = [x for x in L3 if x != frozenset({'no_guest_login'})]

L4 = [x for x in L4 if x != frozenset({'low_dst_host_serror_rate'})]
L4 = [x for x in L4 if x != frozenset({'no_hot_login'})]
L4 = [x for x in L4 if x != frozenset({'low_dst_host_srv_serror_rate'})]
L4 = [x for x in L4 if x != frozenset({'no_land'})]
L4 = [x for x in L4 if x != frozenset({'no_guest_login'})]

L5 = [x for x in L5 if x != frozenset({'low_dst_host_serror_rate'})]
L5 = [x for x in L5 if x != frozenset({'no_hot_login'})]
L5 = [x for x in L5 if x != frozenset({'low_dst_host_srv_serror_rate'})]
L5 = [x for x in L5 if x != frozenset({'no_land'})]
L5 = [x for x in L5 if x != frozenset({'no_guest_login'})]

In [78]:
#u2r
L1

[frozenset({'low_dst_host_same_src_port_rate'}),
 frozenset({'low_src_bytes'}),
 frozenset({'yes_logged_in'}),
 frozenset({'low_dst_host_count'}),
 frozenset({'low_count'}),
 frozenset({'medium_dst_bytes'}),
 frozenset({'root_shell'}),
 frozenset({'low_srv_count'}),
 frozenset({'low_dst_host_srv_count'}),
 frozenset({'hot'}),
 frozenset({'high_dst_host_sam_srv_rate'})]

In [79]:
#normal
L2

[frozenset({'low_dst_host_same_src_port_rate'}),
 frozenset({'low_duration'}),
 frozenset({'high_dst_host_srv_diff_host_rate'}),
 frozenset({'low_src_bytes'}),
 frozenset({'yes_logged_in'}),
 frozenset({'low_dst_host_count'}),
 frozenset({'low_count'}),
 frozenset({'medium_dst_bytes'}),
 frozenset({'high_dst_host_srv_count'}),
 frozenset({'low_srv_count'}),
 frozenset({'high_dst_host_sam_srv_rate'})]

In [80]:
#dos
L3

[frozenset({'low_duration'}),
 frozenset({'no_logged_in'}),
 frozenset({'medium_src_bytes'}),
 frozenset({'high_dst_host_same_src_port_rate'}),
 frozenset({'high_dst_host_srv_count'}),
 frozenset({'low_dst_bytes'}),
 frozenset({'high_dst_host_count'}),
 frozenset({'high_srv_count'}),
 frozenset({'high_count'}),
 frozenset({'high_dst_host_sam_srv_rate'})]

In [81]:
#rl2
L4

[frozenset({'high_dst_host_srv_diff_host_rate'}),
 frozenset({'low_src_bytes'}),
 frozenset({'yes_logged_in'}),
 frozenset({'low_duration'}),
 frozenset({'low_dst_host_count'}),
 frozenset({'low_count'}),
 frozenset({'high_dst_host_same_src_port_rate'}),
 frozenset({'low_dst_bytes'}),
 frozenset({'low_srv_count'}),
 frozenset({'low_dst_host_srv_count'}),
 frozenset({'high_dst_host_sam_srv_rate'})]

In [82]:
#probe
L5

[frozenset({'low_duration'}),
 frozenset({'low_src_bytes'}),
 frozenset({'no_logged_in'}),
 frozenset({'high_rerror_rate'}),
 frozenset({'high_dst_host_diff_srv_rate'}),
 frozenset({'low_count'}),
 frozenset({'low_dst_host_same_srv_rate'}),
 frozenset({'high_dst_host_same_src_port_rate'}),
 frozenset({'low_dst_bytes'}),
 frozenset({'high_dst_host_count'}),
 frozenset({'low_srv_count'}),
 frozenset({'low_dst_host_srv_count'}),
 frozenset({'high_srv_rerror_rate'})]

In [95]:
def make_guess(test_signature):
    C_test = []
    C_test.append(test_signature)
    C_test = createC1(C_test)
    dif1 = len(set(C_test) & set(L1))
    dif2 = len(set(C_test) & set(L2))
    dif3 = len(set(C_test) & set(L3))
    dif4 = len(set(C_test) & set(L4))
    dif5 = len(set(C_test) & set(L5))
    
    max = dif2
    result = "normal"
    if dif1 > max:
        max = dif1
        result = "u2r"
    if dif3 > max:
        max = dif3
        result = "dos"
    if dif4 > max:
        max = dif4
        result = "rl2"
    if dif5 > max:
        result = "probe"
    return result
    

In [96]:
correct = 0
false_negative = 0
for x in range(len(normal_baskets)):
    result = make_guess(normal_baskets[x])
    if result == "normal":
        correct = correct +1
    else:
        false_negative = false_negative +1
print "num correct " + str(correct)
print "num false negative " + str(false_negative)
print "percent correct " + str(100* float(correct)/(float(correct+false_negative)))
print "false negative percent " + str(100* float(false_negative)/(float(correct+false_negative)))

num correct 78213
num false negative 19065
percent correct 80.4015296367
false negative percent 19.5984703633
