## Label the ISCX Data


### 1) Load the Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
%%time
#load the data
data = pd.read_csv('ISCX_ISCX_Botnet.csv')

Wall time: 889 ms


In [3]:
data.head()

Unnamed: 0,Source IP,Source Port,Destination IP,Destination Port,Protocol,Flow Duration,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,...,Bwd IAT Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label
0,8.6.0.1,0,8.0.6.4,0,0,61804233,0.0,0.210342,5150353.0,4942438.0,...,0.0,910704.5,280640.074851,1109147.0,712262.0,9003695.5,4108267.0,15120887.0,5587976.0,ISCX
1,147.32.84.180,1040,147.32.84.171,139,6,14119,129470.925703,1416.530916,743.1053,1697.691,...,184.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISCX
2,147.32.84.180,1041,147.32.84.19,139,6,40800,7205.882353,196.078431,5828.571,11137.09,...,1459.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISCX
3,147.32.84.180,1042,147.32.84.19,139,6,44000,6681.818182,181.818182,6285.714,11020.03,...,1409.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISCX
4,147.32.84.180,1043,147.32.84.171,139,6,56403,62656.241689,567.345709,1819.452,3344.01,...,291.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISCX


### Shuffle the data

In [4]:
data = data.sample(frac=1).reset_index(drop=True)

In [5]:
data.columns

Index(['Source IP', ' Source Port', ' Destination IP', ' Destination Port',
       ' Protocol', ' Flow Duration', ' Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min',
       'Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min',
       'Active Mean', ' Active Std', ' Active Max', ' Active Min', 'Idle Mean',
       ' Idle Std', ' Idle Max', ' Idle Min', 'label'],
      dtype='object')

In [6]:
data.shape

(309206, 29)

In [7]:
#take a look at the data
data.head()

Unnamed: 0,Source IP,Source Port,Destination IP,Destination Port,Protocol,Flow Duration,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,...,Bwd IAT Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label
0,192.168.3.116,2520,66.225.226.27,80,6,507,0.0,5917.159763,253.5,303.348809,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISCX
1,158.65.110.24,41339,158.65.12.103,53,17,4000,27000.0,500.0,4000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISCX
2,192.168.248.165,1249,54.230.16.240,80,6,208032,0.0,9.613906,208032.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISCX
3,172.16.0.11,14042,212.95.7.75,53,17,452020,477.854962,4.424583,452020.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISCX
4,66.249.73.56,54067,158.65.110.24,80,6,777389,231.544311,12.863573,86376.555556,228531.318538,...,96.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISCX


In [8]:
#take the first 10000 rows to save time
## feel free to use the entire dataset in your own time
data = data.iloc[:10000,:]
data.shape

(10000, 29)

## Max and Min values in features
The resulting feature values after applying FlowMeter to transform the data from PCAP into csv could be as high as +infinity or as low as -infinity .. if that's the case .. it's a good idea to replace those values with reasonable values such as a very hight or very lowe number

In [9]:
## check the max and min values in the features
print(data.max())
print(data.min())

Source IP             99.24.221.80
 Source Port                 65500
 Destination IP      99.88.240.160
 Destination Port            65534
 Protocol                       17
 Flow Duration           119999709
 Flow Bytes/s          9.95714e+07
 Flow Packets/s                inf
 Flow IAT Mean         1.12716e+08
 Flow IAT Std          2.36174e+11
 Flow IAT Max          4.31901e+09
 Flow IAT Min          1.12716e+08
Fwd IAT Mean           1.12716e+08
 Fwd IAT Std           2.36174e+11
 Fwd IAT Max           4.31901e+09
 Fwd IAT Min           1.12716e+08
Bwd IAT Mean           2.16008e+09
 Bwd IAT Std           7.83745e+10
 Bwd IAT Max           3.60018e+09
 Bwd IAT Min           7.19981e+08
Active Mean            4.41963e+07
 Active Std            5.91558e+07
 Active Max            8.60258e+07
 Active Min            3.25982e+07
Idle Mean              4.31901e+09
 Idle Std              2.03661e+09
 Idle Max              4.31901e+09
 Idle Min              4.31901e+09
label               

In [10]:
## Here we replace the +inf or the -inf with a reasonable value
## you can try with the highest possible value in Python: sys.float_info.max
data.replace(np.inf, 1000000000000.0, inplace=True)
data.replace(-np.inf, -1000000000000.0, inplace=True)

### 2) Load list of IP addresses and their corresponding Botnet Names

Data taken from: https://www.unb.ca/cic/datasets/botnet.html

#### Also write functions to apply labelling according to Source and Destination IP addresses

In [11]:
# load ip addresses and the labels
ip1 = pd.read_csv('bots1.csv')
ip2 = pd.read_csv('bots2.csv')

In [12]:
#this function goes through the data one row at a time, checks the source IP and checkes if it exists in ip1
#if so, then this row is given the Botnet label
def find_class1(row):
    sourceIP = str(row['Source IP'])
    #destIP = str(row[' Destination IP'])
    #print(sourceIP+' -- '+destIP)
    for index, ip_row in ip1.iterrows():
        #print('\tcompare to: ' + ip_row['SrcIP']+' -- '+ip_row['DestIP'])
        if sourceIP == ip_row['IP']:
            s = ip_row['Bot']
            return s
    
    return 'Other'

In [13]:
# this function goes through the data one row at a time, checks the source and dest IPs and checkes if they both
# exist in ip2, if so, then this row is given the Botnet label
def find_class2(row):
    sourceIP = str(row['Source IP'])
    destIP = str(row[' Destination IP'])
    #print(sourceIP+' -- '+destIP)
    for index, ip_row in ip2.iterrows():
        #print('\tcompare to: ' + ip_row['SrcIP']+' -- '+ip_row['DestIP'])
        if sourceIP == ip_row['SrcIP'] and destIP == ip_row['DestIP']:
            s = ip_row['Bot']
            return s
    
    return 'Other'        

In [14]:
%%time
labels1 = data.apply(find_class1, axis=1)

Wall time: 9.45 s


In [15]:
labels1

0             Other
1        Weasel Bot
2       Zero access
3             Other
4             Other
           ...     
9995     Weasel Bot
9996          Other
9997          Other
9998          Other
9999          Virut
Length: 10000, dtype: object

In [16]:
len(labels1[labels1 == 'Other']) 

4989

In [17]:
%%time
labels2 = data.apply(find_class2, axis=1)

Wall time: 8.74 s


In [18]:
len(labels2[labels2 == 'Other'])

9864

In [19]:
#len(ls1)

In [20]:
ls1 = list(labels1.values)

In [21]:
ls2 = list(labels2.values)

In [22]:
len(ls1)

10000

In [23]:
label = list()

In [24]:
# now if a label is "Other" in both lists, then it's Normal
# if it's "Other" in one list only, then we assign the label from the other list
for a, b in zip(ls1, ls2):
    if a == 'Other' and b == 'Other':
        label.append('Normal')
    else:
        if a == 'Other':
            label.append(b)
        else:
            label.append(a)

In [25]:
label

['Normal',
 'Weasel Bot',
 'Zero access',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Weasel Bot',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Menti',
 'Neris',
 'TBot',
 'Neris',
 'Normal',
 'Virut',
 'Normal',
 'Normal',
 'Virut',
 'Neris',
 'Weasel Bot',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Neris',
 'Normal',
 'Weasel Bot',
 'Normal',
 'Virut',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Weasel Bot',
 'Weasel Bot',
 'Virut',
 'Virut',
 'Normal',
 'Virut',
 'Normal',
 'Normal',
 'Normal',
 'Weasel Bot',
 'Normal',
 'Virut',
 'Virut',
 'Virut',
 'Murlo',
 'Normal',
 'Weasel Bot',
 'Normal',
 'Weasel Bot',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'IRC',
 'Murlo',
 'IRC',
 'Normal',
 'Virut',
 'Neris',
 'Virut',
 'Neris',
 'Weasel Bot',
 'Normal',
 'Normal',
 'Weasel Bot',
 'Weasel Bot',
 'Weasel Bot',
 'Normal',
 'Murlo',
 'Virut',
 'Normal',
 'Normal',
 'Virut',
 'Murlo',
 'Normal',
 'Normal',
 'Normal',
 'Virut',
 'IRC',
 'Normal',
 'Weasel Bo

#### Here we add 'BotNet_Label' column to the data after we filled it as above

In [26]:
data['BotNet_Label'] = label

In [27]:
data.head()

Unnamed: 0,Source IP,Source Port,Destination IP,Destination Port,Protocol,Flow Duration,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label,BotNet_Label
0,192.168.3.116,2520,66.225.226.27,80,6,507,0.0,5917.159763,253.5,303.348809,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISCX,Normal
1,158.65.110.24,41339,158.65.12.103,53,17,4000,27000.0,500.0,4000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISCX,Weasel Bot
2,192.168.248.165,1249,54.230.16.240,80,6,208032,0.0,9.613906,208032.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISCX,Zero access
3,172.16.0.11,14042,212.95.7.75,53,17,452020,477.854962,4.424583,452020.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISCX,Normal
4,66.249.73.56,54067,158.65.110.24,80,6,777389,231.544311,12.863573,86376.555556,228531.318538,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISCX,Normal


In [28]:
data.columns

Index(['Source IP', ' Source Port', ' Destination IP', ' Destination Port',
       ' Protocol', ' Flow Duration', ' Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min',
       'Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min',
       'Active Mean', ' Active Std', ' Active Max', ' Active Min', 'Idle Mean',
       ' Idle Std', ' Idle Max', ' Idle Min', 'label', 'BotNet_Label'],
      dtype='object')

In [29]:
#remove spaces from column names
data = data.rename(columns=lambda x: x.strip())

In [30]:
# Explore BotNet_Label values
data['BotNet_Label'].value_counts()

Normal                    4853
Weasel Bot                2158
Virut                     1405
Neris                      742
Murlo                      400
Menti                      163
IRC                        136
Zero access                 71
TBot                        22
Black hole 2                14
Zeus                        14
RBot                         6
Black hole 3                 4
IRCbot and black hole1       3
Sogou                        3
Smoke bot                    2
Weasel Botmaster             2
Osx_trojan                   2
Name: BotNet_Label, dtype: int64

In [31]:
#remove unimportant columns
data.drop(['Source IP','Destination IP','label'],inplace=True,axis=1)

In [32]:
data.shape

(10000, 27)

### This is how to apply one-hot encoding using Pandas

In [33]:
df_src_port = pd.get_dummies(data['Source Port'],prefix='SrcPort')
df_dest_port = pd.get_dummies(data['Destination Port'],prefix='DestPort')
df_protocol = pd.get_dummies(data['Protocol'],prefix='Protocol')

In [34]:
df_src_port.head()

Unnamed: 0,SrcPort_0,SrcPort_22,SrcPort_25,SrcPort_53,SrcPort_68,SrcPort_80,SrcPort_110,SrcPort_123,SrcPort_135,SrcPort_137,...,SrcPort_64661,SrcPort_64674,SrcPort_64784,SrcPort_64786,SrcPort_64787,SrcPort_64841,SrcPort_64974,SrcPort_65266,SrcPort_65430,SrcPort_65500
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
data = pd.concat([data, df_src_port,df_dest_port,df_protocol], axis=1)
data.shape

(10000, 7696)

In [36]:
data.drop(['Source Port','Destination Port','Protocol'],inplace=True,axis=1)
data.shape

(10000, 7693)

### Save the Data .. it is ready for further analysis and machine learning

In [37]:
%%time
data.to_csv('ISCX_Botnet_Labelled.csv',index=False)

Wall time: 13.8 s
