## How to Label the ISCX Data


### 1) Load the Data

In [1]:
import pandas as pd
from time import time

In [2]:
# Read the ISCX botnet flow records from disk and report how long the read took
t0 = time()
data = pd.read_csv('ISCX_ISCX_Botnet.csv')
t1 = time()
load_seconds = t1 - t0
print('data loaded in %f seconds' % load_seconds, flush=True)

data loaded in 1.648654 seconds


### Shuffle the data

In [3]:
# Shuffle every row and renumber the index from 0. The fixed seed keeps the
# shuffle -- and therefore the 10000-row subset taken further down -- identical
# across kernel restarts, so the counts shown in this notebook can be reproduced.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [4]:
# List the column names; note that many of them carry a leading space
# (e.g. ' Destination IP'), which matters when indexing by name below
data.columns

Index(['Source IP', ' Source Port', ' Destination IP', ' Destination Port',
       ' Protocol', ' Flow Duration', ' Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min',
       'Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min',
       'Active Mean', ' Active Std', ' Active Max', ' Active Min', 'Idle Mean',
       ' Idle Std', ' Idle Max', ' Idle Min', 'label'],
      dtype='object')

In [5]:
# (rows, columns) of the full shuffled dataset
data.shape

(309206, 29)

In [6]:
#take a look at the data
#data.head()

In [7]:
# Keep only the first 10000 rows to save time. The explicit .copy() makes
# `data` an independent frame rather than a slice of the full dataset, so the
# column assignment and in-place drops performed later do not operate on a
# view (which is what triggers pandas' SettingWithCopyWarning).
data = data.iloc[:10000, :].copy()
data.shape

(10000, 29)

### 2) Load list of IP addresses and their corresponding Botnet Names

Data taken from: https://www.unb.ca/cic/datasets/botnet.html

#### Also write functions to apply labelling according to Source and Destination IP addresses

In [8]:
# Read the two lookup tables that map IP addresses to botnet names
ip1, ip2 = (pd.read_csv(csv_name) for csv_name in ('bots1.csv', 'bots2.csv'))

In [9]:
def find_class1(row, ip_table=None):
    """Return the botnet name registered for a flow's source IP.

    Intended for ``data.apply(find_class1, axis=1)``: it is called once per
    flow record and labels the record by its source address only.

    Parameters
    ----------
    row : mapping or pandas.Series
        One flow record; must provide a 'Source IP' entry.
    ip_table : pandas.DataFrame, optional
        Lookup table with 'IP' and 'Bot' columns. When omitted, the
        notebook-level ``ip1`` frame is used.

    Returns
    -------
    The 'Bot' value of the first row of ``ip_table`` whose 'IP' equals the
    flow's source IP (compared as a string), or 'Other' if no row matches.
    """
    if ip_table is None:
        ip_table = ip1
    if ip_table.empty:
        return 'Other'

    source_ip = str(row['Source IP'])
    # A single vectorized comparison replaces a Python-level iterrows() scan
    # of the whole lookup table for every flow record.
    hits = ip_table.loc[ip_table['IP'] == source_ip, 'Bot']
    if hits.empty:
        return 'Other'
    # Keep the "first match wins" behaviour of a top-to-bottom scan
    return hits.iloc[0]

In [10]:
def find_class2(row, ip_table=None):
    """Return the botnet name registered for a flow's (source, destination) IP pair.

    Intended for ``data.apply(find_class2, axis=1)``: it is called once per
    flow record. The match is directional -- the flow's source must equal
    'SrcIP' and its destination must equal 'DestIP' in the same table row.

    Parameters
    ----------
    row : mapping or pandas.Series
        One flow record; must provide 'Source IP' and ' Destination IP'
        entries (the leading space in the second key is part of the raw
        column name at the point this function is applied).
    ip_table : pandas.DataFrame, optional
        Lookup table with 'SrcIP', 'DestIP' and 'Bot' columns. When omitted,
        the notebook-level ``ip2`` frame is used.

    Returns
    -------
    The 'Bot' value of the first matching row of ``ip_table``, or 'Other'
    if no row matches both addresses.
    """
    if ip_table is None:
        ip_table = ip2
    if ip_table.empty:
        return 'Other'

    source_ip = str(row['Source IP'])
    dest_ip = str(row[' Destination IP'])
    # A single vectorized comparison replaces a Python-level iterrows() scan
    # of the whole lookup table for every flow record.
    pair_matches = (ip_table['SrcIP'] == source_ip) & (ip_table['DestIP'] == dest_ip)
    hits = ip_table.loc[pair_matches, 'Bot']
    if hits.empty:
        return 'Other'
    # Keep the "first match wins" behaviour of a top-to-bottom scan
    return hits.iloc[0]

In [11]:
# Label every flow by its source IP (one find_class1 call per row)
labels1 = data.apply(find_class1, axis='columns')

In [12]:
# How many flows did the source-IP lookup leave unlabelled?
int((labels1 == 'Other').sum())

4942

In [13]:
# Label every flow by its (source, destination) IP pair (one find_class2 call per row)
labels2 = data.apply(find_class2, axis='columns')

In [14]:
# How many flows did the IP-pair lookup leave unlabelled?
int((labels2 == 'Other').sum())

9858

In [15]:
#len(ls1)

In [16]:
# Plain Python list of the source-IP labels, in row order
ls1 = [value for value in labels1.values]

In [17]:
# Plain Python list of the IP-pair labels, in row order
ls2 = [value for value in labels2.values]

In [18]:
# Number of labels produced by the source-IP lookup
len(ls1)

10000

In [19]:
# Start with an empty list that will hold the final label of each flow
label = []

In [20]:
# Combine the two per-row lookups into one final label:
#   - 'Other' in both lists        -> 'Normal'
#   - 'Other' in the first only    -> take the label from the second list
#   - anything else                -> take the label from the first list
# The list is rebuilt from scratch in this cell (instead of appending to a
# list created elsewhere), so re-running the cell cannot duplicate entries.
label = [
    'Normal' if (a == 'Other' and b == 'Other') else (b if a == 'Other' else a)
    for a, b in zip(ls1, ls2)
]

#### Here we add the 'BotNet_Label' column to the data, filled in as described above

In [21]:
# Attach the combined labels as a new column. assign() returns a new frame
# rather than writing into `data` in place; `data` was produced by an iloc
# slice earlier, and in-place column assignment on such a frame is what
# triggers pandas' SettingWithCopyWarning.
data = data.assign(BotNet_Label=label)

In [22]:
# Trim leading/trailing whitespace from every column name
data = data.rename(columns=str.strip)

In [23]:
# Explore BotNet_Label values: number of flows per assigned label,
# most frequent first
data['BotNet_Label'].value_counts()

Normal                    4800
Weasel Bot                2195
Virut                     1422
Neris                      737
Murlo                      402
Menti                      169
IRC                        142
Zero access                 65
TBot                        23
Zeus                        15
Black hole 2                14
Black hole 3                 4
Sogou                        3
Weasel Botmaster             3
IRCbot and black hole1       2
Smoke bot                    2
RBot                         2
Name: BotNet_Label, dtype: int64

In [24]:
# Discard the raw IP addresses and the original 'label' column
data = data.drop(columns=['Source IP', 'Destination IP', 'label'])

### This is how to apply one-hot encoding using Pandas

In [25]:
# One indicator (dummy) frame per categorical column, each with its own prefix
df_src_port, df_dest_port, df_protocol = (
    pd.get_dummies(data[column_name], prefix=dummy_prefix)
    for column_name, dummy_prefix in (
        ('Source Port', 'SrcPort'),
        ('Destination Port', 'DestPort'),
        ('Protocol', 'Protocol'),
    )
)

In [26]:
# Append the indicator columns to the right of the existing columns
data = pd.concat(
    (data, df_src_port, df_dest_port, df_protocol),
    axis='columns',
)
data.shape

(10000, 7670)

In [27]:
# The original categorical columns are now redundant with their indicator columns
data = data.drop(columns=['Source Port', 'Destination Port', 'Protocol'])
data.shape

(10000, 7667)

### Save the Data: it is now ready for further analysis and machine learning

In [28]:
# Write the labelled, encoded frame to disk (without the index) and report the elapsed time
t0 = time()
data.to_csv('ISCX_Botnet_Labelled.csv', index=False)
t1 = time()
save_seconds = t1 - t0
print('data saved in %f seconds' % save_seconds, flush=True)

data saved in 28.528847 seconds
