## How to Label the ISCX Data


### 1) Load the Data

In [None]:
import pandas as pd
import numpy as np

In [None]:
%%time
# Load the ISCX botnet CSV into a DataFrame.
# The path is relative to the notebook's working directory.
data = pd.read_csv('../Data/ISCX_ISCX_Botnet.csv')

In [None]:
# Preview the first rows to confirm the file parsed as expected.
data.head()

In [None]:
# (rows, columns) of the dataset as loaded.
data.shape

### Shuffle the data

In [None]:
# sample(frac=1) returns every row in random order (no random_state is set,
# so the order differs between runs); reset_index(drop=True) renumbers the
# rows from 0 and discards the old index.
data = data.sample(frac=1).reset_index(drop=True)

In [None]:
# List the column names exactly as read from the CSV header.
# NOTE(review): some names appear to carry a leading space
# (e.g. ' Destination IP' is used further down) - confirm against this output.
data.columns

In [None]:
# Shape after shuffling; sample(frac=1) keeps every row, so it is unchanged.
data.shape

In [None]:
#take a look at the data
#data.head()

In [None]:
# Take the first 10,000 (already shuffled) rows to save time.
## Feel free to use the entire dataset in your own time.
# .copy() makes the subset an independent DataFrame, so the in-place edits and
# column assignments that follow act on its own data rather than on a slice
# of the full frame.
data = data.iloc[:10000, :].copy()
data.shape

## Max and Min values in features
After FlowMeter converts the data from PCAP to CSV, some feature values can be as high as +infinity or as low as -infinity. If that is the case, it is a good idea to replace those values with reasonable finite values, such as a very high or very low number.

In [None]:
## Check the per-column maxima to spot +inf values in the features.
## The matching minimum check (for -inf) is left commented out.
print(data.max())
#print(data.min())

In [None]:
## Replace both +inf and -inf with large finite stand-ins so that no
## infinite values remain in the features.
## You can also try the largest finite float in Python: sys.float_info.max
data.replace(
    [np.inf, -np.inf],
    [1000000000000.0, -1000000000000.0],
    inplace=True,
)

### 2) Load list of IP addresses and their corresponding Botnet Names

Data taken from: https://www.unb.ca/cic/datasets/botnet.html

#### Also write functions to apply labelling according to Source and Destination IP addresses

In [None]:
# Load the known-bot IP lists and their botnet names.
# ip1 is read by find_class1 through its 'IP' and 'Bot' columns;
# ip2 is read by find_class2 through its 'SrcIP', 'DestIP' and 'Bot' columns.
ip1 = pd.read_csv('../Data/bots1.csv')
ip2 = pd.read_csv('../Data/bots2.csv')

In [None]:
def find_class1(row):
    """Label one data row by looking up its source IP in ``ip1``.

    Intended for ``data.apply(find_class1, axis=1)``.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row)
        Must provide a 'Source IP' entry; it is compared as a string.

    Returns
    -------
    The 'Bot' value of the first ``ip1`` row whose 'IP' equals the source IP,
    or 'Other' when no row matches.
    """
    sourceIP = str(row['Source IP'])
    # One vectorised comparison against the whole 'IP' column replaces a
    # Python-level iterrows() scan of ip1 for every data row.
    matches = ip1.loc[ip1['IP'] == sourceIP, 'Bot']
    if matches.empty:
        return 'Other'
    # First match wins, as with the original top-to-bottom scan.
    return matches.iloc[0]

In [None]:
def find_class2(row):
    """Label one data row by looking up its (source, destination) IPs in ``ip2``.

    Intended for ``data.apply(find_class2, axis=1)``.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row)
        Must provide 'Source IP' and ' Destination IP' entries (the latter
        with its leading space, as in the raw column names); both are
        compared as strings.

    Returns
    -------
    The 'Bot' value of the first ``ip2`` row whose 'SrcIP' and 'DestIP' both
    match, or 'Other' when no row matches.
    """
    sourceIP = str(row['Source IP'])
    destIP = str(row[' Destination IP'])
    # One vectorised comparison over both columns replaces a Python-level
    # iterrows() scan of ip2 for every data row.
    both_match = (ip2['SrcIP'] == sourceIP) & (ip2['DestIP'] == destIP)
    matches = ip2.loc[both_match, 'Bot']
    if matches.empty:
        return 'Other'
    # First match wins, as with the original top-to-bottom scan.
    return matches.iloc[0]

In [None]:
%%time
# axis=1 passes each row of data to find_class1; the result holds one label
# per row ('Other' where the source IP is not in ip1).
labels1 = data.apply(find_class1, axis=1)

In [None]:
# Number of rows that find_class1 left as 'Other'.
len(labels1[labels1 == 'Other']) 

In [None]:
%%time
# axis=1 passes each row of data to find_class2; the result holds one label
# per row ('Other' where the source/destination pair is not in ip2).
labels2 = data.apply(find_class2, axis=1)

In [None]:
# Number of rows that find_class2 left as 'Other'.
len(labels2[labels2 == 'Other'])

In [None]:
#len(ls1)

In [None]:
# Plain Python list of the find_class1 labels, in row order.
ls1 = list(labels1.values)

In [None]:
# Plain Python list of the find_class2 labels, in row order.
ls2 = list(labels2.values)

In [None]:
# Number of labels in ls1 (one per row passed to find_class1).
len(ls1)

In [None]:
# Start with an empty list that will hold the final label of every row.
label = []

In [None]:
# Merge the two label lists row by row. `label` is rebuilt from scratch here,
# so re-running this cell cannot append a second copy of the labels.
#   - a label other than 'Other' in ls1 (source-IP match) is used as-is;
#   - otherwise the label from ls2 (source/destination match) is used;
#   - a row that is 'Other' in both lists is 'Normal'.
label = [
    a if a != 'Other' else (b if b != 'Other' else 'Normal')
    for a, b in zip(ls1, ls2)
]

In [None]:
# Display the merged labels (this shows the whole list).
label

#### Here we add the 'BotNet_Label' column to the data, using the labels built above

In [None]:
# Attach the merged labels as a new column; the list is assigned positionally,
# so it needs one entry per row of data, in row order.
data['BotNet_Label'] = label

In [None]:
# Preview the first rows, now including the BotNet_Label column.
data.head()

In [None]:
# Column names before whitespace is stripped from them.
data.columns

In [None]:
# Strip leading/trailing whitespace from every column name
# (e.g. ' Destination IP' becomes 'Destination IP').
data = data.rename(columns=str.strip)

In [None]:
# Column names after whitespace has been stripped.
data.columns

In [None]:
# Explore BotNet_Label values: count how many rows carry each label.
data['BotNet_Label'].value_counts()

In [None]:
# Drop the columns that are no longer needed: both IP address columns and
# the 'label' column.
data.drop(
    columns=['Source IP', 'Destination IP', 'label'],
    inplace=True,
)

In [None]:
# Preview the data after the columns were dropped.
data.head()

### This is how to apply one-hot encoding using Pandas

In [None]:
# Build one indicator (one-hot) frame per categorical column; each new column
# is named '<prefix>_<value>'.
df_src_port, df_dest_port, df_protocol = (
    pd.get_dummies(data[column], prefix=prefix)
    for column, prefix in (
        ('Source Port', 'SrcPort'),
        ('Destination Port', 'DestPort'),
        ('Protocol', 'Protocol'),
    )
)

In [None]:
# Preview the one-hot encoded source-port columns.
df_src_port.head()

In [None]:
# Append the three indicator frames to the right of the existing columns,
# then show the new shape.
data = pd.concat(
    [data, df_src_port, df_dest_port, df_protocol],
    axis='columns',
)
data.shape

In [None]:
# The original port/protocol columns are now represented by their one-hot
# columns, so remove them and show the resulting shape.
data.drop(
    columns=['Source Port', 'Destination Port', 'Protocol'],
    inplace=True,
)
data.shape

### Save the Data: it is now ready for further analysis and machine learning

In [None]:
%%time
# Write the labelled, encoded data to CSV; index=False leaves the row index
# out of the file.
data.to_csv('../Data/ISCX_Botnet_Labelled.csv',index=False)