     
This code is submitted as part of project 2 for the subject COMP90073 (Security Analytics) at the University of Melbourne .
     
    -------------------------------------------
    COMP90073 Security Analytics - Project 2 
    Machine learning based Threat detection
    
    
    Author : Mohammed Ahsan Kollathodi 
    Student id: 1048942.
    

#### The Primary aim through this code to perform pre-processing or cleaning to the raw data provided for the task 2 of the project.  

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder

###  Train data 

The dataset provided contain the NetFlow data for a network under cyberattacks. Each line of the dataset includes the following 15 fields: (1) stream ID, (2) timestamp, (3) duration, (4) protocol, (5) source IP address, (6) source port, (7) direction, (8) destination IP address, (9) destination port, (10) state, (11) source type of service, (12) destination type of service, (13) the number of total packets, (14) the number of bytes transferred in both directions, (15) the number of bytes transferred from the source to the destination.

I have not labelled stream ID as it's not very relevant with respect to the project. 

In [3]:
train_df = pd.read_csv('training_data_with_label.csv', sep=','
                       ,names=['timestamp','duration','protocol','src_ip','src_port','direction',
                               'dst_ip','dst_port','state','srctye_service','dsttype_service',
                               'num_total_packets','total_bytes','src_bytes','label'])

In [4]:
labels_new = pd.DataFrame(data=np.where(train_df['label'].str.contains("Botnet",case=False), 1, 0),columns=["Botnet"])

In [5]:
train_df_selected = train_df[['src_ip','dst_ip','direction','duration','num_total_packets','total_bytes','src_bytes']]


In [6]:
train_df_selected = pd.concat([train_df_selected,labels_new], axis=1)

In [7]:
train_df_selected.head()

Unnamed: 0,src_ip,dst_ip,direction,duration,num_total_packets,total_bytes,src_bytes,Botnet
0,,,,,,,,0.0
1,49.199.46.19,150.35.87.121,<->,1823.865899,1.0,213.0,107.0,0.0
2,191.78.136.101,150.35.89.128,<->,3616.701751,416111.0,206682147.0,181257472.0,0.0
3,122.2.175.95,150.35.87.17,<?>,2058.744475,224008.0,267775005.0,3187813.0,0.0
4,220.172.180.85,150.35.88.29,<?>,28.856103,6071.0,5664455.0,138406.0,0.0


In [8]:
# packets_in_Sec to estimate the total number of packets in a unit time.

train_df_selected['packets_in_Sec'] = train_df_selected['num_total_packets']/train_df_selected['duration']

# bytes_total_in_Sec to estimate the total number of bytes in a unit time.

train_df_selected['bytes_total_in_Sec'] = train_df_selected['total_bytes']/train_df_selected['duration']

# source_Bytes_Sec to estimate the source bytes transferred in a unit time. 

train_df_selected['Source_Bytes_Sec'] = train_df_selected['src_bytes']/train_df_selected['duration']


In [13]:
# Filtering the training data. 
filtered_train_df =  train_df_selected[~train_df_selected.isin([np.nan, np.inf, -np.inf]).any(1)]

In [10]:
# Group the data by source ip address. 
group_df = filtered_train_df.groupby(['src_ip']).mean()

In [12]:
# Counting the number of samples that would consist of the Botnet attack data. 
group_df['Botnet'].value_counts()

0.000000    32490
0.333333     5599
0.250000     3761
0.200000     1682
0.166667      879
            ...  
0.056202        1
0.037879        1
0.064644        1
0.029661        1
0.105960        1
Name: Botnet, Length: 471, dtype: int64

In [15]:
group_df.loc[(group_df['Botnet'] > 0.5), 'Botnet'] = 1.0

In [16]:
group_df.head()

Unnamed: 0_level_0,duration,num_total_packets,total_bytes,src_bytes,Botnet,packets_in_Sec,bytes_total_in_Sec,Source_Bytes_Sec
src_ip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
100.104.46.220,2661.046342,10.0,722.2,400.0,0.0,0.0039,0.279471,0.154669
100.104.70.48,1996.260106,4.0,303.75,165.5,0.0,164.066083,21984.772468,11976.77903
100.106.5.100,67.119335,3.5,367.25,260.0,0.25,1276.200709,153622.993734,115767.499796
100.107.147.140,208.207561,2.75,1308.75,531.5,0.0,617.074478,117063.29299,88074.451915
100.110.43.251,0.247294,2.166667,155.833333,85.833333,0.0,2001.622342,144130.804035,79320.056252


In [17]:
# Save as output CSV. 
group_df.to_csv('training_data_with_ip_A2.csv', sep=',')


###  Test data 

In [22]:
test_df = pd.read_csv('test_data_with_label.csv', sep=','
                       ,names=['timestamp','duration','protocol','src_ip','src_port','direction',
                               'dst_ip','dst_port','state','srctye_service','dsttype_service',
                               'num_total_packets','total_bytes','src_bytes','label'])

In [24]:
label_new_test = pd.DataFrame(data=np.where(test_df['label'].str.contains("Botnet",case=False), 1, 0),columns=["Botnet"])

In [29]:
# Seletected features. 
test_df_selected = test_df[['src_ip','dst_ip','direction','duration','num_total_packets','total_bytes','src_bytes']]

In [30]:
test_df_selected = pd.concat([test_df_selected,label_new_test], axis=1)

In [31]:
test_df_selected.head()

Unnamed: 0,src_ip,dst_ip,direction,duration,num_total_packets,total_bytes,src_bytes,Botnet
0,,,,,,,,0.0
1,85.4.198.159,150.35.87.121,<->,1998.730056,1.0,214.0,106.0,0.0
2,154.31.224.125,150.35.87.121,<->,1959.8882,1.0,214.0,107.0,0.0
3,93.186.104.171,150.35.89.92,<?>,14.908785,8.0,3166.0,2979.0,0.0
4,92.188.3.136,150.35.89.110,<?>,0.001743,4.0,600.0,457.0,0.0


In [32]:
# packets_in_Sec to estimate the total number of packets in a unit time.

test_df_selected['packets_in_Sec'] = test_df_selected['num_total_packets']/test_df_selected['duration']

# bytes_total_in_Sec to estimate the total number of bytes in a unit time.

test_df_selected['bytes_total_in_Sec'] = test_df_selected['total_bytes']/test_df_selected['duration']

# source_Bytes_Sec to estimate the source bytes transferred in a unit time. 

test_df_selected['Source_Bytes_Sec'] = test_df_selected['src_bytes']/test_df_selected['duration']


In [34]:
# Filter the test data. 
filtered_test_df = test_df_selected[~test_df_selected.isin([np.nan, np.inf, -np.inf]).any(1)]


In [35]:
group_test_df = filtered_test_df.groupby(['src_ip']).mean()

In [36]:
# Checking for the Botnet data and for the botnet attack.
group_test_df['Botnet'].value_counts()

0.000000    224172
1.000000     21659
0.500000     12186
0.333333      2088
0.666667       192
0.492925         1
0.498261         1
0.478807         1
0.413832         1
Name: Botnet, dtype: int64

In [37]:
group_test_df.loc[(group_test_df['Botnet'] > 0.5), 'Botnet'] = 1.0

In [38]:
group_test_df.to_csv('test_data_with_ip_A2.csv', sep=',')