In [2]:
import pandas as pd 
import numpy as np 

This code is submitted as part of project 2 for the subject COMP90037 (Security Analytics) at the University of  Melbourne .
     
    -------------------------------------------
    COMP90037 Security Analytics - Project 2 
    Machine learning based Threat detection

    Author : Mohammed Ahsan Kollathodi 
    Student id: 1048942.
    

#### The principal aim of this code is to perform pre-processing to the given dataset to clean dataset or remove noise.

### Train data 

The dataset provided contain the NetFlow data for a network under cyberattacks. Each line of the dataset includes the following 15 fields: (1) stream ID, (2) timestamp, (3) duration, (4) protocol, (5) source IP address, (6) source port, (7) direction, (8) destination IP address, (9) destination port, (10) state, (11) source type of service, (12) destination type of service, (13) the number of total packets, (14) the number of bytes transferred in both directions, (15) the number of bytes transferred from the source to the destination.

I have not labelled stream ID as it's not very relevant with respect to the project. 

In [3]:
# Create a new train dataset dataframe and assign fields in the given unlabelled CSV for training dataset.
train_df = pd.read_csv('training_data.csv',sep=',',names=['timestamp','duration','protocol','src_ip','src_port','direction','dst_ip','dst_port','state','srctype_service','dsttype_service','num_total_packets','total_bytes','src_bytes'])

In [4]:
print(train_df)

                          timestamp     duration protocol           src_ip  \
1        2021-08-15 18:24:13.357372     0.002631      tcp    150.35.89.169   
2        2021-08-15 18:24:13.359765  3562.810777      tcp    150.35.89.169   
3        2021-08-15 18:24:13.364512   632.856571      udp     150.35.89.27   
4        2021-08-15 18:24:13.365311     0.000000     icmp    150.35.87.121   
5        2021-08-15 18:24:13.366361  1316.538437      tcp     191.98.64.45   
...                             ...          ...      ...              ...   
1045451  2021-08-16 19:50:43.445184     0.190298      tcp    150.35.87.126   
1045452  2021-08-16 19:50:43.487227     0.120374      tcp  207.127.183.239   
1045453  2021-08-16 19:50:43.501126     0.064432      tcp     150.35.88.21   
1045454  2021-08-16 19:50:43.637173     0.000000      udp     150.35.87.62   
1045455  2021-08-16 19:50:43.676861     0.000000      udp    150.35.87.232   

         src_port direction           dst_ip  dst_port     stat

In [5]:
"""
.isna() is used to detect the missing values in the given series object. 
It would return a boolan of same-sized object indicating if the values are NA. 
# Missing values gets maapped to True and non-missing values gets mapped to False.

"""

train_df.isna().any()  # To detect missing values in the given series object. 
                       

timestamp            False
duration             False
protocol             False
src_ip               False
src_port             False
direction            False
dst_ip               False
dst_port             False
state                 True
srctype_service       True
dsttype_service       True
num_total_packets    False
total_bytes          False
src_bytes            False
dtype: bool

In [6]:
train_df.nunique()   # returns the number of unique values for each column or row. 

timestamp            1045451
duration              951288
protocol                  11
src_ip                  5498
src_port               60939
direction                  6
dst_ip                103143
dst_port               57825
state                    316
srctype_service            5
dsttype_service            4
num_total_packets       3309
total_bytes            71947
src_bytes              26904
dtype: int64

In [7]:
train_df.direction.unique()   # Cleaning the 'direction' field. 

array(['  <?>', '  <->', '   ->', '   ?>', '  who', '  <-'], dtype=object)

In [8]:
# cleaned values for the field direction in the dataset. 
# We replace the existing column 'direction' values with cleaned values. 

cleanedvalues = {'   ->':'->','  <?>':'<?>','  <->':'<->','   ?>':'<?>','  <-':'<-','  who':'<?>','  <?':'<?>'}

# replace in the existing dataset. 
train_df = train_df.replace({"direction": cleanedvalues})

# count the number of values for direction column values. 
train_df['direction'].value_counts()



<->    529973
->     485965
<?>     17322
<-      12195
Name: direction, dtype: int64

In [9]:
# We introduce a new field called as dir_ratio to have a quantitative analysis of the fields in direction.

train_df['dir_ratio'] = train_df['src_bytes']/train_df['total_bytes']


In [10]:
# Display the ratio of all fields in the column direction for the given training dataset. 

print(train_df.loc[train_df['direction'] == '->']['dir_ratio'].mean())
print(train_df.loc[train_df['direction'] == '<->']['dir_ratio'].mean())
print(train_df.loc[train_df['direction'] == '<?>']['dir_ratio'].mean())
print(train_df.loc[train_df['direction'] == '<-']['dir_ratio'].mean())


0.5732127382993537
0.37838143058224705
0.656080404635475
0.0


In [11]:
# We could find that the '<-' value in direction set would have the "source_bytes" equal to the value of zero. 
# So we further investigate all other data that are related to the above value. 

train_df.loc[train_df['direction'] == '<-'].head()

Unnamed: 0,timestamp,duration,protocol,src_ip,src_port,direction,dst_ip,dst_port,state,srctype_service,dsttype_service,num_total_packets,total_bytes,src_bytes,dir_ratio
1478,2021-08-15 18:24:34.783048,2.8e-05,icmp,150.35.90.252,0,<-,150.35.90.14,2,RED,,0.0,2,147,0,0.0
1532,2021-08-15 18:24:35.564441,0.001818,icmp,150.35.90.252,0,<-,150.35.90.16,2,RED,,0.0,1,148,0,0.0
1549,2021-08-15 18:24:35.770106,1.4e-05,icmp,150.35.90.252,0,<-,150.35.90.14,2,ECR,,0.0,1,148,0,0.0
1596,2021-08-15 18:24:36.551252,0.00043,icmp,150.35.90.252,0,<-,150.35.90.16,2,RED,,0.0,2,148,0,0.0
1610,2021-08-15 18:24:36.770012,9e-06,icmp,150.35.90.252,0,<-,150.35.90.14,2,ECR,,0.0,1,149,0,0.0


In [12]:
# The direction field in the dataset can be further completed, with the ratio being estimated.
train_df.loc[(train_df['src_bytes'] == 0) & (train_df['direction'] == '<?>'), 'direction'] = '<-'
train_df.loc[(train_df['dir_ratio'] > 0.5) & (train_df['direction'] == '<?>'), 'direction'] = '->'
train_df.loc[(train_df['dir_ratio'] <= 0.5) & (train_df['direction'] == '<?>'), 'direction'] = '<->'


In [13]:
# Count the number of values for each field in 'direction'. 
train_df['direction'].value_counts()


<->    535265
->     497900
<-      12290
Name: direction, dtype: int64

In [14]:
"""
We create new fields to estimate the total packets,
total bytes and the total number of source bytes transferred in a unit time.
"""
# packetsinSec to estimate the number of packets in a unit time.

train_df['packets_in_Sec'] = train_df['num_total_packets']/train_df['duration']

# bytestotalinSec to estimate the total number of bytes in a unit time.

train_df['bytes_total_in_Sec'] = train_df['total_bytes']/train_df['duration']

# sourcebytesinSec to estimate the source bytes transferred in a unit time. 

train_df['Source_Bytes_Sec'] = train_df['src_bytes']/train_df['duration']


In [15]:
# convert the cleaned training data into a CSV and save it in the root folder. 
train_df.to_csv('trainingdata_cleaned.csv', sep=',', index=False)

### Test data 

In [16]:
# Introduce new test dataframe. 
# Create a new test dataset dataframe and assign fields in the given unlabelled CSV for test dataset.

test_df = pd.read_csv('test_data.csv',sep=',',names=['timestamp','duration','protocol','src_ip','src_port','direction','dst_ip','dst_port','state','srctype_service','dsttype_service','num_total_packets','total_bytes','src_bytes'])




In [17]:
# To obtain the shape of the test data frame. 

test_df.shape


(348477, 14)

In [18]:
# Again we go for pre-processing or cleaning the direction field in the dataset. 

test_df = test_df.replace({"direction": cleanedvalues})

# To count the number of values in the direction field. 

test_df['direction'].value_counts()


<->    261013
->      82759
<?>      4690
<-         15
Name: direction, dtype: int64

In [19]:
# We introduce a new field called as dir_ratio to have a quantitative analysis of the fields in direction.
test_df['dir_ratio'] = test_df['src_bytes']/test_df['total_bytes']

In [20]:
# The newly created field called as "dir_ratio" would help us to have a better understanding of the 'direction' field.
test_df.loc[(test_df['src_bytes'] == 0) & (test_df['direction'] == '<?>'), 'direction'] = '<-'
test_df.loc[(test_df['dir_ratio'] > 0.5) & (test_df['direction'] == '<?>'), 'direction'] = '->'
test_df.loc[(test_df['dir_ratio'] <= 0.5) & (test_df['direction'] == '<?>'), 'direction'] = '<->'
test_df = test_df.drop('dir_ratio',1)



  test_df = test_df.drop('dir_ratio',1)


In [21]:
# packets_in_Sec to estimate the total number of packets in a unit time.

test_df['packets_in_Sec'] = test_df['num_total_packets']/test_df['duration']

# bytes_total_in_Sec to estimate the total number of bytes in a unit time.

test_df['bytes_total_in_Sec'] = test_df['total_bytes']/test_df['duration']

# source_Bytes_Sec to estimate the source bytes transferred in a unit time. 

test_df['Source_Bytes_Sec'] = test_df['src_bytes']/test_df['duration']


In [22]:
# Save the pre-processed and cleaned test dataset as CSV.

test_df.to_csv('testdata_cleaned.csv', sep=',', index=False)

### Validation data. 


In [85]:
valid_df = pd.read_csv('validation_data_with_label.csv',sep=',',names=['timestamp','duration','protocol','src_ip','src_port','direction','dst_ip','dst_port','state','srctype_service','dsttype_service','num_total_packets','total_bytes','src_bytes','label'])


In [86]:
# packets_in_Sec to estimate the total number of packets in a unit time.
valid_df['packets_in_Sec'] = valid_df['num_total_packets']/valid_df['duration']

# bytes_total_in_Sec to estimate the total number of bytes in a unit time.
valid_df['bytes_total_in_Sec'] = valid_df['total_bytes']/valid_df['duration']

# source_Bytes_Sec to estimate the source bytes transferred in a unit time. 
valid_df['Source_Bytes_Sec'] = valid_df['src_bytes']/valid_df['duration']

In [87]:
valid_df.loc[valid_df['packets_in_Sec'] != np.inf].describe().apply(lambda s: s.apply(lambda x: format(x, 'g')))

Unnamed: 0,duration,src_port,dst_port,srctype_service,dsttype_service,num_total_packets,total_bytes,src_bytes,packets_in_Sec,bytes_total_in_Sec,Source_Bytes_Sec
count,337024.0,337024.0,337024.0,337023,322277.0,337024.0,337024.0,337024.0,325567.0,337024.0,337024.0
mean,12.4874,46344.0,1026.24,0,0.000136529,3.89106,2351.25,1928.6,6147.21,inf,inf
std,137.708,11013.9,6393.28,0,0.0183057,444.977,360171.0,349893.0,6002.73,,
min,0.0,8.0,0.0,0,0.0,0.0,59.0,0.0,0.0,0.0544814,0.0
25%,0.000207279,39389.0,53.0,0,0.0,1.0,213.0,80.0,3757.81,702472.0,258162.0
50%,0.000254398,47672.5,53.0,0,0.0,1.0,214.0,81.0,5439.91,871172.0,317480.0
75%,0.00031311,54856.0,53.0,0,0.0,2.0,215.0,81.0,7867.46,1085930.0,389516.0
max,3631.38,65536.0,65534.0,0,3.0,176606.0,139969000.0,139802000.0,669885.0,inf,inf


In [88]:
valid_df['label'].value_counts()

flow=To-Background-UDP-CVUT-DNS-Server                              312465
flow=From-Botnet-V51-2-ICMP                                          10415
flow=From-Botnet-V51-3-ICMP                                          10363
flow=Background-UDP-Established                                       8217
flow=From-Botnet-V52-2-ICMP                                           3999
flow=Background-UDP-Attempt                                            690
flow=Background-TCP-Established                                        555
flow=From-Botnet-V53-2-UDP-Attempt                                     292
flow=From-Botnet-V53-2-UDP-Established                                 277
flow=Background                                                        199
flow=From-Botnet-V53-3-UDP-Established                                 194
flow=From-Botnet-V53-3-UDP-Attempt                                     182
flow=From-Botnet-V53-2-TCP-WEB-Established                             118
flow=From-Botnet-V53-3-TC

In [89]:
train_df.loc[train_df['packets_in_Sec'] != np.inf].describe().apply(lambda s: s.apply(lambda x: format(x, 'g')))

Unnamed: 0,duration,src_port,dst_port,srctype_service,dsttype_service,num_total_packets,total_bytes,src_bytes,dir_ratio,packets_in_Sec,bytes_total_in_Sec,Source_Bytes_Sec
count,998351.0,998351.0,998351.0,982560.0,896949.0,998351.0,998351.0,998351.0,998351.0,951287.0,998351.0,998320.0
mean,224.685,36859.3,10220.6,0.036239,0.000719104,65.5725,51248.4,9020.98,0.44412,2223.2,inf,inf
std,714.425,21598.1,1206310.0,2.62732,0.0426888,4977.63,4312870.0,1400310.0,0.272958,12865.1,,
min,0.0,0.0,0.0,0.0,0.0,0.0,58.0,0.0,0.0,0.0,0.0340369,0.0
25%,0.00038774,13363.0,53.0,0.0,0.0,1.0,228.0,77.0,0.260904,0.777223,308.237,79.4737
50%,0.102172,44891.0,80.0,0.0,0.0,2.0,421.0,170.0,0.376812,37.27,16041.6,3719.17
75%,9.08958,54839.0,6878.0,0.0,0.0,11.0,2390.5,1064.0,0.59173,2669.86,704230.0,209602.0
max,3659.68,65537.0,539033000.0,192.0,3.0,1833590.0,1933960000.0,1238500000.0,1.03846,1995530.0,inf,inf
