In [13]:
import os
import pandas as pd
import warnings
import matplotlib.pyplot as plt

import numpy as np
import seaborn as sns

from tqdm import tqdm
from random import randint
from tqdm.auto import tqdm
from scipy.stats import entropy
import time
from sklearn.decomposition import PCA
# Register tqdm's pandas integration; `tqdm` here is the one imported last (from tqdm.auto).
tqdm.pandas()

# Default figure size applied to every matplotlib figure created afterwards.
plt.rcParams["figure.figsize"] = (20, 16)
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below — consider a narrower filter.
warnings.simplefilter("ignore")



In [2]:
# Number of the scenario sub-folder to analyse inside `name_directory`.
current_dataset = 3
name_directory = "CTU-13-Dataset"
# Relative path: the notebook has to be started from the folder that contains `name_directory`.
path = os.path.join(name_directory,str(current_dataset))
# Display the files available for this scenario.
os.listdir(path)

['botnet-capture-20110812-rbot.pcap', 'capture20110812.binetflow', 'README']

In [3]:
# Keep only the files of the scenario folder whose name ends with ".binetflow".
[i for i in os.listdir(path) if i.endswith(".binetflow") ]

['capture20110812.binetflow']

In [40]:
# Take the first ".binetflow" file returned by os.listdir; raises IndexError when the folder has none.
datafile_name = [i for i in os.listdir(path) if i.endswith(".binetflow") ][0]
# Load the whole capture into memory.
df = pd.read_csv(os.path.join(path,datafile_name))
df

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
0,2011/08/12 15:25:56.021112,11.337043,tcp,195.68.34.68,52475,->,147.32.86.165,12114,SR_SA,0.0,0.0,11,824,606,flow=Background-TCP-Established
1,2011/08/12 15:29:25.508940,2.962470,tcp,147.32.86.58,1393,->,77.75.73.156,80,SR_A,0.0,0.0,3,182,122,flow=Background-TCP-Attempt
2,2011/08/12 15:30:21.101931,2.962828,tcp,201.54.33.206,2550,->,147.32.86.110,443,S_RA,0.0,0.0,4,240,120,flow=Background-TCP-Attempt
3,2011/08/12 15:37:08.317877,1.986249,tcp,221.134.221.114,8204,->,147.32.84.189,51413,S_RA,0.0,0.0,4,252,132,flow=Background-TCP-Attempt
4,2011/08/12 15:33:53.620500,767.978638,tcp,147.32.84.59,49156,->,147.32.80.7,80,SRPA_FSPA,0.0,0.0,14,3710,774,flow=Background-Established-cmpgw-CVUT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4710633,2011/08/15 10:13:26.395097,0.002210,udp,147.32.85.2,55572,<->,147.32.80.9,53,CON,0.0,0.0,2,505,78,flow=To-Background-UDP-CVUT-DNS-Server
4710634,2011/08/15 10:13:26.408351,0.000312,udp,147.32.85.2,61416,<->,147.32.80.9,53,CON,0.0,0.0,2,369,78,flow=To-Background-UDP-CVUT-DNS-Server
4710635,2011/08/15 10:13:26.412151,0.006138,tcp,147.32.85.2,49590,->,195.113.232.98,80,SPA_SPA,0.0,0.0,9,5206,295,flow=Background-TCP-Established
4710636,2011/08/15 10:13:26.427955,0.000549,udp,89.176.63.143,27520,<->,147.32.84.229,13363,CON,0.0,0.0,2,131,71,flow=Background-UDP-Established


In [42]:
def preprocess_df(df):
    """Prepare a raw flow frame for the detectors.

    The frame is modified in place and the very same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Sport``, ``Dport``, ``Label`` and
        ``StartTime``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with missing ports replaced by ``-1``, three boolean
        flag columns derived from ``Label`` and ``StartTime`` parsed by
        ``pandas.to_datetime``.
    """
    # Missing ports get the sentinel -1 so that they still count as a value
    # when the ports are tallied later on.
    for port_column in ("Sport", "Dport"):
        df[port_column] = df[port_column].fillna(-1)

    # Flag column -> substring looked up in the label. "Backgroung_label"
    # keeps its historical spelling because other cells select it by name.
    flag_keywords = {
        "Backgroung_label": "Background",
        "Normal_label": "Normal",
        "Botnet_label": "Botnet",
    }
    labels = df["Label"]
    for flag_column, keyword in flag_keywords.items():
        df[flag_column] = labels.str.contains(keyword)

    df["StartTime"] = pd.to_datetime(df["StartTime"])
    return df
# Rebind `df` to the preprocessed frame (preprocess_df edits its argument in place and returns it).
df = preprocess_df(df)
# Uncomment to work on the first 100 rows only while debugging.
#df = df.iloc[0:100]

#  KGB


The entropy prediction algorithm presented by Lakhina et al. (2005) is based on a PCA-based traffic model similar to the one in
Section 3.2.3, but it uses different features. It **aggregates the
traffic from the individual source IP addresses**, but instead of
traffic volumes, it predicts the **entropies of destination IP addresses**, **destination ports** and **source ports**
over the set of context NetFlows for each source. The context space is
therefore three-dimensional.

Since the support of these distributions
is discrete and finite, they are simple histograms. Finally,
from these histograms we can easily calculate the entropy
by the usual formula $H(x) = -\sum_{k} p_k \log p_k$,
where $p_k$ denotes the probability of the $k$-th bin.

Unnamed: 0,SrcAddr,DstAddr,Sport,Dport,Label,Backgroung_label,Normal_label,Botnet_label
0,195.68.34.68,147.32.86.165,52475,12114,flow=Background-TCP-Established,True,False,False
1,147.32.86.58,77.75.73.156,1393,80,flow=Background-TCP-Attempt,True,False,False
2,201.54.33.206,147.32.86.110,2550,443,flow=Background-TCP-Attempt,True,False,False
3,221.134.221.114,147.32.84.189,8204,51413,flow=Background-TCP-Attempt,True,False,False
4,147.32.84.59,147.32.80.7,49156,80,flow=Background-Established-cmpgw-CVUT,True,False,False
...,...,...,...,...,...,...,...,...
4710633,147.32.85.2,147.32.80.9,55572,53,flow=To-Background-UDP-CVUT-DNS-Server,True,False,False
4710634,147.32.85.2,147.32.80.9,61416,53,flow=To-Background-UDP-CVUT-DNS-Server,True,False,False
4710635,147.32.85.2,195.113.232.98,49590,80,flow=Background-TCP-Established,True,False,False
4710636,89.176.63.143,147.32.84.229,27520,13363,flow=Background-UDP-Established,True,False,False


In [10]:
def initial_context(df, verbose=True):
    """Compute one entropy triple per source address over the whole frame.

    For every distinct ``SrcAddr`` the base-10 entropy of the empirical
    distribution of its destination addresses, destination ports and source
    ports is computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Flows with at least the columns ``SrcAddr``, ``DstAddr``, ``Sport``
        and ``Dport``.
    verbose : bool, default True
        Print the wall-clock time spent in the aggregation.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``SrcAddr`` with the columns ``H_DstAddr``, ``H_Dport``
        and ``H_Sport``.
    """
    # Only the grouping key and the three aggregated columns are needed.
    flows = df[["SrcAddr", "DstAddr", "Sport", "Dport"]]
    tmp_mesure = time.time_ns()
    base = 10

    def entropy_agg(values):
        # value_counts() is the histogram; scipy normalises it to probabilities.
        return entropy(values.value_counts(), base=base)

    df_with_Entropies = (
        flows.groupby(by="SrcAddr")
        .agg({"DstAddr": entropy_agg, "Dport": entropy_agg, "Sport": entropy_agg})
        .rename(columns={"DstAddr": "H_DstAddr", "Dport": "H_Dport", "Sport": "H_Sport"})
    )

    if verbose:
        print(f"Time required to compute aggregation : {np.round((time.time_ns() - tmp_mesure)/(10**9),2)} s")
    return df_with_Entropies

Time required to compute aggregation : 365.94 s




- For Lakhina Entropies

The complexity of PCT depends cubically on the
dimension of the feature vector $x$. Consequently,
the calculation of the PCT transformation might be
infeasible, or by the time it has been calculated, it will
no longer be relevant.

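To address these issues, [8] simplified Lakhina’s detectors as follows: (a) it uses statistics aggregated only by
source IP addresses, and (b) it uses only the latest five time
windows to build the model<sup>1</sup>. The feature vector in [8]
stacks the three entropies of the five windows; in the column order used by `compute_contexte_vector` in this notebook it equals

$$x^{t}(\iota) = \left(H^{t-4}_{dIP},\, H^{t-4}_{dPrt},\, H^{t-4}_{sPrt},\, \ldots,\, H^{t}_{dIP},\, H^{t}_{dPrt},\, H^{t}_{sPrt}\right)$$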

- For KGB

All entropies are calculated from all flows
observed during 5-minute long time windows.

The vector $x^{t}(\iota)$ effectively describes the traffic of user $\iota$ at five
consecutive time windows $t-4, \ldots, t$. The entropies are
calculated iff the number of flows associated with the user
$\iota$ is higher than 1 (if there is only one flow originating at
a given source IP, or ending at a given destination IP, then all
entropies are equal to zero).

=> **TODO (open question):** is the vector of size 3 or 15 (5 × 3, with the entropy measured over each minute)?

For numerical stability, all vectors
$y_j$ corresponding to eigenvalues $\lambda_j$ smaller than $10^{-6}$ are
discarded.

dtype('O')

In [112]:
def compute_contexte_vector(df, window_time = 5, verbose = False, fillna= True):
    """Build the per-source entropy context over the last minutes of ``df``.

    The last ``window_time`` minutes of the capture (counted back from the
    latest ``StartTime``) are cut into one-minute bins. For every bin and
    every source address, the base-10 entropy of the destination addresses,
    destination ports and source ports seen in that bin is computed.

    NOTE(review): the notes in this notebook describe 5-minute windows, while
    this function uses 1-minute bins — confirm which one is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Flows with the columns ``StartTime`` (datetime), ``SrcAddr``,
        ``DstAddr``, ``Sport`` and ``Dport``.
    window_time : int, default 5
        Length of the analysed period in minutes, and therefore the number of
        one-minute bins.
    verbose : bool, default False
        Print the wall-clock time spent in the aggregation.
    fillna : bool, default True
        Replace the entropies of (source, bin) pairs without any flow by 0.

    Returns
    -------
    pandas.DataFrame
        One row per source address seen in the analysed period and three
        columns per bin, named ``H_DstAddr_t_<n>``, ``H_Dport_t_<n>`` and
        ``H_Sport_t_<n>``, where ``n = 0`` is the most recent bin.
    """
    # Only the timestamp, the grouping key and the aggregated columns are used.
    flows = df[["StartTime", "SrcAddr", "DstAddr", "Sport", "Dport"]]

    last_time = flows.StartTime.max()
    time_min = last_time - pd.Timedelta(f"{window_time} minutes")
    flows = flows[flows.StartTime >= time_min]

    tmp_mesure = time.time_ns()
    base = 10
    entropy_agg = lambda x: entropy(x.value_counts(), base=base)
    # Start from every source of the period so that sources that are silent
    # in some bins still get a row.
    futur_df = pd.DataFrame({"SrcIP": flows.SrcAddr.unique()}).set_index("SrcIP")

    one_minute = pd.Timedelta(f"{1} minutes")
    for i in range(window_time):
        start = time_min + i * one_minute
        end = start + one_minute
        in_bin = flows.StartTime >= start
        if i == window_time - 1:
            # The last bin ends exactly at the latest timestamp; close it on
            # the right, otherwise the most recent flow(s) are never counted.
            in_bin &= flows.StartTime <= end
        else:
            in_bin &= flows.StartTime < end
        current_minute = flows[in_bin]

        id_name = window_time - i - 1
        df_with_Entropies = (
            current_minute.groupby(by="SrcAddr")
            .agg({"DstAddr": entropy_agg, "Dport": entropy_agg, "Sport": entropy_agg})
            .rename(columns={"DstAddr": f"H_DstAddr_t_{id_name}",
                             "Dport": f"H_Dport_t_{id_name}",
                             "Sport": f"H_Sport_t_{id_name}"})
        )
        futur_df = futur_df.merge(df_with_Entropies, left_index=True, right_index=True, how="outer")

    if verbose:
        print(f"Time required to compute aggregation : {np.round((time.time_ns() - tmp_mesure)/(10**9),2)} s")
    # A single flow has zero entropy, so a source needs at least two flows in
    # a bin to produce a non-zero value there.
    if fillna:
        futur_df = futur_df.fillna(0)

    return futur_df


# Entropy context of the last five minutes of the capture, one row per source address.
context = compute_contexte_vector(df, window_time = 5)
context

Unnamed: 0,H_DstAddr_t_4,H_Dport_t_4,H_Sport_t_4,H_DstAddr_t_3,H_Dport_t_3,H_Sport_t_3,H_DstAddr_t_2,H_Dport_t_2,H_Sport_t_2,H_DstAddr_t_1,H_Dport_t_1,H_Sport_t_1,H_DstAddr_t_0,H_Dport_t_0,H_Sport_t_0
1.168.194.180,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.225.140.228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.245.52.153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.52.21.187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.53.103.177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99.57.141.137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99.61.190.19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99.8.149.215,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99.90.228.171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [113]:
def compute_pca_over_vector(df, drop_limit = 10**-6):
    """Fit a PCA on the context vectors and keep the numerically stable part.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per observation (here: per source address), one column per
        feature. Its index is reused for the returned frame.
    drop_limit : float, default 1e-6
        Components whose eigenvalue is smaller than this value are discarded.

    Returns
    -------
    new_df : pandas.DataFrame
        Projection of ``df`` on the retained components, with the columns
        ``component_0`` … ``component_{m-1}`` and the index of ``df``.
    eigen_values_vectors : list of (float, numpy.ndarray)
        ``(eigenvalue, eigenvector)`` pairs of the retained components, in
        the order given by ``pca.components_``.
    """
    pca = PCA()
    projected = pca.fit_transform(df)
    cov_matrix = pca.get_covariance()

    eigen_values_vectors = []
    for eigenvector in pca.components_:
        # Variance of the data along this axis: v^T C v.
        eigen_value = np.dot(eigenvector.T, np.dot(cov_matrix, eigenvector))
        if eigen_value < drop_limit:
            # scikit-learn sorts the components by decreasing explained
            # variance, so every following one is below the limit too.
            # Test BEFORE appending: the component that crosses the limit
            # must itself be dropped.
            break
        eigen_values_vectors.append((eigen_value, eigenvector))

    kept = len(eigen_values_vectors)
    new_df = pd.DataFrame(
        projected[:, :kept],
        index=df.index,
        columns=[f"component_{j}" for j in range(kept)],
    )
    return new_df, eigen_values_vectors
           
   
# Project the context on its principal components and keep the (eigenvalue, eigenvector) pairs.
new_components, eigen_values_vectors = compute_pca_over_vector(context)
new_components

Unnamed: 0,component_0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,component_11,component_12,component_13,component_14
1.168.194.180,-0.053876,0.002077,0.001034,0.001713,-0.000755,-0.000273,0.001249,0.002213,-0.000024,-0.000157,0.000114,-0.000444,0.000065,-0.000363,0.000347
1.225.140.228,-0.053876,0.002077,0.001034,0.001713,-0.000755,-0.000273,0.001249,0.002213,-0.000024,-0.000157,0.000114,-0.000444,0.000065,-0.000363,0.000347
1.245.52.153,-0.053876,0.002077,0.001034,0.001713,-0.000755,-0.000273,0.001249,0.002213,-0.000024,-0.000157,0.000114,-0.000444,0.000065,-0.000363,0.000347
1.52.21.187,-0.053876,0.002077,0.001034,0.001713,-0.000755,-0.000273,0.001249,0.002213,-0.000024,-0.000157,0.000114,-0.000444,0.000065,-0.000363,0.000347
1.53.103.177,-0.053876,0.002077,0.001034,0.001713,-0.000755,-0.000273,0.001249,0.002213,-0.000024,-0.000157,0.000114,-0.000444,0.000065,-0.000363,0.000347
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99.57.141.137,-0.053876,0.002077,0.001034,0.001713,-0.000755,-0.000273,0.001249,0.002213,-0.000024,-0.000157,0.000114,-0.000444,0.000065,-0.000363,0.000347
99.61.190.19,-0.053876,0.002077,0.001034,0.001713,-0.000755,-0.000273,0.001249,0.002213,-0.000024,-0.000157,0.000114,-0.000444,0.000065,-0.000363,0.000347
99.8.149.215,-0.053876,0.002077,0.001034,0.001713,-0.000755,-0.000273,0.001249,0.002213,-0.000024,-0.000157,0.000114,-0.000444,0.000065,-0.000363,0.000347
99.90.228.171,-0.053876,0.002077,0.001034,0.001713,-0.000755,-0.000273,0.001249,0.002213,-0.000024,-0.000157,0.000114,-0.000444,0.000065,-0.000363,0.000347


In [114]:
# First (eigenvalue, eigenvector) pair returned by compute_pca_over_vector.
eigen_values_vectors[0]

(0.11806604884443984,
 array([0.19458421, 0.12555353, 0.40595001, 0.18387995, 0.12263127,
        0.41363172, 0.18280745, 0.11749716, 0.41254603, 0.18036881,
        0.12928233, 0.34936984, 0.17976737, 0.12841617, 0.35228528]))

In [137]:
def compute_anomaly_score(row, eigen, major_components = True,k=1):
    """Score one context vector against a set of principal components.

    The score is the sum, over the selected components, of
    ``(eigenvector · row / eigenvalue) ** 2``.

    NOTE(review): the projection is divided by the eigenvalue *before* being
    squared and ``row`` is not mean-centred; the usual PCA distance is
    ``(v · (x - mean)) ** 2 / eigenvalue`` — confirm against the reference
    paper before changing it, since the thresholds used in this notebook
    depend on the current formula.

    Parameters
    ----------
    row : pandas.Series
        Feature vector, same length as the eigenvectors.
    eigen : sequence of (float, array-like)
        ``(eigenvalue, eigenvector)`` pairs ordered from the largest to the
        smallest eigenvalue.
    major_components : bool, default True
        ``True`` uses the first ``k`` components, ``False`` uses every
        component after the ``k``-th one.
    k : int, default 1
        Number of components regarded as "major".

    Returns
    -------
    float
        The anomaly score; ``0`` when no component is selected.
    """
    # Never index past the components that are actually available: the PCA
    # step may have discarded some, so len(eigen) can be smaller than len(row).
    available = len(eigen)
    if major_components:
        first, last = 0, min(k, available)
    else:
        first, last = k, available

    values = np.asarray(row.values)
    value_sum = 0
    for e_val, e_vec in eigen[first:last]:
        value_sum += (np.dot(e_vec, values) / e_val) ** 2

    return value_sum
# Sanity check on the first source address, using the components after the first one (k defaults to 1).
compute_anomaly_score(context.iloc[0], eigen_values_vectors,major_components = False)

0.0

In [138]:
# Score every row of `context` (one per source address) on the first component (k defaults to 1).
res = context.apply(lambda x: compute_anomaly_score(x,eigen_values_vectors,major_components = True  ),
                                        axis = 1)
res

1.168.194.180    0.0
1.225.140.228    0.0
1.245.52.153     0.0
1.52.21.187      0.0
1.53.103.177     0.0
                ... 
99.57.141.137    0.0
99.61.190.19     0.0
99.8.149.215     0.0
99.90.228.171    0.0
99.93.142.120    0.0
Length: 3885, dtype: float64

In [139]:
#res.sort_values(ascending=False)

147.32.84.59      3309.609443
147.32.84.118     2295.319393
147.32.84.138     2121.861395
147.32.86.20      1440.807101
147.32.85.60      1384.136778
                     ...     
195.204.70.125       0.000000
195.206.42.216       0.000000
195.208.15.12        0.000000
195.211.218.1        0.000000
99.93.142.120        0.000000
Length: 3885, dtype: float64

In [140]:
#df[df.SrcAddr=="147.32.84.59"]

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,Backgroung_label,Normal_label,Botnet_label
4,2011-08-12 15:33:53.620500,767.978638,tcp,147.32.84.59,49156,->,147.32.80.7,80,SRPA_FSPA,0.0,0.0,14,3710,774,flow=Background-Established-cmpgw-CVUT,True,False,False
5,2011-08-12 15:33:54.633135,767.104309,tcp,147.32.84.59,49158,->,213.199.181.90,80,SRPA_FSPA,0.0,0.0,8,900,397,flow=Background-Established-cmpgw-CVUT,True,False,False
6,2011-08-12 15:34:21.307147,752.838135,tcp,147.32.84.59,49271,->,205.188.93.197,80,FSA_FSA,0.0,0.0,6,366,246,flow=Background-Established-cmpgw-CVUT,True,False,False
7,2011-08-12 15:34:21.354417,753.096497,tcp,147.32.84.59,49274,->,64.12.152.17,80,FSA_FSA,0.0,0.0,6,366,246,flow=Background-Established-cmpgw-CVUT,True,False,False
8,2011-08-12 15:34:21.354648,753.096680,tcp,147.32.84.59,49275,->,64.12.152.17,80,FSA_FSA,0.0,0.0,6,366,246,flow=Background-Established-cmpgw-CVUT,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4710618,2011-08-15 10:13:26.030100,0.000000,udp,147.32.84.59,59975,->,118.171.121.227,27529,INT,0.0,,1,177,177,flow=Background-Attempt-cmpgw-CVUT,True,False,False
4710620,2011-08-15 10:13:26.061247,0.192811,udp,147.32.84.59,15141,<->,66.205.136.89,59005,CON,0.0,0.0,2,136,75,flow=Background-Established-cmpgw-CVUT,True,False,False
4710627,2011-08-15 10:13:26.203065,0.000289,udp,147.32.84.59,48491,<->,147.32.80.9,53,CON,0.0,0.0,2,218,74,flow=To-Background-UDP-CVUT-DNS-Server,True,False,False
4710628,2011-08-15 10:13:26.203784,0.000367,udp,147.32.84.59,46298,<->,147.32.80.9,53,CON,0.0,0.0,2,400,74,flow=To-Background-UDP-CVUT-DNS-Server,True,False,False


In [141]:
#df[df.SrcAddr=="147.32.84.118"].sort_values("StartTime")

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,Backgroung_label,Normal_label,Botnet_label
7064,2011-08-12 15:26:43.329933,0.000000,icmp,147.32.84.118,0x0303,->,189.70.228.171,0xe11a,URP,0.0,,1,135,135,flow=Background,True,False,False
20359,2011-08-12 15:32:24.505560,3195.006104,udp,147.32.84.118,123,<->,93.99.64.165,123,CON,0.0,0.0,8,720,360,flow=Background-UDP-NTP-Established-1,True,False,False
23610,2011-08-12 15:33:43.505496,3195.008789,udp,147.32.84.118,123,<->,193.85.174.5,123,CON,0.0,0.0,8,720,360,flow=Background-UDP-NTP-Established-1,True,False,False
44134,2011-08-12 15:44:02.858700,0.000000,icmp,147.32.84.118,0x0303,->,78.43.155.93,0xe11a,URP,0.0,,1,135,135,flow=Background,True,False,False
47863,2011-08-12 15:46:01.916134,0.000298,udp,147.32.84.118,35094,<->,147.32.80.9,53,CON,0.0,0.0,2,206,78,flow=To-Background-UDP-CVUT-DNS-Server,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4710567,2011-08-15 10:13:24.803141,0.000000,udp,147.32.84.118,1150,->,114.32.141.22,16881,INT,0.0,,1,107,107,flow=Background-UDP-Attempt,True,False,False
4710582,2011-08-15 10:13:25.409170,0.000000,tcp,147.32.84.118,4330,->,31.9.52.56,6881,S_,0.0,,1,62,62,flow=Background-TCP-Attempt,True,False,False
4710591,2011-08-15 10:13:25.589784,0.018220,udp,147.32.84.118,1150,<->,91.121.96.135,48205,CON,0.0,0.0,2,215,107,flow=Background-UDP-Established,True,False,False
4710597,2011-08-15 10:13:25.763060,0.000000,icmp,147.32.84.118,0x0303,->,68.100.215.199,0x7e04,URP,0.0,,1,173,173,flow=Background,True,False,False


In [145]:
def proceed_KGB(df, anomaly_threshold, window_time = 5, major_components = True ) :
    """Run the whole detector and list the source addresses flagged as anomalous.

    Chains ``compute_contexte_vector``, ``compute_pca_over_vector`` and
    ``compute_anomaly_score`` (with its default ``k``).

    Parameters
    ----------
    df : pandas.DataFrame
        Preprocessed flows, as expected by ``compute_contexte_vector``.
    anomaly_threshold : float
        A source is reported when its score is strictly greater than this.
    window_time : int, default 5
        Forwarded to ``compute_contexte_vector``.
    major_components : bool, default True
        Forwarded to ``compute_anomaly_score``.

    Returns
    -------
    list
        Index labels of the context rows (source addresses) whose score
        exceeds ``anomaly_threshold``, in the order of the context index.
    """
    context = compute_contexte_vector(df, window_time = window_time)

    # The projected frame is not needed here, only the eigen pairs.
    _, eigen_values_vectors = compute_pca_over_vector(context)
    scores = context.apply(
        lambda x: compute_anomaly_score(x, eigen_values_vectors, major_components = major_components),
        axis = 1,
    )

    # Series.iteritems() was removed in pandas 2.0; items() is the supported
    # spelling and yields the same (label, value) pairs.
    return [index for index, score in scores.items() if score > anomaly_threshold]



In [146]:
# Source addresses whose score exceeds 2000 (threshold presumably picked by hand for this capture — TODO confirm).
proceed_KGB(df, 2000 ) 

['147.32.84.118', '147.32.84.138', '147.32.84.59']