In [1]:
import os
import pandas as pd
import warnings
import matplotlib.pyplot as plt

import numpy as np
import seaborn as sns

from tqdm import tqdm
from random import randint
from tqdm.auto import tqdm
from scipy.stats import entropy
import time
# Hook tqdm into pandas so progress-bar variants of apply (e.g. `progress_apply`) are available.
tqdm.pandas()

# Default size (in inches) for every matplotlib figure created in this notebook.
plt.rcParams["figure.figsize"] = (20, 16)
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.simplefilter("ignore")



In [2]:
# Which numbered sub-folder of the dataset directory to analyse.
current_dataset = 3
name_directory = "CTU-13-Dataset"

# The working folder is <name_directory>/<current_dataset>; list what it contains.
path = os.path.join(name_directory, f"{current_dataset}")
os.listdir(path)

['botnet-capture-20110812-rbot.pcap', 'capture20110812.binetflow', 'README']

In [3]:
# Names of the files in `path` whose extension is ".binetflow".
[file_name for file_name in os.listdir(path) if file_name.endswith(".binetflow")]

['capture20110812.binetflow']

In [4]:
# Pick the ".binetflow" file of the selected folder and load it as a DataFrame.
# os.listdir returns entries in arbitrary order, so the candidates are sorted to make
# the choice reproducible when more than one file matches.
binetflow_files = sorted(i for i in os.listdir(path) if i.endswith(".binetflow"))
if not binetflow_files:
    # Fail with an explicit message instead of an opaque "list index out of range".
    raise FileNotFoundError(f"No .binetflow file found in {path!r}")

datafile_name = binetflow_files[0]
df = pd.read_csv(os.path.join(path, datafile_name))
df

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
0,2011/08/12 15:25:56.021112,11.337043,tcp,195.68.34.68,52475,->,147.32.86.165,12114,SR_SA,0.0,0.0,11,824,606,flow=Background-TCP-Established
1,2011/08/12 15:29:25.508940,2.962470,tcp,147.32.86.58,1393,->,77.75.73.156,80,SR_A,0.0,0.0,3,182,122,flow=Background-TCP-Attempt
2,2011/08/12 15:30:21.101931,2.962828,tcp,201.54.33.206,2550,->,147.32.86.110,443,S_RA,0.0,0.0,4,240,120,flow=Background-TCP-Attempt
3,2011/08/12 15:37:08.317877,1.986249,tcp,221.134.221.114,8204,->,147.32.84.189,51413,S_RA,0.0,0.0,4,252,132,flow=Background-TCP-Attempt
4,2011/08/12 15:33:53.620500,767.978638,tcp,147.32.84.59,49156,->,147.32.80.7,80,SRPA_FSPA,0.0,0.0,14,3710,774,flow=Background-Established-cmpgw-CVUT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4710633,2011/08/15 10:13:26.395097,0.002210,udp,147.32.85.2,55572,<->,147.32.80.9,53,CON,0.0,0.0,2,505,78,flow=To-Background-UDP-CVUT-DNS-Server
4710634,2011/08/15 10:13:26.408351,0.000312,udp,147.32.85.2,61416,<->,147.32.80.9,53,CON,0.0,0.0,2,369,78,flow=To-Background-UDP-CVUT-DNS-Server
4710635,2011/08/15 10:13:26.412151,0.006138,tcp,147.32.85.2,49590,->,195.113.232.98,80,SPA_SPA,0.0,0.0,9,5206,295,flow=Background-TCP-Established
4710636,2011/08/15 10:13:26.427955,0.000549,udp,89.176.63.143,27520,<->,147.32.84.229,13363,CON,0.0,0.0,2,131,71,flow=Background-UDP-Established


In [5]:
def preprocess_df(df):
    """Fill missing ports and add boolean traffic-class flags derived from ``Label``.

    The frame is modified in place and is also returned, so both
    ``preprocess_df(df)`` and ``df = preprocess_df(df)`` give the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns ``Sport``, ``Dport`` and ``Label``.

    Returns
    -------
    pandas.DataFrame
        The same object, where missing ``Sport``/``Dport`` values are replaced by ``-1``
        and the columns ``Backgroung_label``, ``Normal_label`` and ``Botnet_label`` tell
        whether ``Label`` contains "Background", "Normal" or "Botnet" respectively.
    """
    # Missing ports are replaced by the sentinel -1.
    for port_column in ("Sport", "Dport"):
        df[port_column] = df[port_column].fillna(-1)

    # NOTE: "Backgroung_label" is misspelled, but the name is kept as-is because
    # other cells select the column under exactly this name.
    keyword_by_flag = {
        "Backgroung_label": "Background",
        "Normal_label": "Normal",
        "Botnet_label": "Botnet",
    }
    for flag_column, keyword in keyword_by_flag.items():
        # regex=False: the keyword is a plain substring, not a pattern.
        # na=False: a missing label yields False instead of NaN, so the flag
        # columns stay strictly boolean and can be used directly as masks.
        df[flag_column] = df["Label"].str.contains(keyword, regex=False, na=False)
    return df
# Run the preprocessing and rebind `df` to the returned frame.
df = preprocess_df(df)
# Debug aid (disabled): uncomment to keep only the first 100 rows.
#df = df.iloc[0:100]

#  KGB


The entropy prediction algorithm presented by Lakhina et al. (2005) is based on a PCA-based traffic model similar to that of
Section 3.2.3, but it uses different features. It **aggregates the
traffic from the individual source IP addresses**, but instead of
traffic volumes, it predicts the **entropies of destination IP addresses**, **destination ports** and **source ports** 
over the set of context NetFlows for each source. The context space is
therefore three-dimensional. 

Since the support of these distributions
is discrete and finite, they are simple histograms. Finally,
from these histograms we can easily calculate the entropy
by the usual formula $H(X) = -\sum_{k} p_k \log p_k$,
where $p_k$ denotes the probability of the $k$-th bin.

In [6]:
# Keep the flow endpoints (addresses and ports) together with the raw label
# and the three boolean label flags.
df_for_for_entropies = df[
    [
        "SrcAddr",
        "DstAddr",
        "Sport",
        "Dport",
        "Label",
        "Backgroung_label",
        "Normal_label",
        "Botnet_label",
    ]
]
df_for_for_entropies

Unnamed: 0,SrcAddr,DstAddr,Sport,Dport,Label,Backgroung_label,Normal_label,Botnet_label
0,195.68.34.68,147.32.86.165,52475,12114,flow=Background-TCP-Established,True,False,False
1,147.32.86.58,77.75.73.156,1393,80,flow=Background-TCP-Attempt,True,False,False
2,201.54.33.206,147.32.86.110,2550,443,flow=Background-TCP-Attempt,True,False,False
3,221.134.221.114,147.32.84.189,8204,51413,flow=Background-TCP-Attempt,True,False,False
4,147.32.84.59,147.32.80.7,49156,80,flow=Background-Established-cmpgw-CVUT,True,False,False
...,...,...,...,...,...,...,...,...
4710633,147.32.85.2,147.32.80.9,55572,53,flow=To-Background-UDP-CVUT-DNS-Server,True,False,False
4710634,147.32.85.2,147.32.80.9,61416,53,flow=To-Background-UDP-CVUT-DNS-Server,True,False,False
4710635,147.32.85.2,195.113.232.98,49590,80,flow=Background-TCP-Established,True,False,False
4710636,89.176.63.143,147.32.84.229,27520,13363,flow=Background-UDP-Established,True,False,False


In [10]:
# The entropy has to be measured for each aggregate (one group per source address).
# Original author's note: unsure how exactly this entropy is supposed to be computed.
tmp_mesure = time.time_ns()
base = 10


def entropy_agg(x):
    """Entropy, in logarithm base `base`, of the value frequencies found in `x`."""
    return entropy(x.value_counts(), base=base)


# One row per SrcAddr; each aggregated column is renamed with an "H_" prefix.
df_with_Entropies = (
    df_for_for_entropies
    .groupby(by="SrcAddr")
    .agg({"DstAddr": entropy_agg, "Dport": entropy_agg, "Sport": entropy_agg})
    .rename(columns={"DstAddr": "H_DstAddr", "Dport": "H_Dport", "Sport": "H_Sport"})
)

# Elapsed wall-clock time, converted from nanoseconds to seconds.
elapsed_seconds = (time.time_ns() - tmp_mesure) / (10**9)
print(f"Time required to compute aggregation : {np.round(elapsed_seconds,2)} s")

Time required to compute aggregation : 365.94 s
