In [None]:
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from scapy.all import *
from scapy.layers.inet import TCP,UDP
from sklearn.decomposition import PCA
import tqdm
import pandas as pd



## Importaion des données

In [None]:

file_path = "/media/samamola/Logiciels/DATA/Supervision_systemes/CTU-13-Dataset/1/capture20110810.binetflow"

df = pd.read_csv(file_path,delimiter=',')

## les fonctions

In [None]:

def upload_data(df):
    format_data ={}
    targets=[]
    for ligne in range(df.shape[0]):

        ip_source =  df.SrcAddr[ligne]
        ip_destination = df.DstAddr[ligne]
        drapeaux = df.State[ligne]
        port_source =df.Sport[ligne]
        port_destination = df.Dport[ligne]
        if "Botnet" in df.Label[ligne]:
            label = 1
        else :
            label =0


        targets.append(label)


        if ip_source not in format_data:
                format_data[ip_source] = {"liste_src_P":[], "liste_dest_P":[],"liste_dest_IP":[],"liste_flags":[]}
        format_data[ip_source]["liste_src_P"].append(port_source)
        format_data[ip_source]["liste_dest_P"].append(port_destination)
        format_data[ip_source]["liste_dest_IP"].append(ip_destination)
        format_data[ip_source]["liste_flags"].append(drapeaux)

    return format_data,targets

def calcul_entropies(datas):
     histogramme = {adresse_ip: {"liste_src_P":Counter(dic_list["liste_src_P"]), "liste_dest_P":Counter(dic_list["liste_dest_P"])
                            ,"liste_dest_IP":Counter(dic_list["liste_dest_IP"]),"liste_flags":Counter(dic_list["liste_flags"])}
               for adresse_ip, dic_list in tqdm.tqdm(datas.items())}

     probabilites = {ip_source: {"liste_src_P":{port:count/sum(dic_listip_src["liste_src_P"].values()) for port,count in dic_listip_src["liste_src_P"].items()}
                                , "liste_dest_P":{port:count/sum(dic_listip_src["liste_dest_P"].values()) for port,count in dic_listip_src["liste_dest_P"].items()}
                                ,"liste_dest_IP":{ip_dest:count/sum(dic_listip_src["liste_dest_IP"].values()) for ip_dest,count in dic_listip_src["liste_dest_IP"].items()}
                                ,"liste_flags":{flag:count/sum(dic_listip_src["liste_flags"].values()) for flag,count in dic_listip_src["liste_flags"].items()}}
                    for ip_source, dic_listip_src in tqdm.tqdm(histogramme.items())}

     print("calcul des entropies")
     entropries = {adresse_ip: {"H_src_P":-sum([p * np.log2(p) for p in dic_listip_src["liste_src_P"].values()])
                            ,"H_dest_P": -sum([p * np.log2(p) for p in dic_listip_src["liste_dest_P"].values()])
                            ,"H_dest_IP":-sum([p * np.log2(p) for p in dic_listip_src["liste_dest_IP"].values()])
                            ,"H_flags":-sum([p * np.log2(p) for p in dic_listip_src["liste_flags"].values()])}
                for adresse_ip, dic_listip_src in tqdm.tqdm(probabilites.items())}
     return histogramme,probabilites,entropries


def calcul_scores(k,data_matrix,significant_components,eigenvalues):
    # Projetion des données originales sur les composantes principales significatives
    projected_data_majeur = data_matrix.dot(significant_components.T[:,:-k])
    projected_data_mineur = data_matrix.dot(significant_components.T[:,-k:])


    # Calcul des scores d'anomalie pour chaque observation
    # Division par les carrés des valeurs propres
    eigenvalues_squared_majeur = np.square(eigenvalues)[:-k]
    eigenvalues_squared_mineur = np.square(eigenvalues)[-k:]

    anomaly_scores_majeur = np.sum(np.square(projected_data_majeur) / eigenvalues_squared_majeur, axis=1)
    anomaly_scores_mineur = np.sum(np.square(projected_data_mineur) / eigenvalues_squared_mineur, axis=1)

    return anomaly_scores_majeur,anomaly_scores_mineur

def predire(anomaly_scores_majeur,eigenvalues_squared_mineur,seuil_anomalie_majeur,seuil_anomalie_mineur,entropries,df):
    predictions = np.logical_or(anomaly_scores_majeur > seuil_anomalie_majeur,eigenvalues_squared_mineur > seuil_anomalie_mineur)
    ips = list(entropries.keys())
    labels={ip:0 for ip in ips}
    for prediction_indice in range(len(predictions)):
        if predictions[prediction_indice]:
            key = ips[prediction_indice]
            labels[key] = 1

    netFlows_predict = []
    for netflow_index in range(len(df.Label)):
        netFlows_predict.append(labels[df.SrcAddr[netflow_index]])

    return netFlows_predict


les entêtes : StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label

In [None]:

datas,df["Target"] = upload_data(df)

histogramme,probabilites,entropries = calcul_entropies(datas)

100%|██████████| 542093/542093 [00:04<00:00, 117495.65it/s]
100%|██████████| 542093/542093 [01:06<00:00, 8109.57it/s]  


calcul des entropies


100%|██████████| 542093/542093 [00:03<00:00, 169327.82it/s]


In [None]:
#création de la matrices des entropies

data_matrix=np.array([[entropie_ip_src["H_src_P"],entropie_ip_src["H_dest_P"],entropie_ip_src["H_dest_IP"],entropie_ip_src["H_flags"]]
                      for _,entropie_ip_src in entropries.items()])
data_matrix.reshape((len(entropries.keys()),4))

print("data_matrix.shape: ",data_matrix.shape)

data_matrix.shape:  (542093, 4)


# PCA

In [None]:
# Création de l'objet PCA, possibilités de définir le nombre de composantes désiré en paramètres
pca = PCA()

# Adapter le PCA aux données
pca.fit(data_matrix)

# Obtenir les composantes principales et leurs valeurs propres
principal_components = pca.components_
eigenvalues = pca.explained_variance_

# Sélectionner les composantes principales significatives,le seuil 1e-6 est choisi selon l'article.
significant_components = principal_components[eigenvalues > 1e-6]
eigenvalues = eigenvalues[:significant_components.shape[0]]

print(" significant_components shape: ",significant_components.shape,"; eigenvalues :",eigenvalues.shape)


#calcul des scores
k =1                 #separateur de la variances des composantes majeur et mineur

anomaly_scores_majeur,eigenvalues_squared_mineur = calcul_scores(k,data_matrix,significant_components,eigenvalues)


# Définir un seuil pour classer les observations comme anomalies (il faut avoir les bons seuils)
seuil_anomalie_majeur = 0.5
seuil_anomalie_mineur = 0.5



 significant_components shape:  (4, 4) ; eigenvalues : (4,)


In [None]:

netFlows_predict = predire(anomaly_scores_majeur,eigenvalues_squared_mineur,seuil_anomalie_majeur,seuil_anomalie_mineur,entropries,df)
print("nb netFlows_predict: ",len(netFlows_predict)," ;len(df.Label) :",len(df.Label))

nb netFlows_predict:  2824636  ;len(df.Label) : 2824636


In [None]:

#Calcul des metriques
C_TP=0
C_TN=0
C_FP=0
C_FN=0

for i in range(len(netFlows_predict)):
    if df.Target[i]:
        if netFlows_predict[i]:
            C_TP+=1
        else:
            C_TN+=1

    else:
        if netFlows_predict[i]:
            C_FP+=1
        else:
            C_FN+=1

print("C_TP:",C_TP,"C_FN:",C_FN,"C_TN:",C_TN,"C_FP:",C_FP)

FPR = C_FP/(C_TN+C_FP)


TPR = C_TP/(C_TP+C_FN)

TNR = C_TN/(C_TN+C_FN)

FNR = C_FN/(C_TP+C_FN)

precision =  C_TP/(C_TP + C_FP)

accuracy = (C_TN + C_TN)/(C_TP+C_TN+C_FP+C_FN)

errorRate = (C_FN+C_FP)/(C_TP+C_TN+C_FP+C_FN)

f_measure = 2*precision*TPR/(precision+TPR)


print("FPR :",FPR)
print("TPR :",TPR)
print("TNR :",TNR)
print("FNR :",FNR)
print("precision :",precision)
print("accuracry :",accuracy)
print("error_rate :",errorRate)
print("f_measure : ",f_measure)



C_TP: 40961 C_FN: 709388 C_TN: 0 C_FP: 2074287
FPR : 1.0
TPR : 0.05458926446227022
TNR : 0.0
FNR : 0.9454107355377298
precision : 0.019364632421352012
accuracry : 0.0
error_rate : 0.9854986624825287
f_measure :  0.02858810921424052
