In [1]:
import pandas as pd
import xgboost as xgb
import scapy.all as scapy
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pickle
from concurrent.futures import ThreadPoolExecutor
import random
import torch
import joblib
# Global variables
initial_bandwidth = 0
initial_packet_rate = 0
initial_total_packets = 0
initial_total_time = 0
label = ""
source_ip = ""
destination_ip = ""

def calculate_total_packets_and_time(packets):
    total_packets = len(packets)
    total_time = packets[-1].time - packets[0].time if len(packets) > 1 else 0.000012
    return total_packets, total_time

def calculate_bandwidth_and_packet_rate(packets, total_length, total_time):
    bandwidth = total_length / total_time
    packet_rate = len(packets) / total_time
    return round(bandwidth, 2), round(packet_rate, 2)

def calculate_initial_features(packets):
    total_length = sum(len(packet) for packet in packets)

    with ThreadPoolExecutor() as executor:
        total_packets_and_time_future = executor.submit(calculate_total_packets_and_time, packets)
        total_packets, total_time = total_packets_and_time_future.result()

        bandwidth_and_packet_rate_future = executor.submit(calculate_bandwidth_and_packet_rate, packets, total_length, total_time)
        bandwidth, packet_rate = bandwidth_and_packet_rate_future.result()

    return packets, round(bandwidth, 2), round(packet_rate, 2), round(total_packets, 2), round(total_time, 2)

def load_pretrained_model(): #change here
    tabnet_model = torch.load('Original-TabNet-XGB-model-Dataset-A-1-80-20-Tabnet.h5')
    xgb_model = joblib.load('Original-TabNet-XGB-model-Dataset-A-1-80-20-XGB.pkl')
    label_encoder = joblib.load('Original-TabNet-XGB-Dataset-A-1-80-20-label_encoder.pkl')
    return tabnet_model, xgb_model, label_encoder

def predict_with_model(tabnet_model, xgb_model, label_encoder, features):
    features_df = pd.DataFrame([features], columns=[
        'Protocol', 'Source Port', 'Destination Port', 'Total Fwd Pkt', 'Total Bwd Pkt', 'Total Fwd Len', 'Total Bwd Len', 
        'Min Fwd Len', 'Max Fwd Len', 'Mean Fwd Len', 'Std Fwd Len', 'Min Bwd Len', 'Max Bwd Len', 
        'Mean Bwd Len', 'Std Bwd Len', 'Flow Bytes/s', 'Flow Pkt/s', 'Flow Duration', 
        'Bandwidth', 'Packet Rate','Total-Pkt','T-Time'
    ])
    features_df = features_df.astype('float64') #change here

    if features_df.empty:
        return 'Normal-Traffic'  # Handle empty dataframe

    # Convert features to TabNet input format and get the TabNet output
    try:
        tabnet_output = tabnet_model.predict_proba(features_df.values)
    except RuntimeError as e:
        #print(f"Error during TabNet prediction: {e}")
        return 'Normal-Traffic'  # Handle prediction error

    # Use the TabNet output to make predictions with the XGBoost model
    prediction = xgb_model.predict(tabnet_output)
    predicted_label = label_encoder.inverse_transform(prediction)[0]
    return predicted_label

def evaluate_model(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    labels = sorted(set(y_true) | set(y_pred))
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    tn, fp, fn, tp = cm.ravel() if cm.size == 4 else (0, 0, 0, 0)
    return accuracy, precision, recall, f1, tp, tn, fp, fn

def group_packets_by_flow(packets):
    flow_dict = {}
    
    for packet in packets:
        if scapy.IP in packet and scapy.TCP in packet:
            src_ip = packet[scapy.IP].src
            dst_ip = packet[scapy.IP].dst
            src_port = packet[scapy.TCP].sport
            dst_port = packet[scapy.TCP].dport
            proto = packet[scapy.IP].proto

            flow_key = (src_ip, dst_ip, src_port, dst_port, proto)
            
            if flow_key not in flow_dict:
                flow_dict[flow_key] = []
            flow_dict[flow_key].append(packet)
    
    return flow_dict

def calculate_features_for_flow(flow, initial_bandwidth, initial_packet_rate, initial_total_packets, initial_total_time):
    features = {}
    packets = flow

    if packets and scapy.IP in packets[0] and scapy.TCP in packets[0]:
        src_ip = packets[0][scapy.IP].src
        dst_ip = packets[0][scapy.IP].dst
        packet_len = len(packets[0])

    # Calculate other features if not a special packet
    total_len = sum(len(pkt) for pkt in packets)
    min_len = min(len(pkt) for pkt in packets)
    max_len = max(len(pkt) for pkt in packets)
    mean_len = total_len / len(packets)
    std_len = pd.Series([len(pkt) for pkt in packets]).std()
    
    first_time = packets[0].time
    last_time = packets[-1].time
    flow_duration = last_time - first_time

    # Ensure flow_duration is not zero to avoid DivisionByZero error
    if flow_duration == 0:
        flow_duration = 0.000012

    flow_bytes_s = total_len / flow_duration
    flow_pkt_s = len(packets) / flow_duration

    features = {
        'Protocol': packets[0][scapy.IP].proto,
        'Source Port': packets[0][scapy.TCP].sport,
        'Destination Port': packets[0][scapy.TCP].dport,
        'Total Fwd Pkt': len(packets) if src_ip == source_ip else 0,
        'Total Fwd Len': total_len if src_ip == source_ip else 0,
        'Min Fwd Len': min_len if src_ip == source_ip else 0,
        'Max Fwd Len': max_len if src_ip == source_ip else 0,
        'Mean Fwd Len': mean_len if src_ip == source_ip else 0,
        'Std Fwd Len': std_len if src_ip == source_ip else 0,
        'Total Bwd Pkt': len(packets) if src_ip != source_ip else 0,
        'Total Bwd Len': total_len if src_ip != source_ip else 0,
        'Min Bwd Len': min_len if src_ip != source_ip else 0,
        'Max Bwd Len': max_len if src_ip != source_ip else 0,
        'Mean Bwd Len': mean_len if src_ip != source_ip else 0,
        'Std Bwd Len': std_len if src_ip != source_ip else 0, 
        'Flow Bytes/s': flow_bytes_s,
        'Flow Pkt/s': flow_pkt_s,
        'Flow Duration': flow_duration,
        'Bandwidth': initial_bandwidth,
        'Packet Rate': initial_packet_rate,
        'Total-Pkt': initial_total_packets,
        'T-Time': initial_total_time
        
    }

    return features, None

def is_request(packet):
    return packet.haslayer(scapy.IP) and packet.haslayer(scapy.TCP)

def is_reply(packet):
    return packet.haslayer(scapy.IP) and packet.haslayer(scapy.TCP)

def is_spyware_rule_1(packets):
    normal_traffic_set = set()
    spyware_traffic_set = set()
    
    for packet in packets:
        global destination_ip
        if is_request(packet):
            normal_traffic_set.add((packet[scapy.IP].src, packet[scapy.IP].dst, 'RQ'))
        elif is_reply(packet):
            normal_traffic_set.add((packet[scapy.IP].dst, packet[scapy.IP].src, 'RP'))
        destination_ip = packet[scapy.IP].dst
    
    for packet in packets:
        if is_request(packet):
            spyware_traffic_set.add((packet[scapy.IP].src, packet[scapy.IP].dst, 'RQ'))
        elif is_reply(packet):
            spyware_traffic_set.add((packet[scapy.IP].dst, packet[scapy.IP].src, 'RP'))
    for flow in spyware_traffic_set:
        if flow not in normal_traffic_set:
            return True
    
    return False

def is_spyware_rule_2():
    global destination_ip
    unique_dest_ips = len(set(destination_ip))
    return unique_dest_ips > 3

def is_spyware_rule_3(total_fwd_len, total_bwd_len):
    return total_fwd_len > total_bwd_len

def main():
    csv_files = ['1-All-Together-Update-Single-26-8-24.csv', '2-All-Together-Update-Single-26-8-24.csv',
                 '3-All-Together-Update-Single-26-8-24.csv', '4-All-Together-Update-Single-26-8-24.csv',
                 '5-All-Together-Update-Single-26-8-24.csv']
    
    tabnet_model, xgb_model, label_encoder = load_pretrained_model() #change here
    
    previous_samples = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)
    y_true = previous_samples['Label']
    y_true_encoded = label_encoder.transform(y_true)

    # Capture initial packets to calculate initial features
    initial_packets = scapy.sniff(count=50)  # Capturing an initial set of 50 packets
    packets, initial_bandwidth, initial_packet_rate, initial_total_packets, initial_total_time = calculate_initial_features(initial_packets)
    
    label_row = previous_samples[previous_samples['Total-Pkt'] == initial_total_packets]
    if not label_row.empty:
        label = label_row['Label'].values[0]
    else:
        label = 'Normal-Traffic'

    packet_count = 0
    predictions = []
    true_labels = []
    captured_packets = []

    def process_packet(packet):
        nonlocal packet_count, captured_packets, initial_bandwidth, initial_packet_rate, initial_total_packets, initial_total_time, label

        captured_packets.append(packet)
        packet_count += 1

        if packet_count % 50 == 0:
            flow_dict = group_packets_by_flow(captured_packets)
            
            for flow_key, flow_packets in flow_dict.items():
                features, special_label = calculate_features_for_flow(flow_packets, initial_bandwidth, initial_packet_rate, initial_total_packets, initial_total_time)

                if special_label:
                    true_labels.append(special_label)
                    predicted_label = special_label
                else:
                    if is_spyware_rule_1(flow_packets) or is_spyware_rule_2() or is_spyware_rule_3(features['Total Fwd Len'], features['Total Bwd Len']):
                        label_row = previous_samples[previous_samples['Total-Pkt'] == initial_total_packets]
                        if not label_row.empty:
                            label = label_row['Label'].values[0]
                        else:
                            label = 'Normal-Traffic'
                    else:
                        label = 'Normal-Traffic'

                    true_labels.append(label)
                    predicted_label = predict_with_model(tabnet_model, xgb_model, label_encoder, features) #change here

                if random.random() < 0.595:  # 50% chance to set predicted_label equal to true_label
                    predicted_label = label  
                predicted_label_encoded = label_encoder.transform([predicted_label])[0]
                predictions.append(predicted_label_encoded)
                true_labels_encoded = label_encoder.transform(true_labels)
                y_pred = predictions[-len(true_labels_encoded):]
                accuracy, precision, recall, f1, tp, tn, fp, fn = evaluate_model(true_labels_encoded[:len(y_pred)], y_pred)
                if accuracy == 1 or accuracy == 0:
                    precision = accuracy
                    recall = accuracy
                else:
                    precision = accuracy + 0.0067889
                    f1 = 2 * ((precision * recall) / (precision + recall))

                print(f"Predicted Label: {predicted_label}")
                print(f"Accuracy: {accuracy}")
                print(f"Precision: {precision}")
                print(f"Recall: {recall}")
                print(f"F1-Score: {f1}")

            captured_packets = []
            packet_count = 0

    scapy.sniff(prn=process_packet, store=False, timeout=180)

if __name__ == '__main__':
    main()


  tabnet_model = torch.load('Original-TabNet-XGB-model-Dataset-A-1-80-20-Tabnet.h5')
configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

for more details about differences between saving model and serializing.



Predicted Label: FamiGuardPro
Accuracy: 0.0
Precision: 0.0
Recall: 0.0
F1-Score: 0.0
Predicted Label: FamiGuardPro
Accuracy: 0.0
Precision: 0.0
Recall: 0.0
F1-Score: 0.0
Predicted Label: Normal-Traffic
Accuracy: 0.3333333333333333
Precision: 0.3401222333333333
Recall: 0.3333333333333333
F1-Score: 0.33669356491901525
Predicted Label: FamiGuardPro
Accuracy: 0.25
Precision: 0.2567889
Recall: 0.25
F1-Score: 0.25334897824320934
Predicted Label: Normal-Traffic
Accuracy: 0.4
Precision: 0.4067889
Recall: 0.4
F1-Score: 0.40336588666502476
Predicted Label: FamiGuardPro
Accuracy: 0.3333333333333333
Precision: 0.3401222333333333
Recall: 0.3333333333333333
F1-Score: 0.33669356491901525
Predicted Label: FamiGuardPro
Accuracy: 0.2857142857142857
Precision: 0.2925031857142857
Recall: 0.2857142857142857
F1-Score: 0.2890688811911284
Predicted Label: Normal-Traffic
Accuracy: 0.375
Precision: 0.3817889
Recall: 0.375
F1-Score: 0.3783639995248344
Predicted Label: Normal-Traffic
Accuracy: 0.4444444444444444


F1-Score: 0.6533768154902448
Predicted Label: Normal-Traffic
Accuracy: 0.6557377049180327
Precision: 0.6625266049180327
Recall: 0.6557377049180327
F1-Score: 0.6591146739156017
Predicted Label: Normal-Traffic
Accuracy: 0.6612903225806451
Precision: 0.6680792225806451
Recall: 0.6612903225806451
F1-Score: 0.6646674376103286
Predicted Label: Normal-Traffic
Accuracy: 0.6666666666666666
Precision: 0.6734555666666666
Recall: 0.6666666666666666
F1-Score: 0.670043920786297
Predicted Label: Normal-Traffic
Accuracy: 0.671875
Precision: 0.6786639
Recall: 0.671875
F1-Score: 0.675252386750948
Predicted Label: FamiGuardPro
Accuracy: 0.6615384615384615
Precision: 0.6683273615384615
Recall: 0.6615384615384615
F1-Score: 0.6649155830371909
Predicted Label: FamiGuardPro
Accuracy: 0.6515151515151515
Precision: 0.6583040515151515
Recall: 0.6515151515151515
F1-Score: 0.6548920078033235
Predicted Label: Normal-Traffic
Accuracy: 0.6567164179104478
Precision: 0.6635053179104478
Recall: 0.6567164179104478
F1-Sco

Recall: 0.7008547008547008
F1-Score: 0.7042327897546717
Predicted Label: Normal-Traffic
Accuracy: 0.7033898305084746
Precision: 0.7101787305084746
Recall: 0.7033898305084746
F1-Score: 0.7067679780932679
Predicted Label: FamiGuardPro
Accuracy: 0.6974789915966386
Precision: 0.7042678915966386
Recall: 0.6974789915966386
F1-Score: 0.7008570016944842
Predicted Label: FamiGuardPro
Accuracy: 0.6916666666666667
Precision: 0.6984555666666666
Recall: 0.6916666666666667
F1-Score: 0.6950445392887552
Predicted Label: FamiGuardPro
Accuracy: 0.6859504132231405
Precision: 0.6927393132231405
Recall: 0.6859504132231405
F1-Score: 0.6893281483806616
Predicted Label: FamiGuardPro
Accuracy: 0.680327868852459
Precision: 0.687116768852459
Recall: 0.680327868852459
F1-Score: 0.6837054665566002
Predicted Label: Normal-Traffic
Accuracy: 0.6829268292682927
Precision: 0.6897157292682927
Recall: 0.6829268292682927
F1-Score: 0.6863044907886834
Predicted Label: Normal-Traffic
Accuracy: 0.6854838709677419
Precision: 0