# I2P k-Means-Algorithmus
Auf den aufgezeichneten Netzwerkverkehr (PCAP-Datei) wird ein k-Means-Algorithmus angewendet.

In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scapy.all import rdpcap, IP, TCP, UDP, Raw
from scipy.stats import entropy
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
# Endpoints used by filter_packets(): packets must be addressed to target_ip
# and, while filter_with_source_ip is enabled, must come from source_ip.
source_ip = '10.8.0.2'
target_ip = '10.8.0.11'
# Read the capture file with scapy; the result is iterated by filter_packets().
packets = rdpcap('traffic_with_filter.pcap')

In [None]:
# When True, a packet must also originate from source_ip to be kept.
filter_with_source_ip = True


def filter_packets(packets):
    """Select the IP packets addressed to ``target_ip``.

    Args:
        packets: Iterable of scapy packets, in capture order.

    Returns:
        A list of ``(frame_number, packet)`` tuples. Frame numbers are the
        1-based positions of the packets within ``packets``. A packet is kept
        when it has an IP layer, its destination equals ``target_ip`` and,
        if ``filter_with_source_ip`` is set, its source equals ``source_ip``.
    """
    selected = []
    for frame_number, pkt in enumerate(packets, start=1):
        if IP not in pkt:
            continue
        ip_layer = pkt[IP]
        if ip_layer.dst != target_ip:
            continue
        if filter_with_source_ip and ip_layer.src != source_ip:
            continue
        selected.append((frame_number, pkt))
    return selected
filtered_packets = filter_packets(packets)
# Mention the source address only when the source filter is actually enabled,
# so the summary matches the criteria filter_packets() applied.
source_note = f" and from {source_ip}" if filter_with_source_ip else ""
print(f"Number of packets to {target_ip}{source_note}: {len(filtered_packets)}")

# Features
Die Features werden aus den gefilterten Paketen gelesen und für k-Means vorbereitet.

In [None]:
def byte_histogram(payload: bytes):
    """Return the relative frequency of every byte value in ``payload``.

    Args:
        payload: Raw payload bytes.

    Returns:
        A NumPy array of length 256 whose entry ``i`` is the share of bytes
        equal to ``i``. An empty payload yields an all-zero array.
    """
    occurrences = Counter(payload)
    raw_counts = np.array([occurrences[value] for value in range(256)])
    total = raw_counts.sum()
    if total > 0:
        return raw_counts / total
    # Nothing counted: avoid dividing by zero.
    return np.zeros(256)

def calc_entropy(payload: bytes):
    """Return the Shannon entropy of ``payload`` in bits per byte.

    Args:
        payload: Raw payload bytes.

    Returns:
        ``0.0`` for an empty payload, otherwise the base-2 entropy of the
        payload's byte-value distribution.
    """
    if payload:
        occurrences = np.array(list(Counter(payload).values()))
        return entropy(occurrences / len(payload), base=2)
    return 0.0
# Module-level accumulators filled by build_with_protocol(); entry i of
# frame_numbers is the frame number belonging to feature vector i.
features = []
frame_numbers = []


def build_with_protocol():
    """Fill ``features`` and ``frame_numbers`` from ``filtered_packets``.

    Every packet contributes one vector of the form
    ``[src_port, dst_port, payload_len, protocol, *byte_histogram]`` where
    ``protocol`` is 0 for TCP, 1 for UDP and 2 for anything else (ports are
    then 0). Packets without a ``Raw`` layer are treated as having an empty
    payload.

    The two module-level lists are emptied first, so calling this function
    again rebuilds them instead of appending duplicates.
    """
    # Start clean so that re-running the notebook cell does not add every
    # packet a second time.
    features.clear()
    frame_numbers.clear()

    for frame_number, pkt in filtered_packets:
        # Without a Raw layer there is no payload. Fall back to empty bytes
        # rather than reusing the previous packet's payload (or failing with
        # an unbound variable when the very first packet has none).
        payload = bytes(pkt[Raw].load) if Raw in pkt else b""

        if TCP in pkt:
            protocol_encoded = 0
            src_port = pkt[TCP].sport
            dst_port = pkt[TCP].dport
        elif UDP in pkt:
            protocol_encoded = 1
            src_port = pkt[UDP].sport
            dst_port = pkt[UDP].dport
        else:
            protocol_encoded = 2
            src_port = 0
            dst_port = 0

        hist = byte_histogram(payload)
        vec = [src_port, dst_port, len(payload), protocol_encoded, *hist]
        frame_numbers.append(frame_number)
        features.append(vec)

In [None]:
def create_feature_columns():
    """Turn the collected ``features`` into a standardised feature matrix.

    The vectors are loaded into a DataFrame with the columns ``src_port``,
    ``dst_port``, ``payload_len``, ``protocol`` and ``byte_0`` .. ``byte_255``
    and then passed through ``StandardScaler``.

    Returns:
        The result of ``StandardScaler().fit_transform`` on that DataFrame.

    Raises:
        ValueError: If ``features`` is empty or a vector does not have
            exactly 260 entries.
    """
    if not features:
        # Fail with an explicit message instead of pandas' "Length mismatch".
        raise ValueError(
            "No feature vectors collected; run build_with_protocol() first."
        )
    column_names = ['src_port', 'dst_port', 'payload_len', 'protocol']
    column_names += [f'byte_{i}' for i in range(256)]
    df = pd.DataFrame(features, columns=column_names)
    return StandardScaler().fit_transform(df)

In [None]:
# Populate the module-level features / frame_numbers lists from the
# filtered packets.
build_with_protocol()
# Standardised feature matrix used by the k-means cells below.
X_scaled = create_feature_columns()

# Elbow-Auswertung

In [None]:
# Elbow analysis: fit k-means for increasing k and record the inertia
# (within-cluster sum of squared distances) of each fit.
inertia = []
# KMeans refuses n_clusters > n_samples, so cap the upper bound at the number
# of available samples (54 is kept as the maximum for larger captures).
max_k = min(54, len(X_scaled))
k_range = range(1, max_k + 1)

for k in k_range:
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(X_scaled)
    inertia.append(km.inertia_)

plt.plot(k_range, inertia, marker='o')
plt.title("")
plt.xlabel("Anzahl Cluster (k)")
# "Verzerrung" (distortion) is the quantity plotted; the previous label
# "Verzehrung" was a typo.
plt.ylabel("Verzerrung")
plt.grid(True)
# Derive the ticks from the same bound as k_range so both stay in sync.
plt.xticks(range(1, max_k + 1, 3))
plt.show()

# Anzahl Punkte pro Cluster zählen

In [None]:
from sklearn.cluster import KMeans

# Fit the final model with the chosen number of clusters and get one label
# per row of X_scaled.
optimal_k = 49
km_optimal = KMeans(n_clusters=optimal_k, random_state=42)
cluster_labels = km_optimal.fit_predict(X_scaled)

# Group the row indices by their assigned cluster label.
cluster_dict = {}
for index, label in enumerate(cluster_labels):
    if label not in cluster_dict:
        cluster_dict[label] = []
    cluster_dict[label].append(index)

# Largest clusters first; clusters of equal size keep their first-seen order.
sorted_clusters = sorted(cluster_dict.items(), key=lambda entry: -len(entry[1]))

# Print every cluster together with the frame numbers of its members.
for cluster, indices in sorted_clusters:
    frame_ids = [frame_numbers[position] for position in indices]
    print(f"Cluster {cluster} ({len(indices)} points): Frames {frame_ids}\n")

# Mean cluster size over the requested number of clusters.
total_points = sum(map(len, cluster_dict.values()))
average_points = total_points / optimal_k
print(f"⟶ Average number of points per cluster: {average_points:.2f}\n")

# Count how many clusters share each size.
cluster_size_counts = dict(Counter(len(members) for members in cluster_dict.values()))

print("Distribution of cluster sizes:")
for size, count in sorted(cluster_size_counts.items()):
    print(f"{size} data points: {count} clusters")
