In [17]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta


In [18]:
# Addresses treated as "this machine": extract_features matches them against
# the 'Source IP' / 'Destination IP' columns to split sent from received.
HOST_IPS = {'10.194.224.177', '192.168.0.48'}

# Label -> packet-log CSV, in the order the captures are loaded.
_CAPTURE_FILES = {
    "instagram": 'instagram_packet_log.csv',
    "reddit": 'reddit_packet_log.csv',
    "tetris": 'tetris_packet_log.csv',
    "wikipedia": 'wikipedia_packet_log.csv',
    "wouldyourather": 'wouldyourather_packet_log.csv',
    "youtube_browsing": 'youtube_browsing_packet_log.csv',
    "youtube_watching": 'youtube_watching_packet_log.csv',
}

# Read every capture once, keyed by its label.
datasets = {label: pd.read_csv(path) for label, path in _CAPTURE_FILES.items()}

# Short per-capture aliases; each refers to the same frame held in `datasets`.
insta = datasets["instagram"]
reddit = datasets["reddit"]
tetris = datasets["tetris"]
wiki = datasets["wikipedia"]
wyr = datasets["wouldyourather"]
yt_browse = datasets["youtube_browsing"]
yt_watch = datasets["youtube_watching"]

In [19]:
def preprocess(df):
    """Return a time-ordered copy of a packet log.

    The 'Timestamp' column is parsed with ``pd.to_datetime``, the rows are
    sorted by it in ascending order, and the index is renumbered from 0.
    The frame passed in is left unmodified.
    """
    return (
        df.assign(Timestamp=pd.to_datetime(df['Timestamp']))
        .sort_values('Timestamp')
        .reset_index(drop=True)
    )

In [20]:
def _time_deltas(packets):
    """Return (mean, std) of the gaps between consecutive packets, in seconds.

    Yields ``(0.0, 0.0)`` when there are fewer than two packets, because no
    gap exists. The std is numpy's default (population, ddof=0).
    """
    times = packets['Timestamp'].sort_values().values
    if len(times) < 2:
        return 0.0, 0.0
    deltas = np.diff(times) / np.timedelta64(1, 's')
    return np.mean(deltas), np.std(deltas)


def extract_features(df, dataset_name, window_seconds=5, host_ips=None):
    """Summarise a packet log as one feature row per fixed-length time window.

    Windows are half-open intervals ``[start + k*w, start + (k+1)*w)`` anchored
    at the earliest timestamp of the capture. Windows containing no packets
    produce no row.

    Parameters
    ----------
    df : pandas.DataFrame
        Packet log with 'Timestamp', 'Source IP', 'Destination IP' and
        'Length' columns. It is not modified.
    dataset_name : str
        Label stored in the 'dataset' field of every row.
    window_seconds : int or float, default 5
        Window length in seconds; must be positive.
    host_ips : collection of str, optional
        Addresses counted as the local host. Defaults to the module-level
        ``HOST_IPS``.

    Returns
    -------
    list of dict
        One dict per non-empty window, in chronological order.

    Raises
    ------
    ValueError
        If ``window_seconds`` is not positive.
    """
    if window_seconds <= 0:
        raise ValueError("window_seconds must be positive")
    if host_ips is None:
        host_ips = HOST_IPS

    df = preprocess(df)
    start_time = df['Timestamp'].min()
    if df.empty or pd.isna(start_time):
        return []

    # Assign each packet the ordinal of the window it falls in. Unlike a
    # `while current_start < end_time` sweep, this also keeps packets that sit
    # exactly on the final window boundary (including a capture whose packets
    # all share one timestamp), and it scans the frame only once.
    window = pd.Timedelta(seconds=window_seconds)
    window_index = ((df['Timestamp'] - start_time) // window).rename('window')

    results = []
    # groupby yields only windows that contain packets, in ascending order.
    for _, window_df in df.groupby(window_index, sort=True):
        sent = window_df[window_df['Source IP'].isin(host_ips)]
        recv = window_df[window_df['Destination IP'].isin(host_ips)]

        avg_time_between_outgoing, sd_time_between_outgoing = _time_deltas(sent)
        avg_time_between_incoming, sd_time_between_incoming = _time_deltas(recv)

        number_outgoing = len(sent)
        number_incoming = len(recv)
        # Undefined (NaN) rather than a division error when nothing was sent.
        ratio_incoming_to_outgoing = (
            number_incoming / number_outgoing if number_outgoing > 0 else np.nan
        )

        results.append({
            # Size statistics use pandas defaults: mean is NaN for an empty
            # side, std is the sample std (ddof=1) and NaN below two packets.
            "avg_packet_size_sent": sent['Length'].mean(),
            "sd_packet_size_sent": sent['Length'].std(),
            "avg_packet_size_recv": recv['Length'].mean(),
            "sd_packet_size_recv": recv['Length'].std(),
            "avg_time_between_outgoing_packets": avg_time_between_outgoing,
            "sd_time_between_outgoing_packets": sd_time_between_outgoing,
            "avg_time_between_incoming_packets": avg_time_between_incoming,
            "sd_time_between_incoming_packets": sd_time_between_incoming,
            "ratio_incoming_to_outgoing": ratio_incoming_to_outgoing,
            "number_incoming": number_incoming,
            "number_outgoing": number_outgoing,
            "dataset": dataset_name
        })

    return results


In [21]:
# Pool the per-window feature rows of every capture into one flat list.
all_features = []
for name, df in datasets.items():
    features = extract_features(df, name)
    all_features += features

# One row per window; the 'dataset' field records which capture it came from.
features_df = pd.DataFrame(data=all_features)

In [22]:
# Persist the feature table first, then end the cell on the preview: a
# notebook renders only the value of a cell's final expression, so a head()
# call placed before to_csv() is evaluated but never shown.
features_df.to_csv('network_traffic_features.csv', index=False)

features_df.head()