In [None]:
import pandas as pd
import numpy as np
from scipy.stats import percentileofscore

# Location of the input CSV that the processing code loads with pd.read_csv.
file_path = './archive-2/02-21-2018.csv'

def calculate_forward_only_metrics(chunk, all_flow_durations, all_fwd_pkt_means):
    """
    Summarise one chunk of flow rows as a dictionary of metrics.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Rows of a single chunk. Must provide the columns 'Label',
        'Flow Duration', 'Tot Fwd Pkts' and 'Dst Port'.
    all_flow_durations : sequence of numbers
        Reference values against which this chunk's summed 'Flow Duration'
        is ranked.
    all_fwd_pkt_means : sequence of numbers
        Reference values against which this chunk's mean 'Tot Fwd Pkts'
        is ranked.

    Returns
    -------
    dict
        'Chunk_Label'                    -- 'Benign' when every row is labelled
                                            'Benign', otherwise 'Attack'
        'Total_Flow_Duration_Percentile' -- percentileofscore of the summed duration
        'Avg_Tot_Fwd_Pkts_Percentile'    -- percentileofscore of the mean forward packets
        'Total_Hits_All_Ports'           -- sum of the per-port row counts
        'Unique_Ports'                   -- number of distinct 'Dst Port' values
        'Port_Hit_Variance'              -- variance of the per-port row counts
    """
    # A single non-'Benign' row is enough to mark the whole chunk as 'Attack'.
    every_row_benign = (chunk['Label'] == 'Benign').all()
    label = 'Benign' if every_row_benign else 'Attack'

    # Rank this chunk's summed flow duration within the reference values.
    duration_sum = chunk['Flow Duration'].sum()
    duration_pct = percentileofscore(all_flow_durations, duration_sum)

    # Rank this chunk's mean forward-packet count within the reference values.
    fwd_pkt_mean = chunk['Tot Fwd Pkts'].mean()
    fwd_pkt_pct = percentileofscore(all_fwd_pkt_means, fwd_pkt_mean)

    # Row count per destination port; both the total and the variance derive from it.
    dst_ports = chunk['Dst Port']
    hits_per_port = dst_ports.value_counts()

    return {
        'Chunk_Label': label,
        'Total_Flow_Duration_Percentile': duration_pct,
        'Avg_Tot_Fwd_Pkts_Percentile': fwd_pkt_pct,
        'Total_Hits_All_Ports': hits_per_port.sum(),
        'Unique_Ports': dst_ports.nunique(),
        'Port_Hit_Variance': hits_per_port.var(),
    }


def generate_summary(metrics):
    """
    Render a metrics dictionary as a one-line, comma-separated text summary.

    Reads the keys 'Chunk_Label', 'Total_Flow_Duration_Percentile',
    'Avg_Tot_Fwd_Pkts_Percentile', 'Total_Hits_All_Ports' and 'Unique_Ports';
    any other keys in `metrics` are ignored. Both percentiles are formatted
    with two decimal places.
    """
    label = metrics['Chunk_Label']
    duration_pct = metrics['Total_Flow_Duration_Percentile']
    fwd_pkt_pct = metrics['Avg_Tot_Fwd_Pkts_Percentile']
    total_hits = metrics['Total_Hits_All_Ports']
    unique_ports = metrics['Unique_Ports']

    fields = [
        f"[{label}] flow duration percentile {duration_pct:.2f}",
        f"avg forward packets percentile {fwd_pkt_pct:.2f}",
        f"total hits across all ports {total_hits}",
        f"{unique_ports} unique ports active",
    ]
    return ", ".join(fields)


# Pipeline for 1-second chunks: load the CSV, bucket rows by the whole second of
# their timestamp, rank every bucket's summed flow duration and mean forward
# packets against all buckets, and write one text summary per bucket to a CSV.
try:
    # 1. Load the data
    data = pd.read_csv(file_path)

    # 2. Convert Timestamp to a proper datetime. Values that do not match the
    #    day/month/year format are coerced to NaT and those rows are dropped.
    data['Timestamp'] = pd.to_datetime(data['Timestamp'], errors='coerce', format='%d/%m/%Y %H:%M:%S')
    data = data.dropna(subset=['Timestamp'])

    # 3. Keep all data (Benign + Attack) so chunks can be classified properly.
    #    The 'Label' column is required by calculate_forward_only_metrics.

    # 4. Exclude rows with zero bytes in both directions. The explicit copy
    #    makes the filtered frame independent, so the column assignments below
    #    do not raise SettingWithCopyWarning.
    data = data[(data['TotLen Fwd Pkts'] > 0) | (data['TotLen Bwd Pkts'] > 0)].copy()

    # 5. Create the 1-second chunk key: whole seconds since the Unix epoch.
    #    Dividing a timedelta by a one-second Timedelta stays in integer
    #    arithmetic and does not depend on the datetime column's resolution.
    data['Timestamp_Seconds'] = (data['Timestamp'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
    data['Chunk Start'] = data['Timestamp_Seconds']  # 1-second chunks

    # 6. Identify all unique chunk starts and group the rows once.
    #    sort=False keeps chunks in order of first appearance, which is the
    #    order of unique_starts and therefore of the rows in the output file.
    unique_starts = data['Chunk Start'].unique()
    chunks = data.groupby('Chunk Start', sort=False)

    # -----------------------------------------------------------
    # Precompute metric distributions to use for percentile ranks
    # -----------------------------------------------------------
    all_flow_durations = []
    all_fwd_pkt_means = []

    for _, chunk_data in chunks:
        # Same per-chunk reductions that calculate_forward_only_metrics applies,
        # so each chunk's own score is present in the reference distribution.
        all_flow_durations.append(chunk_data['Flow Duration'].sum())
        all_fwd_pkt_means.append(chunk_data['Tot Fwd Pkts'].mean())

    # -------------------------------
    # 7. Process each chunk in a loop
    # -------------------------------
    chunk_results = []

    for start, chunk_data in chunks:
        metrics = calculate_forward_only_metrics(chunk_data,
                                                 all_flow_durations,
                                                 all_fwd_pkt_means)
        summary_text = generate_summary(metrics)
        chunk_results.append({'Chunk ID': start, 'Summary': summary_text})

    # 8. Convert results to a DataFrame
    chunk_summary_df = pd.DataFrame(chunk_results)

    # 9. Save to CSV
    output_file = './chunk_summaries_1sec.csv'
    chunk_summary_df.to_csv(output_file, index=False)

    print(f"Summaries for 1-second chunks with percentiles have been saved to {output_file}.")

except FileNotFoundError:
    print(f"File not found: {file_path}")
    # Re-raise rather than exit(): execution still stops here, but the original
    # exception and its traceback are preserved for debugging.
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise


Summaries for 1-second chunks with percentiles have been saved to ./chunk_summaries_1sec.csv.


In [3]:


# Variant of the 1-second chunk pipeline that ranks each chunk by its *average*
# flow duration (instead of the total) and by its mean forward packets, then
# writes one text summary per chunk to a CSV.
# NOTE(review): this writes to the same './chunk_summaries_1sec.csv' path used
# by the total-duration pipeline cell, so running both leaves only the last
# result on disk -- confirm that this is intended.
try:
    # 1. Load the data
    data = pd.read_csv(file_path)

    # 2. Convert Timestamp to a proper datetime. Values that do not match the
    #    day/month/year format are coerced to NaT and those rows are dropped.
    data['Timestamp'] = pd.to_datetime(data['Timestamp'], errors='coerce', format='%d/%m/%Y %H:%M:%S')
    data = data.dropna(subset=['Timestamp'])

    # 3. Keep all data (Benign + Attack) so chunks can be classified properly

    # 4. Exclude rows with zero bytes in both directions. The explicit copy
    #    makes the filtered frame independent, so the column assignments below
    #    do not raise SettingWithCopyWarning.
    data = data[(data['TotLen Fwd Pkts'] > 0) | (data['TotLen Bwd Pkts'] > 0)].copy()

    # 5. Create the 1-second chunk key: whole seconds since the Unix epoch.
    #    Dividing a timedelta by a one-second Timedelta stays in integer
    #    arithmetic and does not depend on the datetime column's resolution.
    data['Timestamp_Seconds'] = (data['Timestamp'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
    data['Chunk Start'] = data['Timestamp_Seconds']

    # 6. Identify all unique chunk starts and group the rows once.
    #    sort=False keeps chunks in order of first appearance, which is the
    #    order of unique_starts and therefore of the rows in the output file.
    unique_starts = data['Chunk Start'].unique()
    chunks = data.groupby('Chunk Start', sort=False)

    # -----------------------------------------------------------
    # Precompute metric distributions to use for percentile ranks
    #  (Now we look at *average* flow durations instead of *total*)
    # -----------------------------------------------------------
    all_avg_flow_durations = []
    all_fwd_pkt_means = []

    for _, chunk_data in chunks:
        all_avg_flow_durations.append(chunk_data['Flow Duration'].mean())
        all_fwd_pkt_means.append(chunk_data['Tot Fwd Pkts'].mean())

    # -------------------------------
    # 7. Process each chunk in a loop
    # -------------------------------
    chunk_results = []

    for start, chunk_data in chunks:
        # NOTE(review): calculate_metrics is not defined in the visible part of
        # this notebook; on a fresh kernel this call raises NameError unless
        # another cell defines it. Presumably it ranks the chunk's mean
        # 'Flow Duration' against all_avg_flow_durations and returns the keys
        # that generate_summary reads -- confirm, and define it above this cell.
        metrics = calculate_metrics(chunk_data,
                                    all_avg_flow_durations,
                                    all_fwd_pkt_means)
        summary_text = generate_summary(metrics)
        chunk_results.append({'Chunk ID': start, 'Summary': summary_text})

    # 8. Convert results to a DataFrame
    chunk_summary_df = pd.DataFrame(chunk_results)

    # 9. Save to CSV
    output_file = './chunk_summaries_1sec.csv'
    chunk_summary_df.to_csv(output_file, index=False)

    print(f"Summaries for 1-second chunks with average flow duration percentile have been saved to {output_file}.")

except FileNotFoundError:
    print(f"File not found: {file_path}")
    # Re-raise rather than exit(): execution still stops here, but the original
    # exception and its traceback are preserved for debugging.
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise


Summaries for 1-second chunks with average flow duration percentile have been saved to ./chunk_summaries_1sec.csv.
