# In [None]:
import pandas as pd
import numpy as np
from scipy.stats import percentileofscore

# Path of the input CSV; it is read with pd.read_csv in the processing step
# below and echoed in the "File not found" message if it cannot be opened.
file_path = './archive-2/02-14-2018.csv'

# Function to calculate metrics for a chunk
def calculate_forward_only_metrics(chunk, all_flow_durations):
    """Compute per-chunk traffic statistics.

    Args:
        chunk: Table of flows (indexed like a pandas DataFrame) providing the
            'Flow Duration', 'Tot Fwd Pkts' and 'Dst Port' columns.
        all_flow_durations: Reference collection of summed flow durations
            against which this chunk's summed duration is ranked.

    Returns:
        A dict with, in this order:
        'Total_Flow_Duration_Percentile' (percentile rank of the chunk's
        summed 'Flow Duration' within ``all_flow_durations``, using scipy's
        default ``kind``), 'Avg_Tot_Fwd_Pkts' (mean of 'Tot Fwd Pkts'),
        'Total_Hits_All_Ports' (sum of the per-port hit counts),
        'Unique_Ports' (number of distinct 'Dst Port' values) and
        'Port_Hit_Variance' (variance of the per-port hit counts).
    """
    duration_sum = chunk['Flow Duration'].sum()
    percentile_rank = percentileofscore(all_flow_durations, duration_sum)
    mean_fwd_packets = chunk['Tot Fwd Pkts'].mean()

    dst_ports = chunk['Dst Port']
    # One row per distinct destination port, value = number of flows hitting it.
    hits_per_port = dst_ports.value_counts()

    return {
        'Total_Flow_Duration_Percentile': percentile_rank,
        'Avg_Tot_Fwd_Pkts': mean_fwd_packets,
        'Total_Hits_All_Ports': hits_per_port.sum(),
        'Unique_Ports': dst_ports.nunique(),
        'Port_Hit_Variance': hits_per_port.var(),
    }

# Function to generate a summary from updated metrics
def generate_summary(metrics):
    """Render a metrics dict as one comma-separated text line.

    The fields appear in the order: flow-duration percentile (2 decimals),
    average forward packets (2 decimals), total port hits, unique ports.
    The exact spacing (an extra space before the hits field and a trailing
    space at the end) is part of the output format and is kept as is.

    Args:
        metrics: Mapping containing 'Total_Flow_Duration_Percentile',
            'Avg_Tot_Fwd_Pkts', 'Total_Hits_All_Ports' and 'Unique_Ports'.

    Returns:
        The formatted summary string.
    """
    percentile_field = f"{metrics['Total_Flow_Duration_Percentile']:.2f}, "
    avg_packets_field = f"{metrics['Avg_Tot_Fwd_Pkts']:.2f}, "
    total_hits_field = f" {metrics['Total_Hits_All_Ports']}, "
    unique_ports_field = f"{metrics['Unique_Ports']} "
    return "".join(
        [percentile_field, avg_packets_field, total_hits_field, unique_ports_field]
    )

# End-to-end pipeline: load the CSV, keep benign flows that carried data,
# bucket them into 5-second windows, compute per-window metrics and write one
# summary line per window to a CSV file. Any failure is reported on stdout and
# the script terminates with a non-zero exit status.
try:
    # Step 1: Load and preprocess the data
    data = pd.read_csv(file_path)

    # Parse timestamps (day/month/year); unparseable values become NaT and
    # the corresponding rows are dropped.
    data['Timestamp'] = pd.to_datetime(data['Timestamp'], errors='coerce', format='%d/%m/%Y %H:%M:%S')
    data = data.dropna(subset=['Timestamp'])

    # Keep only rows whose label is "Benign"
    if 'Label' not in data.columns:
        raise ValueError("The dataset does not contain a 'Label' column. Cannot filter for Benign data.")
    data = data[data['Label'] == 'Benign']

    # Drop rows where neither direction carried any bytes. The explicit copy
    # makes `data` an independent frame so the column assignments below do not
    # write into a slice of the previous frame (SettingWithCopyWarning).
    data = data[(data['TotLen Fwd Pkts'] > 0) | (data['TotLen Bwd Pkts'] > 0)].copy()

    # Whole seconds since the Unix epoch. Computed from a timedelta so the
    # result does not depend on the datetime column's storage resolution
    # (casting to int64 and dividing by 1e9 is only right for nanoseconds).
    data['Timestamp_Seconds'] = (data['Timestamp'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)

    # Assign every flow to the 5-second window that contains it.
    # NOTE(review): earlier comments and the success message below describe
    # these windows as overlapping by 50%, but each flow is assigned to
    # exactly one window, so the windows do not overlap. Confirm the intent.
    chunk_start = (data['Timestamp_Seconds'] // 5) * 5
    data['Chunk Start'] = chunk_start

    # Group once instead of re-scanning the whole frame for every window.
    # sort=False keeps windows in order of first appearance, matching the
    # order of data['Chunk Start'].unique().
    chunks = data.groupby('Chunk Start', sort=False)

    # Summed flow duration of every window: the reference distribution for
    # the percentile metric.
    all_total_flow_durations = chunks['Flow Duration'].sum().tolist()

    # Step 2: Compute metrics and a summary line for every 5-second window
    chunk_results = []
    chunk_ids = data['Chunk Start'].unique()

    for start, chunk_data in chunks:
        metrics = calculate_forward_only_metrics(chunk_data, all_total_flow_durations)
        metrics['Chunk_ID'] = start  # Add chunk identifier for reference

        summary_text = generate_summary(metrics)
        chunk_results.append({'Chunk ID': start, 'Summary': summary_text})

    # Step 3: Convert results to a DataFrame. Naming the columns explicitly
    # keeps the CSV header present even when no window survived filtering.
    chunk_summary_df = pd.DataFrame(chunk_results, columns=['Chunk ID', 'Summary'])

    # Save the results to a CSV file
    output_file = './chunk_numeric1.csv'
    chunk_summary_df.to_csv(output_file, index=False)

    print(f"Summaries for overlapping 5-second chunks with percentiles have been saved to {output_file}.")

except FileNotFoundError:
    print(f"File not found: {file_path}")
    # Exit with a failure status; a bare exit() reports success (status 0)
    # and relies on the `site` module having injected the builtin.
    raise SystemExit(1)
except Exception as e:
    print(f"An error occurred: {e}")
    raise SystemExit(1)
