In [2]:
import pandas as pd
import numpy as np
from scipy.stats import percentileofscore

# Path of the raw flow CSV that this cell reads with pd.read_csv further down
file_path = './archive-2/02-14-2018.csv'

# Per-chunk traffic metrics
def calculate_forward_only_metrics(chunk, all_flow_durations):
    """Compute summary metrics for one chunk of flow records.

    Parameters
    ----------
    chunk : DataFrame with the columns 'Flow Duration', 'Tot Fwd Pkts' and 'Dst Port'.
    all_flow_durations : sequence of per-chunk 'Flow Duration' sums; the reference
        sample against which this chunk's sum is ranked.

    Returns
    -------
    dict with the keys, in this order:
        'Total_Flow_Duration_Percentile' - percentileofscore of the chunk's summed duration
        'Avg_Tot_Fwd_Pkts'               - mean of 'Tot Fwd Pkts'
        'Total_Hits_All_Ports'           - sum of the per-port hit counts
        'Unique_Ports'                   - number of distinct 'Dst Port' values
        'Port_Hit_Variance'              - Series.var() of the per-port hit counts
                                           (NaN when only one distinct port is present)
    """
    chunk_duration = chunk['Flow Duration'].sum()
    duration_rank = percentileofscore(all_flow_durations, chunk_duration)
    mean_fwd_pkts = chunk['Tot Fwd Pkts'].mean()

    # Hits per destination port, shared by the total and the variance
    dst_ports = chunk['Dst Port']
    hits_per_port = dst_ports.value_counts()

    return {
        'Total_Flow_Duration_Percentile': duration_rank,
        'Avg_Tot_Fwd_Pkts': mean_fwd_pkts,
        'Total_Hits_All_Ports': hits_per_port.sum(),
        'Unique_Ports': dst_ports.nunique(),
        'Port_Hit_Variance': hits_per_port.var(),
    }

# Render a chunk's metrics as one line of text
def generate_summary(metrics):
    """Format the metrics dict of one chunk as a comma-separated sentence.

    Reads the keys 'Total_Flow_Duration_Percentile', 'Avg_Tot_Fwd_Pkts',
    'Total_Hits_All_Ports' and 'Unique_Ports'; other keys are ignored.
    The two float metrics are rendered with two decimals. The returned string
    ends with a trailing ", " after the last item.
    """
    duration_pct = metrics['Total_Flow_Duration_Percentile']
    mean_fwd_pkts = metrics['Avg_Tot_Fwd_Pkts']
    total_hits = metrics['Total_Hits_All_Ports']
    unique_ports = metrics['Unique_Ports']
    return (
        f"Total flow duration percentile {duration_pct:.2f}, "
        f"avg forward packets {mean_fwd_pkts:.2f}, "
        f"total hits across all ports {total_hits}, "
        f"{unique_ports} unique ports active, "
    )

# Pipeline: load the CSV at file_path, keep benign flows that carried payload, bucket
# them into 5-second windows, summarise every window and write the summaries to CSV.
try:
    # Step 1: Load and preprocess the data
    data = pd.read_csv(file_path)

    # Convert Timestamp column to datetime; unparseable values become NaT and are dropped
    data['Timestamp'] = pd.to_datetime(data['Timestamp'], errors='coerce', format='%d/%m/%Y %H:%M:%S')
    data = data.dropna(subset=['Timestamp'])

    # Keep only rows whose label is "Benign"
    if 'Label' not in data.columns:
        raise ValueError("The dataset does not contain a 'Label' column. Cannot filter for Benign data.")
    data = data[data['Label'] == 'Benign']

    # Keep flows that carried bytes in at least one direction. The explicit .copy()
    # yields an independent frame, so the column assignments below do not write
    # into a slice of the filtered data (pandas' SettingWithCopyWarning case).
    has_payload = (data['TotLen Fwd Pkts'] > 0) | (data['TotLen Bwd Pkts'] > 0)
    data = data[has_payload].copy()

    # Assign every flow to a fixed 5-second window, keyed by the window's epoch start.
    # NOTE: the windows are disjoint (timestamps are floored to a multiple of 5 s);
    # no overlap is applied, despite the wording of the status message printed below.
    chunk_seconds = 5
    # Whole seconds since the Unix epoch via integer timedelta arithmetic, which does
    # not depend on the datetime column's resolution and avoids a float round trip.
    epoch = pd.Timestamp('1970-01-01')
    data['Timestamp_Seconds'] = (data['Timestamp'] - epoch) // pd.Timedelta(seconds=1)
    data['Chunk Start'] = (data['Timestamp_Seconds'] // chunk_seconds) * chunk_seconds

    # Group once instead of re-filtering the whole frame for every window.
    # sort=False keeps the windows in order of first appearance.
    chunks = data.groupby('Chunk Start', sort=False)

    # Per-window total flow durations: the reference sample for the percentile rank
    all_total_flow_durations = [chunk['Flow Duration'].sum() for _, chunk in chunks]

    # Step 2: Summarise each 5-second window
    chunk_results = []
    for start, chunk_data in chunks:
        metrics = calculate_forward_only_metrics(chunk_data, all_total_flow_durations)
        metrics['Chunk_ID'] = start  # Add chunk identifier for reference
        summary_text = generate_summary(metrics)
        chunk_results.append({'Chunk ID': start, 'Summary': summary_text})

    # Step 3: Convert results to a DataFrame and save them
    chunk_summary_df = pd.DataFrame(chunk_results)
    output_file = './chunk_summaries1.csv'
    chunk_summary_df.to_csv(output_file, index=False)

    print(f"Summaries for overlapping 5-second chunks with percentiles have been saved to {output_file}.")

except FileNotFoundError:
    print(f"File not found: {file_path}")
    # Re-raise rather than exit(): the traceback stays visible and the session survives
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise


Summaries for overlapping 5-second chunks with percentiles have been saved to ./chunk_summaries1.csv.


In [3]:
import pandas as pd
import numpy as np
from scipy.stats import percentileofscore

# Path of the raw flow CSV that this cell reads with pd.read_csv further down
file_path = './archive-2/02-15-2018.csv'

# Per-chunk traffic metrics
def calculate_forward_only_metrics(chunk, all_flow_durations):
    """Compute summary metrics for one chunk of flow records.

    Parameters
    ----------
    chunk : DataFrame with the columns 'Flow Duration', 'Tot Fwd Pkts' and 'Dst Port'.
    all_flow_durations : sequence of per-chunk 'Flow Duration' sums; the reference
        sample against which this chunk's sum is ranked.

    Returns
    -------
    dict with the keys, in this order:
        'Total_Flow_Duration_Percentile' - percentileofscore of the chunk's summed duration
        'Avg_Tot_Fwd_Pkts'               - mean of 'Tot Fwd Pkts'
        'Total_Hits_All_Ports'           - sum of the per-port hit counts
        'Unique_Ports'                   - number of distinct 'Dst Port' values
        'Port_Hit_Variance'              - Series.var() of the per-port hit counts
                                           (NaN when only one distinct port is present)
    """
    chunk_duration = chunk['Flow Duration'].sum()
    duration_rank = percentileofscore(all_flow_durations, chunk_duration)
    mean_fwd_pkts = chunk['Tot Fwd Pkts'].mean()

    # Hits per destination port, shared by the total and the variance
    dst_ports = chunk['Dst Port']
    hits_per_port = dst_ports.value_counts()

    return {
        'Total_Flow_Duration_Percentile': duration_rank,
        'Avg_Tot_Fwd_Pkts': mean_fwd_pkts,
        'Total_Hits_All_Ports': hits_per_port.sum(),
        'Unique_Ports': dst_ports.nunique(),
        'Port_Hit_Variance': hits_per_port.var(),
    }

# Render a chunk's metrics as one line of text
def generate_summary(metrics):
    """Format the metrics dict of one chunk as a comma-separated sentence.

    Reads the keys 'Total_Flow_Duration_Percentile', 'Avg_Tot_Fwd_Pkts',
    'Total_Hits_All_Ports' and 'Unique_Ports'; other keys are ignored.
    The two float metrics are rendered with two decimals. The returned string
    ends with a trailing ", " after the last item.
    """
    duration_pct = metrics['Total_Flow_Duration_Percentile']
    mean_fwd_pkts = metrics['Avg_Tot_Fwd_Pkts']
    total_hits = metrics['Total_Hits_All_Ports']
    unique_ports = metrics['Unique_Ports']
    return (
        f"Total flow duration percentile {duration_pct:.2f}, "
        f"avg forward packets {mean_fwd_pkts:.2f}, "
        f"total hits across all ports {total_hits}, "
        f"{unique_ports} unique ports active, "
    )

# Pipeline: load the CSV at file_path, keep benign flows that carried payload, bucket
# them into 5-second windows, summarise every window and write the summaries to CSV.
try:
    # Step 1: Load and preprocess the data
    data = pd.read_csv(file_path)

    # Convert Timestamp column to datetime; unparseable values become NaT and are dropped
    data['Timestamp'] = pd.to_datetime(data['Timestamp'], errors='coerce', format='%d/%m/%Y %H:%M:%S')
    data = data.dropna(subset=['Timestamp'])

    # Keep only rows whose label is "Benign"
    if 'Label' not in data.columns:
        raise ValueError("The dataset does not contain a 'Label' column. Cannot filter for Benign data.")
    data = data[data['Label'] == 'Benign']

    # Keep flows that carried bytes in at least one direction. The explicit .copy()
    # yields an independent frame, so the column assignments below do not write
    # into a slice of the filtered data (pandas' SettingWithCopyWarning case).
    has_payload = (data['TotLen Fwd Pkts'] > 0) | (data['TotLen Bwd Pkts'] > 0)
    data = data[has_payload].copy()

    # Assign every flow to a fixed 5-second window, keyed by the window's epoch start.
    # NOTE: the windows are disjoint (timestamps are floored to a multiple of 5 s);
    # no overlap is applied, despite the wording of the status message printed below.
    chunk_seconds = 5
    # Whole seconds since the Unix epoch via integer timedelta arithmetic, which does
    # not depend on the datetime column's resolution and avoids a float round trip.
    epoch = pd.Timestamp('1970-01-01')
    data['Timestamp_Seconds'] = (data['Timestamp'] - epoch) // pd.Timedelta(seconds=1)
    data['Chunk Start'] = (data['Timestamp_Seconds'] // chunk_seconds) * chunk_seconds

    # Group once instead of re-filtering the whole frame for every window.
    # sort=False keeps the windows in order of first appearance.
    chunks = data.groupby('Chunk Start', sort=False)

    # Per-window total flow durations: the reference sample for the percentile rank
    all_total_flow_durations = [chunk['Flow Duration'].sum() for _, chunk in chunks]

    # Step 2: Summarise each 5-second window
    chunk_results = []
    for start, chunk_data in chunks:
        metrics = calculate_forward_only_metrics(chunk_data, all_total_flow_durations)
        metrics['Chunk_ID'] = start  # Add chunk identifier for reference
        summary_text = generate_summary(metrics)
        chunk_results.append({'Chunk ID': start, 'Summary': summary_text})

    # Step 3: Convert results to a DataFrame and save them
    chunk_summary_df = pd.DataFrame(chunk_results)
    output_file = './chunk_summaries2.csv'
    chunk_summary_df.to_csv(output_file, index=False)

    print(f"Summaries for overlapping 5-second chunks with percentiles have been saved to {output_file}.")

except FileNotFoundError:
    print(f"File not found: {file_path}")
    # Re-raise rather than exit(): the traceback stays visible and the session survives
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise


Summaries for overlapping 5-second chunks with percentiles have been saved to ./chunk_summaries2.csv.


In [4]:
import pandas as pd
import numpy as np
from scipy.stats import percentileofscore

# Path of the raw flow CSV that this cell reads with pd.read_csv further down
file_path = './archive-2/02-22-2018.csv'

# Per-chunk traffic metrics
def calculate_forward_only_metrics(chunk, all_flow_durations):
    """Compute summary metrics for one chunk of flow records.

    Parameters
    ----------
    chunk : DataFrame with the columns 'Flow Duration', 'Tot Fwd Pkts' and 'Dst Port'.
    all_flow_durations : sequence of per-chunk 'Flow Duration' sums; the reference
        sample against which this chunk's sum is ranked.

    Returns
    -------
    dict with the keys, in this order:
        'Total_Flow_Duration_Percentile' - percentileofscore of the chunk's summed duration
        'Avg_Tot_Fwd_Pkts'               - mean of 'Tot Fwd Pkts'
        'Total_Hits_All_Ports'           - sum of the per-port hit counts
        'Unique_Ports'                   - number of distinct 'Dst Port' values
        'Port_Hit_Variance'              - Series.var() of the per-port hit counts
                                           (NaN when only one distinct port is present)
    """
    chunk_duration = chunk['Flow Duration'].sum()
    duration_rank = percentileofscore(all_flow_durations, chunk_duration)
    mean_fwd_pkts = chunk['Tot Fwd Pkts'].mean()

    # Hits per destination port, shared by the total and the variance
    dst_ports = chunk['Dst Port']
    hits_per_port = dst_ports.value_counts()

    return {
        'Total_Flow_Duration_Percentile': duration_rank,
        'Avg_Tot_Fwd_Pkts': mean_fwd_pkts,
        'Total_Hits_All_Ports': hits_per_port.sum(),
        'Unique_Ports': dst_ports.nunique(),
        'Port_Hit_Variance': hits_per_port.var(),
    }

# Render a chunk's metrics as one line of text
def generate_summary(metrics):
    """Format the metrics dict of one chunk as a comma-separated sentence.

    Reads the keys 'Total_Flow_Duration_Percentile', 'Avg_Tot_Fwd_Pkts',
    'Total_Hits_All_Ports' and 'Unique_Ports'; other keys are ignored.
    The two float metrics are rendered with two decimals. The returned string
    ends with a trailing ", " after the last item.
    """
    duration_pct = metrics['Total_Flow_Duration_Percentile']
    mean_fwd_pkts = metrics['Avg_Tot_Fwd_Pkts']
    total_hits = metrics['Total_Hits_All_Ports']
    unique_ports = metrics['Unique_Ports']
    return (
        f"Total flow duration percentile {duration_pct:.2f}, "
        f"avg forward packets {mean_fwd_pkts:.2f}, "
        f"total hits across all ports {total_hits}, "
        f"{unique_ports} unique ports active, "
    )

# Pipeline: load the CSV at file_path, keep benign flows that carried payload, bucket
# them into 5-second windows, summarise every window and write the summaries to CSV.
try:
    # Step 1: Load and preprocess the data
    data = pd.read_csv(file_path)

    # Convert Timestamp column to datetime; unparseable values become NaT and are dropped
    data['Timestamp'] = pd.to_datetime(data['Timestamp'], errors='coerce', format='%d/%m/%Y %H:%M:%S')
    data = data.dropna(subset=['Timestamp'])

    # Keep only rows whose label is "Benign"
    if 'Label' not in data.columns:
        raise ValueError("The dataset does not contain a 'Label' column. Cannot filter for Benign data.")
    data = data[data['Label'] == 'Benign']

    # Keep flows that carried bytes in at least one direction. The explicit .copy()
    # yields an independent frame, so the column assignments below do not write
    # into a slice of the filtered data (pandas' SettingWithCopyWarning case).
    has_payload = (data['TotLen Fwd Pkts'] > 0) | (data['TotLen Bwd Pkts'] > 0)
    data = data[has_payload].copy()

    # Assign every flow to a fixed 5-second window, keyed by the window's epoch start.
    # NOTE: the windows are disjoint (timestamps are floored to a multiple of 5 s);
    # no overlap is applied, despite the wording of the status message printed below.
    chunk_seconds = 5
    # Whole seconds since the Unix epoch via integer timedelta arithmetic, which does
    # not depend on the datetime column's resolution and avoids a float round trip.
    epoch = pd.Timestamp('1970-01-01')
    data['Timestamp_Seconds'] = (data['Timestamp'] - epoch) // pd.Timedelta(seconds=1)
    data['Chunk Start'] = (data['Timestamp_Seconds'] // chunk_seconds) * chunk_seconds

    # Group once instead of re-filtering the whole frame for every window.
    # sort=False keeps the windows in order of first appearance.
    chunks = data.groupby('Chunk Start', sort=False)

    # Per-window total flow durations: the reference sample for the percentile rank
    all_total_flow_durations = [chunk['Flow Duration'].sum() for _, chunk in chunks]

    # Step 2: Summarise each 5-second window
    chunk_results = []
    for start, chunk_data in chunks:
        metrics = calculate_forward_only_metrics(chunk_data, all_total_flow_durations)
        metrics['Chunk_ID'] = start  # Add chunk identifier for reference
        summary_text = generate_summary(metrics)
        chunk_results.append({'Chunk ID': start, 'Summary': summary_text})

    # Step 3: Convert results to a DataFrame and save them
    chunk_summary_df = pd.DataFrame(chunk_results)
    output_file = './chunk_summaries3.csv'
    chunk_summary_df.to_csv(output_file, index=False)

    print(f"Summaries for overlapping 5-second chunks with percentiles have been saved to {output_file}.")

except FileNotFoundError:
    print(f"File not found: {file_path}")
    # Re-raise rather than exit(): the traceback stays visible and the session survives
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise


Summaries for overlapping 5-second chunks with percentiles have been saved to ./chunk_summaries3.csv.


In [5]:
    import pandas as pd
    import numpy as np
    from scipy.stats import percentileofscore

    # Path of the raw flow CSV that this cell reads with pd.read_csv further down
    file_path = './archive-2/02-23-2018.csv'

    # Per-chunk traffic metrics
    def calculate_forward_only_metrics(chunk, all_flow_durations):
        """Compute summary metrics for one chunk of flow records.

        Parameters
        ----------
        chunk : DataFrame with the columns 'Flow Duration', 'Tot Fwd Pkts' and 'Dst Port'.
        all_flow_durations : sequence of per-chunk 'Flow Duration' sums; the reference
            sample against which this chunk's sum is ranked.

        Returns
        -------
        dict with the keys, in this order:
            'Total_Flow_Duration_Percentile' - percentileofscore of the chunk's summed duration
            'Avg_Tot_Fwd_Pkts'               - mean of 'Tot Fwd Pkts'
            'Total_Hits_All_Ports'           - sum of the per-port hit counts
            'Unique_Ports'                   - number of distinct 'Dst Port' values
            'Port_Hit_Variance'              - Series.var() of the per-port hit counts
                                               (NaN when only one distinct port is present)
        """
        chunk_duration = chunk['Flow Duration'].sum()
        duration_rank = percentileofscore(all_flow_durations, chunk_duration)
        mean_fwd_pkts = chunk['Tot Fwd Pkts'].mean()

        # Hits per destination port, shared by the total and the variance
        dst_ports = chunk['Dst Port']
        hits_per_port = dst_ports.value_counts()

        return {
            'Total_Flow_Duration_Percentile': duration_rank,
            'Avg_Tot_Fwd_Pkts': mean_fwd_pkts,
            'Total_Hits_All_Ports': hits_per_port.sum(),
            'Unique_Ports': dst_ports.nunique(),
            'Port_Hit_Variance': hits_per_port.var(),
        }

    # Render a chunk's metrics as one line of text
    def generate_summary(metrics):
        """Format the metrics dict of one chunk as a comma-separated sentence.

        Reads the keys 'Total_Flow_Duration_Percentile', 'Avg_Tot_Fwd_Pkts',
        'Total_Hits_All_Ports' and 'Unique_Ports'; other keys are ignored.
        The two float metrics are rendered with two decimals. The returned string
        ends with a trailing ", " after the last item.
        """
        duration_pct = metrics['Total_Flow_Duration_Percentile']
        mean_fwd_pkts = metrics['Avg_Tot_Fwd_Pkts']
        total_hits = metrics['Total_Hits_All_Ports']
        unique_ports = metrics['Unique_Ports']
        return (
            f"Total flow duration percentile {duration_pct:.2f}, "
            f"avg forward packets {mean_fwd_pkts:.2f}, "
            f"total hits across all ports {total_hits}, "
            f"{unique_ports} unique ports active, "
        )

    # Pipeline: load the CSV at file_path, keep benign flows that carried payload, bucket
    # them into 5-second windows, summarise every window and write the summaries to CSV.
    try:
        # Step 1: Load and preprocess the data
        data = pd.read_csv(file_path)

        # Convert Timestamp column to datetime; unparseable values become NaT and are dropped
        data['Timestamp'] = pd.to_datetime(data['Timestamp'], errors='coerce', format='%d/%m/%Y %H:%M:%S')
        data = data.dropna(subset=['Timestamp'])

        # Keep only rows whose label is "Benign"
        if 'Label' not in data.columns:
            raise ValueError("The dataset does not contain a 'Label' column. Cannot filter for Benign data.")
        data = data[data['Label'] == 'Benign']

        # Keep flows that carried bytes in at least one direction. The explicit .copy()
        # yields an independent frame, so the column assignments below do not write
        # into a slice of the filtered data (pandas' SettingWithCopyWarning case).
        has_payload = (data['TotLen Fwd Pkts'] > 0) | (data['TotLen Bwd Pkts'] > 0)
        data = data[has_payload].copy()

        # Assign every flow to a fixed 5-second window, keyed by the window's epoch start.
        # NOTE: the windows are disjoint (timestamps are floored to a multiple of 5 s);
        # no overlap is applied, despite the wording of the status message printed below.
        chunk_seconds = 5
        # Whole seconds since the Unix epoch via integer timedelta arithmetic, which does
        # not depend on the datetime column's resolution and avoids a float round trip.
        epoch = pd.Timestamp('1970-01-01')
        data['Timestamp_Seconds'] = (data['Timestamp'] - epoch) // pd.Timedelta(seconds=1)
        data['Chunk Start'] = (data['Timestamp_Seconds'] // chunk_seconds) * chunk_seconds

        # Group once instead of re-filtering the whole frame for every window.
        # sort=False keeps the windows in order of first appearance.
        chunks = data.groupby('Chunk Start', sort=False)

        # Per-window total flow durations: the reference sample for the percentile rank
        all_total_flow_durations = [chunk['Flow Duration'].sum() for _, chunk in chunks]

        # Step 2: Summarise each 5-second window
        chunk_results = []
        for start, chunk_data in chunks:
            metrics = calculate_forward_only_metrics(chunk_data, all_total_flow_durations)
            metrics['Chunk_ID'] = start  # Add chunk identifier for reference
            summary_text = generate_summary(metrics)
            chunk_results.append({'Chunk ID': start, 'Summary': summary_text})

        # Step 3: Convert results to a DataFrame and save them
        chunk_summary_df = pd.DataFrame(chunk_results)
        output_file = './chunk_summaries4.csv'
        chunk_summary_df.to_csv(output_file, index=False)

        print(f"Summaries for overlapping 5-second chunks with percentiles have been saved to {output_file}.")

    except FileNotFoundError:
        print(f"File not found: {file_path}")
        # Re-raise rather than exit(): the traceback stays visible and the session survives
        raise
    except Exception as e:
        print(f"An error occurred: {e}")
        raise


Summaries for overlapping 5-second chunks with percentiles have been saved to ./chunk_summaries4.csv.


In [6]:
from transformers import AutoTokenizer, AutoModel

# Identifier of the pretrained checkpoint to fetch
model_name = "allenai/scibert_scivocab_uncased"  # alternative: "allenai/scibert_scivocab_cased"

# Load the tokenizer and the model that belong to model_name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Write both to a local directory so later cells can load them from disk by path
save_path = "./sciBERT_model_uncased"  # target directory for the saved tokenizer and model
tokenizer.save_pretrained(save_path)
model.save_pretrained(save_path)

# Status message naming the directory that was written
print(f"SciBERT model and tokenizer saved to {save_path}.")

  from .autonotebook import tqdm as notebook_tqdm


SciBERT model and tokenizer saved to ./sciBERT_model_uncased.


In [10]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import normalize

# Directory holding the locally saved SciBERT tokenizer and model
model_path = "./sciBERT_model_uncased"  # adjust if the model was saved elsewhere

# Load tokenizer and model from that directory; convert_to_embeddings below
# reads both as module-level names
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)

# Input: per-chunk summaries CSV (must contain a 'Summary' column);
# output: the same table with an added 'Embeddings' column.
# NOTE(review): only chunk_summaries4.csv is processed here; the other summary
# files written earlier in this notebook are not embedded by this cell as shown.
input_file = './chunk_summaries4.csv'
output_file = './Train_ddos4.csv'
summary_df = pd.read_csv(input_file)

# Embed every summary with SciBERT and attach the normalised vectors
def convert_to_embeddings(summary_df):
    """Add an 'Embeddings' column holding one normalised SciBERT vector per summary.

    Each text in summary_df['Summary'] is tokenized on its own (truncated to at most
    512 tokens) and passed through the module-level `model` with gradients disabled.
    The hidden state of the first token ([CLS]) is taken as the sentence vector.
    All vectors are then row-normalised with sklearn's `normalize` and stored as
    plain Python lists.

    The frame is modified in place and also returned.
    """
    def _cls_vector(text):
        # One text -> 1-D numpy vector of its first-token hidden state
        encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            model_output = model(**encoded)
        first_token_state = model_output.last_hidden_state[:, 0, :]
        return first_token_state.squeeze(0).numpy()

    raw_vectors = [_cls_vector(text) for text in summary_df['Summary']]

    # Scale each row vector to unit norm
    unit_vectors = normalize(raw_vectors, axis=1)

    summary_df['Embeddings'] = [vector.tolist() for vector in unit_vectors]
    return summary_df

# Compute the embeddings; convert_to_embeddings adds the 'Embeddings' column to
# summary_df in place and returns the same frame
summary_df = convert_to_embeddings(summary_df)

# Write summaries plus embeddings to output_file without the index column.
# Each 'Embeddings' cell is a Python list, so the CSV stores its text representation.
summary_df.to_csv(output_file, index=False)

# Status message naming the file that was written
print(f"Summaries converted to embeddings and saved to {output_file}.")


Summaries converted to embeddings and saved to ./Train_ddos4.csv.
