# Import Library

# In [None]:
import pandas as pd
from snorkel.labeling import labeling_function, PandasLFApplier, LFAnalysis
from snorkel.labeling.model import LabelModel
import random
from datetime import datetime, timedelta

# Data Augmentation

# In [None]:
# Load the raw NetFlow export that the labeling functions are applied to
df = pd.read_csv('Cisco-netflow-10.131.6.1-28Sept-to-02Oct-24.csv')  # Update the path to your dataset

# Load the known IP addresses; they are drawn from at random as source and
# destination addresses when synthetic anomaly records are generated
known_ips_df = pd.read_csv('Known IP.csv')  # Update the path to your known IPs CSV file
known_ips = known_ips_df['IP'].tolist()  # Assuming the column in the CSV is named 'IP'

# Label values voted by the labeling functions and stored in the 'label' column.
# ABSTAIN is not returned by any labeling function visible here.
ABSTAIN = -1
NORMAL = 0
ANOMALY = 1

# Define Labeling Functions (LFs) using Snorkel based on the anomaly rules

@labeling_function()
def lf_in_pkts(x):
    """Vote ANOMALY when the flow has fewer than 3 or more than 1000 packets, else NORMAL."""
    packet_count = x['netflow.in_pkts']
    too_few = packet_count < 3
    return ANOMALY if too_few or packet_count > 1000 else NORMAL

@labeling_function()
def lf_in_bytes(x):
    """Vote ANOMALY for flows above 100 MB or of exactly 500 bytes, else NORMAL."""
    byte_count = x['netflow.in_bytes']
    oversized = byte_count > 100 * 1024 * 1024  # more than 100 MB
    verdict = NORMAL
    if oversized or byte_count == 500:
        verdict = ANOMALY
    return verdict

@labeling_function()
def lf_protocol(x):
    """Vote ANOMALY for tunnelling/encapsulation protocols, else NORMAL."""
    flagged_protocols = (47, 50, 41)  # GRE, ESP, IPv6 encapsulation
    return ANOMALY if x['netflow.protocol'] in flagged_protocols else NORMAL

@labeling_function()
def lf_l4_src_port(x):
    """Vote ANOMALY when the source port is below 1024 or is 4444/6667, else NORMAL."""
    source_port = x['netflow.l4_src_port']
    if source_port < 1024:
        return ANOMALY
    # Ports the original rule singles out as malware-related
    if source_port in (4444, 6667):
        return ANOMALY
    return NORMAL

@labeling_function()
def lf_l4_dst_port(x):
    """Vote NORMAL only when the destination port is in the traceroute range, else ANOMALY."""
    traceroute_ports = range(33434, 33465)  # 33434..33464 inclusive
    in_traceroute_range = x['netflow.l4_dst_port'] in traceroute_ports
    return NORMAL if in_traceroute_range else ANOMALY

# Combine all LFs into a list
lfs = [lf_in_pkts, lf_in_bytes, lf_protocol, lf_l4_src_port, lf_l4_dst_port]

# Apply the LFs to every row of the dataset to obtain the label matrix
applier = PandasLFApplier(lfs)
L_train = applier.apply(df)

# Analyze the outputs of the labeling functions. The summary is printed
# explicitly: a bare expression that is not the last statement of a notebook
# cell (or that runs in a script) is evaluated and then silently discarded.
print(LFAnalysis(L=L_train, lfs=lfs).lf_summary())

# Train a Label Model to combine the weak labels from the LFs. The seed is
# pinned so repeated runs give the same labels, matching the fixed
# random_state used for every sampling step.
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=42)

# Predict one hard label per row using the label model.
# NOTE(review): predict() may return ABSTAIN (-1) on ties under its default
# tie-break policy; such rows end up in neither subset below - confirm against
# the Snorkel documentation.
df['label'] = label_model.predict(L=L_train)

# Separate normal and anomalous data
df_normal = df[df['label'] == NORMAL]
df_anomalous = df[df['label'] == ANOMALY]

# Ensure we have the desired number of records
normal_data_count = 500000
anomaly_data_counts = [80000, 80000, 80000, 80000, 80000, 100000]  # 500,000 anomalies total

# Step 1: Sampling up to 500,000 normal data points.
# DataFrame.sample raises ValueError when n exceeds the number of rows (without
# replacement), so the request is capped at what is actually available.
normal_sample_size = min(normal_data_count, len(df_normal))
if normal_sample_size < normal_data_count:
    print(f"Not enough normal records, only {len(df_normal)} available. Using all of them.")
df_normal_sampled = df_normal.sample(n=normal_sample_size, random_state=42)

# Helper that samples n rows, topping up with synthetic anomalies when the frame is too small
def safe_sample(df, n, rule_description):
    """Return n rows for a rule, padding with synthetic anomalies if needed.

    Args:
        df: Candidate rows for the rule.
        n: Number of rows wanted.
        rule_description: Rule name, used in the shortage message and passed on
            to generate_synthetic_anomalies.

    Returns:
        A reproducible random sample of n rows when df has at least n rows;
        otherwise all of df followed by the synthetic rows that fill the gap.
    """
    available = len(df)
    if available < n:
        print(f"Not enough records for {rule_description}, only {len(df)} available. Adding synthetic anomalies.")
        shortfall = n - available
        padding = generate_synthetic_anomalies(shortfall, rule_description)
        return pd.concat([df, padding])
    return df.sample(n=n, random_state=42)

# Function to generate synthetic anomalies with known IP addresses
def generate_synthetic_anomalies(count, rule_description):
    """Build `count` synthetic flow records labelled ANOMALY.

    Source and destination addresses are drawn at random from `known_ips`.

    Args:
        count: Number of records to create; zero or less yields an empty frame.
        rule_description: Name of the rule being topped up. When it is one of
            'netflow.in_pkts', 'netflow.in_bytes', 'netflow.protocol',
            'netflow.l4_src_port' or 'netflow.l4_dst_port', that field is
            forced to a value that violates the corresponding rule, so every
            generated row really is an anomaly for the rule it pads. Any other
            value (e.g. 'all rules') leaves every field unconstrained.

    Returns:
        A pandas DataFrame with one row per synthetic record.
    """
    def _anomalous_dst_port():
        # Uniform over 1..65535 minus the 31 traceroute ports 33434..33464
        port = random.randint(1, 65535 - 31)
        return port if port < 33434 else port + 31

    # For each per-field rule, a generator of values that violate that rule
    # (thresholds mirror the rule-based filters used elsewhere in this file).
    forced_values = {
        'netflow.in_pkts': lambda: random.choice(
            [random.randint(1, 2), random.randint(1001, 5000)]),  # < 3 or > 1000
        'netflow.in_bytes': lambda: random.choice(
            [500, random.randint(100 * 1024 * 1024 + 1, 500 * 1024 * 1024)]),  # 500 or > 100 MB
        'netflow.protocol': lambda: random.choice([47, 50, 41]),  # GRE, ESP, IPv6 encapsulation
        'netflow.l4_src_port': lambda: random.choice(
            [random.randint(1, 1023), 4444, 6667]),  # < 1024 or flagged ports
        'netflow.l4_dst_port': _anomalous_dst_port,
    }
    force_value = forced_values.get(rule_description)

    synthetic_data = []
    for _ in range(count):
        # Generate timestamps: first_switched <= last_switched <= now
        now = datetime.now()
        last_switched = now - timedelta(seconds=random.randint(1, 1000))
        first_switched = last_switched - timedelta(seconds=random.randint(1, 1000))

        record = {
            '@timestamp': now.isoformat(),  # Current timestamp
            'netflow.in_pkts': random.randint(1, 5000),
            'netflow.in_bytes': random.randint(500, 500 * 1024 * 1024),  # up to 500 MB
            'netflow.protocol': random.choice([6, 17, 47, 50, 41]),
            'netflow.l4_src_port': random.randint(1, 65535),
            'netflow.l4_dst_port': random.randint(1, 65535),
            # Select random known IPs from the known IP list
            'netflow.ipv4_src_addr': random.choice(known_ips),
            'netflow.ipv4_dst_addr': random.choice(known_ips),
            'netflow.first_switched': first_switched.isoformat(),
            'netflow.last_switched': last_switched.isoformat(),
            # NOTE(review): as a TCP flag bitmask 24 is PSH+ACK and 27 is
            # FIN+SYN+PSH+ACK (plain ACK is 16, SYN-ACK is 18). The values are
            # kept as they were; confirm they are the ones intended.
            'netflow.tcp_flags': random.choice([24, 27]),
            'label': ANOMALY,  # Anomalous label
        }
        if force_value is not None:
            # Overwrite the field named by the rule so the row violates that rule
            record[rule_description] = force_value()
        synthetic_data.append(record)
    return pd.DataFrame(synthetic_data)

# Step 2: Build the anomalous part of the dataset, one slice per anomaly rule

# Boolean row filters over the anomalous subset, one per rule
mask_in_pkts = df_anomalous['netflow.in_pkts'].lt(3) | df_anomalous['netflow.in_pkts'].gt(1000)
mask_in_bytes = (df_anomalous['netflow.in_bytes'].gt(100 * 1024 * 1024)
                 | df_anomalous['netflow.in_bytes'].eq(500))
mask_protocol = df_anomalous['netflow.protocol'].isin([47, 50, 41])
mask_src_port = (df_anomalous['netflow.l4_src_port'].lt(1024)
                 | df_anomalous['netflow.l4_src_port'].isin([4444, 6667]))
mask_dst_port = ~df_anomalous['netflow.l4_dst_port'].between(33434, 33464)

# Draw the requested number of rows per rule. The calls stay in this order
# because any synthetic padding consumes the shared `random` state.
df_anomalous_in_pkts = safe_sample(
    df_anomalous.loc[mask_in_pkts], anomaly_data_counts[0], 'netflow.in_pkts')
df_anomalous_in_bytes = safe_sample(
    df_anomalous.loc[mask_in_bytes], anomaly_data_counts[1], 'netflow.in_bytes')
df_anomalous_protocol = safe_sample(
    df_anomalous.loc[mask_protocol], anomaly_data_counts[2], 'netflow.protocol')
df_anomalous_src_port = safe_sample(
    df_anomalous.loc[mask_src_port], anomaly_data_counts[3], 'netflow.l4_src_port')
df_anomalous_dst_port = safe_sample(
    df_anomalous.loc[mask_dst_port], anomaly_data_counts[4], 'netflow.l4_dst_port')

# The last slice is not filtered by any single rule: it is drawn from the
# whole anomalous subset
df_anomalous_all_rules = safe_sample(df_anomalous, anomaly_data_counts[5], 'all rules')

# Stack the per-rule slices into one anomalous frame
anomalous_parts = [
    df_anomalous_in_pkts,
    df_anomalous_in_bytes,
    df_anomalous_protocol,
    df_anomalous_src_port,
    df_anomalous_dst_port,
    df_anomalous_all_rules,
]
df_anomalous_sampled = pd.concat(anomalous_parts)

# Final dataset: the sampled normal rows followed by the anomalous rows,
# renumbered with a fresh index
df_final = pd.concat([df_normal_sampled, df_anomalous_sampled], ignore_index=True)

# Write the result to CSV without the index column
df_final.to_csv('dataset-5.csv', index=False)

# Show the first rows of the final dataset
print(df_final.head())