In [1]:
import pandas as pd
import numpy as np
import os

# Set the directory containing your CSVs
data_dir = '/Users/user/Downloads/Table/downloaded_csvs_merge'  # <-- update this if needed
all_dfs = []

# Load each CSV file into a list of DataFrames
for file in os.listdir(data_dir):
    if file.endswith(".csv"):
        try:
            df = pd.read_csv(os.path.join(data_dir, file))
            all_dfs.append(df)
        except Exception as e:
            print(f"Error reading {file}: {e}")

# Combine all DataFrames
df = pd.concat(all_dfs, ignore_index=True)
print("Total combined data shape:", df.shape)


Total combined data shape: (45019243, 40)


In [1]:
# Drop columns where more than 95% of values are NaN or 0
threshold = 0.95
drop_cols = []

for col in df.columns:
    if df[col].isnull().mean() > threshold or (df[col] == 0).mean() > threshold:
        drop_cols.append(col)

df.drop(columns=drop_cols, inplace=True)
print("Shape after dropping low-signal features:", df.shape)

# Drop rows where 'Label' is missing
df.dropna(subset=['Label'], inplace=True)


NameError: name 'df' is not defined

In [1]:
import pandas as pd
import numpy as np
import os

# Step 1: Set the directory containing your CSVs
data_dir = '/Users/user/Downloads/Table/downloaded_csvs_merge'  # <-- update this if needed
print("Step 1: Data directory set to:", data_dir)

# Step 2: Initialize an empty list to store DataFrames
all_dfs = []
print("Step 2: Initialized empty list for DataFrames.")

# Step 3: Load each CSV file into a list of DataFrames
print("Step 3: Loading CSV files...")
for file in os.listdir(data_dir):
    if file.endswith(".csv"):
        file_path = os.path.join(data_dir, file)
        print(f"  - Attempting to read: {file}")
        try:
            df = pd.read_csv(file_path)
            all_dfs.append(df)
            print(f"    Successfully loaded: {file} | Shape: {df.shape}")
        except Exception as e:
            print(f"    Error reading {file}: {e}")

# Step 4: Combine all DataFrames
print("Step 4: Concatenating all DataFrames...")
if all_dfs:
    df = pd.concat(all_dfs, ignore_index=True)
    print("Step 5: Total combined data shape:", df.shape)
else:
    print("No CSV files were successfully loaded.")


Step 1: Data directory set to: /Users/user/Downloads/Table/downloaded_csvs_merge
Step 2: Initialized empty list for DataFrames.
Step 3: Loading CSV files...
  - Attempting to read: Merged27.csv
    Successfully loaded: Merged27.csv | Shape: (720736, 40)
  - Attempting to read: Merged33.csv
    Successfully loaded: Merged33.csv | Shape: (793945, 40)
  - Attempting to read: Merged32.csv
    Successfully loaded: Merged32.csv | Shape: (704067, 40)
  - Attempting to read: Merged26.csv
    Successfully loaded: Merged26.csv | Shape: (878940, 40)
  - Attempting to read: Merged18.csv
    Successfully loaded: Merged18.csv | Shape: (714063, 40)
  - Attempting to read: Merged30.csv
    Successfully loaded: Merged30.csv | Shape: (695672, 40)
  - Attempting to read: Merged24.csv
    Successfully loaded: Merged24.csv | Shape: (743201, 40)
  - Attempting to read: Merged25.csv
    Successfully loaded: Merged25.csv | Shape: (700455, 40)
  - Attempting to read: Merged31.csv
    Successfully loaded: Merge

In [3]:
# Drop columns where more than 95% of values are NaN or 0
threshold = 0.95
drop_cols = []

for col in df.columns:
    if df[col].isnull().mean() > threshold or (df[col] == 0).mean() > threshold:
        drop_cols.append(col)

df.drop(columns=drop_cols, inplace=True)
print("Shape after dropping low-signal features:", df.shape)

# Drop rows where 'Label' is missing
df.dropna(subset=['Label'], inplace=True)


Shape after dropping low-signal features: (45019243, 32)


In [5]:
# Keep only numeric columns and the 'Label' column
numeric_features = df.select_dtypes(include='number').columns.tolist()
if 'Label' in df.columns:
    numeric_features += ['Label']

df = df[numeric_features]
df['Label'] = df['Label'].astype(str)

print("Columns kept:", df.columns.tolist())


Columns kept: ['Header_Length', 'Protocol Type', 'Time_To_Live', 'Rate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number', 'ack_flag_number', 'ack_count', 'syn_count', 'fin_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'TCP', 'UDP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Variance', 'Label']


In [7]:
# Create major class group
df['ClassGroup'] = df['Label'].apply(lambda x: (
    'BenignTraffic' if x == 'BenignTraffic' else
    'Non-DDoS' if not x.startswith('DDoS') else
    'DDoS'
))

# Sample up to 1,000,000 from each major group
sampled_major = (
    df.groupby('ClassGroup')
    .apply(lambda x: x.sample(min(1_000_000, len(x)), random_state=42))
    .reset_index(drop=True)
)

print("Sampled major groups shape:", sampled_major.shape)
print("Major class distribution:\n", sampled_major['ClassGroup'].value_counts())


  .apply(lambda x: x.sample(min(1_000_000, len(x)), random_state=42))


Sampled major groups shape: (1000000, 33)
Major class distribution:
 ClassGroup
Non-DDoS    1000000
Name: count, dtype: int64


In [9]:
before_dedup = sampled_major.shape[0]
sampled_major = sampled_major.drop_duplicates()
after_dedup = sampled_major.shape[0]

print(f"Removed {before_dedup - after_dedup} duplicates")


Removed 269381 duplicates


In [11]:
# Filter DDoS class
ddos_df = df[df['ClassGroup'] == 'DDoS']

# Sample up to 100,000 from each DDoS subclass
ddos_subsampled = (
    ddos_df.groupby('Label')
    .apply(lambda x: x.sample(min(100_000, len(x)), random_state=42))
    .reset_index(drop=True)
)

ddos_dedup = ddos_subsampled.drop_duplicates()

print("DDoS subclass distribution before dedup:\n", ddos_subsampled['Label'].value_counts())
print("DDoS subclass distribution after dedup:\n", ddos_dedup['Label'].value_counts())


DDoS subclass distribution before dedup:
 Series([], Name: count, dtype: int64)
DDoS subclass distribution after dedup:
 Series([], Name: count, dtype: int64)


  .apply(lambda x: x.sample(min(100_000, len(x)), random_state=42))


In [37]:
# === Step 7: Log Normalization (excluding Label columns) ===
def log_normalize(df):
    """
    Apply log-normalization to numeric columns in the DataFrame.
    """
    df_numeric = df.select_dtypes(include='number')
    return df_numeric.apply(lambda x: np.log1p(x))  # log(1 + x)

# Normalize the major class balanced dataset
features = sampled_major.drop(columns=['Label', 'ClassGroup'])
labels = sampled_major[['Label']]
features_normalized = log_normalize(features)
processed_major = pd.concat([features_normalized, labels], axis=1)

# Normalize the DDoS subclasses dataset
features_ddos = ddos_dedup.drop(columns=['Label', 'ClassGroup'])
labels_ddos = ddos_dedup[['Label']]
features_ddos_norm = log_normalize(features_ddos)
processed_ddos = pd.concat([features_ddos_norm, labels_ddos], axis=1)

# === Save final outputs ===
processed_major.to_csv("CICIoT2023_major_balanced_cleaned.csv", index=False)
processed_ddos.to_csv("CICIoT2023_ddos_subclasses_cleaned.csv", index=False)

print("✅ Saved cleaned datasets:")
print("  • CICIoT2023_major_balanced_cleaned.csv")
print("  • CICIoT2023_ddos_subclasses_cleaned.csv")

✅ Saved cleaned datasets:
  • CICIoT2023_major_balanced_cleaned.csv
  • CICIoT2023_ddos_subclasses_cleaned.csv


In [17]:
processed_major.to_csv("CICIoT2023_major_balanced_cleaned.csv", index=False)
processed_ddos.to_csv("CICIoT2023_ddos_subclasses_cleaned.csv", index=False)

print("✅ Saved cleaned datasets:")
print("  • CICIoT2023_major_balanced_cleaned.csv")
print("  • CICIoT2023_ddos_subclasses_cleaned.csv")


✅ Saved cleaned datasets:
  • CICIoT2023_major_balanced_cleaned.csv
  • CICIoT2023_ddos_subclasses_cleaned.csv


In [19]:
# Set your target directory
save_dir = "/Users/user/Downloads/Table"  # Replace with your actual directory path

# Save the files
processed_major.to_csv(f"{save_dir}/CICIoT2023_major_balanced_cleaned.csv", index=False)
processed_ddos.to_csv(f"{save_dir}/CICIoT2023_ddos_subclasses_cleaned.csv", index=False)

print("✅ Saved cleaned datasets:")
print(f"  • {save_dir}/CICIoT2023_major_balanced_cleaned.csv")
print(f"  • {save_dir}/CICIoT2023_ddos_subclasses_cleaned.csv")


✅ Saved cleaned datasets:
  • /Users/user/Downloads/Table/CICIoT2023_major_balanced_cleaned.csv
  • /Users/user/Downloads/Table/CICIoT2023_ddos_subclasses_cleaned.csv


In [39]:
from sklearn.model_selection import train_test_split

# Split the processed major dataset into train and test sets (80% train, 20% test)
X_train_major, X_test_major, y_train_major, y_test_major = train_test_split(
    features_normalized, labels, test_size=0.2, random_state=42
)

# Split the processed DDoS dataset into train and test sets (80% train, 20% test)
X_train_ddos, X_test_ddos, y_train_ddos, y_test_ddos = train_test_split(
    features_ddos_norm, labels_ddos, test_size=0.2, random_state=42
)

print(f"Training set (Major Classes): {X_train_major.shape}, Test set (Major Classes): {X_test_major.shape}")
print(f"Training set (DDoS subclasses): {X_train_ddos.shape}, Test set (DDoS subclasses): {X_test_ddos.shape}")


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.