In [1]:
import os
import pandas as pd

# Output location for the cleaned dataset (Desktop)
safe_output_path = r"C:\Users\Otala\Desktop\cleaned_numeric_combined.csv"

# Dataset folder path (unchanged)
data_folder = r"C:\Users\Otala\Desktop\Osahon Research Work\anomaly-detection-isolationforest-cicids2017\data\MachineLearningCVE"

# Only file names ending with this suffix are treated as dataset files
CSV_SUFFIX = ".pcap_ISCX.csv"


def find_csv_files(folder, suffix=CSV_SUFFIX):
    """Return the names of the files in *folder* that end with *suffix*.

    The names are sorted because ``os.listdir`` returns entries in arbitrary
    order; sorting makes the row order of the combined dataset reproducible.
    """
    return sorted(name for name in os.listdir(folder) if name.endswith(suffix))


def load_combined_csvs(folder, file_names):
    """Read every CSV in *file_names* from *folder* and stack them row-wise.

    Loading is best-effort: a file that fails to load is reported on stdout
    and skipped, so one bad file does not abort the whole run.

    Returns a single DataFrame with a fresh 0..n-1 index, or an empty
    DataFrame when no file could be loaded.
    """
    frames = []
    for name in file_names:
        try:
            print(f"Loading: {name}")
            frames.append(pd.read_csv(os.path.join(folder, name), low_memory=False))
        except Exception as e:
            print(f"❌ Error loading {name}: {e}")

    # pd.concat raises ValueError on an empty list, so handle that case here.
    if not frames:
        return pd.DataFrame()

    # Concatenate once at the end: calling pd.concat inside the loop re-copies
    # the ever-growing frame on every iteration (quadratic in total rows).
    return pd.concat(frames, ignore_index=True)


# In a notebook cell or when run as a script, __name__ is "__main__", so the
# pipeline below still executes and defines the same global names as before.
if __name__ == "__main__":
    # Get list of .csv files
    csv_files = find_csv_files(data_folder)

    # Load and combine all CSVs
    combined_df = load_combined_csvs(data_folder, csv_files)
    print("✅ Combined dataset shape:", combined_df.shape)

    # Drop rows with any missing values
    # NOTE(review): dropna() only removes NaN/None; +/-inf values, if the
    # source files contain any (TODO confirm), are left in the output.
    combined_df = combined_df.dropna()

    # Keep only numeric columns
    numeric_df = combined_df.select_dtypes(include=['int64', 'float64'])

    # Save to a safe location (Desktop)
    numeric_df.to_csv(safe_output_path, index=False)
    print(f"✅ Cleaned numeric data saved to:\n{safe_output_path}")


Loading: Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Loading: Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
Loading: Friday-WorkingHours-Morning.pcap_ISCX.csv
Loading: Monday-WorkingHours.pcap_ISCX.csv
Loading: Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
Loading: Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
Loading: Tuesday-WorkingHours.pcap_ISCX.csv
Loading: Wednesday-workingHours.pcap_ISCX.csv
✅ Combined dataset shape: (2830743, 79)
✅ Cleaned numeric data saved to:
C:\Users\Otala\Desktop\cleaned_numeric_combined.csv
