In [3]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler
from scipy.stats import mstats  # For winsorization (capping extreme values)

# Load the dataset from the specified path
df = pd.read_csv(r"E:\Datasets\UNSW-NB15\Training and Testing Sets\UNSW_NB15_concatenated_dropped.csv")

# Create a copy of the dataframe to preserve original data
df_clean = df.copy()

### Step 1: Drop the 'id' column
# Rationale: The 'id' column is a near-unique identifier that doesn't contribute to predictive power
# and could cause model overfitting to specific observations. This is a standard preprocessing step
# for ML datasets with record identifiers.
df_clean.drop(columns=['id'], inplace=True)

### Step 2: Harmonize target variables
# Rationale: Prevent data leakage by keeping only one target variable. 'attack_cat' contains
# multiclass attack categories while 'label' contains binary classification (normal vs attack).
# We choose 'label' for binary classification to maintain consistency with common network
# intrusion detection approaches.
df_clean.drop(columns=['attack_cat'], inplace=True)

### Step 3: Handle extreme values using winsorization
# Rationale: Cap extreme values to reduce skewness while preserving outlier information.
# Winsorization is preferred over removal for network traffic data where extreme values
# may represent legitimate but rare network events. Top and bottom 1% capping is a common
# approach that balances outlier mitigation with data preservation.
# Define columns with extreme values based on EDA findings
extreme_value_cols = ['sload', 'dload', 'dur', 'rate', 'sinpkt', 'dinpkt', 
                     'sjit', 'djit', 'stcpb', 'dtcpb']

# Winsorize top and bottom 1% of values for each specified column
# Note: mstats.winsorize returns a masked array, so we extract the underlying data
for col in extreme_value_cols:
    df_clean[col] = mstats.winsorize(df_clean[col], limits=[0.01, 0.01]).data

### Step 4: Scale high-cardinality continuous features
# Rationale: Standardize feature scales to prevent dominance by large-magnitude features.
# Use RobustScaler due to presence of remaining outliers after winsorization. RobustScaler
# uses median and IQR instead of mean and standard deviation, making it more resistant to
# outliers that may still exist after winsorization.
high_cardinality_features = [
    'dur', 'rate', 'sload', 'dload', 'sinpkt', 'dinpkt',
    'sjit', 'djit', 'stcpb', 'dtcpb'
]

# Initialize RobustScaler (less sensitive to outliers than StandardScaler)
scaler = RobustScaler()

# Apply scaling to specified features
df_clean[high_cardinality_features] = scaler.fit_transform(df_clean[high_cardinality_features])

### Step 5: Validate binary columns
# Rationale: Ensure binary columns contain expected values and are properly formatted.
# Network traffic binary flags should only contain 0/1 values. Converting to integer
# ensures proper handling by machine learning algorithms.
binary_columns = ['is_sm_ips_ports', 'is_ftp_login', 'ct_ftp_cmd']

for col in binary_columns:
    # Check unique values and data type
    unique_vals = df_clean[col].unique()
    print(f"Column '{col}' unique values: {unique_vals}")
    
    # Validate that only 0 and 1 values are present
    if set(unique_vals).issubset({0, 1}):
        # Convert to integer if not already (ensures numerical representation)
        df_clean[col] = df_clean[col].astype(int)
    else:
        print(f"Warning: Column '{col}' contains unexpected values: {unique_vals}")

### Step 6: Validate zero-value handling
# Rationale: Confirm zeros are legitimate values (not missing data) in network metrics.
# In network traffic data, zero values often represent one-way communication or missing
# responses, which are legitimate patterns in network behavior.
zero_value_columns = ['dbytes', 'dload']

for col in zero_value_columns:
    zero_count = (df_clean[col] == 0).sum()
    print(f"Column '{col}' has {zero_count} zero values ({(zero_count/len(df_clean))*100:.2f}%)")

# No imputation needed - zeros represent valid one-way network traffic patterns

### Final Data Validation
print("\nFinal dataset shape:", df_clean.shape)
print("\nColumn dtypes:\n", df_clean.dtypes)
print("\nMissing values per column:\n", df_clean.isnull().sum())
print("\nFirst 5 rows of cleaned data:")
print(df_clean.head())

# The dataframe df_clean is now prepared for machine learning
# Features are scaled, extreme values are capped, and data types are validated
# Note: Target variable 'label' remains unchanged for binary classification

Column 'is_sm_ips_ports' unique values: [0 1]
Column 'is_ftp_login' unique values: [0 1 2 4]
Column 'ct_ftp_cmd' unique values: [0 1 2 4]
Column 'dbytes' has 120288 zero values (46.68%)
Column 'dload' has 1 zero values (0.00%)

Final dataset shape: (257673, 40)

Column dtypes:
 dur                  float64
spkts                  int64
dpkts                  int64
sbytes                 int64
dbytes                 int64
rate                 float64
sttl                   int64
dttl                   int64
sload                float64
dload                float64
sloss                  int64
dloss                  int64
sinpkt               float64
dinpkt               float64
sjit                 float64
djit                 float64
swin                   int64
stcpb                float64
dtcpb                float64
dwin                   int64
tcprtt               float64
synack               float64
ackdat               float64
smean                  int64
dmean                  in

In [4]:
# Export the cleaned dataframe to a new CSV file
df_final.to_csv('cleaned_data.csv', index=False)

print("Cleaned DataFrame exported to 'cleaned_data.csv'")

Cleaned DataFrame exported to 'cleaned_data.csv'
