In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# --- Configuration ---
# Location of the dataset CSV that this notebook preprocesses.
# IMPORTANT: this is a machine-specific absolute path; update it to match where the
# dataset lives on your machine before running the cell.
FILE_PATH = "E:\\Datasets\\UNSW-NB15\\Training and Testing Sets\\UNSW_NB15_concatenated_dropped.csv"

# --- Data Loading ---
# Read the CSV into a pandas DataFrame named `df`, which every later step operates on.
# Only the read itself sits inside the `try`, so the handler below can only be triggered
# by the file lookup and never by the status prints.
try:
    df = pd.read_csv(FILE_PATH)
except FileNotFoundError:
    print(f"Error: The file was not found at the specified path: {FILE_PATH}")
    print("Please update the FILE_PATH variable with the correct location of your dataset.")
    # Re-raise rather than calling `exit()`: `exit()` is an interactive-shell helper and,
    # inside a notebook kernel, it does not cleanly abort the cell (the statements below
    # would go on to fail on an undefined `df`) and it asks the kernel to shut down.
    # Propagating the original exception stops execution right here and keeps the kernel
    # and the real traceback available.
    raise
else:
    print("--- Initial Data Load ---")
    print(f"Successfully loaded the dataset from: {FILE_PATH}")
    print(f"Initial dataset shape: {df.shape} (rows, columns)")
    print("\n")

# --- Data Cleaning and Preprocessing Plan Implementation ---

# **Step 1: Handle Duplicate Records**
# Goal: keep a single copy of every row.
# Why: repeated samples make a model overweight whatever pattern they carry, so each
# remaining row should contribute independently to training.
print("--- Step 1: Handling Duplicate Records ---")
rows_before = len(df)
# With no `subset` given, two rows are duplicates only when they agree on every column.
# The de-duplicated frame is bound back to `df`; the surviving rows keep their original
# index labels.
df = df.drop_duplicates()
rows_after = len(df)
n_removed = rows_before - rows_after
print(f"Identified and removed {n_removed} fully duplicate rows.")
print(f"Dataset shape after removing duplicates: {df.shape}")
print("\n")


# **Step 2: Select and Prepare the Target Variable**
# Goal: keep 'attack_cat' as the (multi-class) target and remove the binary 'label' column.
# Why: 'label' (0 or 1) is derived directly from 'attack_cat'. Left among the features it
# would hand the model a near-perfect predictor of the target - a data leak that inflates
# scores and does not generalize to new data.
print("--- Step 2: Selecting Target Variable ---")
if 'label' not in df.columns:
    # Nothing to remove, e.g. the column was already stripped from the input file.
    print("'label' column not found, skipping.")
else:
    df = df.drop(columns=['label'])
    print("Dropped the redundant 'label' column to prevent data leakage.")
    print(f"Dataset shape after dropping 'label': {df.shape}")
print("\n")


# **Step 3: Remove Identifier Column**
# Goal: discard columns that only identify a row.
# Why: 'id' behaves like a primary key - it says nothing about the traffic itself and
# would only add noise as a model feature.
print("--- Step 3: Removing Identifier Column ---")
if 'id' not in df.columns:
    # Nothing to remove, e.g. the column was already stripped from the input file.
    print("'id' column not found, skipping.")
else:
    df = df.drop(columns=['id'])
    print("Dropped the 'id' column as it has no predictive value.")
    print(f"Dataset shape after dropping 'id': {df.shape}")
print("\n")


# **Step 4: Re-evaluate and Correct Data Types (Feature Identification)**
# Objective: split the feature columns into a numerical group and a categorical group.
# Rationale: the two groups are preprocessed differently later on (numerical columns are
# scaled, categorical columns are one-hot encoded), so every feature column must land in
# exactly one of the two lists built here.
print("--- Step 4: Identifying Feature Types ---")

# Separate the features (X) from the target (y). Both keep the index of `df`, which is
# what allows them to be re-joined row-for-row after the transformation.
X = df.drop(columns=['attack_cat'])
y = df['attack_cat']

# Columns treated as categorical by domain knowledge. Some are stored as integers but
# represent distinct categories rather than a continuous scale (e.g., 'sttl', 'dttl').
categorical_features = [
    'proto', 'service', 'state',  # Object types, clearly categorical
    'sttl', 'dttl', 'ct_state_ttl', 'is_sm_ips_ports', 'is_ftp_login',
    'ct_ftp_cmd', 'ct_flw_http_mthd' # Integer types that represent categories
]
# Keep only the listed columns that are actually present, so a column that was already
# removed from the input does not cause a failure further down.
categorical_features = [col for col in categorical_features if col in X.columns]

# Safety net: numerical features are defined below as "everything not categorical", so a
# non-numeric column missing from the list above would otherwise be sent to the scaler
# and fail there. Any remaining column that is neither numeric nor boolean is therefore
# treated as categorical too. When the list above already covers every such column this
# adds nothing and the result is unchanged.
non_numeric_columns = X.select_dtypes(exclude=['number', 'bool']).columns
categorical_features += [col for col in non_numeric_columns if col not in categorical_features]

# Everything that was not classified as categorical is treated as numerical.
numerical_features = [col for col in X.columns if col not in categorical_features]

print(f"Identified {len(numerical_features)} numerical features.")
print(f"Identified {len(categorical_features)} categorical features.")
print("\n")


# **Steps 5 & 6: Encode Categorical Features and Scale Numerical Features**
# Objective: turn every feature into a numeric column suitable for ML models.
# Rationale: a single scikit-learn ColumnTransformer applies the right transformer to each
# column group in one pass, which is less error-prone than transforming subsets by hand.
#   - One-Hot Encoding (Step 5): each category becomes its own 0/1 column, so no ordinal
#     relationship between categories is implied.
#   - Standard Scaling (Step 6): each numerical column is standardized to mean 0 and
#     standard deviation 1, which matters for scale-sensitive models (SVMs, logistic
#     regression, neural networks).
print("--- Steps 5 & 6: Encoding Categorical and Scaling Numerical Features ---")

# Transformer for the numerical columns.
numeric_transformer = StandardScaler()

# Transformer for the categorical columns.
# `handle_unknown='ignore'` encodes a category that was not seen during fitting as all
# zeros instead of raising. `sparse_output=False` returns a dense NumPy array.
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# `remainder='passthrough'` keeps any column not assigned to a transformer (none are
# expected, since the two lists partition X). `verbose_feature_names_out=False` makes the
# transformer report un-prefixed output names ('dur', 'proto_tcp', ...) - the same names
# this notebook previously assembled by hand.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

# Learn the scaling statistics and category sets from X, then transform X.
# NOTE(review): the transformer is fitted on the full dataset. If a train/test split
# happens downstream, the test rows have already influenced these statistics - consider
# fitting on the training portion only.
X_processed = preprocessor.fit_transform(X)

# --- Reconstruct the Processed DataFrame ---
# `fit_transform` returns a bare NumPy array, so the column names are re-attached here.

# Names produced by the one-hot encoder, e.g. 'proto_tcp', 'proto_udp'. When there are no
# categorical columns the encoder is never fitted, so an empty result is used instead of
# querying it.
if categorical_features:
    encoded_cat_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
else:
    encoded_cat_feature_names = np.array([], dtype=object)

# Ask the fitted ColumnTransformer for the complete, ordered list of output columns rather
# than concatenating the two lists manually: this stays aligned with `X_processed` even if
# passthrough columns are present, and it raises if two output names would collide.
all_feature_names = list(preprocessor.get_feature_names_out())

# Rebuild a DataFrame on X's original index so it lines up row-for-row with `y`.
X_processed_df = pd.DataFrame(X_processed, columns=all_feature_names, index=X.index)

print(f"Original number of features: {len(X.columns)}")
print(f"Number of features after one-hot encoding and scaling: {len(X_processed_df.columns)}")
print("\n")


# --- Final Preprocessed DataFrame ---
# Glue the transformed features and the untouched target back together into one frame.
print("--- Finalizing the Preprocessed DataFrame ---")
# Column-wise concatenation (`axis=1`) aligns rows on the index. `X_processed_df` was built
# on X's index and `y` comes from the same frame, so each feature row is paired with its
# own target value.
frames_to_join = [X_processed_df, y]
df_processed = pd.concat(frames_to_join, axis=1)

print("Successfully created the final preprocessed DataFrame.")
print(f"Final DataFrame shape: {df_processed.shape}")

print("\n--- Sample of the Final Preprocessed Data ---")
sample_rows = df_processed.head()
print(sample_rows)

print("\n--- Info of the Final Preprocessed Data ---")
# `.info()` lists every column's dtype and non-null count - the place to check for
# leftover non-numeric feature columns or missing values. The target column 'attack_cat'
# is carried over from `df` as-is (it is not encoded here).
df_processed.info()

--- Initial Data Load ---
Successfully loaded the dataset from: E:\Datasets\UNSW-NB15\Training and Testing Sets\UNSW_NB15_concatenated_dropped.csv
Initial dataset shape: (257673, 42) (rows, columns)


--- Step 1: Handling Duplicate Records ---
Identified and removed 0 fully duplicate rows.
Dataset shape after removing duplicates: (257673, 42)


--- Step 2: Selecting Target Variable ---
Dropped the redundant 'label' column to prevent data leakage.
Dataset shape after dropping 'label': (257673, 41)


--- Step 3: Removing Identifier Column ---
Dropped the 'id' column as it has no predictive value.
Dataset shape after dropping 'id': (257673, 40)


--- Step 4: Identifying Feature Types ---
Identified 32 numerical features.
Identified 7 categorical features.


--- Steps 5 & 6: Encoding Categorical and Scaling Numerical Features ---
Original number of features: 39
Number of features after one-hot encoding and scaling: 82


--- Finalizing the Preprocessed DataFrame ---
Successfully created the

In [4]:
# Write the fully preprocessed frame to a CSV file. The name below is a relative path, so
# the file lands in the kernel's current working directory.
output_csv_name = 'cleaned_data.csv'
# `index=False` leaves the DataFrame index out of the file.
df_processed.to_csv(output_csv_name, index=False)

print(f"Cleaned DataFrame exported to '{output_csv_name}'")

Cleaned DataFrame exported to 'cleaned_data.csv'
