# Feature Engineering for Network Intrusion Detection

This notebook performs feature engineering on the BCCC-CSE-CIC-IDS2018 dataset.

## Objectives:
1. Load and preprocess raw network flow data
2. Handle missing values and outliers
3. Create derived features
4. Encode categorical variables
5. Scale numerical features
6. Handle class imbalance
7. Save processed features for model training

In [1]:
# Import libraries
import os
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Set plotting style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

%matplotlib inline

## 1. Load Raw Data

In [None]:
# Load data
project_root = Path().resolve()
data_path = project_root / 'data' / 'raw' / 'friday_02_03_2018_combined_sample.csv'

df = pd.read_csv(data_path)

print(f"Loaded {len(df):,} records")
print(f"Number of features: {len(df.columns)}")
print(f"\nColumns: {list(df.columns)}")

  df = pd.read_csv(data_path)


  df = pd.read_csv(data_path)


KeyboardInterrupt: 

In [None]:
# Check data types and missing values
print("Data Info:")
print(df.info())
print("\nMissing values:")
print(df.isnull().sum().sort_values(ascending=False).head(10))

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 289799 entries, 0 to 289798
Columns: 323 entries, flow_id to label
dtypes: float64(259), int64(56), object(8)
memory usage: 714.2+ MB
None

Missing values:
payload_bytes_cov                  108358
fwd_payload_bytes_cov               75401
bwd_payload_bytes_cov               59023
bwd_packets_IAT_skewness            49868
bwd_packets_IAT_cov                 49868
fwd_packets_IAT_cov                 34082
fwd_packets_IAT_skewness            34027
cov_payload_bytes_delta_len         28349
cov_fwd_payload_bytes_delta_len     18905
cov_bwd_payload_bytes_delta_len     15945
dtype: int64


## 2. Data Cleaning

In [None]:
# Separate features and target
# Identify the label column (could be 'label', 'Label', etc.)
label_col = "label"

print(f"Using '{label_col}' as target variable")

# Separate features and labels
X = df.drop(columns=[label_col])
y = df[label_col]

print(f"\nFeature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nClass distribution:\n{y.value_counts()}")

Using 'label' as target variable

Feature matrix shape: (289799, 322)
Target shape: (289799,)

Class distribution:
label
Benign    278864
Bot        10935
Name: count, dtype: int64


In [None]:
# Handle missing values
print("Handling missing values...")

# Check for missing values
missing_counts = X.isnull().sum()
missing_cols = missing_counts[missing_counts > 0]

if len(missing_cols) > 0:
    print(f"\nColumns with missing values:\n{missing_cols}")
    
    # Strategy: Fill numeric columns with median, drop columns with >50% missing
    threshold = 0.5
    high_missing = missing_cols[missing_cols / len(X) > threshold]
    
    if len(high_missing) > 0:
        print(f"\nDropping columns with >{threshold*100}% missing: {list(high_missing.index)}")
        X = X.drop(columns=high_missing.index)
    
    # Fill remaining missing values with median for numeric columns
    numeric_cols = X.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if X[col].isnull().any():
            X[col] = X[col].fillna(X[col].median())
    
    print(f"\nRemaining missing values: {X.isnull().sum().sum()}")
else:
    print("No missing values found!")

print(f"\nFinal feature matrix shape: {X.shape}")

Handling missing values...

Columns with missing values:
payload_bytes_cov                  108358
fwd_payload_bytes_cov               75401
bwd_payload_bytes_cov               59023
packets_IAT_cov                       437
fwd_packets_IAT_skewness            34027
fwd_packets_IAT_cov                 34082
bwd_packets_IAT_skewness            49868
bwd_packets_IAT_cov                 49868
cov_packets_delta_time                437
cov_bwd_packets_delta_time              1
cov_fwd_packets_delta_time             55
cov_packets_delta_len                5696
cov_bwd_packets_delta_len            1398
cov_fwd_packets_delta_len            3102
cov_header_bytes_delta_len           7780
cov_bwd_header_bytes_delta_len       1464
cov_fwd_header_bytes_delta_len       4140
cov_payload_bytes_delta_len         28349
cov_bwd_payload_bytes_delta_len     15945
cov_fwd_payload_bytes_delta_len     18905
dtype: int64

Remaining missing values: 0

Final feature matrix shape: (289799, 322)


In [None]:
# Handle infinite values
print("Checking for infinite values...")

numeric_cols = X.select_dtypes(include=[np.number]).columns
inf_counts = {}

for col in numeric_cols:
    inf_count = np.isinf(X[col]).sum()
    if inf_count > 0:
        inf_counts[col] = inf_count
        # Replace inf with NaN, then fill with column median
        X[col] = X[col].replace([np.inf, -np.inf], np.nan)
        X[col] = X[col].fillna(X[col].median())

if inf_counts:
    print(f"\nReplaced infinite values in {len(inf_counts)} columns")
    for col, count in list(inf_counts.items())[:10]:
        print(f"  {col}: {count} infinite values")
else:
    print("No infinite values found!")

Checking for infinite values...

Replaced infinite values in 9 columns
  cov_packets_delta_len: 998 infinite values
  cov_bwd_packets_delta_len: 1093 infinite values
  cov_fwd_packets_delta_len: 141 infinite values
  cov_header_bytes_delta_len: 488 infinite values
  cov_bwd_header_bytes_delta_len: 1102 infinite values
  cov_fwd_header_bytes_delta_len: 63 infinite values
  cov_payload_bytes_delta_len: 204435 infinite values
  cov_bwd_payload_bytes_delta_len: 94260 infinite values
  cov_fwd_payload_bytes_delta_len: 183487 infinite values


## 3. Feature Engineering

In [None]:
print("="*80)
print("STEP 1: DROP TIMESTAMP COLUMN")
print("="*80)

# Store original columns for comparison at the end
original_columns = set(X.columns)

# Drop timestamp - not using temporal features for now
X = X.drop(columns=['timestamp'])
print(f"  ‚úì Dropped timestamp column")

print("="*80 + "\n")

# ============================================================
print("="*80)
print("STEP 2: PORT FEATURE ENGINEERING")
print("="*80)

print("Encoding destination port (target service)...")
    
# Well-known ports (these are the ones attackers often target)
X['dst_port_http'] = (X['dst_port'].isin([80, 8080, 8000, 8888])).astype(int)
X['dst_port_https'] = (X['dst_port'] == 443).astype(int)
X['dst_port_ssh'] = (X['dst_port'] == 22).astype(int)
X['dst_port_ftp'] = (X['dst_port'].isin([20, 21])).astype(int)
X['dst_port_smtp'] = (X['dst_port'].isin([25, 587, 465])).astype(int)
X['dst_port_dns'] = (X['dst_port'] == 53).astype(int)
X['dst_port_telnet'] = (X['dst_port'] == 23).astype(int)
X['dst_port_smb'] = (X['dst_port'].isin([139, 445])).astype(int)
X['dst_port_rdp'] = (X['dst_port'] == 3389).astype(int)
X['dst_port_mysql'] = (X['dst_port'] == 3306).astype(int)
X['dst_port_postgres'] = (X['dst_port'] == 5432).astype(int)

# Port range categories - create as separate binary features (one-hot encoding)
print("\nCreating port range one-hot features...")
X['dst_port_cat_well_known'] = (X['dst_port'] < 1024).astype(int)
X['dst_port_cat_registered'] = ((X['dst_port'] >= 1024) & (X['dst_port'] < 49152)).astype(int)
X['dst_port_cat_ephemeral'] = (X['dst_port'] >= 49152).astype(int)

# Print the counts for each category
print(f"  dst_port_cat_well_known: {X['dst_port_cat_well_known'].sum()} flows")
print(f"  dst_port_cat_registered: {X['dst_port_cat_registered'].sum()} flows")
print(f"  dst_port_cat_ephemeral: {X['dst_port_cat_ephemeral'].sum()} flows")

# Drop original dst_port
X = X.drop(columns=['dst_port'])

print(f"\n‚úì Created destination port features:")
print(f"  - Binary flags for common services: http, https, ssh, ftp, smtp, dns, etc.")
print(f"  - Port range one-hot features: well_known, registered, ephemeral")
print(f"  ‚úì Dropped original dst_port column")

# List all dst_port columns created
dst_port_cols = [col for col in X.columns if col.startswith('dst_port')]
print(f"\n‚úì All dst_port columns created ({len(dst_port_cols)}):")
for col in sorted(dst_port_cols):
    print(f"  - {col}")

print("\nHandling source port...")
    
# Source port is usually ephemeral (random), less predictive
# But we can create some useful features
X['src_port_is_privileged'] = (X['src_port'] < 1024).astype(int)
X['src_port_is_ephemeral'] = (X['src_port'] >= 49152).astype(int)
X = X.drop(columns=['src_port'])

print(f"‚úì Created source port features:")
print(f"  - src_port_is_privileged (<1024)")
print(f"  - src_port_is_ephemeral (>=49152)")
print(f"  ‚úì Dropped original src_port column")

print("="*80 + "\n")

# ============================================================
print("="*80)
print("STEP 3: PROTOCOL ENCODING")
print("="*80)

print(f"Protocol values: {X['protocol'].unique()}")
    
# Common protocols
X['protocol_tcp'] = (X['protocol'].str.upper() == 'TCP').astype(int)
X['protocol_udp'] = (X['protocol'].str.upper() == 'UDP').astype(int)
X['protocol_icmp'] = (X['protocol'].str.upper() == 'ICMP').astype(int)

# Drop original protocol column
X = X.drop(columns=['protocol'])

print(f"‚úì Created protocol features:")
print(f"  - protocol_tcp, protocol_udp, protocol_icmp")
print(f"  ‚úì Dropped original protocol column")

print("="*80 + "\n")

# ============================================================

print("="*80)
print("STEP 4: DROP IDENTIFIER COLUMNS")
print("="*80)

identifier_cols = ['flow_id', 'src_ip', 'dst_ip']
to_drop = [col for col in identifier_cols if col in X.columns]

if to_drop:
    print(f"Dropping identifier columns: {to_drop}")
    X = X.drop(columns=to_drop)
else:
    print("No identifier columns to drop")


print("="*80 + "\n")

# ============================================================

print("="*80)
print("STEP 5: HANDLE MIXED-TYPE NUMERIC COLUMNS")
print("="*80)

# Handle all columns that should be numeric but contain text values
# This includes delta_start and any other similar columns
print("Converting all mixed-type columns to numeric...")

# Get all non-numeric columns (excluding ones we just created which are binary)
non_numeric_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

if len(non_numeric_cols) > 0:
    print(f"\nFound {len(non_numeric_cols)} non-numeric columns to convert:")
    
    for col in non_numeric_cols:
        print(f"\nProcessing '{col}'...")
        
        # Count non-numeric values before conversion
        non_numeric_mask = pd.to_numeric(X[col], errors='coerce').isna() & X[col].notna()
        non_numeric_count = non_numeric_mask.sum()
        
        if non_numeric_count > 0:
            print(f"  Found {non_numeric_count} non-numeric values")
            sample_values = X.loc[non_numeric_mask, col].unique()[:3]
            print(f"  Sample values: {sample_values}")
        
        # Convert to numeric, coercing errors to NaN
        X[col] = pd.to_numeric(X[col], errors='coerce')
        
        # Fill NaN values with median of valid numeric values
        if X[col].notna().any():
            median_val = X[col].median()
            X[col] = X[col].fillna(median_val)
            if non_numeric_count > 0:
                print(f"  ‚úì Converted to numeric, filled {non_numeric_count} values with median: {median_val:.6f}")
            else:
                print(f"  ‚úì Already numeric, filled NaN with median: {median_val:.6f}")
        else:
            # If all values were non-numeric, fill with 0
            X[col] = X[col].fillna(0)
            print(f"  ‚úì Converted to numeric, filled all NaN values with 0")
else:
    print("All columns are already numeric!")
print(f"\n‚úì Final feature count: {X.shape[1]}")
print("="*80 + "\n")

# ============================================================

print("="*80)
print("FEATURE ENGINEERING SUMMARY")
print("="*80)

# Count feature types
port_features = [col for col in X.columns if 'port' in col.lower()]
protocol_features = [col for col in X.columns if 'protocol' in col.lower()]
original_features = [col for col in X.columns if col not in port_features + protocol_features]

print(f"Total features: {X.shape[1]}")
print(f"  - Port features: {len(port_features)}")
print(f"  - Protocol features: {len(protocol_features)}")
print(f"  - Original/derived features: {len(original_features)}")
print("="*80 + "\n")

# ============================================================
print("="*80)
print("COLUMN CHANGES SUMMARY")
print("="*80)

# Calculate which columns were added and removed
final_columns = set(X.columns)
added_columns = sorted(final_columns - original_columns)
removed_columns = sorted(original_columns - final_columns)

print(f"\nüìä COLUMNS REMOVED ({len(removed_columns)}):")
if removed_columns:
    for col in removed_columns:
        print(f"  ‚ùå {col}")
else:
    print("  (none)")

print(f"\nüìä COLUMNS ADDED ({len(added_columns)}):")
if added_columns:
    for col in added_columns:
        print(f"  ‚úÖ {col}")
else:
    print("  (none)")

print(f"\nüìä NET CHANGE: {len(final_columns)} - {len(original_columns)} = {len(final_columns) - len(original_columns):+d} columns")
print("="*80 + "\n")


STEP 1: DROP TIMESTAMP COLUMN
  ‚úì Dropped timestamp column

STEP 2: PORT FEATURE ENGINEERING
Encoding destination port (target service)...

Creating port range one-hot features...
  dst_port_cat_well_known: 132938 flows
  dst_port_cat_registered: 115616 flows
  dst_port_cat_ephemeral: 41245 flows

‚úì Created destination port features:
  - Binary flags for common services: http, https, ssh, ftp, smtp, dns, etc.
  - Port range one-hot features: well_known, registered, ephemeral
  ‚úì Dropped original dst_port column

‚úì All dst_port columns created (14):
  - dst_port_cat_ephemeral
  - dst_port_cat_registered
  - dst_port_cat_well_known
  - dst_port_dns
  - dst_port_ftp
  - dst_port_http
  - dst_port_https
  - dst_port_mysql
  - dst_port_postgres
  - dst_port_rdp
  - dst_port_smb
  - dst_port_smtp
  - dst_port_ssh
  - dst_port_telnet

Handling source port...
‚úì Created source port features:
  - src_port_is_privileged (<1024)
  - src_port_is_ephemeral (>=49152)
  ‚úì Dropped original 

## 4. Feature Scaling

In [None]:
# Encode target labels
print("Encoding target labels...")
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(f"\nLabel mapping:")
for i, label in enumerate(label_encoder.classes_):
    print(f"  {label}: {i}")

# Convert to binary if needed (benign vs attack)
y_binary = (y != 'Benign').astype(int)
print(f"\nBinary distribution (0=Benign, 1=Attack):")
print(pd.Series(y_binary).value_counts())

Encoding target labels...

Label mapping:
  Benign: 0
  Bot: 1

Binary distribution (0=Benign, 1=Attack):
label
0    278864
1     10935
Name: count, dtype: int64


In [None]:
# Split data before scaling to prevent data leakage
print("Splitting data into train/test sets...")

X_train, X_test, y_train, y_test = train_test_split(
    X, y_binary, test_size=0.2, random_state=42, stratify=y_binary
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"\nTrain class distribution:\n{pd.Series(y_train).value_counts()}")
print(f"\nTest class distribution:\n{pd.Series(y_test).value_counts()}")

Splitting data into train/test sets...
Training set: (231839, 334)
Test set: (57960, 334)

Train class distribution:
label
0    223091
1      8748
Name: count, dtype: int64

Test class distribution:
label
0    55773
1     2187
Name: count, dtype: int64


In [None]:
# ============================================================
# SMART FEATURE SCALING
# ============================================================
# Don't scale binary/categorical features (already 0/1)
# Only scale continuous features
# ============================================================

print("="*80)
print("IDENTIFYING FEATURES TO SCALE")
print("="*80)

# Automatically detect binary/one-hot features by checking their values
# Binary features only contain 0 and 1 (and possibly NaN)
no_scale_features = []
for col in X_train.columns:
    unique_values = X_train[col].dropna().unique()
    # Check if column only contains 0 and 1 (binary/one-hot encoded)
    if len(unique_values) <= 2 and set(unique_values).issubset({0, 1, 0.0, 1.0}):
        no_scale_features.append(col)

# Features TO scale (continuous variables)
scale_features = [col for col in X_train.columns if col not in no_scale_features]

print(f"Features TO SCALE (continuous): {len(scale_features)}")
print(f"Features NOT to scale (binary/categorical): {len(no_scale_features)}")

if len(scale_features) > 0:
    print(f"\nSample continuous features to scale:")
    for feat in scale_features[:10]:
        print(f"  - {feat}")

if len(no_scale_features) > 0:
    print(f"\nBinary/categorical features (keeping as 0/1):")
    for feat in no_scale_features[:15]:
        print(f"  - {feat}")
    if len(no_scale_features) > 15:
        print(f"  ... and {len(no_scale_features) - 15} more")

print("="*80 + "\n")

# ============================================================
print("="*80)
print("APPLYING STANDARDSCALER")
print("="*80)

# Initialize scaler
scaler = StandardScaler()

# Create copies to avoid modifying original
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Scale ONLY the continuous features
if len(scale_features) > 0:
    print(f"Scaling {len(scale_features)} continuous features...")
    X_train_scaled[scale_features] = scaler.fit_transform(X_train[scale_features])
    X_test_scaled[scale_features] = scaler.transform(X_test[scale_features])
    print("‚úì Scaling complete")
else:
    print("‚ö†Ô∏è  No continuous features to scale")

# Binary features remain unchanged (already 0/1)
if len(no_scale_features) > 0:
    print(f"‚úì {len(no_scale_features)} binary/categorical features kept as 0/1")

print(f"\nScaled training set: {X_train_scaled.shape}")
print(f"Scaled test set: {X_test_scaled.shape}")

print("="*80 + "\n")

# ============================================================
print("="*80)
print("VERIFICATION: Sample of scaled vs unscaled features")
print("="*80)

# Show example of a scaled continuous feature
if len(scale_features) > 0:
    example_continuous = scale_features[0]
    print(f"\nContinuous feature '{example_continuous}':")
    print(f"  Original range: [{X_train[example_continuous].min():.3f}, {X_train[example_continuous].max():.3f}]")
    print(f"  Scaled range: [{X_train_scaled[example_continuous].min():.3f}, {X_train_scaled[example_continuous].max():.3f}]")
    print(f"  Scaled mean: {X_train_scaled[example_continuous].mean():.3f} (should be ~0)")
    print(f"  Scaled std: {X_train_scaled[example_continuous].std():.3f} (should be ~1)")

# Show example of a binary feature
if len(no_scale_features) > 0:
    example_binary = no_scale_features[0]
    print(f"\nBinary feature '{example_binary}':")
    print(f"  Original values: {X_train[example_binary].unique()}")
    print(f"  Scaled values: {X_train_scaled[example_binary].unique()}")
    print(f"  ‚úì Binary features remain unchanged!")

print("="*80 + "\n")

# Show sample of final scaled data
print("Sample of scaled features (first 5 rows, first 10 columns):")
print(X_train_scaled.iloc[:5, :10])


IDENTIFYING FEATURES TO SCALE
Features TO SCALE (continuous): 304
Features NOT to scale (binary/categorical): 30

Sample continuous features to scale:
  - duration
  - packets_count
  - fwd_packets_count
  - bwd_packets_count
  - total_payload_bytes
  - fwd_total_payload_bytes
  - bwd_total_payload_bytes
  - payload_bytes_max
  - payload_bytes_min
  - payload_bytes_mean

Binary/categorical features (keeping as 0/1):
  - urg_flag_counts
  - rst_flag_counts
  - fwd_urg_flag_counts
  - fwd_rst_flag_counts
  - bwd_urg_flag_counts
  - bwd_rst_flag_counts
  - urg_flag_percentage_in_total
  - fwd_urg_flag_percentage_in_total
  - bwd_urg_flag_percentage_in_total
  - fwd_urg_flag_percentage_in_fwd_packets
  - bwd_urg_flag_percentage_in_bwd_packets
  - dst_port_http
  - dst_port_https
  - dst_port_ssh
  - dst_port_ftp
  ... and 15 more

APPLYING STANDARDSCALER
Scaling 304 continuous features...
‚úì Scaling complete
‚úì 30 binary/categorical features kept as 0/1

Scaled training set: (231839, 334

In [None]:
# ============================================================
# FEATURE VALIDATION
# ============================================================
# Quick sanity checks on the engineered features
# ============================================================

print("="*80)
print("VALIDATING ENGINEERED FEATURES")
print("="*80)

# Check for any NaN or infinite values
nan_count = X_train_scaled.isna().sum().sum()
inf_count = np.isinf(X_train_scaled.select_dtypes(include=[np.number])).sum().sum()

print(f"\n‚úì NaN values: {nan_count}")
print(f"‚úì Infinite values: {inf_count}")

if nan_count > 0 or inf_count > 0:
    print("‚ö†Ô∏è  WARNING: Found NaN or infinite values!")
else:
    print("‚úì All features are valid!")

# Show distribution of some key features
print("\n" + "="*80)
print("SAMPLE FEATURE DISTRIBUTIONS")
print("="*80)

# Port features
port_features = [col for col in X_train_scaled.columns if 'dst_port' in col.lower() and any(s in col for s in ['http', 'ssh', 'https'])]
if port_features:
    print(f"\nPort feature distributions:")
    for feat in port_features[:3]:
        print(f"\n{feat}:")
        print(X_train_scaled[feat].value_counts())

print("\n" + "="*80)
print("‚úì FEATURE ENGINEERING COMPLETE!")
print("="*80)
print(f"\nFinal dataset ready for training:")
print(f"  Training samples: {X_train_scaled.shape[0]:,}")
print(f"  Test samples: {X_test_scaled.shape[0]:,}")
print(f"  Total features: {X_train_scaled.shape[1]}")
print(f"  Class distribution (train): {pd.Series(y_train).value_counts().to_dict()}")
print("="*80)


VALIDATING ENGINEERED FEATURES

‚úì NaN values: 0
‚úì Infinite values: 0
‚úì All features are valid!

SAMPLE FEATURE DISTRIBUTIONS

Port feature distributions:

dst_port_http:
dst_port_http
0    198839
1     33000
Name: count, dtype: int64

dst_port_https:
dst_port_https
0    180625
1     51214
Name: count, dtype: int64

dst_port_ssh:
dst_port_ssh
0    230976
1       863
Name: count, dtype: int64

‚úì FEATURE ENGINEERING COMPLETE!

Final dataset ready for training:
  Training samples: 231,839
  Test samples: 57,960
  Total features: 334
  Class distribution (train): {0: 223091, 1: 8748}


## 5. Save Processed Data

In [None]:
# Save processed data
print("Saving processed data...")

processed_dir = project_root / 'data' / 'processed'
processed_dir.mkdir(parents=True, exist_ok=True)

# Save train/test splits
X_train_scaled.to_csv(processed_dir / 'X_train.csv', index=False)
X_test_scaled.to_csv(processed_dir / 'X_test.csv', index=False)
pd.Series(y_train, name='label').to_csv(processed_dir / 'y_train.csv', index=False)
pd.Series(y_test, name='label').to_csv(processed_dir / 'y_test.csv', index=False)

# Save scaler and label encoder for later use
import joblib
joblib.dump(scaler, processed_dir / 'scaler.pkl')
joblib.dump(label_encoder, processed_dir / 'label_encoder.pkl')

print(f"\nProcessed data saved to: {processed_dir}")
print("Files created:")
print("  - X_train.csv")
print("  - X_test.csv")
print("  - y_train.csv")
print("  - y_test.csv")
print("  - scaler.pkl")
print("  - label_encoder.pkl")


Saving processed data...

Processed data saved to: /Users/matthewweaver/Repositories/nidstream/data/processed
Files created:
  - X_train.csv
  - X_test.csv
  - y_train.csv
  - y_test.csv
  - scaler.pkl
  - label_encoder.pkl


## 6. Prepare SMOTE Data

Apply SMOTE to balance classes for model training comparison.

In [None]:
# Apply SMOTE to create balanced training data
print("="*80)
print("PREPARING SMOTE DATA")
print("="*80)

from imblearn.over_sampling import SMOTE

print(f"\nBefore SMOTE:")
print(f"  Training samples: {X_train_scaled.shape[0]:,}")
print(f"  Benign: {(y_train == 0).sum():,}")
print(f"  Attack: {(y_train == 1).sum():,}")
print(f"  Imbalance ratio: {(y_train == 0).sum()/(y_train == 1).sum():.2f}:1")

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

print(f"\nAfter SMOTE:")
print(f"  Training samples: {X_train_smote.shape[0]:,}")
print(f"  Benign: {(y_train_smote == 0).sum():,}")
print(f"  Attack: {(y_train_smote == 1).sum():,}")
print(f"  ‚úÖ Classes balanced 1:1")

# Save SMOTE data
X_train_smote.to_csv(processed_dir / 'X_train_smote.csv', index=False)
pd.Series(y_train_smote, name='label').to_csv(processed_dir / 'y_train_smote.csv', index=False)

print(f"\n‚úÖ SMOTE data saved:")
print(f"  - X_train_smote.csv")
print(f"  - y_train_smote.csv")
print("="*80)

PREPARING SMOTE DATA

Before SMOTE:
  Training samples: 231,839
  Benign: 223,091
  Attack: 8,748
  Imbalance ratio: 25.50:1

After SMOTE:
  Training samples: 446,182
  Benign: 223,091
  Attack: 223,091
  ‚úÖ Classes balanced 1:1

‚úÖ SMOTE data saved:
  - X_train_smote.csv
  - y_train_smote.csv


## 7. Summary Statistics

In [None]:
# Summary of feature engineering process
print("=" * 60)
print("FEATURE ENGINEERING SUMMARY")
print("=" * 60)
print(f"\nOriginal dataset: {len(df):,} samples, {len(df.columns)} features")
print(f"Final feature count: {X_train_scaled.shape[1]}")
print(f"\nTraining set: {len(X_train_scaled):,} samples")
print(f"Test set: {len(X_test_scaled):,} samples")
print(f"\nClass distribution (train):")
print(f"  Benign: {(y_train == 0).sum():,} ({(y_train == 0).sum() / len(y_train) * 100:.1f}%)")
print(f"  Attack: {(y_train == 1).sum():,} ({(y_train == 1).sum() / len(y_train) * 100:.1f}%)")
print(f"\nData ready for model training!")
print("=" * 60)

FEATURE ENGINEERING SUMMARY

Original dataset: 289,799 samples, 323 features
Final feature count: 334

Training set: 231,839 samples
Test set: 57,960 samples

Class distribution (train):
  Benign: 223,091 (96.2%)
  Attack: 8,748 (3.8%)

Data ready for model training!


## Next Steps

The processed data is now ready for:
1. Model training
2. Hyperparameter tuning
3. Model evaluation and comparison

**Note:** You may want to:
- Perform feature selection to reduce dimensionality
- Experiment with different scaling methods
- Create more domain-specific features based on network traffic analysis