# PCA for NSL-KDD

Dimensionality reduction built on top of the NSL-KDD preprocessing pipeline.

In [None]:
# Imports and plotting defaults
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Shared constants used by the cells below
DATA_DIR = Path('../data')  # directory the CSV tables are read from and written to
RANDOM_STATE = 42  # seed handed to PCA when the final projection is fitted

# Plot appearance: matplotlib style sheet first, then the seaborn context
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_context('talk')

## Load preprocessed datasets
Use the existing normalized tables as the baseline for PCA.

In [None]:
# Read the preprocessed train/test tables and preview their label distribution
train_csv = DATA_DIR / 'preproc_kdd_train.csv'
test_csv = DATA_DIR / 'preproc_kdd_test.csv'
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

print(f"Train shape: {train_df.shape} | Test shape: {test_df.shape}")

# value_counts() orders labels by frequency, so head() shows the five most common
print("Train attack distribution (top 5):")
train_top5 = train_df['attack_type'].value_counts().head()
print(train_top5)

print("Test attack distribution (top 5):")
test_top5 = test_df['attack_type'].value_counts().head()
print(test_top5)

# Last expression of the cell: renders a preview of the first training rows
train_df.head()

## Feature scaling & label prep
Re-apply `StandardScaler` so PCA sees zero-mean/unit-variance features, and create binary attack labels for evaluation.

In [None]:
# Every column except the label is treated as a model feature
feature_cols = [name for name in train_df.columns if name != 'attack_type']

# Learn the scaling statistics from the training split only, then apply the
# same statistics to both splits so the test set never influences them.
scaler = StandardScaler()
scaler.fit(train_df[feature_cols])
X_train = scaler.transform(train_df[feature_cols])
X_test = scaler.transform(test_df[feature_cols])

# Binary target: 1 for any attack label, 0 for 'normal'
y_train = train_df['attack_type'].ne('normal').astype(int)
y_test = test_df['attack_type'].ne('normal').astype(int)

print(f"Scaled feature matrix shape (train/test): {X_train.shape} / {X_test.shape}")
print(f"Attack rate train/test: {y_train.mean():.2%} / {y_test.mean():.2%}")

## PCA variance analysis
Fit a full PCA on the training set to inspect the explained-variance curve, then pick the smallest number of components whose cumulative explained variance reaches at least 95%.

In [None]:
# Fit a PCA with default settings on the standardized training data and
# derive the per-component and cumulative explained-variance ratios
pca_full = PCA()
pca_full.fit(X_train)
explained = pca_full.explained_variance_ratio_
cum_explained = np.cumsum(explained)

# argmax over the boolean mask yields the first index where the running total
# is at least 0.95; adding 1 turns that zero-based index into a component count.
reaches_target = cum_explained >= 0.95
n_components_95 = int(np.argmax(reaches_target) + 1)

# Cumulative variance curve with the 95% target and the selected count marked
component_counts = range(1, len(cum_explained) + 1)
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(component_counts, cum_explained, marker='o')
ax.axhline(0.95, color='r', linestyle='--', label='95% target')
ax.axvline(n_components_95, color='g', linestyle='--', label=f'{n_components_95} comps')
ax.set(
    xlabel='Number of principal components',
    ylabel='Cumulative explained variance',
    title='PCA cumulative variance (train set)',
)
ax.legend()
plt.show()

print(f"Components needed for >=95% variance: {n_components_95}")

## Transform & save PCA datasets
Project both splits into the PCA space and persist the tables following the `PCA-[Name]` convention.

In [None]:
# Fit the final PCA with the selected component count on the training split,
# then project the test split with that same fitted transformer
pca = PCA(n_components=n_components_95, random_state=RANDOM_STATE)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Column names pc01, pc02, ... — one per retained component
n_kept = X_train_pca.shape[1]
pca_columns = [f'pc{i:02d}' for i in range(1, n_kept + 1)]

# Re-attach the original labels; .values passes them positionally so the
# source frames' indexes play no part in the assignment
train_pca_df = pd.DataFrame(X_train_pca, columns=pca_columns).assign(
    attack_type=train_df['attack_type'].values
)
test_pca_df = pd.DataFrame(X_test_pca, columns=pca_columns).assign(
    attack_type=test_df['attack_type'].values
)

# Write both tables next to the input data, without the row index
train_path = DATA_DIR / 'PCA-nsl_kdd_train.csv'
test_path = DATA_DIR / 'PCA-nsl_kdd_test.csv'
for frame, destination in ((train_pca_df, train_path), (test_pca_df, test_path)):
    frame.to_csv(destination, index=False)

print(f"Saved PCA train set -> {train_path} ({train_pca_df.shape})")
print(f"Saved PCA test set  -> {test_path} ({test_pca_df.shape})")

In [None]:
# Loadings matrix of the fitted PCA: one row per retained component, one
# column per input feature the PCA was fitted on.
loadings = pca.components_

# The PCA was fitted on the scaled `feature_cols` matrix, which excludes the
# 'attack_type' label. Using `train_df.columns` here would include that label
# column and shift every feature name after it by one position, so the names
# are built from `feature_cols` to stay aligned with the loadings columns.
feature_names = pd.Index(feature_cols)

In [None]:
# For each principal component, list the (at most) five features whose
# loadings have the largest magnitude, printing the signed loading value.
for pc_number, weights in enumerate(loadings, start=1):
    # argsort is ascending, so reverse it to rank the strongest loadings first
    ranking = np.argsort(np.abs(weights))[::-1]

    print(f"\nMost influential features for PC{pc_number}:")
    # Slicing naturally yields fewer than five entries when fewer features exist
    for position in ranking[:5]:
        print(f"- {feature_names[position]}: {weights[position]:.4f}")