# PCA for NSL-KDD

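This notebook is the dimensionality-reduction stage of the pipeline: it projects the preprocessed NSL-KDD features onto principal components, and its output feeds the later anomaly-detection steps.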

## Imports and plotting defaults

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA

# Plot appearance. The style sheet is applied first and the seaborn context
# second, so wherever both touch the same rcParams the context's values win.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_context('talk')

# Directory holding the input and output CSVs; the relative path is resolved
# against the notebook's working directory.
DATA_DIR = Path('../data')
# Seed handed to estimators that accept a random_state.
RANDOM_STATE = 42

## Load preprocessed datasets
Use the normalized tables from the preprocessing notebook as the starting point.

In [None]:
# Read the preprocessed train and test tables from DATA_DIR.
train_df, test_df = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in ('preproc_kdd_train.csv', 'preproc_kdd_test.csv')
)

# Quick sanity report: table shapes, then the five most frequent attack types
# in each split.
print(f'Train: {train_df.shape} | Test: {test_df.shape}')
print('\nAttack distribution (top 5):')
for split_tag, split_df in (('TRAIN:', train_df), ('TEST: ', test_df)):
    top_counts = split_df['attack_type'].value_counts().head()
    print(split_tag, top_counts.to_dict())

## Extract features and labels
Separate the feature columns from the label column, and convert `attack_type` to a binary `attack_flag` (0 = normal, 1 = attack).

In [None]:
# Extract features (every column except attack_type).
# The training table defines the feature set and its order; the test table is
# indexed by those same names so both matrices are column-aligned. Selecting by
# name raises KeyError if the test table lacks a feature, instead of silently
# handing PCA columns that mean different things in the two matrices.
feature_cols = train_df.columns.drop('attack_type')
# dtype=float yields a plain numeric matrix and fails here, rather than inside
# PCA, if a non-numeric column is still present.
X_train = train_df[feature_cols].to_numpy(dtype=float)
X_test = test_df[feature_cols].to_numpy(dtype=float)

# Convert attack_type to binary: 0=normal, 1=attack
y_train = (train_df['attack_type'] != 'normal').astype(int)
y_test = (test_df['attack_type'] != 'normal').astype(int)

print(f'Features: {X_train.shape} (train), {X_test.shape} (test)')
print(f'Attack rate: {y_train.mean():.2%} (train), {y_test.mean():.2%} (test)')

## Determine optimal components
Fit PCA on the training features with all components kept, then find the smallest number of components whose cumulative explained variance reaches 95%.

In [None]:
# Fraction of total variance the reduced representation must retain. Defined
# once so the computation, the reference line and the printed summary agree.
VARIANCE_TARGET = 0.95

# Fit PCA with every component kept and accumulate the explained-variance ratios.
pca_full = PCA().fit(X_train)
cum_var = np.cumsum(pca_full.explained_variance_ratio_)

# Smallest component count whose cumulative variance reaches the target.
# argmax on an all-False mask returns 0, which would be reported as
# "1 component"; in that case fall back to keeping every component.
# The name n_components_95 is kept because later cells reference it.
reached_target = cum_var >= VARIANCE_TARGET
if reached_target.any():
    n_components_95 = int(np.argmax(reached_target) + 1)
else:
    n_components_95 = len(cum_var)

# Plot variance curve with the target level and the chosen component count.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(range(1, len(cum_var) + 1), cum_var, marker='o')
ax.axhline(VARIANCE_TARGET, color='r', linestyle='--', label=f'{VARIANCE_TARGET:.0%} target')
ax.axvline(n_components_95, color='g', linestyle='--', label=f'{n_components_95} components')
ax.set_xlabel('Number of components')
ax.set_ylabel('Cumulative variance explained')
ax.set_title('PCA Variance Analysis')
ax.legend()
plt.show()

print(f'Components for {VARIANCE_TARGET:.0%} variance: {n_components_95}')

## Apply PCA and save datasets
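Fit PCA on the training set, project both the training and test sets onto the selected components, and save each as a CSV with columns `[pc01, pc02, ..., pcN, attack_flag]`.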

In [None]:
# Fit PCA with optimal components and transform both datasets.
# The projection is learned from the training matrix only; the test matrix is
# mapped through that same fitted model, so no test data enters the fit.
# NOTE(review): random_state presumably only has an effect when scikit-learn
# picks a randomized SVD solver for this data size — confirm against the PCA docs.
pca = PCA(n_components=n_components_95, random_state=RANDOM_STATE)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Helper function to create PCA dataframe with attack_flag
def create_pca_df(X_pca: np.ndarray, y) -> pd.DataFrame:
    """Build a DataFrame of principal-component scores plus the binary label.

    Args:
        X_pca: 2-D array of component scores, one row per sample and one
            column per component.
        y: Labels, one per row of ``X_pca``. May be a pandas Series, a NumPy
            array, or a plain sequence.

    Returns:
        A DataFrame whose columns are ``pc01``, ``pc02``, ... (1-based,
        zero-padded to two digits) followed by ``attack_flag``.

    Raises:
        ValueError: If ``y`` does not have one entry per row of ``X_pca``
            (raised by pandas on assignment).
    """
    cols = [f'pc{i:02d}' for i in range(1, X_pca.shape[1] + 1)]
    df = pd.DataFrame(X_pca, columns=cols)
    # Assign a bare array, not a Series: labels are matched to rows by position
    # even when y carries a non-default index. np.asarray also accepts lists
    # and arrays, which have no .to_numpy().
    df['attack_flag'] = np.asarray(y)
    return df

# Name each output path once so the write and the report cannot drift apart.
train_out = DATA_DIR / 'PCA-nsl_kdd_train.csv'
test_out = DATA_DIR / 'PCA-nsl_kdd_test.csv'

# Attach the binary labels to the projected features for each split.
train_pca_df = create_pca_df(X_train_pca, y_train)
test_pca_df = create_pca_df(X_test_pca, y_test)

# Write both splits without the row index, then report what was written.
train_pca_df.to_csv(train_out, index=False)
test_pca_df.to_csv(test_out, index=False)

print(f'Saved train: {train_pca_df.shape} -> {train_out}')
print(f'Saved test:  {test_pca_df.shape} -> {test_out}')
print(f'Columns: {list(train_pca_df.columns)}')