In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import numpy as np
import pandas as pd

# Load the raw intrusion dataset (comma-separated file in the working directory).
intrusion_data = pd.read_csv("cybersecurity_intrusion_data.csv", sep=',')

# Variable separation: these columns are removed from the feature matrix.
cols_to_drop = [
    'attack_detected',  # target, kept separately as y below
    'session_id'        # NOTE(review): presumably a per-row identifier — confirm
]

X = intrusion_data.drop(columns=cols_to_drop)
y = intrusion_data['attack_detected']

# Train/test split: 20% held out, stratified on the target, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Identification of column types (by dtype; any other dtype is not routed to a transformer).
numerical_features = X.select_dtypes(include=np.number).columns
categorical_features = X.select_dtypes(include='object').columns

# Pipeline construction

# Numerical columns: impute, then standardize.
# StandardScaler keeps NaN values in its output, and the PCA applied to this
# matrix later in the notebook rejects NaN. The median imputer is a no-op when
# no value is missing, so results are unchanged on complete data.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values, then encode as integer codes (ordinal).
# NOTE(review): the integer codes are not standardized, unlike the numerical
# columns, so their raw variance may dominate a variance-based method such as
# PCA — confirm this is intended.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),  # missing values become the 'Unknown' category
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))  # codes 0, 1, 2...; unseen categories map to -1
])

# Numerical block first, categorical block second: this order defines the
# column order of the processed matrices.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Application: fit on the training split only, then reuse the fitted
# transformers on the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print("Pré-traitement terminé !")
print(f"Nouvelle taille de X_train : {X_train_processed.shape}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
# Names of the processed columns, in the order the preprocessor emits them
# (numerical block first, then categorical block).
feature_names = list(numerical_features) + list(categorical_features)

# PCA with all components kept.
pca = PCA()
X_pca = pca.fit_transform(X_train_processed)

# Share of variance carried by each component, and its running total.
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

# Visualisation: two panels on the top row, one full-width panel below.
fig = plt.figure(figsize=(20, 12))
gs = fig.add_gridspec(2, 2)

# Panel 1: explained variance per component (bars) and cumulated (dashed line).
ax1 = fig.add_subplot(gs[0, 0])
ax1.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='--', color='blue')
ax1.bar(range(1, len(explained_variance) + 1), explained_variance, alpha=0.5, color='blue', label='Variance par axe')
ax1.axhline(y=0.95, color='r', linestyle=':', label='Treeshold 95%')
ax1.set_title('How much information is conserved ?')
ax1.set_xlabel('Number of components')
ax1.set_ylabel('Cumulated variance')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Panel 2: 2D projection of the training rows on the first two components.
ax2 = fig.add_subplot(gs[0, 1])

pca_df = pd.DataFrame(data=X_pca[:, :2], columns=['PC1', 'PC2'])
# .values drops the shuffled index of y_train so rows align by position with X_pca.
pca_df['Target'] = y_train.values

scatter = sns.scatterplot(
    x='PC1', y='PC2', hue='Target', data=pca_df,
    palette={0: 'tab:blue', 1: 'tab:red'}, alpha=0.6, ax=ax2
)
ax2.set_title(f'Carte des Connexions (PC1: {explained_variance[0]:.1%} | PC2: {explained_variance[1]:.1%})')
ax2.set_xlabel('Main  axis 1 (PC1)')
ax2.set_ylabel('Main axis 2 (PC2)')

# Rename legend entries from the labels seaborn actually produced instead of
# assuming their position, so an extra or reordered entry is never mislabeled.
handles, labels = ax2.get_legend_handles_labels()
class_names = {'0': 'Normal', '1': 'Attack'}
ax2.legend(handles, [class_names.get(label, label) for label in labels], title="Type of Traffic")

# Panel 3: loadings heatmap, spanning the full width of the bottom row.
ax3 = fig.add_subplot(gs[1, :])

# One column per fitted component: pca.components_ has one row per component,
# which is not necessarily the number of input columns.
loadings = pd.DataFrame(
    pca.components_.T,
    columns=[f'PC{i+1}' for i in range(pca.n_components_)],
    index=feature_names
)
# Heatmap: components as rows, original variables as columns.
sns.heatmap(loadings.T, annot=True, cmap='coolwarm', center=0, fmt=".2f", ax=ax3)
ax3.set_title("Which variables create axis? (Red = Positive Influence, Bleu = Négative)")
ax3.set_ylabel("Main Components (Axis)")
ax3.set_xlabel("Original variables")

plt.tight_layout()
plt.show()

In [None]:
# Same analysis as the full-dataset cells, but with fewer columns of the dataset.
# NOTE(review): this cell re-binds the globals created by the full-dataset cells
# (X, y, X_train, X_test, preprocessor, X_train_processed, pca, ...). Re-run
# those cells before reusing the full-dataset objects.

intrusion_data = pd.read_csv("cybersecurity_intrusion_data.csv", sep=',')


# Columns removed from the feature matrix: the target, the session id, and
# four features excluded for this reduced run.
cols_to_drop = [
    'attack_detected',
    'session_id',
    'protocol_type',
    'unusual_time_access',
    'encryption_used',
    'network_packet_size'
]

X = intrusion_data.drop(columns=cols_to_drop)
y = intrusion_data['attack_detected']

# Train/test split: 20% held out, stratified on the target, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Identification of column types (by dtype).
numerical_features = X.select_dtypes(include=np.number).columns
categorical_features = X.select_dtypes(include='object').columns

# Pipeline construction

# Numerical columns: impute, then standardize.
# StandardScaler keeps NaN values in its output and the PCA below rejects NaN.
# The median imputer is a no-op when no value is missing.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values, then encode as integer codes (ordinal).
# NOTE(review): the integer codes are not standardized, so their raw variance
# may dominate the PCA below — confirm this is intended.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),  # missing values become the 'Unknown' category
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))  # codes 0, 1, 2...; unseen categories map to -1
])

# Numerical block first, categorical block second: this order defines the
# column order of the processed matrices.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Application: fit on the training split only, then reuse on the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print("Pré-traitement terminé !")
print(f"Nouvelle taille de X_train : {X_train_processed.shape}")

# Names of the processed columns, in the order the preprocessor emits them.
feature_names = list(numerical_features) + list(categorical_features)

# PCA with all components kept.
pca = PCA()
X_pca = pca.fit_transform(X_train_processed)

# Share of variance carried by each component, and its running total.
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

# Visualisation: two panels on the top row, one full-width panel below.
fig = plt.figure(figsize=(20, 12))
gs = fig.add_gridspec(2, 2)

# Panel 1: explained variance per component (bars) and cumulated (dashed line).
ax1 = fig.add_subplot(gs[0, 0])
ax1.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='--', color='blue')
ax1.bar(range(1, len(explained_variance) + 1), explained_variance, alpha=0.5, color='blue', label='Variance par axe')
ax1.axhline(y=0.95, color='r', linestyle=':', label='Treeshold 95%')
ax1.set_title('How much information is conserved ?')
ax1.set_xlabel('Number of components')
ax1.set_ylabel('Cumulated variance')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Panel 2: 2D projection of the training rows on the first two components.
ax2 = fig.add_subplot(gs[0, 1])

pca_df = pd.DataFrame(data=X_pca[:, :2], columns=['PC1', 'PC2'])
# .values drops the shuffled index of y_train so rows align by position with X_pca.
pca_df['Target'] = y_train.values

scatter = sns.scatterplot(
    x='PC1', y='PC2', hue='Target', data=pca_df,
    palette={0: 'tab:blue', 1: 'tab:red'}, alpha=0.6, ax=ax2
)
ax2.set_title(f'Carte des Connexions (PC1: {explained_variance[0]:.1%} | PC2: {explained_variance[1]:.1%})')
ax2.set_xlabel('Main  axis 1 (PC1)')
ax2.set_ylabel('Main axis 2 (PC2)')

# Rename legend entries from the labels seaborn actually produced instead of
# assuming their position, so an extra or reordered entry is never mislabeled.
handles, labels = ax2.get_legend_handles_labels()
class_names = {'0': 'Normal', '1': 'Attack'}
ax2.legend(handles, [class_names.get(label, label) for label in labels], title="Type of Traffic")

# Panel 3: loadings heatmap, spanning the full width of the bottom row.
ax3 = fig.add_subplot(gs[1, :])

# One column per fitted component: pca.components_ has one row per component,
# which is not necessarily the number of input columns.
loadings = pd.DataFrame(
    pca.components_.T,
    columns=[f'PC{i+1}' for i in range(pca.n_components_)],
    index=feature_names
)
# Heatmap: components as rows, original variables as columns.
sns.heatmap(loadings.T, annot=True, cmap='coolwarm', center=0, fmt=".2f", ax=ax3)
ax3.set_title("Which variables create axis? (Red = Positive Influence, Bleu = Négative)")
ax3.set_ylabel("Main Components (Axis)")
ax3.set_xlabel("Original variables")

plt.tight_layout()
plt.show()

## Analysis of PCA on Reduced dataset
The variance is distributed almost evenly across the 5 remaining components (approx. 20-25% each). This is unusual and indicates that there is no single "dominant" feature. All retained variables contribute equally to the dataset's structure, implying that the information is dense and cannot be easily compressed further without loss.

For the projection, we keep only PC1 and PC2 because they are the two components that explain the most variance. The projection exhibits a distinct "barcode" or vertical striping pattern. This is an artifact of the Ordinal Encoding applied to the categorical feature (likely browser_type), creating discrete columns at x = -1, 0, 1, etc. Within these vertical bands, the "Normal" (blue) and "Attack" (red) points are heavily overlapped. There is no clear linear boundary separating the two classes. However, we observe that the red points (attacks) tend to reach higher extremes on the Y-axis (PC2), suggesting that attacks are characterized by extreme values in behavioral features (like session_duration or failed_logins).

## Analysis of PCA on Full dataset (except the session_id column) 
The complexity is significantly higher here. It takes about 8 out of 9 components to explain 95% of the variance. This fragmentation confirms that every feature in the original dataset carries unique, non-redundant information.
PC1 is almost entirely driven by browser_type (loading of 1.00). PC2 captures behavioral anomalies (failed_logins, session_duration).
Similar to the reduced dataset, we see the "barcode" effect driven by the browser type on PC1. The separation remains non-linear: attacks are hidden inside the traffic of each browser type, distinguishable mainly by their vertical spread.

## Comparison of the two 
Comparing the two results highlights the risks of manual feature selection. In the Reduced Dataset, the PCA showed a simplified structure, but the Full Dataset PCA revealed that the variables we removed (protocol_type, encryption_used) actually constituted independent axes of variance (PC8 and PC9). By removing them, we were effectively discarding entire dimensions of the problem, potentially blinding the model to specific attack vectors relying on protocols or encryption types.

## Conclusion 
The visualization of the Full Dataset demonstrates that the problem is highly non-linear. A simple line cannot separate the blue and red points clustered within the vertical bands. The fragmented variance proves that every column is necessary to capture the full picture of network traffic.
The strong overlap of classes confirms that linear models (like the baseline Logistic Regression) will underperform. The visual evidence strongly supports the move to non-linear algorithms like Random Forest (which can isolate the specific "bands").