# Prediction using different models

## Pre-Processing

In this section we will apply different pre-processing methods to see which one works best

### One-Hot Encoding per column `data_binary`

#### Filtered version `data_binary_filtered`

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

According to the correlation matrix from the previous data analysis, let's remove these columns: protocol_type, unusual_time_access, encryption_used, network_packet_size. The code below also drops session_id.

In [2]:
# Columns excluded from the filtered dataset.
cols_to_remove = [
    "protocol_type",
    "network_packet_size",
    "encryption_used",
    "unusual_time_access",
    "session_id",
]

# Load the raw dataset, then derive the reduced frame; `data` itself is left
# untouched because the unfiltered section reuses it.
data = pd.read_csv("cybersecurity_intrusion_data.csv")
data_clean = data.drop(columns=cols_to_remove)

# Object-dtype columns that remain are the ones to one-hot encode.
categorical_cols = data_clean.select_dtypes(include=["object"]).columns

In [3]:
# Dense one-hot encoder: no category is dropped (drop=None) and categories
# not seen during fit are ignored instead of raising.
onehot_encoder = OneHotEncoder(drop=None, handle_unknown="ignore", sparse_output=False)

encoded_array = onehot_encoder.fit_transform(data_clean[categorical_cols])
encoded_df = pd.DataFrame(
    data=encoded_array,
    index=data_clean.index,
    columns=onehot_encoder.get_feature_names_out(categorical_cols),
)

# Swap the original categorical columns for their indicator columns.
numeric_part = data_clean.drop(columns=categorical_cols)
data_binary_filtered = pd.concat([numeric_part, encoded_df], axis=1)

# Report how the shape evolves through the pre-processing steps.
for label, frame in (
    ("Initial data shape:", data),
    ("Cleaned (removed cols) shape:", data_clean),
    ("Final encoded shape:", data_binary_filtered),
):
    print(label, frame.shape)

Initial data shape: (9537, 11)
Cleaned (removed cols) shape: (9537, 6)
Final encoded shape: (9537, 10)


In [4]:
# Preview the first five rows of the filtered, one-hot encoded frame.
data_binary_filtered.head(5)

Unnamed: 0,login_attempts,session_duration,ip_reputation_score,failed_logins,attack_detected,browser_type_Chrome,browser_type_Edge,browser_type_Firefox,browser_type_Safari,browser_type_Unknown
0,4,492.983263,0.606818,1,1,0.0,1.0,0.0,0.0,0.0
1,3,1557.996461,0.301569,0,0,0.0,0.0,1.0,0.0,0.0
2,3,75.044262,0.739164,2,1,1.0,0.0,0.0,0.0,0.0
3,4,601.248835,0.123267,0,1,0.0,0.0,0.0,0.0,1.0
4,5,532.540888,0.054874,1,0,0.0,0.0,1.0,0.0,0.0


#### Unfiltered version `data_binary_unfiltered`

In [5]:
# Every object-dtype column of the raw frame gets encoded in this section.
# NOTE(review): this includes `session_id`; the recorded output shows one
# indicator column per session id (9555 columns in total) — confirm intended.
categorical_cols = data.select_dtypes(include="object").columns

In [6]:
# One-hot encode every categorical column of the raw (unfiltered) frame.
# Output is dense, no category is dropped, and categories not seen during fit
# are ignored instead of raising.
# The recorded output shows an `encryption_used_nan` column, i.e. missing
# values end up with their own indicator column.
onehot_encoder = OneHotEncoder(
    sparse_output=False,
    handle_unknown="ignore",
    drop=None
)

encoded_array = onehot_encoder.fit_transform(data[categorical_cols])

# The index must come from `data`, the frame that was actually encoded.
# Using `data_clean.index` only worked because `data_clean` is a column-only
# drop of `data`, and it made this cell depend on the filtered section.
encoded_df = pd.DataFrame(
    encoded_array,
    columns=onehot_encoder.get_feature_names_out(categorical_cols),
    index=data.index
)

# Replace the original categorical columns with their indicator columns.
data_binary_unfiltered = pd.concat(
    [data.drop(columns=categorical_cols), encoded_df],
    axis=1
)

print("Initial data shape:", data.shape)
print("Final encoded shape:", data_binary_unfiltered.shape)

Initial data shape: (9537, 11)
Final encoded shape: (9537, 9555)


In [7]:
# Preview the first five rows of the unfiltered, one-hot encoded frame.
data_binary_unfiltered.head(5)

Unnamed: 0,network_packet_size,login_attempts,session_duration,ip_reputation_score,failed_logins,unusual_time_access,attack_detected,session_id_SID_00001,session_id_SID_00002,session_id_SID_00003,...,protocol_type_TCP,protocol_type_UDP,encryption_used_AES,encryption_used_DES,encryption_used_nan,browser_type_Chrome,browser_type_Edge,browser_type_Firefox,browser_type_Safari,browser_type_Unknown
0,599,4,492.983263,0.606818,1,0,1,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,472,3,1557.996461,0.301569,0,0,0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,629,3,75.044262,0.739164,2,0,1,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,804,4,601.248835,0.123267,0,0,1,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,453,5,532.540888,0.054874,1,0,0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## Models

In this section we will apply different models to all the pre-processed dataframes that we obtained previously in the preprocessing step

### SVM

#### Using One-Hot Encoded & filtered dataframe `data_binary_filtered`

In [9]:
from sklearn.model_selection import train_test_split

# Features are every column except the target `attack_detected`.
X = data_binary_filtered.drop(columns="attack_detected")
y = data_binary_filtered["attack_detected"]

# Seeded 80/20 hold-out split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


Standardisation

In [13]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics on the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


Train with RBF Kernel. 
We use an SVM with an RBF kernel because intrusion detection is generally a non-linear classification problem. The RBF kernel can capture complex boundaries between malicious and normal traffic, making it more effective than a linear kernel. It also works well with high-dimensional data, which fits our dataset after one-hot encoding.

In [14]:
from sklearn.svm import SVC

# Baseline RBF-kernel SVM, trained on the standardised training split.
svm_clf = SVC(
    C=1.0,
    kernel="rbf",
    gamma="scale",
    random_state=42,
)
svm_clf.fit(X_train_scaled, y_train)


In [15]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the baseline SVM on the held-out test split.
y_pred = svm_clf.predict(X_test_scaled)
acc = accuracy_score(y_test, y_pred)

print(f"Test accuracy: {acc:.4f}")
# Header and body are joined with a newline, matching two separate prints.
print("\nClassification report:", classification_report(y_test, y_pred), sep="\n")
print("\nConfusion matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Test accuracy: 0.8721

Classification report:
              precision    recall  f1-score   support

           0       0.82      0.98      0.89      1055
           1       0.97      0.74      0.84       853

    accuracy                           0.87      1908
   macro avg       0.90      0.86      0.87      1908
weighted avg       0.89      0.87      0.87      1908


Confusion matrix:
[[1035   20]
 [ 224  629]]


Grid Search

In [16]:
# Hyper-parameter grid for the RBF SVM: 5 values of C x 5 values of gamma.
param_grid = dict(
    C=[0.1, 1, 5, 10, 20],
    gamma=["scale", "auto", 0.01, 0.001, 0.0001],
    kernel=["rbf"],
)

In [17]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold search over `param_grid`, ranked by accuracy, using all
# available cores.
# NOTE(review): the recorded run of this cell ended with a KeyboardInterrupt.
svm_model = SVC()
grid_search = GridSearchCV(
    svm_model,
    param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


KeyboardInterrupt: 

Results of the Grid Search

In [None]:
# Summarise the winning configuration and its cross-validated accuracy.
for label, value in (
    ("Best parameters found:", grid_search.best_params_),
    ("Best CV accuracy:", grid_search.best_score_),
):
    print(label, value)


Best parameters found: {'C': 20, 'gamma': 'auto', 'kernel': 'rbf'}
Best CV accuracy: 0.8834699954880433


In [None]:
# Evaluate the estimator selected by the grid search on the held-out test split.
best_svm = grid_search.best_estimator_
y_pred = best_svm.predict(X_test_scaled)

print("Test accuracy:", accuracy_score(y_test, y_pred))
# Header and body are joined with a newline, matching two separate prints.
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Test accuracy: 0.8752620545073375

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.99      0.90      1055
           1       0.98      0.74      0.84       853

    accuracy                           0.88      1908
   macro avg       0.90      0.86      0.87      1908
weighted avg       0.89      0.88      0.87      1908


Confusion Matrix:
[[1042   13]
 [ 225  628]]


#### Using One-Hot Encoded & unfiltered dataframe `data_binary_unfiltered`

In [18]:
# Same protocol as for the filtered frame: the target is `attack_detected`,
# all remaining columns are features.
X = data_binary_unfiltered.drop(columns="attack_detected")
y = data_binary_unfiltered["attack_detected"]

# Seeded 80/20 hold-out split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [19]:
# Fit the scaler on the training split only and reuse it for the test split.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Baseline RBF-kernel SVM with the same settings as in the filtered section.
svm_clf = SVC(
    C=1.0,
    kernel="rbf",
    gamma="scale",
    random_state=42,
)
svm_clf.fit(X_train_scaled, y_train)

In [None]:
# Score the baseline SVM (unfiltered features) on the held-out test split.
y_pred = svm_clf.predict(X_test_scaled)

print("Test Accuracy:", accuracy_score(y_test, y_pred))
# Header and body are joined with a newline, matching two separate prints.
print("\nClassification report:", classification_report(y_test, y_pred), sep="\n")
print("\nConfusion matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Grid Search

In [None]:
# Grid search on the unfiltered features; reuses `svm_model` and `param_grid`
# defined in the filtered-data section.
# NOTE(review): cv=3 here, whereas the filtered search uses cv=5 and the
# recorded output of this cell reports 5 folds — confirm which is intended.
grid_search = GridSearchCV(
    svm_model,
    param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


KeyboardInterrupt: 

In [None]:
# Summarise the winning configuration for the unfiltered feature set.
for label, value in (
    ("Best parameters (full data):", grid_search.best_params_),
    ("Best CV accuracy (full data):", grid_search.best_score_),
):
    print(label, value)


Grid Search Results

In [None]:
# Evaluate the estimator selected by the grid search on the held-out test split.
best_svm = grid_search.best_estimator_
y_pred = best_svm.predict(X_test_scaled)

print("Test accuracy:", accuracy_score(y_test, y_pred))
# Header and body are joined with a newline, matching two separate prints.
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

### RandomForestClassifier

In [10]:
# Rebuild the split from the filtered frame so this section does not depend on
# the X/y left over from the unfiltered SVM experiments.
X = data_binary_filtered.drop(columns="attack_detected")
y = data_binary_filtered["attack_detected"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,  # preserve the class balance in both splits
    random_state=42,
)