# Intrusion Detection System using Random Forest

This notebook builds an intrusion detection system on the **NSL-KDD dataset**, a refined version of the KDD'99 dataset. The goal is a network intrusion detector: a predictive model that distinguishes 'bad' connections (intrusions or attacks) from 'good' (normal) connections.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')

## 1. Load the Data

In [None]:
# Read both CSV files from the current working directory.
train_df = pd.read_csv('Train_data.csv')
test_df = pd.read_csv('Test_data.csv')

# Report the raw shapes before any column adjustments are made.
for split_name, frame in (("Training", train_df), ("Testing", test_df)):
    print(f"{split_name} data shape:", frame.shape)

# Give the test frame the training column names (matched by position).
# - Same column count: rename, then drop the column that lands on 'class'.
# - Otherwise: the test frame takes every training column name except the last.
train_cols = train_df.columns
same_width = len(test_df.columns) == len(train_cols)
test_df.columns = train_cols if same_width else train_cols[:-1]
if same_width:
    test_df = test_df.drop(columns='class')

train_df.head()

## 2. Exploratory Data Analysis (EDA)

In [None]:
# The style must be selected before the figure is created so it applies to it.
plt.style.use('seaborn-v0_8-whitegrid')

# Bar chart of how many training rows fall into each value of 'class'.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='class', data=train_df, palette='viridis', ax=ax)
ax.set_title('Distribution of Connection Types (Class)', fontsize=16)
ax.set_xlabel('Class', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.show()

## 3. Preprocess the Data

In [None]:
# Separate the features from the target column.
X = train_df.drop(columns='class')
y = train_df['class']

# Column groups are taken from the raw features, i.e. before one-hot encoding,
# so `numerical_features` never contains the dummy columns created below.
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['number']).columns

# One-hot encode train and test together so both frames receive the same set
# of dummy columns, even when a category value occurs in only one of them.
X_all = pd.concat([X, test_df], axis=0)
X_all = pd.get_dummies(X_all, columns=categorical_features, drop_first=True)

# Split the combined frame back by position. The explicit copies matter: the
# frames are written to further down, and assigning into a bare slice of
# `X_all` is chained assignment (a SettingWithCopyWarning that the notebook's
# global warnings filter would otherwise hide).
n_train = len(X)
X_processed = X_all.iloc[:n_train].copy()
test_processed = X_all.iloc[n_train:].copy()

# Both frames are row slices of the same encoded frame, so their columns are
# identical by construction; assert it rather than patching columns in a loop.
train_cols = X_processed.columns
test_cols = test_processed.columns
assert test_cols.equals(train_cols), "train/test feature columns diverged"

X_train, X_val, y_train, y_val = train_test_split(
    X_processed, y, test_size=0.2, random_state=42, stratify=y
)
# Own the split frames before overwriting their numeric columns.
X_train = X_train.copy()
X_val = X_val.copy()

# Fit the scaler on the training split only, then apply the same fitted
# transform to the validation split and to the test frame.
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_val[numerical_features] = scaler.transform(X_val[numerical_features])
test_processed[numerical_features] = scaler.transform(test_processed[numerical_features])

# Encode the string class labels as integers; the encoder is fitted on the
# training labels and reused for the validation labels.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_val_encoded = le.transform(y_val)

## 4. Train the Model

In [None]:
# Hyperparameters for the forest, gathered in one place; the fixed
# random_state keeps the seed constant between runs.
forest_params = {'n_estimators': 100, 'random_state': 42, 'n_jobs': -1}

# Train on the scaled training split and the integer-encoded labels.
model = RandomForestClassifier(**forest_params).fit(X_train, y_train_encoded)
model

## 5. Evaluate the Model (on Validation Set)

In [None]:
# Predict on the validation split and print the headline metrics.
y_pred_val = model.predict(X_val)
accuracy = accuracy_score(y_val_encoded, y_pred_val)
print(f"Validation Accuracy: {accuracy:.4f}")
print("\nClassification Report on Validation Set:")
report = classification_report(y_val_encoded, y_pred_val, target_names=le.classes_)
print(report)

# Confusion matrix heatmap, with the original class names on both axes.
cm = confusion_matrix(y_val_encoded, y_pred_val)
class_names = le.classes_
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix on Validation Set', fontsize=16)
ax.set_xlabel('Predicted', fontsize=12)
ax.set_ylabel('Actual', fontsize=12)
plt.show()

## 6. View Predictions on Validation Set

In [None]:
# Number of leading validation rows to preview.
n_samples = 10
val_head = X_val.head(n_samples)

# Predict first, while the frame still holds feature columns only.
predictions_encoded = model.predict(val_head)
predictions = le.inverse_transform(predictions_encoded)

# Matching ground-truth labels, decoded back to their string names.
true_labels_encoded = y_val_encoded[:n_samples]
true_labels = le.inverse_transform(true_labels_encoded)

# Feature rows with the actual and predicted labels appended as two columns.
sample_df = val_head.assign(
    **{'Actual Class': true_labels, 'Predicted Class': predictions}
)

print(f"Showing predictions for the first {n_samples} samples of the validation set:")
sample_df[['Actual Class', 'Predicted Class']]

## 7. Save the Model and Preprocessors

In [None]:
# Output file name -> object to persist. The training column index is saved
# alongside the model so the feature order can be restored at load time.
artifacts = {
    'intrusion_detection_model.pkl': model,
    'scaler.pkl': scaler,
    'label_encoder.pkl': le,
    'model_columns.pkl': X_train.columns,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)
print("Model, scaler, label encoder, and columns saved.")

## 8. Make Predictions on Test Data

In [None]:
# Restore the four persisted artifacts from disk.
loaded_model, loaded_scaler, loaded_le, loaded_columns = (
    joblib.load(path)
    for path in (
        'intrusion_detection_model.pkl',
        'scaler.pkl',
        'label_encoder.pkl',
        'model_columns.pkl',
    )
)

# First row of the processed test frame, kept as a one-row DataFrame.
# `test_processed` was already scaled during preprocessing, so the loaded
# scaler is not applied again here.
sample = test_processed.iloc[[0]]

# Select the features in the saved column order, predict, and decode the label.
sample_features = sample.loc[:, loaded_columns]
prediction_encoded = loaded_model.predict(sample_features)
prediction = loaded_le.inverse_transform(prediction_encoded)

print(f"Prediction for the first sample in the test set: {prediction[0]}")