# KNN-Based Network Intrusion Detection System
## Detecting Normal vs. Anomalous Packets Using the UNSW-NB15 Dataset

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

## 2. Load and Prepare Data

In [None]:
# Read the separate UNSW-NB15 training and testing CSV files. Both paths are
# relative to the directory the notebook is executed from.
print("Loading UNSW NB15 Training Dataset...")
df_train = pd.read_csv('UNSW_Train_Test Datasets/UNSW_NB15_training-set.csv')

print("Loading UNSW NB15 Testing Dataset...")
df_test = pd.read_csv('UNSW_Train_Test Datasets/UNSW_NB15_testing-set.csv')

rule = "=" * 60

# Training frame overview: dimensions, a preview, column dtypes and the
# number of missing values per column.
print("\n" + rule)
print("TRAINING DATASET INFORMATION")
print(rule)
train_dtypes = df_train.dtypes
train_missing = df_train.isnull().sum()
print(f"Shape: {df_train.shape}")
print("\nFirst few rows:")
print(df_train.head())
print(f"\nData types:\n{train_dtypes}")
print(f"\nMissing values:\n{train_missing}")

# Testing frame overview: dimensions and column dtypes only.
print("\n" + rule)
print("TESTING DATASET INFORMATION")
print(rule)
test_dtypes = df_test.dtypes
print(f"Shape: {df_test.shape}")
print(f"Data types:\n{test_dtypes}")

## 3. Feature Selection and Preprocessing

In [None]:
# Candidate input columns. This list also fixes the canonical column order
# used for every transformed frame below.
features = ['dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload',
            'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb',
            'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len',
            'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
            'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports']

# Columns this notebook treats as non-numeric; they are passed through untouched.
non_numeric = ['is_sm_ips_ports', 'is_ftp_login']
# Order-preserving filters are used instead of set arithmetic so that the
# resulting lists (and any frame built from them) have the same column order
# on every run, independent of string hash randomization.
numeric_features = [name for name in features if name not in non_numeric]
# Numeric columns that are kept on their original scale (no log transform).
non_log = ['sttl', 'dttl', 'swin', 'dwin', 'trans_depth', 'ct_state_ttl', 'ct_flw_http_mthd']
# Numeric columns that receive the log10(x + 1) transform.
log_features = [name for name in numeric_features if name not in non_log]

print("Feature preprocessing started...")
print(f"Total features: {len(features)}")
print(f"Non-numeric features: {len(non_numeric)}")
print(f"Numeric features: {len(numeric_features)}")

# Training data: log10(x + 1) on `log_features`, then re-attach the untouched
# columns and restore the canonical `features` order.
df_logs_train = np.log10(df_train[log_features] + 1)
df_numeric_train = pd.concat([df_logs_train, df_train[non_log]], axis=1)
df_transformed_train = pd.concat([df_numeric_train, df_train[non_numeric]], axis=1)[features]

# Testing data: exactly the same transformation as the training data.
df_logs_test = np.log10(df_test[log_features] + 1)
df_numeric_test = pd.concat([df_logs_test, df_test[non_log]], axis=1)
df_transformed_test = pd.concat([df_numeric_test, df_test[non_numeric]], axis=1)[features]

# Mutual information between each transformed feature and the label,
# computed on the training data only.
print("\nCalculating mutual information scores...")
mi_arr = mutual_info_classif(X=df_transformed_train, y=df_train['label'], random_state=42)
# Build the score table column by column so the MI values stay floats instead
# of being packed into one mixed-type array together with the feature names.
df_mi = pd.DataFrame({'feature': list(df_transformed_train.columns), 'mi': mi_arr})
df_mi = df_mi.sort_values('mi', ascending=False)

print("\nTop 15 Features by Mutual Information:")
print(df_mi.head(15))

# Keep only the features whose MI score is strictly above the cutoff.
mi_cutoff = 0.2
selected_features = df_mi.loc[df_mi['mi'] > mi_cutoff, 'feature'].tolist()
if not selected_features:
    # Fail here with a clear message rather than later inside the scaler/model.
    raise ValueError(
        f"No feature has a mutual information score above {mi_cutoff}; "
        "lower mi_cutoff or check the input data."
    )
print(f"\nSelected {len(selected_features)} features with MI > {mi_cutoff}")
print(f"Selected features: {selected_features}")

# Final frames: the selected (transformed) features plus the original label column.
df_train_processed = pd.concat([df_transformed_train[selected_features], df_train['label']], axis=1)
df_test_processed = pd.concat([df_transformed_test[selected_features], df_test['label']], axis=1)

print("\nPreprocessing complete!")
print(f"Training data shape: {df_train_processed.shape}")
print(f"Testing data shape: {df_test_processed.shape}")

## 4. Prepare Training and Testing Data

In [None]:
# Pull the model inputs (selected features) and the target column out of the
# processed training and testing frames.
X_train, y_train = df_train_processed[selected_features], df_train_processed['label']
X_test, y_test = df_test_processed[selected_features], df_test_processed['label']

divider = "=" * 60
print(divider)
print("DATA SPLIT SUMMARY")
print(divider)
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")
print(f"Number of features: {len(selected_features)}")

# Report how many rows carry label 0 (normal) and label 1 (anomalous) in each split.
print("\n" + divider)
print("LABEL DISTRIBUTION")
print(divider)
for split_header, split_labels in (("\nTraining set:", y_train), ("\nTesting set:", y_test)):
    split_total = len(split_labels)
    normal_count = (split_labels == 0).sum()
    anomalous_count = (split_labels == 1).sum()
    print(split_header)
    print(split_labels.value_counts())
    print(f"Normal packets: {normal_count} ({normal_count/split_total*100:.2f}%)")
    print(f"Anomalous packets: {anomalous_count} ({anomalous_count/split_total*100:.2f}%)")

## 5. Standardize Features

In [None]:
# KNN classifies by distance, so the features are rescaled with StandardScaler.
# The scaler is fitted on the training split only and then applied, unchanged,
# to the testing split.
print("Standardizing features...")
scaler = StandardScaler()

# Wrap the scaled arrays back into DataFrames so the column names are kept.
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=selected_features)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=selected_features)

print("Feature standardization complete!")

# Average of the per-column means / standard deviations of the scaled training data.
column_means = X_train_scaled.mean()
column_stds = X_train_scaled.std()
print("\nTraining data statistics after scaling:")
print(f"Mean: {column_means.mean():.6f}")
print(f"Std: {column_stds.mean():.6f}")
print("\nSample of scaled data:")
print(X_train_scaled.head())

## 6. Train KNN Model

In [None]:
# Fit a K-Nearest Neighbors classifier on the standardized training data.
# n_neighbors=7 is stated to come from the separate kSelection analysis.
banner = "=" * 60
print(banner)
print("TRAINING KNN CLASSIFIER")
print(banner)

n_train_rows, n_dims = X_train_scaled.shape
print("\nTraining KNN with n_neighbors=7...")
print(f"Training samples: {n_train_rows}")
print(f"Feature dimensions: {n_dims}")

# fit() returns the estimator itself, so construction and fitting can be chained.
knn_model = KNN(n_neighbors=7, metric='euclidean', n_jobs=-1).fit(X_train_scaled, y_train)

print("\nKNN Model trained successfully!")
print(f"Model: {knn_model}")
print(f"Classes: {knn_model.classes_}")

## 7. Generate Predictions

In [None]:
# Predict a label (0 = normal, 1 = anomalous) for every row of the scaled test set.
print("="*60)
print("GENERATING PREDICTIONS")
print("="*60)
print("\nGenerating predictions on test set...")

y_pred = knn_model.predict(X_test_scaled)

print(f"Predictions generated for {len(y_pred)} test samples")
print(f"\nPrediction distribution:")
print(f"Normal packets: {(y_pred == 0).sum()} ({(y_pred == 0).sum()/len(y_pred)*100:.2f}%)")
print(f"Anomalous packets: {(y_pred == 1).sum()} ({(y_pred == 1).sum()/len(y_pred)*100:.2f}%)")

# Per-class probabilities, one row per test sample; reused by later cells.
y_pred_proba = knn_model.predict_proba(X_test_scaled)
print(f"\nPrediction probabilities shape: {y_pred_proba.shape}")
print(f"Sample predictions with confidence:")
# Preview at most five rows; the bound avoids an IndexError when the test set
# holds fewer than five samples.
preview_count = min(5, len(y_pred))
for i in range(preview_count):
    label = "Normal" if y_pred[i] == 0 else "Anomalous"
    # Confidence is the highest class probability for this sample, in percent.
    confidence = y_pred_proba[i].max() * 100
    print(f"  Sample {i+1}: {label} (confidence: {confidence:.2f}%)")

## 8. Evaluate Model Performance

In [None]:
# Score the test-set predictions: headline metrics, the confusion matrix and
# the rates derived from it.
print("="*60)
print("MODEL PERFORMANCE EVALUATION")
print("="*60)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"\nAccuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Precision: {precision:.4f} ({precision*100:.2f}%)")
print(f"Recall:    {recall:.4f} ({recall*100:.2f}%)")
print(f"F1-Score:  {f1:.4f}")

# Confusion matrix. Passing labels=[0, 1] forces a 2x2 matrix even if one of
# the classes is absent from y_test and y_pred, so the four-way unpacking
# below cannot fail and the row/column meaning is fixed:
# rows = true label, columns = predicted label, class 0 first.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print("\n" + "="*60)
print("CONFUSION MATRIX")
print("="*60)
print("\nRaw counts:")
print(cm)

# Flattened row by row, the 2x2 matrix reads TN, FP, FN, TP.
tn, fp, fn, tp = cm.ravel()
print(f"\nTrue Negatives (TN - Normal correctly identified):  {tn}")
print(f"False Positives (FP - Normal misclassified as anomalous): {fp}")
print(f"False Negatives (FN - Anomalous misclassified as normal): {fn}")
print(f"True Positives (TP - Anomalous correctly identified):  {tp}")

# Rates derived from the matrix; each falls back to 0 when its denominator
# is zero (no samples of that true class).
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0

print(f"\nSensitivity (True Positive Rate): {sensitivity:.4f} ({sensitivity*100:.2f}%)")
print(f"Specificity (True Negative Rate): {specificity:.4f} ({specificity*100:.2f}%)")

# One-row-per-metric table for a compact printout.
metrics_summary = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'Sensitivity', 'Specificity'],
    'Score': [accuracy, precision, recall, f1, sensitivity, specificity]
})

print("\n" + "="*60)
print("METRICS SUMMARY")
print("="*60)
print(metrics_summary.to_string(index=False))

## 9. Visualize Results

In [None]:
# Visualization 1: raw and row-normalized confusion matrices, side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
class_names = ['Normal', 'Anomalous']

# Divide every row by its own total so each cell shows a share of that true class.
row_totals = cm.sum(axis=1)[:, np.newaxis]
cm_normalized = cm.astype('float') / row_totals

# (axes, matrix, annotation format, colormap, title) for each panel.
panels = [
    (axes[0], cm, 'd', 'Blues', 'KNN Confusion Matrix\n(Test Set)'),
    (axes[1], cm_normalized, '.2%', 'Greens', 'KNN Confusion Matrix (Normalized)\n(Test Set)'),
]
for panel_ax, matrix, cell_fmt, palette, panel_title in panels:
    sns.heatmap(matrix, annot=True, fmt=cell_fmt, cmap=palette, cbar=False, ax=panel_ax,
                xticklabels=class_names,
                yticklabels=class_names)
    panel_ax.set_title(panel_title, fontsize=12, fontweight='bold')
    panel_ax.set_ylabel('True Label', fontweight='bold')
    panel_ax.set_xlabel('Predicted Label', fontweight='bold')

plt.tight_layout()
plt.savefig('knn_confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()
print("Confusion matrix visualization saved as 'knn_confusion_matrix.png'")

In [None]:
# Visualization 2: one bar per evaluation metric, annotated with its value.
fig, ax = plt.subplots(figsize=(10, 6))

metrics_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'Sensitivity', 'Specificity']
metrics_values = [accuracy, precision, recall, f1, sensitivity, specificity]
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']

bars = ax.bar(metrics_names, metrics_values, color=colors, alpha=0.8, edgecolor='black', linewidth=1.5)

# Write each score (raw and as a percentage) centered just above its bar.
for rect, score in zip(bars, metrics_values):
    x_center = rect.get_x() + rect.get_width()/2.
    caption = f'{score:.4f}\n({score*100:.2f}%)'
    ax.text(x_center, rect.get_height(), caption,
            ha='center', va='bottom', fontsize=10, fontweight='bold')

# Leave headroom above 1.0 so the annotations are not clipped.
ax.set_ylim([0, 1.1])
ax.set_ylabel('Score', fontsize=12, fontweight='bold')
ax.set_title('KNN Model Performance Metrics\n(UNSW Dataset)', fontsize=13, fontweight='bold')
ax.grid(axis='y', alpha=0.3, linestyle='--')
ax.set_axisbelow(True)

plt.tight_layout()
plt.savefig('knn_metrics.png', dpi=300, bbox_inches='tight')
plt.show()
print("Metrics visualization saved as 'knn_metrics.png'")

In [None]:
# Visualization 3: actual vs. predicted label counts on the test set.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

class_ids = [0, 1]
class_names = ['Normal', 'Anomalous']

# value_counts() omits classes that never occur; reindexing to both class ids
# with a zero fill keeps the lookups below from raising KeyError when, for
# example, the model never predicts one of the classes.
actual_counts = pd.Series(y_test).value_counts().reindex(class_ids, fill_value=0)
pred_counts = pd.Series(y_pred).value_counts().reindex(class_ids, fill_value=0)

# (axes, counts, denominator for the percentage, bar colors, title) per panel.
panels = [
    (axes[0], actual_counts, len(y_test), ['#2ca02c', '#d62728'],
     'Actual Label Distribution\n(Test Set)'),
    (axes[1], pred_counts, len(y_pred), ['#1f77b4', '#ff7f0e'],
     'Predicted Label Distribution\n(Test Set)'),
]
for panel_ax, counts, total, bar_colors, panel_title in panels:
    heights = [counts.loc[class_id] for class_id in class_ids]
    panel_ax.bar(class_names, heights,
                 color=bar_colors, alpha=0.8, edgecolor='black', linewidth=1.5)
    panel_ax.set_title(panel_title, fontsize=12, fontweight='bold')
    panel_ax.set_ylabel('Count', fontweight='bold')
    # Annotate each bar with its count and its share of the split.
    for position, count in enumerate(heights):
        pct = count / total * 100
        panel_ax.text(position, count, f'{count}\n({pct:.1f}%)',
                      ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig('knn_label_distribution.png', dpi=300, bbox_inches='tight')
plt.show()
print("Label distribution visualization saved as 'knn_label_distribution.png'")

In [None]:
# `json` is needed by this cell; importing it here keeps the cell runnable in
# top-to-bottom order instead of relying on an import in a later cell.
import json

# joblib is not used in this cell, but the artifact-saving cell calls
# joblib.dump and this is the only place joblib is imported.
import joblib

# Collect the model configuration, dataset sizes, metrics and confusion-matrix
# counts in one JSON-serializable dictionary.
results_summary = {
    'model_info': {
        'algorithm': 'K-Nearest Neighbors (KNN)',
        'n_neighbors': 7,
        'metric': 'euclidean',
        'selected_features': selected_features,
        'num_features': len(selected_features)
    },
    'dataset_info': {
        'dataset': 'UNSW NB15',
        'training_samples': len(X_train),
        'testing_samples': len(X_test),
        'total_samples': len(X_train) + len(X_test)
    },
    # Metrics are converted to built-in floats, matching how the
    # artifact-saving cell serializes the same values.
    'performance_metrics': {
        'accuracy': float(accuracy),
        'precision': float(precision),
        'recall': float(recall),
        'f1_score': float(f1),
        'sensitivity': float(sensitivity),
        'specificity': float(specificity)
    },
    # Counts are converted to built-in ints so json can serialize them.
    'confusion_matrix': {
        'true_negatives': int(tn),
        'false_positives': int(fp),
        'false_negatives': int(fn),
        'true_positives': int(tp)
    }
}

print("\n" + "="*60)
print("MODEL TRAINING COMPLETE - SUMMARY")
print("="*60)
print(json.dumps(results_summary, indent=2))

# Write the same summary to disk next to the notebook.
with open('knn_model_summary.json', 'w') as f:
    json.dump(results_summary, f, indent=2)

print("\nSummary saved to 'knn_model_summary.json'")

## 11. Save Model for Project Integration

In [None]:
# Persist the fitted model, the scaler, the selected feature list and the
# evaluation metrics so that another program can load them.
import os
import json

rule = "=" * 60
print(rule)
print("SAVING MODEL ARTIFACTS")
print(rule)

# Output directory, relative to the working directory; created if missing.
model_dir = os.path.join('..', '..', 'model', 'unsw_tabular')
os.makedirs(model_dir, exist_ok=True)

# Resolve every output location up front.
model_path = os.path.join(model_dir, 'model_knn.pkl')
scaler_path = os.path.join(model_dir, 'scaler_knn.pkl')
features_path = os.path.join(model_dir, 'features_knn.json')
metrics_path = os.path.join(model_dir, 'metrics_knn.json')

# Fitted classifier.
joblib.dump(knn_model, model_path)
print(f"✓ Model saved to: {model_path}")

# Fitted StandardScaler (needed to scale new inputs the same way).
joblib.dump(scaler, scaler_path)
print(f"✓ Scaler saved to: {scaler_path}")

# Names of the features the model was trained on.
features_payload = {'selected_features': selected_features}
with open(features_path, 'w') as features_file:
    json.dump(features_payload, features_file, indent=2)
print(f"✓ Features saved to: {features_path}")

# Evaluation metrics as plain floats/ints.
metrics_data = {
    'accuracy': float(accuracy),
    'precision': float(precision),
    'recall': float(recall),
    'f1_score': float(f1),
    'sensitivity': float(sensitivity),
    'specificity': float(specificity),
    'n_neighbors': 7,
    'n_features': len(selected_features),
    'test_samples': len(y_test)
}
with open(metrics_path, 'w') as metrics_file:
    json.dump(metrics_data, metrics_file, indent=2)
print(f"✓ Metrics saved to: {metrics_path}")

print("\n" + rule)
print("MODEL ARTIFACTS READY FOR BACKEND INTEGRATION")
print(rule)
print(f"\nAll artifacts saved in: {os.path.abspath(model_dir)}")
print("\nFiles created:")
for created_line in (
    "  - model_knn.pkl (KNN classifier)",
    "  - scaler_knn.pkl (StandardScaler)",
    "  - features_knn.json (Selected features)",
    "  - metrics_knn.json (Model metrics)",
):
    print(created_line)
print("\nThe backend API can now load these artifacts for real-time predictions.")

## 12. Project Integration Instructions

The trained KNN model and its artifacts (scaler, selected features, and metrics) have been saved for use by your Django backend. Here's how to use them:

### Quick Start

#### 1. API Endpoint Usage
```
POST /api/traffic/detect-anomaly/

Request Body:
{
  "features": {
    "dur": 0.5,
    "spkts": 10,
    "dpkts": 8,
    "sbytes": 1024,
    ...
  }
}

Response:
{
  "status": "success",
  "prediction": {
    "prediction": 0,
    "label": "Normal",
    "confidence": 0.95,
    "probabilities": {...}
  },
  "severity": "Low"
}
```

#### 2. Python Integration
```python
from api.utils.knn_classifier import KNNAnomalyDetector

detector = KNNAnomalyDetector(
    model_path='path/to/model_knn.pkl',
    features_path='path/to/features_knn.json',
    scaler_path='path/to/scaler_knn.pkl'
)

result = detector.predict(features_dict)
```

#### 3. Django Command
```bash
python manage.py train_knn_model
```

### Next Steps

1. ✓ Model trained and saved
2. → Test the API endpoint with sample data
3. → Integrate with frontend
4. → Set up real-time monitoring
5. → Create alert system

See `KNN_INTEGRATION_GUIDE.md` for detailed documentation.

## 10. Sample Predictions and Summary

In [None]:
# Show a random sample of test-set predictions, then print an overall summary.

# Never request more rows than the test set holds (sampling without
# replacement would raise ValueError otherwise).
sample_size = min(20, len(y_test))
# A seeded generator makes the sample reproducible across runs, consistent
# with the fixed random_state used for feature selection.
rng = np.random.default_rng(42)
sample_indices = rng.choice(len(y_test), size=sample_size, replace=False)

# Slice each source once and reuse the slices for every column.
actual_sample = y_test.iloc[sample_indices].values
predicted_sample = y_pred[sample_indices]
proba_sample = y_pred_proba[sample_indices]

sample_data = pd.DataFrame({
    'Actual': ['Normal' if y == 0 else 'Anomalous' for y in actual_sample],
    'Predicted': ['Normal' if y == 0 else 'Anomalous' for y in predicted_sample],
    'Normal_Prob': proba_sample[:, 0],
    'Anomalous_Prob': proba_sample[:, 1],
    # Confidence is the highest class probability of each sampled row.
    'Confidence': np.max(proba_sample, axis=1),
    'Correct': actual_sample == predicted_sample
})

print("="*80)
# The banner reports the number of rows actually sampled.
print(f"SAMPLE PREDICTIONS ({sample_size} random samples from test set)")
print("="*80)
print(sample_data.to_string(index=False))

# Share of sampled rows whose prediction matches the true label.
correct_preds = (sample_data['Correct'].sum() / len(sample_data)) * 100
print(f"\nCorrect predictions in sample: {correct_preds:.1f}%")

print("\n" + "="*80)
print("SUMMARY")
print("="*80)
print(f"""
KNN Anomaly Detection Model Performance Summary:

Dataset:
  - UNSW NB15 Training Set Size: {len(X_train):,} packets
  - UNSW NB15 Testing Set Size: {len(X_test):,} packets
  - Selected Features: {len(selected_features)} (from {len(features)} total)

Model Configuration:
  - Algorithm: K-Nearest Neighbors (KNN)
  - K Value: 7
  - Distance Metric: Euclidean
  - Feature Scaling: StandardScaler

Performance Metrics:
  - Accuracy:    {accuracy*100:6.2f}% (correctly classified packets)
  - Precision:   {precision*100:6.2f}% (anomalies correctly identified)
  - Recall:      {recall*100:6.2f}% (actual anomalies detected)
  - F1-Score:    {f1:.4f}
  - Sensitivity: {sensitivity*100:6.2f}% (true positive rate)
  - Specificity: {specificity*100:6.2f}% (true negative rate)

Classification Results:
  - Total Test Packets: {len(y_test):,}
  - Normal Packets (True Negatives): {tn:,} correctly classified, {fp:,} misclassified
  - Anomalous Packets (True Positives): {tp:,} correctly detected, {fn:,} missed

Key Takeaway:
  The KNN model successfully detects {sensitivity*100:.1f}% of intrusions while maintaining
  a {specificity*100:.1f}% normal traffic detection rate, making it effective for
  real-time anomaly detection in network traffic.
""")

# List the selected features together with their mutual information scores,
# highest first.
print("="*80)
print("SELECTED FEATURES (by Mutual Information score)")
print("="*80)
selected_mi = df_mi[df_mi['feature'].isin(selected_features)].sort_values('mi', ascending=False)
print(selected_mi[['feature', 'mi']].to_string(index=False))