In [1]:
import pandas as pd
import numpy as np
import pickle
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter

# ✅ Load Dataset
# Purpose of this cell: load the intrusion dataset, label-encode the categorical
# columns, balance the training split with SMOTE, scale features, train a
# RandomForestClassifier, persist every fitted artifact, and run one sanity
# prediction through the reloaded scaler and model.
try:
    df = pd.read_csv("dataset/intrusion_data.csv")  # 🔹 Ensure correct path
except FileNotFoundError as err:
    # Raise instead of calling exit(): inside a notebook kernel exit() asks the
    # kernel to shut down (discarding all state) rather than reporting an error
    # that stops the cell.
    raise FileNotFoundError("❌ Error: Dataset File Not Found! Check Path.") from err
print("✅ Dataset Loaded Successfully!")

# ✅ Ensure Required Columns Exist
required_columns = ["src_ip", "dst_ip", "src_port", "dst_port", "protocol", "packet_size", "connection_duration", "attack_type"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"❌ Error: Missing columns in dataset -> {missing_columns}")

# Model inputs, in the exact order used for training and for the sample below.
# Selecting them explicitly keeps any extra CSV column from silently becoming a feature.
feature_columns = [col for col in required_columns if col != "attack_type"]

# ✅ Data Type Check
print(df.dtypes)

# ✅ Preview First 5 Rows
print(df.head())

# ✅ Label Encoding for 'protocol'
# The fitted encoder is persisted so the same mapping can be reused at inference time.
protocol_encoder = LabelEncoder()
df["protocol"] = protocol_encoder.fit_transform(df["protocol"])
with open("protocol_encoder.pkl", "wb") as f:
    pickle.dump(protocol_encoder, f)
print("✅ Protocol Encoder Saved Successfully!")

# ✅ Label Encoding for 'attack_type'
# Persisted so integer predictions can be decoded back to attack names later.
attack_encoder = LabelEncoder()
df["attack_type"] = attack_encoder.fit_transform(df["attack_type"])
with open("attack_encoder.pkl", "wb") as f:
    pickle.dump(attack_encoder, f)
print("✅ Attack Type Encoder Saved Successfully!")

# ✅ Feature Selection
X = df[feature_columns]
y = df["attack_type"]

# ✅ Debugging: Shape Check
print("🔍 X shape:", X.shape)
print("🔍 y shape:", y.shape)

# ✅ Train-Test Split
# NOTE(review): the split is not stratified; with imbalanced classes consider
# passing stratify=y so the test set keeps the class proportions — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("✅ X_train shape:", X_train.shape)
print("✅ X_test shape:", X_test.shape)
print("✅ y_train shape:", y_train.shape)
print("✅ y_test shape:", y_test.shape)

# ✅ Handle Data Imbalance using SMOTE
# Only the training split is resampled; the test split keeps its original rows.
# NOTE(review): SMOTE runs on unscaled features and treats the encoded
# 'protocol' as numeric — consider scaling first or SMOTENC; confirm.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
print("🔍 Class Distribution After SMOTE:", Counter(y_resampled))

# ✅ Feature Scaling
# The scaler is fitted on the resampled training data and only applied to the test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_resampled)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, "scaler.pkl")
print("✅ Scaler Saved Successfully!")

# ✅ Train Model (Hyperparameter Tuning for Accuracy)
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    random_state=42
)
model.fit(X_train_scaled, y_resampled)

# ✅ Evaluate on the Held-Out Test Set
# The test split was previously scaled but never used; report its mean accuracy
# so the saved model comes with a measured quality figure.
test_accuracy = model.score(X_test_scaled, y_test)
print(f"📊 Test Accuracy: {test_accuracy:.4f}")

# ✅ Save Model
joblib.dump(model, "best_model.pkl")
print("✅ AI Model Trained & Saved Successfully! 🚀")

# ✅ Feature Importance Debugging
importances = model.feature_importances_
for feature, importance in zip(feature_columns, importances):
    print(f"⭐ {feature}: {importance:.4f}")

# ✅ Debug: Ensure Feature Count Matches
print(f"✅ Model Trained with {X_train.shape[1]} Features")

# ✅ Load Model & Perform Prediction
# Reload from disk so the prediction below exercises the persisted artifacts.
scaler = joblib.load("scaler.pkl")
model = joblib.load("best_model.pkl")

# ✅ New Test Sample (Ensure Feature Order Matches Training Data)
# Values follow feature_columns: src_ip, dst_ip, src_port, dst_port,
# protocol (already label-encoded), packet_size, connection_duration.
sample = np.array([[150, 130, 35000, 34500, 0, 800, 7]])
sample_df = pd.DataFrame(sample, columns=feature_columns)  # Match Feature Names
scaled_sample = scaler.transform(sample_df)
prediction = model.predict(scaled_sample)

# ✅ Decode Prediction
decoded_prediction = attack_encoder.inverse_transform(prediction)
print("🔍 New Prediction:", prediction)
print("🔍 Decoded Prediction:", decoded_prediction)


✅ Dataset Loaded Successfully!
src_ip                   int64
dst_ip                   int64
src_port                 int64
dst_port                 int64
protocol                object
packet_size              int64
connection_duration    float64
attack_type             object
dtype: object
   src_ip  dst_ip  src_port  dst_port protocol  packet_size  \
0     103     157     63576     59183     ICMP          931   
1     180     141     55563     27765      UDP          461   
2      93      46     12942     21628     ICMP         1225   
3      15      35     62010     39311     ICMP          738   
4     107     253     54169     56089      UDP          450   

   connection_duration attack_type  
0             8.145865      normal  
1             7.097695      normal  
2             9.675910         u2r  
3             8.283855         dos  
4             9.040628      normal  
✅ Protocol Encoder Saved Successfully!
✅ Attack Type Encoder Saved Successfully!
🔍 X shape: (1000, 7)
🔍 y 



✅ AI Model Trained & Saved Successfully! 🚀
⭐ src_ip: 0.1489
⭐ dst_ip: 0.1507
⭐ src_port: 0.1714
⭐ dst_port: 0.1689
⭐ protocol: 0.0462
⭐ packet_size: 0.1612
⭐ connection_duration: 0.1526
✅ Model Trained with 7 Features
🔍 New Prediction: [0]
🔍 Decoded Prediction: ['dos']
