In [1]:
import pandas as pd
import numpy as np
import pickle
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter

# ‚úÖ Load Dataset
try:
    df = pd.read_csv("dataset/intrusion_data.csv")  # üîπ Ensure correct path
    print("‚úÖ Dataset Loaded Successfully!")
except FileNotFoundError:
    print("‚ùå Error: Dataset File Not Found! Check Path.")
    exit()

# ‚úÖ Ensure Required Columns Exist
required_columns = ["src_ip", "dst_ip", "src_port", "dst_port", "protocol", "packet_size", "connection_duration", "attack_type"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    print(f"‚ùå Error: Missing columns in dataset -> {missing_columns}")
    exit()

# ‚úÖ Data Type Check
print(df.dtypes)

# ‚úÖ Preview First 5 Rows
print(df.head())

# ‚úÖ Label Encoding for 'protocol'
protocol_encoder = LabelEncoder()
df["protocol"] = protocol_encoder.fit_transform(df["protocol"])
with open("protocol_encoder.pkl", "wb") as f:
    pickle.dump(protocol_encoder, f)
print("‚úÖ Protocol Encoder Saved Successfully!")

# ‚úÖ Label Encoding for 'attack_type'
attack_encoder = LabelEncoder()
df["attack_type"] = attack_encoder.fit_transform(df["attack_type"])
with open("attack_encoder.pkl", "wb") as f:
    pickle.dump(attack_encoder, f)
print("‚úÖ Attack Type Encoder Saved Successfully!")

# ‚úÖ Feature Selection
X = df.drop(columns=["attack_type"])
y = df["attack_type"]

# ‚úÖ Debugging: Shape Check
print("üîç X shape:", X.shape)
print("üîç y shape:", y.shape)

# ‚úÖ Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("‚úÖ X_train shape:", X_train.shape)
print("‚úÖ X_test shape:", X_test.shape)
print("‚úÖ y_train shape:", y_train.shape)
print("‚úÖ y_test shape:", y_test.shape)

# ‚úÖ Handle Data Imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
print("üîç Class Distribution After SMOTE:", Counter(y_resampled))

# ‚úÖ Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_resampled)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, "scaler.pkl")
print("‚úÖ Scaler Saved Successfully!")

# ‚úÖ Train Model (Hyperparameter Tuning for Accuracy)
model = RandomForestClassifier(
    n_estimators=200, 
    max_depth=20, 
    min_samples_split=5, 
    random_state=42
)
model.fit(X_train_scaled, y_resampled)

# ‚úÖ Save Model
joblib.dump(model, "best_model.pkl")
print("‚úÖ AI Model Trained & Saved Successfully! üöÄ")

# ‚úÖ Feature Importance Debugging
importances = model.feature_importances_
for feature, importance in zip(X.columns, importances):
    print(f"‚≠ê {feature}: {importance:.4f}")

# ‚úÖ Debug: Ensure Feature Count Matches
print(f"‚úÖ Model Trained with {X_train.shape[1]} Features")

# ‚úÖ Load Model & Perform Prediction
scaler = joblib.load("scaler.pkl")
model = joblib.load("best_model.pkl")

# ‚úÖ New Test Sample (Ensure Feature Order Matches Training Data)
sample = np.array([[150, 130, 35000, 34500, 0, 800, 7]])
sample_df = pd.DataFrame(sample, columns=X.columns)  # Match Feature Names
scaled_sample = scaler.transform(sample_df)
prediction = model.predict(scaled_sample)

# ‚úÖ Decode Prediction
decoded_prediction = attack_encoder.inverse_transform(prediction)
print("üîç New Prediction:", prediction)
print("üîç Decoded Prediction:", decoded_prediction)


‚úÖ Dataset Loaded Successfully!
src_ip                   int64
dst_ip                   int64
src_port                 int64
dst_port                 int64
protocol                object
packet_size              int64
connection_duration    float64
attack_type             object
dtype: object
   src_ip  dst_ip  src_port  dst_port protocol  packet_size  \
0     103     157     63576     59183     ICMP          931   
1     180     141     55563     27765      UDP          461   
2      93      46     12942     21628     ICMP         1225   
3      15      35     62010     39311     ICMP          738   
4     107     253     54169     56089      UDP          450   

   connection_duration attack_type  
0             8.145865      normal  
1             7.097695      normal  
2             9.675910         u2r  
3             8.283855         dos  
4             9.040628      normal  
‚úÖ Protocol Encoder Saved Successfully!
‚úÖ Attack Type Encoder Saved Successfully!
üîç X shape: (1000



‚úÖ AI Model Trained & Saved Successfully! üöÄ
‚≠ê src_ip: 0.1489
‚≠ê dst_ip: 0.1507
‚≠ê src_port: 0.1714
‚≠ê dst_port: 0.1689
‚≠ê protocol: 0.0462
‚≠ê packet_size: 0.1612
‚≠ê connection_duration: 0.1526
‚úÖ Model Trained with 7 Features
üîç New Prediction: [0]
üîç Decoded Prediction: ['dos']
