- Step 1: Import Libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
warnings.filterwarnings("ignore")


In [3]:
# Read the labeled UNSW-NB15 training CSV into `df`.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype guesses on a per-chunk basis.
training_csv = "CSV_Files/Training and Testing Sets/UNSW_NB15_training-set.csv"
df = pd.read_csv(training_csv, low_memory=False)

# Preview the first rows as the cell output
df.head()


Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.08749,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,Normal,0


In [7]:
# Build the feature matrix / target for the binary `label` classifier and
# produce a stratified 80/20 train/test split with standardized features.

# Drop columns that don't help in classification; errors='ignore' skips any
# listed column that is not present in the loaded frame.
df = df.drop(columns=['srcip', 'sport', 'dstip', 'dsport', 'attack_cat'], errors='ignore')

# Encode all categorical (object-dtype) columns as integer codes.
# NOTE(review): each fitted LabelEncoder is discarded here, so the
# string -> code mapping (e.g. for `service`) cannot be reapplied to new raw
# data later. Keep the encoders if the model is meant to score unseen traffic.
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

selected_features = ['dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'smean', 'dmean', 'service']
X = df[selected_features]

y = df['label']

# Split BEFORE scaling: the scaler's mean/variance must be estimated from the
# training rows only, otherwise statistics of the held-out rows leak into
# preprocessing. The split arguments are unchanged from the previous version.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize features: fit on the training split, then apply to both splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole matrix scaled with the train-fitted scaler; kept so that any code
# still referring to `X_scaled` continues to work.
X_scaled = scaler.transform(X)


In [8]:
# Fit a 100-tree random forest on the training split; the fixed seed keeps
# the fitted forest reproducible between runs. `fit` returns the estimator
# itself, so construction and training are chained.
clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict labels for the held-out split
y_pred = clf.predict(X_test)


In [9]:
# Score the held-out predictions: overall accuracy, the confusion matrix and
# the per-class precision/recall/F1 report.
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nConfusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)


Accuracy: 0.9451937608714249

Confusion Matrix:
 [[10118  1082]
 [  840 23029]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.90      0.91     11200
           1       0.96      0.96      0.96     23869

    accuracy                           0.95     35069
   macro avg       0.94      0.93      0.94     35069
weighted avg       0.94      0.95      0.95     35069



In [10]:
import os

import joblib

# Make sure the output directory exists so the dumps below do not fail on a
# fresh checkout; exist_ok=True makes this a no-op when it is already there.
os.makedirs("IsAffected", exist_ok=True)

# Save model and scaler.
# NOTE(review): scoring raw data later also needs the same label encoding of
# `service` that was applied before training; that mapping is not saved here.
joblib.dump(clf, "IsAffected/rf_model.pkl")
# Last expression of the cell: the list of written file names is displayed
joblib.dump(scaler, "IsAffected/scaler.pkl")


['IsAffected/scaler.pkl']