In [2]:
# 🔹 Cell 1: Load Clean Dataset
import os

import numpy as np
import pandas as pd

# Location of the cleaned dataset. The default is the original author's local
# path; set the CLEANED_DATASET_CSV environment variable to run the notebook on
# another machine without editing this cell. An unset or empty variable falls
# back to the default.
DATA_PATH = (
    os.environ.get("CLEANED_DATASET_CSV")
    or r"C:\Users\Otala\Desktop\cleaned_numeric_combined.csv"
)

# Load the cleaned dataset
df = pd.read_csv(DATA_PATH)

# Clean column names (strip leading/trailing spaces) so that later cells can
# look columns up by their exact names, e.g. 'Flow Duration'
df.columns = df.columns.str.strip()

# Quick look
df.head()


Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,1,20,0.0,0.0,0,0,0.0,0.0,0,0
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,0,20,0.0,0.0,0,0,0.0,0.0,0,0
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,0,20,0.0,0.0,0,0,0.0,0.0,0,0
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,0,20,0.0,0.0,0,0,0.0,0.0,0,0
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,1,20,0.0,0.0,0,0,0.0,0.0,0,0


In [3]:
# 🔹 Cell 2: Clean the data
# Turn +/- infinity into NaN first, so that a single dropna() pass removes
# both infinite and missing entries. Both steps modify `df` in place.
non_finite = [np.inf, -np.inf]
df.replace(to_replace=non_finite, value=np.nan, inplace=True)

# Discard every row that still contains at least one NaN
df.dropna(axis=0, how='any', inplace=True)

# Report the resulting size and confirm that nothing missing is left
print(f"✅ Cleaned DataFrame shape: {df.shape}")
print(f"Missing values: {df.isnull().sum().sum()}")


✅ Cleaned DataFrame shape: (2827876, 78)
Missing values: 0


In [4]:
# 🔹 Cell 3: Feature Engineering (rolling mean and ratio)
# Each derived column is only added when its source columns are present.
available = set(df.columns)

# Mean of 'Flow Duration' over a 10-row window in the frame's current row
# order; min_periods=1 lets the first rows average over fewer values instead
# of producing NaN.
if 'Flow Duration' in available:
    duration = df['Flow Duration']
    df['flow_duration_rolling_mean'] = duration.rolling(10, min_periods=1).mean()

# Forward-to-backward packet ratio. The 1e-6 added to the denominator avoids a
# division by zero when there are no backward packets.
# NOTE(review): with zero backward packets the ratio becomes fwd * 1e6 —
# confirm that such large values are intended for the downstream models.
if {'Total Fwd Packets', 'Total Backward Packets'} <= available:
    denominator = df['Total Backward Packets'] + 1e-6
    df['fwd_bwd_ratio'] = df['Total Fwd Packets'] / denominator

# Replace any NaN left in the frame with 0 (in place)
df.fillna(value=0, inplace=True)


In [5]:
# 🔹 Cell 4: Regenerate 'anomaly' column using Isolation Forest
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Model input: every column except a previously generated 'anomaly' column.
# errors='ignore' makes the drop a no-op the first time this cell is run.
features = df.drop(columns=['anomaly'], errors='ignore')

# Standardise the features before fitting
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Fit the Isolation Forest and score the same rows it was fitted on.
# fit_predict returns 1 for inliers and -1 for outliers.
model = IsolationForest(n_estimators=100, contamination=0.02, random_state=42)
raw_predictions = model.fit_predict(scaled_features)

# Store the predictions as readable text labels, aligned to df's own index
label_names = {1: 'Normal', -1: 'Anomaly'}
df['anomaly'] = pd.Series(raw_predictions, index=df.index).map(label_names)

# Show how many rows fell into each class
print("✅ 'anomaly' column added:")
print(df['anomaly'].value_counts())


✅ 'anomaly' column added:
anomaly
Normal     2771318
Anomaly      56558
Name: count, dtype: int64


In [None]:
# 🔹 Cell 5: Feature Selection using RFE + Random Forest
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Prepare input and target
X = df.drop(columns=['anomaly'])
y = df['anomaly'].map({'Normal': 0, 'Anomaly': 1})  # Binary encoding

# RFE with its default step=1 refits the forest once per eliminated feature,
# which is very expensive on the full frame (the saved notebook shows no output
# for this cell). Fit on a class-stratified subsample instead; frames with at
# most RFE_MAX_ROWS rows are used unchanged.
RFE_MAX_ROWS = 200_000  # set to None to fit on every row

if RFE_MAX_ROWS is not None and len(X) > RFE_MAX_ROWS:
    # Sample row *positions* per class so the selection does not depend on the
    # frame's index labels. The same fraction is drawn from each class, which
    # keeps the Normal/Anomaly proportions of the full data.
    sample_positions = (
        pd.Series(np.arange(len(y)))
        .groupby(y.to_numpy())
        .sample(frac=RFE_MAX_ROWS / len(X), random_state=42)
        .to_numpy()
    )
    X_rfe, y_rfe = X.iloc[sample_positions], y.iloc[sample_positions]
else:
    X_rfe, y_rfe = X, y

# RFE
# n_jobs=-1 builds the trees on all available cores; with a fixed random_state
# the fitted forest does not depend on the number of jobs.
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
selector = RFE(model, n_features_to_select=10)
selector = selector.fit(X_rfe, y_rfe)

# Print top features
print("🎯 Top 10 Selected Features:")
print(X.columns[selector.support_])
