In [None]:
# 🔹 Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.feature_selection import RFE

# 🔹 Step 2: Load and Prepare Dataset
# Read the combined CSV and strip stray whitespace from the header names so
# that later lookups by exact column name succeed.
df = pd.read_csv(r"C:\Users\Otala\Desktop\cleaned_numeric_combined.csv")
df.columns = df.columns.str.strip()

# Treat +/-inf as missing, then discard every row containing a missing value.
df = df.replace([np.inf, -np.inf], np.nan).dropna()

# Report the resulting size and confirm nothing missing is left.
n_missing = df.isnull().sum().sum()
print(f"✅ Loaded dataset. Shape: {df.shape}")
print(f"Missing values: {n_missing}")

# 🔹 Step 3: Feature Engineering (Optional Enhancements)
# Each derived column is added only when its source column(s) are present.
if 'Flow Duration' in df.columns:
    # Mean over the current row and up to nine preceding rows; min_periods=1
    # lets the first rows average over however many values exist so far.
    flow_duration = df['Flow Duration']
    df['flow_duration_rolling_mean'] = flow_duration.rolling(window=10, min_periods=1).mean()

if {'Total Fwd Packets', 'Total Backward Packets'}.issubset(df.columns):
    # The small epsilon keeps the ratio finite when the backward count is 0.
    bwd_packets = df['Total Backward Packets'] + 1e-6
    df['fwd_bwd_ratio'] = df['Total Fwd Packets'] / bwd_packets

# Replace any NaNs left by the derived columns with 0.
df = df.fillna(0)

# 🔹 Step 4: Regenerate Anomaly Column with Isolation Forest
# A pre-existing 'anomaly' column (if any) is excluded so that stale labels
# are not fed back in as an input feature.
features = df.drop(columns=['anomaly'], errors='ignore')

# Standardize every input column before fitting the detector.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# contamination=0.02 sets the share of rows flagged as outliers; the fixed
# random_state makes the labelling reproducible.
model = IsolationForest(n_estimators=100, contamination=0.02, random_state=42)

# fit_predict yields 1 for inliers and -1 for outliers; store readable labels.
prediction_labels = {1: 'Normal', -1: 'Anomaly'}
df['anomaly'] = pd.Series(model.fit_predict(scaled_features), index=df.index).map(prediction_labels)

print("\n✅ 'anomaly' column created:")
print(df['anomaly'].value_counts())

# 🔹 Step 5: Feature Selection with Random Forest + RFE
# NOTE(review): the target is the Isolation Forest labelling stored in
# df['anomaly'], so the ranking describes which columns best reproduce that
# labelling rather than an independent ground truth.
X = df.drop(columns=['anomaly'])
y = df['anomaly'].map({'Normal': 0, 'Anomaly': 1})

# RFE refits the forest once per elimination round (default step=1, i.e. one
# feature dropped per round), which is very expensive on a frame this size.
# n_jobs=-1 builds the trees on all available cores; with a fixed random_state
# the fitted forest, and therefore the selected features, are unchanged.
# Raising RFE's `step` would cut the number of refits further, but it can
# change which features survive, so it is deliberately left at the default.
model_rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
selector = RFE(model_rf, n_features_to_select=10)
selector = selector.fit(X, y)

# support_ is a boolean mask over X's columns marking the retained features.
print("\n🎯 Top 10 Selected Features:")
print(X.columns[selector.support_])

# 🔹 Optional: Plot Correlation Heatmap
plt.figure(figsize=(16, 10))
# df now carries the string-valued 'anomaly' column. pandas >= 2.0 no longer
# drops non-numeric columns in DataFrame.corr() and raises ValueError instead,
# so restrict the correlation to numeric columns explicitly. On older pandas
# this matches what corr() did implicitly.
corr_matrix = df.select_dtypes(include='number').corr()
# Pin the colour scale to the full [-1, 1] correlation range.
sns.heatmap(corr_matrix, cmap="coolwarm", vmax=1.0, vmin=-1.0)
plt.title("🔍 Feature Correlation Matrix")
plt.tight_layout()
plt.show()


✅ Loaded dataset. Shape: (2827876, 78)
Missing values: 0

✅ 'anomaly' column created:
anomaly
Normal     2771318
Anomaly      56558
Name: count, dtype: int64
