<a href="https://colab.research.google.com/github/parthgiri01/WEEK-4-TEST/blob/main/Welcome_to_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
import warnings
import numpy as np # Import numpy

warnings.filterwarnings('ignore')

# --- Data Loading (Placeholder) ---
# NOTE: Replace this with your actual data loading code for the Bot-IoT dataset.
# Example: df = pd.read_csv('your_bot_iot_dataset.csv')
# For demonstration, we will create a simple mock dataset with class imbalance.
print("Loading dataset...")
np.random.seed(42)
num_samples = 10000
num_features = 20
X = pd.DataFrame(np.random.rand(num_samples, num_features))

# Create imbalanced labels: 95% benign (0), 5% malicious (1)
y = np.zeros(num_samples, dtype=int)
num_anomalies = int(num_samples * 0.05)
y[:num_anomalies] = 1
np.random.shuffle(y)
y = pd.Series(y)

print(f"Dataset loaded. Class distribution: {y.value_counts()}")

# --- Pre-processing ---
print("Pre-processing data...")
# Split data before scaling to prevent data leakage
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Scaling numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Handle class imbalance using SMOTE on the training data
print("Applying SMOTE to balance the training data...")
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)
print(f"Training data class distribution after SMOTE: {pd.Series(y_train_res).value_counts()}")

# --- Model Training ---
print("Training LightGBM model...")
lgbm_model = lgb.LGBMClassifier(random_state=42)
lgbm_model.fit(X_train_res, y_train_res)
print("Model training complete.")

# --- Evaluation ---
print("Evaluating model performance on the test set...")
y_pred = lgbm_model.predict(X_test_scaled)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Calculate False Positive Rate (FP / (FP + TN))
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
fp_rate = fp / (fp + tn) if (fp + tn) > 0 else 0

# Print results
print("\n--- Model Performance Metrics ---")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"False Positive Rate: {fp_rate:.4f}")

Loading dataset...
Dataset loaded. Class distribution: 0    9500
1     500
Name: count, dtype: int64
Pre-processing data...
Applying SMOTE to balance the training data...
Training data class distribution after SMOTE: 0    6650
1    6650
Name: count, dtype: int64
Training LightGBM model...
[LightGBM] [Info] Number of positive: 6650, number of negative: 6650
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003412 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 13300, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Model training complete.
Evaluating model performance on the test set...

--- Model Performance Metrics ---
Accuracy: 0.8443
Precision: 0.9065
Recall: 0.8443
F1-Score: 0.8733
False Positive Rate: 0.1186
