In [None]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load dataset from Kaggle
file_path = "dataset_invade.csv"
df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "bobaaayoung/dataset-invade",
    file_path,
)

# Display first few rows to verify loading
print("First 5 records:")
print(df.head())

# Preprocessing Steps
# 1. Convert categorical features into numerical using one-hot encoding
df_cleaned = pd.get_dummies(df, columns=['protocol_type', 'service', 'flag'], drop_first=True)

# 2. Encode target variable: 'No' -> 0, 'Yes' -> 1
df_cleaned['attack'] = df_cleaned['attack'].map({'No': 0, 'Yes': 1})
# .map() turns any label other than 'No'/'Yes' into NaN; fail here rather than
# letting NaN targets reach the split and the models.
assert df_cleaned['attack'].notna().all(), "Unexpected label in 'attack' (expected 'No' or 'Yes')"

# 3. Separate features (X) and target variable (y)
X = df_cleaned.drop(columns=['attack'])
y = df_cleaned['attack']

# 4. Split into training (80%) and testing (20%) sets BEFORE scaling, so that no
#    statistic computed from the test rows can influence the training data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Standardize features: the scaler learns mean/std from the training rows only
#    and the same transformation is then applied to the test rows.
#    Note that every column of X is scaled, including the one-hot indicators.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Display dataset shapes after preprocessing
print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")

In [None]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets] scikit-learn

import kagglehub
import pandas as pd
# KaggleDatasetAdapter is used by the load call below; import it here so this
# cell does not depend on an earlier cell having been executed.
from kagglehub import KaggleDatasetAdapter
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Load dataset
file_path = "dataset_invade.csv"
df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "bobaaayoung/dataset-invade",
    file_path,
)

# Check for missing values
print("Missing values per column:")
print(df.isnull().sum())

# Encode target variable
df['attack'] = df['attack'].map({'Yes': 1, 'No': 0})

# Split features and target
X = df.drop('attack', axis=1)
y = df['attack']

# Split into train and test sets (70% / 30%, stratified on the target)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Define preprocessing steps: every column that is not categorical is treated
# as numerical.
categorical_cols = ['protocol_type', 'service', 'flag']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Preprocessor for Logistic Regression (with scaling)
preprocessor_lr = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numerical_cols)
    ])

# Preprocessor for Random Forest (without scaling): non-categorical columns
# are passed through unchanged.
preprocessor_rf = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'
)

# Create pipelines; because preprocessing lives inside the pipeline it is
# fitted on the training split only when .fit() is called below.
pipeline_lr = Pipeline([
    ('preprocessor', preprocessor_lr),
    ('classifier', LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        random_state=42
    ))
])

pipeline_rf = Pipeline([
    ('preprocessor', preprocessor_rf),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        class_weight='balanced',
        random_state=42
    ))
])

# Train models
print("\nTraining Logistic Regression...")
pipeline_lr.fit(X_train, y_train)

print("\nTraining Random Forest...")
pipeline_rf.fit(X_train, y_train)

# Generate predictions; column 1 of predict_proba is kept as the score used
# for ROC AUC.
y_pred_lr = pipeline_lr.predict(X_test)
y_pred_proba_lr = pipeline_lr.predict_proba(X_test)[:, 1]

y_pred_rf = pipeline_rf.predict(X_test)
y_pred_proba_rf = pipeline_rf.predict_proba(X_test)[:, 1]
# Evaluate models
def evaluate_model(name, y_true, y_pred, y_proba):
    """Print a classification report, confusion matrix and ROC AUC for one model.

    Args:
        name: Label used in the printed heading.
        y_true: Ground-truth labels.
        y_pred: Predicted class labels, used for the report and the matrix.
        y_proba: Scores handed to ``roc_auc_score`` (the callers in this
            notebook pass column 1 of ``predict_proba``).
    """
    print(f"\n{name} Evaluation:")

    print("Classification Report:")
    report_text = classification_report(y_true, y_pred)
    print(report_text)

    print("Confusion Matrix:")
    matrix = confusion_matrix(y_true, y_pred)
    print(matrix)

    auc_value = roc_auc_score(y_true, y_proba)
    print(f"ROC AUC Score: {auc_value:.4f}")

# Report held-out metrics for both pipelines.
evaluate_model(
    name="Logistic Regression",
    y_true=y_test,
    y_pred=y_pred_lr,
    y_proba=y_pred_proba_lr,
)
evaluate_model(
    name="Random Forest",
    y_true=y_test,
    y_pred=y_pred_rf,
    y_proba=y_pred_proba_rf,
)

In [None]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets] scikit-learn pandas numpy

import kagglehub
import pandas as pd
from kagglehub import KaggleDatasetAdapter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Load dataset
file_path = "dataset_invade.csv"
df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "bobaaayoung/dataset-invade",
    file_path,
)

# Check for missing values
print("Missing values per column:")
print(df.isnull().sum())

# ----------------------------
# Preprocessing Steps
# ----------------------------

# 1. Convert categorical features into numerical using one-hot encoding
df_cleaned = pd.get_dummies(df, columns=['protocol_type', 'service', 'flag'], drop_first=True)

# 2. Encode target variable: 'No' -> 0, 'Yes' -> 1
df_cleaned['attack'] = df_cleaned['attack'].map({'No': 0, 'Yes': 1})
# .map() turns any label other than 'No'/'Yes' into NaN; fail here rather than
# letting NaN targets reach the split and the models.
assert df_cleaned['attack'].notna().all(), "Unexpected label in 'attack' (expected 'No' or 'Yes')"

# 3. Separate features (X) and target variable (y)
X = df_cleaned.drop(columns=['attack'])
y = df_cleaned['attack']

# 4. Split into training (80%) and testing (20%) sets (stratified on y) BEFORE
#    scaling, so that no statistic computed from the test rows can influence
#    the training data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Standardize features: the scaler learns mean/std from the training rows
#    only and the same transformation is then applied to the test rows.
#    Note that every column of X is scaled, including the one-hot indicators.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Display dataset shapes after preprocessing
print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")

# ----------------------------
# Model Training and Evaluation
# ----------------------------

# Initialize models
log_reg = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
rf_clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)

# Train Logistic Regression; column 1 of predict_proba is kept for ROC AUC
print("\nTraining Logistic Regression...")
log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)
y_pred_proba_lr = log_reg.predict_proba(X_test)[:, 1]

# Train Random Forest
print("\nTraining Random Forest...")
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
y_pred_proba_rf = rf_clf.predict_proba(X_test)[:, 1]

# Evaluation function
def evaluate_model(name, y_true, y_pred, y_proba):
    """Print a classification report, confusion matrix and ROC AUC for one model.

    Args:
        name: Label used in the printed heading.
        y_true: Ground-truth labels.
        y_pred: Predicted class labels, used for the report and the matrix.
        y_proba: Scores handed to ``roc_auc_score`` (the callers in this
            notebook pass column 1 of ``predict_proba``).
    """
    print(f"\n{name} Evaluation:")

    print("Classification Report:")
    report_text = classification_report(y_true, y_pred)
    print(report_text)

    print("Confusion Matrix:")
    matrix = confusion_matrix(y_true, y_pred)
    print(matrix)

    auc_value = roc_auc_score(y_true, y_proba)
    print(f"ROC AUC Score: {auc_value:.4f}")

# Report held-out metrics for both models.
evaluate_model(
    name="Logistic Regression",
    y_true=y_test,
    y_pred=y_pred_lr,
    y_proba=y_pred_proba_lr,
)
evaluate_model(
    name="Random Forest",
    y_true=y_test,
    y_pred=y_pred_rf,
    y_proba=y_pred_proba_rf,
)


In [None]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets] scikit-learn pandas numpy

import kagglehub
import pandas as pd
from kagglehub import KaggleDatasetAdapter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier  # New import for KNN
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Load dataset
file_path = "dataset_invade.csv"
df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "bobaaayoung/dataset-invade",
    file_path,
)

# Check for missing values
print("Missing values per column:")
print(df.isnull().sum())

# ----------------------------
# Preprocessing Steps
# ----------------------------

# 1. Convert categorical features into numerical using one-hot encoding
df_cleaned = pd.get_dummies(df, columns=['protocol_type', 'service', 'flag'], drop_first=True)

# 2. Encode target variable: 'No' -> 0, 'Yes' -> 1
df_cleaned['attack'] = df_cleaned['attack'].map({'No': 0, 'Yes': 1})
# .map() turns any label other than 'No'/'Yes' into NaN; fail here rather than
# letting NaN targets reach the split and the models.
assert df_cleaned['attack'].notna().all(), "Unexpected label in 'attack' (expected 'No' or 'Yes')"

# 3. Separate features (X) and target variable (y)
X = df_cleaned.drop(columns=['attack'])
y = df_cleaned['attack']

# 4. Split into training (80%) and testing (20%) sets (stratified on y) BEFORE
#    scaling, so that no statistic computed from the test rows can influence
#    the training data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Standardize features: the scaler learns mean/std from the training rows
#    only and the same transformation is then applied to the test rows.
#    Note that every column of X is scaled, including the one-hot indicators.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Display dataset shapes after preprocessing
print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")

# ----------------------------
# Model Training and Evaluation
# ----------------------------

# Initialize models
log_reg = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
rf_clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
knn_clf = KNeighborsClassifier(n_neighbors=5)  # KNN classifier with 5 neighbours

# Train Logistic Regression; column 1 of predict_proba is kept for ROC AUC
print("\nTraining Logistic Regression...")
log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)
y_pred_proba_lr = log_reg.predict_proba(X_test)[:, 1]

# Train Random Forest
print("\nTraining Random Forest...")
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
y_pred_proba_rf = rf_clf.predict_proba(X_test)[:, 1]

# Train K-Nearest Neighbors
print("\nTraining K-Nearest Neighbors...")
knn_clf.fit(X_train, y_train)
y_pred_knn = knn_clf.predict(X_test)
# KNN's predict_proba output is used as the score for ROC AUC
y_pred_proba_knn = knn_clf.predict_proba(X_test)[:, 1]

# Evaluation function
def evaluate_model(name, y_true, y_pred, y_proba):
    """Print a classification report, confusion matrix and ROC AUC for one model.

    Args:
        name: Label used in the printed heading.
        y_true: Ground-truth labels.
        y_pred: Predicted class labels, used for the report and the matrix.
        y_proba: Scores handed to ``roc_auc_score`` (the callers in this
            notebook pass column 1 of ``predict_proba``).
    """
    print(f"\n{name} Evaluation:")

    print("Classification Report:")
    report_text = classification_report(y_true, y_pred)
    print(report_text)

    print("Confusion Matrix:")
    matrix = confusion_matrix(y_true, y_pred)
    print(matrix)

    auc_value = roc_auc_score(y_true, y_proba)
    print(f"ROC AUC Score: {auc_value:.4f}")

# Report held-out metrics for all three models.
evaluate_model(
    name="Logistic Regression",
    y_true=y_test,
    y_pred=y_pred_lr,
    y_proba=y_pred_proba_lr,
)
evaluate_model(
    name="Random Forest",
    y_true=y_test,
    y_pred=y_pred_rf,
    y_proba=y_pred_proba_rf,
)
evaluate_model(
    name="K-Nearest Neighbors",
    y_true=y_test,
    y_pred=y_pred_knn,
    y_proba=y_pred_proba_knn,
)


In [6]:
# First run: train the three baseline models.
# NOTE: this section reads X_train / X_test / y_train and the estimator classes
# before this cell imports or builds them (that happens further down), so it
# relies on them already being present in the kernel.

# Logistic Regression
log_reg = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
print("\nTraining Logistic Regression...")
log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)
y_pred_proba_lr = log_reg.predict_proba(X_test)[:, 1]  # column 1 kept for ROC AUC

# Random Forest
rf_clf = RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42)
print("\nTraining Random Forest...")
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
y_pred_proba_rf = rf_clf.predict_proba(X_test)[:, 1]

# K-Nearest Neighbors
knn_clf = KNeighborsClassifier(n_neighbors=5)
print("\nTraining K-Nearest Neighbors...")
knn_clf.fit(X_train, y_train)
y_pred_knn = knn_clf.predict(X_test)
y_pred_proba_knn = knn_clf.predict_proba(X_test)[:, 1]

# Evaluation function
def evaluate_model(name, y_true, y_pred, y_proba):
    """Print a classification report, confusion matrix and ROC AUC for one model.

    Args:
        name: Label used in the printed heading.
        y_true: Ground-truth labels.
        y_pred: Predicted class labels, used for the report and the matrix.
        y_proba: Scores handed to ``roc_auc_score`` (the callers in this
            notebook pass column 1 of ``predict_proba``).
    """
    print(f"\n{name} Evaluation:")

    print("Classification Report:")
    report_text = classification_report(y_true, y_pred)
    print(report_text)

    print("Confusion Matrix:")
    matrix = confusion_matrix(y_true, y_pred)
    print(matrix)

    auc_value = roc_auc_score(y_true, y_proba)
    print(f"ROC AUC Score: {auc_value:.4f}")
    
# Report held-out metrics for the three first-run models.
evaluate_model(
    name="Logistic Regression",
    y_true=y_test,
    y_pred=y_pred_lr,
    y_proba=y_pred_proba_lr,
)
evaluate_model(
    name="Random Forest",
    y_true=y_test,
    y_pred=y_pred_rf,
    y_proba=y_pred_proba_rf,
)
evaluate_model(
    name="K-Nearest Neighbors",
    y_true=y_test,
    y_pred=y_pred_knn,
    y_proba=y_pred_proba_knn,
)

import kagglehub
import pandas as pd
from kagglehub import KaggleDatasetAdapter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
# XGBClassifier is used below but was never imported in this notebook; without
# this line a fresh kernel fails with NameError.
from xgboost import XGBClassifier

# Load dataset
file_path = "dataset_invade.csv"
df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "bobaaayoung/dataset-invade",
    file_path,
)

print("Missing values per column:")
print(df.isnull().sum())

# ----------------------------
# Preprocessing Steps
# ----------------------------

# 1. Convert categorical features into numerical using one-hot encoding
df_cleaned = pd.get_dummies(df, columns=['protocol_type', 'service', 'flag'], drop_first=True)

# 2. Encode target variable: 'No' -> 0, 'Yes' -> 1
df_cleaned['attack'] = df_cleaned['attack'].map({'No': 0, 'Yes': 1})
# .map() turns any label other than 'No'/'Yes' into NaN; fail here rather than
# letting NaN targets reach the split and the models.
assert df_cleaned['attack'].notna().all(), "Unexpected label in 'attack' (expected 'No' or 'Yes')"

# 3. Separate features (X) and target variable (y)
X = df_cleaned.drop(columns=['attack'])
y = df_cleaned['attack']

# 4. Split into training (80%) and testing (20%) sets (stratified on y) BEFORE
#    scaling, so that no statistic computed from the test rows can influence
#    the training data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Standardize features: the scaler learns mean/std from the training rows
#    only and the same transformation is then applied to the test rows.
#    Note that every column of X is scaled, including the one-hot indicators.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Display dataset shapes after preprocessing
print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")

# ----------------------------
# Model Training
# ----------------------------

# Initialize models
log_reg = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
rf_clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
knn_clf = KNeighborsClassifier(n_neighbors=5)  # KNN classifier with 5 neighbours
# Initialize XGBoost Classifier
# NOTE(review): `use_label_encoder` looks deprecated in recent XGBoost releases;
# confirm against the installed version and drop it if it is no longer accepted.
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)


# Train Logistic Regression; column 1 of predict_proba is kept for ROC AUC
print("\nTraining Logistic Regression...")
log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)
y_pred_proba_lr = log_reg.predict_proba(X_test)[:, 1]

# Train Random Forest
print("\nTraining Random Forest...")
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
y_pred_proba_rf = rf_clf.predict_proba(X_test)[:, 1]

# Train K-Nearest Neighbors
print("\nTraining K-Nearest Neighbors...")
knn_clf.fit(X_train, y_train)
y_pred_knn = knn_clf.predict(X_test)
# KNN's predict_proba output is used as the score for ROC AUC
y_pred_proba_knn = knn_clf.predict_proba(X_test)[:, 1]

# Train XGBoost
print("\nTraining XGBoost...")
xgb_clf.fit(X_train, y_train)
y_pred_xgb = xgb_clf.predict(X_test)
y_pred_proba_xgb = xgb_clf.predict_proba(X_test)[:, 1]

# Evaluation function
def evaluate_model(name, y_true, y_pred, y_proba):
    """Print a classification report, confusion matrix and ROC AUC for one model.

    Args:
        name: Label used in the printed heading.
        y_true: Ground-truth labels.
        y_pred: Predicted class labels, used for the report and the matrix.
        y_proba: Scores handed to ``roc_auc_score`` (the callers in this
            notebook pass column 1 of ``predict_proba``).
    """
    print(f"\n{name} Evaluation:")

    print("Classification Report:")
    report_text = classification_report(y_true, y_pred)
    print(report_text)

    print("Confusion Matrix:")
    matrix = confusion_matrix(y_true, y_pred)
    print(matrix)

    auc_value = roc_auc_score(y_true, y_proba)
    print(f"ROC AUC Score: {auc_value:.4f}")
# Finally, report held-out metrics for all four models.
evaluate_model(
    name="Logistic Regression",
    y_true=y_test,
    y_pred=y_pred_lr,
    y_proba=y_pred_proba_lr,
)
evaluate_model(
    name="Random Forest",
    y_true=y_test,
    y_pred=y_pred_rf,
    y_proba=y_pred_proba_rf,
)
evaluate_model(
    name="K-Nearest Neighbors",
    y_true=y_test,
    y_pred=y_pred_knn,
    y_proba=y_pred_proba_knn,
)
evaluate_model(
    name="XGBoost",
    y_true=y_test,
    y_pred=y_pred_xgb,
    y_proba=y_pred_proba_xgb,
)


Training Logistic Regression...

Training Random Forest...

Training K-Nearest Neighbors...

Logistic Regression Evaluation:
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.96     15411
           1       0.96      0.94      0.95     14293

    accuracy                           0.95     29704
   macro avg       0.95      0.95      0.95     29704
weighted avg       0.95      0.95      0.95     29704

Confusion Matrix:
[[14859   552]
 [  847 13446]]
ROC AUC Score: 0.9897

Random Forest Evaluation:
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     15411
           1       1.00      0.99      1.00     14293

    accuracy                           1.00     29704
   macro avg       1.00      1.00      1.00     29704
weighted avg       1.00      1.00      1.00     29704

Confusion Matrix:
[[15363    48]
 [   87 14206]]
ROC AUC Score: 0.9998

K-Ne

  df = kagglehub.load_dataset(


Missing values per column:
duration                  0
protocol_type             0
service                   0
flag                      0
src_bytes                 0
dst_bytes                 0
land                      0
wrong_fragment            0
urgent                    0
hot                       0
logged_in                 0
num_compromised           0
count                     0
srv_count                 0
serror_rate               0
rerror_rate               0
same_srv_rate             0
diff_srv_rate             0
srv_diff_host_rate        0
dst_host_count            0
dst_host_srv_count        0
dst_host_same_srv_rate    0
dst_host_diff_srv_rate    0
attack                    0
dtype: int64
Training set size: (118813, 101)
Testing set size: (29704, 101)

Training Logistic Regression...

Training Random Forest...

Training K-Nearest Neighbors...

Training XGBoost...

Logistic Regression Evaluation:
Classification Report:
              precision    recall  f1-score   support
