In [2]:
import pandas as pd


In [3]:
# Load the train and test splits from the CSV files next to the notebook.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ("kdd_train.csv", "kdd_test.csv")
)

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [5]:
# Targets are the "labels" column; features are every other column.
y_train, y_test = df_train["labels"], df_test["labels"]
X_train, X_test = (
    frame.drop(columns="labels") for frame in (df_train, df_test)
)

In [6]:
import psutil

# Columns that get one-hot encoded; every other feature column is standardised.
categorical_cols = ["protocol_type", "service", "flag"]
scaled_cols = [name for name in X_train.columns if name not in categorical_cols]

# Preprocessing: one-hot for categoricals (unknown test categories are ignored),
# standard scaling for the remaining columns.
encode_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
scale_step = ("num", StandardScaler(), scaled_cols)
preprocessor = ColumnTransformer(transformers=[encode_step, scale_step])

# Baseline model: random forest on top of the preprocessor.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
model_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", rf_classifier),
])

# Fit on the training split, score on the test split, then report system memory.
y_pred = model_pipeline.fit(X_train, y_train).predict(X_test)
print("Accuracy score RF", accuracy_score(y_test, y_pred))
print(psutil.virtual_memory())

Accuracy score RF 0.9139017033356991
svmem(total=4151689216, available=707395584, percent=83.0, used=3444293632, free=707395584)


In [7]:
import matplotlib.pyplot as plt

In [8]:
# Per-feature importances from the fitted random forest inside the pipeline.
rf_step = model_pipeline.named_steps["classifier"]
importances = rf_step.feature_importances_


In [9]:
# Names of the one-hot output columns, taken from the fitted encoder.
# The encoder is looked up by its step name ("cat") and fed the same
# `categorical_cols` list the preprocessor was built with, so this line cannot
# drift out of sync if the transformer order or the column list is edited.
fitted_encoder = model_pipeline.named_steps["preprocessor"].named_transformers_["cat"]
one_hot_columns = fitted_encoder.get_feature_names_out(input_features=categorical_cols)

In [10]:
# Every feature column that is not one-hot encoded, in original column order.
# Reuses `categorical_cols` (the list given to the preprocessor) instead of a
# second hard-coded copy of the same three names.
numeric_columns = [col for col in X_train.columns if col not in categorical_cols]

In [11]:
# One-hot names first, then the scaled columns: same order as the transformers
# are listed in the preprocessor.
all_features = [*one_hot_columns, *numeric_columns]

In [12]:
# Pair each transformed feature name with its importance and rank descending.
importance_table = pd.DataFrame({"feature": all_features, "importance": importances})
feature_importance_df = importance_table.sort_values(by="importance", ascending=False)

# Keep the names of the 20 highest-ranked features.
top_features = feature_importance_df["feature"].head(20).tolist()

In [13]:
def reduce_features(df, top_feat_list, cat_cols=None):
    """One-hot encode ``df`` and return only the columns in ``top_feat_list``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature frame containing the categorical columns.
    top_feat_list : list of str
        Column names to keep, in the order they should appear in the result.
    cat_cols : list of str, optional
        Columns to one-hot encode. Defaults to the notebook-level
        ``categorical_cols``.

    Returns
    -------
    pandas.DataFrame
        Frame with exactly the columns of ``top_feat_list``, in that order.

    Notes
    -----
    ``pd.get_dummies`` only creates columns for categories present in ``df``.
    A requested dummy column that is missing (for example a category that
    never occurs in the test split) is therefore added as all zeros instead of
    raising ``KeyError``, so train and test frames always share one layout.
    """
    if cat_cols is None:
        cat_cols = categorical_cols
    df_encoded = pd.get_dummies(df, columns=cat_cols)
    # reindex both selects/orders the columns and fills absent ones with 0.
    return df_encoded.reindex(columns=top_feat_list, fill_value=0)
# Apply the same top-feature reduction to the train and test feature frames.
X_train_reduced, X_test_reduced = (
    reduce_features(split, top_features) for split in (X_train, X_test)
)

In [14]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
# AdaBoost over depth-1 decision trees, trained on the reduced feature set.
base_model = DecisionTreeClassifier(max_depth=1)
ada_params = dict(
    estimator=base_model,
    n_estimators=100,
    learning_rate=1.0,
    random_state=42,
)
model = AdaBoostClassifier(**ada_params)

# Fit on the training split, score on the test split, then report system memory.
y_pred = model.fit(X_train_reduced, y_train).predict(X_test_reduced)
ada_accuracy = accuracy_score(y_test, y_pred)
print("AdaF Feature selection score:", ada_accuracy)
import psutil
print(psutil.virtual_memory())



AdaF Feature selection score: 0.7231192334989354
svmem(total=4151689216, available=656621568, percent=84.2, used=3495067648, free=656621568)


In [None]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier,StackingClassifier
from sklearn.tree import DecisionTreeClassifier

# Encode the class labels as integers, fitting the encoder on the training labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# LabelEncoder.transform rejects labels it was not fitted on, so keep only the
# test rows whose label also occurs in the training set. Scores computed on
# this filtered subset are not directly comparable with the earlier scores
# that use the full y_test.
valid_mask = y_test.isin(label_encoder.classes_)
X_test_filtered = X_test_reduced[valid_mask]
y_test_filtered = y_test[valid_mask]

# Encode filtered test labels
y_test_encoded = label_encoder.transform(y_test_filtered)

# Base learners: a random forest and AdaBoost over depth-2 trees.
base_model = DecisionTreeClassifier(max_depth=2)
base_learners = [
    ("RF", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("ADA", AdaBoostClassifier(estimator=base_model, random_state=42)),
]

# Logistic regression combines the base predictions; passthrough=True also
# feeds it the original features.
meta_learner = LogisticRegression()
stack_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    passthrough=True,
)
stack_model.fit(X_train_reduced, y_train_encoded)
y_pred = stack_model.predict(X_test_filtered)
accuracy = accuracy_score(y_test_encoded, y_pred)
# The model evaluated here is the stacking ensemble, so label the score as such
# (the previous "XGB" label did not match the model).
print(f"Stacking Accuracy score: {accuracy:.3f}")
print(psutil.virtual_memory())




In [None]:
import numpy as np
import networkx as nx
from sklearn.feature_selection import mutual_info_classif

In [None]:
# Mutual information between each reduced feature and the encoded label.
# random_state is fixed (42, like the estimators elsewhere in this notebook) so
# the scores, and the per-cluster winners derived from them, are reproducible.
mi = mutual_info_classif(X_train_reduced, y_train_encoded, random_state=42)
mi_series = pd.Series(mi, index=X_train_reduced.columns).sort_values(ascending=False)

# Absolute pairwise correlation between the reduced features.
corr_matrix = X_train_reduced.corr().abs()

# Empty undirected graph; nodes and edges are added in the following cell.
G = nx.Graph()

In [None]:
import matplotlib.pyplot as plt
# One node per feature (first 20 columns of the reduced frame).
G.add_nodes_from(X_train_reduced.columns[:20])

# Link every pair of features whose absolute correlation exceeds 0.85.
# Only the lower triangle is visited, so each pair is checked once.
corr_names = corr_matrix.columns
for row_pos, row_name in enumerate(corr_names):
    for col_pos in range(row_pos):
        if corr_matrix.iat[row_pos, col_pos] > 0.85:
            G.add_edge(row_name, corr_names[col_pos])

# Connected components of size > 1 are correlated clusters; singletons are
# features with no strongly correlated partner.
components = list(nx.connected_components(G))
clusters = [list(members) for members in components if len(members) > 1]
independents = [next(iter(members)) for members in components if len(members) == 1]



In [None]:
# Within each correlated group keep the feature with the highest mutual
# information and mark the others for removal.
correlated_groups = [grp for grp in nx.connected_components(G) if len(grp) > 1]
features_to_drop = set()
for grp in correlated_groups:
    keeper = max(grp, key=lambda name: mi_series[name])
    features_to_drop |= grp - {keeper}

X_graph_filtered = X_train_reduced.drop(columns=features_to_drop)

In [None]:
# Mutual information of every reduced feature with the encoded label.
# random_state is fixed so the per-cluster winner does not change between runs.
mi_scores = pd.Series(
    mutual_info_classif(X_train_reduced, y_train_encoded, random_state=42),
    index=X_train_reduced.columns,
)

# Most informative feature from each cluster
best_from_clusters = [mi_scores[group].idxmax() for group in clusters]

# Combine independent features and selected bests from clusters.
# The result follows the column order of X_train_reduced rather than the
# iteration order of a set, which for strings can differ from one kernel
# session to the next and would reshuffle the model's input columns.
kept_features = set(independents) | set(best_from_clusters)
final_features = [col for col in X_train_reduced.columns if col in kept_features]
X_selected = X_train_reduced[final_features]



In [None]:
# Restrict both reduced frames to the final (de-correlated) feature list.
X_train_selected, X_test_selected = (
    reduced[final_features] for reduced in (X_train_reduced, X_test_reduced)
)

In [None]:
# XGBoost classifier configuration: multi-class log-loss as the evaluation
# metric and a fixed seed.
xgb_params = {
    "eval_metric": "mlogloss",
    "random_state": 42,
    # NOTE(review): flag carried over unchanged; whether the installed xgboost
    # release still honours it should be confirmed.
    "use_label_encoder": False,
}
model = xgb.XGBClassifier(**xgb_params)

In [None]:

# Keep only test rows whose label the encoder was fitted on, then encode them.
y_test_filtered = y_test[valid_mask]
y_test_encoded = label_encoder.transform(y_test_filtered)

# Apply the same row mask to the features, then narrow to the final columns so
# features and labels stay row-aligned.
X_test_filtered = X_test_reduced[valid_mask]
X_test_selected = X_test_filtered[final_features]



In [None]:
# Train and score on the refined feature matrices with integer-encoded labels.
# The raw X_train / y_train / X_test / y_test used previously are the
# un-encoded, un-reduced splits, so they did not match the "refined features"
# this cell reports on; the test rows are also the label-filtered subset so
# that features and encoded labels line up.
model.fit(
    X_train_selected, y_train_encoded,
    eval_set=[(X_test_selected, y_test_encoded)],
    verbose=False,
)
y_pred = model.predict(X_test_selected)
accuracy = accuracy_score(y_test_encoded, y_pred)
print(f"XGBoost Accuracy with refined features: {accuracy:.3f}")


In [None]:
import time
import psutil
import xgboost as xgb
from sklearn.metrics import accuracy_score

def benchmark_model(X_train, y_train, X_test, y_test, label="Model"):
    """Train an XGBoost classifier and print accuracy, runtime and RSS change.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Evaluation features and labels; passed as the ``eval_set`` during
        fitting and used for the reported accuracy.
    label : str, default "Model"
        Name printed in front of the metrics.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    process = psutil.Process()
    # Resident set size of the current process, converted from bytes to MB.
    start_mem = process.memory_info().rss / (1024 ** 2)  # in MB
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() is wall-clock time and can jump if the system
    # clock is adjusted.
    start_time = time.perf_counter()

    model = xgb.XGBClassifier(
        eval_metric='mlogloss',
        use_label_encoder=False,
        random_state=42
    )
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    # The timed interval covers model construction, fitting, prediction and scoring.
    end_time = time.perf_counter()
    end_mem = process.memory_info().rss / (1024 ** 2)

    print(f" {label} -- Accuracy: {acc:.4f}")
    print(f" Runtime: {end_time - start_time:.2f} seconds")
    # Difference in process-wide RSS between the two samples.
    print(f" Memory Used: {end_mem - start_mem:.2f} MB\n")

# Run for full feature set
# X_test_filtered (not X_test_reduced) is used so the feature rows match
# y_test_encoded, which only covers test rows whose label passed valid_mask.
benchmark_model(X_train_reduced, y_train_encoded, X_test_filtered, y_test_encoded, label="Full Features")

# Run for pruned feature set
benchmark_model(X_train_selected, y_train_encoded, X_test_selected, y_test_encoded, label="Pruned Features")
