In [1]:
import pandas as pd
import numpy as np
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, confusion_matrix, f1_score,precision_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.model_selection import RepeatedStratifiedKFold,StratifiedKFold
from sklearn.feature_selection import RFECV

from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import ConfusionMatrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

In [2]:
# Load the raw training table from the project-relative DataSet folder.
train = pd.read_csv(filepath_or_buffer="DataSet/Train.csv")

In [6]:
# Name of the label column; every remaining column except the row
# identifier is used as a model input.
target = 'Label'
features = train.drop(columns=['ID', target]).columns


In [7]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# label so both partitions keep the same class ratio (accounts for class
# imbalance). The labels are kept as a one-column DataFrame.
X_train, X_test, y_train, y_test = train_test_split(
    train[features],
    train[target].to_frame(),
    test_size=0.2,
    random_state=42,
    stratify=train[target],
)

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

# Candidate classifiers keyed by display name. Insertion order is the order
# in which they are trained and reported.
models = {}

models["Logistic Regression"] = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
)

models["Random Forest"] = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=300,
    random_state=42,
)

models["XGBoost"] = XGBClassifier(
    eval_metric='logloss',
    n_estimators=300,
    random_state=42,
)

models["LightGBM"] = LGBMClassifier(
    class_weight='balanced',
    n_estimators=300,
    random_state=42,
)

In [9]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Baseline: fit every candidate on the full feature set and score it on the
# held-out split.
results = []

# Loop-invariant inputs: 1-D label vector and the number of input columns.
y_train_flat = y_train.values.ravel()
n_features = X_train.shape[1]

# Report column -> metric function, in the order the columns are reported.
scorers = (
    ("F1", f1_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
)

for name, model in models.items():
    model.fit(X_train, y_train_flat)
    y_pred = model.predict(X_test)

    row = {"Model": name, "Features": n_features}
    for column, scorer in scorers:
        row[column] = scorer(y_test, y_pred)
    results.append(row)

results_df = pd.DataFrame(results)
print(results_df)

[LightGBM] [Info] Number of positive: 1923, number of negative: 5203
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000678 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3109
[LightGBM] [Info] Number of data points in the train set: 7126, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
                 Model  Features        F1  Precision    Recall
0  Logistic Regression        14  0.527347   0.434140  0.671518
1        Random Forest        14  0.942454   0.986364  0.902287
2              XGBoost        14  0.957265   0.984615  0.931393
3             LightGBM        14  0.957717   0.974194  0.941788


In [10]:
from sklearn.feature_selection import RFECV

def select_features(X_train, y_train, X_test, model,
                    cv=5, scoring='f1', min_features_to_select=5,
                    step=1, n_jobs=-1):
    """Pick a feature subset with cross-validated recursive feature elimination.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; the selection is fitted on this frame.
    y_train : array-like
        Training labels. A one-column DataFrame, a Series or a 1-D array are
        all accepted; the labels are flattened to 1-D before fitting.
    X_test : pandas.DataFrame
        Held-out features; reduced to the same selected columns.
    model : estimator
        Estimator handed to ``RFECV`` as the ranking model.
    cv, scoring, min_features_to_select, step, n_jobs
        Forwarded unchanged to ``RFECV``. The defaults reproduce the values
        that were previously hard-coded.

    Returns
    -------
    tuple
        ``(X_train_fs, X_test_fs, selected)``: the two frames restricted to
        the selected columns, and the selected column labels.
    """
    # Passing an (n, 1) label frame makes the wrapped estimators emit a
    # column_or_1d DataConversionWarning on every fit; flatten it up front.
    y_flat = np.asarray(y_train).ravel()

    rfecv = RFECV(
        estimator=model,
        step=step,
        cv=cv,
        scoring=scoring,
        min_features_to_select=min_features_to_select,
        n_jobs=n_jobs
    )
    rfecv.fit(X_train, y_flat)

    # support_ is a boolean mask aligned with X_train's columns.
    selected = X_train.columns[rfecv.support_]
    X_train_fs = X_train[selected]
    X_test_fs = X_test[selected]

    return X_train_fs, X_test_fs, selected

In [None]:
# Repeat the evaluation after per-model RFECV feature selection.
results=[]

# y_train is a one-column DataFrame. Fitting on it directly triggers a
# column_or_1d DataConversionWarning for each fit, so flatten it once and
# use the 1-D vector for both the selection and the final fit (the baseline
# loop already fits on y_train.values.ravel()).
y_train_1d=y_train.values.ravel()

for name,model in models.items():

    X_train_fs,X_test_fs,selected=select_features(X_train,y_train_1d,X_test,model)
    
    # Refit on the reduced feature set and score on the matching test columns.
    model.fit(X_train_fs,y_train_1d)
    y_pred = model.predict(X_test_fs)
    
    results.append({
        "Model": name,
        "Features": len(selected),
        "F1": f1_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred)
    })
    print(f"{name} selected features:")
    print(list(selected))
    print("-" * 40)

results_df=pd.DataFrame(results)
print(results_df)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Logistic Regression selected features:
['I/O Data Operations', ' I/O Data Bytes', 'Number of subprocesses', 'Time on processor', 'Disk Reading/sec', 'Disc Writing/sec', 'Bytes Sent/sent', 'Received Bytes (HTTP)', 'Network packets sent', 'Network packets received', 'Pages Read/sec', 'Pages Input/sec', 'Page Errors/sec', 'Confirmed byte radius']
----------------------------------------
