<a href="https://colab.research.google.com/github/omar-omar-om/gradProject-notebooks/blob/main/frequency_tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load Libraries

In [None]:
import os
import pickle
import json
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, confusion_matrix
)

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier





# Define Paths & Load Data


In [None]:
from google.colab import drive

# Make Google Drive reachable from the Colab runtime.
drive.mount('/content/drive')

# Project folder on Drive; trained models are written to a sub-folder of it,
# which is created on first use.
base_path = "/content/drive/My Drive/frequency-encoding/"
model_save_path = os.path.join(base_path, "best_model/")
os.makedirs(model_save_path, exist_ok=True)

# Read the train / validation / test CSV splits in one pass.
train, val, test = (
    pd.read_csv(os.path.join(base_path, csv_name))
    for csv_name in ("train_frequency.csv", "val_frequency.csv", "test_frequency.csv")
)

# Separate the label column from the feature columns for every split.
target = "HasDetections"
X_train, X_val, X_test = (
    split_frame.drop(columns=[target]) for split_frame in (train, val, test)
)
y_train, y_val, y_test = (
    split_frame[target] for split_frame in (train, val, test)
)

print(f" Data loaded. Train shape: {X_train.shape}, Validation shape: {X_val.shape}, Test shape: {X_test.shape}")




Mounted at /content/drive
 Data loaded. Train shape: (2062484, 61), Validation shape: (257810, 61), Test shape: (257811, 61)


# Define Models & Hyperparameters for Grid Search


In [None]:
# Candidate classifiers and the hyperparameter grid that GridSearchCV explores
# for each of them. Every entry maps a display name to:
#   "model"  - an unfitted estimator instance
#   "params" - the parameter grid (parameter name -> list of values to try)

# Shared seed so that subsampling / tie-breaking inside the estimators is
# repeatable from one notebook run to the next.
RANDOM_STATE = 42

models = {
    "XGBoost": {
        # `use_label_encoder` is intentionally not passed: the installed
        # XGBoost ignores it and only emits a
        # 'Parameters: { "use_label_encoder" } are not used.' warning per fit.
        "model": XGBClassifier(eval_metric="logloss", random_state=RANDOM_STATE),
        "params": {
            "n_estimators": [100, 300],
            "max_depth": [3, 9],
            "learning_rate": [0.01, 0.1],
            "subsample": [0.8, 1.0],
            "colsample_bytree": [0.8, 1.0]
        }
    },
    "LightGBM": {
        "model": LGBMClassifier(random_state=RANDOM_STATE),
        "params": {
            "n_estimators": [100, 300],
            "max_depth": [-1, 6],  # -1 means "no depth limit" in LightGBM
            "learning_rate": [0.01, 0.1],
            "num_leaves": [31, 50]
        }
    },
    "DecisionTree": {
        "model": DecisionTreeClassifier(random_state=RANDOM_STATE),
        "params": {
            "max_depth": [None, 6, 12, 20],
            "criterion": ["gini", "entropy"],
            "min_samples_split": [2, 5, 10]
        }
    }
}

print("Models and expanded hyperparameter grids defined.")


Models and expanded hyperparameter grids defined.


# Perform Grid Search & Select the Best Model



In [None]:
# Run a grid search for every candidate in `models`, score the tuned estimator
# on the validation split, record timings, and remember the best one.
#
# Results left in the namespace for later cells:
#   best_model       - fitted estimator with the highest validation AUC
#   best_model_name  - its key in `models`
#   best_auc         - its validation ROC-AUC
#   timing_df        - per-model timing / AUC summary table

# Start below any achievable AUC so the first evaluated model is always kept.
best_model = None
best_auc = float("-inf")
best_model_name = None
timing_results = []

for model_name, config in models.items():
    print(f" Running Grid Search for {model_name}...")

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by wall-clock adjustments during a long search.
    start_grid_search = time.perf_counter()

    grid_search = GridSearchCV(config["model"], config["params"], cv=3, scoring="roc_auc", n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    # Total search time: all CV fits plus the final refit on the full training set.
    grid_search_time = time.perf_counter() - start_grid_search

    # With the default refit=True, GridSearchCV has already retrained the best
    # configuration on the whole training set. Fitting it again would repeat a
    # full training run only to time it, so the recorded refit duration is
    # reported instead.
    best_estimator = grid_search.best_estimator_
    training_time = grid_search.refit_time_

    # Time the validation scoring. Only class-1 probabilities are needed for
    # ROC-AUC, so no separate hard-label prediction pass is made.
    start_eval = time.perf_counter()
    y_prob = best_estimator.predict_proba(X_val)[:, 1]
    evaluation_time = time.perf_counter() - start_eval

    val_auc = roc_auc_score(y_val, y_prob)

    print(f" {model_name} Best AUC on Validation Set: {val_auc:.4f}")

    timing_results.append({
        "Model": model_name,
        "Grid Search Time (s)": round(grid_search_time, 2),
        "Final Training Time (s)": round(training_time, 2),
        "Evaluation Time (s)": round(evaluation_time, 2),
        "AUC Score": round(val_auc, 4)
    })

    # Keep whichever tuned model scores highest on the validation split.
    if val_auc > best_auc:
        best_auc = val_auc
        best_model = best_estimator
        best_model_name = model_name

# Summarise timings and scores for all candidates.
timing_df = pd.DataFrame(timing_results)
print("\n Training & Evaluation Time Results:")
print(timing_df.to_string(index=False))

print(f"\nBest Model: {best_model_name} with AUC: {best_auc:.4f}")


 Running Grid Search for XGBoost...
Fitting 3 folds for each of 32 candidates, totalling 96 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



 XGBoost Best AUC on Validation Set: 0.7304
 Running Grid Search for LightGBM...
Fitting 3 folds for each of 16 candidates, totalling 48 fits




[LightGBM] [Info] Number of positive: 1034588, number of negative: 1027896
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.609664 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 2062484, number of used features: 61
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501622 -> initscore=0.006489
[LightGBM] [Info] Start training from score 0.006489
[LightGBM] [Info] Number of positive: 1034588, number of negative: 1027896
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.838718 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4958
[LightGBM] [Info] Number of data points in the train set: 2062484, number of used features: 61
[LightGB

# Save the Best Model as a Pickle File



In [None]:
# Persist the winning estimator next to the data, named after the model family.
pickle_name = f"best_{best_model_name}.pkl"
best_model_path = os.path.join(model_save_path, pickle_name)

with open(best_model_path, "wb") as model_file:
    pickle.dump(best_model, model_file)

print(f" Best model ({best_model_name}) saved to {best_model_path}")


 Best model (XGBoost) saved to /content/drive/My Drive/frequency-encoding/best_model/best_XGBoost.pkl


# Load & Evaluate the Best Model (All Metrics)



In [None]:

# Read the persisted model back from disk so the evaluation exercises exactly
# the artifact that was saved.
with open(best_model_path, "rb") as model_file:
    loaded_model = pickle.load(model_file)

# Class-1 probabilities feed ROC-AUC; hard labels feed the remaining metrics.
y_test_prob = loaded_model.predict_proba(X_test)[:, 1]
y_test_pred = loaded_model.predict(X_test)

# Score the held-out test split.
test_auc = roc_auc_score(y_test, y_test_prob)
test_accuracy, test_precision, test_recall, test_f1 = (
    metric_fn(y_test, y_test_pred)
    for metric_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report every metric to four decimals, followed by the per-class breakdown.
print(f"\n Best Model: {best_model_name} Performance on Test Set")
for metric_label, metric_value in (
    ("AUC Score", test_auc),
    ("Accuracy", test_accuracy),
    ("Precision", test_precision),
    ("Recall", test_recall),
    ("F1 Score", test_f1),
):
    print(f"{metric_label}: {metric_value:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

print("Model evaluation completed.")



 Best Model: XGBoost Performance on Test Set
AUC Score: 0.7295
Accuracy: 0.6631
Precision: 0.6658
Recall: 0.6594
F1 Score: 0.6626

Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.67      0.66    128487
           1       0.67      0.66      0.66    129324

    accuracy                           0.66    257811
   macro avg       0.66      0.66      0.66    257811
weighted avg       0.66      0.66      0.66    257811

Model evaluation completed.
