# Feature Selection

### Environment Setup

In [None]:
import os
import sys
import json

import numpy as np
import pandas as pd

from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

import xgboost as xgb
import joblib

# Report interpreter and library versions so a run can be reproduced later.
print("Environment check:")
version_report = (
    ("Python:", sys.version.split()[0]),
    ("Pandas:", pd.__version__),
    ("NumPy:", np.__version__),
    ("XGBoost:", xgb.__version__),
)
for library_label, library_version in version_report:
    print(library_label, library_version)


### Get Project Root and Key Paths

In [None]:
# Resolve the project root: when the working directory is the "notebooks"
# folder, the root is its parent; otherwise the working directory is the root.
cwd = os.getcwd()
project_root = os.path.dirname(cwd) if os.path.basename(cwd) == "notebooks" else cwd

print("Project root:", project_root)

# Directories holding the processed splits and the saved model artefacts.
data_dir = os.path.join(project_root, "data", "processed")
models_dir = os.path.join(project_root, "models")

train_path = os.path.join(data_dir, "train.csv")
val_path = os.path.join(data_dir, "val.csv")
tuned_model_path = os.path.join(models_dir, "xgb_tuned_model.joblib")
best_params_path = os.path.join(models_dir, "xgb_best_params.json")

# Fail fast on the first required input that is not on disk.
required_files = (train_path, val_path, tuned_model_path, best_params_path)
first_missing = next(
    (candidate for candidate in required_files if not os.path.exists(candidate)),
    None,
)
if first_missing is not None:
    raise FileNotFoundError(f"Missing required file: {first_missing}")

for description, location in (
    ("Train path:", train_path),
    ("Val path:", val_path),
    ("Tuned model path:", tuned_model_path),
    ("Best params path:", best_params_path),
):
    print(description, location)


### Load Data

In [None]:
# Read the train and validation CSVs (train first, then validation).
train_df, val_df = (pd.read_csv(csv_path) for csv_path in (train_path, val_path))

for split_label, split_frame in (("Train shape:", train_df), ("Val shape:", val_df)):
    print(split_label, split_frame.shape)

# Last expression of the cell: preview the first training rows.
train_df.head()


In [None]:
# Separate the target column from the feature columns for both splits.

# Both frames are indexed by "label" below, so validate each of them and name
# the split at fault instead of only checking the training frame.
for split_name, split_frame in (("train", train_df), ("val", val_df)):
    if "label" not in split_frame.columns:
        raise KeyError(
            f"Expected 'label' column in train/val CSVs (missing in {split_name})"
        )

y_train = train_df["label"].astype(int)
y_val = val_df["label"].astype(int)

# Columns excluded from the feature matrix: the target, plus the optional
# "insider" column when the training data carries it.
drop_cols = ["label"]
if "insider" in train_df.columns:
    drop_cols.append("insider")

X_train = train_df.drop(columns=drop_cols)
# Drop only the columns the validation frame actually has, so a validation CSV
# without the optional "insider" column does not raise here.
X_val = val_df.drop(columns=[col for col in drop_cols if col in val_df.columns])

feature_names = X_train.columns.tolist()

print("X_train:", X_train.shape)
print("X_val:", X_val.shape)
print("\nTrain label distribution:")
print(y_train.value_counts())


### Load Tuned Model and Compute Feature Importance

In [None]:
# Load the model stored at tuned_model_path.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load model artefacts that come from a trusted source.
xgb_tuned = joblib.load(tuned_model_path)
# Bare expression: displays the loaded model as the cell output.
xgb_tuned

In [None]:
# The loaded object must expose per-feature importances to rank features.
if not hasattr(xgb_tuned, "feature_importances_"):
    raise AttributeError("Loaded model has no feature_importances_ attribute.")

importances = xgb_tuned.feature_importances_

# Pair each training column name with its importance, then rank from most to
# least important and renumber the rows 0..n-1.
importance_df = pd.DataFrame({"feature": feature_names, "importance": importances})
importance_df = importance_df.sort_values(by="importance", ascending=False)
importance_df = importance_df.reset_index(drop=True)

print("Top 15 features by importance:")
importance_df.head(15)


### Keep only most important features

In [None]:
# Fraction of total importance the selected features should cover.
threshold = 0.95

# importance_df is sorted by descending importance, so this is the running
# share of importance covered by the top-k features.
importance_df["cum_importance"] = importance_df["importance"].cumsum()

# Keep a feature while the importance accumulated *before* it is still below
# the threshold. Compared with `cum_importance <= threshold`, this includes
# the feature that crosses the threshold (so the selection actually covers the
# requested share) and never yields an empty selection when the top feature
# alone exceeds the threshold.
importance_before = importance_df["cum_importance"].shift(1, fill_value=0.0)
selected = importance_df[importance_before < threshold]
selected_features = selected["feature"].tolist()

print(f"Selected {len(selected_features)} features out of {len(feature_names)}")
selected.head(10)


### Build new matrices with only reduced features

In [None]:
# Restrict both feature matrices to the selected columns, in selection order.
X_train_reduced = X_train.loc[:, selected_features]
X_val_reduced = X_val.loc[:, selected_features]

for matrix_label, matrix in (
    ("X_train_reduced:", X_train_reduced),
    ("X_val_reduced:", X_val_reduced),
):
    print(matrix_label, matrix.shape)


### Train model with best params and reduced features

In [None]:
# Load the hyperparameters from best_params_path (the project-root-based path
# whose existence was checked during setup) rather than a cwd-relative string,
# and close the file deterministically.
with open(best_params_path, encoding="utf-8") as params_file:
    best_params = json.load(params_file)

# Merge into one dict so the fixed training settings below take precedence.
# Passing them as separate keywords next to **best_params would raise
# TypeError if the JSON already contained one of these keys.
model_params = {
    **best_params,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "random_state": 42,
    "n_jobs": -1,
}

xgb_reduced = xgb.XGBClassifier(**model_params)

# Fit on the reduced feature matrix only.
xgb_reduced.fit(X_train_reduced, y_train)


### Evaluate Reduced model on validation set

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Take column 1 of predict_proba as the score and cut it at 0.5.
val_proba_matrix = xgb_reduced.predict_proba(X_val_reduced)
val_probs = val_proba_matrix[:, 1]
at_or_above_cutoff = val_probs >= 0.5
val_preds = at_or_above_cutoff.astype(int)

# Per-class precision/recall/F1 followed by the confusion matrix.
validation_report = classification_report(y_val, val_preds)
print(validation_report)
validation_confusion = confusion_matrix(y_val, val_preds)
print(validation_confusion)


### Save Reduced Features and Model

In [None]:
# Persist the selected feature names and the reduced model.
# Both files go under models_dir (derived from the project root) instead of a
# cwd-relative "models" folder, so they are written next to the artefacts that
# were loaded earlier even when the notebook runs from the notebooks/ folder.
feature_list = list(X_train_reduced.columns)

feature_list_path = os.path.join(models_dir, "feature_list.json")
with open(feature_list_path, "w") as f:
    json.dump(feature_list, f, indent=4)

print("Saved reduced feature list. Count:", len(feature_list))

reduced_model_path = os.path.join(models_dir, "xgb_reduced_model.joblib")
joblib.dump(xgb_reduced, reduced_model_path)

print("Saved reduced model to:", reduced_model_path)


# Conclusion

These results confirm that the reduced model preserves (and in some metrics improves) the performance of the full-feature model.
    
This reduced model now serves as the foundation for Notebook 5, where we perform final test-set evaluation and threshold calibration to define the model's operational alert modes.
