In [1]:
# === Setup ===
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer

from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# ------------------------------------------------------
# 1) Load data
# ------------------------------------------------------
csv_path = Path("../data/raw/exoplanets_2025.csv")
data = pd.read_csv(csv_path)
print("Initial shape:", data.shape)

# ------------------------------------------------------
# 2) Targets (no leakage)
# ------------------------------------------------------
# Binary target: 1 if pipeline labeled CANDIDATE, else 0
data["ExoplanetCandidate"] = (data["koi_pdisposition"] == "CANDIDATE").astype(int)

# Optional multi-level label (not used for prediction here)
# 2=CONFIRMED, 1=CANDIDATE, 0=FALSE POSITIVE/OTHER
data["ExoplanetConfirmed"] = np.select(
    [
        data["koi_disposition"] == "CONFIRMED",
        data["koi_disposition"] == "CANDIDATE",
    ],
    [2, 1],
    default=0,
)

# ------------------------------------------------------
# 3) Columns to drop (IDs, strings, labels)
# ------------------------------------------------------
drop_cols = [
    "kepler_name",           # string
    "kepoi_name",            # string
    "koi_disposition",       # label text (drop)
    "koi_pdisposition",      # label text (drop)
    "ExoplanetCandidate",    # drop from X later via column selection to be safe
    "ExoplanetConfirmed",    # drop from X later to avoid leakage
    "koi_tce_delivname",     # string delivery tag
]

# ID / weakly-informative
maybe_drop_cols = [
    "kepid",       # identifier
    "ra", "dec",   # sky coordinates (optional, drop to simplify)
    "koi_kepmag",  # can keep or drop; keeping is fine; drop for physics-ish
]

drop_cols = [c for c in drop_cols if c in data.columns]
maybe_drop_cols = [c for c in maybe_drop_cols if c in data.columns]

# ------------------------------------------------------
# 4) Basic cleaning: numeric-only features or encode categoricals
#    (Here we choose "numeric-only" for simplicity)
# ------------------------------------------------------
# Build candidate X with numeric columns only
numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()

# Remove the targets from numeric feature list if present
for t in ["ExoplanetCandidate", "ExoplanetConfirmed"]:
    if t in numeric_cols:
        numeric_cols.remove(t)

# Optionally ensure we don't include IDs / coords
for c in maybe_drop_cols:
    if c in numeric_cols:
        numeric_cols.remove(c)

X_all = data[numeric_cols].copy()
y_all = data["ExoplanetCandidate"].copy()

# Fill numeric NaNs with median
# Make sure weird values won't break the scaler/estimators
X_all = X_all.replace([np.inf, -np.inf], np.nan)

# Drop columns that are entirely NaN or constant (those break scaling/learning)
all_nan_cols = X_all.columns[X_all.isna().all()]
if len(all_nan_cols):
    X_all = X_all.drop(columns=all_nan_cols)

const_cols = X_all.columns[X_all.nunique(dropna=True) <= 1]
if len(const_cols):
    X_all = X_all.drop(columns=const_cols)

print(f"Dropped {len(all_nan_cols)} all-NaN cols, {len(const_cols)} constant cols. New feature count: {X_all.shape[1]}")


print(f"Feature count: {X_all.shape[1]}")

# ------------------------------------------------------
# 5) Train / test split (outer held-out)
# ------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.4, random_state=1, stratify=y_all
)

print(f"[Split] Train: {X_train.shape}, Test: {X_test.shape}")
print("Positive rate (train/test):", y_train.mean().round(3), y_test.mean().round(3))

# ------------------------------------------------------
# 6) Define models with appropriate preprocessing
# ------------------------------------------------------

# For linear / distance-based models: impute -> scale
scale_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler(with_mean=True, with_std=True)),
])

# For tree/ensemble models: impute only (no scaling)
impute_only = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
])

lr_pipe = Pipeline([
    ("prep", scale_pipeline),
    ("clf", LogisticRegression(max_iter=5000, class_weight="balanced", random_state=1)),
])

knn_pipe = Pipeline([
    ("prep", scale_pipeline),
    ("clf", KNeighborsClassifier(n_neighbors=5, metric="manhattan")),
])

dt_pipe = Pipeline([
    ("prep", impute_only),
    ("clf", DecisionTreeClassifier(random_state=1, class_weight="balanced")),
])

rf_pipe = Pipeline([
    ("prep", impute_only),
    ("clf", RandomForestClassifier(n_estimators=300, random_state=1, class_weight="balanced", n_jobs=-1)),
])

models = {
    "Logistic Regression": lr_pipe,
    "KNN": knn_pipe,
    "Decision Tree": dt_pipe,
    "Random Forest": rf_pipe,
}

# ------------------------------------------------------
# 7) Fit on TRAIN; evaluate on a validation split or via simple holdout (we'll use a small inner split)
# ------------------------------------------------------
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.4, random_state=1, stratify=y_train
)

def eval_model(name, model, X, y):
    y_pred = model.predict(X)
    return {
        "Model": name,
        "Accuracy": accuracy_score(y, y_pred),
        "Precision": precision_score(y, y_pred, zero_division=0),
        "Recall": recall_score(y, y_pred, zero_division=0),
        "F1": f1_score(y, y_pred, zero_division=0),
    }

val_rows = []
for name, pipe in models.items():
    pipe.fit(X_tr, y_tr)
    val_rows.append(eval_model(name, pipe, X_val, y_val))

val_df = pd.DataFrame(val_rows).sort_values("F1", ascending=False).reset_index(drop=True)
print("\n=== Validation (on 40% of train) ===")
display(val_df)

# ------------------------------------------------------
# 8) Pick the best (by F1), refit on full TRAIN, and evaluate on held-out TEST
# ------------------------------------------------------
best_name = val_df.iloc[0]["Model"]
best_pipe = models[best_name]
best_pipe.fit(X_train, y_train)

y_pred_test = best_pipe.predict(X_test)

print(f"\n=== Held-out Test: {best_name} ===")
print(classification_report(y_test, y_pred_test, digits=4))
cm = confusion_matrix(y_test, y_pred_test, labels=[0,1])
print("Confusion matrix (labels 0,1):\n", cm)

# Optional: save splits and predictions for your notebook files
out_dir = Path("notebooks")
out_dir.mkdir(parents=True, exist_ok=True)
pd.concat([X_train, y_train.rename("ExoplanetCandidate")], axis=1).to_csv(out_dir/"train_baseline.csv", index=False)
X_test.to_csv(out_dir/"test_baseline.csv", index=False)
pd.Series(y_test, name="ExoplanetCandidate").to_csv(out_dir/"test_solution_baseline.csv", index=False)


Initial shape: (8054, 153)
Dropped 24 all-NaN cols, 5 constant cols. New feature count: 100
Feature count: 100
[Split] Train: (4832, 100), Test: (3222, 100)
Positive rate (train/test): 0.501 0.501

=== Validation (on 40% of train) ===


Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.998448,0.998966,0.997934,0.99845
1,Random Forest,0.99224,0.998953,0.985537,0.9922
2,Decision Tree,0.991723,0.992754,0.990702,0.991727
3,KNN,0.937403,0.90839,0.97314,0.939651



=== Held-out Test: Logistic Regression ===
              precision    recall  f1-score   support

           0     0.9969    0.9994    0.9981      1608
           1     0.9994    0.9969    0.9981      1614

    accuracy                         0.9981      3222
   macro avg     0.9981    0.9981    0.9981      3222
weighted avg     0.9981    0.9981    0.9981      3222

Confusion matrix (labels 0,1):
 [[1607    1]
 [   5 1609]]


In [2]:
# Remove the KOI pipeline's built-in false-positive flags
flags_to_drop = ["koi_fpflag_nt", "koi_fpflag_ss", "koi_fpflag_co", "koi_fpflag_ec"]
X_all_improved = X_all.drop(columns=[c for c in flags_to_drop if c in X_all.columns], errors="ignore")

# Drop koi_score (pipeline confidence score — too correlated with label, leaks info)
if "koi_score" in X_all_improved.columns:
    X_all_improved = X_all_improved.drop(columns=["koi_score"])
    print("Dropped koi_score (pipeline confidence, avoids leakage)")

print("Baseline features:", X_all.shape[1])
print("Improved features:", X_all_improved.shape[1])

Dropped koi_score (pipeline confidence, avoids leakage)
Baseline features: 100
Improved features: 95


In [3]:
# ------------------------------------------------------
# Improved Model Training & Evaluation
# ------------------------------------------------------

# 1) Train / test split (outer held-out)
X_train_imp, X_test_imp, y_train_imp, y_test_imp = train_test_split(
    X_all_improved, y_all, test_size=0.4, random_state=1, stratify=y_all
)

# 2) Validation split (inner)
X_tr_imp, X_val_imp, y_tr_imp, y_val_imp = train_test_split(
    X_train_imp, y_train_imp, test_size=0.4, random_state=1, stratify=y_train_imp
)

# 3) Train all models on improved dataset
val_rows_imp = []
for name, pipe in models.items():
    pipe.fit(X_tr_imp, y_tr_imp)
    val_rows_imp.append(eval_model(name, pipe, X_val_imp, y_val_imp))

val_df_imp = pd.DataFrame(val_rows_imp).sort_values("F1", ascending=False).reset_index(drop=True)
print("\n=== Validation (Improved) ===")
display(val_df_imp)

# 4) Best model on improved dataset
best_name_imp = val_df_imp.iloc[0]["Model"]
best_pipe_imp = models[best_name_imp]
best_pipe_imp.fit(X_train_imp, y_train_imp)

y_pred_test_imp = best_pipe_imp.predict(X_test_imp)

print(f"\n=== Held-out Test (Improved): {best_name_imp} ===")
print(classification_report(y_test_imp, y_pred_test_imp, digits=4))
cm_imp = confusion_matrix(y_test_imp, y_pred_test_imp, labels=[0,1])
print("Confusion matrix (labels 0,1):\n", cm_imp)

# ------------------------------------------------------
# Save Improved Results
# ------------------------------------------------------
# 1) Validation results
val_df_imp.to_csv("notebooks/validation_results_improved.csv", index=False)

# 2) Held-out classification report
report_txt_imp = classification_report(y_test_imp, y_pred_test_imp, digits=4)
with open("notebooks/test_report_improved.txt", "w") as f:
    f.write(report_txt_imp)

# Also save as JSON
report_dict_imp = classification_report(y_test_imp, y_pred_test_imp, digits=4, output_dict=True)

# 3) Confusion matrix
cm_df_imp = pd.DataFrame(cm_imp, index=["Actual 0","Actual 1"], columns=["Pred 0","Pred 1"])
cm_df_imp.to_csv("notebooks/confusion_matrix_improved.csv")

# Combine everything into one JSON file
results_imp = {
    "validation": val_df_imp.to_dict(orient="records"),
    "best_model": best_name_imp,
    "classification_report": report_dict_imp,
    "confusion_matrix": cm_df_imp.to_dict()
}

with open("notebooks/improved_results.json", "w") as f:
    json.dump(results_imp, f, indent=2)

print("✅ Improved results saved to notebooks/ (CSV, TXT, JSON)")



=== Validation (Improved) ===


Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Random Forest,0.917227,0.927061,0.905992,0.916405
1,Logistic Regression,0.895499,0.883,0.91219,0.897358
2,Decision Tree,0.872219,0.872802,0.871901,0.872351
3,KNN,0.861873,0.813226,0.940083,0.872065



=== Held-out Test (Improved): Random Forest ===
              precision    recall  f1-score   support

           0     0.9186    0.9198    0.9192      1608
           1     0.9200    0.9188    0.9194      1614

    accuracy                         0.9193      3222
   macro avg     0.9193    0.9193    0.9193      3222
weighted avg     0.9193    0.9193    0.9193      3222

Confusion matrix (labels 0,1):
 [[1479  129]
 [ 131 1483]]
✅ Improved results saved to notebooks/ (CSV, TXT, JSON)


In [4]:
print("koi_disposition counts:\n", data["koi_disposition"].value_counts(dropna=False))
print("\nkoi_pdisposition counts:\n", data["koi_pdisposition"].value_counts(dropna=False))
print("\nBinary target (ExoplanetCandidate) value counts:\n", y_all.value_counts())

# How pipeline disposition maps into binary target
print("\nCross-tab of koi_pdisposition vs ExoplanetCandidate:")
print(pd.crosstab(data["koi_pdisposition"], data["ExoplanetCandidate"]))

koi_disposition counts:
 koi_disposition
FALSE POSITIVE    3965
CONFIRMED         2729
CANDIDATE         1360
Name: count, dtype: int64

koi_pdisposition counts:
 koi_pdisposition
CANDIDATE         4034
FALSE POSITIVE    4020
Name: count, dtype: int64

Binary target (ExoplanetCandidate) value counts:
 ExoplanetCandidate
1    4034
0    4020
Name: count, dtype: int64

Cross-tab of koi_pdisposition vs ExoplanetCandidate:
ExoplanetCandidate     0     1
koi_pdisposition              
CANDIDATE              0  4034
FALSE POSITIVE      4020     0


In [5]:
# Step 2 — Residual missingness & weird values check

# 1) NaN percentage per feature
na_rate = X_all_improved.isna().mean().sort_values(ascending=False)
print("Top 10 features by NaN rate:\n")
print(na_rate.head(10))

# 2) Any infinities left? (should be none, replaced them earlier)
has_inf = np.isinf(X_all_improved.to_numpy()).any()
print("\nAny +/- inf in X_all_improved?:", has_inf)

# 3) Columns with 100% missing (shouldn't exist after cleaning)
all_nan_cols = X_all_improved.columns[X_all_improved.isna().all()]
print("\nColumns entirely NaN:", list(all_nan_cols))


Top 10 features by NaN rate:

koi_fwm_stat_sig      0.088403
koi_zmag              0.062453
koi_fwm_prao          0.058356
koi_fwm_prao_err      0.058356
koi_fwm_pdeco_err     0.056742
koi_fwm_pdeco         0.056742
koi_dicco_msky_err    0.023715
koi_dicco_mra_err     0.023715
koi_dicco_mra         0.023715
koi_dicco_msky        0.023715
dtype: float64

Any +/- inf in X_all_improved?: False

Columns entirely NaN: []


In [6]:
# Step 3 — Correlation with the target (sanity check for leakage)

# Compute correlation of every feature with the binary target
corr = pd.concat([X_all_improved, y_all.rename("ExoplanetCandidate")], axis=1)\
          .corr(numeric_only=True)["ExoplanetCandidate"]

# Drop the target itself
corr = corr.drop(labels=["ExoplanetCandidate"])

# Sort by absolute correlation
corr_sorted = corr.abs().sort_values(ascending=False)

print("Top 15 |correlation| with target:\n")
print(corr_sorted.head(15))

# Flag suspicious features (very high correlation)
suspicious = corr_sorted[corr_sorted > 0.98]
print("\n⚠️ Suspicious features with |corr| > 0.98:")
print(list(suspicious.index))


Top 15 |correlation| with target:

koi_count           0.378026
koi_dicco_msky      0.371521
koi_smet_err2       0.370637
koi_dikco_msky      0.368971
koi_steff_err1      0.363041
koi_incl            0.354289
koi_smet_err1       0.345000
koi_steff_err2      0.327367
koi_teq             0.306543
koi_fwm_stat_sig    0.305555
koi_depth           0.297551
koi_num_transits    0.292487
koi_smass_err1      0.289274
koi_smet            0.280806
koi_model_snr       0.265737
Name: ExoplanetCandidate, dtype: float64

⚠️ Suspicious features with |corr| > 0.98:
[]


In [7]:
# Step 4 — Permutation importance for improved best model (Random Forest)

from sklearn.inspection import permutation_importance

# Make sure we’re using the improved Random Forest model
assert best_name_imp == "Random Forest", "Best improved model is not Random Forest — check results first."
rf_imp = best_pipe_imp

# Compute permutation importance on the validation set
perm = permutation_importance(
    rf_imp, X_val_imp, y_val_imp,
    scoring="f1", n_repeats=10, random_state=1, n_jobs=-1
)

pi = pd.Series(perm.importances_mean, index=X_val_imp.columns).sort_values(ascending=False)

print("Top 15 features by permutation importance (mean ΔF1 when shuffled):")
display(pi.head(15))


Top 15 features by permutation importance (mean ΔF1 when shuffled):


koi_dikco_msky      0.022729
koi_fwm_stat_sig    0.001497
koi_dicco_msky      0.000929
koi_srho_err1       0.000516
koi_model_snr       0.000465
koi_fwm_pdeco       0.000413
koi_max_mult_ev     0.000310
koi_num_transits    0.000258
koi_prad            0.000207
koi_dor             0.000207
koi_ror             0.000103
koi_prad_err2       0.000052
koi_smass_err2      0.000000
koi_srad            0.000000
koi_fwm_prao        0.000000
dtype: float64