In [1]:
# install joblib
!pip install joblib



In [2]:
# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# import train_test_split
from sklearn.model_selection import train_test_split
# import MinMaxScaler to scale data
from sklearn.preprocessing import MinMaxScaler
# import LogisticRegression for model1
from sklearn.linear_model import LogisticRegression
# import KNeighborsClassifier for model2
from sklearn.neighbors import KNeighborsClassifier
# import for decision tree for model3
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

# import GridSearch
from sklearn.model_selection import GridSearchCV

In [10]:
fp = f'{os.getcwd()}/Dataset/TOI_2025.10.03_10.51.46.csv'  # despite .csv, it's TSV+comments
df = pd.read_csv(fp, comment='#')

In [13]:
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df

Unnamed: 0,rowid,toi,toipfx,tid,ctoi_alias,pl_pnum,tfopwg_disp,rastr,ra,decstr,...,st_loggerr2,st_logglim,st_loggsymerr,st_rad,st_raderr1,st_raderr2,st_radlim,st_radsymerr,toi_created,rowupdate
0,1,1000.01,1000,50365310,5.036531e+07,1,FP,07h29m25.85s,112.357708,-12d41m45.46s,...,-0.07,0,1,2.169860,0.072573,-0.072573,0,1,2019-07-24 15:58:33,2024-09-09 10:08:01
1,2,1001.01,1001,88863718,8.886372e+07,1,PC,08h10m19.31s,122.580465,-05d30m49.87s,...,-0.09,0,1,2.010000,0.090000,-0.090000,0,1,2019-07-24 15:58:33,2023-04-03 14:31:04
4,5,1004.01,1004,238597883,2.385979e+08,1,FP,08h08m42.77s,122.178195,-48d48m10.12s,...,-0.07,0,1,2.150000,0.060000,-0.060000,0,1,2019-07-24 15:58:33,2024-09-09 10:08:01
7,8,1007.01,1007,65212867,6.521287e+07,1,PC,07h31m00.57s,112.752393,-04d27m48.09s,...,-0.09,0,1,2.700000,0.130000,-0.130000,0,1,2019-07-24 15:58:33,2021-10-29 12:59:15
12,13,1011.01,1011,114018671,1.140187e+08,1,PC,07h35m56.34s,113.984761,-32d50m31.2s,...,-0.09,0,1,0.940000,0.050000,-0.050000,0,1,2019-07-24 15:58:33,2023-03-22 16:02:02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7696,7697,993.01,993,259353953,2.593540e+08,1,PC,07h35m50.6s,113.960841,-15d29m58.04s,...,-0.08,0,1,1.670000,0.050000,-0.050000,0,1,2019-07-24 15:58:33,2023-07-12 16:02:01
7697,7698,994.01,994,93963408,9.396341e+07,1,FP,07h40m11.12s,115.046333,-09d05m03.37s,...,-0.07,0,1,1.877390,0.078985,-0.078985,0,1,2019-07-24 15:58:33,2021-10-29 12:59:15
7700,7701,997.01,997,341729521,3.417295e+08,1,FP,08h05m16.69s,121.319521,-59d34m47.27s,...,-0.08,0,1,0.926261,0.045789,-0.045789,0,1,2019-07-24 15:58:33,2024-09-09 10:08:01
7701,7702,998.01,998,54390047,5.439005e+07,1,FP,07h53m16.69s,118.319555,-14d13m07.76s,...,-0.07,0,1,2.349860,0.091578,-0.091578,0,1,2019-07-24 15:58:33,2024-09-09 10:08:01


In [33]:
target_column = 'tfopwg_disp'
X = df.drop(target_column,axis=1)
y = df[target_column]

In [34]:
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
non_numeric_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

print("Numeric:", numeric_cols[:10], "...")
print("Non-numeric:", non_numeric_cols[:10], "...")

Numeric: ['toipfx', 'pl_pnum', 'ra', 'dec', 'st_pmra', 'st_pmraerr1', 'st_pmraerr2', 'st_pmralim', 'st_pmrasymerr', 'st_pmdec'] ...
Non-numeric: [] ...


In [35]:
drop_cols = [
    "rowid", "toi", "tid", "ctoi_alias", 
    "rastr", "decstr", "toi_created", "rowupdate"
]
X = X.drop(columns=drop_cols, errors="ignore")


In [36]:
from sklearn.preprocessing import OneHotEncoder

categorical_cols = [col for col in non_numeric_cols if col not in drop_cols]

# One-hot encode them later inside pipeline


In [40]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Preprocess
numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X_train.select_dtypes(exclude=[np.number]).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ]
)

# Example pipeline with Random Forest
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(random_state=42, class_weight='balanced', n_estimators=20))
])

clf.fit(X_train, y_train)
print("Train acc:", clf.score(X_train, y_train))
print("Test acc:", clf.score(X_test, y_test))


Train acc: 0.997812879708384
Test acc: 0.6452866861030127


In [49]:
# after you've trained `clf` and split X_train, y_train as before
import joblib, json, time
from pathlib import Path

timestamp = time.strftime("%Y%m%d_%H%M%S")
ART = Path("artifacts") / timestamp
ART.mkdir(parents=True, exist_ok=True)

# 1) save fitted pipeline
joblib.dump(clf, ART / "pipeline.joblib")

# 2) save the feature columns in order (AFTER dropping columns, BEFORE fit)
feature_columns = X_train.columns.tolist()
with open(ART / "feature_columns.json", "w") as f:
    json.dump(feature_columns, f, indent=2)

# 3) save metadata
meta = {
    "target": "tfopwg_disp",
    "drop_cols": ["rowid","toi","tid","ctoi_alias","rastr","decstr","toi_created","rowupdate"],
    "classes": clf.classes_.tolist(),
    "created_at": timestamp,
    "sklearn_version": __import__("sklearn").__version__,
    "notes": "RF pipeline with StandardScaler+OHE; handle_unknown='ignore'"
}
with open(ART / "metadata.json", "w") as f:
    json.dump(meta, f, indent=2)

print("Saved:", ART.resolve())


Saved: C:\Users\yatharth\Documents\Exoplanets\artifacts\20251005_013453


In [47]:
df_sample = df.sample(n=100)

In [48]:
df_sample.to_csv('testing.csv')

In [50]:
#inferencing
import pandas as pd, numpy as np, json, joblib
from pathlib import Path

ART_DIR = Path("artifacts/20251005_013453")  # <- put your timestamp folder

# load artifacts
clf = joblib.load(ART_DIR / "pipeline.joblib")
feature_columns = json.load(open(ART_DIR / "feature_columns.json"))
meta = json.load(open(ART_DIR / "metadata.json"))

# load new data
new_df = pd.read_csv("testing.csv", low_memory=False)

# apply same drops
drop_cols = meta["drop_cols"]
new_df = new_df.drop(columns=drop_cols, errors="ignore")

# align columns exactly like training
# - missing cols -> filled with NaN (imputer will handle)
# - extra cols -> discarded
X_new = new_df.reindex(columns=feature_columns, fill_value=np.nan)

# predict
y_pred = clf.predict(X_new)
y_prob = clf.predict_proba(X_new)  # same class order as meta["classes"]

# attach outputs if helpful
out = new_df.copy()
out["pred_label"] = y_pred
# optional: top-1 prob
import numpy as np
out["pred_confidence"] = np.max(y_prob, axis=1)

# save results
out.to_csv(ART_DIR / "predictions_on_new_unseen.csv", index=False)
print("Predictions saved to:", (ART_DIR / "predictions_on_new_unseen.csv").resolve())

# if you want probabilities per class as columns:
proba_df = pd.DataFrame(y_prob, columns=[f"proba_{c}" for c in meta["classes"]])
pd.concat([new_df.reset_index(drop=True), proba_df], axis=1)\
  .to_csv(ART_DIR / "predictions_with_proba.csv", index=False)


Predictions saved to: C:\Users\yatharth\Documents\Exoplanets\artifacts\20251005_013453\predictions_on_new_unseen.csv


In [43]:
rf_model = clf.named_steps["model"]

In [45]:
joblib.dump(rf_model, 'rf.joblib')

['rf.joblib']

In [16]:
# Use train_test_split to create test and train sets of data
X_train,X_test,y_train,y_test = train_test_split(X, y, random_state=12)

In [24]:
X_train.head()

Unnamed: 0,rowid,toi,toipfx,tid,ctoi_alias,pl_pnum,rastr,ra,decstr,dec,...,st_loggerr2,st_logglim,st_loggsymerr,st_rad,st_raderr1,st_raderr2,st_radlim,st_radsymerr,toi_created,rowupdate
6911,6912,705.01,705,391904697,391904700.0,1,07h05m54.52s,106.477148,-72d33m31.55s,-72.558765,...,-0.119192,0,1,0.877877,0.04401,-0.04401,0,1,2019-04-30 13:04:30,2024-09-11 10:08:01
5142,5143,5491.01,5491,247166992,247167000.0,1,07h34m22.99s,113.595788,+17d17m39.38s,17.294273,...,-0.09,0,1,0.77,0.05,-0.05,0,1,2022-04-20 19:54:45,2024-08-22 10:08:01
6951,6952,7084.01,7084,165337974,165338000.0,1,21h05m59.72s,316.498826,+37d39m54.95s,37.665263,...,-0.08,0,1,0.87,0.05,-0.05,0,1,2024-10-24 17:28:59,2025-09-13 12:03:34
6906,6907,7045.01,7045,21720215,21720220.0,1,17h04m12.97s,256.054062,+31d33m55.33s,31.565369,...,-0.08,0,1,1.3,0.05,-0.05,0,1,2024-08-28 20:47:11,2025-02-14 12:03:07
3590,3591,4119.01,4119,160618074,160618100.0,1,16h25m04.9s,246.270436,+73d17m00.58s,73.283495,...,-0.09,0,1,1.18,0.08,-0.08,0,1,2021-06-23 15:28:25,2024-09-20 12:02:42


In [32]:
import os, json, time, joblib
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import (accuracy_score, balanced_accuracy_score, f1_score,
                             classification_report, confusion_matrix)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# OPTIONAL: use SMOTE for imbalance
try:
    from imblearn.over_sampling import SMOTE
    from imblearn.pipeline import Pipeline as ImbPipeline
    USE_SMOTE = True
except Exception:
    USE_SMOTE = False


USE_SMOTE = False

# -----------------------------
# 1) Prepare data (EDIT paths as needed)
# -----------------------------
TARGET = "tfopwg_disp"

# Example: load directly from NASA TAP (clean CSV)
# url = "https://exoplanetarchive.ipac.caltech.edu/TAP/sync?query=select+*+from+toi&format=csv"
# df = pd.read_csv(url, low_memory=False)

# OR: if you already have df in memory, just keep it
# df = df

# Drop obvious IDs / text duplicates of numeric coords
DROP = ["rowid", "toi", "tid", "ctoi_alias", "rastr", "decstr", "toi_created", "rowupdate"]
df = df.drop(columns=[c for c in DROP if c in df.columns], errors="ignore")

# Keep only rows with target
df = df[~df[TARGET].isna()].copy()

X = df.drop(columns=[TARGET])
y = df[TARGET].astype(str)  # ensure categorical

# train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# column lists
num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X_train.select_dtypes(exclude=[np.number]).columns.tolist()

# -----------------------------
# 2) Preprocessing
# -----------------------------
numeric = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

categorical = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

pre = ColumnTransformer(
    transformers=[
        ("num", numeric, num_cols),
        ("cat", categorical, cat_cols),
    ]
)

# Choose Pipeline class (with or without SMOTE)
Pipe = ImbPipeline if USE_SMOTE else Pipeline

def base_pipe(estimator):
    if USE_SMOTE:
        # SMOTE must come AFTER preprocessing (so data is numeric)
        return Pipe(steps=[("pre", pre), ("smote", SMOTE(random_state=42)), ("clf", estimator)])
    else:
        return Pipe(steps=[("pre", pre), ("clf", estimator)])

# -----------------------------
# 3) Models + Param grids
#    Use balanced metrics; keep grids modest first
# -----------------------------
models_and_grids = {
    "logreg": (
        base_pipe(LogisticRegression(max_iter=2000, class_weight="balanced", n_jobs=None)),
        {
            "clf__C": [0.1, 1.0, 3.0, 10.0],
            "clf__penalty": ["l2"],
            "clf__solver": ["lbfgs", "saga"],
        }
    ),
    "svm": (
        base_pipe(SVC(class_weight="balanced")),
        {
            "clf__kernel": ["rbf", "linear"],
            "clf__C": [0.5, 1, 2, 5],
            "clf__gamma": ["scale", "auto"],
        }
    ),
    "rf": (
        base_pipe(RandomForestClassifier(random_state=42, class_weight="balanced")),
        {
            "clf__n_estimators": [200, 400],
            "clf__max_depth": [None, 10, 20],
            "clf__min_samples_split": [2, 5],
            "clf__min_samples_leaf": [1, 2],
            "clf__max_features": ["sqrt", "log2"],
        }
    ),
}

# (Optional) add XGBoost if available:
try:
    from xgboost import XGBClassifier
    models_and_grids["xgb"] = (
        base_pipe(XGBClassifier(
            objective="multi:softprob",
            eval_metric="mlogloss",
            tree_method="hist",
            random_state=42
        )),
        {
            "clf__n_estimators": [300, 600],
            "clf__max_depth": [3, 6, 10],
            "clf__learning_rate": [0.05, 0.1],
            "clf__subsample": [0.8, 1.0],
            "clf__colsample_bytree": [0.7, 1.0],
        }
    )
except Exception:
    pass

# -----------------------------
# 4) CV + GridSearch
# -----------------------------
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
SCORING = {
    "bal_acc": "balanced_accuracy",
    "f1_macro": "f1_macro",
    "acc": "accuracy",
}

results = []
best_overall = None
best_key = None
timestamp = time.strftime("%Y%m%d_%H%M%S")
outdir = Path("ml_runs") / timestamp
outdir.mkdir(parents=True, exist_ok=True)

for key, (pipe, grid) in models_and_grids.items():
    print(f"\n=== Tuning {key} ===")
    gs = GridSearchCV(
        estimator=pipe,
        param_grid=grid,
        cv=cv,
        scoring=SCORING,
        refit="bal_acc",        # choose your main metric to refit
        n_jobs=-1,
        verbose=1
    )
    gs.fit(X_train, y_train)

    # Save CV results
    cv_df = pd.DataFrame(gs.cv_results_)
    cv_df.to_csv(outdir / f"{key}_cv_results.csv", index=False)

    # Evaluate on test set
    y_pred = gs.best_estimator_.predict(X_test)
    metrics = {
        "model": key,
        "best_params": gs.best_params_,
        "cv_best_bal_acc": float(gs.best_score_),
        "test_acc": float(accuracy_score(y_test, y_pred)),
        "test_bal_acc": float(balanced_accuracy_score(y_test, y_pred)),
        "test_f1_macro": float(f1_score(y_test, y_pred, average="macro")),
        "classes": sorted(list(pd.unique(y))),
        "confusion_matrix": confusion_matrix(y_test, y_pred, labels=sorted(pd.unique(y))).tolist(),
        "classification_report": classification_report(y_test, y_pred, digits=4),
    }
    results.append(metrics)

    # Save best model for this family
    joblib.dump(gs.best_estimator_, outdir / f"best_{key}.joblib")

    # Track global best
    if best_overall is None or metrics["test_bal_acc"] > best_overall["test_bal_acc"]:
        best_overall, best_key = metrics, key

# Save summary JSON
with open(outdir / "summary.json", "w") as f:
    json.dump({"results": results, "best": best_overall}, f, indent=2)

print("\n==== Summary ====")
print(json.dumps(best_overall, indent=2))

# Also write the classification report nicely
with open(outdir / f"classification_report_{best_key}.txt", "w") as f:
    f.write(best_overall["classification_report"])

# Optional: plot confusion matrix for best model
try:
    import matplotlib.pyplot as plt
    import seaborn as sns
    labels = best_overall["classes"]
    cm = np.array(best_overall["confusion_matrix"])
    plt.figure(figsize=(6,5))
    sns.heatmap(cm, annot=True, fmt="d", xticklabels=labels, yticklabels=labels)
    plt.title(f"Confusion Matrix - {best_key}")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.tight_layout()
    plt.savefig(outdir / f"cm_{best_key}.png", dpi=150)
    plt.close()
except Exception as e:
    print("CM plot skipped:", e)

print(f"\nArtifacts saved to: {outdir.resolve()}")


=== Tuning logreg ===
Fitting 5 folds for each of 8 candidates, totalling 40 fits

=== Tuning svm ===
Fitting 5 folds for each of 16 candidates, totalling 80 fits

=== Tuning rf ===
Fitting 5 folds for each of 48 candidates, totalling 240 fits

==== Summary ====
{
  "model": "logreg",
  "best_params": {
    "clf__C": 3.0,
    "clf__penalty": "l2",
    "clf__solver": "lbfgs"
  },
  "cv_best_bal_acc": 0.5121703104270384,
  "test_acc": 0.45286686103012636,
  "test_bal_acc": 0.5331931911613725,
  "test_f1_macro": 0.37250573123546804,
  "classes": [
    "APC",
    "CP",
    "FA",
    "FP",
    "KP",
    "PC"
  ],
  "confusion_matrix": [
    [
      34,
      4,
      4,
      7,
      6,
      5
    ],
    [
      10,
      70,
      9,
      5,
      23,
      4
    ],
    [
      1,
      0,
      8,
      2,
      0,
      0
    ],
    [
      28,
      11,
      24,
      57,
      20,
      22
    ],
    [
      6,
      10,
      1,
      2,
      40,
      14
    ],
    [
      67,
