In [1]:
# install joblib
!pip install joblib



In [2]:
# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# import train_test_split
from sklearn.model_selection import train_test_split
# import MinMaxScaler to scale data
from sklearn.preprocessing import MinMaxScaler
# import LogisticRegression for model1
from sklearn.linear_model import LogisticRegression
# import KNeighborsClassifier for model2
from sklearn.neighbors import KNeighborsClassifier
# import for decision tree for model3
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

# import GridSearch
from sklearn.model_selection import GridSearchCV

In [10]:
fp = f'{os.getcwd()}/Dataset/TOI_2025.10.03_10.51.46.csv'  # despite .csv, it's TSV+comments
df = pd.read_csv(fp, comment='#')

In [13]:
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df

Unnamed: 0,rowid,toi,toipfx,tid,ctoi_alias,pl_pnum,tfopwg_disp,rastr,ra,decstr,...,st_loggerr2,st_logglim,st_loggsymerr,st_rad,st_raderr1,st_raderr2,st_radlim,st_radsymerr,toi_created,rowupdate
0,1,1000.01,1000,50365310,5.036531e+07,1,FP,07h29m25.85s,112.357708,-12d41m45.46s,...,-0.07,0,1,2.169860,0.072573,-0.072573,0,1,2019-07-24 15:58:33,2024-09-09 10:08:01
1,2,1001.01,1001,88863718,8.886372e+07,1,PC,08h10m19.31s,122.580465,-05d30m49.87s,...,-0.09,0,1,2.010000,0.090000,-0.090000,0,1,2019-07-24 15:58:33,2023-04-03 14:31:04
4,5,1004.01,1004,238597883,2.385979e+08,1,FP,08h08m42.77s,122.178195,-48d48m10.12s,...,-0.07,0,1,2.150000,0.060000,-0.060000,0,1,2019-07-24 15:58:33,2024-09-09 10:08:01
7,8,1007.01,1007,65212867,6.521287e+07,1,PC,07h31m00.57s,112.752393,-04d27m48.09s,...,-0.09,0,1,2.700000,0.130000,-0.130000,0,1,2019-07-24 15:58:33,2021-10-29 12:59:15
12,13,1011.01,1011,114018671,1.140187e+08,1,PC,07h35m56.34s,113.984761,-32d50m31.2s,...,-0.09,0,1,0.940000,0.050000,-0.050000,0,1,2019-07-24 15:58:33,2023-03-22 16:02:02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7696,7697,993.01,993,259353953,2.593540e+08,1,PC,07h35m50.6s,113.960841,-15d29m58.04s,...,-0.08,0,1,1.670000,0.050000,-0.050000,0,1,2019-07-24 15:58:33,2023-07-12 16:02:01
7697,7698,994.01,994,93963408,9.396341e+07,1,FP,07h40m11.12s,115.046333,-09d05m03.37s,...,-0.07,0,1,1.877390,0.078985,-0.078985,0,1,2019-07-24 15:58:33,2021-10-29 12:59:15
7700,7701,997.01,997,341729521,3.417295e+08,1,FP,08h05m16.69s,121.319521,-59d34m47.27s,...,-0.08,0,1,0.926261,0.045789,-0.045789,0,1,2019-07-24 15:58:33,2024-09-09 10:08:01
7701,7702,998.01,998,54390047,5.439005e+07,1,FP,07h53m16.69s,118.319555,-14d13m07.76s,...,-0.07,0,1,2.349860,0.091578,-0.091578,0,1,2019-07-24 15:58:33,2024-09-09 10:08:01


In [15]:
target_column = 'tfopwg_disp'
X = df.drop(target_column,axis=1)
y = df[target_column]

In [25]:
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
non_numeric_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

print("Numeric:", numeric_cols[:10], "...")
print("Non-numeric:", non_numeric_cols[:10], "...")

Numeric: ['rowid', 'toi', 'toipfx', 'tid', 'ctoi_alias', 'pl_pnum', 'ra', 'dec', 'st_pmra', 'st_pmraerr1'] ...
Non-numeric: ['rastr', 'decstr', 'toi_created', 'rowupdate'] ...


In [26]:
drop_cols = [
    "rowid", "toi", "tid", "ctoi_alias", 
    "rastr", "decstr", "toi_created", "rowupdate"
]
X = X.drop(columns=drop_cols, errors="ignore")


In [27]:
from sklearn.preprocessing import OneHotEncoder

categorical_cols = [col for col in non_numeric_cols if col not in drop_cols]

# One-hot encode them later inside pipeline


In [30]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Preprocess
numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X_train.select_dtypes(exclude=[np.number]).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ]
)

# Example pipeline with Random Forest
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(random_state=42, class_weight='balanced', n_estimators=20))
])

clf.fit(X_train, y_train)
print("Train acc:", clf.score(X_train, y_train))
print("Test acc:", clf.score(X_test, y_test))


Train acc: 0.997812879708384
Test acc: 0.6452866861030127


In [16]:
# Use train_test_split to create test and train sets of data
X_train,X_test,y_train,y_test = train_test_split(X, y, random_state=12)

In [24]:
X_train.head()

Unnamed: 0,rowid,toi,toipfx,tid,ctoi_alias,pl_pnum,rastr,ra,decstr,dec,...,st_loggerr2,st_logglim,st_loggsymerr,st_rad,st_raderr1,st_raderr2,st_radlim,st_radsymerr,toi_created,rowupdate
6911,6912,705.01,705,391904697,391904700.0,1,07h05m54.52s,106.477148,-72d33m31.55s,-72.558765,...,-0.119192,0,1,0.877877,0.04401,-0.04401,0,1,2019-04-30 13:04:30,2024-09-11 10:08:01
5142,5143,5491.01,5491,247166992,247167000.0,1,07h34m22.99s,113.595788,+17d17m39.38s,17.294273,...,-0.09,0,1,0.77,0.05,-0.05,0,1,2022-04-20 19:54:45,2024-08-22 10:08:01
6951,6952,7084.01,7084,165337974,165338000.0,1,21h05m59.72s,316.498826,+37d39m54.95s,37.665263,...,-0.08,0,1,0.87,0.05,-0.05,0,1,2024-10-24 17:28:59,2025-09-13 12:03:34
6906,6907,7045.01,7045,21720215,21720220.0,1,17h04m12.97s,256.054062,+31d33m55.33s,31.565369,...,-0.08,0,1,1.3,0.05,-0.05,0,1,2024-08-28 20:47:11,2025-02-14 12:03:07
3590,3591,4119.01,4119,160618074,160618100.0,1,16h25m04.9s,246.270436,+73d17m00.58s,73.283495,...,-0.09,0,1,1.18,0.08,-0.08,0,1,2021-06-23 15:28:25,2024-09-20 12:02:42
