In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Lift pandas' column-display cap so wide frames are rendered without elided columns.
pd.set_option("display.max_columns", None)

In [2]:
# Path of the raw input CSV, relative to the notebook's working directory.
file_path = '../data/raw/koi_cumulative_2.csv'

# Parse the file into the working frame that the following cells transform.
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Columns that are removed from the loaded table before any further processing.
cols_to_drop = [
    'kepid', 'kepoi_name', 'kepler_name',  # look like identifier/name fields — TODO confirm
    'koi_pdisposition', 'koi_tce_delivname',
    'koi_fpflag_nt', 'koi_sage',
]

In [4]:
# Remove the columns listed in cols_to_drop.
# NOTE: `df` is rebound here, so executing this cell a second time without
# reloading the data raises KeyError (the labels are already gone).
df = df.drop(cols_to_drop, axis="columns")

In [5]:
# Keep every row whose disposition is anything other than "CANDIDATE".
df = df.loc[~(df["koi_disposition"] == "CANDIDATE")]

In [6]:
# Fraction of missing values per column, computed once for the whole frame
# instead of twice per column inside the comprehension.
missing_fraction = df.isnull().mean()

# Columns that have at least one missing value but fewer than 5% missing.
cols = [var for var in df.columns if 0 < missing_fraction[var] < 0.05]

# Columns whose gaps are filled by imputation rather than by dropping rows.
columns_to_impute = ['koi_smass', 'koi_num_transits', 'koi_score']

# Sparsely-missing columns that are NOT imputed.
# Built with order-preserving filters rather than set arithmetic: the iteration
# order of a set of strings can change between interpreter sessions (hash
# randomization), which made the resulting list order non-reproducible.
minimal_missing_columns = [var for var in cols if var not in columns_to_impute]

# Every remaining column, in the frame's own column order. Despite the name,
# this is "everything not in the two lists above": a column with >= 5% missing
# values that is not listed in columns_to_impute would also end up here.
complete_columns = [
    var for var in df.columns
    if var not in minimal_missing_columns and var not in columns_to_impute
]

In [8]:
# Discard any row that is missing a value in one of the sparsely-missing columns.
df_filtered = df.dropna(how="any", subset=minimal_missing_columns)

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer

# Independent copy that receives the imputed values; df_filtered itself is left unchanged.
df_rf = df_filtered.copy(deep=True)

In [12]:
# Regressor used by the iterative imputer; seeded for repeatable fits.
rf_estimator = RandomForestRegressor(n_estimators=100, random_state=42)

# Iterative imputer that treats NaN as the missing-value marker.
rf_imputer = IterativeImputer(
    estimator=rf_estimator,
    missing_values=np.nan,
    random_state=42,
)

# Only the columns in columns_to_impute are passed to the imputer, so no other
# column of the frame is available to it as a predictor.
imputed_values = rf_imputer.fit_transform(df_filtered[columns_to_impute])

# Write the filled-in values back into the working copy.
df_rf[columns_to_impute] = imputed_values



In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# Label column first, then everything else as the feature matrix.
y = df_rf['koi_disposition']
X = df_rf.drop('koi_disposition', axis=1)

In [16]:
from imblearn.over_sampling import SMOTE

# Seeded SMOTE oversampler, applied to the full feature matrix and labels.
# NOTE(review): no train/test split is visible before this point — if the
# resampled output is later used for evaluation, synthetic rows may leak into
# the test set; confirm against the downstream notebook.
smote = SMOTE(random_state=42)
resampled_pair = smote.fit_resample(X, y)
X_resampled, y_resampled = resampled_pair

In [17]:
# Display the resampled feature matrix as the cell output.
X_resampled

Unnamed: 0,koi_score,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_count,koi_num_transits,koi_tce_plnt_num,koi_steff,koi_slogg,koi_smet,koi_srad,koi_smass,koi_kepmag
0,1.000000,0,0,0,9.488036,170.538750,0.146000,2.957500,615.800000,2.260000,793.000000,93.590000,35.800000,2,142.000000,1.000000,5455.000000,4.467000,0.140000,0.927000,0.919000,15.347000
1,0.969000,0,0,0,54.418383,162.513840,0.586000,4.507000,874.800000,2.830000,443.000000,9.110000,25.800000,2,25.000000,2.000000,5455.000000,4.467000,0.140000,0.927000,0.919000,15.347000
2,0.000000,1,0,0,1.736952,170.307565,1.276000,2.406410,8079.200000,33.460000,1395.000000,891.960000,505.600000,1,621.000000,1.000000,5805.000000,4.564000,-0.520000,0.791000,0.836000,15.597000
3,1.000000,0,0,0,2.525592,171.595550,0.701000,1.654500,603.300000,2.750000,1406.000000,926.160000,40.900000,1,515.000000,1.000000,6031.000000,4.438000,0.070000,1.046000,1.095000,15.509000
4,1.000000,0,0,0,11.094321,171.201160,0.538000,4.594500,1517.500000,3.900000,835.000000,114.810000,66.500000,3,95.000000,1.000000,6046.000000,4.486000,-0.080000,0.972000,1.053000,15.714000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8731,0.985928,0,0,0,18.725915,169.506527,0.344415,3.874853,845.121706,2.517500,617.006585,34.291514,32.625006,2,64.907894,1.546053,5504.302658,4.526671,0.014737,0.866625,0.919684,15.712039
8732,1.000000,0,0,0,144.229411,259.725293,0.429644,14.106686,3950.619198,7.687149,385.286015,5.407870,157.214550,2,9.107160,1.000000,5806.713219,4.293642,0.127500,1.205037,1.035678,14.928820
8733,0.993899,0,0,0,6.967004,136.277728,0.357914,2.141495,238.151387,1.254534,782.663972,88.594253,22.101194,2,178.983832,2.445343,5134.060811,4.552453,0.106559,0.806749,0.846624,14.328637
8734,0.932468,0,0,0,88.909065,196.961570,0.629164,4.784432,163.173823,1.748385,442.653089,9.095876,17.026692,2,14.362909,2.637091,5918.242185,4.339565,-0.043869,1.119580,1.001533,12.061388


In [18]:
# Start the output table from a copy of the resampled features, so that adding
# columns to it does not touch X_resampled.
cleaned_data = pd.DataFrame(X_resampled, copy=True)

In [20]:
# Add the resampled labels as the 'koi_disposition' column (aligned on index).
cleaned_data = cleaned_data.assign(koi_disposition=pd.Series(y_resampled))

In [21]:
# Preview the first rows of the combined feature + label table.
cleaned_data.head()

Unnamed: 0,koi_score,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_count,koi_num_transits,koi_tce_plnt_num,koi_steff,koi_slogg,koi_smet,koi_srad,koi_smass,koi_kepmag,koi_disposition
0,1.0,0,0,0,9.488036,170.53875,0.146,2.9575,615.8,2.26,793.0,93.59,35.8,2,142.0,1.0,5455.0,4.467,0.14,0.927,0.919,15.347,CONFIRMED
1,0.969,0,0,0,54.418383,162.51384,0.586,4.507,874.8,2.83,443.0,9.11,25.8,2,25.0,2.0,5455.0,4.467,0.14,0.927,0.919,15.347,CONFIRMED
2,0.0,1,0,0,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395.0,891.96,505.6,1,621.0,1.0,5805.0,4.564,-0.52,0.791,0.836,15.597,FALSE POSITIVE
3,1.0,0,0,0,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406.0,926.16,40.9,1,515.0,1.0,6031.0,4.438,0.07,1.046,1.095,15.509,CONFIRMED
4,1.0,0,0,0,11.094321,171.20116,0.538,4.5945,1517.5,3.9,835.0,114.81,66.5,3,95.0,1.0,6046.0,4.486,-0.08,0.972,1.053,15.714,CONFIRMED


In [22]:
# Write the final table to the working directory, leaving out the row index.
cleaned_data.to_csv(path_or_buf="cleaned_data.csv", index=False)