In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Read the cleaned dataset and keep only rows with no missing values.
df = pd.read_csv('cleaned_data.csv').dropna()
df.head()


Unnamed: 0,kepid,kepoi_name,koi_disposition,koi_score,koi_period,koi_duration,koi_depth,koi_time0bk,koi_prad,koi_ror,...,koi_period_err1,koi_period_err2,koi_duration_err1,koi_duration_err2,koi_depth_err1,koi_depth_err2,koi_prad_err1,koi_prad_err2,koi_steff_err1,koi_steff_err2
0,10797460,K00752.01,CONFIRMED,1.0,9.488036,2.9575,615.8,170.53875,2.26,0.022344,...,2.775e-05,-2.775e-05,0.0819,-0.0819,19.5,-19.5,0.26,-0.15,81.0,-81.0
1,10797460,K00752.02,CONFIRMED,0.969,54.418383,4.507,874.8,162.51384,2.83,0.027954,...,0.0002479,-0.0002479,0.116,-0.116,35.5,-35.5,0.32,-0.19,81.0,-81.0
2,10811496,K00753.01,CANDIDATE,0.0,19.89914,1.7822,10829.0,175.850252,14.6,0.154046,...,1.494e-05,-1.494e-05,0.0341,-0.0341,171.0,-171.0,3.92,-1.31,158.0,-176.0
3,10848459,K00754.01,FALSE POSITIVE,0.0,1.736952,2.40641,8079.2,170.307565,33.46,0.387394,...,2.63e-07,-2.63e-07,0.00537,-0.00537,12.8,-12.8,8.5,-2.83,157.0,-174.0
4,10854555,K00755.01,CONFIRMED,1.0,2.525592,1.6545,603.3,171.59555,2.75,0.024064,...,3.761e-06,-3.761e-06,0.042,-0.042,16.9,-16.9,0.88,-0.35,169.0,-211.0


In [4]:
# Columns removed before modelling.
# NOTE(review): these look like identifiers / vetting metadata — confirm against the data dictionary.
drop_cols = ['kepid', 'kepoi_name', 'koi_vet_stat', 'koi_vet_date', 'koi_disp_prov']

df = df.drop(drop_cols, axis=1)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9563 entries, 0 to 9563
Data columns (total 31 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   koi_disposition    9563 non-null   object 
 1   koi_score          9563 non-null   float64
 2   koi_period         9563 non-null   float64
 3   koi_duration       9563 non-null   float64
 4   koi_depth          9563 non-null   float64
 5   koi_time0bk        9563 non-null   float64
 6   koi_prad           9563 non-null   float64
 7   koi_ror            9563 non-null   float64
 8   koi_dor            9563 non-null   float64
 9   koi_impact         9563 non-null   float64
 10  koi_incl           9563 non-null   float64
 11  koi_srad           9563 non-null   float64
 12  koi_smass          9563 non-null   float64
 13  koi_steff          9563 non-null   float64
 14  koi_slogg          9563 non-null   float64
 15  koi_smet           9563 non-null   float64
 16  koi_kepmag         9563 non-n

In [5]:
# Second feature set: every column whose name contains 'err' is left out.
# `DataFrame.drop` returns a new frame, so `df` itself is not modified.
drop_cols = [name for name in df.columns if 'err' in name]
df2 = df.drop(columns=drop_cols)

df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9563 entries, 0 to 9563
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   koi_disposition  9563 non-null   object 
 1   koi_score        9563 non-null   float64
 2   koi_period       9563 non-null   float64
 3   koi_duration     9563 non-null   float64
 4   koi_depth        9563 non-null   float64
 5   koi_time0bk      9563 non-null   float64
 6   koi_prad         9563 non-null   float64
 7   koi_ror          9563 non-null   float64
 8   koi_dor          9563 non-null   float64
 9   koi_impact       9563 non-null   float64
 10  koi_incl         9563 non-null   float64
 11  koi_srad         9563 non-null   float64
 12  koi_smass        9563 non-null   float64
 13  koi_steff        9563 non-null   float64
 14  koi_slogg        9563 non-null   float64
 15  koi_smet         9563 non-null   float64
 16  koi_kepmag       9563 non-null   float64
 17  koi_fpflag_nt    95

In [6]:
# Replace the disposition strings in `df` with integer class ids.
# LabelEncoder numbers the classes in sorted label order; `le.classes_`
# maps the ids back to the original strings.
le = LabelEncoder()
le.fit(df['koi_disposition'].astype(str))
df['koi_disposition'] = le.transform(df['koi_disposition'].astype(str))
df['koi_disposition'].value_counts()

koi_disposition
2    4838
1    2746
0    1979
Name: count, dtype: int64

In [7]:
# Same integer encoding for the target column of `df2`.
# A fresh encoder is fitted here, which rebinds `le`.
le = LabelEncoder()
df2 = df2.assign(koi_disposition=le.fit_transform(df2['koi_disposition'].astype(str)))
df2['koi_disposition'].value_counts()

koi_disposition
2    4838
1    2746
0    1979
Name: count, dtype: int64

In [8]:
# Target and feature matrix taken from `df`.
Y = df['koi_disposition']
X = df.drop(columns=['koi_disposition'])

In [9]:
# Target and feature matrix taken from `df2` (the frame without the 'err' columns).
y = df2['koi_disposition']
x = df2.drop(columns=['koi_disposition'])

In [10]:
# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# Each split is stratified on its OWN target (Y for X, y for x) so the class
# proportions are preserved and neither line depends on the other dataset.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=42)

In [11]:
def _scaled(estimator):
    """Return a pipeline that standardizes the features, then fits `estimator`.

    The steps are named "scaler" and "model".
    """
    return Pipeline([
        ("scaler", StandardScaler()),
        ("model", estimator),
    ])


# Candidate classifiers keyed by the display name used when reporting scores.
# Logistic regression, SVM and k-NN are wrapped with a StandardScaler; the
# tree-based models are fitted on the raw features.
models = {
    "Logistic Regression": _scaled(LogisticRegression(max_iter=1000, class_weight="balanced")),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42, n_estimators=200),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Support Vector Machine": _scaled(SVC(class_weight="balanced")),
    "K-Nearest Neighbors": _scaled(KNeighborsClassifier(n_neighbors=7)),
}

In [12]:
# with err columns

# Test accuracy (in percent) per model on the feature set that includes the
# error columns. The scores get their own name because `results` is reassigned
# by the "without err columns" run, which would otherwise discard them.
# `results` is bound to the same dict so code that reads `results` still works.
results_with_err = results = {}
for name, model in models.items():
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    acc = accuracy_score(Y_test, Y_pred) * 100  # in percentage
    results_with_err[name] = acc
    print(f"{name} Accuracy: {acc:.2f}%")

Logistic Regression Accuracy: 90.38%
Decision Tree Accuracy: 88.87%
Random Forest Accuracy: 92.11%
Gradient Boosting Accuracy: 92.58%
Support Vector Machine Accuracy: 84.42%
K-Nearest Neighbors Accuracy: 82.85%


In [13]:
# without err columns

# Refit every model on the reduced feature set and record its test accuracy.
# Note: this refits the same estimator objects stored in `models`.
results = {}
for name, model in models.items():
    y_pred = model.fit(x_train, y_train).predict(x_test)
    acc = 100 * accuracy_score(y_test, y_pred)  # in percentage
    results[name] = acc
    print(f"{name} Accuracy: {acc:.2f}%")

Logistic Regression Accuracy: 88.08%
Decision Tree Accuracy: 87.77%
Random Forest Accuracy: 91.53%
Gradient Boosting Accuracy: 90.64%
Support Vector Machine Accuracy: 82.70%
K-Nearest Neighbors Accuracy: 80.71%


In [None]:
# with err columns
# Soft-voting ensemble of a random forest and a gradient-boosting model,
# trained on the feature set that includes the error columns.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
gb = GradientBoostingClassifier(random_state=42)

voting_clf = VotingClassifier(
    estimators=[('rf', rf), ('gb', gb)],
    voting='soft',  # predict from the combined class probabilities of both models
)

voting_clf.fit(X_train, Y_train)

Y_pred = voting_clf.predict(X_test)

# Accuracy is printed here as a fraction in [0, 1], not as a percentage.
print("Voting (RF + GB) Accuracy:", accuracy_score(Y_test, Y_pred))
print("\nClassification Report:\n", classification_report(Y_test, Y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(Y_test, Y_pred))
