In [1]:
import pandas as pd

# Kaggle input path of the NASA asteroid classification CSV.
NASA = '/kaggle/input/nasa-asteroids-classification/nasa.csv'

# Read the CSV with its first column as the index, then discard two columns
# that are not used anywhere in this analysis.
df = (
    pd.read_csv(NASA, index_col=[0])
    .drop(columns=['Orbiting Body', 'Equinox'])
)

We probably have unbalanced classes. Let's check.

In [2]:
from plotly import express
# Class-balance check: one pie slice per value of the target column.
express.pie(
    df,
    names='Hazardous',
    color='Hazardous',
)

Honestly, it's probably a good thing for humanity that hazardous asteroids (`True`) are the minority class, but the imbalance makes building a good classifier difficult.

Let's do the dumb thing first.

In [3]:
import arrow
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

time_start = arrow.now()

# Name the target first so the feature list can refer to it instead of
# repeating the string literal.
TARGET = 'Hazardous'
# Features: every column except the target and the columns whose name ends in 'Date'.
COLUMNS = [column for column in df.columns if column != TARGET and not column.endswith('Date')]

# Stratify on the target so train and test keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    df[COLUMNS], df[TARGET], test_size=0.20, random_state=2024, stratify=df[TARGET],
)

# Deliberately naive baseline: plain logistic regression with no feature
# scaling and no class weighting.
model = LogisticRegression(max_iter=100000, tol=1e-12).fit(X=X_train, y=y_train)

# Predict once and reuse the result for every metric instead of re-running
# inference for each print.
baseline_pred = model.predict(X=X_test)
print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=baseline_pred)))
print('f1: {:5.4f}'.format(f1_score(y_true=y_test, y_pred=baseline_pred)))
print(classification_report(zero_division=0, y_true=y_test, y_pred=baseline_pred))

print('model done in {}'.format(arrow.now() - time_start))

accuracy: 0.8390
f1: 0.0000
              precision    recall  f1-score   support

       False       0.84      1.00      0.91       787
        True       0.00      0.00      0.00       151

    accuracy                           0.84       938
   macro avg       0.42      0.50      0.46       938
weighted avg       0.70      0.84      0.77       938

model done in 0:00:00.156656


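We get this strong divergence between accuracy and F1 because our model always returns the dominant class: its accuracy is simply the majority share of the test set (787/938 ≈ 0.84), while recall on the hazardous class, and therefore F1, is zero. Let's try another model.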

In [4]:
import arrow
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import f1_score

# SAMME is requested explicitly; the fixed seed keeps the fit reproducible.
# Reuses the train/test split created for the logistic-regression baseline.
adaboost = AdaBoostClassifier(algorithm='SAMME', random_state=2024).fit(X=X_train, y=y_train)

# Predict once and share the result between both reports.
adaboost_pred = adaboost.predict(X=X_test)
print('f1: {:5.4f}'.format(f1_score(y_true=y_test, y_pred=adaboost_pred)))
print(classification_report(zero_division=0, y_true=y_test, y_pred=adaboost_pred))

f1: 0.9834
              precision    recall  f1-score   support

       False       1.00      1.00      1.00       787
        True       0.99      0.98      0.98       151

    accuracy                           0.99       938
   macro avg       0.99      0.99      0.99       938
weighted avg       0.99      0.99      0.99       938



So much better. Why didn't we try that to begin with?