In [1]:
import pandas as pd

OBJECTS = '/kaggle/input/nasa-nearest-earth-objects-1910-2024/nearest-earth-objects(1910-2024).csv'

# the ordbiting body is always Earth so we can drop that column
# we only have 28 rows with missing values so we can drop them
df = pd.read_csv(filepath_or_buffer=OBJECTS).drop(columns=['orbiting_body']).dropna()
df.head()

Unnamed: 0,neo_id,name,absolute_magnitude,estimated_diameter_min,estimated_diameter_max,relative_velocity,miss_distance,is_hazardous
0,2162117,162117 (1998 SD15),19.14,0.394962,0.883161,71745.401048,58143620.0,False
1,2349507,349507 (2008 QY),18.5,0.530341,1.185878,109949.757148,55801050.0,True
2,2455415,455415 (2003 GA),21.45,0.136319,0.304818,24865.506798,67206890.0,False
3,3132126,(2002 PB),20.63,0.198863,0.444672,78890.076805,30396440.0,False
4,3557844,(2011 DW),22.7,0.076658,0.171412,56036.519484,63118630.0,False


In [2]:
df.nunique()

neo_id                     33511
name                       33511
absolute_magnitude          1778
estimated_diameter_min      1778
estimated_diameter_max      1778
relative_velocity         338161
miss_distance             337798
is_hazardous                   2
dtype: int64

We have a lot of observations relative to the number of objects; are the observations evenly distributed?

In [3]:
df['name'].value_counts().head(n=10)

name
277810 (2006 FV35)               211
469219 Kamo`oalewa (2016 HO3)    197
(2014 OL339)                     192
(2017 FZ2)                       186
(2023 FW13)                      169
(2022 YF4)                       151
164207 (2004 GU9)                135
85770 (1998 UP1)                 101
138852 (2000 WN10)                99
(2015 RE36)                       98
Name: count, dtype: int64

What percentage of objects are hazardous?

In [4]:
print(df[['name', 'is_hazardous']].drop_duplicates(ignore_index=True)['is_hazardous'].value_counts(normalize=True).to_dict())

{False: 0.9256065172629883, True: 0.07439348273701173}


Fortunately only about 7.4% of the objects in the sample are hazardous.

Let's see if we can do some dimension reduction to cluster the observations. We have too many rows to use them all, so we need to take a sample.

In [5]:
import arrow
from umap import UMAP

umap_df = df.sample(frac=0.02, random_state=2024)
COLUMNS = [key for key, value in df.dtypes.to_dict().items() if str(value) == 'float64']
time_start = arrow.now()
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=500)
umap_df[['x', 'y']] = umap.fit_transform(X=umap_df[COLUMNS])
print('done with UMAP in {}'.format(arrow.now() - time_start))

2024-07-18 17:43:55.528718: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-18 17:43:55.528871: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-18 17:43:55.708425: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=500, n_jobs=1, random_state=2024, verbose=True)
Thu Jul 18 17:44:07 2024 Construct fuzzy simplicial set
Thu Jul 18 17:44:07 2024 Finding Nearest Neighbors
Thu Jul 18 17:44:07 2024 Building RP forest with 9 trees
Thu Jul 18 17:44:12 2024 NN descent for 13 iterations
	 1  /  13
	 2  /  13
	Stopping threshold met -- exiting after 2 iterations
Thu Jul 18 17:44:29 2024 Finished Nearest Neighbor Search
Thu Jul 18 17:44:33 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Thu Jul 18 17:44:53 2024 Finished embedding
done with UMAP in 0:00:46.698382


In [6]:
import warnings
from plotly import express

warnings.filterwarnings(action='ignore', category=FutureWarning)
express.scatter(data_frame=umap_df, x='x', y='y', color='is_hazardous', facet_col='is_hazardous')

This is not entirely encouraging; let's build a regression model and see what happens.

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

TARGET = 'is_hazardous'
X_train, X_test, y_train, y_test = train_test_split(df[COLUMNS], df[TARGET], test_size=0.2, random_state=2024, stratify=df[TARGET])
model = LogisticRegression(max_iter=1000, tol=1e-12).fit(X_train, y_train)
print('model fit in {} iterations'.format(model.n_iter_[0]))

print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=model.predict(X=X_test))))

model fit in 25 iterations
accuracy: 0.8724


In [8]:
from sklearn.metrics import classification_report

print(classification_report(zero_division=0 , y_true=y_test, y_pred=model.predict(X=X_test)))

              precision    recall  f1-score   support

       False       0.87      1.00      0.93     59002
        True       0.00      0.00      0.00      8633

    accuracy                           0.87     67635
   macro avg       0.44      0.50      0.47     67635
weighted avg       0.76      0.87      0.81     67635



Logistic regression does not find any of the hazardous objects. That's not good.


Can we do any better with AdaBoost?

In [9]:
from sklearn.ensemble import AdaBoostClassifier

adaboost = AdaBoostClassifier(algorithm='SAMME', random_state=2024).fit(X=X_train, y=y_train)

print(classification_report(zero_division=0 , y_true=y_test, y_pred=adaboost.predict(X=X_test)))

              precision    recall  f1-score   support

       False       0.89      0.99      0.94     59002
        True       0.70      0.15      0.25      8633

    accuracy                           0.88     67635
   macro avg       0.80      0.57      0.60     67635
weighted avg       0.87      0.88      0.85     67635



Turns out this is a hard problem to solve with the available data.