In [15]:
import pandas
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import precision_recall_fscore_support

In [2]:
asteroids = pandas.read_csv("asteroids.csv", delimiter=",")

In [3]:
asteroids

Unnamed: 0,id,name,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,orbiting_body,sentry_object,absolute_magnitude,hazardous
0,2162635,162635 (2000 SS164),1.198271,2.679415,13569.249224,5.483974e+07,Earth,False,16.73,False
1,2277475,277475 (2005 WK4),0.265800,0.594347,73588.726663,6.143813e+07,Earth,False,20.00,True
2,2512244,512244 (2015 YE18),0.722030,1.614507,114258.692129,4.979872e+07,Earth,False,17.83,False
3,3596030,(2012 BV13),0.096506,0.215794,24764.303138,2.543497e+07,Earth,False,22.20,False
4,3667127,(2014 GE35),0.255009,0.570217,42737.733765,4.627557e+07,Earth,False,20.09,True
...,...,...,...,...,...,...,...,...,...,...
90831,3763337,(2016 VX1),0.026580,0.059435,52078.886692,1.230039e+07,Earth,False,25.00,False
90832,3837603,(2019 AD3),0.016771,0.037501,46114.605073,5.432121e+07,Earth,False,26.00,False
90833,54017201,(2020 JP3),0.031956,0.071456,7566.807732,2.840077e+07,Earth,False,24.60,False
90834,54115824,(2021 CN5),0.007321,0.016370,69199.154484,6.869206e+07,Earth,False,27.80,False


### Feature Selection

We don't need the following data, as there is either no relation between it and whether or not the asteroid is dangerous,
or there is no data from it to pull (all data is the same):
* id
* name
* orbiting_body
* sentry_object

In [4]:
asteroids = asteroids[["est_diameter_min","est_diameter_max","relative_velocity","miss_distance","absolute_magnitude","hazardous"]]

In [5]:
asteroids.corr()

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude,hazardous
est_diameter_min,1.0,1.0,0.221553,0.142241,-0.560188,0.183363
est_diameter_max,1.0,1.0,0.221553,0.142241,-0.560188,0.183363
relative_velocity,0.221553,0.221553,1.0,0.327169,-0.353863,0.191185
miss_distance,0.142241,0.142241,0.327169,1.0,-0.264168,0.042302
absolute_magnitude,-0.560188,-0.560188,-0.353863,-0.264168,1.0,-0.365267
hazardous,0.183363,0.183363,0.191185,0.042302,-0.365267,1.0


From this, we can see there are no direct correlations between hazardous and the other variables, so there is nothing further we can do for feature selection

In [6]:
# From here, split the datasets so we can do some AI
X_data = asteroids.drop("hazardous", axis=1).to_numpy()
y_data = asteroids["hazardous"].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, random_state=42)

### Challenges
One of the challenges to this dataset is the uneven split between safe and unsafe asteroids (about 80/20 split).  If we just assumed all asteroids were safe, we would have a model that is 80% accurate, but unprecise.  To combat this, we can downsample the training data so that we can achieve a more moderate split

In [9]:
# downsample here
train_true = y_train[y_train == True].shape[0]
train_false = y_train[y_train == False].shape[0]

# process : pull all true data out, then pull an equal number of falses out, and use that to create a 50/50 split in data
true_index = np.where(y_train == True)
false_index = np.where(y_train == False)[0]

num_to_get = len(true_index[0])
N = np.random.choice(X_train[false_index].shape[0], num_to_get)

X_false_data = X_train[N]
X_true_data = X_train[true_index]

y_false_data = y_train[N]
y_true_data = y_train[true_index]

X_down = np.concatenate((X_false_data, X_true_data), axis=0)
y_down = np.concatenate((y_false_data, y_true_data), axis=0)

np.random.shuffle(X_down)
np.random.shuffle(y_down) #in place, no need to return anything

print(X_down.shape)
print(y_down.shape)

(13270, 5)
(13270,)


### Training models on training dataset

In [30]:
# SVN baby
pipeline = make_pipeline(StandardScaler(), SVC(kernel='rbf', C=0.7))
pipeline.fit(X_down, y_down)

Pipeline(steps=[('standardscaler', StandardScaler()), ('svc', SVC(C=0.7))])

In [31]:
predictions = pipeline.predict(X_train)


In [32]:
conf_matrix = confusion_matrix(predictions, y_train)
print(conf_matrix)

correct_class_samples = np.sum(np.diag(conf_matrix))
prob_error = 1 - (correct_class_samples / 13270)

print("POE : {}".format(prob_error))

precision, recall, fscore, _ = precision_recall_fscore_support(predictions, y_train, average='macro')

print("\nPrecision {}\nRecall {}\nfscore {}".format(precision, recall, fscore))

[[  369   157]
 [61123  6478]]
POE : 0.48402411454408445

Precision 0.49116919210326326
Recall 0.39867394867772277
fscore 0.09321213025012158


With a plain SVC, we don't get great results, with a probability of error of 48%, worse than a coin flip

In [34]:
# train SVC without downsampling
pipeline = make_pipeline(StandardScaler(), SVC(kernel='rbf', C=0.7))
pipeline.fit(X_train, y_train)

predictions = pipeline.predict(X_test)

conf_matrix = confusion_matrix(predictions, y_test)
print(conf_matrix)

correct_class_samples = np.sum(np.diag(conf_matrix))
prob_error = 1 - (correct_class_samples / 13270)

print("POE : {}".format(prob_error))

precision, recall, fscore, _ = precision_recall_fscore_support(predictions, y_test, average='macro')

print("\nPrecision {}\nRecall {}\nfscore {}".format(precision, recall, fscore))

[[20432  1962]
 [   72   243]]
POE : -0.5580256217030897

Precision 0.5533462858416874
Recall 0.8419079090062389
fscore 0.5727211724822336


In fact, training without downsampling achieves better results, with a higher recall score, but a