In [7]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import r2_score
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

In [8]:
#convert csv into dataframe
neo_df = pd.read_csv('Resources/neo.csv')
neo_df.head()

Unnamed: 0,id,name,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,orbiting_body,sentry_object,absolute_magnitude,hazardous
0,2162635,162635 (2000 SS164),1.198271,2.679415,13569.249224,54839740.0,Earth,False,16.73,False
1,2277475,277475 (2005 WK4),0.2658,0.594347,73588.726663,61438130.0,Earth,False,20.0,True
2,2512244,512244 (2015 YE18),0.72203,1.614507,114258.692129,49798720.0,Earth,False,17.83,False
3,3596030,(2012 BV13),0.096506,0.215794,24764.303138,25434970.0,Earth,False,22.2,False
4,3667127,(2014 GE35),0.255009,0.570217,42737.733765,46275570.0,Earth,False,20.09,True


In [9]:
#drop unnecessary columns
neo_df = neo_df.drop(['id', 'name', 'orbiting_body', 'sentry_object'], axis=1)
neo_df.head()

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude,hazardous
0,1.198271,2.679415,13569.249224,54839740.0,16.73,False
1,0.2658,0.594347,73588.726663,61438130.0,20.0,True
2,0.72203,1.614507,114258.692129,49798720.0,17.83,False
3,0.096506,0.215794,24764.303138,25434970.0,22.2,False
4,0.255009,0.570217,42737.733765,46275570.0,20.09,True


In [10]:
#encode qualitative data
neo_df = pd.get_dummies(neo_df, columns=['hazardous'])
neo_df.head()

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude,hazardous_False,hazardous_True
0,1.198271,2.679415,13569.249224,54839740.0,16.73,1,0
1,0.2658,0.594347,73588.726663,61438130.0,20.0,0,1
2,0.72203,1.614507,114258.692129,49798720.0,17.83,1,0
3,0.096506,0.215794,24764.303138,25434970.0,22.2,1,0
4,0.255009,0.570217,42737.733765,46275570.0,20.09,0,1


In [11]:
#delete the hazardous_False column since our model will be detectring postive values for hazardous_True
neo_df = neo_df.drop(['hazardous_False'], axis=1)
neo_df.head()

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude,hazardous_True
0,1.198271,2.679415,13569.249224,54839740.0,16.73,0
1,0.2658,0.594347,73588.726663,61438130.0,20.0,1
2,0.72203,1.614507,114258.692129,49798720.0,17.83,0
3,0.096506,0.215794,24764.303138,25434970.0,22.2,0
4,0.255009,0.570217,42737.733765,46275570.0,20.09,1


In [12]:
# Separate the y variable, the labels
y = neo_df['hazardous_True']
y.head()

0    0
1    1
2    0
3    0
4    1
Name: hazardous_True, dtype: uint8

In [13]:
# Separate the X variable, the features
X = neo_df.drop(columns=["hazardous_True"])
X.head()

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude
0,1.198271,2.679415,13569.249224,54839740.0,16.73
1,0.2658,0.594347,73588.726663,61438130.0,20.0
2,0.72203,1.614507,114258.692129,49798720.0,17.83
3,0.096506,0.215794,24764.303138,25434970.0,22.2
4,0.255009,0.570217,42737.733765,46275570.0,20.09


In [14]:
# 1 = hazardous
# 0 = not hazardous
y.value_counts()

0    81996
1     8840
Name: hazardous_True, dtype: int64

In [15]:
#for binning purposes:
unique_values = neo_df.nunique()
print(unique_values)

est_diameter_min       1638
est_diameter_max       1638
relative_velocity     90828
miss_distance         90536
absolute_magnitude     1638
hazardous_True            2
dtype: int64


In [16]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Initialize the model
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

In [18]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [19]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.92


In [20]:
# Print other evaluation metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.96     16439
           1       0.62      0.43      0.51      1729

    accuracy                           0.92     18168
   macro avg       0.78      0.70      0.73     18168
weighted avg       0.91      0.92      0.91     18168



In [21]:
#Reformat Classfication Report as DataFrame for readability
metrics_df = pd.DataFrame({
    'Metric': ['Precision', 'Recall', 'F1-Score'],
    '% Non-hazardous (Class 0)': [precision_class_0, recall_class_0, f1_score_class_0],
    '% Hazardous (Class 1)': [precision_class_1, recall_class_1, f1_score_class_1]
})

# Display the DataFrame
display(metrics_df)

NameError: name 'precision_class_0' is not defined

In [None]:
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))

Confusion Matrix
[[15975   464]
 [  978   751]]


In [None]:
tn, fp, fn, tp = conf_matrix.ravel()
print(f'True Negative (Correctly predicted as non-hazardous): {tn}')
print(f'False Positive (Non-hazardous, incorrectly predicted as hazardous): {fp}')
print(f'False Negative (Hazardous, incorrectly predicted as non-hazardous): {fn}')
print(f'True Positive (Correctly predicted as hazardous): {tp}')

True Negative (Correctly predicted as non-hazardous): 15975
False Positive (Non-hazardous, incorrectly predicted as hazardous): 464
False Negative (Hazardous, incorrectly predicted as non-hazardous): 978
True Positive (Correctly predicted as hazardous): 751


In [None]:
accuracy = accuracy_score(y_test, y_pred)

# Check if accuracy meets the criteria
if accuracy >= 0.75:
    print(f"Classification Accuracy: {accuracy:.2f} meets the criteria.")
else:
    print(f"Classification Accuracy: {accuracy:.2f} does not meet the criteria.")

Classification Accuracy: 0.92 meets the criteria.


In [None]:
y_pred = model.predict(X_test)

# Calculate R-squared
r_squared = r2_score(y_test, y_pred)

# Check if R-squared meets the criteria
if r_squared >= 0.80:
    print(f"R-squared: {r_squared:.2f} meets the criteria.")
else:
    print(f"R-squared: {r_squared:.2f} does not meet the criteria.")

R-squared: 0.08 does not meet the criteria.
