In [71]:
#Import your dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import joblib
import sqlite3

## Pre-Modeling

retrieving data from SQL

In [45]:
# sources:
# https://www.freecodecamp.org/news/how-to-read-and-write-data-to-a-sql-database-using-python/
# https://theleftjoin.com/how-to-write-a-pandas-dataframe-to-an-sqlite-table/
# https://stackoverflow.com/questions/48594217/pandas-read-sql-query-is-putting-header-info-in-first-row-of-dataframe
# https://datacarpentry.org/python-ecology-lesson/instructor/09-working-with-sql.html

In [46]:
# Connect to SQLite database
connection = sqlite3.connect('nasa_data.db')

# Create a cursor
cursor = connection.cursor()

# Define the query
query = "SELECT * FROM nasa;"

# Execute the query
cursor.execute(query)

# Fetch all results
data = cursor.fetchall()

# Extract column names from cursor description
columns = [desc[0] for desc in cursor.description]

# Create a DataFrame
nasa = pd.DataFrame(data, columns=columns)

# Close the database connection
connection.close()

# Preview the dataframe
nasa.head()

Unnamed: 0,index,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude,hazardous
0,0,1.198271,2.679415,13569.249224,54839740.0,16.73,False
1,1,0.2658,0.594347,73588.726663,61438130.0,20.0,True
2,2,0.72203,1.614507,114258.692129,49798720.0,17.83,False
3,3,0.096506,0.215794,24764.303138,25434970.0,22.2,False
4,4,0.255009,0.570217,42737.733765,46275570.0,20.09,True


In [47]:
# drop index, since it will confuse the model
nasa = nasa.drop(['index'], axis=1)

# encode qualitative data
nasa = pd.get_dummies(nasa, columns=['hazardous'])

# delete the 'hazardous_False' column since our model will be detectring postive values for hazardous_True
nasa = nasa.drop(['hazardous_False'], axis=1)

nasa.head()

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude,hazardous_True
0,1.198271,2.679415,13569.249224,54839740.0,16.73,0
1,0.2658,0.594347,73588.726663,61438130.0,20.0,1
2,0.72203,1.614507,114258.692129,49798720.0,17.83,0
3,0.096506,0.215794,24764.303138,25434970.0,22.2,0
4,0.255009,0.570217,42737.733765,46275570.0,20.09,1


## Data Modeling

initializing

In [48]:
# Sources: code taken from our Module 20 homework and in-class assignments

In [49]:
# Separate the y variable, the labels
y = nasa['hazardous_True']
y.head()

0    0
1    1
2    0
3    0
4    1
Name: hazardous_True, dtype: uint8

In [50]:
# Separate the X variable, the features
X = nasa.drop(columns=["hazardous_True"])
X.head()

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude
0,1.198271,2.679415,13569.249224,54839740.0,16.73
1,0.2658,0.594347,73588.726663,61438130.0,20.0
2,0.72203,1.614507,114258.692129,49798720.0,17.83
3,0.096506,0.215794,24764.303138,25434970.0,22.2
4,0.255009,0.570217,42737.733765,46275570.0,20.09


In [51]:
# 1 = hazardous
# 0 = not hazardous
y.value_counts()

0    81996
1     8840
Name: hazardous_True, dtype: int64

In [52]:
#for binning purposes:
unique_values = nasa.nunique()
print(unique_values)

est_diameter_min       1638
est_diameter_max       1638
relative_velocity     90828
miss_distance         90536
absolute_magnitude     1638
hazardous_True            2
dtype: int64


In [53]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [54]:
# Initialize the model
model = RandomForestClassifier(random_state=42)

training

In [55]:
# Train the model
model.fit(X_train, y_train)

In [56]:
# Make predictions on the test set
y_pred = model.predict(X_test)

evaluating

In [57]:
# Print evaluation metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred) * 100
print(f"Accuracy: {accuracy:.2f}%")


Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.96     16439
           1       0.62      0.43      0.51      1729

    accuracy                           0.92     18168
   macro avg       0.78      0.70      0.73     18168
weighted avg       0.91      0.92      0.91     18168

Accuracy: 92.06%


In [58]:
conf_matrix = (confusion_matrix(y_test, y_pred))
print('Confusion Matrix')
print(conf_matrix)

Confusion Matrix
[[15975   464]
 [  978   751]]


In [59]:
tn, fp, fn, tp = conf_matrix.ravel()
print(f'True Negative (Correctly predicted as non-hazardous): {tn}')
print(f'False Positive (Non-hazardous, incorrectly predicted as hazardous): {fp}')
print(f'False Negative (Hazardous, incorrectly predicted as non-hazardous): {fn}')
print(f'True Positive (Correctly predicted as hazardous): {tp}')

True Negative (Correctly predicted as non-hazardous): 15975
False Positive (Non-hazardous, incorrectly predicted as hazardous): 464
False Negative (Hazardous, incorrectly predicted as non-hazardous): 978
True Positive (Correctly predicted as hazardous): 751


## Data Model Optimization

altering the class weights

In [60]:
# rather than trying to find the best comibation of class weights through a random process of trial and error,
# I decided to research ways to calculate the best class weights within my code.
# sources: 
# https://www.analyticsvidhya.com/blog/2020/10/improve-class-imbalance-class-weights/
# https://stackoverflow.com/questions/65716209/optimal-ways-to-calculate-class-weights-for-large-datasets
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
# https://www.mygreatlearning.com/blog/gridsearchcv/

In [61]:
# Define the parameter grid
param_grid = {
    'class_weight': [{0: 1, 1: w} for w in [1, 2, 5, 10, 20, 50, 100]]
}

# Initialize the model
model = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(model, param_grid, scoring='accuracy', cv=5)

# Fit the model with different class weights
grid_search.fit(X_train, y_train)

In [62]:
# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Use the best model
best_model = grid_search.best_estimator_

Best Parameters: {'class_weight': {0: 1, 1: 100}}


In [63]:
# Initialize the model with updated class weights
model = RandomForestClassifier(random_state=42, class_weight=best_params['class_weight'])

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

In [64]:
# Print evaluation metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred) * 100
print(f"Accuracy: {accuracy:.2f}%")


Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.96     16439
           1       0.63      0.41      0.49      1729

    accuracy                           0.92     18168
   macro avg       0.79      0.69      0.73     18168
weighted avg       0.91      0.92      0.91     18168

Accuracy: 92.09%


In [65]:
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix')
print(conf_matrix)


Confusion Matrix
[[16027   412]
 [ 1025   704]]


In [66]:
# Check feature importance
feature_importance = pd.Series(model.feature_importances_, index=X.columns)
print("Feature Importance:")
print(feature_importance)

Feature Importance:
est_diameter_min      0.284718
est_diameter_max      0.244171
relative_velocity     0.118506
miss_distance         0.106222
absolute_magnitude    0.246384
dtype: float64


In [67]:
tn, fp, fn, tp = conf_matrix.ravel()
print(f'True Negative (Correctly predicted as non-hazardous): {tn}')
print(f'False Positive (Non-hazardous, incorrectly predicted as hazardous): {fp}')
print(f'False Negative (Hazardous, incorrectly predicted as non-hazardous): {fn}')
print(f'True Positive (Correctly predicted as hazardous): {tp}')

True Negative (Correctly predicted as non-hazardous): 16027
False Positive (Non-hazardous, incorrectly predicted as hazardous): 412
False Negative (Hazardous, incorrectly predicted as non-hazardous): 1025
True Positive (Correctly predicted as hazardous): 704


thresholds

In [73]:
# Get predicted probabilities
y_prob = model.predict_proba(X_test)[:, 1]

In [74]:
# source:
# https://stackoverflow.com/questions/67315332/using-for-loop-to-create-threshold-evaluation

thresholds = np.linspace(0, 1, 100)

best_threshold = 0
best_accuracy = 0

for threshold in thresholds:
    y_pred_threshold = (y_prob > threshold).astype(int)
    accuracy = accuracy_score(y_test, y_pred_threshold)
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_threshold = threshold

print(f"Best Threshold for Maximum Accuracy: {best_threshold:.3f}")
print(f"Maximum Accuracy: {best_accuracy:.2%}")

Best Threshold for Maximum Accuracy: 0.576
Maximum Accuracy: 92.38%


In [75]:
final_y_pred = (y_prob > best_threshold).astype(int)

# Evaluate the model with the best threshold
print("\nClassification Report with Best Threshold:")
print(classification_report(y_test, final_y_pred))

final_accuracy = accuracy_score(y_test, final_y_pred) * 100
print(f"Accuracy with Best Threshold: {final_accuracy:.2f}%")

final_conf_matrix = confusion_matrix(y_test, final_y_pred)
print('Confusion Matrix with Best Threshold')
print(final_conf_matrix)

tn_final, fp_final, fn_final, tp_final = final_conf_matrix.ravel()
print(f'True Negative (Correctly predicted as non-hazardous): {tn_final}')
print(f'False Positive (Non-hazardous, incorrectly predicted as hazardous): {fp_final}')
print(f'False Negative (Hazardous, incorrectly predicted as non-hazardous): {fn_final}')
print(f'True Positive (Correctly predicted as hazardous): {tp_final}')


Classification Report with Best Threshold:
              precision    recall  f1-score   support

           0       0.93      0.98      0.96     16439
           1       0.70      0.34      0.46      1729

    accuracy                           0.92     18168
   macro avg       0.82      0.66      0.71     18168
weighted avg       0.91      0.92      0.91     18168

Accuracy with Best Threshold: 92.38%
Confusion Matrix with Best Threshold
[[16189   250]
 [ 1135   594]]
True Negative (Correctly predicted as non-hazardous): 16189
False Positive (Non-hazardous, incorrectly predicted as hazardous): 250
False Negative (Hazardous, incorrectly predicted as non-hazardous): 1135
True Positive (Correctly predicted as hazardous): 594


assessing meaningfulness

## Data Model Implementation

create hypothetical asteroids using random generator

In [76]:
# save model as file
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [77]:
# Save the trained model to a file
joblib.dump(model, 'asteroidpredictor.joblib')

['asteroidpredictor.joblib']

input into model