In [None]:
#>>> Near Earth Object Machine Learning Models by Christopher Madden.

#___________________ ¶¶¶¶¶¶¶¶ 
#_______________¶¶¶¶¶ _______¶¶¶¶¶ 
#_____________¶¶¶ ________________¶¶¶ 
#___________¶¶¶ ____________________¶¶¶ 
#__________¶¶ ________________________¶¶ 
#_________¶ ______¶¶¶_____¶¶¶__________¶¶ 
#________¶ _________¶¶______¶¶__________¶¶ 
#_______¶¶ __________¶¶______¶¶_________¶¶ 
#_______¶ ____________¶¶______¶¶___¶¶¶___¶¶ 
#______¶¶ _____¶¶_____¶¶______¶¶_____¶¶__¶¶ 
#______¶¶ ___¶¶¶______¶¶______¶¶______¶¶_¶¶ 
#______¶¶ __¶¶¶¶¶__________________¶¶_¶¶_¶¶ 
#_______¶ __¶¶__¶¶_________________¶¶____¶¶ 
#_______¶¶ ______¶¶______________¶¶¶____¶¶ 
#________¶¶ ______¶¶____________¶¶¶_____¶¶ 
#_________¶¶ _______¶¶¶¶_____¶¶¶¶______¶¶ 
#__________¶¶ _________¶¶¶¶¶¶¶________¶¶ 
#____________¶¶ ____________________¶¶ 
#_____________¶¶¶ ______________¶¶¶ 
#_______________ ¶¶¶¶¶¶¶¶¶¶¶¶¶¶¶

# LIBRARIES AND RESOURCES

In [None]:
#>>> Import dependencies.
import datetime
from collections import Counter

import numpy as np
import pandas as pd
import tabpy_client
import tensorflow as tf
from autoviz.AutoViz_Class import AutoViz_Class
from imblearn.combine import SMOTEENN
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler
from keras.preprocessing.image import ImageDataGenerator
from matplotlib import pyplot as plt
from sklearn import tree
from sklearn.ensemble import VotingClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, balanced_accuracy_score, DetCurveDisplay
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.svm import SVC
from sqlalchemy import create_engine
from tabpy.tabpy_tools.client import Client

from config import db_password

#>>> Define TabPy client.
#    Points at a TabPy server on this machine (port 9004); `client` is used by the
#    deployment cell at the end of the notebook.
client = tabpy_client.Client('http://localhost:9004/')

#>>> Define database.
#    Connection URL for the local PostgreSQL database FINAL_PROJECT; the password is
#    read from the local `config` module rather than written into the notebook.
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/FINAL_PROJECT"

# PREPROCESS THE DATASET.

In [None]:
#>>> Open a SQLAlchemy engine on the project database and read the 'neo' table into a DataFrame.
engine = create_engine(db_string)
neo_df = pd.read_sql_table('neo', engine)

#>>> Preview the first 10 rows.
neo_df.head(n=10)

In [None]:
#>>> Display an overview of the dataset (the DataFrame.info() summary of neo_df).
neo_df.info()

In [None]:
#>>> Determine the number of unique values in each column of neo_df.
neo_df.nunique()

In [None]:
#>>> Drop unnecessary columns: 'id', 'orbiting_body', 'sentry_object'.
#    The `columns=` keyword is used because passing the axis positionally
#    (`drop(labels, 1)`) was deprecated in pandas 1.3 and removed in pandas 2.0.
neo_df = neo_df.drop(columns=['id', 'orbiting_body', 'sentry_object'])

#>>> Set index to 'name'.
neo_df = neo_df.set_index('name')

#>>> Display the first 10 rows.
neo_df.head(10)

In [None]:
#>>> Keep only the samples whose absolute magnitude is below 22.
is_below_cutoff = neo_df['absolute_magnitude'] < 22
neo_df = neo_df[is_below_cutoff]

#>>> Preview the first 10 rows.
neo_df.head(n=10)

In [None]:
#>>> Build the features set: every column of neo_df except the 'hazardous' target.
#    drop() returns a new DataFrame, so neo_df itself is left untouched.
X = neo_df.drop(columns='hazardous')

#>>> Preview the first 10 rows.
X.head(n=10)

In [None]:
#>>> Extract the 'hazardous' column as the target array.
y = neo_df.loc[:, 'hazardous'].values

#>>> Preview the first ten target values.
y[0:10]

In [None]:
#>>> Hold out 20% of the samples for testing; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.80, random_state=420
)

#>>> Report the shape of each training and testing partition.
for partition in (X_train, X_test, y_train, y_test):
    print(partition.shape)

In [None]:
#>>> Display the balance of the dataset: how many times each class label occurs in y_train.
Counter(y_train)

In [None]:
#>>> Implement combination sampling with SMOTEENN.
#    Only the training split is resampled: resampling the full X / y would let
#    test rows influence (and be mixed into) the synthetic training samples.
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
#    NOTE(review): none of the later cells visible here consume X_resampled /
#    y_resampled - confirm whether the models were meant to train on them.

#>>> Display the balance of the resampled training set.
Counter(y_resampled)

In [None]:
#>>> Fit a StandardScaler on the training features only.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

#>>> Apply the fitted scaling to both the training and the testing features.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

# AUTOVIZUALIZE DATA

In [None]:
#>>> Initialize the Autoviz class in an object.
AV = AutoViz_Class()

#>>> Must specify in order for AutoViz to display plots.
%matplotlib inline

#>>> Passing the source data and parameters.
graph = AV.AutoViz(
    filename="",
    save_plot_dir='./Images',
    sep=',',
    depVar='hazardous',
    dfte=neo_df,
    header=0,
    verbose=2,
    lowess=False,
    chart_format='jpg',
    max_rows_analyzed=1500000,
    max_cols_analyzed=30,
)

# SUPPORT VECTOR MACHINE MODEL

In [None]:
#>>> Build an RBF-kernel support vector classifier and train it on the scaled training data.
svm_model = SVC(kernel='rbf').fit(X_train_scaled, y_train)

#>>> Predict labels for the scaled test features.
svm_predictions = svm_model.predict(X_test_scaled)

#>>> Compute the confusion matrix and wrap it in a labelled DataFrame.
svm_cm = confusion_matrix(y_test, svm_predictions)
svm_cm_df = pd.DataFrame(
    data=svm_cm,
    index=['Actual Non-Hazardous', 'Actual Hazardous'],
    columns=['Predicted Non-Hazardous', 'Predicted Hazardous'],
)

#>>> Report the confusion matrix, the accuracy and the classification report.
print('Confusion Matrix')
display(svm_cm_df)
print(
    f'\nAccuracy Score : {accuracy_score(y_test, svm_predictions):.3f}',
    '\nClassification Report',
    classification_report(y_test, svm_predictions),
    sep='\n',
)

# LOGISTIC REGRESSION MODEL

In [None]:
#>>> Build a LogisticRegressionCV model (default settings) and train it on the scaled training data.
log_model = LogisticRegressionCV().fit(X_train_scaled, y_train)

#>>> Predict labels for the scaled test features.
log_predictions = log_model.predict(X_test_scaled)

#>>> Compute the confusion matrix and wrap it in a labelled DataFrame.
log_cm = confusion_matrix(y_test, log_predictions)
log_cm_df = pd.DataFrame(
    data=log_cm,
    index=['Actual Non-Hazardous', 'Actual Hazardous'],
    columns=['Predicted Non-Hazardous', 'Predicted Hazardous'],
)

#>>> Report the confusion matrix, the accuracy and the classification report.
print('Confusion Matrix')
display(log_cm_df)
print(
    f'\nAccuracy Score : {accuracy_score(y_test, log_predictions):.3f}',
    '\nClassification Report',
    classification_report(y_test, log_predictions),
    sep='\n',
)

# DECISION TREE CLASSIFICATION

In [None]:
#>>> Create the decision tree classifier instance.
#    The tree is seeded so that repeated runs give the same tree, metrics and
#    feature importances; 0 matches the seed used for the plotted tree below.
dt_model = tree.DecisionTreeClassifier(random_state=0)

#>>> Fit the model.
dt_model = dt_model.fit(X_train_scaled, y_train)

#>>> Make predictions using the testing data.
dt_predictions = dt_model.predict(X_test_scaled)

#>>> Calculate the confusion matrix.
dt_cm = confusion_matrix(y_test, dt_predictions)

#>>> Create a DataFrame from the confusion matrix.
dt_cm_df = pd.DataFrame(
    dt_cm, index=['Actual Non-Hazardous', 'Actual Hazardous'], columns=['Predicted Non-Hazardous', 'Predicted Hazardous'])

#>>> Display results.
print('Confusion Matrix')
display(dt_cm_df)
print(f'\nAccuracy Score : {accuracy_score(y_test, dt_predictions):.3f}')
print('\nClassification Report')
print(classification_report(y_test, dt_predictions))

In [None]:
#>>> Pair each decision tree importance with its feature name, then list the pairs from most to least important.
dt_importance_pairs = zip(dt_model.feature_importances_, X.columns)
sorted(dt_importance_pairs, reverse=True)

In [None]:
#>>> Display decision tree and save the output.
#    A separate tree, limited to 10 leaf nodes, is fit on the unscaled X_train for plotting.
clf = tree.DecisionTreeClassifier(max_leaf_nodes=10, random_state=0)
clf.fit(X_train, y_train)
plt.figure(figsize=(18,18))
#    Feature labels are taken from the columns the tree was fit on, so they always
#    match the real column names and order (the former hand-written list spelled
#    'absolute_magnitude' with a space).
tree.plot_tree(
    clf,
    feature_names=list(X_train.columns),
    class_names=['non-hazardous', 'hazardous'],
    filled=True,
    rounded=True,
    proportion=True,
    node_ids=True,
    fontsize=14,
)
plt.savefig('./Images/decision_tree.jpg')

# GRADIENT BOOSTED TREE

In [None]:
#>>> Compare gradient boosting classifiers over a range of learning rates.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    #>>> Same hyperparameters on every pass; only the learning rate changes.
    classifier = GradientBoostingClassifier(
        n_estimators=20,
        learning_rate=learning_rate,
        max_features=5,
        max_depth=3,
        random_state=0,
    )

    #>>> Train on the scaled training data.
    classifier.fit(X_train_scaled, y_train)
    print("Learning rate: ", learning_rate)

    #>>> Accuracy on the training split.
    training_accuracy = classifier.score(X_train_scaled, y_train)
    print("Accuracy score (training): {0:.3f}".format(training_accuracy))

    #>>> Accuracy on the held-out test split (reported as "validation").
    validation_accuracy = classifier.score(X_test_scaled, y_test)
    print("Accuracy score (validation): {0:.3f}".format(validation_accuracy))

In [None]:
#>>> Build the gradient boosted classifier with the chosen learning rate (0.75).
gbt_model = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=0.75,
    max_features=5,
    max_depth=3,
    random_state=0,
)

#>>> Train on the scaled training data.
gbt_model.fit(X_train_scaled, y_train)

#>>> Predict labels for the scaled test features.
gbt_predictions = gbt_model.predict(X_test_scaled)

#>>> Compute the confusion matrix and wrap it in a labelled DataFrame.
gbt_cm = confusion_matrix(y_test, gbt_predictions)
gbt_cm_df = pd.DataFrame(
    data=gbt_cm,
    index=['Actual Non-Hazardous', 'Actual Hazardous'],
    columns=['Predicted Non-Hazardous', 'Predicted Hazardous'],
)

#>>> Report the confusion matrix, the accuracy and the classification report.
print('Confusion Matrix')
display(gbt_cm_df)
print(
    f'\nAccuracy Score : {accuracy_score(y_test, gbt_predictions):.3f}',
    '\nClassification Report',
    classification_report(y_test, gbt_predictions),
    sep='\n',
)

In [None]:
#>>> Pair each gradient boosted tree importance with its feature name, then list the pairs from most to least important.
gbt_importance_pairs = zip(gbt_model.feature_importances_, X.columns)
sorted(gbt_importance_pairs, reverse=True)

# RANDOM FOREST CLASSIFIER

In [None]:
#>>> Build a seeded random forest of 128 trees and train it on the scaled training data.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78).fit(X_train_scaled, y_train)

#>>> Predict labels for the scaled test features.
rf_predictions = rf_model.predict(X_test_scaled)

#>>> Compute the confusion matrix and wrap it in a labelled DataFrame.
rf_cm = confusion_matrix(y_test, rf_predictions)
rf_cm_df = pd.DataFrame(
    data=rf_cm,
    index=['Actual Non-Hazardous', 'Actual Hazardous'],
    columns=['Predicted Non-Hazardous', 'Predicted Hazardous'],
)

#>>> Report the confusion matrix, the accuracy and the classification report.
print('Confusion Matrix')
display(rf_cm_df)
print(
    f'\nAccuracy Score : {accuracy_score(y_test, rf_predictions):.3f}',
    '\nClassification Report',
    classification_report(y_test, rf_predictions),
    sep='\n',
)

In [None]:
#>>> Pair each random forest importance with its feature name, then list the pairs from most to least important.
rf_importance_pairs = zip(rf_model.feature_importances_, X.columns)
sorted(rf_importance_pairs, reverse=True)

# VOTING CLASSIFIER

In [None]:
#>>> Create voting classifier.
#    Hard voting: the ensemble predicts the majority label of the five models.
eclf = VotingClassifier(estimators=[('Support Vector Machine', svm_model), ('Logistic Regression', log_model), ('Decision Tree', dt_model), ('Gradient Boosted Tree', gbt_model), ('Random Forests', rf_model)], voting='hard')

#>>> Run voting classifier.
#    Each model (and the ensemble) is cross-validated inside a StandardScaler
#    pipeline: the models above were trained on standardized features, and the
#    pipeline re-fits the scaler on each training fold so no held-out fold leaks
#    into the scaling. The loop variable is not called `clf`, so the plotted
#    decision tree stored under that name is not overwritten.
for estimator, label in zip([svm_model, log_model, dt_model, gbt_model, rf_model, eclf], ['Support Vector Machine', 'Logistic Regression', 'Decision Tree', 'Gradient Boosted Tree', 'Random Forests', 'Ensemble']):
    scaled_estimator = make_pipeline(StandardScaler(), estimator)
    scores = cross_val_score(scaled_estimator, X, y, scoring='accuracy', cv=5)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

# DEPLOY MODELS TO TABLEAU

In [None]:
#>>> Deploy every trained model to the TabPy server, replacing any endpoint of the same name.
#    NOTE(review): the fitted estimator objects themselves are passed to deploy();
#    confirm the Tableau side can call these endpoints, and that it sends features
#    scaled the same way as X_train_scaled.
for endpoint, model, description in (
    ('SVM', svm_model, 'Support Vector Machine Model'),
    ('LRM', log_model, 'Logistic Regression Model'),
    ('DTM', dt_model, 'Decision Tree Model'),
    ('GBT', gbt_model, 'Gradient Boosted Tree'),
    ('RFM', rf_model, 'Random Forests Model'),
):
    client.deploy(endpoint, model, description, override=True)