In [47]:
import pandas as pd
import numpy as np
import mysql.connector
import shutil
import os

from imblearn.over_sampling import SMOTE

from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

from configparser import ConfigParser

# Seed passed to the stochastic steps of this notebook (train/test split and SMOTE).
RANDOM_STATE = 123

# Numeric model inputs, the target column, and the full list of columns read from MySQL.
FEATURES = [
    'est_diameter_min',
    'est_diameter_max',
    'relative_velocity',
    'miss_distance',
    'absolute_magnitude',
]
LABEL = 'hazardous'
COLUMNS = ['id', 'name', *FEATURES, LABEL]

# The MySQL password is read from a local config file (section "mysql", key "password")
# instead of being written into the notebook.
parser = ConfigParser()
parser.read('pass.cfg')
mysql_pass = parser.get('mysql', 'password')

#### Split the dataset into train/test sets so that the CatBoost and GBTClassifier models are fitted under the same conditions
- stratified by classes
- use SMOTE to oversample the positive examples
- ignore sentry_object feature, because it always equals 0
- ignore orbiting_body feature, because it always equals 'Earth'

In [48]:
# Open a MySQL session as root and make NeoDB the default schema for the queries below.
db_connection = mysql.connector.connect(
    user="root",
    password=mysql_pass,
)
db_cursor = db_connection.cursor()
db_cursor.execute('USE NeoDB;')

In [65]:
def _class_balance(frame):
    """Return (row count, positive count, positive share in percent rounded to 3 places)."""
    total = frame.shape[0]
    positives = sum(frame.hazardous)
    return total, positives, np.round(positives / total * 100, 3)


# Read every column of interest from the Neo table into a DataFrame.
db_cursor.execute(f'SELECT {",".join(COLUMNS)} FROM Neo')
df = pd.DataFrame(db_cursor.fetchall())
df.columns = COLUMNS

# Stratified 90/10 split. The label stays inside the frames, so the separately
# returned label vectors are discarded.
X_train, X_test, _, _ = train_test_split(
    df[COLUMNS],
    df[LABEL],
    test_size=0.1,
    stratify=df[LABEL],
    random_state=RANDOM_STATE,
)

n_train, n_train_pos, train_pct = _class_balance(X_train)
print(f'Train set: {n_train}, positive: {n_train_pos}/{train_pct}%')
n_test, n_test_pos, test_pct = _class_balance(X_test)
print(f'Test set: {n_test}, positive: {n_test_pos}/{test_pct}%')

# Do oversampling for train set (features only; the label is re-attached afterwards)
sm = SMOTE(random_state=RANDOM_STATE)
X_train_aug, y_train_aug = sm.fit_resample(X_train[FEATURES], X_train.hazardous)
X_train_aug['hazardous'] = y_train_aug
print('--')
n_train_aug, n_train_aug_pos, train_aug_pct = _class_balance(X_train_aug)
print(f'Train set with oversampling: {n_train_aug}, positive: {n_train_aug_pos}/{train_aug_pct}%')

def df2mysqlstring(df):
    """Render a DataFrame as the VALUES payload of a multi-row SQL INSERT.

    Each row becomes ``(v1,v2,...)`` and rows are joined with commas, e.g.
    ``(1,0.5),(2,0.7)``. Missing values (NaN/None/NaT) are written as ``NULL``.

    Values are inserted with ``str()`` and are NOT quoted or escaped: the caller
    must wrap string columns in quotes before calling this function.

    Args:
        df: DataFrame whose columns are in the same order as the INSERT column list.

    Returns:
        The comma-separated tuple list as a string; empty string for an empty frame.
    """
    def _literal(value):
        # str(nan) / str(None) would produce `nan` / `None`, which are not SQL literals
        return 'NULL' if pd.isna(value) else str(value)

    return ','.join(
        '(' + ','.join(_literal(value) for value in row) + ')'
        for row in df.itertuples(index=False, name=None)
    )

def _sql_param(value):
    """Convert one DataFrame cell into a value the MySQL driver can bind."""
    if pd.isna(value):
        return None              # missing values are stored as SQL NULL
    if isinstance(value, np.generic):
        return value.item()      # unwrap NumPy scalars into plain Python types
    return value


def _replace_table(cursor, table, column_types, frame, batch_size=10000):
    """Drop and recreate `table`, then bulk-insert `frame` through a parameterized INSERT.

    Args:
        cursor: open MySQL cursor.
        table: name of the table to (re)create.
        column_types: mapping ``column name -> SQL type``; its order defines the
            column order of both the table and the INSERT.
        frame: DataFrame containing at least the columns named in `column_types`.
        batch_size: number of rows sent per ``executemany`` call.
    """
    column_names = list(column_types)
    column_ddl = ', '.join(f'{name} {sql_type}' for name, sql_type in column_types.items())
    cursor.execute(f'DROP TABLE IF EXISTS {table}')
    cursor.execute(f'CREATE TABLE {table} ({column_ddl});')

    # Placeholders let the driver quote/escape every value, so names containing
    # quotes are stored correctly and the source frame never has to be modified.
    placeholders = ', '.join(['%s'] * len(column_names))
    insert_sql = f'INSERT INTO {table} ({", ".join(column_names)}) VALUES ({placeholders})'
    rows = [tuple(_sql_param(value) for value in row)
            for row in frame[column_names].itertuples(index=False, name=None)]
    for start in range(0, len(rows), batch_size):
        cursor.executemany(insert_sql, rows[start:start + batch_size])
    cursor.execute('FLUSH TABLES;')


# All model features are stored as FLOAT, in the order given by FEATURES.
feature_column_types = {name: 'FLOAT' for name in FEATURES}

train_table = 'NeoTrain'
_replace_table(
    db_cursor,
    train_table,
    {**feature_column_types, 'hazardous': 'BOOLEAN'},
    X_train_aug,
)

test_table = 'NeoTest'
_replace_table(
    db_cursor,
    test_table,
    {'id': 'INT', 'name': 'VARCHAR(1000)', **feature_column_types, 'hazardous': 'BOOLEAN'},
    X_test,
)

# Commit explicitly so the inserted rows persist regardless of the connection's autocommit setting.
db_connection.commit()


Train set: 81752, positive: 7956/9.732%
Test set: 9084, positive: 884/9.731%
--
Train set with oversampling: 147592, positive: 73796/50.0%


#### Train CatBoost

Load the train/test data from MySQL, then split the training data into training and validation sets.

In [90]:
def _read_table(cursor, table, columns):
    """Select `columns` from `table` and return the rows as a DataFrame with those column names."""
    cursor.execute(f'SELECT {",".join(columns)} FROM {table}')
    frame = pd.DataFrame(cursor.fetchall())
    frame.columns = columns
    return frame


# Oversampled training rows (features + label) and the untouched test rows.
train_full = _read_table(db_cursor, 'NeoTrain', FEATURES + ["hazardous"])
X_test = _read_table(db_cursor, 'NeoTest', COLUMNS)
y_test = X_test.hazardous

# The first len(X_test) training rows become the validation set; the rest is used for fitting.
n_val = X_test.shape[0]
X_val, X_train = train_full.iloc[:n_val], train_full.iloc[n_val:]
y_val, y_train = X_val.hazardous, X_train.hazardous

In [97]:
def print_all_metrics(y_true, y_pred):
    """Print positive counts plus binary precision, recall and F1 (rounded to 5 places).

    Args:
        y_true: ground-truth labels (summed to count the actual positives).
        y_pred: predicted labels (summed to count the predicted positives).
    """
    n_actual, n_found = sum(y_true), sum(y_pred)
    print(f'True positive values: {n_actual}, found positive values: {n_found}')

    # The fourth element returned (support) is not reported; zip stops after three labels.
    scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    for label, score in zip(('Precision', 'Recall', 'F1'), scores):
        print(f'{label}: {np.round(score, 5)}')

# Fit CatBoost on the training rows, passing the validation rows as eval_set,
# then score the test split.
model_catboost = CatBoostClassifier(iterations=10000)
model_catboost.fit(
    X_train[FEATURES],
    y_train,
    eval_set=(X_val[FEATURES], y_val),
    verbose=False,
)
y_pred_catboost = model_catboost.predict(X_test[FEATURES])
print_all_metrics(y_test, y_pred_catboost)

True positive values: 884, found positive values: 1188
Precision: 0.42845
Recall: 0.57579
F1: 0.49131


Save CatBoost results

In [99]:
# Create and archive parquet files
prediction_df = X_test.copy()
prediction_df['y_pred'] = y_pred_catboost
outdir = './data/catboost_prediction'
# makedirs also creates the parent ./data directory and tolerates re-running the cell
os.makedirs(outdir, exist_ok=True)

prediction_df.to_parquet(os.path.join(outdir, 'result'))
shutil.make_archive('data/catboost_prediction', 'zip', outdir)

# Save to MySQL table
table = 'NeoCatBoostResult'
db_cursor.execute(f'DROP TABLE IF EXISTS {table}')
db_cursor.execute(
    f'CREATE TABLE IF NOT EXISTS {table}(\
     id INT, name VARCHAR(1000), est_diameter_min FLOAT, \
     est_diameter_max FLOAT, relative_velocity FLOAT, \
     miss_distance FLOAT, absolute_magnitude FLOAT, \
     hazardous BOOLEAN, y_pred BOOLEAN);'
)

# Parameterized insert: the driver quotes/escapes every value, so names do not
# have to be wrapped in quotes by hand and prediction_df is left unmodified.
result_columns = COLUMNS + ['y_pred']
placeholders = ', '.join(['%s'] * len(result_columns))
insert_sql = f'INSERT INTO {table} ({", ".join(result_columns)}) VALUES ({placeholders})'
result_rows = [
    tuple(
        # NaN/None -> SQL NULL; NumPy scalars -> plain Python values the driver can bind
        None if pd.isna(value) else (value.item() if isinstance(value, np.generic) else value)
        for value in row
    )
    for row in prediction_df[result_columns].itertuples(index=False, name=None)
]
if result_rows:
    db_cursor.executemany(insert_sql, result_rows)
db_cursor.execute('FLUSH TABLES;')
# Commit explicitly so the rows persist regardless of the connection's autocommit setting.
db_connection.commit()

# Save model (create the target directory first so a fresh checkout does not fail)
model_path = 'models/model_catboost.cbm'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model_catboost.save_model(model_path)

#### Train GBTClassifier [PySpark]

Load the train/test data from MySQL, then split the training data into training and validation sets.

In [182]:
import os
from pyspark.sql.functions import lit
from pyspark import SparkContext
from pyspark.sql.functions import monotonically_increasing_id

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier

# Set JAVA_HOME for this process, presumably so the Spark session created below can find a JVM.
# NOTE(review): this is a machine-specific Homebrew path; adjust it on other hosts.
os.environ['JAVA_HOME'] = '/opt/homebrew/opt/openjdk/'

In [208]:
def mysql2pyspark_df(builder, db, table, mysql_user, mysql_pass, host='localhost', port=3306):
    """Load a MySQL table into a Spark DataFrame over JDBC.

    Args:
        builder: object exposing ``.read`` (the Spark session used for loading).
        db: database (schema) name placed in the JDBC URL.
        table: table name passed as the JDBC ``dbtable`` option.
        mysql_user: MySQL user name.
        mysql_pass: MySQL password.
        host: MySQL host name (defaults to the previously hard-coded ``localhost``).
        port: MySQL port (defaults to the previously hard-coded ``3306``).

    Returns:
        Whatever ``.load()`` returns for the configured JDBC reader.
    """
    # Use the session that was passed in rather than a module-level `spark` global,
    # so the function works regardless of what the caller named its session.
    reader = (
        builder.read.format('jdbc')
        .option('url', f'jdbc:mysql://{host}:{port}/{db}')
        .option('driver', 'com.mysql.cj.jdbc.Driver')
        .option('dbtable', table)
        .option('user', mysql_user)
        .option('password', mysql_pass)
    )
    return reader.load()

# Spark session with the MySQL JDBC driver jar on the classpath.
spark = (
    SparkSession.builder.appName('Neo')
    .config('spark.jars', 'mysql-connector-java-8.0.30/mysql-connector-java-8.0.30.jar')
    .getOrCreate()
)

# The label is cast to float because it is later used as a numeric label / metric input.
df_train = mysql2pyspark_df(spark, 'NeoDB', 'NeoTrain', 'root', mysql_pass)
df_train = df_train.withColumn('hazardous', df_train.hazardous.cast('float'))
df_test = mysql2pyspark_df(spark, 'NeoDB', 'NeoTest', 'root', mysql_pass)
df_test = df_test.withColumn('hazardous', df_test.hazardous.cast('float'))

# Split train into train/val: rows whose index is below the test-set size form the
# validation set. The count is computed once instead of once per filter.
# NOTE(review): monotonically_increasing_id guarantees unique, increasing ids but not
# consecutive ones across partitions, so the validation set has exactly n_val rows
# only when NeoTrain is read as a single partition. Confirm.
n_val = df_test.count()
df_train = df_train.withColumn('index', monotonically_increasing_id())
df_val = df_train.where(df_train.index < n_val).withColumn('valIndicator', lit(True))
df_train = df_train.where(df_train.index >= n_val).withColumn('valIndicator', lit(False))
df_train_val = df_train.union(df_val)

# Only the real features go into the vector. `valIndicator` stays a separate column
# (VectorAssembler keeps the other columns), so train and test vectors have the same
# dimensionality and the split marker is not used as a model input.
assembler = VectorAssembler(inputCols=FEATURES, outputCol='features')
X_train_val_vec = assembler.transform(df_train_val)
X_test_vec = assembler.transform(df_test)

In [210]:
# Configure the gradient-boosted trees classifier; rows flagged by `valIndicator`
# are treated as the validation set during fitting.
gbt_clf = GBTClassifier(labelCol='hazardous', featuresCol='features')
gbt_clf = gbt_clf.setValidationIndicatorCol('valIndicator')
model_gbt = gbt_clf.fit(X_train_val_vec)

In [220]:
# Score the test set and preview a few predictions next to the ground truth.
y_pred_gbt = model_gbt.transform(X_test_vec)
preview_columns = ('name', 'prediction', 'hazardous')
y_pred_gbt.select(*preview_columns).show(5)

+----------+----------+---------+
|      name|prediction|hazardous|
+----------+----------+---------+
|(2021 CR1)|       0.0|      0.0|
|(2019 XN2)|       0.0|      0.0|
| (2016 JE)|       1.0|      0.0|
|(2018 MG7)|       1.0|      1.0|
|(2019 GV5)|       0.0|      0.0|
+----------+----------+---------+
only showing top 5 rows



In [225]:
def print_all_metrics_spark(df_with_pred):
    """Print positive counts plus precision, recall and F1 for label 1.0.

    Args:
        df_with_pred: Spark DataFrame with numeric `prediction` and `hazardous` columns.
    """
    # Imported here because the notebook's import cells never bring this name into scope.
    from pyspark.mllib.evaluation import MulticlassMetrics

    # Only the two columns that are needed are converted to an RDD of (prediction, label).
    score_and_labels = (
        df_with_pred.select('prediction', 'hazardous')
        .rdd.map(lambda row: (row.prediction, row.hazardous))
    )
    metrics = MulticlassMetrics(score_and_labels)

    y_true = df_with_pred.agg({'hazardous': 'sum'}).collect()[0][0]
    y_pred = df_with_pred.agg({'prediction': 'sum'}).collect()[0][0]
    print(f'True positive values: {y_true}, found positive values: {y_pred}')
    print(f'Precision: {np.round(metrics.precision(1.0), 5)}')
    print(f'Recall: {np.round(metrics.recall(1.0), 5)}')
    print(f'F1: {np.round(metrics.fMeasure(1.0), 5)}')

# Report precision/recall/F1 for the GBT predictions computed from X_test_vec.
print_all_metrics_spark(y_pred_gbt)

True positive values: 884.0, found positive values: 2734.0
Precision: 0.31346
Recall: 0.96946
F1: 0.47374


[Stage 538:>                                                        (0 + 1) / 1]                                                                                