In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select features (variables)

In [3]:
# Only drop those that will cause colinearity issues, as we do not know which variable can be useful.
X = df.drop(['koi_period_err2', 'koi_time0bk_err2', 'koi_depth_err2', 'koi_disposition'], axis = 1)
print(X.shape)

(6991, 37)


# Create a Train Test Split

Use `koi_disposition` for the y values

In [4]:
y = df['koi_disposition']

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [5]:
# Scale the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Step 2: Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

# Train the Model



In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Create a Neural Network Deep Learning model here (based on the dimension of X)
model2 = Sequential()
model2.add(Dense(units = 30, activation = 'relu', input_dim = X_train_scaled.shape[1]))
model2.add(Dense(units = 15, activation = 'relu'))
model2.add(Dense(units = 6, activation = 'relu'))
model2.add(Dense(units = y_train_categorical.shape[1], activation = 'softmax'))

# Compile the model using categorical_crossentropy for the loss function, the adam optimizer,
# and add accuracy to the training metrics
model2.compile(optimizer='adam',
               loss ='categorical_crossentropy',
               metrics=['accuracy'])

# Use the training data to fit (train) the model
model2.fit(X_train_scaled,
           y_train_categorical,
           epochs=100,
           shuffle=True,
           verbose = 0
         )

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


<tensorflow.python.keras.callbacks.History at 0x22094a6fef0>

In [7]:
print(f"Training Data Score: {model2.evaluate(X_train_scaled, y_train_categorical)[1]}")
print(f"Testing Data Score: {model2.evaluate(X_test_scaled, y_test_categorical)[1]}")

Training Data Score: 0.9063513278961182
Testing Data Score: 0.8930205702781677


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [8]:
# Use scikit-learn to grid search the batch size and epochs
import numpy
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
# Function to create model, required for KerasClassifier
def create_model(neurons_L1=30, neurons_L2=15):
# create model
    model2 = Sequential()
    model2.add(Dense(units = neurons_L1, activation = 'relu', input_dim = 37))
    model2.add(Dense(units = neurons_L2, activation = 'relu'))
    model2.add(Dense(units = 6, activation = 'relu'))
    model2.add(Dense(units = 3, activation='softmax'))
# Compile model
    model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model2
# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)

# create model
model2 = KerasClassifier(build_fn=create_model, verbose=2)
# define the grid search parameters
epochs = [100, 150, 200]
neurons_L1 = [30, 25]
neurons_L2 = [20, 15, 10]
param_grid = dict(neurons_L1 = neurons_L1, neurons_L2 = neurons_L2, epochs = epochs)
grid = GridSearchCV(estimator=model2, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train_scaled, y_train_categorical, verbose = 0)

Using TensorFlow backend.


In [9]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.892809 using {'epochs': 200, 'neurons_L1': 30, 'neurons_L2': 10}
0.884036 (0.010243) with: {'epochs': 100, 'neurons_L1': 30, 'neurons_L2': 20}
0.887278 (0.002482) with: {'epochs': 100, 'neurons_L1': 30, 'neurons_L2': 15}
0.874690 (0.002506) with: {'epochs': 100, 'neurons_L1': 30, 'neurons_L2': 10}
0.860576 (0.029037) with: {'epochs': 100, 'neurons_L1': 25, 'neurons_L2': 20}
0.878505 (0.011757) with: {'epochs': 100, 'neurons_L1': 25, 'neurons_L2': 15}
0.879268 (0.004644) with: {'epochs': 100, 'neurons_L1': 25, 'neurons_L2': 10}
0.884417 (0.008099) with: {'epochs': 150, 'neurons_L1': 30, 'neurons_L2': 20}
0.874499 (0.019557) with: {'epochs': 150, 'neurons_L1': 30, 'neurons_L2': 15}
0.885752 (0.006027) with: {'epochs': 150, 'neurons_L1': 30, 'neurons_L2': 10}
0.881747 (0.006597) with: {'epochs': 150, 'neurons_L1': 25, 'neurons_L2': 20}
0.891093 (0.008332) with: {'epochs': 150, 'neurons_L1': 25, 'neurons_L2': 15}
0.884990 (0.016262) with: {'epochs': 150, 'neurons_L1': 25, 'neurons_

# Test the Accuracy of the Tuned Model on the Test Data

In [11]:
# Build the tuned model
model2 = Sequential()
model2.add(Dense(units = 30, activation = 'relu', input_dim = 37))
model2.add(Dense(units = 10, activation = 'relu'))
model2.add(Dense(units = 6, activation = 'relu'))
model2.add(Dense(units = 3, activation='softmax'))
# Compile model
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the model with test data
model2.fit(X_test_scaled,
           y_test_categorical,
           epochs=200,
           verbose = 0
         )

<tensorflow.python.keras.callbacks.History at 0x22099808f60>

In [12]:
# Print the model accuracy
print(f"Testing Data Score: {model2.evaluate(X_test_scaled, y_test_categorical)[1]}")

Testing Data Score: 0.8987414240837097


#### This model is a little bit better than the logistic regression model, even with the help of grid search.

# Save the Model

In [13]:
# Save the model
model2.save("NN_grid.h5")