In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Collecting sklearn
  Downloading https://files.pythonhosted.org/packages/1e/7a/dbb3be0ce9bd5c8b7e3d87328e79063f8b263b2b1bfa4774cb1147bfcd3f/sklearn-0.0.tar.gz
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py): started
  Building wheel for sklearn (setup.py): finished with status 'done'
  Stored in directory: C:\Users\Yanuo Zhou\AppData\Local\pip\Cache\wheels\76\03\bb\589d421d27431bcd2c6da284d5f2286c8e3b2ea3cf1594c074
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [3]:
# Set features. This will also be used as your x values.
# Only drop those that will cause colinearity issues, as we do not know which variable can be useful. Just throw everything
# into the algorithms
X = df.drop(['koi_period_err2', 'koi_time0bk_err2', 'koi_depth_err2', 'koi_disposition'], axis = 1)
print(X.shape)

(6991, 37)


# Create a Train Test Split

Use `koi_disposition` for the y values

In [4]:
y = df['koi_disposition']

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [5]:
# Scale the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Step 2: Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

# Train the Model



In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Create a Neural Network model here
model2 = Sequential()
model2.add(Dense(units = 6, activation = 'relu', input_dim = X_train_scaled.shape[1]))
model2.add(Dense(units = 4, activation = 'relu'))
model2.add(Dense(units = y_train_categorical.shape[1], activation = 'softmax'))

# Compile the model using categorical_crossentropy for the loss function, the adam optimizer,
# and add accuracy to the training metrics
model2.compile(optimizer='adam',
               loss ='categorical_crossentropy',
               metrics=['accuracy'])

# Use the training data to fit (train) the model
model2.fit(X_train_scaled,
           y_train_categorical,
           epochs=100,
           shuffle=True,
           verbose = 2
         )

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Train on 5243 samples
Epoch 1/100
5243/5243 - 0s - loss: 0.9923 - acc: 0.6103
Epoch 2/100
5243/5243 - 0s - loss: 0.6494 - acc: 0.7578
Epoch 3/100
5243/5243 - 0s - loss: 0.4656 - acc: 0.7814
Epoch 4/100
5243/5243 - 0s - loss: 0.4115 - acc: 0.7936
Epoch 5/100
5243/5243 - 0s - loss: 0.3930 - acc: 0.7957
Epoch 6/100
5243/5243 - 0s - loss: 0.3805 - acc: 0.8055
Epoch 7/100
5243/5243 - 0s - loss: 0.3725 - acc: 0.8098
Epoch 8/100
5243/5243 - 0s - loss: 0.3648 - acc: 0.8220
Epoch 9/100
5243/5243 - 0s - loss: 0.3592 - acc: 0.8211
Epoch 10/100
5243/5243 - 0s - loss: 0.3549 - acc: 0.8219
Epoch 11/100
5243/5243 - 0s - loss: 0.3507 - acc: 0.8264
Epoch 12/100
5243/5243 - 0s - loss: 0.3472 - acc: 0.8289
Epoch 13/100
5243/5243 - 0s - loss: 0.3443 - acc: 0.8331
Epoch 14/100
5243/5243 - 0s - loss: 0.3420 - acc: 0.8356
Epoch 15/100
5243/5243 - 0s - loss: 0.3387 - acc: 0.8352
Epoch 16/100
5243/5243 - 0s - loss: 0.3361 - acc: 0

<tensorflow.python.keras.callbacks.History at 0x1fc6afc2eb8>

In [7]:
print(f"Training Data Score: {model2.evaluate(X_train_scaled, y_train_categorical)[1]}")
print(f"Testing Data Score: {model2.evaluate(X_test_scaled, y_test_categorical)[1]}")

Training Data Score: 0.8891856074333191
Testing Data Score: 0.8792906403541565


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [13]:
# Use scikit-learn to grid search the batch size and epochs
import numpy
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
# Function to create model, required for KerasClassifier
def create_model(neurons=6):
# create model
    model2 = Sequential()
    model2.add(Dense(units = neurons, activation = 'relu', input_dim = 37))
    model2.add(Dense(units = 4, activation = 'relu'))
    model2.add(Dense(units = 3, activation='softmax'))
# Compile model
    model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model2
# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)

# create model
model2 = KerasClassifier(build_fn=create_model, verbose=2)
# define the grid search parameters
epochs = [50, 100, 150]
neurons = [6, 8, 10]
param_grid = dict(neurons = neurons, epochs = epochs)
grid = GridSearchCV(estimator=model2, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train_scaled, y_train_categorical)

Train on 5243 samples
Epoch 1/150
5243/5243 - 0s - loss: 1.0003 - acc: 0.5895
Epoch 2/150
5243/5243 - 0s - loss: 0.7468 - acc: 0.7265
Epoch 3/150
5243/5243 - 0s - loss: 0.5248 - acc: 0.7379
Epoch 4/150
5243/5243 - 0s - loss: 0.4294 - acc: 0.7372
Epoch 5/150
5243/5243 - 0s - loss: 0.4038 - acc: 0.7616
Epoch 6/150
5243/5243 - 0s - loss: 0.3948 - acc: 0.7658
Epoch 7/150
5243/5243 - 0s - loss: 0.3906 - acc: 0.7814
Epoch 8/150
5243/5243 - 0s - loss: 0.3887 - acc: 0.7700
Epoch 9/150
5243/5243 - 0s - loss: 0.3876 - acc: 0.7891
Epoch 10/150
5243/5243 - 0s - loss: 0.3848 - acc: 0.7873
Epoch 11/150
5243/5243 - 0s - loss: 0.3830 - acc: 0.7932
Epoch 12/150
5243/5243 - 0s - loss: 0.3821 - acc: 0.8051
Epoch 13/150
5243/5243 - 0s - loss: 0.3793 - acc: 0.8020
Epoch 14/150
5243/5243 - 0s - loss: 0.3757 - acc: 0.8123
Epoch 15/150
5243/5243 - 0s - loss: 0.3715 - acc: 0.8224
Epoch 16/150
5243/5243 - 0s - loss: 0.3668 - acc: 0.8249
Epoch 17/150
5243/5243 - 0s - loss: 0.3626 - acc: 0.8238
Epoch 18/150
5243/

Epoch 144/150
5243/5243 - 0s - loss: 0.2593 - acc: 0.8867
Epoch 145/150
5243/5243 - 0s - loss: 0.2520 - acc: 0.8961
Epoch 146/150
5243/5243 - 0s - loss: 0.2546 - acc: 0.8947
Epoch 147/150
5243/5243 - 0s - loss: 0.2530 - acc: 0.8951
Epoch 148/150
5243/5243 - 0s - loss: 0.2526 - acc: 0.8947
Epoch 149/150
5243/5243 - 0s - loss: 0.2528 - acc: 0.8949
Epoch 150/150
5243/5243 - 0s - loss: 0.2540 - acc: 0.8945


In [14]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.878505 using {'epochs': 150, 'neurons': 8}
0.724776 (0.158537) with: {'epochs': 50, 'neurons': 6}
0.831394 (0.048773) with: {'epochs': 50, 'neurons': 8}
0.830059 (0.047905) with: {'epochs': 50, 'neurons': 10}
0.865916 (0.003819) with: {'epochs': 100, 'neurons': 6}
0.876597 (0.006866) with: {'epochs': 100, 'neurons': 8}
0.872973 (0.003719) with: {'epochs': 100, 'neurons': 10}
0.856380 (0.013045) with: {'epochs': 150, 'neurons': 6}
0.878505 (0.007162) with: {'epochs': 150, 'neurons': 8}
0.872020 (0.002749) with: {'epochs': 150, 'neurons': 10}


# Test the Accuracy of the Tuned Model on the Test Data

In [15]:
# Build the tuned model
model2 = Sequential()
model2.add(Dense(units = 8, activation = 'relu', input_dim = 37))
model2.add(Dense(units = 4, activation = 'relu'))
model2.add(Dense(units = 3, activation='softmax'))
# Compile model
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the model with test data
model2.fit(X_test_scaled,
           y_test_categorical,
           epochs=150,
           verbose = 2
         )

Train on 1748 samples
Epoch 1/150
1748/1748 - 0s - loss: 1.0659 - acc: 0.4199
Epoch 2/150
1748/1748 - 0s - loss: 1.0141 - acc: 0.5011
Epoch 3/150
1748/1748 - 0s - loss: 0.9709 - acc: 0.5017
Epoch 4/150
1748/1748 - 0s - loss: 0.9178 - acc: 0.5046
Epoch 5/150
1748/1748 - 0s - loss: 0.8457 - acc: 0.5280
Epoch 6/150
1748/1748 - 0s - loss: 0.7647 - acc: 0.5795
Epoch 7/150
1748/1748 - 0s - loss: 0.6932 - acc: 0.6808
Epoch 8/150
1748/1748 - 0s - loss: 0.6392 - acc: 0.7271
Epoch 9/150
1748/1748 - 0s - loss: 0.5990 - acc: 0.7391
Epoch 10/150
1748/1748 - 0s - loss: 0.5683 - acc: 0.7391
Epoch 11/150
1748/1748 - 0s - loss: 0.5451 - acc: 0.7746
Epoch 12/150
1748/1748 - 0s - loss: 0.5262 - acc: 0.7763
Epoch 13/150
1748/1748 - 0s - loss: 0.5094 - acc: 0.7769
Epoch 14/150
1748/1748 - 0s - loss: 0.4953 - acc: 0.7769
Epoch 15/150
1748/1748 - 0s - loss: 0.4841 - acc: 0.7929
Epoch 16/150
1748/1748 - 0s - loss: 0.4737 - acc: 0.7935
Epoch 17/150
1748/1748 - 0s - loss: 0.4635 - acc: 0.7992
Epoch 18/150
1748/

Epoch 144/150
1748/1748 - 0s - loss: 0.2840 - acc: 0.8856
Epoch 145/150
1748/1748 - 0s - loss: 0.2830 - acc: 0.8816
Epoch 146/150
1748/1748 - 0s - loss: 0.2841 - acc: 0.8804
Epoch 147/150
1748/1748 - 0s - loss: 0.2842 - acc: 0.8793
Epoch 148/150
1748/1748 - 0s - loss: 0.2840 - acc: 0.8816
Epoch 149/150
1748/1748 - 0s - loss: 0.2812 - acc: 0.8833
Epoch 150/150
1748/1748 - 0s - loss: 0.2853 - acc: 0.8856


<tensorflow.python.keras.callbacks.History at 0x1fc710d57b8>

In [16]:
# Print the model accuracy
print(f"Testing Data Score: {model2.evaluate(X_test_scaled, y_test_categorical)[1]}")

Testing Data Score: 0.8867276906967163


#### This model is a little bit better than the previous logistic regression model.

# Save the Model

In [18]:
# Save the model
model2.save("Yanuo_Zhou.h5")