In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in /Users/serenabaker/opt/anaconda3/lib/python3.8/site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
#!pip install tensorflow
# Set the seed value for the notebook so the results are reproducible
from numpy.random import seed
seed(1)

In [4]:
# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
#from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.callbacks import EarlyStopping

# Read the CSV and Perform Basic Data Cleaning

In [5]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [6]:
#df = pd.get_dummies(df)
#df.head()
#df.columns

## KOI (Kepler Object of Interest) Columns:
### koi_disposition (y-value) = Exoplanet Archive Disposition (Confirmed = 1)
#### koi_period = Orbital Period (Days)
#### koi_time0bk = Transit Epoch (Center of first detected transit)
#### koi_impact = Impact Parameter (Distance from center of planet/stellar disc)
#### koi_duration = Transit Duration (Hours-Between planet/star first/last contact)
#### koi_prad = Planetary Radius
#### koi_teq = Equilibrium Temperature (Kelvin)
#### koi_model_snr = Transit depth normalized by mean uncertainty

# Select your features (columns)

In [7]:
# Set features. This will also be used as your x values.
# selected_features = df[['koi_disposition', 'koi_fpflag_co', 'koi_fpflag_ss', 'koi_model_snr']]
# selected_features.head()

# Create a Train Test Split

Use `koi_disposition` for the y values

In [8]:
X = df.drop('koi_disposition', axis=1).astype(int)
y = df['koi_disposition']
print(X.shape, y.shape)
X.dtypes


(6991, 40) (6991,)


koi_fpflag_nt        int64
koi_fpflag_ss        int64
koi_fpflag_co        int64
koi_fpflag_ec        int64
koi_period           int64
koi_period_err1      int64
koi_period_err2      int64
koi_time0bk          int64
koi_time0bk_err1     int64
koi_time0bk_err2     int64
koi_impact           int64
koi_impact_err1      int64
koi_impact_err2      int64
koi_duration         int64
koi_duration_err1    int64
koi_duration_err2    int64
koi_depth            int64
koi_depth_err1       int64
koi_depth_err2       int64
koi_prad             int64
koi_prad_err1        int64
koi_prad_err2        int64
koi_teq              int64
koi_insol            int64
koi_insol_err1       int64
koi_insol_err2       int64
koi_model_snr        int64
koi_tce_plnt_num     int64
koi_steff            int64
koi_steff_err1       int64
koi_steff_err2       int64
koi_slogg            int64
koi_slogg_err1       int64
koi_slogg_err2       int64
koi_srad             int64
koi_srad_err1        int64
koi_srad_err2        int64
r

In [9]:
#X = pd.get_dummies(X)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.5, random_state=1, stratify=y)
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
3382,0,0,1,0,1,0,0,132,0,0,...,-153,4,0,0,0,0,0,285,45,15
4054,0,1,1,0,0,0,0,132,0,0,...,-210,4,0,0,0,0,0,294,38,15
6361,0,1,0,0,1,0,0,132,0,0,...,-277,4,0,0,1,0,0,286,45,11
5005,0,0,0,0,47,0,0,135,0,0,...,-167,4,0,0,0,0,0,295,51,14
1702,0,0,0,0,7,0,0,138,0,0,...,-78,3,0,0,2,0,0,290,40,14


In [10]:
# # Step 1: Label-encode data set
# label_encoder = LabelEncoder()
# label_encoder.fit(y_train)
# encoded_y_train = label_encoder.transform(y_train)
# encoded_y_test = label_encoder.transform(y_test)

In [11]:
# # Step 2: Convert encoded labels to one-hot-encoding
# y_train_categorical = to_categorical(encoded_y_train)
# y_test_categorical = to_categorical(encoded_y_test)

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [12]:
# Scale your data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [13]:
print(y_train_categorical)

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 ...
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]]


# Train the Model



In [17]:
# Create model and add layers
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=40))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=3, activation='softmax'))

In [18]:
# Compile and fit the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 100)               4100      
_________________________________________________________________
dense_4 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 303       
Total params: 14,503
Trainable params: 14,503
Non-trainable params: 0
_________________________________________________________________


In [19]:
early_stopping = [EarlyStopping(monitor='val_loss', patience=2)]
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=60,
    shuffle=True,
    verbose=2,
    callbacks=early_stopping
)

Epoch 1/60
110/110 - 0s - loss: 0.6323 - accuracy: 0.6976
Epoch 2/60
110/110 - 0s - loss: 0.3726 - accuracy: 0.8009
Epoch 3/60
110/110 - 0s - loss: 0.3464 - accuracy: 0.8074
Epoch 4/60
110/110 - 0s - loss: 0.3444 - accuracy: 0.8117
Epoch 5/60
110/110 - 0s - loss: 0.3348 - accuracy: 0.8240
Epoch 6/60
110/110 - 0s - loss: 0.3280 - accuracy: 0.8343
Epoch 7/60
110/110 - 0s - loss: 0.3299 - accuracy: 0.8263
Epoch 8/60
110/110 - 0s - loss: 0.3298 - accuracy: 0.8272
Epoch 9/60
110/110 - 0s - loss: 0.3247 - accuracy: 0.8283
Epoch 10/60
110/110 - 0s - loss: 0.3220 - accuracy: 0.8361
Epoch 11/60
110/110 - 0s - loss: 0.3272 - accuracy: 0.8340
Epoch 12/60
110/110 - 0s - loss: 0.3261 - accuracy: 0.8318
Epoch 13/60
110/110 - 0s - loss: 0.3167 - accuracy: 0.8378
Epoch 14/60
110/110 - 0s - loss: 0.3144 - accuracy: 0.8398
Epoch 15/60
110/110 - 0s - loss: 0.3247 - accuracy: 0.8326
Epoch 16/60
110/110 - 0s - loss: 0.3125 - accuracy: 0.8383
Epoch 17/60
110/110 - 0s - loss: 0.3112 - accuracy: 0.8418
Epoch 

Epoch 45/60
110/110 - 0s - loss: 0.2985 - accuracy: 0.8498
Epoch 46/60
110/110 - 0s - loss: 0.3014 - accuracy: 0.8426
Epoch 47/60
110/110 - 0s - loss: 0.2987 - accuracy: 0.8498
Epoch 48/60
110/110 - 0s - loss: 0.2987 - accuracy: 0.8443
Epoch 49/60
110/110 - 0s - loss: 0.3019 - accuracy: 0.8401
Epoch 50/60
110/110 - 0s - loss: 0.2994 - accuracy: 0.8492
Epoch 51/60
110/110 - 0s - loss: 0.3081 - accuracy: 0.8386
Epoch 52/60
110/110 - 0s - loss: 0.2950 - accuracy: 0.8515
Epoch 53/60
110/110 - 0s - loss: 0.2997 - accuracy: 0.8486
Epoch 54/60
110/110 - 0s - loss: 0.2930 - accuracy: 0.8558
Epoch 55/60
110/110 - 0s - loss: 0.2951 - accuracy: 0.8518
Epoch 56/60
110/110 - 0s - loss: 0.2972 - accuracy: 0.8461
Epoch 57/60
110/110 - 0s - loss: 0.2934 - accuracy: 0.8418
Epoch 58/60
110/110 - 0s - loss: 0.2954 - accuracy: 0.8466
Epoch 59/60
110/110 - 0s - loss: 0.2934 - accuracy: 0.8504
Epoch 60/60
110/110 - 0s - loss: 0.2915 - accuracy: 0.8512


<tensorflow.python.keras.callbacks.History at 0x7fef6bb8f6d0>

In [21]:
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

110/110 - 0s - loss: 0.3966 - accuracy: 0.8335
Normal Neural Network - Loss: 0.3965592086315155, Accuracy: 0.8335240483283997


In [22]:
encoded_predictions = model.predict_classes(X_test_scaled[:5])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)



In [23]:
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:5])}")

Predicted classes: ['CONFIRMED' 'CANDIDATE' 'FALSE POSITIVE' 'FALSE POSITIVE'
 'FALSE POSITIVE']
Actual Labels: ['CONFIRMED', 'CANDIDATE', 'FALSE POSITIVE', 'FALSE POSITIVE', 'FALSE POSITIVE']


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [24]:
# Create the GridSearchCV model
model = SVC(kernel='linear')
model

SVC(kernel='linear')

In [25]:
SVC(kernel='linear')

SVC(kernel='linear')

In [26]:
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [27]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.838, total=   0.1s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.847, total=   0.1s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.833, total=   0.1s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.834, total=   0.1s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.824, total=   0.1s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.838, total=   0.1s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.847, total=   0.1s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.833, total=   0.1s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.834, total=   0.1s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.824, total=   0.1s
[CV] C=1, gamma=0.001 ................................................
[CV] .

[CV] .................. C=50, gamma=0.0001, score=0.848, total=   0.2s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.848, total=   0.2s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.833, total=   0.2s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .................. C=50, gamma=0.0005, score=0.850, total=   0.2s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .................. C=50, gamma=0.0005, score=0.858, total=   0.2s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .................. C=50, gamma=0.0005, score=0.848, total=   0.2s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .................. C=50, gamma=0.0005, score=0.848, total=   0.2s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:   12.7s finished


GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10, 50],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005]},
             verbose=3)

In [29]:
GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             verbose=3)

GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             verbose=3)

In [30]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 50, 'gamma': 0.0001}
0.8474964234620886


In [31]:
 # Make predictions with the hypertuned model
predictions = grid.predict(X_test_scaled)

In [32]:
print('Test Acc: %.3f' % grid.score(X_test_scaled, y_test))

Test Acc: 0.836


In [33]:
print(classification_report(y_test, predictions,
                            target_names=["CONFIRMED", "FALSE POSITIVE", "OTHER"]))

                precision    recall  f1-score   support

     CONFIRMED       0.71      0.60      0.65       844
FALSE POSITIVE       0.68      0.74      0.71       900
         OTHER       0.97      1.00      0.99      1752

      accuracy                           0.84      3496
     macro avg       0.79      0.78      0.78      3496
  weighted avg       0.83      0.84      0.83      3496



# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash

# import joblib
# filename = 'serena_baker.sav'
# joblib.dump(model2, filename)