In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in /Users/serenabaker/opt/anaconda3/lib/python3.8/site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
# Set the seed value for the notebook so the results are reproducible
from numpy.random import seed
seed(1)

In [4]:
# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.svm import SVC 
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.callbacks import EarlyStopping

# Read the CSV and Perform Basic Data Cleaning

In [5]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()


Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [6]:
# Create a random forest classifier
target = df["koi_disposition"]
target_names = ["CONFIRMED", "FALSE POSITIVE", "OTHER"]

data = df.drop("koi_disposition", axis=1)
feature_names = data.columns
data.head()


Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,-0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,-0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,-0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=1)

In [8]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.873481057898499

In [9]:
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.908506075768406

In [10]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True) 

[(0.1082612177063831, 'koi_fpflag_co'),
 (0.1015763950594457, 'koi_fpflag_nt'),
 (0.0711712991681786, 'koi_fpflag_ss'),
 (0.05756316234637173, 'koi_model_snr'),
 (0.04962340588618611, 'koi_prad'),
 (0.03676597047394771, 'koi_prad_err2'),
 (0.03519442969272711, 'koi_fpflag_ec'),
 (0.03389279157088083, 'koi_duration_err1'),
 (0.03229429805960799, 'koi_duration_err2'),
 (0.031117081820816, 'koi_steff_err1'),
 (0.02934662121461936, 'koi_prad_err1'),
 (0.02643209737953936, 'koi_steff_err2'),
 (0.022275457277577706, 'koi_duration'),
 (0.02204909028661767, 'koi_time0bk_err1'),
 (0.020749614027627268, 'koi_insol_err1'),
 (0.02037348115661571, 'koi_time0bk_err2'),
 (0.020056419245262122, 'koi_depth'),
 (0.019655737900259106, 'koi_impact'),
 (0.01787971088798216, 'koi_period'),
 (0.017007228073151182, 'koi_period_err2'),
 (0.01586059462411828, 'koi_insol'),
 (0.015610106402076506, 'koi_insol_err2'),
 (0.014594689199827909, 'koi_period_err1'),
 (0.014418513536652501, 'koi_teq'),
 (0.0140211151481

In [11]:
df = df.drop('koi_kepmag', axis=1)

# Create a Train Test Split

Use `koi_disposition` for the y values

In [12]:
X = df.drop('koi_disposition', axis=1).astype(float)
y = df['koi_disposition']
print(X.shape, y.shape)

(6991, 39) (6991,)


In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y)
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec
3867,0.0,0.0,1.0,1.0,2.213961,2.3e-05,-2.3e-05,132.5689,0.0107,-0.0107,...,148.0,-148.0,4.505,0.084,-0.058,0.823,0.059,-0.088,284.31454,42.589321
908,0.0,0.0,0.0,0.0,9.115003,5.1e-05,-5.1e-05,133.97523,0.00469,-0.00469,...,160.0,-195.0,4.431,0.07,-0.21,1.028,0.32,-0.137,289.38351,41.392689
2222,0.0,0.0,0.0,0.0,10.183182,9.8e-05,-9.8e-05,133.21404,0.00752,-0.00752,...,104.0,-127.0,4.421,0.04,-0.12,1.084,0.177,-0.071,287.97885,42.679359
2094,0.0,0.0,0.0,0.0,3.089428,3.4e-05,-3.4e-05,134.1999,0.0101,-0.0101,...,80.0,-86.0,4.418,0.028,-0.105,1.103,0.175,-0.062,293.43079,46.692089
775,0.0,1.0,0.0,0.0,9.085905,1.2e-05,-1.2e-05,182.08162,0.00113,-0.00113,...,177.0,-195.0,4.479,0.091,-0.169,0.866,0.21,-0.113,296.03342,40.019611


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [14]:
# Scale data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

# Train the Model



In [15]:
deep_model = Sequential()
deep_model.add(Dense(units=6, activation='relu', input_dim=39))
deep_model.add(Dense(units=6, activation='relu'))
deep_model.add(Dense(units=3, activation='softmax'))
deep_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 6)                 240       
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 42        
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 21        
Total params: 303
Trainable params: 303
Non-trainable params: 0
_________________________________________________________________


In [16]:
early_stopping = [EarlyStopping(monitor='val_loss', patience=2)]

deep_model.compile(optimizer='adam',
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])

deep_model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2,
    callbacks=early_stopping
)

Epoch 1/100
175/175 - 1s - loss: 1.0836 - accuracy: 0.4483
Epoch 2/100
175/175 - 0s - loss: 0.9492 - accuracy: 0.5014
Epoch 3/100
175/175 - 0s - loss: 0.6446 - accuracy: 0.7246
Epoch 4/100
175/175 - 0s - loss: 0.4724 - accuracy: 0.7527
Epoch 5/100
175/175 - 0s - loss: 0.4136 - accuracy: 0.7530
Epoch 6/100
175/175 - 0s - loss: 0.3970 - accuracy: 0.7623
Epoch 7/100
175/175 - 0s - loss: 0.3887 - accuracy: 0.7949
Epoch 8/100
175/175 - 0s - loss: 0.3831 - accuracy: 0.7983
Epoch 9/100
175/175 - 0s - loss: 0.3788 - accuracy: 0.8106
Epoch 10/100
175/175 - 0s - loss: 0.3740 - accuracy: 0.8151
Epoch 11/100
175/175 - 0s - loss: 0.3695 - accuracy: 0.8180
Epoch 12/100
175/175 - 0s - loss: 0.3635 - accuracy: 0.8197
Epoch 13/100
175/175 - 0s - loss: 0.3582 - accuracy: 0.8262
Epoch 14/100
175/175 - 0s - loss: 0.3525 - accuracy: 0.8249
Epoch 15/100
175/175 - 0s - loss: 0.3498 - accuracy: 0.8274
Epoch 16/100
175/175 - 0s - loss: 0.3448 - accuracy: 0.8344
Epoch 17/100
175/175 - 0s - loss: 0.3417 - accura

Epoch 45/100
175/175 - 0s - loss: 0.2966 - accuracy: 0.8623
Epoch 46/100
175/175 - 0s - loss: 0.2958 - accuracy: 0.8639
Epoch 47/100
175/175 - 0s - loss: 0.2949 - accuracy: 0.8664
Epoch 48/100
175/175 - 0s - loss: 0.2943 - accuracy: 0.8652
Epoch 49/100
175/175 - 0s - loss: 0.2941 - accuracy: 0.8625
Epoch 50/100
175/175 - 0s - loss: 0.2941 - accuracy: 0.8643
Epoch 51/100
175/175 - 0s - loss: 0.2938 - accuracy: 0.8664
Epoch 52/100
175/175 - 0s - loss: 0.2922 - accuracy: 0.8677
Epoch 53/100
175/175 - 0s - loss: 0.2902 - accuracy: 0.8705
Epoch 54/100
175/175 - 0s - loss: 0.2917 - accuracy: 0.8671
Epoch 55/100
175/175 - 0s - loss: 0.2904 - accuracy: 0.8709
Epoch 56/100
175/175 - 0s - loss: 0.2882 - accuracy: 0.8712
Epoch 57/100
175/175 - 0s - loss: 0.2900 - accuracy: 0.8673
Epoch 58/100
175/175 - 0s - loss: 0.2877 - accuracy: 0.8691
Epoch 59/100
175/175 - 0s - loss: 0.2879 - accuracy: 0.8700
Epoch 60/100
175/175 - 0s - loss: 0.2859 - accuracy: 0.8720
Epoch 61/100
175/175 - 0s - loss: 0.2869

Epoch 89/100
175/175 - 0s - loss: 0.2759 - accuracy: 0.8784
Epoch 90/100
175/175 - 0s - loss: 0.2756 - accuracy: 0.8813
Epoch 91/100
175/175 - 0s - loss: 0.2758 - accuracy: 0.8814
Epoch 92/100
175/175 - 0s - loss: 0.2756 - accuracy: 0.8807
Epoch 93/100
175/175 - 0s - loss: 0.2752 - accuracy: 0.8823
Epoch 94/100
175/175 - 0s - loss: 0.2756 - accuracy: 0.8816
Epoch 95/100
175/175 - 0s - loss: 0.2755 - accuracy: 0.8827
Epoch 96/100
175/175 - 0s - loss: 0.2741 - accuracy: 0.8813
Epoch 97/100
175/175 - 0s - loss: 0.2742 - accuracy: 0.8822
Epoch 98/100
175/175 - 0s - loss: 0.2763 - accuracy: 0.8822
Epoch 99/100
175/175 - 0s - loss: 0.2745 - accuracy: 0.8852
Epoch 100/100
175/175 - 0s - loss: 0.2733 - accuracy: 0.8852


<tensorflow.python.keras.callbacks.History at 0x7f8f6c9a85b0>

In [17]:
model_loss, model_accuracy = deep_model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Deep Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

44/44 - 0s - loss: 0.2998 - accuracy: 0.8749
Deep Neural Network - Loss: 0.29975736141204834, Accuracy: 0.8749106526374817


In [18]:
encoded_predictions = deep_model.predict_classes(X_test_scaled[:5])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)



In [19]:
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:5])}")

Predicted classes: ['FALSE POSITIVE' 'FALSE POSITIVE' 'FALSE POSITIVE' 'FALSE POSITIVE'
 'CONFIRMED']
Actual Labels: ['FALSE POSITIVE', 'FALSE POSITIVE', 'FALSE POSITIVE', 'FALSE POSITIVE', 'CANDIDATE']


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [20]:
# Create the GridSearchCV model
deep_model = SVC(kernel='linear')

In [21]:
SVC(kernel='linear')

SVC(kernel='linear')

In [22]:
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(deep_model, param_grid, verbose=3)

In [23]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... C=1, gamma=0.0001, score=0.860, total=   0.4s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.834, total=   0.4s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.8s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.838, total=   0.4s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.835, total=   0.4s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.830, total=   0.4s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.860, total=   0.4s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.834, total=   0.4s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.838, total=   0.4s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.835, total=   0.4s
[CV] C=1, gamma=0.0005 ...............................................
[CV] .

[CV] .................. C=50, gamma=0.0001, score=0.893, total=   0.5s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.879, total=   0.5s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.886, total=   0.5s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.871, total=   0.4s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.881, total=   0.5s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .................. C=50, gamma=0.0005, score=0.893, total=   0.5s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .................. C=50, gamma=0.0005, score=0.879, total=   0.4s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:   35.3s finished


GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10, 50],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005]},
             verbose=3)

In [24]:
GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             verbose=2)

GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             verbose=2)

In [25]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 50, 'gamma': 0.0001}
0.8819727874843531


In [26]:
 # Make predictions with the hypertuned model
predictions = grid.predict(X_test_scaled)
print(predictions)

['FALSE POSITIVE' 'FALSE POSITIVE' 'FALSE POSITIVE' ... 'FALSE POSITIVE'
 'FALSE POSITIVE' 'CONFIRMED']


In [27]:
print('Test Acc: %.3f' % grid.score(X_test_scaled, y_test))

Test Acc: 0.881


In [28]:
from sklearn import metrics
from sklearn.metrics import f1_score
#metrics.f1_score(y_test, predictions, labels=np.unique(predictions))
metrics.f1_score(y_test, predictions, average='weighted', labels=np.unique(predictions))


0.8790899625319256

In [29]:
print(classification_report(y_test, predictions,
                            target_names=["CONFIRMED", "FALSE POSITIVE", "OTHER"]))

                precision    recall  f1-score   support

     CONFIRMED       0.85      0.67      0.75       338
FALSE POSITIVE       0.74      0.85      0.79       360
         OTHER       0.97      1.00      0.99       701

      accuracy                           0.88      1399
     macro avg       0.86      0.84      0.84      1399
  weighted avg       0.88      0.88      0.88      1399



# Save the Model

In [31]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash

import joblib
filename = 'serena_baker.sav'
joblib.dump(deep_model, filename)

['serena_baker.sav']