In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in /Users/serenabaker/opt/anaconda3/lib/python3.8/site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
# Set the seed value for the notebook so the results are reproducible
from numpy.random import seed
seed(1)

In [4]:
# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.svm import SVC 
# from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.callbacks import EarlyStopping

# Read the CSV and Perform Basic Data Cleaning

In [5]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()


Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [6]:
# Create a random forest classifier
target = df["koi_disposition"]
target_names = ["CONFIRMED", "FALSE POSITIVE", "OTHER"]

data = df.drop("koi_disposition", axis=1)
feature_names = data.columns
data.head()


Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,-0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,-0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,-0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.33, random_state=1)

In [8]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8587521663778163

In [9]:
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.9046793760831889

In [10]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True) 

[(0.10469836147689512, 'koi_fpflag_co'),
 (0.09778928662433507, 'koi_fpflag_nt'),
 (0.06730889002597913, 'koi_fpflag_ss'),
 (0.05902346423789947, 'koi_model_snr'),
 (0.05330857619696956, 'koi_prad'),
 (0.03602008012889727, 'koi_fpflag_ec'),
 (0.03578191702809878, 'koi_prad_err2'),
 (0.03219675399100091, 'koi_duration_err1'),
 (0.03059632152407208, 'koi_duration_err2'),
 (0.030264305071042993, 'koi_steff_err1'),
 (0.029038579778418342, 'koi_prad_err1'),
 (0.026255370550743052, 'koi_steff_err2'),
 (0.023049115342806528, 'koi_insol_err1'),
 (0.02293500417510501, 'koi_duration'),
 (0.021926469902436057, 'koi_depth'),
 (0.021255538723689217, 'koi_period'),
 (0.02082045631606978, 'koi_time0bk_err2'),
 (0.020761920251696413, 'koi_impact'),
 (0.020669174657824474, 'koi_time0bk_err1'),
 (0.01755068610650016, 'koi_period_err2'),
 (0.015799345983198047, 'koi_period_err1'),
 (0.015399639388540729, 'koi_insol'),
 (0.014971394611852177, 'koi_teq'),
 (0.01416085694574643, 'koi_depth_err2'),
 (0.01368

# Create a Train Test Split

Use `koi_disposition` for the y values

In [11]:
X = df.drop('koi_disposition', axis=1).astype(float)
y = df['koi_disposition']
print(X.shape, y.shape)

(6991, 40) (6991,)


In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=1, stratify=y)
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
3382,0.0,0.0,1.0,0.0,1.316399,5.66e-06,-5.66e-06,132.2998,0.00352,-0.00352,...,-153.0,4.508,0.088,-0.064,0.806,0.081,-0.089,285.88486,45.380138,15.797
4054,0.0,1.0,1.0,0.0,0.708805,4.19e-07,-4.19e-07,132.168239,0.000505,-0.000505,...,-210.0,4.5,0.052,-0.208,0.908,0.269,-0.09,294.04828,38.364109,15.036
6361,0.0,1.0,0.0,0.0,1.387207,8.6e-08,-8.6e-08,132.846915,5.1e-05,-5.1e-05,...,-277.0,4.06,0.198,-0.162,1.894,0.576,-0.471,286.00845,45.35603,11.658
5005,0.0,0.0,0.0,0.0,47.557135,0.0007251,-0.0007251,135.8497,0.0132,-0.0132,...,-167.0,4.436,0.084,-0.263,0.958,0.367,-0.114,295.51495,51.203739,14.732
1702,0.0,0.0,0.0,0.0,7.010743,2.52e-05,-2.52e-05,138.4027,0.00308,-0.00308,...,-78.0,3.745,0.225,-0.09,2.476,0.402,-0.747,290.6333,40.13699,14.689


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [13]:
# Scale data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

# Train the Model



In [18]:
deep_model = Sequential()
deep_model.add(Dense(units=6, activation='relu', input_dim=40))
deep_model.add(Dense(units=6, activation='relu'))
deep_model.add(Dense(units=3, activation='softmax'))
deep_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 6)                 246       
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 42        
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 21        
Total params: 309
Trainable params: 309
Non-trainable params: 0
_________________________________________________________________


In [19]:
early_stopping = [EarlyStopping(monitor='val_loss', patience=2)]

deep_model.compile(optimizer='adam',
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])

deep_model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2,
    callbacks=early_stopping
)

Epoch 1/100
110/110 - 0s - loss: 1.1171 - accuracy: 0.3608
Epoch 2/100
110/110 - 0s - loss: 1.0588 - accuracy: 0.5007
Epoch 3/100
110/110 - 0s - loss: 1.0297 - accuracy: 0.5013
Epoch 4/100
110/110 - 0s - loss: 0.9781 - accuracy: 0.6252
Epoch 5/100
110/110 - 0s - loss: 0.8905 - accuracy: 0.7531
Epoch 6/100
110/110 - 0s - loss: 0.7887 - accuracy: 0.7514
Epoch 7/100
110/110 - 0s - loss: 0.7002 - accuracy: 0.7516
Epoch 8/100
110/110 - 0s - loss: 0.6341 - accuracy: 0.7519
Epoch 9/100
110/110 - 0s - loss: 0.5870 - accuracy: 0.7534
Epoch 10/100
110/110 - 0s - loss: 0.5519 - accuracy: 0.7536
Epoch 11/100
110/110 - 0s - loss: 0.5250 - accuracy: 0.7539
Epoch 12/100
110/110 - 0s - loss: 0.5039 - accuracy: 0.7539
Epoch 13/100
110/110 - 0s - loss: 0.4865 - accuracy: 0.7539
Epoch 14/100
110/110 - 0s - loss: 0.4726 - accuracy: 0.7539
Epoch 15/100
110/110 - 0s - loss: 0.4608 - accuracy: 0.7539
Epoch 16/100
110/110 - 0s - loss: 0.4521 - accuracy: 0.7539
Epoch 17/100
110/110 - 0s - loss: 0.4428 - accura

Epoch 45/100
110/110 - 0s - loss: 0.3775 - accuracy: 0.8011
Epoch 46/100
110/110 - 0s - loss: 0.3749 - accuracy: 0.7920
Epoch 47/100
110/110 - 0s - loss: 0.3741 - accuracy: 0.8006
Epoch 48/100
110/110 - 0s - loss: 0.3726 - accuracy: 0.7989
Epoch 49/100
110/110 - 0s - loss: 0.3725 - accuracy: 0.7969
Epoch 50/100
110/110 - 0s - loss: 0.3714 - accuracy: 0.7983
Epoch 51/100
110/110 - 0s - loss: 0.3707 - accuracy: 0.8232
Epoch 52/100
110/110 - 0s - loss: 0.3704 - accuracy: 0.8017
Epoch 53/100
110/110 - 0s - loss: 0.3692 - accuracy: 0.8066
Epoch 54/100
110/110 - 0s - loss: 0.3685 - accuracy: 0.8123
Epoch 55/100
110/110 - 0s - loss: 0.3691 - accuracy: 0.8097
Epoch 56/100
110/110 - 0s - loss: 0.3630 - accuracy: 0.8006
Epoch 57/100
110/110 - 0s - loss: 0.3603 - accuracy: 0.8229
Epoch 58/100
110/110 - 0s - loss: 0.3595 - accuracy: 0.8126
Epoch 59/100
110/110 - 0s - loss: 0.3594 - accuracy: 0.8183
Epoch 60/100
110/110 - 0s - loss: 0.3577 - accuracy: 0.8235
Epoch 61/100
110/110 - 0s - loss: 0.3562

Epoch 89/100
110/110 - 0s - loss: 0.3264 - accuracy: 0.8389
Epoch 90/100
110/110 - 0s - loss: 0.3256 - accuracy: 0.8418
Epoch 91/100
110/110 - 0s - loss: 0.3253 - accuracy: 0.8403
Epoch 92/100
110/110 - 0s - loss: 0.3259 - accuracy: 0.8392
Epoch 93/100
110/110 - 0s - loss: 0.3231 - accuracy: 0.8438
Epoch 94/100
110/110 - 0s - loss: 0.3222 - accuracy: 0.8432
Epoch 95/100
110/110 - 0s - loss: 0.3227 - accuracy: 0.8423
Epoch 96/100
110/110 - 0s - loss: 0.3219 - accuracy: 0.8423
Epoch 97/100
110/110 - 0s - loss: 0.3225 - accuracy: 0.8401
Epoch 98/100
110/110 - 0s - loss: 0.3200 - accuracy: 0.8432
Epoch 99/100
110/110 - 0s - loss: 0.3215 - accuracy: 0.8423
Epoch 100/100
110/110 - 0s - loss: 0.3197 - accuracy: 0.8403


<tensorflow.python.keras.callbacks.History at 0x7fce47ada790>

In [20]:
model_loss, model_accuracy = deep_model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Deep Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

110/110 - 0s - loss: 0.3641 - accuracy: 0.8244
Deep Neural Network - Loss: 0.36407119035720825, Accuracy: 0.8243706822395325


In [22]:
encoded_predictions = deep_model.predict_classes(X_test_scaled[:5])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)



In [23]:
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:5])}")

Predicted classes: ['CONFIRMED' 'CANDIDATE' 'FALSE POSITIVE' 'FALSE POSITIVE'
 'FALSE POSITIVE']
Actual Labels: ['CONFIRMED', 'CANDIDATE', 'FALSE POSITIVE', 'FALSE POSITIVE', 'FALSE POSITIVE']


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [39]:
# Create the GridSearchCV model
deep_model = SVC(kernel='linear')

In [40]:
SVC(kernel='linear')

SVC(kernel='linear')

In [41]:
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(deep_model, param_grid, verbose=3)

In [42]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.863, total=   0.1s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.848, total=   0.2s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.844, total=   0.1s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.841, total=   0.1s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.845, total=   0.1s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.863, total=   0.1s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.848, total=   0.2s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.844, total=   0.1s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.841, total=   0.1s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.845, total=   0.1s
[CV] C=1, gamma=0.001 ................................................
[CV] .

[CV] .................. C=50, gamma=0.0001, score=0.877, total=   0.1s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.874, total=   0.2s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.890, total=   0.2s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.893, total=   0.2s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .................. C=50, gamma=0.0005, score=0.910, total=   0.2s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .................. C=50, gamma=0.0005, score=0.877, total=   0.1s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .................. C=50, gamma=0.0005, score=0.874, total=   0.2s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:   11.8s finished


GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10, 50],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005]},
             verbose=3)

In [43]:
GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             verbose=2)

GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             verbose=2)

In [44]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 50, 'gamma': 0.0001}
0.8886981402002861


In [45]:
 # Make predictions with the hypertuned model
predictions = grid.predict(X_test_scaled)
print(predictions)

['CONFIRMED' 'CANDIDATE' 'FALSE POSITIVE' ... 'CANDIDATE' 'FALSE POSITIVE'
 'CANDIDATE']


In [46]:
print('Test Acc: %.3f' % grid.score(X_test_scaled, y_test))

Test Acc: 0.878


In [47]:
from sklearn import metrics
from sklearn.metrics import f1_score
#metrics.f1_score(y_test, predictions, labels=np.unique(predictions))
metrics.f1_score(y_test, predictions, average='weighted', labels=np.unique(predictions))


0.8757546361092301

In [48]:
print(classification_report(y_test, predictions,
                            target_names=["CONFIRMED", "FALSE POSITIVE", "OTHER"]))

                precision    recall  f1-score   support

     CONFIRMED       0.82      0.68      0.74       844
FALSE POSITIVE       0.74      0.84      0.79       900
         OTHER       0.97      1.00      0.99      1752

      accuracy                           0.88      3496
     macro avg       0.85      0.84      0.84      3496
  weighted avg       0.88      0.88      0.88      3496



# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash

# import joblib
# filename = 'serena_baker.sav'
# joblib.dump(model, filename)