In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in /Users/serenabaker/opt/anaconda3/lib/python3.8/site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
# Set the seed value for the notebook so the results are reproducible
from numpy.random import seed
seed(1)

In [4]:
# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.svm import SVC 
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.callbacks import EarlyStopping

# Read the CSV and Perform Basic Data Cleaning

In [5]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()


Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [6]:
# Create a random forest classifier
target = df["koi_disposition"]
target_names = ["CONFIRMED", "FALSE POSITIVE", "OTHER"]

data = df.drop("koi_disposition", axis=1)
feature_names = data.columns
data.head()


Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,-0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,-0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,-0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.33, random_state=1)

In [8]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8587521663778163

In [9]:
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.9046793760831889

In [10]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True) 

[(0.10469836147689512, 'koi_fpflag_co'),
 (0.09778928662433507, 'koi_fpflag_nt'),
 (0.06730889002597913, 'koi_fpflag_ss'),
 (0.05902346423789947, 'koi_model_snr'),
 (0.05330857619696956, 'koi_prad'),
 (0.03602008012889727, 'koi_fpflag_ec'),
 (0.03578191702809878, 'koi_prad_err2'),
 (0.03219675399100091, 'koi_duration_err1'),
 (0.03059632152407208, 'koi_duration_err2'),
 (0.030264305071042993, 'koi_steff_err1'),
 (0.029038579778418342, 'koi_prad_err1'),
 (0.026255370550743052, 'koi_steff_err2'),
 (0.023049115342806528, 'koi_insol_err1'),
 (0.02293500417510501, 'koi_duration'),
 (0.021926469902436057, 'koi_depth'),
 (0.021255538723689217, 'koi_period'),
 (0.02082045631606978, 'koi_time0bk_err2'),
 (0.020761920251696413, 'koi_impact'),
 (0.020669174657824474, 'koi_time0bk_err1'),
 (0.01755068610650016, 'koi_period_err2'),
 (0.015799345983198047, 'koi_period_err1'),
 (0.015399639388540729, 'koi_insol'),
 (0.014971394611852177, 'koi_teq'),
 (0.01416085694574643, 'koi_depth_err2'),
 (0.01368

In [None]:
df = df.drop('koi_kepmag', axis=1)

# Create a Train Test Split

Use `koi_disposition` for the y values

In [13]:
X = df.drop('koi_disposition', axis=1).astype(float)
y = df['koi_disposition']
print(X.shape, y.shape)

(6991, 39) (6991,)


In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1, stratify=y)
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec
4472,1.0,0.0,0.0,0.0,290.625016,0.007059,-0.007059,349.7098,0.0172,-0.0172,...,148.0,-156.0,3.735,0.628,-0.34,2.409,1.296,-1.286,284.18527,42.563271
1650,0.0,0.0,0.0,0.0,4.695056,3.9e-05,-3.9e-05,133.93643,0.00599,-0.00599,...,168.0,-210.0,4.494,0.05,-0.2,0.959,0.285,-0.095,290.68524,43.859638
822,0.0,0.0,0.0,0.0,12.681734,7e-06,-7e-06,132.682519,0.000434,-0.000434,...,136.0,-136.0,4.577,0.071,-0.044,0.701,0.065,-0.072,286.0528,50.256592
6167,0.0,0.0,0.0,0.0,3.626574,2.5e-05,-2.5e-05,133.57658,0.00617,-0.00617,...,161.0,-161.0,4.579,0.034,-0.136,0.8,0.159,-0.068,291.49585,42.47813
2011,0.0,0.0,0.0,0.0,8.703122,8.5e-05,-8.5e-05,140.15531,0.00824,-0.00824,...,72.0,-84.0,4.403,0.054,-0.117,1.082,0.178,-0.076,281.11234,43.227791


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [15]:
# Scale data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

# Train the Model



In [19]:
# Create model and add layers
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=39))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=3, activation='softmax'))

In [20]:
# Compile and fit the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 100)               4000      
_________________________________________________________________
dense_4 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 303       
Total params: 14,403
Trainable params: 14,403
Non-trainable params: 0
_________________________________________________________________


In [21]:
early_stopping = [EarlyStopping(monitor='val_loss', patience=2)]
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=60,
    shuffle=True,
    verbose=2,
    callbacks=early_stopping
)

Epoch 1/60
147/147 - 0s - loss: 0.5828 - accuracy: 0.7081
Epoch 2/60
147/147 - 0s - loss: 0.3654 - accuracy: 0.8146
Epoch 3/60
147/147 - 0s - loss: 0.3458 - accuracy: 0.8230
Epoch 4/60
147/147 - 0s - loss: 0.3314 - accuracy: 0.8319
Epoch 5/60
147/147 - 0s - loss: 0.3257 - accuracy: 0.8373
Epoch 6/60
147/147 - 0s - loss: 0.3281 - accuracy: 0.8373
Epoch 7/60
147/147 - 0s - loss: 0.3229 - accuracy: 0.8386
Epoch 8/60
147/147 - 0s - loss: 0.3143 - accuracy: 0.8458
Epoch 9/60
147/147 - 0s - loss: 0.3103 - accuracy: 0.8529
Epoch 10/60
147/147 - 0s - loss: 0.3062 - accuracy: 0.8488
Epoch 11/60
147/147 - 0s - loss: 0.2995 - accuracy: 0.8610
Epoch 12/60
147/147 - 0s - loss: 0.3084 - accuracy: 0.8527
Epoch 13/60
147/147 - 0s - loss: 0.2955 - accuracy: 0.8638
Epoch 14/60
147/147 - 0s - loss: 0.2918 - accuracy: 0.8638
Epoch 15/60
147/147 - 0s - loss: 0.2993 - accuracy: 0.8576
Epoch 16/60
147/147 - 0s - loss: 0.2911 - accuracy: 0.8650
Epoch 17/60
147/147 - 0s - loss: 0.2859 - accuracy: 0.8695
Epoch 

Epoch 45/60
147/147 - 0s - loss: 0.2485 - accuracy: 0.8855
Epoch 46/60
147/147 - 0s - loss: 0.2392 - accuracy: 0.8960
Epoch 47/60
147/147 - 0s - loss: 0.2436 - accuracy: 0.8932
Epoch 48/60
147/147 - 0s - loss: 0.2395 - accuracy: 0.8928
Epoch 49/60
147/147 - 0s - loss: 0.2419 - accuracy: 0.8926
Epoch 50/60
147/147 - 0s - loss: 0.2394 - accuracy: 0.8977
Epoch 51/60
147/147 - 0s - loss: 0.2387 - accuracy: 0.8958
Epoch 52/60
147/147 - 0s - loss: 0.2390 - accuracy: 0.8919
Epoch 53/60
147/147 - 0s - loss: 0.2373 - accuracy: 0.8960
Epoch 54/60
147/147 - 0s - loss: 0.2344 - accuracy: 0.8962
Epoch 55/60
147/147 - 0s - loss: 0.2349 - accuracy: 0.9003
Epoch 56/60
147/147 - 0s - loss: 0.2359 - accuracy: 0.8930
Epoch 57/60
147/147 - 0s - loss: 0.2315 - accuracy: 0.8973
Epoch 58/60
147/147 - 0s - loss: 0.2314 - accuracy: 0.8992
Epoch 59/60
147/147 - 0s - loss: 0.2277 - accuracy: 0.9018
Epoch 60/60
147/147 - 0s - loss: 0.2345 - accuracy: 0.8971


<tensorflow.python.keras.callbacks.History at 0x7fabe7343af0>

In [22]:
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

73/73 - 0s - loss: 0.2997 - accuracy: 0.8830
Normal Neural Network - Loss: 0.29968446493148804, Accuracy: 0.8830155730247498


In [26]:
encoded_predictions = model.predict_classes(X_test_scaled[:5])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)



In [27]:
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:5])}")

Predicted classes: ['CONFIRMED' 'FALSE POSITIVE' 'FALSE POSITIVE' 'FALSE POSITIVE'
 'CANDIDATE']
Actual Labels: ['CONFIRMED', 'FALSE POSITIVE', 'FALSE POSITIVE', 'FALSE POSITIVE', 'CONFIRMED']


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [28]:
# Create the GridSearchCV model
model = SVC(kernel='linear')

In [29]:
SVC(kernel='linear')

SVC(kernel='linear')

In [30]:
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [31]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... C=1, gamma=0.0001, score=0.847, total=   0.2s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.855, total=   0.2s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.836, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.840, total=   0.2s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.829, total=   0.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.847, total=   0.2s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.855, total=   0.2s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.836, total=   0.2s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.840, total=   0.2s
[CV] C=1, gamma=0.0005 ...............................................
[CV] .

[CV] .................. C=50, gamma=0.0001, score=0.884, total=   0.4s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.880, total=   0.3s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.872, total=   0.2s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.902, total=   0.3s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.882, total=   0.3s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .................. C=50, gamma=0.0005, score=0.884, total=   0.4s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .................. C=50, gamma=0.0005, score=0.880, total=   0.3s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:   20.6s finished


GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10, 50],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005]},
             verbose=3)

In [32]:
GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             verbose=2)

GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             verbose=2)

In [33]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 50, 'gamma': 0.0001}
0.884052121245291


In [34]:
 # Make predictions with the hypertuned model
predictions = grid.predict(X_test_scaled)
print(predictions)

['CONFIRMED' 'FALSE POSITIVE' 'FALSE POSITIVE' ... 'CANDIDATE'
 'FALSE POSITIVE' 'CANDIDATE']


In [35]:
print('Test Acc: %.3f' % grid.score(X_test_scaled, y_test))

Test Acc: 0.879


In [36]:
from sklearn import metrics
from sklearn.metrics import f1_score
metrics.f1_score(y_test, predictions, average='weighted', labels=np.unique(predictions))


0.8767944845215428

In [37]:
print(classification_report(y_test, predictions,
                            target_names=["CONFIRMED", "FALSE POSITIVE", "OTHER"]))

                precision    recall  f1-score   support

     CONFIRMED       0.83      0.67      0.74       557
FALSE POSITIVE       0.75      0.84      0.79       594
         OTHER       0.97      1.00      0.98      1157

      accuracy                           0.88      2308
     macro avg       0.85      0.84      0.84      2308
  weighted avg       0.88      0.88      0.88      2308



# Save the Model

In [38]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash

# import joblib
# filename = 'serena_baker.sav'
# joblib.dump(model, filename)