In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in /Users/serenabaker/opt/anaconda3/lib/python3.8/site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
#!pip install tensorflow
# Set the seed value for the notebook so the results are reproducible
from numpy.random import seed
seed(1)

In [4]:
# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
#from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Read the CSV and Perform Basic Data Cleaning

In [5]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [6]:
#df = pd.get_dummies(df)
#df.head()
#df.columns

## KOI (Kepler Object of Interest) Columns:
### koi_disposition (y-value) = Exoplanet Archive Disposition (Confirmed = 1)
#### koi_period = Orbital Period (Days)
#### koi_time0bk = Transit Epoch (Center of first detected transit)
#### koi_impact = Impact Parameter (Distance from center of planet/stellar disc)
#### koi_duration = Transit Duration (Hours-Between planet/star first/last contact)
#### koi_prad = Planetary Radius
#### koi_teq = Equilibrium Temperature (Kelvin)
#### koi_model_snr = Transit depth normalized by mean uncertainty

# Select your features (columns)

In [7]:
# Set features. This will also be used as your x values.
selected_features = df[['koi_disposition', 'koi_period', 'koi_time0bk', 'koi_impact', 'koi_duration', 'koi_prad', 'koi_teq', 'koi_model_snr']]
selected_features.head()

Unnamed: 0,koi_disposition,koi_period,koi_time0bk,koi_impact,koi_duration,koi_prad,koi_teq,koi_model_snr
0,CONFIRMED,54.418383,162.51384,0.586,4.507,2.83,443,25.8
1,FALSE POSITIVE,19.89914,175.850252,0.969,1.7822,14.6,638,76.3
2,FALSE POSITIVE,1.736952,170.307565,1.276,2.40641,33.46,1395,505.6
3,CONFIRMED,2.525592,171.59555,0.701,1.6545,2.75,1406,40.9
4,CONFIRMED,4.134435,172.97937,0.762,3.1402,2.77,1160,40.2


# Create a Train Test Split

Use `koi_disposition` for the y values

In [8]:
X = selected_features.drop('koi_disposition', axis=1).astype(int)
y = selected_features['koi_disposition']
print(X.shape, y.shape)
X.dtypes
y.head()

(6991, 7) (6991,)


0         CONFIRMED
1    FALSE POSITIVE
2    FALSE POSITIVE
3         CONFIRMED
4         CONFIRMED
Name: koi_disposition, dtype: object

In [9]:
#X = pd.get_dummies(X)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1)
X_train.head()

Unnamed: 0,koi_period,koi_time0bk,koi_impact,koi_duration,koi_prad,koi_teq,koi_model_snr
3563,10,139,1,1,3,899,11
4099,24,140,0,3,2,491,18
5460,1,131,0,1,14,1276,476
1091,201,187,0,10,2,300,34
5999,91,175,0,10,2,568,8


In [10]:
# # Step 1: Label-encode data set
# label_encoder = LabelEncoder()
# label_encoder.fit(y_train)
# encoded_y_train = label_encoder.transform(y_train)
# encoded_y_test = label_encoder.transform(y_test)

In [11]:
# # Step 2: Convert encoded labels to one-hot-encoding
# y_train_categorical = to_categorical(encoded_y_train)
# y_test_categorical = to_categorical(encoded_y_test)

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [12]:
# Scale your data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [13]:
print(y_train_categorical)

[[1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 ...
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]]


# Train the Model



In [14]:
# Create model and add layers
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=7))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=3, activation='softmax'))

In [15]:
# Compile and fit the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               800       
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 303       
Total params: 11,203
Trainable params: 11,203
Non-trainable params: 0
_________________________________________________________________


In [16]:
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=60,
    shuffle=True,
    verbose=2
)

Epoch 1/60
164/164 - 1s - loss: 0.9664 - accuracy: 0.4940
Epoch 2/60
164/164 - 0s - loss: 0.8728 - accuracy: 0.5897
Epoch 3/60
164/164 - 0s - loss: 0.8470 - accuracy: 0.6122
Epoch 4/60
164/164 - 0s - loss: 0.8332 - accuracy: 0.6155
Epoch 5/60
164/164 - 0s - loss: 0.8211 - accuracy: 0.6218
Epoch 6/60
164/164 - 0s - loss: 0.8152 - accuracy: 0.6267
Epoch 7/60
164/164 - 0s - loss: 0.8047 - accuracy: 0.6330
Epoch 8/60
164/164 - 0s - loss: 0.7977 - accuracy: 0.6412
Epoch 9/60
164/164 - 0s - loss: 0.7935 - accuracy: 0.6422
Epoch 10/60
164/164 - 0s - loss: 0.7831 - accuracy: 0.6491
Epoch 11/60
164/164 - 0s - loss: 0.7767 - accuracy: 0.6483
Epoch 12/60
164/164 - 0s - loss: 0.7709 - accuracy: 0.6544
Epoch 13/60
164/164 - 0s - loss: 0.7671 - accuracy: 0.6576
Epoch 14/60
164/164 - 0s - loss: 0.7627 - accuracy: 0.6506
Epoch 15/60
164/164 - 0s - loss: 0.7559 - accuracy: 0.6588
Epoch 16/60
164/164 - 0s - loss: 0.7523 - accuracy: 0.6649
Epoch 17/60
164/164 - 0s - loss: 0.7501 - accuracy: 0.6595
Epoch 

<tensorflow.python.keras.callbacks.History at 0x7f805e0826a0>

In [17]:
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

55/55 - 0s - loss: 0.6692 - accuracy: 0.7168
Normal Neural Network - Loss: 0.6692386865615845, Accuracy: 0.7168192267417908


In [18]:
encoded_predictions = model.predict_classes(X_test_scaled[:5])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)



In [19]:
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:5])}")

Predicted classes: ['CONFIRMED' 'CANDIDATE' 'FALSE POSITIVE' 'CONFIRMED' 'FALSE POSITIVE']
Actual Labels: ['CONFIRMED', 'FALSE POSITIVE', 'FALSE POSITIVE', 'CONFIRMED', 'FALSE POSITIVE']


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [20]:
# Create the GridSearchCV model
model2 = SVC(kernel='linear')
model2

SVC(kernel='linear')

In [21]:
SVC(kernel='linear')

SVC(kernel='linear')

In [22]:
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model2, param_grid, verbose=3)

In [23]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... C=1, gamma=0.0001, score=0.553, total=   0.4s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.555, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.7s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.546, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.537, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.556, total=   0.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.553, total=   0.6s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.555, total=   0.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.546, total=   0.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................... C=1, gamma=0.0005, score=0.537, total=   0.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] .

[CV] .................. C=50, gamma=0.0001, score=0.615, total=   0.3s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.614, total=   0.3s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.622, total=   0.3s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.614, total=   0.3s
[CV] C=50, gamma=0.0001 ..............................................
[CV] .................. C=50, gamma=0.0001, score=0.607, total=   0.3s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .................. C=50, gamma=0.0005, score=0.615, total=   0.3s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .................. C=50, gamma=0.0005, score=0.614, total=   0.3s
[CV] C=50, gamma=0.0005 ..............................................
[CV] .

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:   26.2s finished


GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10, 50],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005]},
             verbose=3)

In [24]:
GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             verbose=3)

GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             verbose=3)

In [25]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 50, 'gamma': 0.0001}
0.6141506996849053


In [26]:
 # Make predictions with the hypertuned model
predictions = grid.predict(X_test_scaled)

In [27]:
print('Test Acc: %.3f' % grid.score(X_test_scaled, y_test))

Test Acc: 0.618


In [28]:
print(classification_report(y_test, predictions,
                            target_names=["CONFIRMED", "FALSE POSITIVE", "OTHER"]))
# print(classification_report(y_test, predictions,
#                             target_names=['koi_disposition_CONFIRMED', 'koi_time0bk']))
# print(classification_report(y_test, predictions,
#                             target_names=['koi_disposition_CONFIRMED', 'koi_impact']))
# print(classification_report(y_test, predictions,
#                             target_names=['koi_disposition_CONFIRMED', 'koi_duration']))
# print(classification_report(y_test, predictions,
#                             target_names=['koi_disposition_CONFIRMED', 'koi_prad']))
# print(classification_report(y_test, predictions,
#                             target_names=['koi_disposition_CONFIRMED', 'koi_teq']))
# print(classification_report(y_test, predictions,
#                             target_names=['koi_disposition_CONFIRMED', 'koi_model_snr']))

                precision    recall  f1-score   support

     CONFIRMED       0.42      0.07      0.12       404
FALSE POSITIVE       0.45      0.82      0.58       435
         OTHER       0.78      0.77      0.77       909

      accuracy                           0.62      1748
     macro avg       0.55      0.55      0.49      1748
  weighted avg       0.62      0.62      0.57      1748



# Save the Model

In [29]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash

import joblib
filename = 'serena_baker.sav'
joblib.dump(model2, filename)

['serena_baker.sav']