In [1]:
# magic function to draw matplotlib plots inline
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

Refer to NASA Exoplanet Archive - Data Columns in Kepler Objects of Interest Table
https://exoplanetarchive.ipac.caltech.edu/docs/API_kepcandidate_columns.html

# Pre-processing
## Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("cumulative.csv")
df = df.drop(columns=["rowid", "kepid", "kepoi_name", "kepler_name", "koi_pdisposition", "koi_score", 
                        # Transit Properties
#                         'koi_period_err1','koi_period_err2', 
#                         'koi_time0bk_err1','koi_time0bk_err2', 
#                         'koi_impact_err1', 'koi_impact_err2',
#                         'koi_duration_err1', 'koi_duration_err2', 
#                         'koi_depth_err1', 'koi_depth_err2', 
#                         'koi_prad_err1','koi_prad_err2', 
#                         'koi_teq_err1', 'koi_teq_err2', 
#                         'koi_insol_err1', 'koi_insol_err2', 
                        # Threshold-Crossing Event (TCE) Information
                        'koi_tce_delivname', #'koi_model_snr', 'koi_tce_plnt_num',
                        # Stellar Parameters
#                         'koi_steff_err1', 'koi_steff_err2',
#                         'koi_slogg_err1', 'koi_slogg_err2', 
#                         'koi_srad_err1', 'koi_srad_err2',
                        # KIC ( Kepler Input Catalog ) Parameters
#                         'ra', 'dec', 'koi_kepmag'
                     ])
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
print(df.dtypes)
df.head()

koi_disposition       object
koi_fpflag_nt          int64
koi_fpflag_ss          int64
koi_fpflag_co          int64
koi_fpflag_ec          int64
koi_period           float64
koi_period_err1      float64
koi_period_err2      float64
koi_time0bk          float64
koi_time0bk_err1     float64
koi_time0bk_err2     float64
koi_impact           float64
koi_impact_err1      float64
koi_impact_err2      float64
koi_duration         float64
koi_duration_err1    float64
koi_duration_err2    float64
koi_depth            float64
koi_depth_err1       float64
koi_depth_err2       float64
koi_prad             float64
koi_prad_err1        float64
koi_prad_err2        float64
koi_teq              float64
koi_insol            float64
koi_insol_err1       float64
koi_insol_err2       float64
koi_model_snr        float64
koi_tce_plnt_num     float64
koi_steff            float64
koi_steff_err1       float64
koi_steff_err2       float64
koi_slogg            float64
koi_slogg_err1       float64
koi_slogg_err2

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CONFIRMED,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


Use `koi_disposition` for the y values

In [20]:
X = df.drop('koi_disposition', axis=1)
y = df['koi_disposition']
print(y.unique())
print(X.shape, y.shape)

['CONFIRMED' 'FALSE POSITIVE' 'CANDIDATE']
(8744, 40) (8744,)


## Create a Train Test Split

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y)

## Scale the data using the MinMaxScaler

In [5]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

  return self.partial_fit(X, y)


## label encoding, one-hot-encoding

In [23]:
from tensorflow.keras.utils import to_categorical

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)

encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

print(encoded_y_train)
print(encoded_y_test)
print(f'y : {y}')
print(f'encoded : {label_encoder.transform(y)}')
# 0 : CANDIDATE
# 1 : CONFIRMED
# 2 : FALSE POSITIVE

[2 2 2 ... 0 2 0]
[2 0 1 ... 1 2 1]
y : 0            CONFIRMED
1            CONFIRMED
2       FALSE POSITIVE
3       FALSE POSITIVE
4            CONFIRMED
5            CONFIRMED
6            CONFIRMED
7            CONFIRMED
8       FALSE POSITIVE
9            CONFIRMED
10           CONFIRMED
11           CONFIRMED
12           CONFIRMED
13           CONFIRMED
14      FALSE POSITIVE
15      FALSE POSITIVE
16      FALSE POSITIVE
17      FALSE POSITIVE
18           CONFIRMED
19           CONFIRMED
20      FALSE POSITIVE
21           CONFIRMED
22           CONFIRMED
23           CONFIRMED
24      FALSE POSITIVE
25           CONFIRMED
26           CONFIRMED
27           CONFIRMED
28      FALSE POSITIVE
29      FALSE POSITIVE
             ...      
9532    FALSE POSITIVE
9533         CANDIDATE
9535    FALSE POSITIVE
9536    FALSE POSITIVE
9537    FALSE POSITIVE
9538    FALSE POSITIVE
9539         CANDIDATE
9540         CONFIRMED
9541    FALSE POSITIVE
9542         CANDIDATE
9543    FALSE POS

In [12]:
# Step 2: Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

print(y_train_categorical.shape, y_test_categorical.shape)
print(y_train_categorical, y_test_categorical)

(6558, 3) (2186, 3)
[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 ...
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]] [[0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 ...
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]


# Train the Support Vector Machine for multi-class classification 'SVC'

SVC and NuSVC implement the “one-against-one” approach (Knerr et al., 1990) for multi- class classification. If n_class is the number of classes, then n_class * (n_class - 1) / 2 classifiers are constructed and each one trains data from two classes. To provide a consistent interface with other classifiers, the `decision_function_shape` option allows to monotically transform the results of the `“one-against-one”` classifiers to a decision function of shape (n_samples, n_classes).

In [27]:
# SVM one-versus-one
from sklearn.svm import SVC 
modelOVO = SVC(gamma='scale', decision_function_shape='ovo')
modelOVO.fit(X_train_scaled, encoded_y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovo', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [28]:
# SVM one-versus-rest
modelOVR = SVC(gamma='scale', decision_function_shape='ovr')
modelOVR.fit(X_train_scaled, encoded_y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [29]:
print(f"Training Data Score: {modelOVO.score(X_train_scaled, encoded_y_train)}")
print(f"Testing Data Score: {modelOVO.score(X_test_scaled, encoded_y_test)}")

Training Data Score: 0.8307410795974383
Testing Data Score: 0.8371454711802379


In [30]:
print(f"Training Data Score: {modelOVR.score(X_train_scaled, encoded_y_train)}")
print(f"Testing Data Score: {modelOVR.score(X_test_scaled, encoded_y_test)}")

Training Data Score: 0.8307410795974383
Testing Data Score: 0.8371454711802379


# Hyperparameter Tuning

Use `GridSearchCV` to tune the `C` and `gamma` parameters

In [31]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10, 50], #C:penalty parameter for error
              'gamma': [0.0001, 0.0005, 0.001, 0.005]} #gamma: Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.
gridOVO = GridSearchCV(modelOVO, param_grid, verbose=3)

In [32]:
# Train the model with GridSearch
gridOVO.fit(X_train_scaled, encoded_y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ...... C=1, gamma=0.0001, score=0.4983996342021033, total=   3.4s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.6s remaining:    0.0s


[CV] ..... C=1, gamma=0.0001, score=0.49817017383348583, total=   2.4s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    8.8s remaining:    0.0s


[CV] ...... C=1, gamma=0.0001, score=0.4983981693363844, total=   3.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ...... C=1, gamma=0.0005, score=0.4983996342021033, total=   2.7s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ..... C=1, gamma=0.0005, score=0.49817017383348583, total=   2.7s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ...... C=1, gamma=0.0005, score=0.4983981693363844, total=   3.3s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.4983996342021033, total=   2.7s
[CV] C=1, gamma=0.001 ................................................
[CV] ...... C=1, gamma=0.001, score=0.49817017383348583, total=   2.6s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.4983981693363844, total=   3.3s
[CV] C=1, gamma=0.005 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:  2.6min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovo', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 5, 10, 50], 'gamma': [0.0001, 0.0005, 0.001, 0.005]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [33]:
print(gridOVO.best_params_)
print(gridOVO.best_score_)

{'C': 50, 'gamma': 0.005}
0.8315035071668192
