# Read the CSV and Perform Basic Data Cleaning

In [1]:
# import data
import pandas as pd
df = pd.read_csv("exoplanet_data.csv")

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [2]:
# Set features
X = df.drop('koi_disposition', axis=1)
#X = df[['koi_period', 'koi_time0bk', 'koi_slogg', 'koi_srad', 'koi_kepmag']]

# Set target
y = df['koi_disposition']
target_names = list(set(y))
print(target_names)

['CANDIDATE', 'FALSE POSITIVE', 'CONFIRMED']


# Create a Train Test Split

Use `koi_disposition` for the y values

In [3]:
# Create train and test datasets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [4]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
3563,0,0,0,0,10.548413,5.47e-05,-5.47e-05,139.06402,0.00411,-0.00411,...,-133,4.387,0.066,-0.123,1.092,0.181,-0.097,298.09543,44.737061,13.204
4099,0,0,0,0,24.754385,0.0001365,-0.0001365,140.20732,0.00446,-0.00446,...,-144,4.519,0.078,-0.052,0.804,0.056,-0.076,295.73535,42.576248,15.514
5460,0,0,0,0,1.057336,1.23e-07,-1.23e-07,131.792007,9.6e-05,-9.6e-05,...,-140,4.594,0.054,-0.027,0.683,0.054,-0.06,292.18417,49.31004,15.414
1091,0,0,0,0,201.118319,0.001461,-0.001461,187.56986,0.00529,-0.00529,...,-112,4.447,0.072,-0.108,0.954,0.135,-0.083,283.11377,48.13139,13.328
5999,0,0,0,0,91.649983,0.003181,-0.003181,175.7156,0.0286,-0.0286,...,-233,4.145,0.164,-0.164,1.608,0.905,-0.383,294.93198,39.81242,12.964


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [6]:
# Scale data
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Encode y
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(y)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Categorical y
from keras.utils import to_categorical
categorical_y_train = to_categorical(encoded_y_train)
categorical_y_test = to_categorical(encoded_y_test)

Using TensorFlow backend.


ImportError: cannot import name 'export_saved_model'

# Train the Model



In [10]:
# Support vector machine linear classifier
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model2 = model
model.fit(X_train_scaled, categorical_y_train)

# Model Accuracy
print('Train Acc: %.3f' % model.score(X_train_scaled, categorical_y_train))
print('Test Acc: %.3f' % model.score(X_test_scaled, categorical_y_test))

Train Acc: 0.490
Test Acc: 0.489


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [11]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
parameters = {
                'kernel':['linear', 'poly', 'rbf'],
                'gamma':['scale', 'auto'],
                'degree': [2,3],
                'C':[1, 5, 10]
            }
grid = GridSearchCV(model2, parameters)

In [12]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)
print('Test Acc: %.3f' % grid.score(X_test, y_test))

ValueError: Invalid parameter C for estimator LinearRegression(). Check the list of available parameters with `estimator.get_params().keys()`.

In [44]:
print(grid.best_params_)
print(grid.best_score_)

# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_train, model.predict(X_train_scaled), target_names=target_names))
print(classification_report(y_test, model.predict(X_test_scaled), target_names=target_names))

{'C': 10, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
0.8670582306667928
                precision    recall  f1-score   support

     CANDIDATE       0.73      0.50      0.59      1283
     CONFIRMED       0.64      0.80      0.71      1365
FALSE POSITIVE       0.98      1.00      0.99      2595

      accuracy                           0.82      5243
     macro avg       0.78      0.77      0.76      5243
  weighted avg       0.83      0.82      0.82      5243

                precision    recall  f1-score   support

     CANDIDATE       0.75      0.52      0.62       404
     CONFIRMED       0.65      0.83      0.73       435
FALSE POSITIVE       0.99      1.00      0.99       909

      accuracy                           0.85      1748
     macro avg       0.80      0.78      0.78      1748
  weighted avg       0.85      0.85      0.84      1748



# Save the Model

In [48]:
# save  model
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'svm.sav'
joblib.dump(model, filename)
filename = 'svmgrid.sav'
joblib.dump(grid, filename)

['svmgrid.sav']