In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the CSV and Perform Basic Data Cleaning

In [25]:
# Read in data
df = pd.read_csv("exoplanet_data.csv")

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [26]:
print(df["koi_disposition"].unique())

['CONFIRMED' 'FALSE POSITIVE' 'CANDIDATE']


# Select features (columns)

In [27]:
X = df.iloc[:,1:]
y = df.iloc[:,0]

In [28]:
# Search for top 10 features according to feature importances
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()
model.fit(X,y)
model.feature_importances_



array([0.11735097, 0.17469171, 0.12786416, 0.05286117, 0.01541951,
       0.02057685, 0.01771557, 0.01118088, 0.02239609, 0.01904334,
       0.01303757, 0.01267119, 0.00994411, 0.01732386, 0.03104104,
       0.02340872, 0.02729496, 0.01711751, 0.01015152, 0.01303858,
       0.01009886, 0.01086177, 0.01122777, 0.0094938 , 0.01326209,
       0.00901678, 0.02886466, 0.01410531, 0.00961084, 0.02317564,
       0.01269682, 0.008962  , 0.00961794, 0.01527378, 0.00892606,
       0.00831096, 0.00884888, 0.01175588, 0.01049285, 0.011268  ])

In [31]:
feat_imp = pd.Series(model.feature_importances_, index=X.columns).nlargest(10)
feat_imp

koi_fpflag_ss        0.174692
koi_fpflag_co        0.127864
koi_fpflag_nt        0.117351
koi_fpflag_ec        0.052861
koi_duration_err1    0.031041
koi_model_snr        0.028865
koi_depth            0.027295
koi_duration_err2    0.023409
koi_steff_err1       0.023176
koi_time0bk_err1     0.022396
dtype: float64

In [34]:
# Set features based on feature importances
X = df[feat_imp.index]

# Use koi_disposition for y values
y = df['koi_disposition']

# Create a Train Test Split

In [35]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# Pre-processing

Scale the data using the MinMaxScaler

In [36]:
from sklearn.preprocessing import MinMaxScaler

# Create scaler object
X_scaler = MinMaxScaler().fit(X_train)

# Scale training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model
Using a logistic regression


In [37]:
from sklearn.linear_model import LogisticRegression

# Initialize model (classifier)
classifier = LogisticRegression()

# Train the model
classifier.fit(X_train_scaled, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [38]:
# Score the model
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Training Data Score: 0.8249094030135419
Testing Data Score: 0.8375286041189931


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [39]:
from sklearn.model_selection import GridSearchCV

# Create the GridSearchCV model
param_grid = {'C':np.logspace(-4, 4, 20),
             'penalty':['l1','l2']}

grid = GridSearchCV(classifier, param_grid, verbose=3)

In [40]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


Fitting 3 folds for each of 40 candidates, totalling 120 fits
[CV] C=0.0001, penalty=l1 ............................................
[CV] ................ C=0.0001, penalty=l1, score=0.241, total=   0.0s
[CV] C=0.0001, penalty=l1 ............................................
[CV] ................ C=0.0001, penalty=l1, score=0.241, total=   0.0s
[CV] C=0.0001, penalty=l1 ............................................
[CV] ................ C=0.0001, penalty=l1, score=0.241, total=   0.0s
[CV] C=0.0001, penalty=l2 ............................................
[CV] ................ C=0.0001, penalty=l2, score=0.501, total=   0.0s
[CV] C=0.0001, penalty=l2 ............................................
[CV] ................ C=0.0001, penalty=l2, score=0.501, total=   0.0s
[CV] C=0.0001, penalty=l2 ............................................
[CV] ................ C=0.0001, penalty=l2, score=0.501, total=   0.0s
[CV] C=0.00026366508987303583, penalty=l1 ............................
[CV]  C=0.00026



[CV] .. C=0.004832930238571752, penalty=l2, score=0.538, total=   0.0s
[CV] C=0.004832930238571752, penalty=l2 ..............................
[CV] .. C=0.004832930238571752, penalty=l2, score=0.586, total=   0.0s
[CV] C=0.012742749857031334, penalty=l1 ..............................
[CV] .. C=0.012742749857031334, penalty=l1, score=0.739, total=   0.0s
[CV] C=0.012742749857031334, penalty=l1 ..............................
[CV] .. C=0.012742749857031334, penalty=l1, score=0.747, total=   0.0s
[CV] C=0.012742749857031334, penalty=l1 ..............................
[CV] .. C=0.012742749857031334, penalty=l1, score=0.741, total=   0.0s
[CV] C=0.012742749857031334, penalty=l2 ..............................
[CV] .. C=0.012742749857031334, penalty=l2, score=0.755, total=   0.0s
[CV] C=0.012742749857031334, penalty=l2 ..............................
[CV] .. C=0.012742749857031334, penalty=l2, score=0.758, total=   0.0s
[CV] C=0.012742749857031334, penalty=l2 ..............................
[CV] .



[CV] ... C=0.03359818286283781, penalty=l1, score=0.748, total=   0.0s
[CV] C=0.03359818286283781, penalty=l1 ...............................
[CV] ... C=0.03359818286283781, penalty=l1, score=0.743, total=   0.1s
[CV] C=0.03359818286283781, penalty=l2 ...............................
[CV] ... C=0.03359818286283781, penalty=l2, score=0.789, total=   0.0s
[CV] C=0.03359818286283781, penalty=l2 ...............................
[CV] ... C=0.03359818286283781, penalty=l2, score=0.789, total=   0.0s
[CV] C=0.03359818286283781, penalty=l2 ...............................
[CV] ... C=0.03359818286283781, penalty=l2, score=0.774, total=   0.0s
[CV] C=0.08858667904100823, penalty=l1 ...............................
[CV] ... C=0.08858667904100823, penalty=l1, score=0.820, total=   0.1s
[CV] C=0.08858667904100823, penalty=l1 ...............................




[CV] ... C=0.08858667904100823, penalty=l1, score=0.823, total=   0.1s
[CV] C=0.08858667904100823, penalty=l1 ...............................
[CV] ... C=0.08858667904100823, penalty=l1, score=0.807, total=   0.1s
[CV] C=0.08858667904100823, penalty=l2 ...............................
[CV] ... C=0.08858667904100823, penalty=l2, score=0.821, total=   0.0s
[CV] C=0.08858667904100823, penalty=l2 ...............................
[CV] ... C=0.08858667904100823, penalty=l2, score=0.824, total=   0.0s
[CV] C=0.08858667904100823, penalty=l2 ...............................
[CV] ... C=0.08858667904100823, penalty=l2, score=0.809, total=   0.0s
[CV] C=0.23357214690901212, penalty=l1 ...............................




[CV] ... C=0.23357214690901212, penalty=l1, score=0.825, total=   0.1s
[CV] C=0.23357214690901212, penalty=l1 ...............................
[CV] ... C=0.23357214690901212, penalty=l1, score=0.827, total=   0.0s
[CV] C=0.23357214690901212, penalty=l1 ...............................
[CV] ... C=0.23357214690901212, penalty=l1, score=0.811, total=   0.1s
[CV] C=0.23357214690901212, penalty=l2 ...............................
[CV] ... C=0.23357214690901212, penalty=l2, score=0.820, total=   0.0s
[CV] C=0.23357214690901212, penalty=l2 ...............................
[CV] ... C=0.23357214690901212, penalty=l2, score=0.828, total=   0.0s
[CV] C=0.23357214690901212, penalty=l2 ...............................
[CV] ... C=0.23357214690901212, penalty=l2, score=0.809, total=   0.0s
[CV] C=0.615848211066026, penalty=l1 .................................




[CV] ..... C=0.615848211066026, penalty=l1, score=0.846, total=   0.1s
[CV] C=0.615848211066026, penalty=l1 .................................
[CV] ..... C=0.615848211066026, penalty=l1, score=0.842, total=   0.1s
[CV] C=0.615848211066026, penalty=l1 .................................
[CV] ..... C=0.615848211066026, penalty=l1, score=0.843, total=   0.1s
[CV] C=0.615848211066026, penalty=l2 .................................
[CV] ..... C=0.615848211066026, penalty=l2, score=0.825, total=   0.0s
[CV] C=0.615848211066026, penalty=l2 .................................




[CV] ..... C=0.615848211066026, penalty=l2, score=0.828, total=   0.0s
[CV] C=0.615848211066026, penalty=l2 .................................
[CV] ..... C=0.615848211066026, penalty=l2, score=0.812, total=   0.0s
[CV] C=1.623776739188721, penalty=l1 .................................




[CV] ..... C=1.623776739188721, penalty=l1, score=0.852, total=   0.2s
[CV] C=1.623776739188721, penalty=l1 .................................
[CV] ..... C=1.623776739188721, penalty=l1, score=0.849, total=   0.2s
[CV] C=1.623776739188721, penalty=l1 .................................




[CV] ..... C=1.623776739188721, penalty=l1, score=0.851, total=   0.1s
[CV] C=1.623776739188721, penalty=l2 .................................
[CV] ..... C=1.623776739188721, penalty=l2, score=0.825, total=   0.0s
[CV] C=1.623776739188721, penalty=l2 .................................
[CV] ..... C=1.623776739188721, penalty=l2, score=0.832, total=   0.0s
[CV] C=1.623776739188721, penalty=l2 .................................
[CV] ..... C=1.623776739188721, penalty=l2, score=0.821, total=   0.0s
[CV] C=4.281332398719396, penalty=l1 .................................




[CV] ..... C=4.281332398719396, penalty=l1, score=0.856, total=   0.4s
[CV] C=4.281332398719396, penalty=l1 .................................




[CV] ..... C=4.281332398719396, penalty=l1, score=0.850, total=   0.3s
[CV] C=4.281332398719396, penalty=l1 .................................




[CV] ..... C=4.281332398719396, penalty=l1, score=0.853, total=   0.5s
[CV] C=4.281332398719396, penalty=l2 .................................
[CV] ..... C=4.281332398719396, penalty=l2, score=0.829, total=   0.0s
[CV] C=4.281332398719396, penalty=l2 .................................
[CV] ..... C=4.281332398719396, penalty=l2, score=0.832, total=   0.0s
[CV] C=4.281332398719396, penalty=l2 .................................
[CV] ..... C=4.281332398719396, penalty=l2, score=0.831, total=   0.0s
[CV] C=11.288378916846883, penalty=l1 ................................




[CV] .... C=11.288378916846883, penalty=l1, score=0.858, total=   1.0s
[CV] C=11.288378916846883, penalty=l1 ................................




[CV] .... C=11.288378916846883, penalty=l1, score=0.852, total=   1.2s
[CV] C=11.288378916846883, penalty=l1 ................................




[CV] .... C=11.288378916846883, penalty=l1, score=0.854, total=   0.8s
[CV] C=11.288378916846883, penalty=l2 ................................
[CV] .... C=11.288378916846883, penalty=l2, score=0.847, total=   0.0s
[CV] C=11.288378916846883, penalty=l2 ................................
[CV] .... C=11.288378916846883, penalty=l2, score=0.838, total=   0.0s
[CV] C=11.288378916846883, penalty=l2 ................................
[CV] .... C=11.288378916846883, penalty=l2, score=0.844, total=   0.0s
[CV] C=29.763514416313132, penalty=l1 ................................
[CV] .... C=29.763514416313132, penalty=l1, score=0.862, total=   0.0s
[CV] C=29.763514416313132, penalty=l1 ................................
[CV] .... C=29.763514416313132, penalty=l1, score=0.854, total=   0.1s
[CV] C=29.763514416313132, penalty=l1 ................................




[CV] .... C=29.763514416313132, penalty=l1, score=0.854, total=   0.1s
[CV] C=29.763514416313132, penalty=l2 ................................
[CV] .... C=29.763514416313132, penalty=l2, score=0.851, total=   0.1s
[CV] C=29.763514416313132, penalty=l2 ................................
[CV] .... C=29.763514416313132, penalty=l2, score=0.848, total=   0.0s
[CV] C=29.763514416313132, penalty=l2 ................................
[CV] .... C=29.763514416313132, penalty=l2, score=0.848, total=   0.0s
[CV] C=78.47599703514607, penalty=l1 .................................




[CV] ..... C=78.47599703514607, penalty=l1, score=0.863, total=   0.1s
[CV] C=78.47599703514607, penalty=l1 .................................
[CV] ..... C=78.47599703514607, penalty=l1, score=0.854, total=   0.1s
[CV] C=78.47599703514607, penalty=l1 .................................
[CV] ..... C=78.47599703514607, penalty=l1, score=0.856, total=   0.1s
[CV] C=78.47599703514607, penalty=l2 .................................
[CV] ..... C=78.47599703514607, penalty=l2, score=0.852, total=   0.1s
[CV] C=78.47599703514607, penalty=l2 .................................




[CV] ..... C=78.47599703514607, penalty=l2, score=0.851, total=   0.1s
[CV] C=78.47599703514607, penalty=l2 .................................
[CV] ..... C=78.47599703514607, penalty=l2, score=0.852, total=   0.0s
[CV] C=206.913808111479, penalty=l1 ..................................
[CV] ...... C=206.913808111479, penalty=l1, score=0.863, total=   0.0s
[CV] C=206.913808111479, penalty=l1 ..................................
[CV] ...... C=206.913808111479, penalty=l1, score=0.854, total=   0.0s
[CV] C=206.913808111479, penalty=l1 ..................................
[CV] ...... C=206.913808111479, penalty=l1, score=0.856, total=   0.1s
[CV] C=206.913808111479, penalty=l2 ..................................




[CV] ...... C=206.913808111479, penalty=l2, score=0.856, total=   0.0s
[CV] C=206.913808111479, penalty=l2 ..................................
[CV] ...... C=206.913808111479, penalty=l2, score=0.851, total=   0.0s
[CV] C=206.913808111479, penalty=l2 ..................................
[CV] ...... C=206.913808111479, penalty=l2, score=0.853, total=   0.0s
[CV] C=545.5594781168514, penalty=l1 .................................
[CV] ..... C=545.5594781168514, penalty=l1, score=0.863, total=   0.0s
[CV] C=545.5594781168514, penalty=l1 .................................
[CV] ..... C=545.5594781168514, penalty=l1, score=0.854, total=   0.0s
[CV] C=545.5594781168514, penalty=l1 .................................




[CV] ..... C=545.5594781168514, penalty=l1, score=0.856, total=   0.0s
[CV] C=545.5594781168514, penalty=l2 .................................
[CV] ..... C=545.5594781168514, penalty=l2, score=0.859, total=   0.0s
[CV] C=545.5594781168514, penalty=l2 .................................
[CV] ..... C=545.5594781168514, penalty=l2, score=0.851, total=   0.0s
[CV] C=545.5594781168514, penalty=l2 .................................
[CV] ..... C=545.5594781168514, penalty=l2, score=0.853, total=   0.1s
[CV] C=1438.44988828766, penalty=l1 ..................................
[CV] ...... C=1438.44988828766, penalty=l1, score=0.863, total=   0.1s
[CV] C=1438.44988828766, penalty=l1 ..................................




[CV] ...... C=1438.44988828766, penalty=l1, score=0.855, total=   0.1s
[CV] C=1438.44988828766, penalty=l1 ..................................
[CV] ...... C=1438.44988828766, penalty=l1, score=0.856, total=   0.1s
[CV] C=1438.44988828766, penalty=l2 ..................................
[CV] ...... C=1438.44988828766, penalty=l2, score=0.860, total=   0.1s
[CV] C=1438.44988828766, penalty=l2 ..................................
[CV] ...... C=1438.44988828766, penalty=l2, score=0.852, total=   0.0s
[CV] C=1438.44988828766, penalty=l2 ..................................




[CV] ...... C=1438.44988828766, penalty=l2, score=0.855, total=   0.0s
[CV] C=3792.690190732246, penalty=l1 .................................
[CV] ..... C=3792.690190732246, penalty=l1, score=0.863, total=   0.1s
[CV] C=3792.690190732246, penalty=l1 .................................
[CV] ..... C=3792.690190732246, penalty=l1, score=0.855, total=   0.1s
[CV] C=3792.690190732246, penalty=l1 .................................
[CV] ..... C=3792.690190732246, penalty=l1, score=0.856, total=   0.1s
[CV] C=3792.690190732246, penalty=l2 .................................




[CV] ..... C=3792.690190732246, penalty=l2, score=0.862, total=   0.1s
[CV] C=3792.690190732246, penalty=l2 .................................
[CV] ..... C=3792.690190732246, penalty=l2, score=0.854, total=   0.1s
[CV] C=3792.690190732246, penalty=l2 .................................
[CV] ..... C=3792.690190732246, penalty=l2, score=0.855, total=   0.1s
[CV] C=10000.0, penalty=l1 ...........................................
[CV] ............... C=10000.0, penalty=l1, score=0.863, total=   0.1s
[CV] C=10000.0, penalty=l1 ...........................................




[CV] ............... C=10000.0, penalty=l1, score=0.855, total=   0.1s
[CV] C=10000.0, penalty=l1 ...........................................
[CV] ............... C=10000.0, penalty=l1, score=0.856, total=   0.1s
[CV] C=10000.0, penalty=l2 ...........................................
[CV] ............... C=10000.0, penalty=l2, score=0.863, total=   0.1s
[CV] C=10000.0, penalty=l2 ...........................................
[CV] ............... C=10000.0, penalty=l2, score=0.854, total=   0.0s
[CV] C=10000.0, penalty=l2 ...........................................
[CV] ............... C=10000.0, penalty=l2, score=0.856, total=   0.1s


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:    8.8s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': array([1.00...3,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
     

In [41]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 1438.44988828766, 'penalty': 'l1'}
0.8579057791340835


# Save the Model

In [None]:
import joblib
filename = 'name.sav'
joblib.dump(my_model, filename)