In [18]:
import pandas as pd
import numpy as np
import warnings

# Suppress warnings
warnings.filterwarnings("ignore")

# Read the CSV and Perform Basic Data Cleaning

In [19]:
df = pd.read_csv("cumulative.csv")
df = df.drop(columns=["rowid", "kepid", "kepoi_name", "kepler_name", "koi_pdisposition", "koi_score", "koi_tce_delivname"])
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CONFIRMED,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


# Create a Train Test Split

Use `koi_disposition` for the y values

In [20]:
from sklearn.model_selection import train_test_split

target = df["koi_disposition"]
data = df.drop(columns=['koi_disposition'])

In [21]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=1, stratify=target)

In [22]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
5964,1,0,0,0,252.04444,0.02749,-0.02749,265.201,0.0494,-0.0494,...,-136.0,4.621,0.041,-0.035,0.664,0.057,-0.059,292.79022,41.948639,15.884
9410,1,0,0,0,371.51852,0.01579,-0.01579,317.6836,0.0339,-0.0339,...,-206.0,4.377,0.101,-0.203,1.089,0.364,-0.145,293.064,45.03421,13.731
4204,0,0,1,0,8.03867,0.000114,-0.000114,135.3098,0.0123,-0.0123,...,-181.0,4.485,0.05,-0.2,0.975,0.282,-0.101,290.51785,41.238762,14.999
5933,0,0,0,0,18.78216,0.000406,-0.000406,147.8508,0.0148,-0.0148,...,-167.0,4.488,0.048,-0.29,0.94,0.386,-0.087,291.76413,41.86013,14.043
6996,1,0,1,0,12.429716,0.000472,-0.000472,141.2846,0.042,-0.042,...,-200.0,4.534,0.037,-0.213,0.905,0.281,-0.088,297.52072,40.585419,15.842


# Pre-processing

Scale the data using the MinMaxScaler

In [23]:
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Support Vector Machine

In [24]:
# Create SVC model
from sklearn.svm import SVC
SVCmodel = SVC(kernel='linear')

# Train the model
SVCmodel.fit(X_train_scaled, y_train)

# Print scores
print(f"Training Data Score: {SVCmodel.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {SVCmodel.score(X_test_scaled, y_test)}")

Training Data Score: 0.8479719426654467
Testing Data Score: 0.8462946020128088


In [25]:
# Create Logistic Regression model
from sklearn.linear_model import LogisticRegression
model_log = LogisticRegression()

# Train the model
model_log.fit(X_train_scaled, y_train)

# Print scores
print(f"Training Data Score: {model_log.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model_log.score(X_test_scaled, y_test)}")

Training Data Score: 0.8443122903324184
Testing Data Score: 0.8394327538883806


# Hyperparameter Tuning

Use `GridSearchCV` to tune the `C` and `gamma` parameters

In [26]:
# Create the GridSearch estimator and parameters for SVC model
from sklearn.model_selection import GridSearchCV
svc_param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01],
              'kernel': ['linear']}
svc_grid = GridSearchCV(SVCmodel, svc_param_grid, verbose=3)

In [27]:
# Fit the model using the grid search estimator
# This will take the SVC model and try each combination of parameters
svc_grid.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] C=1, gamma=0.0001, kernel=linear ................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  C=1, gamma=0.0001, kernel=linear, score=0.8399634202103338, total=   0.3s
[CV] C=1, gamma=0.0001, kernel=linear ................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV]  C=1, gamma=0.0001, kernel=linear, score=0.8508691674290942, total=   0.3s
[CV] C=1, gamma=0.0001, kernel=linear ................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.1s remaining:    0.0s


[CV]  C=1, gamma=0.0001, kernel=linear, score=0.8361556064073227, total=   0.3s
[CV] C=1, gamma=0.001, kernel=linear .................................
[CV]  C=1, gamma=0.001, kernel=linear, score=0.8399634202103338, total=   0.3s
[CV] C=1, gamma=0.001, kernel=linear .................................
[CV]  C=1, gamma=0.001, kernel=linear, score=0.8508691674290942, total=   0.3s
[CV] C=1, gamma=0.001, kernel=linear .................................
[CV]  C=1, gamma=0.001, kernel=linear, score=0.8361556064073227, total=   0.3s
[CV] C=1, gamma=0.01, kernel=linear ..................................
[CV]  C=1, gamma=0.01, kernel=linear, score=0.8399634202103338, total=   0.3s
[CV] C=1, gamma=0.01, kernel=linear ..................................
[CV]  C=1, gamma=0.01, kernel=linear, score=0.8508691674290942, total=   0.3s
[CV] C=1, gamma=0.01, kernel=linear ..................................
[CV]  C=1, gamma=0.01, kernel=linear, score=0.8361556064073227, total=   0.3s
[CV] C=5, gamma=0.0001,

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:   15.8s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01], 'kernel': ['linear']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [28]:
# Print best parameters and best scores for SVC model
print(svc_grid.best_params_)
print(svc_grid.best_score_)

{'C': 10, 'gamma': 0.0001, 'kernel': 'linear'}
0.8690149435803599


In [29]:
# Create the GridSearch estimator and parameters for Logistic Regression model
from sklearn.model_selection import GridSearchCV

logistic_param_grid = {"penalty": ['l1', 'l2'],
              "C": np.logspace(0, 4, 10)}
logistic_grid = GridSearchCV(model_log, logistic_param_grid, cv=5, verbose=3)

In [30]:
# Fit the model using the grid search estimator
# This will take the Logistic Regression model and try each combination of parameters
logistic_grid.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] C=1.0, penalty=l1 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ...... C=1.0, penalty=l1, score=0.8796648895658796, total=   0.7s
[CV] C=1.0, penalty=l1 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s


[CV] ....... C=1.0, penalty=l1, score=0.881859756097561, total=   3.4s
[CV] C=1.0, penalty=l1 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.3s remaining:    0.0s


[CV] ...... C=1.0, penalty=l1, score=0.8887195121951219, total=   1.5s
[CV] C=1.0, penalty=l1 ...............................................
[CV] ...... C=1.0, penalty=l1, score=0.8924485125858124, total=   2.1s
[CV] C=1.0, penalty=l1 ...............................................
[CV] ....... C=1.0, penalty=l1, score=0.867175572519084, total=   1.1s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ....... C=1.0, penalty=l2, score=0.674028941355674, total=   0.6s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ...... C=1.0, penalty=l2, score=0.6829268292682927, total=   0.6s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ...... C=1.0, penalty=l2, score=0.6509146341463414, total=   0.5s
[CV] C=1.0, penalty=l2 ...............................................
[CV] ...... C=1.0, penalty=l2, score=0.6422578184591915, total=   0.5s
[CV] C=1.0, penalty=l2 ...............................................
[CV] .

[CV]  C=166.81005372000593, penalty=l2, score=0.6676829268292683, total=   0.6s
[CV] C=166.81005372000593, penalty=l2 ................................
[CV]  C=166.81005372000593, penalty=l2, score=0.6575133485888635, total=   0.7s
[CV] C=166.81005372000593, penalty=l2 ................................
[CV]  C=166.81005372000593, penalty=l2, score=0.6534351145038167, total=   0.7s
[CV] C=464.15888336127773, penalty=l1 ................................
[CV]  C=464.15888336127773, penalty=l1, score=0.8804265041888805, total=   1.4s
[CV] C=464.15888336127773, penalty=l1 ................................
[CV]  C=464.15888336127773, penalty=l1, score=0.8833841463414634, total=   2.4s
[CV] C=464.15888336127773, penalty=l1 ................................
[CV]  C=464.15888336127773, penalty=l1, score=0.8910060975609756, total=   2.9s
[CV] C=464.15888336127773, penalty=l1 ................................
[CV]  C=464.15888336127773, penalty=l1, score=0.8954996186117468, total=  14.8s
[CV] C=464.158

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  5.3min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'penalty': ['l1', 'l2'], 'C': array([1.00000e+00, 2.78256e+00, 7.74264e+00, 2.15443e+01, 5.99484e+01,
       1.66810e+02, 4.64159e+02, 1.29155e+03, 3.59381e+03, 1.00000e+04])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [31]:
# Print best paramaters and best scores for Logistic Regression model
print(logistic_grid.best_params_)
print(logistic_grid.best_score_)

{'C': 1291.5496650148827, 'penalty': 'l1'}
0.8850259225373589


In [32]:
# Compare SVC model and Logistic Regression model scores

print("SVC model")
print(svc_grid.best_params_)
print(svc_grid.best_score_)
print("----------------")
print("Logistic Regression model")
print(logistic_grid.best_params_)
print(logistic_grid.best_score_)

SVC model
{'C': 10, 'gamma': 0.0001, 'kernel': 'linear'}
0.8690149435803599
----------------
Logistic Regression model
{'C': 1291.5496650148827, 'penalty': 'l1'}
0.8850259225373589
