In [62]:
#Import Dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC 
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
import joblib
from joblib import dump, load

# Read in Data

In [8]:
#Read in CSV
exo_df = pd.read_csv("../exoplanet_data.csv")
exo_df.shape

(6991, 41)

In [9]:
# Drop Null columns and rows
exo_df = exo_df.dropna(axis='columns', how='all')
exo_df = exo_df.dropna()
exo_df.shape

(6991, 41)

No null columsns or rows

In [10]:
exo_df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [45]:
exo_df.columns

Index(['koi_disposition', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2',
       'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1',
       'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
       'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1',
       'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
       'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec',
       'koi_kepmag'],
      dtype='object')

In [46]:
selected_features = exo_df[['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co',
       'koi_fpflag_ec', 'koi_period', 'koi_period_err1', 'koi_period_err2',
       'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1',
       'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
       'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
       'koi_model_snr', 'koi_tce_plnt_num', 'koi_steff', 'koi_steff_err1',
       'koi_steff_err2', 'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
       'koi_srad', 'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec',
       'koi_kepmag']]

# Create Train Test Split

In [47]:
#Set X and y value
X = selected_features
y = exo_df["koi_disposition"]

In [48]:
# Create Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [13]:
# Scale your data
X_minmax = MinMaxScaler().fit(X_train)

X_train_scaled = X_minmax.transform(X_train)
X_test_scaled = X_minmax.transform(X_test)

# Create the Model

In [15]:
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_scaled, y_train)

SVC(kernel='linear')

In [18]:
print(f'Training Data Score: {svm_model.score(X_train_scaled, y_train)}')
print(f'Test Data Score: {svm_model.score(X_test_scaled, y_test)}')

Training Data Score: 0.8439824527942018
Test Data Score: 0.8415331807780321


In [37]:
# Make predictions
svm_predict = svm_model.predict(X_test_scaled)

In [38]:
# Show Classification Report 
print(classification_report(y_test, predict))

                precision    recall  f1-score   support

     CANDIDATE       0.74      0.59      0.66       422
     CONFIRMED       0.67      0.77      0.72       450
FALSE POSITIVE       0.98      1.00      0.99       876

      accuracy                           0.84      1748
     macro avg       0.80      0.79      0.79      1748
  weighted avg       0.84      0.84      0.84      1748



# Hyperparemeter Tuning

In [21]:
svm_model.get_params().keys()

dict_keys(['C', 'break_ties', 'cache_size', 'class_weight', 'coef0', 'decision_function_shape', 'degree', 'gamma', 'kernel', 'max_iter', 'probability', 'random_state', 'shrinking', 'tol', 'verbose'])

In [26]:
svm_param_grid = {'C': [1, 5, 10],
                 'gamma': ['scale', 'auto', 0.0001, 0.001, 0.01]}
svm_grid = GridSearchCV(svm_model, svm_param_grid, verbose=3)

In [27]:
svm_grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV] C=1, gamma=scale ................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .................... C=1, gamma=scale, score=0.856, total=   0.3s
[CV] C=1, gamma=scale ................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] .................... C=1, gamma=scale, score=0.846, total=   0.3s
[CV] C=1, gamma=scale ................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


[CV] .................... C=1, gamma=scale, score=0.839, total=   0.3s
[CV] C=1, gamma=scale ................................................
[CV] .................... C=1, gamma=scale, score=0.841, total=   0.3s
[CV] C=1, gamma=scale ................................................
[CV] .................... C=1, gamma=scale, score=0.825, total=   0.3s
[CV] C=1, gamma=auto .................................................
[CV] ..................... C=1, gamma=auto, score=0.856, total=   0.3s
[CV] C=1, gamma=auto .................................................
[CV] ..................... C=1, gamma=auto, score=0.846, total=   0.3s
[CV] C=1, gamma=auto .................................................
[CV] ..................... C=1, gamma=auto, score=0.839, total=   0.3s
[CV] C=1, gamma=auto .................................................
[CV] ..................... C=1, gamma=auto, score=0.841, total=   0.3s
[CV] C=1, gamma=auto .................................................
[CV] .

[CV] .................. C=10, gamma=0.0001, score=0.881, total=   0.3s
[CV] C=10, gamma=0.0001 ..............................................
[CV] .................. C=10, gamma=0.0001, score=0.870, total=   0.3s
[CV] C=10, gamma=0.0001 ..............................................
[CV] .................. C=10, gamma=0.0001, score=0.856, total=   0.3s
[CV] C=10, gamma=0.0001 ..............................................
[CV] .................. C=10, gamma=0.0001, score=0.873, total=   0.3s
[CV] C=10, gamma=0.0001 ..............................................
[CV] .................. C=10, gamma=0.0001, score=0.860, total=   0.3s
[CV] C=10, gamma=0.001 ...............................................
[CV] ................... C=10, gamma=0.001, score=0.881, total=   0.3s
[CV] C=10, gamma=0.001 ...............................................
[CV] ................... C=10, gamma=0.001, score=0.870, total=   0.3s
[CV] C=10, gamma=0.001 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:   22.4s finished


GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10],
                         'gamma': ['scale', 'auto', 0.0001, 0.001, 0.01]},
             verbose=3)

In [28]:
print(svm_grid.best_params_)
print(svm_grid.best_score_)

{'C': 10, 'gamma': 'scale'}
0.8680138845428944


In [31]:
hypertuned_svm = SVC(kernel='linear', C=10, gamma = 'scale')
hypertuned_svm.fit(X_train_scaled, y_train)

SVC(C=10, kernel='linear')

In [32]:
print(f'Training Data Score: {hypertuned_svm.score(X_train_scaled, y_train)}')
print(f'Test Data Score: {hypertuned_svm.score(X_test_scaled, y_test)}')      

Training Data Score: 0.876215906923517
Test Data Score: 0.8758581235697941


In [35]:
# Make predictions
hypertuned_svm_predict = hypertuned_svm.predict(X_test_scaled)

In [36]:
# Show Classification Report
print(classification_report(y_test, predict))

                precision    recall  f1-score   support

     CANDIDATE       0.74      0.59      0.66       422
     CONFIRMED       0.67      0.77      0.72       450
FALSE POSITIVE       0.98      1.00      0.99       876

      accuracy                           0.84      1748
     macro avg       0.80      0.79      0.79      1748
  weighted avg       0.84      0.84      0.84      1748



# Save model

In [65]:
filename = 'SVM_Model.sav'
joblib.dump(grid, filename)

NameError: name 'grid' is not defined