Lauren Hare

**Import and read in file**

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.model_selection import cross_validate

from sklearn.ensemble import AdaBoostClassifier

from sklearn.preprocessing import StandardScaler

In [44]:
planets = pd.read_csv("no_errors_reduced.csv")
planets = planets.drop('Unnamed: 0', axis=1)
planets.head()

Unnamed: 0,PRF Δθ<sub>SQ</sub>(KIC) [arcsec],Number of Planets,Planet-Star Radius Ratio,Planetary Radius [Earth radii],PRF Δθ<sub>SQ</sub>(OOT) [arcsec],Maximum Multiple Event Statistic,FW Offset Significance [percent],Planet-Star Distance over Star Radius,Transit Depth [ppm],Transit Signal-to-Noise,...,Stellar Metallicity [dex],FW Source Δα(OOT) [sec],Transit Duration [hrs],FW Source Δδ(OOT) [arcsec],Maximum Single Event Statistic,Fitted Stellar Density [g/cm**3],Orbit Semi-Major Axis [AU],Orbital Period [days],Insolation Flux [Earth flux],Exoplanet Archive Disposition
0,0.32,2,0.022344,2.26,0.2,28.47082,0.002,24.81,615.8,35.8,...,0.14,0.43,2.9575,0.94,5.135849,3.20796,0.0853,9.488036,93.59,CONFIRMED
1,0.5,2,0.027954,2.83,0.39,20.109507,0.003,77.9,874.8,25.8,...,0.14,-0.63,4.507,1.23,7.027669,3.02368,0.2734,54.418383,9.11,CONFIRMED
2,0.276,1,0.387394,33.46,0.289,541.8951,0.0,3.278,8079.2,505.6,...,-0.52,-0.111,2.40641,0.002,39.06655,0.2208,0.0267,1.736952,891.96,FALSE POSITIVE
3,0.07,1,0.024064,2.75,0.1,33.1919,0.733,8.75,603.3,40.9,...,0.07,-0.01,1.6545,0.23,4.749945,1.98635,0.0374,2.525592,926.16,CONFIRMED
4,8.948,1,0.183387,39.21,8.93,46.15308,0.0,2.4,233.7,47.7,...,0.0,-13.45,5.022,24.09,10.964684,0.00485,0.082,7.36179,767.22,FALSE POSITIVE


**Create X and y**

In [45]:
X = planets.drop('Exoplanet Archive Disposition',axis=1)
y = planets['Exoplanet Archive Disposition']

**Scale X**

In [46]:
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

**Test and Train Split**

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=10)

**Classifier with only default parameters and using test/train split**

In [48]:
svc = SVC()
svc.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [49]:
print("Training set score: %f" % svc.score(X_train,y_train))
#print(clf.best_params_)

Training set score: 0.947138


In [50]:
print("Test set score: %f" % svc.score(X_test, y_test))

Test set score: 0.952746


In [51]:
predictions = svc.predict(X_test)

In [52]:
print(classification_report(y_test,predictions))

                precision    recall  f1-score   support

     CONFIRMED       0.92      0.97      0.94       631
FALSE POSITIVE       0.98      0.94      0.96       935

   avg / total       0.95      0.95      0.95      1566



In [53]:
print(confusion_matrix(y_test,predictions))

[[609  22]
 [ 52 883]]


In [54]:
print(accuracy_score(y_test,predictions))

0.9527458492975734


**SVC with Stratified Shuffle Split - default parameters**

In [61]:
parameters = {}
svc = SVC()
ss = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=3)
grid_svc = GridSearchCV(svc, parameters, cv=ss, n_jobs=-1, verbose=3)
grid_svc.fit(X,y)

Fitting 100 folds for each of 1 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   12.6s finished


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=100, random_state=3, test_size=0.3,
            train_size=None),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [62]:
("Score: %f" % grid_svc.score(X,y))

'Score: 0.950355'

In [63]:
predictions2 = grid_svc.predict(X)

In [64]:
print(classification_report(y,predictions2))

                precision    recall  f1-score   support

     CONFIRMED       0.92      0.96      0.94      2102
FALSE POSITIVE       0.98      0.94      0.96      3115

   avg / total       0.95      0.95      0.95      5217



In [65]:
print(confusion_matrix(y,predictions2))

[[2027   75]
 [ 184 2931]]


In [66]:
from sklearn.model_selection import cross_val_score
cvs = cross_val_score(grid_svc, X, y, cv=3, scoring="accuracy")
print(cvs)

Fitting 100 folds for each of 1 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    9.4s finished


Fitting 100 folds for each of 1 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    9.1s finished


Fitting 100 folds for each of 1 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    9.6s finished


[0.92241379 0.95227142 0.9424626 ]


In [67]:
from sklearn.model_selection import cross_validate
svc_clf = SVC()
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores = cross_validate(svc_clf, X, y, cv=ss, return_train_score=True, scoring='accuracy', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 94.4844 (+/- 1.00)
Train Accuracy: 94.86 (+/- 0.39)


**SVC with Stratified Split - 1st Set of parameters**

In [68]:
parameters = {'C':[0.5,1,5,10],'kernel':['linear','poly','rbf','sigmoid'],
           'tol':[0.001,0.01,0.1],
           'shrinking':[True,False], 'probability':[True,False],
           'decision_function_shape':['ovo','ovr']}
svc = SVC()
ss = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=3)
grid_svc = GridSearchCV(svc, parameters, cv=ss, n_jobs=-1, verbose=3)
grid_svc.fit(X,y)

Fitting 100 folds for each of 384 candidates, totalling 38400 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   34.5s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 3856 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done 4592 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 5392 tasks      | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 6256 tasks      | elapsed: 14.2min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed: 17.2min
[Parallel(n_jobs=-1)]: Done 8176 tasks      | elapsed: 19.4m

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=100, random_state=3, test_size=0.3,
            train_size=None),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': [0.5, 1, 5, 10], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'tol': [0.001, 0.01, 0.1], 'shrinking': [True, False], 'probability': [True, False], 'decision_function_shape': ['ovo', 'ovr']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [None]:
#degree, coef0, and gamma depend on kernal option

In [69]:
print(grid_svc.score(X, y))
print(grid_svc.best_params_)

0.9656890933486678
{'C': 10, 'decision_function_shape': 'ovo', 'kernel': 'rbf', 'probability': True, 'shrinking': False, 'tol': 0.001}


In [70]:
predictions3 = grid_svc.predict(X)

In [71]:
print(classification_report(y,predictions3))

                precision    recall  f1-score   support

     CONFIRMED       0.94      0.98      0.96      2102
FALSE POSITIVE       0.98      0.96      0.97      3115

   avg / total       0.97      0.97      0.97      5217



In [72]:
print(confusion_matrix(y,predictions3))

[[2052   50]
 [ 129 2986]]


In [73]:
from sklearn.model_selection import cross_validate
svc_clf = SVC(C=10, decision_function_shape='ovo', kernel='rbf', probability=True, shrinking=False, tol=0.001)
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores = cross_validate(svc_clf, X, y, cv=ss, return_train_score=True, scoring='accuracy', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 95.5488 (+/- 0.90)
Train Accuracy: 96.44 (+/- 0.36)


**SVC with Stratified Split - 2nd Set of Parameters**

In [74]:
parameters = {'C':[10,30,50],'kernel':['rbf'],
           'tol':[0.00001,0.0001,0.001],
           'shrinking':[False], 'probability':[True],
           'decision_function_shape':['ovo']}
svc = SVC()
ss = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=3)
grid_svc = GridSearchCV(svc, parameters, cv=ss, n_jobs=-1, verbose=3)
grid_svc.fit(X,y)

Fitting 100 folds for each of 9 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   28.4s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   55.9s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:  3.9min finished


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=100, random_state=3, test_size=0.3,
            train_size=None),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': [10, 30, 50], 'kernel': ['rbf'], 'tol': [1e-05, 0.0001, 0.001], 'shrinking': [False], 'probability': [True], 'decision_function_shape': ['ovo']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [75]:
print(grid_svc.score(X, y))
print(grid_svc.best_params_)

0.9691393521180756
{'C': 30, 'decision_function_shape': 'ovo', 'kernel': 'rbf', 'probability': True, 'shrinking': False, 'tol': 0.001}


In [76]:
predictions4 = grid_svc.predict(X)

In [77]:
print(classification_report(y,predictions4))

                precision    recall  f1-score   support

     CONFIRMED       0.95      0.98      0.96      2102
FALSE POSITIVE       0.99      0.96      0.97      3115

   avg / total       0.97      0.97      0.97      5217



In [78]:
print(confusion_matrix(y,predictions4))

[[2057   45]
 [ 116 2999]]


In [81]:
from sklearn.model_selection import cross_validate
svc_clf = SVC(C=30, decision_function_shape='ovo', kernel='rbf', probability=True, shrinking=False, tol=0.001)
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores = cross_validate(svc_clf, X, y, cv=ss, return_train_score=True, scoring='accuracy', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 95.8587 (+/- 0.84)
Train Accuracy: 97.07 (+/- 0.32)


**SVC with Stratified Split - 3rd set of parameters**

In [82]:
parameters = {'C':[20,30,40],'kernel':['rbf'],
           'tol':[0.001],
           'shrinking':[False], 'probability':[True],
           'decision_function_shape':['ovo']}
svc = SVC()
ss = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=3)
grid_svc = GridSearchCV(svc, parameters, cv=ss, n_jobs=-1, verbose=3)
grid_svc.fit(X,y)

Fitting 100 folds for each of 3 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   27.4s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.1min finished


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=100, random_state=3, test_size=0.3,
            train_size=None),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': [20, 30, 40], 'kernel': ['rbf'], 'tol': [0.001], 'shrinking': [False], 'probability': [True], 'decision_function_shape': ['ovo']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [83]:
print(grid_svc.score(X, y))
print(grid_svc.best_params_)

0.9691393521180756
{'C': 30, 'decision_function_shape': 'ovo', 'kernel': 'rbf', 'probability': True, 'shrinking': False, 'tol': 0.001}


In [84]:
predictions5 = grid_svc.predict(X)

In [85]:
print(classification_report(y,predictions5))

                precision    recall  f1-score   support

     CONFIRMED       0.95      0.98      0.96      2102
FALSE POSITIVE       0.99      0.96      0.97      3115

   avg / total       0.97      0.97      0.97      5217



In [55]:
print(confusion_matrix(y,predictions5))

[[ 148 1954]
 [  46 3069]]


In [87]:
from sklearn.model_selection import cross_validate
svc_clf = SVC(C=30, decision_function_shape='ovo', kernel='rbf',probability=True, shrinking=False, tol=0.001)
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores = cross_validate(svc_clf, X, y, cv=ss, return_train_score=True, scoring='accuracy', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 95.8587 (+/- 0.84)
Train Accuracy: 97.07 (+/- 0.32)


**Scoring type changed to 'roc_auc'**

In [89]:
#the disposition values are changed to numbers for roc_auc to run correctly

planets_v2 = planets.replace({'Exoplanet Archive Disposition':{'FALSE POSITIVE':0,'CONFIRMED':1}})
planets_v2.head()

Unnamed: 0,PRF Δθ<sub>SQ</sub>(KIC) [arcsec],Number of Planets,Planet-Star Radius Ratio,Planetary Radius [Earth radii],PRF Δθ<sub>SQ</sub>(OOT) [arcsec],Maximum Multiple Event Statistic,FW Offset Significance [percent],Planet-Star Distance over Star Radius,Transit Depth [ppm],Transit Signal-to-Noise,...,Stellar Metallicity [dex],FW Source Δα(OOT) [sec],Transit Duration [hrs],FW Source Δδ(OOT) [arcsec],Maximum Single Event Statistic,Fitted Stellar Density [g/cm**3],Orbit Semi-Major Axis [AU],Orbital Period [days],Insolation Flux [Earth flux],Exoplanet Archive Disposition
0,0.32,2,0.022344,2.26,0.2,28.47082,0.002,24.81,615.8,35.8,...,0.14,0.43,2.9575,0.94,5.135849,3.20796,0.0853,9.488036,93.59,1
1,0.5,2,0.027954,2.83,0.39,20.109507,0.003,77.9,874.8,25.8,...,0.14,-0.63,4.507,1.23,7.027669,3.02368,0.2734,54.418383,9.11,1
2,0.276,1,0.387394,33.46,0.289,541.8951,0.0,3.278,8079.2,505.6,...,-0.52,-0.111,2.40641,0.002,39.06655,0.2208,0.0267,1.736952,891.96,0
3,0.07,1,0.024064,2.75,0.1,33.1919,0.733,8.75,603.3,40.9,...,0.07,-0.01,1.6545,0.23,4.749945,1.98635,0.0374,2.525592,926.16,1
4,8.948,1,0.183387,39.21,8.93,46.15308,0.0,2.4,233.7,47.7,...,0.0,-13.45,5.022,24.09,10.964684,0.00485,0.082,7.36179,767.22,0


In [91]:
X = planets_v2.drop('Exoplanet Archive Disposition',axis=1)
y = planets_v2['Exoplanet Archive Disposition']

In [92]:
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [93]:
from sklearn.model_selection import cross_validate
svc_clf = SVC(C=30, decision_function_shape='ovo', kernel='rbf',probability=True, shrinking=False, tol=0.001)
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores = cross_validate(svc_clf, X, y, cv=ss, return_train_score=True, scoring='roc_auc', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 98.8699 (+/- 0.40)
Train Accuracy: 99.35 (+/- 0.13)


**Discussion**

Changing from scoring method of 'accuracy' to 'roc_auc', increased the validation accuracy score.

The score increased from grid search 1 to grid search 2, but did not change from grid search 2 to grid search 3. The parameter changes were effective only for the second grid search. The same ideal parameters were found in the third grid search as the second.