Lauren Hare

**Import and read in data (errors dataset)**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.model_selection import cross_validate

from sklearn.ensemble import AdaBoostClassifier

from sklearn.preprocessing import StandardScaler

In [2]:
planets = pd.read_csv("errors_reduced.csv")
planets = planets.drop('Unnamed: 0', axis=1)
planets.head()

Unnamed: 0,koi_dikco_msky,koi_dicco_msky,koi_steff_err2,koi_fwm_stat_sig,koi_prad_err1,koi_steff_err1,koi_ror_err1,koi_smet_err2,koi_smet_err1,koi_count,...,koi_srho_err2,koi_insol_err1,koi_fwm_sdeco_err,koi_depth_err1,koi_period,koi_fwm_srao_err,koi_insol,koi_dor_err2,koi_dicco_msky_err,koi_disposition
0,0.32,0.2,-81.0,0.002,0.26,81.0,0.000832,-0.15,0.15,2,...,-1.09986,29.45,0.48,19.5,9.488036,0.51,93.59,-2.6,0.17,CONFIRMED
1,0.5,0.39,-81.0,0.003,0.32,81.0,0.009078,-0.15,0.15,2,...,-2.49638,2.87,0.68,35.5,54.418383,0.72,9.11,-28.4,0.36,CONFIRMED
2,0.276,0.289,-174.0,0.0,8.5,157.0,0.109232,-0.3,0.3,1,...,-0.01837,668.95,0.027,12.8,1.736952,0.031,891.96,-0.136,0.079,FALSE POSITIVE
3,0.07,0.1,-211.0,0.733,0.88,169.0,0.003751,-0.3,0.25,1,...,-1.74541,874.33,0.37,16.9,2.525592,0.35,926.16,-4.0,0.14,CONFIRMED
4,8.948,8.93,-124.0,0.0,6.45,111.0,7.128076,-0.15,0.15,1,...,-0.00053,349.28,0.22,5.8,7.36179,0.25,767.22,-1.2,0.074,FALSE POSITIVE


**Create X and y**

In [3]:
X = planets.drop('koi_disposition',axis=1)
y = planets['koi_disposition']

**Scale X**

In [4]:
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

**Test and Train Split**

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=10)

**Classifier with only default parameters and using test/train split**

In [8]:
svc = SVC()
svc.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [9]:
print("Training set score: %f" % svc.score(X_train,y_train))
#print(clf.best_params_)

Training set score: 0.959733


In [10]:
print("Test set score: %f" % svc.score(X_test, y_test))

Test set score: 0.957902


In [11]:
predictions = svc.predict(X_test)

In [12]:
print(classification_report(y_test,predictions))

                precision    recall  f1-score   support

     CONFIRMED       0.94      0.96      0.95       631
FALSE POSITIVE       0.97      0.96      0.96       913

   avg / total       0.96      0.96      0.96      1544



In [13]:
print(confusion_matrix(y_test,predictions))

[[604  27]
 [ 38 875]]


In [14]:
print(accuracy_score(y_test,predictions))

0.957901554404145


**SVC with Stratified Shuffle Split - default parameters**

In [21]:
parameters = {}
svc = SVC()
ss = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=3)
grid_svc = GridSearchCV(svc, parameters, cv=ss, n_jobs=-1, verbose=3)
grid_svc.fit(X,y)

Fitting 100 folds for each of 1 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   14.2s finished


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=100, random_state=3, test_size=0.3,
            train_size=None),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [22]:
("Score: %f" % grid_svc.score(X,y))

'Score: 0.962877'

In [23]:
predictions2 = grid_svc.predict(X)

In [24]:
print(classification_report(y,predictions2))

                precision    recall  f1-score   support

     CONFIRMED       0.95      0.96      0.95      2089
FALSE POSITIVE       0.97      0.96      0.97      3056

   avg / total       0.96      0.96      0.96      5145



In [25]:
print(confusion_matrix(y,predictions2))

[[2013   76]
 [ 115 2941]]


In [26]:
from sklearn.model_selection import cross_val_score
cvs = cross_val_score(grid_svc, X, y, cv=3, scoring="accuracy")
print(cvs)

Fitting 100 folds for each of 1 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   10.4s finished


Fitting 100 folds for each of 1 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   11.1s finished


Fitting 100 folds for each of 1 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   10.5s finished


[0.92599068 0.97026239 0.94982497]


In [27]:
from sklearn.model_selection import cross_validate
svc_clf = SVC()
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores = cross_validate(svc_clf, X, y, cv=ss, return_train_score=True, scoring='accuracy', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 95.5128 (+/- 0.91)
Train Accuracy: 96.12 (+/- 0.39)


**SVC with Stratified Split - 1st Set of parameters**

In [68]:
parameters = {'C':[0.5,1,5,10],'kernel':['linear','poly','rbf','sigmoid'],
           'tol':[0.001,0.01,0.1],
           'shrinking':[True,False], 'probability':[True,False],
           'decision_function_shape':['ovo','ovr']}
svc = SVC()
ss = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=3)
grid_svc = GridSearchCV(svc, parameters, cv=ss, n_jobs=-1, verbose=3)
grid_svc.fit(X,y)

Fitting 100 folds for each of 384 candidates, totalling 38400 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   34.5s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 3856 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done 4592 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 5392 tasks      | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 6256 tasks      | elapsed: 14.2min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed: 17.2min
[Parallel(n_jobs=-1)]: Done 8176 tasks      | elapsed: 19.4m

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=100, random_state=3, test_size=0.3,
            train_size=None),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': [0.5, 1, 5, 10], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'tol': [0.001, 0.01, 0.1], 'shrinking': [True, False], 'probability': [True, False], 'decision_function_shape': ['ovo', 'ovr']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [69]:
print(grid_svc.score(X, y))
print(grid_svc.best_params_)

0.9656890933486678
{'C': 10, 'decision_function_shape': 'ovo', 'kernel': 'rbf', 'probability': True, 'shrinking': False, 'tol': 0.001}


In [70]:
predictions3 = grid_svc.predict(X)

In [71]:
print(classification_report(y,predictions3))

                precision    recall  f1-score   support

     CONFIRMED       0.94      0.98      0.96      2102
FALSE POSITIVE       0.98      0.96      0.97      3115

   avg / total       0.97      0.97      0.97      5217



In [72]:
print(confusion_matrix(y,predictions3))

[[2052   50]
 [ 129 2986]]


In [73]:
from sklearn.model_selection import cross_validate
svc_clf = SVC(C=10, decision_function_shape='ovo', kernel='rbf', probability=True, shrinking=False, tol=0.001)
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores = cross_validate(svc_clf, X, y, cv=ss, return_train_score=True, scoring='accuracy', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 95.5488 (+/- 0.90)
Train Accuracy: 96.44 (+/- 0.36)


**SVC with Stratified Split - 2nd Set of Parameters**

In [74]:
parameters = {'C':[10,30,50],'kernel':['rbf'],
           'tol':[0.00001,0.0001,0.001],
           'shrinking':[False], 'probability':[True],
           'decision_function_shape':['ovo']}
svc = SVC()
ss = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=3)
grid_svc = GridSearchCV(svc, parameters, cv=ss, n_jobs=-1, verbose=3)
grid_svc.fit(X,y)

Fitting 100 folds for each of 9 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   28.4s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   55.9s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:  3.9min finished


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=100, random_state=3, test_size=0.3,
            train_size=None),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': [10, 30, 50], 'kernel': ['rbf'], 'tol': [1e-05, 0.0001, 0.001], 'shrinking': [False], 'probability': [True], 'decision_function_shape': ['ovo']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [75]:
print(grid_svc.score(X, y))
print(grid_svc.best_params_)

0.9691393521180756
{'C': 30, 'decision_function_shape': 'ovo', 'kernel': 'rbf', 'probability': True, 'shrinking': False, 'tol': 0.001}


In [76]:
predictions4 = grid_svc.predict(X)

In [77]:
print(classification_report(y,predictions4))

                precision    recall  f1-score   support

     CONFIRMED       0.95      0.98      0.96      2102
FALSE POSITIVE       0.99      0.96      0.97      3115

   avg / total       0.97      0.97      0.97      5217



In [78]:
print(confusion_matrix(y,predictions4))

[[2057   45]
 [ 116 2999]]


In [81]:
from sklearn.model_selection import cross_validate
svc_clf = SVC(C=30, decision_function_shape='ovo', kernel='rbf', probability=True, shrinking=False, tol=0.001)
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores = cross_validate(svc_clf, X, y, cv=ss, return_train_score=True, scoring='accuracy', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 95.8587 (+/- 0.84)
Train Accuracy: 97.07 (+/- 0.32)


**SVC with Stratified Split - 3rd set of parameters**

In [5]:
parameters = {'C':[20,30,40],'kernel':['rbf'],
           'tol':[0.001],
           'shrinking':[False], 'probability':[True],
           'decision_function_shape':['ovo']}
svc = SVC()
ss = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=3)
grid_svc = GridSearchCV(svc, parameters, cv=ss, n_jobs=-1, verbose=3)
grid_svc.fit(X,y)

Fitting 100 folds for each of 3 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   27.9s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   57.0s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.0min finished


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=100, random_state=3, test_size=0.3,
            train_size=None),
       error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': [20, 30, 40], 'kernel': ['rbf'], 'tol': [0.001], 'shrinking': [False], 'probability': [True], 'decision_function_shape': ['ovo']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [6]:
print(grid_svc.score(X, y))
print(grid_svc.best_params_)

0.984645286686103
{'C': 40, 'decision_function_shape': 'ovo', 'kernel': 'rbf', 'probability': True, 'shrinking': False, 'tol': 0.001}


In [7]:
predictions5 = grid_svc.predict(X)

In [8]:
print(classification_report(y,predictions5))

                precision    recall  f1-score   support

     CONFIRMED       0.98      0.99      0.98      2089
FALSE POSITIVE       0.99      0.98      0.99      3056

   avg / total       0.98      0.98      0.98      5145



In [9]:
print(confusion_matrix(y,predictions5))

[[2060   29]
 [  50 3006]]


In [11]:
from sklearn.model_selection import cross_validate
svc_clf = SVC(C=40, decision_function_shape='ovo', kernel='rbf', probability=True, shrinking=False, tol=0.001)
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores = cross_validate(svc_clf, X, y, cv=ss, return_train_score=True, scoring='accuracy', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 96.5762 (+/- 0.79)
Train Accuracy: 98.55 (+/- 0.25)


**Running the errors dataset with the best parameters from the no errors dataset**

In [32]:
from sklearn.model_selection import cross_validate
svc_clf = SVC(C=30, decision_function_shape='ovo', kernel='rbf',probability=True, shrinking=False, tol=0.001)
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores = cross_validate(svc_clf, X, y, cv=ss, return_train_score=True, scoring='accuracy', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 96.5419 (+/- 0.84)
Train Accuracy: 98.39 (+/- 0.27)


**Discussion - comparing the scores of using optimal parameters from the no errors dataset versus the errors dataset**

The scores were very similar despite the different C value.

**Changing scoring type from 'accuracy' to 'roc_auc'**

In [28]:
#the values for disposition were changed to numbers to get the roc_auc to run correctly

planets_v2 = planets.replace({'koi_disposition':{'FALSE POSITIVE':0,'CONFIRMED':1}})
planets_v2.head()

Unnamed: 0,koi_dikco_msky,koi_dicco_msky,koi_steff_err2,koi_fwm_stat_sig,koi_prad_err1,koi_steff_err1,koi_ror_err1,koi_smet_err2,koi_smet_err1,koi_count,...,koi_srho_err2,koi_insol_err1,koi_fwm_sdeco_err,koi_depth_err1,koi_period,koi_fwm_srao_err,koi_insol,koi_dor_err2,koi_dicco_msky_err,koi_disposition
0,0.32,0.2,-81.0,0.002,0.26,81.0,0.000832,-0.15,0.15,2,...,-1.09986,29.45,0.48,19.5,9.488036,0.51,93.59,-2.6,0.17,1
1,0.5,0.39,-81.0,0.003,0.32,81.0,0.009078,-0.15,0.15,2,...,-2.49638,2.87,0.68,35.5,54.418383,0.72,9.11,-28.4,0.36,1
2,0.276,0.289,-174.0,0.0,8.5,157.0,0.109232,-0.3,0.3,1,...,-0.01837,668.95,0.027,12.8,1.736952,0.031,891.96,-0.136,0.079,0
3,0.07,0.1,-211.0,0.733,0.88,169.0,0.003751,-0.3,0.25,1,...,-1.74541,874.33,0.37,16.9,2.525592,0.35,926.16,-4.0,0.14,1
4,8.948,8.93,-124.0,0.0,6.45,111.0,7.128076,-0.15,0.15,1,...,-0.00053,349.28,0.22,5.8,7.36179,0.25,767.22,-1.2,0.074,0


In [29]:
X = planets_v2.drop('koi_disposition',axis=1)
y = planets_v2['koi_disposition']

In [30]:
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [31]:
from sklearn.model_selection import cross_validate
svc_clf = SVC(C=30, decision_function_shape='ovo', kernel='rbf',probability=True, shrinking=False, tol=0.001)
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores = cross_validate(svc_clf, X, y, cv=ss, return_train_score=True, scoring='roc_auc', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 99.2781 (+/- 0.29)
Train Accuracy: 99.81 (+/- 0.05)


**Discussion for roc_auc versus accuracy**

Changing the scoring type from 'accuracy' to 'roc_auc' improved the score.

The score increased with each grid search. The tuning of the parameters worked to increase the score.