Lauren Hare

**Importing and reading in csv**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.model_selection import cross_validate

from sklearn.ensemble import AdaBoostClassifier

from sklearn.preprocessing import StandardScaler

In [2]:
planets = pd.read_csv("errors_reduced.csv")
planets = planets.drop('Unnamed: 0',axis=1)
planets.head()

Unnamed: 0,koi_dikco_msky,koi_dicco_msky,koi_steff_err2,koi_fwm_stat_sig,koi_prad_err1,koi_steff_err1,koi_ror_err1,koi_smet_err2,koi_smet_err1,koi_count,...,koi_srho_err2,koi_insol_err1,koi_fwm_sdeco_err,koi_depth_err1,koi_period,koi_fwm_srao_err,koi_insol,koi_dor_err2,koi_dicco_msky_err,koi_disposition
0,0.32,0.2,-81.0,0.002,0.26,81.0,0.000832,-0.15,0.15,2,...,-1.09986,29.45,0.48,19.5,9.488036,0.51,93.59,-2.6,0.17,CONFIRMED
1,0.5,0.39,-81.0,0.003,0.32,81.0,0.009078,-0.15,0.15,2,...,-2.49638,2.87,0.68,35.5,54.418383,0.72,9.11,-28.4,0.36,CONFIRMED
2,0.276,0.289,-174.0,0.0,8.5,157.0,0.109232,-0.3,0.3,1,...,-0.01837,668.95,0.027,12.8,1.736952,0.031,891.96,-0.136,0.079,FALSE POSITIVE
3,0.07,0.1,-211.0,0.733,0.88,169.0,0.003751,-0.3,0.25,1,...,-1.74541,874.33,0.37,16.9,2.525592,0.35,926.16,-4.0,0.14,CONFIRMED
4,8.948,8.93,-124.0,0.0,6.45,111.0,7.128076,-0.15,0.15,1,...,-0.00053,349.28,0.22,5.8,7.36179,0.25,767.22,-1.2,0.074,FALSE POSITIVE


**Create X and y**

In [3]:
X = planets.drop('koi_disposition',axis=1)
y = planets['koi_disposition']

**Scale X**

In [4]:
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

**Test and Train Split**

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=10)

**Classifier with only default parameters and using test/train split**

In [6]:
mlp=MLPClassifier()
mlp.fit(X_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [7]:
print("Training set score: %f" % mlp.score(X_train,y_train))
#print(clf.best_params_)

Training set score: 0.973896


In [8]:
print("Test set score: %f" % mlp.score(X_test, y_test))

Test set score: 0.965026


In [9]:
predictions=mlp.predict(X_test)

In [10]:
print(classification_report(y_test,predictions))

                precision    recall  f1-score   support

     CONFIRMED       0.95      0.96      0.96       631
FALSE POSITIVE       0.97      0.97      0.97       913

   avg / total       0.97      0.97      0.97      1544



In [11]:
print(confusion_matrix(y_test,predictions))

[[608  23]
 [ 31 882]]


In [12]:
print(accuracy_score(y_test,predictions))

0.9650259067357513


**MLP with StratifiedShuffleSplit - default parameters**

In [13]:
parameters={}
mlp = MLPClassifier()
ss = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=3)
grid_mlp = GridSearchCV(mlp, parameters, cv=ss, n_jobs=-1, verbose=3)
grid_mlp.fit(X,y)

Fitting 100 folds for each of 1 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   59.0s finished


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=100, random_state=3, test_size=0.3,
            train_size=None),
       error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [14]:
print("Score: %f" % grid_mlp.score(X,y))

Score: 0.983285


In [15]:
predictions2=grid_mlp.predict(X)

In [16]:
print(classification_report(y,predictions2))

                precision    recall  f1-score   support

     CONFIRMED       0.98      0.98      0.98      2089
FALSE POSITIVE       0.99      0.98      0.99      3056

   avg / total       0.98      0.98      0.98      5145



In [17]:
print(confusion_matrix(y,predictions2))

[[2055   34]
 [  52 3004]]


In [18]:
from sklearn.model_selection import cross_val_score
cvs=cross_val_score(grid_mlp, X, y, cv=3, scoring="accuracy")
print(cvs)

Fitting 100 folds for each of 1 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   27.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.9min finished


Fitting 100 folds for each of 1 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   41.3s finished


Fitting 100 folds for each of 1 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   40.7s finished


[0.9481352  0.97842566 0.95624271]


In [20]:
from sklearn.model_selection import cross_validate
mlp_clf = MLPClassifier()
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores = cross_validate(mlp_clf, X, y, cv=ss, return_train_score=True, scoring='accuracy', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 96.1380 (+/- 1.12)
Train Accuracy: 96.82 (+/- 1.00)


**MLP with Stratified Split - 1st set of parameters**

In [35]:
parameters={'solver':['lbfgs','sgd','adam'],'activation':['identity','logistic','tanh','relu'],
            'hidden_layer_sizes':[20,40,60],'alpha':[1e-5],
            'learning_rate':['constant','invscaling','adaptive'],
           'max_iter':[20,40,60],'tol':[1e-3]}
mlp = MLPClassifier()
ss = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=3)
grid_mlp = GridSearchCV(mlp, parameters, cv=ss, n_jobs=-1, verbose=3)
grid_mlp.fit(X,y)

Fitting 100 folds for each of 324 candidates, totalling 32400 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   19.2s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:   32.8s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   58.6s
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 3856 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 4592 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 5392 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 6256 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 8176 tasks      | elapsed: 11.9m

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=100, random_state=3, test_size=0.3,
            train_size=None),
       error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'solver': ['lbfgs', 'sgd', 'adam'], 'activation': ['identity', 'logistic', 'tanh', 'relu'], 'hidden_layer_sizes': [20, 40, 60], 'alpha': [1e-05], 'learning_rate': ['constant', 'invscaling', 'adaptive'], 'max_iter': [20, 40, 60], 'tol': [0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verb

In [41]:
print(grid_mlp.score(X, y))
print(grid_mlp.best_params_)

0.9712342079689018
{'activation': 'tanh', 'alpha': 1e-05, 'hidden_layer_sizes': 60, 'learning_rate': 'invscaling', 'max_iter': 20, 'solver': 'lbfgs', 'tol': 0.001}


In [42]:
predictions3=grid_mlp.predict(X)

In [43]:
print(classification_report(y,predictions3))

             precision    recall  f1-score   support

          0       0.98      0.97      0.98      3056
          1       0.96      0.97      0.96      2089

avg / total       0.97      0.97      0.97      5145



In [44]:
print(confusion_matrix(y,predictions3))

[[2971   85]
 [  63 2026]]


In [45]:
from sklearn.model_selection import cross_validate
mlp_clf = MLPClassifier(activation='tanh', alpha=1e-05, hidden_layer_sizes=60, learning_rate='invscaling', max_iter=20, solver='lbfgs', tol=0.001)
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores=cross_validate(mlp_clf, X, y, cv=ss, return_train_score=True, scoring='accuracy', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 96.3966 (+/- 0.85)
Train Accuracy: 97.25 (+/- 0.43)


**MLP with Stratified Split - 2nd set of parameters**

In [46]:
parameters={'solver':['lbfgs'],'activation':['tanh'],
            'hidden_layer_sizes':[60,80,100],'alpha':[1e-5],
            'learning_rate':['invscaling'],
           'max_iter':[10,20,30],'tol':[1e-3]}
mlp = MLPClassifier()
ss = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=3)
grid_mlp = GridSearchCV(mlp, parameters, cv=ss, n_jobs=-1, verbose=3)
grid_mlp.fit(X,y)

Fitting 100 folds for each of 9 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   29.7s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:   52.7s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:  1.8min finished


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=100, random_state=3, test_size=0.3,
            train_size=None),
       error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'solver': ['lbfgs'], 'activation': ['tanh'], 'hidden_layer_sizes': [60, 80, 100], 'alpha': [1e-05], 'learning_rate': ['invscaling'], 'max_iter': [10, 20, 30], 'tol': [0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [47]:
print(grid_mlp.score(X, y))
print(grid_mlp.best_params_)

0.9795918367346939
{'activation': 'tanh', 'alpha': 1e-05, 'hidden_layer_sizes': 60, 'learning_rate': 'invscaling', 'max_iter': 30, 'solver': 'lbfgs', 'tol': 0.001}


In [48]:
predictions4=grid_mlp.predict(X)

In [49]:
print(classification_report(y,predictions4))

             precision    recall  f1-score   support

          0       0.98      0.98      0.98      3056
          1       0.97      0.98      0.97      2089

avg / total       0.98      0.98      0.98      5145



In [50]:
print(confusion_matrix(y,predictions4))

[[2999   57]
 [  48 2041]]


In [53]:
from sklearn.model_selection import cross_validate
mlp_clf = MLPClassifier(activation='tanh', alpha=1e-05, hidden_layer_sizes=60, learning_rate='invscaling', max_iter=30, solver='lbfgs', tol=0.001)
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores=cross_validate(mlp_clf, X, y, cv=ss, return_train_score=True, scoring='accuracy', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 96.5089 (+/- 0.88)
Train Accuracy: 97.99 (+/- 0.45)


**MLP With Stratified Split - 3rd set of parameters**

In [54]:
parameters={'solver':['lbfgs'],'activation':['tanh'],
            'hidden_layer_sizes':[60],'alpha':[1e-7,1e-6,1e-5,1e-4,1e-2,1e-1],
            'learning_rate':['invscaling'],
           'max_iter':[30],'tol':[0.00001,0.0001,0.001,0.01,0.1]}
mlp = MLPClassifier()
ss = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=3)
grid_mlp = GridSearchCV(mlp, parameters, cv=ss, n_jobs=-1, verbose=3)
grid_mlp.fit(X,y)

Fitting 100 folds for each of 30 candidates, totalling 3000 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   19.3s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   38.9s
[Parallel(n_jobs=-1)]: Done 546 tasks      | elapsed:   52.1s
[Parallel(n_jobs=-1)]: Done 835 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1236 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1704 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 2220 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 2817 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 2985 out of 3000 | elapsed:  4.2min remaining:    1.2s
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed:  4.2min finished


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=100, random_state=3, test_size=0.3,
            train_size=None),
       error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'solver': ['lbfgs'], 'activation': ['tanh'], 'hidden_layer_sizes': [60], 'alpha': [1e-07, 1e-06, 1e-05, 0.0001, 0.01, 0.1], 'learning_rate': ['invscaling'], 'max_iter': [30], 'tol': [1e-05, 0.0001, 0.001, 0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [55]:
print(grid_mlp.score(X, y))
print(grid_mlp.best_params_)

0.9747327502429544
{'activation': 'tanh', 'alpha': 0.1, 'hidden_layer_sizes': 60, 'learning_rate': 'invscaling', 'max_iter': 30, 'solver': 'lbfgs', 'tol': 1e-05}


In [56]:
predictions5=grid_mlp.predict(X)

In [57]:
print(classification_report(y,predictions5))

             precision    recall  f1-score   support

          0       0.98      0.98      0.98      3056
          1       0.97      0.97      0.97      2089

avg / total       0.97      0.97      0.97      5145



In [58]:
print(confusion_matrix(y,predictions5))

[[2983   73]
 [  57 2032]]


In [59]:
from sklearn.model_selection import cross_validate
mlp_clf = MLPClassifier(activation='tanh', alpha=0.1, hidden_layer_sizes=60, learning_rate='invscaling', max_iter=30, solver='lbfgs', tol=0.00001)
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores=cross_validate(mlp_clf, X, y, cv=ss, return_train_score=True, scoring='accuracy', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 96.5455 (+/- 0.85)
Train Accuracy: 97.96 (+/- 0.46)


**GridSearch with best parameters and including default max_iter value**

In [68]:
parameters={'solver':['lbfgs'],'activation':['tanh'],
            'hidden_layer_sizes':[60],'alpha':[0.1],
            'learning_rate':['invscaling'],
           'max_iter':[30,100,200],'tol':[0.00001]}
mlp = MLPClassifier()
ss = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=3)
grid_mlp = GridSearchCV(mlp, parameters, cv=ss, n_jobs=-1, verbose=3)
grid_mlp.fit(X,y)

Fitting 100 folds for each of 3 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   25.0s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  2.3min finished


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=100, random_state=3, test_size=0.3,
            train_size=None),
       error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'solver': ['lbfgs'], 'activation': ['tanh'], 'hidden_layer_sizes': [60], 'alpha': [0.1], 'learning_rate': ['invscaling'], 'max_iter': [30, 100, 200], 'tol': [1e-05]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [69]:
print(grid_mlp.score(X, y))
print(grid_mlp.best_params_)

0.9745383867832847
{'activation': 'tanh', 'alpha': 0.1, 'hidden_layer_sizes': 60, 'learning_rate': 'invscaling', 'max_iter': 30, 'solver': 'lbfgs', 'tol': 1e-05}


In [70]:
predictions7=grid_mlp.predict(X)

In [71]:
print(classification_report(y,predictions7))

             precision    recall  f1-score   support

          0       0.98      0.98      0.98      3056
          1       0.97      0.97      0.97      2089

avg / total       0.97      0.97      0.97      5145



In [72]:
from sklearn.model_selection import cross_validate
mlp_clf = MLPClassifier(activation='tanh', alpha=0.1, hidden_layer_sizes=60, learning_rate='invscaling', max_iter=30, solver='lbfgs', tol=0.00001)
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores=cross_validate(mlp_clf, X_train, y_train, cv=ss, return_train_score=True, scoring='accuracy', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 96.2522 (+/- 1.05)
Train Accuracy: 98.36 (+/- 0.53)


**Running the errors dataset on the best parameters from the no errors dataset**

In [30]:
from sklearn.model_selection import cross_validate
mlp_clf = MLPClassifier(activation='relu', alpha=0.25, hidden_layer_sizes=50, learning_rate='invscaling', max_iter=40, solver='lbfgs', tol=0.00001)
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores=cross_validate(mlp_clf, X_train, y_train, cv=ss, return_train_score=True, scoring='accuracy', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 96.1748 (+/- 1.06)
Train Accuracy: 98.90 (+/- 0.40)


**Changing the scoring to 'roc_auc'**

In [31]:
planets_v2 = planets.replace({'koi_disposition':{'FALSE POSITIVE':0,'CONFIRMED':1}})
planets_v2.head()

Unnamed: 0,koi_dikco_msky,koi_dicco_msky,koi_steff_err2,koi_fwm_stat_sig,koi_prad_err1,koi_steff_err1,koi_ror_err1,koi_smet_err2,koi_smet_err1,koi_count,...,koi_srho_err2,koi_insol_err1,koi_fwm_sdeco_err,koi_depth_err1,koi_period,koi_fwm_srao_err,koi_insol,koi_dor_err2,koi_dicco_msky_err,koi_disposition
0,0.32,0.2,-81.0,0.002,0.26,81.0,0.000832,-0.15,0.15,2,...,-1.09986,29.45,0.48,19.5,9.488036,0.51,93.59,-2.6,0.17,1
1,0.5,0.39,-81.0,0.003,0.32,81.0,0.009078,-0.15,0.15,2,...,-2.49638,2.87,0.68,35.5,54.418383,0.72,9.11,-28.4,0.36,1
2,0.276,0.289,-174.0,0.0,8.5,157.0,0.109232,-0.3,0.3,1,...,-0.01837,668.95,0.027,12.8,1.736952,0.031,891.96,-0.136,0.079,0
3,0.07,0.1,-211.0,0.733,0.88,169.0,0.003751,-0.3,0.25,1,...,-1.74541,874.33,0.37,16.9,2.525592,0.35,926.16,-4.0,0.14,1
4,8.948,8.93,-124.0,0.0,6.45,111.0,7.128076,-0.15,0.15,1,...,-0.00053,349.28,0.22,5.8,7.36179,0.25,767.22,-1.2,0.074,0


In [32]:
X = planets_v2.drop('koi_disposition',axis=1)
y = planets_v2['koi_disposition']

In [33]:
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [34]:
from sklearn.model_selection import cross_validate
mlp_clf = MLPClassifier(activation='relu', alpha=0.25, hidden_layer_sizes=50, learning_rate='invscaling', max_iter=40, solver='lbfgs', tol=0.00001)
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores=cross_validate(mlp_clf, X, y, cv=ss, return_train_score=True, scoring='roc_auc', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 99.2284 (+/- 0.37)
Train Accuracy: 99.87 (+/- 0.05)


**Discussion**

Changing the scoring type from 'accuracy' to 'roc_auc' increased the score.

Using the fine-tuned parameters from the no-errors dataset showed a similar score as the fine-tuned parameters from the errors dataset.

The score increased as the parameters were changed with each new grid search.