Lauren Hare

**Importing and reading in csv**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.model_selection import cross_validate

from sklearn.ensemble import AdaBoostClassifier

from sklearn.preprocessing import StandardScaler

In [84]:
planets = pd.read_csv("no_errors_reduced.csv")
planets = planets.drop('Unnamed: 0',axis=1)
planets.head()

Unnamed: 0,PRF Δθ<sub>SQ</sub>(KIC) [arcsec],Number of Planets,Planet-Star Radius Ratio,Planetary Radius [Earth radii],PRF Δθ<sub>SQ</sub>(OOT) [arcsec],Maximum Multiple Event Statistic,FW Offset Significance [percent],Planet-Star Distance over Star Radius,Transit Depth [ppm],Transit Signal-to-Noise,...,Stellar Metallicity [dex],FW Source Δα(OOT) [sec],Transit Duration [hrs],FW Source Δδ(OOT) [arcsec],Maximum Single Event Statistic,Fitted Stellar Density [g/cm**3],Orbit Semi-Major Axis [AU],Orbital Period [days],Insolation Flux [Earth flux],Exoplanet Archive Disposition
0,0.32,2,0.022344,2.26,0.2,28.47082,0.002,24.81,615.8,35.8,...,0.14,0.43,2.9575,0.94,5.135849,3.20796,0.0853,9.488036,93.59,CONFIRMED
1,0.5,2,0.027954,2.83,0.39,20.109507,0.003,77.9,874.8,25.8,...,0.14,-0.63,4.507,1.23,7.027669,3.02368,0.2734,54.418383,9.11,CONFIRMED
2,0.276,1,0.387394,33.46,0.289,541.8951,0.0,3.278,8079.2,505.6,...,-0.52,-0.111,2.40641,0.002,39.06655,0.2208,0.0267,1.736952,891.96,FALSE POSITIVE
3,0.07,1,0.024064,2.75,0.1,33.1919,0.733,8.75,603.3,40.9,...,0.07,-0.01,1.6545,0.23,4.749945,1.98635,0.0374,2.525592,926.16,CONFIRMED
4,8.948,1,0.183387,39.21,8.93,46.15308,0.0,2.4,233.7,47.7,...,0.0,-13.45,5.022,24.09,10.964684,0.00485,0.082,7.36179,767.22,FALSE POSITIVE


**Create X and y**

In [3]:
X = planets.drop('Exoplanet Archive Disposition',axis=1)
y = planets['Exoplanet Archive Disposition']

**Scale X**

In [4]:
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

**Test and Train Split**

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=10)

**Classifier with only default parameters and using test/train split**

In [6]:
mlp = MLPClassifier()
mlp.fit(X_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [7]:
print("Training set score: %f" % mlp.score(X_train,y_train))

Training set score: 0.965763


In [8]:
print("Test set score: %f" % mlp.score(X_test, y_test))

Test set score: 0.960409


In [9]:
predictions = mlp.predict(X_test)

In [10]:
print(classification_report(y_test,predictions))

                precision    recall  f1-score   support

     CONFIRMED       0.94      0.96      0.95       631
FALSE POSITIVE       0.97      0.96      0.97       935

   avg / total       0.96      0.96      0.96      1566



In [11]:
print(confusion_matrix(y_test,predictions))

[[608  23]
 [ 39 896]]


In [12]:
print(accuracy_score(y_test,predictions))

0.9604086845466155


**MLP with StratifiedShuffleSplit - default parameters**

In [19]:
parameters = {}
mlp = MLPClassifier()
ss = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=3)
grid_mlp = GridSearchCV(mlp, parameters, cv=ss, n_jobs=-1, verbose=3)
grid_mlp.fit(X,y)

Fitting 100 folds for each of 1 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   30.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.1min finished


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=100, random_state=3, test_size=0.3,
            train_size=None),
       error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [20]:
print("Score: %f" % grid_mlp.score(X,y))

Score: 0.965689


In [21]:
predictions2 = grid_mlp.predict(X)

In [22]:
print(classification_report(y,predictions2))

                precision    recall  f1-score   support

     CONFIRMED       0.94      0.97      0.96      2102
FALSE POSITIVE       0.98      0.96      0.97      3115

   avg / total       0.97      0.97      0.97      5217



In [23]:
print(confusion_matrix(y,predictions2))

[[2042   60]
 [ 119 2996]]


In [24]:
from sklearn.model_selection import cross_val_score
cvs=cross_val_score(grid_mlp, X, y, cv=3, scoring="accuracy")
print(cvs)

Fitting 100 folds for each of 1 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   25.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.7min finished


Fitting 100 folds for each of 1 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   22.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.6min finished


Fitting 100 folds for each of 1 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   23.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.7min finished


[0.9454023  0.97354802 0.94994246]


**MLP with Stratified Split - 1st set of parameters**

In [25]:
parameters = {'solver':['lbfgs','sgd','adam'],'activation':['identity','logistic','tanh','relu'],
            'hidden_layer_sizes':[20,40,60],'alpha':[1e-5],
            'learning_rate':['constant','invscaling','adaptive'],
           'max_iter':[20,40,60],'tol':[1e-3]}
mlp = MLPClassifier()
ss = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=3)
grid_mlp = GridSearchCV(mlp, parameters, cv=ss, n_jobs=-1, verbose=3)
grid_mlp.fit(X,y)

Fitting 100 folds for each of 324 candidates, totalling 32400 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:   53.7s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 3856 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 4592 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 5392 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 6256 tasks      | elapsed: 13.4min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed: 15.3min
[Parallel(n_jobs=-1)]: Done 8176 tasks      | elapsed: 17.9m

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=100, random_state=3, test_size=0.3,
            train_size=None),
       error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'solver': ['lbfgs', 'sgd', 'adam'], 'activation': ['identity', 'logistic', 'tanh', 'relu'], 'hidden_layer_sizes': [20, 40, 60], 'alpha': [1e-05], 'learning_rate': ['constant', 'invscaling', 'adaptive'], 'max_iter': [20, 40, 60], 'tol': [0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verb

In [26]:
print(grid_mlp.score(X, y))
print(grid_mlp.best_params_)

0.9691393521180756
{'activation': 'relu', 'alpha': 1e-05, 'hidden_layer_sizes': 40, 'learning_rate': 'invscaling', 'max_iter': 40, 'solver': 'lbfgs', 'tol': 0.001}


In [27]:
predictions3 = grid_mlp.predict(X)

In [28]:
print(classification_report(y,predictions3))

                precision    recall  f1-score   support

     CONFIRMED       0.95      0.97      0.96      2102
FALSE POSITIVE       0.98      0.97      0.97      3115

   avg / total       0.97      0.97      0.97      5217



In [29]:
print(confusion_matrix(y,predictions3))

[[2040   62]
 [  99 3016]]


In [30]:
from sklearn.model_selection import cross_validate
mlp_clf = MLPClassifier(activation='relu', alpha=1e-05, hidden_layer_sizes=40, learning_rate='invscaling', max_iter=40, solver='lbfgs', tol=0.001)
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores=cross_validate(mlp_clf, X, y, cv=ss, return_train_score=True, scoring='accuracy', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 95.7713 (+/- 0.91)
Train Accuracy: 97.03 (+/- 0.45)


**MLP with Stratified Split - 2nd set of parameters**

In [31]:
parameters = {'solver':['lbfgs'],'activation':['relu'],
            'hidden_layer_sizes':[30,40,50],'alpha':[1e-5],
            'learning_rate':['invscaling'],
           'max_iter':[30,40,50],'tol':[1e-3]}
mlp = MLPClassifier()
ss = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=3)
grid_mlp = GridSearchCV(mlp, parameters, cv=ss, n_jobs=-1, verbose=3)
grid_mlp.fit(X,y)

Fitting 100 folds for each of 9 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   25.2s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:   45.7s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:  1.6min finished


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=100, random_state=3, test_size=0.3,
            train_size=None),
       error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'solver': ['lbfgs'], 'activation': ['relu'], 'hidden_layer_sizes': [30, 40, 50], 'alpha': [1e-05], 'learning_rate': ['invscaling'], 'max_iter': [30, 40, 50], 'tol': [0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [32]:
print(grid_mlp.score(X, y))
print(grid_mlp.best_params_)

0.9660724554341575
{'activation': 'relu', 'alpha': 1e-05, 'hidden_layer_sizes': 50, 'learning_rate': 'invscaling', 'max_iter': 40, 'solver': 'lbfgs', 'tol': 0.001}


In [33]:
predictions4 = grid_mlp.predict(X)

In [34]:
print(classification_report(y,predictions4))

                precision    recall  f1-score   support

     CONFIRMED       0.95      0.97      0.96      2102
FALSE POSITIVE       0.98      0.97      0.97      3115

   avg / total       0.97      0.97      0.97      5217



In [35]:
print(confusion_matrix(y,predictions4))

[[2033   69]
 [ 108 3007]]


In [36]:
from sklearn.model_selection import cross_validate
mlp_clf = MLPClassifier(activation='relu', alpha=1e-05, hidden_layer_sizes=50, learning_rate='invscaling', max_iter=40, solver='lbfgs', tol=0.001)
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores=cross_validate(mlp_clf, X, y, cv=ss, return_train_score=True, scoring='accuracy', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 95.7757 (+/- 0.90)
Train Accuracy: 97.09 (+/- 0.44)


**MLP With Stratified Split - 3rd set of parameters**

In [37]:
parameters = {'solver':['lbfgs'],'activation':['relu'],
            'hidden_layer_sizes':[50],'alpha':[1e-7,1e-6,1e-5,1e-4,1e-2,1e-1],
            'learning_rate':['invscaling'],
           'max_iter':[40],'tol':[0.00001,0.0001,0.001,0.01,0.1]}
mlp = MLPClassifier()
ss = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=3)
grid_mlp = GridSearchCV(mlp, parameters, cv=ss, n_jobs=-1, verbose=3)
grid_mlp.fit(X,y)

Fitting 100 folds for each of 30 candidates, totalling 3000 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   37.1s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:   44.7s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed:  4.2min finished


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=100, random_state=3, test_size=0.3,
            train_size=None),
       error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'solver': ['lbfgs'], 'activation': ['relu'], 'hidden_layer_sizes': [50], 'alpha': [1e-07, 1e-06, 1e-05, 0.0001, 0.01, 0.1], 'learning_rate': ['invscaling'], 'max_iter': [40], 'tol': [1e-05, 0.0001, 0.001, 0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [38]:
print(grid_mlp.score(X, y))
print(grid_mlp.best_params_)

0.9672225416906268
{'activation': 'relu', 'alpha': 0.1, 'hidden_layer_sizes': 50, 'learning_rate': 'invscaling', 'max_iter': 40, 'solver': 'lbfgs', 'tol': 1e-05}


In [57]:
predictions5 = grid_mlp.predict(X)

In [58]:
print(classification_report(y,predictions5))

                precision    recall  f1-score   support

     CONFIRMED       0.96      0.98      0.97      2102
FALSE POSITIVE       0.99      0.97      0.98      3115

   avg / total       0.98      0.98      0.98      5217



In [59]:
print(confusion_matrix(y,predictions5))

[[2064   38]
 [  80 3035]]


In [39]:
from sklearn.model_selection import cross_validate
mlp_clf = MLPClassifier(activation='relu', alpha=0.1, hidden_layer_sizes=50, learning_rate='invscaling', max_iter=40, solver='lbfgs', tol=0.00001)
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores = cross_validate(mlp_clf, X, y, cv=ss, return_train_score=True, scoring='accuracy', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 95.8096 (+/- 0.91)
Train Accuracy: 97.09 (+/- 0.44)


**MLP with StratifiedShuffleSplit - 4th set of parameters**

In [40]:
parameters = {'solver':['lbfgs'],'activation':['relu'],
            'hidden_layer_sizes':[50],'alpha':[0.1,0.25,0.5,0.75],
            'learning_rate':['invscaling'],
           'max_iter':[40],'tol':[0.0000001,0.000001,0.00001]}
mlp = MLPClassifier()
ss = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=3)
grid_mlp = GridSearchCV(mlp, parameters, cv=ss, n_jobs=-1, verbose=3)
grid_mlp.fit(X,y)

Fitting 100 folds for each of 12 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   38.3s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:  2.6min finished


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=100, random_state=3, test_size=0.3,
            train_size=None),
       error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'solver': ['lbfgs'], 'activation': ['relu'], 'hidden_layer_sizes': [50], 'alpha': [0.1, 0.25, 0.5, 0.75], 'learning_rate': ['invscaling'], 'max_iter': [40], 'tol': [1e-07, 1e-06, 1e-05]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [41]:
print(grid_mlp.score(X, y))
print(grid_mlp.best_params_)

0.9666474985623922
{'activation': 'relu', 'alpha': 0.25, 'hidden_layer_sizes': 50, 'learning_rate': 'invscaling', 'max_iter': 40, 'solver': 'lbfgs', 'tol': 1e-05}


In [42]:
predictions6 = grid_mlp.predict(X)

In [43]:
print(classification_report(y,predictions6))

                precision    recall  f1-score   support

     CONFIRMED       0.95      0.97      0.96      2102
FALSE POSITIVE       0.98      0.97      0.97      3115

   avg / total       0.97      0.97      0.97      5217



In [44]:
print(confusion_matrix(y,predictions6))

[[2029   73]
 [ 101 3014]]


In [45]:
from sklearn.model_selection import cross_validate
mlp_clf = MLPClassifier(activation='relu', alpha=0.25, hidden_layer_sizes=50, learning_rate='invscaling', max_iter=40, solver='lbfgs', tol=0.00001)
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores = cross_validate(mlp_clf, X, y, cv=ss, return_train_score=True, scoring='accuracy', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 95.8257 (+/- 0.88)
Train Accuracy: 97.06 (+/- 0.44)


**Top parameters but learning rate is 'constant' instead of 'invscaling'**

In [47]:
from sklearn.model_selection import cross_validate
mlp_clf = MLPClassifier(activation='relu', alpha=0.25, hidden_layer_sizes=50, learning_rate='constant', max_iter=40, solver='lbfgs', tol=0.00001)
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores = cross_validate(mlp_clf, X, y, cv=ss, return_train_score=True, scoring='accuracy', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 95.8065 (+/- 0.89)
Train Accuracy: 97.07 (+/- 0.42)


**GridSearch with best parameters and including default max_iter value (using only training data)**

In [55]:
parameters={'solver':['lbfgs'],'activation':['relu'],
            'hidden_layer_sizes':[50],'alpha':[0.25],
            'learning_rate':['invscaling'],
           'max_iter':[40,100,200],'tol':[0.00001]}
mlp = MLPClassifier()
ss = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=3)
grid_mlp = GridSearchCV(mlp, parameters, cv=ss, n_jobs=-1, verbose=3)
grid_mlp.fit(X_train,y_train)

Fitting 100 folds for each of 3 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   24.3s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.6min finished


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=100, random_state=3, test_size=0.3,
            train_size=None),
       error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'solver': ['lbfgs'], 'activation': ['relu'], 'hidden_layer_sizes': [50], 'alpha': [0.25], 'learning_rate': ['invscaling'], 'max_iter': [40, 100, 200], 'tol': [1e-05]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [56]:
print(grid_mlp.score(X, y))
print(grid_mlp.best_params_)

0.9674142227333716
{'activation': 'relu', 'alpha': 0.25, 'hidden_layer_sizes': 50, 'learning_rate': 'invscaling', 'max_iter': 40, 'solver': 'lbfgs', 'tol': 1e-05}


In [57]:
predictions7 = grid_mlp.predict(X)

In [58]:
print(classification_report(y,predictions7))

                precision    recall  f1-score   support

     CONFIRMED       0.95      0.97      0.96      2102
FALSE POSITIVE       0.98      0.97      0.97      3115

   avg / total       0.97      0.97      0.97      5217



In [59]:
from sklearn.model_selection import cross_validate
mlp_clf = MLPClassifier(activation='relu', alpha=0.25, hidden_layer_sizes=50, learning_rate='invscaling', max_iter=40, solver='lbfgs', tol=0.00001)
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores = cross_validate(mlp_clf, X_train, y_train, cv=ss, return_train_score=True, scoring='accuracy', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 95.6854 (+/- 1.12)
Train Accuracy: 97.41 (+/- 0.54)


**Changing the scoring to 'roc_auc'**

In [86]:
#the values for disposition were replaced with numbers to get the roc_auc method of scoring to run correctly

planets_v2 = planets.replace({'Exoplanet Archive Disposition':{'FALSE POSITIVE':0,'CONFIRMED':1}})
planets_v2.head()

Unnamed: 0,PRF Δθ<sub>SQ</sub>(KIC) [arcsec],Number of Planets,Planet-Star Radius Ratio,Planetary Radius [Earth radii],PRF Δθ<sub>SQ</sub>(OOT) [arcsec],Maximum Multiple Event Statistic,FW Offset Significance [percent],Planet-Star Distance over Star Radius,Transit Depth [ppm],Transit Signal-to-Noise,...,Stellar Metallicity [dex],FW Source Δα(OOT) [sec],Transit Duration [hrs],FW Source Δδ(OOT) [arcsec],Maximum Single Event Statistic,Fitted Stellar Density [g/cm**3],Orbit Semi-Major Axis [AU],Orbital Period [days],Insolation Flux [Earth flux],Exoplanet Archive Disposition
0,0.32,2,0.022344,2.26,0.2,28.47082,0.002,24.81,615.8,35.8,...,0.14,0.43,2.9575,0.94,5.135849,3.20796,0.0853,9.488036,93.59,1
1,0.5,2,0.027954,2.83,0.39,20.109507,0.003,77.9,874.8,25.8,...,0.14,-0.63,4.507,1.23,7.027669,3.02368,0.2734,54.418383,9.11,1
2,0.276,1,0.387394,33.46,0.289,541.8951,0.0,3.278,8079.2,505.6,...,-0.52,-0.111,2.40641,0.002,39.06655,0.2208,0.0267,1.736952,891.96,0
3,0.07,1,0.024064,2.75,0.1,33.1919,0.733,8.75,603.3,40.9,...,0.07,-0.01,1.6545,0.23,4.749945,1.98635,0.0374,2.525592,926.16,1
4,8.948,1,0.183387,39.21,8.93,46.15308,0.0,2.4,233.7,47.7,...,0.0,-13.45,5.022,24.09,10.964684,0.00485,0.082,7.36179,767.22,0


In [87]:
X = planets_v2.drop('Exoplanet Archive Disposition',axis=1)
y = planets_v2['Exoplanet Archive Disposition']

In [88]:
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [89]:
from sklearn.model_selection import cross_validate
mlp_clf = MLPClassifier(activation='relu', alpha=0.25, hidden_layer_sizes=50, learning_rate='invscaling', max_iter=40, solver='lbfgs', tol=0.00001)
ss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=3)
scores=cross_validate(mlp_clf, X, y, cv=ss, return_train_score=True, scoring='roc_auc', n_jobs=-1)

print("Validation Accuracy: %0.4f (+/- %0.2f)" % (scores['test_score'].mean()*100, scores['test_score'].std()*2*100))
print("Train Accuracy: %0.2f (+/- %0.2f)" % (scores['train_score'].mean()*100, scores['train_score'].std()*2*100))

Validation Accuracy: 98.9528 (+/- 0.38)
Train Accuracy: 99.48 (+/- 0.12)


**Discussion**

The accuracy score increased when changing scoring method from 'accuracy' to 'roc_auc'.

Each iteration of the gridsearch involved fine-tuned parameters. The score increased with each round, showing that the new parameters were better than the previous set.