In [57]:
import numpy as np
import pandas as pd




In [58]:
###import Kepler Data
kepler_data = pd.read_csv("KeplerData\kepler_koi_data.csv")
kepler_koi_data = pd.DataFrame(kepler_data)

### Clean up data by removing rows that are full 0s, links to images that do not contain calcuable data, or ID names that are arbitrary to the model
kepler_koi_data = kepler_koi_data.drop(['rowid',
                                        'kepler_name',
                                        'koi_vet_stat',
                                        'koi_vet_date',
                                        'koi_disp_prov',
                                        'koi_comment', 
                                        'koi_fpflag_ec', 
                                        'koi_longp', 
                                        'koi_ingress', 
                                        'koi_ldm_coeff4', 
                                        'koi_ldm_coeff3', 
                                        'koi_model_chisq', 
                                        'koi_trans_mod', 
                                        'koi_model_dof', 
                                        'koi_datalink_dvr', 
                                        'koi_datalink_dvs', 
                                        'koi_sparprov',
                                        'koi_fittype',
                                        'koi_limbdark_mod',
                                        'koi_parm_prov',
                                        'koi_tce_delivname',
                                        'koi_sage'], axis = 1)
print(kepler_koi_data)

         kepid kepoi_name koi_disposition koi_pdisposition  koi_score  \
0     10797460  K00752.01       CONFIRMED        CANDIDATE      1.000   
1     10797460  K00752.02       CONFIRMED        CANDIDATE      0.969   
2     10811496  K00753.01       CANDIDATE        CANDIDATE      0.000   
3     10848459  K00754.01  FALSE POSITIVE   FALSE POSITIVE      0.000   
4     10854555  K00755.01       CONFIRMED        CANDIDATE      1.000   
...        ...        ...             ...              ...        ...   
9559  10090151  K07985.01  FALSE POSITIVE   FALSE POSITIVE      0.000   
9560  10128825  K07986.01       CANDIDATE        CANDIDATE      0.497   
9561  10147276  K07987.01  FALSE POSITIVE   FALSE POSITIVE      0.021   
9562  10155286  K07988.01       CANDIDATE        CANDIDATE      0.092   
9563  10156110  K07989.01  FALSE POSITIVE   FALSE POSITIVE      0.000   

      koi_fpflag_nt  koi_fpflag_ss  koi_fpflag_co  koi_period  koi_time0bk  \
0                 0              0           

In [59]:
###import K2 Objects of Interest Data
k2_data = pd.read_csv("KeplerData\k2_oi_data.csv")

print(k2_data)

      rowid            pl_name        hostname pl_letter   k2_name  \
0         1        BD+20 594 b       BD+20 594         b   K2-56 b   
1         2        BD+20 594 b       BD+20 594         b   K2-56 b   
2         3        BD+20 594 b       BD+20 594         b   K2-56 b   
3         4  EPIC 201111557.01  EPIC 201111557       NaN       NaN   
4         5  EPIC 201111557.01  EPIC 201111557       NaN       NaN   
...     ...                ...             ...       ...       ...   
3512   3513        WASP-85 A b       WASP-85 A         b   K2-94 b   
3513   3514        WASP-85 A b       WASP-85 A         b   K2-94 b   
3514   3515        WASP-85 A b       WASP-85 A         b   K2-94 b   
3515   3516         Wolf 503 b        Wolf 503         b  K2-262 b   
3516   3517         Wolf 503 b        Wolf 503         b  K2-262 b   

       epic_hostname      epic_candname hd_name   hip_name         tic_id  \
0     EPIC 210848071  EPIC 210848071.01     NaN        NaN   TIC 26123781   
1    

In [60]:
na_values = kepler_koi_data.isna().sum() ### Count of NA Values within the dataframe ###
na_percentage = na_values/kepler_koi_data.shape[0] * 100
print("The missing percentage of data is: ", na_percentage)

The missing percentage of data is:  kepid                0.000000
kepoi_name           0.000000
koi_disposition      0.000000
koi_pdisposition     0.000000
koi_score           15.788373
                      ...    
koi_dicco_mdec       6.263070
koi_dicco_msky       6.263070
koi_dikco_mra        5.959849
koi_dikco_mdec       5.959849
koi_dikco_msky       5.959849
Length: 61, dtype: float64


In [61]:
### Separate Data Labels ###

kepler_index_list = kepler_koi_data['kepid']
kepler_koi_data_labels = kepler_koi_data['koi_pdisposition']
kepler_koi_data_no_labels = kepler_koi_data[kepler_koi_data.columns[4:]]

print(kepler_koi_data_no_labels)

      koi_score  koi_fpflag_nt  koi_fpflag_ss  koi_fpflag_co  koi_period  \
0         1.000              0              0              0    9.488036   
1         0.969              0              0              0   54.418383   
2         0.000              0              0              0   19.899140   
3         0.000              0              1              0    1.736952   
4         1.000              0              0              0    2.525592   
...         ...            ...            ...            ...         ...   
9559      0.000              0              1              1    0.527699   
9560      0.497              0              0              0    1.739849   
9561      0.021              0              0              1    0.681402   
9562      0.092              0              0              0  333.486169   
9563      0.000              0              0              1    4.856035   

      koi_time0bk    koi_time0  koi_eccen  koi_impact  koi_duration  ...  \
0      170.

In [62]:
### Calculate a reasonable number of neighbors for KNN ###
from math import sqrt

round(sqrt(9564),0)

98.0

In [63]:
from sklearn.impute import KNNImputer

### Use K-Nearest Neighbor to calculate for missing values within the dataset ###

neighborNumber = 98

knn_inputer = KNNImputer(n_neighbors=neighborNumber)

kepler_knn_inputer = knn_inputer.fit_transform(kepler_koi_data_no_labels)

print(kepler_knn_inputer)

[[ 1.00e+00  0.00e+00  0.00e+00 ...  8.00e-02  3.10e-01  3.20e-01]
 [ 9.69e-01  0.00e+00  0.00e+00 ...  4.90e-01  1.20e-01  5.00e-01]
 [ 0.00e+00  0.00e+00  0.00e+00 ...  2.00e-03 -2.70e-02  2.70e-02]
 ...
 [ 2.10e-02  0.00e+00  0.00e+00 ...  3.38e+00 -3.89e+00  5.16e+00]
 [ 9.20e-02  0.00e+00  0.00e+00 ...  1.45e+00  1.37e+00  2.00e+00]
 [ 0.00e+00  0.00e+00  0.00e+00 ...  9.60e-01  3.40e+00  3.50e+00]]


In [64]:
###Create a Column Name list to recreate Kepler Data Dataframe from the KNNInputer ML Algorithm ###
kepler_columns = list(kepler_koi_data_no_labels.columns.values)

In [65]:
###Convert KNN Array to Dataframe with proper Column Labels###
kepler_knn_inputer = pd.DataFrame(kepler_knn_inputer)
kepler_knn_inputer = kepler_knn_inputer.rename(columns = {0:'koi_score', 1:'koi_fpflag_nt', 2:'koi_fpflag_ss', 
                                                            3:'koi_fpflag_co', 4:'koi_period', 5:'koi_time0bk', 
                                                            6:'koi_time0', 7:'koi_eccen', 8:'koi_impact', 
                                                            9:'koi_duration', 10:'koi_depth', 11:'koi_ror', 
                                                            12:'koi_srho', 13:'koi_prad', 14:'koi_sma', 
                                                            15:'koi_incl', 16:'koi_teq', 17:'koi_insol', 
                                                            18:'koi_dor', 19:'koi_ldm_coeff2', 20:'koi_ldm_coeff1', 
                                                            21:'koi_max_sngle_ev', 22:'koi_max_mult_ev', 23:'koi_model_snr', 
                                                            24:'koi_count', 25:'koi_num_transits', 26:'koi_tce_plnt_num', 
                                                            27:'koi_quarters', 28:'koi_bin_oedp_sig', 29:'koi_steff', 
                                                            30:'koi_slogg', 31:'koi_smet', 32:'koi_srad', 
                                                            33:'koi_smass', 34:'ra', 35:'dec', 36:'koi_kepmag', 
                                                            37:'koi_gmag', 38:'koi_rmag', 39:'koi_imag', 
                                                            40:'koi_zmag', 41:'koi_jmag', 42:'koi_hmag', 
                                                            43:'koi_kmag', 44:'koi_fwm_stat_sig', 45:'koi_fwm_sra', 
                                                            46:'koi_fwm_sdec', 47:'koi_fwm_srao', 48:'koi_fwm_sdeco', 
                                                            49:'koi_fwm_prao', 50:'koi_fwm_pdeco', 51:'koi_dicco_mra', 
                                                            52:'koi_dicco_mdec', 53:'koi_dicco_msky', 54:'koi_dikco_mra', 
                                                            55:'koi_dikco_mdec', 56:'koi_dikco_msky'})
print(kepler_knn_inputer)

      koi_score  koi_fpflag_nt  koi_fpflag_ss  koi_fpflag_co  koi_period  \
0         1.000            0.0            0.0            0.0    9.488036   
1         0.969            0.0            0.0            0.0   54.418383   
2         0.000            0.0            0.0            0.0   19.899140   
3         0.000            0.0            1.0            0.0    1.736952   
4         1.000            0.0            0.0            0.0    2.525592   
...         ...            ...            ...            ...         ...   
9559      0.000            0.0            1.0            1.0    0.527699   
9560      0.497            0.0            0.0            0.0    1.739849   
9561      0.021            0.0            0.0            1.0    0.681402   
9562      0.092            0.0            0.0            0.0  333.486169   
9563      0.000            0.0            0.0            1.0    4.856035   

      koi_time0bk    koi_time0  koi_eccen  koi_impact  koi_duration  ...  \
0      170.

In [66]:
### One-Hot Encode Data Labels ###
kepler_koi_data_labels = pd.get_dummies(kepler_koi_data_labels, columns = ['koi_pdisposition'], drop_first = False)
print (kepler_koi_data_labels)

      CANDIDATE  FALSE POSITIVE
0             1               0
1             1               0
2             1               0
3             0               1
4             1               0
...         ...             ...
9559          0               1
9560          1               0
9561          0               1
9562          1               0
9563          0               1

[9564 rows x 2 columns]


In [67]:
from sklearn.model_selection import train_test_split

### Split Data into Training, Testing and Validation Sets ###

X_train_full, X_test, y_train_full, y_test = train_test_split(kepler_knn_inputer, kepler_koi_data_labels, train_size = 0.90, random_state = 42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, train_size = 0.89, random_state = 42)

In [68]:
import tensorflow as tf
from tensorflow import keras

In [69]:
np.random.seed(42)
tf.random.set_seed(42)

In [70]:
### Create and Run 1st Model. Start with approximately half as many neurons as Data Rows and then half the neurons of the following 3 layers ###

model = keras.models.Sequential([keras.layers.Dense(4500, activation = 'sigmoid', input_shape = X_train.shape[1:]),
                                keras.layers.Dense(2250, activation = 'sigmoid'),
                                keras.layers.Dense(1125, activation = 'relu'),
                                keras.layers.Dense(665, activation = 'relu'),
                                keras.layers.Dense(2, activation = 'softmax')
                                ])

model.compile(loss = "mean_squared_error", optimizer=keras.optimizers.SGD(learning_rate=0.01), metrics = ['accuracy'])

model.fit(X_train, y_train, epochs = 100, validation_data = (X_valid, y_valid), callbacks = [keras.callbacks.EarlyStopping(patience = 10)])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100


<keras.callbacks.History at 0x1950fe85910>

In [71]:
### Create Model 2 by changing the Activation ###
model2 = keras.models.Sequential([keras.layers.Dense(4500, activation = 'relu', input_shape = X_train.shape[1:]),
                                keras.layers.Dense(2250, activation = 'relu'),
                                keras.layers.Dense(1125, activation = 'relu'),
                                keras.layers.Dense(665, activation = 'relu'),
                                keras.layers.Dense(2, activation = 'softmax')
                                ])

model2.compile(loss = "mean_squared_error", optimizer=keras.optimizers.SGD(learning_rate=0.01), metrics = ['accuracy'])

model2.fit(X_train, y_train, epochs = 50, validation_data = (X_valid, y_valid), callbacks = [keras.callbacks.EarlyStopping(patience = 10)]) #Not the model to use

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50


<keras.callbacks.History at 0x19512edc760>

In [72]:
### Create 3rd model ###

model3 = keras.models.Sequential([keras.layers.Dense(4500, activation = 'sigmoid', input_shape = X_train.shape[1:]),
                                keras.layers.Dense(2250, activation = 'sigmoid'),
                                keras.layers.Dense(1125, activation = 'sigmoid'),
                                keras.layers.Dense(665, activation = 'sigmoid'),
                                keras.layers.Dense(2, activation = 'softmax')
                                ])

model3.compile(loss = "mean_squared_error", optimizer=keras.optimizers.SGD(learning_rate=0.01), metrics = ['accuracy'])

model3.fit(X_train, y_train, epochs = 50, validation_data = (X_valid, y_valid), callbacks = [keras.callbacks.EarlyStopping(patience = 10)])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50


<keras.callbacks.History at 0x195151d1e80>

In [73]:
### Create a Reduced Dataset from Data Types that are equal to K2 Data Types ###

kepler_knn_inputer_reduced = kepler_knn_inputer[['koi_eccen',
                                                'koi_teq',
                                                'koi_impact',
                                                'koi_incl',
                                                'koi_insol',
                                                'koi_count',
                                                'koi_period',
                                                'koi_steff',
                                                'koi_smass',
                                                'koi_smet',
                                                'koi_slogg']]

print(kepler_knn_inputer_reduced)

      koi_eccen  koi_teq  koi_impact  koi_incl  koi_insol  koi_count  \
0           0.0    793.0       0.146     89.66      93.59        2.0   
1           0.0    443.0       0.586     89.57       9.11        2.0   
2           0.0    638.0       0.969     88.96      39.30        1.0   
3           0.0   1395.0       1.276     67.09     891.96        1.0   
4           0.0   1406.0       0.701     85.41     926.16        1.0   
...         ...      ...         ...       ...        ...        ...   
9559        0.0   2088.0       1.252     20.78    4500.53        1.0   
9560        0.0   1608.0       0.043     89.42    1585.81        1.0   
9561        0.0   2218.0       0.147     88.60    5713.41        1.0   
9562        0.0    557.0       0.214     89.98      22.68        1.0   
9563        0.0   1266.0       0.134     89.36     607.42        1.0   

      koi_period  koi_steff  koi_smass  koi_smet  koi_slogg  
0       9.488036     5455.0      0.919      0.14      4.467  
1      54.4

In [74]:
X_train_full2, X_test2, y_train_full2, y_test2 = train_test_split(kepler_knn_inputer_reduced, kepler_koi_data_labels, train_size = 0.75, random_state = 42)
X_train2, X_valid2, y_train2, y_valid2 = train_test_split(X_train_full2, y_train_full2, train_size = 0.80, random_state = 42)

In [75]:
### Repeat of Model 1 with the Reduced Dataset ###

model4 = keras.models.Sequential([keras.layers.Dense(4500, activation = 'sigmoid'),
                                keras.layers.Dense(2250, activation = 'sigmoid'),
                                keras.layers.Dense(1125, activation = 'relu'),
                                keras.layers.Dense(665, activation = 'relu'),
                                keras.layers.Dense(2, activation = 'softmax')
                                ])

model4.compile(loss = "mean_squared_error", optimizer=keras.optimizers.SGD(learning_rate=0.01), metrics = ['accuracy'])

model4.fit(X_train2, y_train2, epochs = 50, validation_data = (X_valid2, y_valid2), callbacks = [keras.callbacks.EarlyStopping(patience = 10)])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50


<keras.callbacks.History at 0x195151bc850>

In [76]:
### Repeat of Model 2 with the Reduced Dataset ###

model5 = keras.models.Sequential([keras.layers.Dense(4500, activation = 'relu'),
                                keras.layers.Dense(2250, activation = 'relu'),
                                keras.layers.Dense(1125, activation = 'relu'),
                                keras.layers.Dense(665, activation = 'relu'),
                                keras.layers.Dense(2, activation = 'softmax')
                                ])

model5.compile(loss = "mean_squared_error", optimizer=keras.optimizers.SGD(learning_rate=0.01), metrics = ['accuracy'])

model5.fit(X_train2, y_train2, epochs = 50, validation_data = (X_valid2, y_valid2), callbacks = [keras.callbacks.EarlyStopping(patience = 10)])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50


<keras.callbacks.History at 0x195240e4f10>

In [77]:
### Repeat of Model 3 with the Reduced Dataset ###

model6 = keras.models.Sequential([keras.layers.Dense(4500, activation = 'sigmoid'),
                                keras.layers.Dense(2250, activation = 'sigmoid'),
                                keras.layers.Dense(1125, activation = 'sigmoid'),
                                keras.layers.Dense(665, activation = 'sigmoid'),
                                keras.layers.Dense(2, activation = 'softmax')
                                ])

model6.compile(loss = "mean_squared_error", optimizer=keras.optimizers.SGD(learning_rate=0.01), metrics = ['accuracy'])

model6.fit(X_train2, y_train2, epochs = 50, validation_data = (X_valid2, y_valid2), callbacks = [keras.callbacks.EarlyStopping(patience = 10)])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50


<keras.callbacks.History at 0x1952221f190>

In [78]:
model.evaluate(X_test, y_test)



[0.25129643082618713, 0.5005224943161011]

In [79]:
model2.evaluate(X_test, y_test)



[0.4994775354862213, 0.5005224943161011]

In [80]:
model3.evaluate(X_test, y_test)



[0.25430020689964294, 0.5005224943161011]

In [81]:
model4.evaluate(X_test2, y_test2)



[0.22386498749256134, 0.654119610786438]

In [82]:
model5.evaluate(X_test2, y_test2)



[0.5014638304710388, 0.4985361695289612]

In [83]:
model6.evaluate(X_test2, y_test2)



[0.21852503716945648, 0.6545378565788269]

In [84]:
### Create the K2 Dataset similar to the Reduced Kepler Dataset ###

k2_prep_data = k2_data[['pl_orbeccen',
                        'pl_eqt',
                        'pl_imppar',
                        'pl_orbincl',
                        'pl_insol',
                        'sy_pnum',
                        'pl_orbper',
                        'st_teff',
                        'st_mass',
                        'st_met',
                        'st_logg']]

print(k2_prep_data)

      pl_orbeccen  pl_eqt  pl_imppar  pl_orbincl  pl_insol  sy_pnum  \
0            0.00     NaN        NaN      89.550       NaN        1   
1            0.00   546.0        NaN      89.550       NaN        1   
2             NaN     NaN        NaN      89.526       NaN        1   
3             NaN     NaN        NaN      87.444       NaN        0   
4             NaN  1054.0      0.420         NaN       NaN        0   
...           ...     ...        ...         ...       ...      ...   
3512         0.00  1452.0      0.047      89.690       NaN        1   
3513          NaN     NaN        NaN         NaN       NaN        1   
3514          NaN     NaN        NaN         NaN       NaN        1   
3515         0.41   790.0      0.650         NaN       NaN        1   
3516          NaN   805.0      0.387         NaN      69.6        1   

      pl_orbper  st_teff  st_mass  st_met  st_logg  
0     41.685500  5766.00     1.67  -0.150     4.50  
1     41.685500  5766.00     0.96  -0.150

In [85]:
### Calculate an appropriate n_neighbors value for KNN of K2 Dataset ###

k2_n_neighbors = round(sqrt(3517),0)
print(k2_n_neighbors)

59.0


In [86]:
k2neighborNumber = 59

knn_inputer = KNNImputer(n_neighbors=k2neighborNumber)

k2_knn_inputer = knn_inputer.fit_transform(k2_prep_data)

print(k2_knn_inputer)

[[ 0.00000000e+00  9.03355932e+02  4.78559322e-01 ...  1.67000000e+00
  -1.50000000e-01  4.50000000e+00]
 [ 0.00000000e+00  5.46000000e+02  5.00288136e-01 ...  9.60000000e-01
  -1.50000000e-01  4.50000000e+00]
 [ 8.40313559e-02  8.66694915e+02  4.78203390e-01 ...  9.60000000e-01
  -6.00000000e-02  4.38000000e+00]
 ...
 [ 1.23882203e-01  9.47457627e+02  4.99559322e-01 ...  7.65084746e-01
  -8.76271186e-03  4.50000000e+00]
 [ 4.10000000e-01  7.90000000e+02  6.50000000e-01 ...  6.90000000e-01
  -4.70000000e-01  4.62000000e+00]
 [ 1.16817797e-01  8.05000000e+02  3.87000000e-01 ...  6.90000000e-01
  -4.70000000e-01  4.62000000e+00]]


In [87]:
### Create K2 Dataset with no NaN values from KNN Inputer Algorithm ###

k2_knn_inputer = pd.DataFrame(k2_knn_inputer, columns = ['pl_orbeccen',
                                                        'pl_eqt',
                                                        'pl_imppar',
                                                        'pl_orbincl',
                                                        'pl_insol',
                                                        'sy_pnum',
                                                        'pl_orbper',
                                                        'st_teff',
                                                        'st_mass',
                                                        'st_met',
                                                        'st_logg'])

print(k2_knn_inputer)

      pl_orbeccen       pl_eqt  pl_imppar  pl_orbincl    pl_insol  sy_pnum  \
0        0.000000   903.355932   0.478559   89.550000  438.145254      1.0   
1        0.000000   546.000000   0.500288   89.550000  316.306780      1.0   
2        0.084031   866.694915   0.478203   89.526000  342.801186      1.0   
3        0.081611   823.440678   0.608966   87.444000  136.744746      0.0   
4        0.106272  1054.000000   0.420000   86.697153  154.878983      0.0   
...           ...          ...        ...         ...         ...      ...   
3512     0.000000  1452.000000   0.047000   89.690000  517.530508      1.0   
3513     0.097709   913.067797   0.520203   88.087169  284.330000      1.0   
3514     0.123882   947.457627   0.499559   88.163729  253.031356      1.0   
3515     0.410000   790.000000   0.650000   86.963898  115.297797      1.0   
3516     0.116818   805.000000   0.387000   86.910576   69.600000      1.0   

      pl_orbper  st_teff   st_mass    st_met   st_logg  
0     

In [88]:
### Reorder K2 Dataset to Match Kepler Dataset order for Neural Network Models ###

k2_knn_inputer = k2_knn_inputer[['sy_pnum',
                                'pl_orbper',
                                'pl_orbeccen',
                                'pl_insol',
                                'pl_eqt',
                                'pl_orbincl',
                                'pl_imppar',
                                'st_teff',
                                'st_mass',
                                'st_met',
                                'st_logg',]]

print(k2_knn_inputer)

      sy_pnum  pl_orbper  pl_orbeccen    pl_insol       pl_eqt  pl_orbincl  \
0         1.0  41.685500     0.000000  438.145254   903.355932   89.550000   
1         1.0  41.685500     0.000000  316.306780   546.000000   89.550000   
2         1.0  41.688644     0.084031  342.801186   866.694915   89.526000   
3         0.0   2.302368     0.081611  136.744746   823.440678   87.444000   
4         0.0   2.301830     0.106272  154.878983  1054.000000   86.697153   
...       ...        ...          ...         ...          ...         ...   
3512      1.0   2.655678     0.000000  517.530508  1452.000000   89.690000   
3513      1.0   2.656627     0.097709  284.330000   913.067797   88.087169   
3514      1.0   2.655682     0.123882  253.031356   947.457627   88.163729   
3515      1.0   6.001270     0.410000  115.297797   790.000000   86.963898   
3516      1.0   6.001180     0.116818   69.600000   805.000000   86.910576   

      pl_imppar  st_teff   st_mass    st_met   st_logg  
0     

In [89]:
### Transpose Model output into DataFrame ###

k2_model4_output = pd.DataFrame(model4.predict(k2_knn_inputer), columns = ['Confirmed', 'False Positive'])
print(k2_model4_output)

      Confirmed  False Positive
0      0.518109        0.481891
1      0.532657        0.467343
2      0.520652        0.479348
3      0.501157        0.498843
4      0.467103        0.532897
...         ...             ...
3512   0.455974        0.544026
3513   0.518111        0.481889
3514   0.509082        0.490918
3515   0.506479        0.493521
3516   0.499644        0.500356

[3517 rows x 2 columns]


In [90]:
### Transpose Model output into DataFrame ###

k2_model6_output = pd.DataFrame(model6.predict(k2_knn_inputer), columns = ['Confirmed', 'False Positive'])
print(k2_model6_output)

      Confirmed  False Positive
0      0.550586        0.449414
1      0.575936        0.424064
2      0.554432        0.445568
3      0.546288        0.453712
4      0.498383        0.501617
...         ...             ...
3512   0.486657        0.513343
3513   0.550052        0.449948
3514   0.544371        0.455629
3515   0.549534        0.450466
3516   0.547752        0.452248

[3517 rows x 2 columns]


In [91]:
### Create a Function that Encodes Values into Labels for the Model Outputs ###

def exoplanet_classification(dataframe):
    col_1 = dataframe['Confirmed']
    col_2 = dataframe['False Positive']
    
    out_col_1 = []
    out_col_2 = []

    for x in col_1:
        if x >= 0.528:
            out_col_1.append("Confirmed")
        else:
            out_col_1.append("Candidate")

    for y in col_2:
        if y >= 0.535:
            out_col_2.append("False Positive")
        else:
            out_col_2.append("Candidate")
       
    
    return out_col_1, out_col_2

In [92]:
### Apply Classification Function ###

k2_mod4_conf, k2_mod4_fp = exoplanet_classification(k2_model4_output)

k2_mod4_conf = pd.DataFrame(k2_mod4_conf, columns = ['Confirmed'])
k2_mod4_fp = pd.DataFrame(k2_mod4_fp, columns = ['False Positive'])

k2_mod4_full = pd.concat([k2_mod4_conf, k2_mod4_fp], axis = 1)

print(k2_mod4_full)

      Confirmed  False Positive
0     Candidate       Candidate
1     Confirmed       Candidate
2     Candidate       Candidate
3     Candidate       Candidate
4     Candidate       Candidate
...         ...             ...
3512  Candidate  False Positive
3513  Candidate       Candidate
3514  Candidate       Candidate
3515  Candidate       Candidate
3516  Candidate       Candidate

[3517 rows x 2 columns]


In [93]:
### Apply Classification Function ###

k2_mod6_conf, k2_mod6_fp = exoplanet_classification(k2_model6_output)

k2_mod6_conf = pd.DataFrame(k2_mod6_conf, columns = ['Confirmed'])
k2_mod6_fp = pd.DataFrame(k2_mod6_fp, columns = ['False Positive'])

k2_mod6_full = pd.concat([k2_mod6_conf, k2_mod6_fp], axis = 1)

print(k2_mod6_full)

      Confirmed False Positive
0     Confirmed      Candidate
1     Confirmed      Candidate
2     Confirmed      Candidate
3     Confirmed      Candidate
4     Candidate      Candidate
...         ...            ...
3512  Candidate      Candidate
3513  Confirmed      Candidate
3514  Confirmed      Candidate
3515  Confirmed      Candidate
3516  Confirmed      Candidate

[3517 rows x 2 columns]


In [94]:
### Create a Function to Combine 2 Columns into 1, then Create another function to reduce the combined values into a single label ###

def column_reduction(dataframe):
    trans_col = dataframe['Confirmed'] + ',' + dataframe['False Positive']
    return trans_col

def entry_reduction(dataframe):
    output_col = []
    for x in dataframe:
        if x == ("Confirmed,Candidate"):
            output_col.append("CONFIRMED")
        elif x == ("Candidate,False Positive"):
            output_col.append("FALSE POSITIVE")
        else:
            output_col.append("CANDIDATE")

    return output_col


In [95]:
### Apply Column Reduction and Label Functions to Model Outputs ###

k2_mod4_trans = column_reduction(k2_mod4_full)
k2_mod4_for_class = pd.DataFrame(entry_reduction(k2_mod4_trans), columns = ['Class'])
k2_mod4_output = pd.DataFrame(entry_reduction(k2_mod4_trans), columns = ['disposition'])


print(k2_mod4_for_class)

               Class
0          CANDIDATE
1          CONFIRMED
2          CANDIDATE
3          CANDIDATE
4          CANDIDATE
...              ...
3512  FALSE POSITIVE
3513       CANDIDATE
3514       CANDIDATE
3515       CANDIDATE
3516       CANDIDATE

[3517 rows x 1 columns]


In [96]:
### Apply Column Reduction and Label Functions to Model Outputs ###

k2_mod6_trans = column_reduction(k2_mod6_full)
k2_mod6_for_class = pd.DataFrame(entry_reduction(k2_mod6_trans), columns = ['Class'])
k2_mod6_output = pd.DataFrame(entry_reduction(k2_mod6_trans), columns = ['disposition'])


print(k2_mod6_for_class)

          Class
0     CONFIRMED
1     CONFIRMED
2     CONFIRMED
3     CONFIRMED
4     CANDIDATE
...         ...
3512  CANDIDATE
3513  CONFIRMED
3514  CONFIRMED
3515  CONFIRMED
3516  CONFIRMED

[3517 rows x 1 columns]


In [97]:
k2_labels = k2_data['disposition']
k2_conf = k2_labels.to_frame()
print(k2_conf)

     disposition
0      CONFIRMED
1      CONFIRMED
2      CONFIRMED
3      CANDIDATE
4      CANDIDATE
...          ...
3512   CONFIRMED
3513   CONFIRMED
3514   CONFIRMED
3515   CONFIRMED
3516   CONFIRMED

[3517 rows x 1 columns]


In [98]:
### Compare the Model 4 output of the K2 data to the original K2 Labels ###

k2_mod4_output.compare(k2_conf)


Unnamed: 0_level_0,disposition,disposition
Unnamed: 0_level_1,self,other
0,CANDIDATE,CONFIRMED
2,CANDIDATE,CONFIRMED
8,FALSE POSITIVE,CANDIDATE
10,FALSE POSITIVE,CANDIDATE
11,FALSE POSITIVE,CANDIDATE
...,...,...
3512,FALSE POSITIVE,CONFIRMED
3513,CANDIDATE,CONFIRMED
3514,CANDIDATE,CONFIRMED
3515,CANDIDATE,CONFIRMED


In [99]:
### Compare the Model 6 output of the K2 data to the original K2 Labels ###

k2_mod6_output.compare(k2_conf)


Unnamed: 0_level_0,disposition,disposition
Unnamed: 0_level_1,self,other
3,CONFIRMED,CANDIDATE
5,CONFIRMED,CANDIDATE
6,CONFIRMED,CANDIDATE
7,CONFIRMED,CANDIDATE
9,CONFIRMED,CANDIDATE
...,...,...
3503,CANDIDATE,CONFIRMED
3507,CANDIDATE,CONFIRMED
3508,CANDIDATE,CONFIRMED
3510,CANDIDATE,CONFIRMED


In [100]:
### Create a Function that would combine any Confirmed or False Positive Values from the original K2 Labels into the Model Outputs ###

def disposition_adjustment(dataframe):
    output_col = []
    for x in dataframe['Classification']:
        if x == ("CANDIDATE,CONFIRMED"):
            output_col.append("CONFIRMED")
        elif x == ("CONFIRMED,CANDIDATE"):
            output_col.append("CONFIRMED")
        elif x == ("CANDIDATE,FALSE POSITIVE"):
            output_col.append("FALSE POSITIVE")
        elif x == ("FALSE POSITIVE,CONDIDATE"):
            output_col.append("FALSE POSITIVE")
        elif x == ("CONFIRMED,CONFIRMED"):
            output_col.append("CONFIRMED")
        elif x == ("FALSE POSITIVE,FALSE POSITIVE"):
            output_col.append("FALSE POSITIVE")
        else:
            output_col.append("CANDIDATE")

    return output_col

In [101]:
### Combine Model 4 Output with K2 Labels ###

k2_mod4_class_trans = k2_mod4_for_class['Class'] + ',' + k2_conf['disposition']
k2_mod4_class_trans = pd.DataFrame(k2_mod4_class_trans)
k2_mod4_class_trans.columns = ['Classification']
print(k2_mod4_class_trans)

                Classification
0          CANDIDATE,CONFIRMED
1          CONFIRMED,CONFIRMED
2          CANDIDATE,CONFIRMED
3          CANDIDATE,CANDIDATE
4          CANDIDATE,CANDIDATE
...                        ...
3512  FALSE POSITIVE,CONFIRMED
3513       CANDIDATE,CONFIRMED
3514       CANDIDATE,CONFIRMED
3515       CANDIDATE,CONFIRMED
3516       CANDIDATE,CONFIRMED

[3517 rows x 1 columns]


In [102]:
### Combine Model 6 Output with K2 Labels ###

k2_mod6_class_trans = k2_mod6_for_class['Class'] + ',' + k2_conf['disposition']
k2_mod6_class_trans = pd.DataFrame(k2_mod6_class_trans)
k2_mod6_class_trans.columns = ['Classification']
print(k2_mod6_class_trans)

           Classification
0     CONFIRMED,CONFIRMED
1     CONFIRMED,CONFIRMED
2     CONFIRMED,CONFIRMED
3     CONFIRMED,CANDIDATE
4     CANDIDATE,CANDIDATE
...                   ...
3512  CANDIDATE,CONFIRMED
3513  CONFIRMED,CONFIRMED
3514  CONFIRMED,CONFIRMED
3515  CONFIRMED,CONFIRMED
3516  CONFIRMED,CONFIRMED

[3517 rows x 1 columns]


In [103]:
### Apply the Label Combine Function from Above with the Model 4 Output ###

k2_model4_adjusted = disposition_adjustment(k2_mod4_class_trans)
k2_model4_adjusted = pd.DataFrame(k2_model4_adjusted)
k2_model4_adjusted.columns = ['disposition']

print(k2_model4_adjusted)

     disposition
0      CONFIRMED
1      CONFIRMED
2      CONFIRMED
3      CANDIDATE
4      CANDIDATE
...          ...
3512   CANDIDATE
3513   CONFIRMED
3514   CONFIRMED
3515   CONFIRMED
3516   CONFIRMED

[3517 rows x 1 columns]


In [104]:
### Apply the Label Combine Function from Above with the Model 6 Output ###

k2_model6_adjusted = disposition_adjustment(k2_mod6_class_trans)
k2_model6_adjusted = pd.DataFrame(k2_model6_adjusted)
k2_model6_adjusted.columns = ['disposition']

print(k2_model6_adjusted)

     disposition
0      CONFIRMED
1      CONFIRMED
2      CONFIRMED
3      CONFIRMED
4      CANDIDATE
...          ...
3512   CONFIRMED
3513   CONFIRMED
3514   CONFIRMED
3515   CONFIRMED
3516   CONFIRMED

[3517 rows x 1 columns]


In [105]:
k2_conf.compare(k2_model4_adjusted)

Unnamed: 0_level_0,disposition,disposition
Unnamed: 0_level_1,self,other
18,CANDIDATE,CONFIRMED
41,CANDIDATE,CONFIRMED
49,CANDIDATE,CONFIRMED
51,CANDIDATE,CONFIRMED
54,CANDIDATE,CONFIRMED
...,...,...
3460,CONFIRMED,CANDIDATE
3502,CONFIRMED,CANDIDATE
3507,CONFIRMED,CANDIDATE
3510,CONFIRMED,CANDIDATE


In [106]:
k2_conf.compare(k2_model6_adjusted)

Unnamed: 0_level_0,disposition,disposition
Unnamed: 0_level_1,self,other
3,CANDIDATE,CONFIRMED
5,CANDIDATE,CONFIRMED
6,CANDIDATE,CONFIRMED
7,CANDIDATE,CONFIRMED
9,CANDIDATE,CONFIRMED
...,...,...
3441,CONFIRMED,CANDIDATE
3442,CONFIRMED,CANDIDATE
3443,CONFIRMED,CANDIDATE
3458,CONFIRMED,CANDIDATE
