
1. Data source: https://www.kaggle.com/nasa/kepler-exoplanet-search-results
2. Column explenation (not complete): https://keplerexoplanets.readthedocs.io/en/latest/dataset.html
3. Column explenation (from a repo): https://github.com/bnarath/find_Exoplanet
4. Column explenation (NASA web): https://exoplanetarchive.ipac.caltech.edu/docs/API_kepcandidate_columns.html
5. Exoplanet explenation: https://exoplanets.nasa.gov/what-is-an-exoplanet/overview/



In [71]:
import pandas as pd
import numpy as np


import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score 

import graphviz

sns.set()
%matplotlib inline

In [72]:
df = pd.read_csv('cumulative.csv', sep=',')
df.head()

Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,3,10811496,K00753.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,4,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


## Inspect and Explore the Data

##### We start with over 9,000 rows

In [73]:
df.shape

(9564, 50)

##### The non-number columns are the names of the KOI (2 columns), the label for each KOI (2 columns) and the TCE delivery name (column 37)

In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9564 entries, 0 to 9563
Data columns (total 50 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   rowid              9564 non-null   int64  
 1   kepid              9564 non-null   int64  
 2   kepoi_name         9564 non-null   object 
 3   kepler_name        2294 non-null   object 
 4   koi_disposition    9564 non-null   object 
 5   koi_pdisposition   9564 non-null   object 
 6   koi_score          8054 non-null   float64
 7   koi_fpflag_nt      9564 non-null   int64  
 8   koi_fpflag_ss      9564 non-null   int64  
 9   koi_fpflag_co      9564 non-null   int64  
 10  koi_fpflag_ec      9564 non-null   int64  
 11  koi_period         9564 non-null   float64
 12  koi_period_err1    9110 non-null   float64
 13  koi_period_err2    9110 non-null   float64
 14  koi_time0bk        9564 non-null   float64
 15  koi_time0bk_err1   9110 non-null   float64
 16  koi_time0bk_err2   9110 

In [75]:
#see the values of the koi_tce_delivname column
#df['koi_tce_delivname'].value_counts()

##### Remove the following columns as they don't give us any computational power. We remove the koi_score column because it represents the confidence level that a KOI (Kepler Object of Interest) is a planet.

In [76]:
df = df.drop(['rowid','kepoi_name','kepler_name','koi_pdisposition','koi_score','koi_tce_delivname'], 1)
df.head()

Unnamed: 0,kepid,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,10797460,CONFIRMED,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,10797460,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,10811496,FALSE POSITIVE,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,10848459,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,10854555,CONFIRMED,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [77]:
df.shape

(9564, 44)

##### Our goal is to compute if a KOI is or is not a planet. The koi_disposition column has three values which are the following:

In [78]:
df['koi_disposition'].value_counts()

FALSE POSITIVE    5023
CONFIRMED         2293
CANDIDATE         2248
Name: koi_disposition, dtype: int64

##### The CONFIRMED KOI are the planets that have passed all the Kepler's test and are verified by NASA's database. The FALSE POSSITIVE KOI is an object which is NOT a planet. The CANDIDATE KOI is a object that has not yet passed all the Kepler's test.

In [79]:
df.shape

(9564, 44)

##### In this model we will remove the CANDIDATE KOIs (all 2248 of them)

In [80]:
df.drop(df[df['koi_disposition'] == 'CANDIDATE'].index, inplace = True)
df.shape

(7316, 44)

In [81]:
9564 - 2248

7316

##### Now, we create a dummy table for column 'koi_disposition'. 

In [82]:
df = pd.get_dummies(df, 'koi_disposition', drop_first=True)
df.rename(columns = {'koi_disposition_FALSE POSITIVE':'koi_false_positive'}, inplace = True)
df.head()

Unnamed: 0,kepid,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag,koi_false_positive
0,10797460,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,0
1,10797460,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,0
2,10811496,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436,1
3,10848459,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597,1
4,10854555,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509,0


##### Check for duplicated rows and columns that have null values

In [83]:
df.duplicated().sum()

0

##### Notice that columns 'koi_teq_err1' and 'koi_teq_err2' are empty. Also the number of null values are oddly the same in many columns, these null values might be on the same rows.
##### Assumtion one: 
##### There is a possibility that they could be on the FALSE POSITIVE KOIs since we can assume that: the observation of a KOI has terminated after that KOI did not pass any of the test, hence leaving the other columns with null values.

In [84]:
df.isnull().sum()

kepid                    0
koi_fpflag_nt            0
koi_fpflag_ss            0
koi_fpflag_co            0
koi_fpflag_ec            0
koi_period               0
koi_period_err1        377
koi_period_err2        377
koi_time0bk              0
koi_time0bk_err1       377
koi_time0bk_err2       377
koi_impact             300
koi_impact_err1        377
koi_impact_err2        377
koi_duration             0
koi_duration_err1      377
koi_duration_err2      377
koi_depth              300
koi_depth_err1         377
koi_depth_err2         377
koi_prad               300
koi_prad_err1          300
koi_prad_err2          300
koi_teq                300
koi_teq_err1          7316
koi_teq_err2          7316
koi_insol              259
koi_insol_err1         259
koi_insol_err2         259
koi_model_snr          300
koi_tce_plnt_num       279
koi_steff              300
koi_steff_err1         397
koi_steff_err2         412
koi_slogg              300
koi_slogg_err1         397
koi_slogg_err2         397
k

In [85]:
df = df.drop(['koi_teq_err1', 'koi_teq_err2'], 1)
df.head()

Unnamed: 0,kepid,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag,koi_false_positive
0,10797460,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,0
1,10797460,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,0
2,10811496,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436,1
3,10848459,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597,1
4,10854555,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509,0


##### The above assumtion holds true on the part that these null values are _mostly_ on the FALSE POSITIVE columns. Note, however, that there seems to be a CONFIRMED KOI which could have these columns null. 

In [86]:
cond = df['koi_false_positive'] == 1
df[cond].isnull().sum()

kepid                   0
koi_fpflag_nt           0
koi_fpflag_ss           0
koi_fpflag_co           0
koi_fpflag_ec           0
koi_period              0
koi_period_err1       376
koi_period_err2       376
koi_time0bk             0
koi_time0bk_err1      376
koi_time0bk_err2      376
koi_impact            299
koi_impact_err1       376
koi_impact_err2       376
koi_duration            0
koi_duration_err1     376
koi_duration_err2     376
koi_depth             299
koi_depth_err1        376
koi_depth_err2        376
koi_prad              299
koi_prad_err1         299
koi_prad_err2         299
koi_teq               299
koi_insol             258
koi_insol_err1        258
koi_insol_err2        258
koi_model_snr         299
koi_tce_plnt_num      271
koi_steff             299
koi_steff_err1        396
koi_steff_err2        398
koi_slogg             299
koi_slogg_err1        396
koi_slogg_err2        396
koi_srad              299
koi_srad_err1         396
koi_srad_err2         396
ra          

##### Assumption 2:
##### There might be a CONFIRMED KOI with most featuers null. This KOI could be an error.

In [87]:
c1 = df['koi_false_positive'] == 0
df[c1].isnull().sum()

kepid                  0
koi_fpflag_nt          0
koi_fpflag_ss          0
koi_fpflag_co          0
koi_fpflag_ec          0
koi_period             0
koi_period_err1        1
koi_period_err2        1
koi_time0bk            0
koi_time0bk_err1       1
koi_time0bk_err2       1
koi_impact             1
koi_impact_err1        1
koi_impact_err2        1
koi_duration           0
koi_duration_err1      1
koi_duration_err2      1
koi_depth              1
koi_depth_err1         1
koi_depth_err2         1
koi_prad               1
koi_prad_err1          1
koi_prad_err2          1
koi_teq                1
koi_insol              1
koi_insol_err1         1
koi_insol_err2         1
koi_model_snr          1
koi_tce_plnt_num       8
koi_steff              1
koi_steff_err1         1
koi_steff_err2        14
koi_slogg              1
koi_slogg_err1         1
koi_slogg_err2         1
koi_srad               1
koi_srad_err1          1
koi_srad_err2          1
ra                     0
dec                    0


##### The following proves the second assumtion we made. There is a KOI which is CONFIRMED but has not values for most of the featuers.

In [88]:
c1 = df['koi_false_positive'] == 0
c2 = df['koi_period_err1'].isna()
df[c1 & c2].isnull().sum()

kepid                 0
koi_fpflag_nt         0
koi_fpflag_ss         0
koi_fpflag_co         0
koi_fpflag_ec         0
koi_period            0
koi_period_err1       1
koi_period_err2       1
koi_time0bk           0
koi_time0bk_err1      1
koi_time0bk_err2      1
koi_impact            1
koi_impact_err1       1
koi_impact_err2       1
koi_duration          0
koi_duration_err1     1
koi_duration_err2     1
koi_depth             1
koi_depth_err1        1
koi_depth_err2        1
koi_prad              1
koi_prad_err1         1
koi_prad_err2         1
koi_teq               1
koi_insol             1
koi_insol_err1        1
koi_insol_err2        1
koi_model_snr         1
koi_tce_plnt_num      1
koi_steff             1
koi_steff_err1        1
koi_steff_err2        1
koi_slogg             1
koi_slogg_err1        1
koi_slogg_err2        1
koi_srad              1
koi_srad_err1         1
koi_srad_err2         1
ra                    0
dec                   0
koi_kepmag            0
koi_false_positi

In [89]:
c1 = df['koi_false_positive'] == 0
c2 = df['koi_period_err1'].isna()
df[c1 & c2]

Unnamed: 0,kepid,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag,koi_false_positive
2183,8478994,0,0,0,0,51.206903,,,195.7303,,...,,,,,,,284.05954,44.518215,9.705,0


##### Let's remove all rows that contain a null column

In [90]:
df = df.dropna()
df.shape

(6630, 42)

In [91]:
df.isnull().sum()

kepid                 0
koi_fpflag_nt         0
koi_fpflag_ss         0
koi_fpflag_co         0
koi_fpflag_ec         0
koi_period            0
koi_period_err1       0
koi_period_err2       0
koi_time0bk           0
koi_time0bk_err1      0
koi_time0bk_err2      0
koi_impact            0
koi_impact_err1       0
koi_impact_err2       0
koi_duration          0
koi_duration_err1     0
koi_duration_err2     0
koi_depth             0
koi_depth_err1        0
koi_depth_err2        0
koi_prad              0
koi_prad_err1         0
koi_prad_err2         0
koi_teq               0
koi_insol             0
koi_insol_err1        0
koi_insol_err2        0
koi_model_snr         0
koi_tce_plnt_num      0
koi_steff             0
koi_steff_err1        0
koi_steff_err2        0
koi_slogg             0
koi_slogg_err1        0
koi_slogg_err2        0
koi_srad              0
koi_srad_err1         0
koi_srad_err2         0
ra                    0
dec                   0
koi_kepmag            0
koi_false_positi

## Now we are ready to build out Random Forest Classifier

In [92]:
df.head()

Unnamed: 0,kepid,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag,koi_false_positive
0,10797460,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,0
1,10797460,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,0
2,10811496,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436,1
3,10848459,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597,1
4,10854555,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509,0


##### The featuers of out model will be all the columns except the id and prediction column respectively the first and last column

In [93]:
features = df.columns
features = features.delete(0)
features = features.delete(len(features)-1)
print(features)

Index(['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
       'koi_period', 'koi_period_err1', 'koi_period_err2', 'koi_time0bk',
       'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact', 'koi_impact_err1',
       'koi_impact_err2', 'koi_duration', 'koi_duration_err1',
       'koi_duration_err2', 'koi_depth', 'koi_depth_err1', 'koi_depth_err2',
       'koi_prad', 'koi_prad_err1', 'koi_prad_err2', 'koi_teq', 'koi_insol',
       'koi_insol_err1', 'koi_insol_err2', 'koi_model_snr', 'koi_tce_plnt_num',
       'koi_steff', 'koi_steff_err1', 'koi_steff_err2', 'koi_slogg',
       'koi_slogg_err1', 'koi_slogg_err2', 'koi_srad', 'koi_srad_err1',
       'koi_srad_err2', 'ra', 'dec', 'koi_kepmag'],
      dtype='object')


##### Preparing and seperating the data to be fed to the model

In [94]:
X = df[features]
y = df['koi_false_positive']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)
print('Lenght of our Training data:', X_train.shape, '\nLength of our Testing data:', y_test.shape)

Lenght of our Training data: (5304, 40) 
Length of our Testing data: (1326,)


##### Creating the model and fitting the data

In [95]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

##### Evaluating the prediction

In [96]:
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy Score = ', accuracy)

precision = precision_score(y_true=y_test, y_pred=y_pred)
print('Precision Score = ', precision)

recall = recall_score(y_true=y_test, y_pred=y_pred)
print('Recall Score = ', accuracy)

f1 = f1_score(y_true=y_test, y_pred=y_pred)
print('F1 Score = ', accuracy)

Accuracy Score =  0.9864253393665159
Precision Score =  0.979706877113867
Recall Score =  0.9864253393665159
F1 Score =  0.9864253393665159


In [97]:
model.feature_importances_

array([0.05448075, 0.0769777 , 0.12293925, 0.03675257, 0.02216405,
       0.00956214, 0.01071489, 0.00690221, 0.01422467, 0.02543782,
       0.01054013, 0.00516816, 0.00385321, 0.00636119, 0.04379032,
       0.04453762, 0.01115468, 0.00852603, 0.01188908, 0.06805342,
       0.0455396 , 0.04002352, 0.01670452, 0.01689325, 0.02675758,
       0.01740421, 0.02771764, 0.0034527 , 0.00347015, 0.07365718,
       0.06688447, 0.00342016, 0.00334461, 0.01505744, 0.00688768,
       0.01988458, 0.00548574, 0.00564052, 0.00341472, 0.00432984])

In [98]:
y_pred.sum()

887

## Trying to play with parameters to see if anything changes

In [109]:
df.head()

Unnamed: 0,kepid,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag,koi_false_positive
0,10797460,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,1
1,10797460,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,1
2,10811496,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436,0
3,10848459,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597,0
4,10854555,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509,1


In [110]:
c1 = df['koi_false_positive'] == 0
df['koi_false_positive'] = np.where(c1, 1, 0)

df.head()

Unnamed: 0,kepid,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag,koi_false_positive
0,10797460,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,0
1,10797460,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,0
2,10811496,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436,1
3,10848459,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597,1
4,10854555,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509,0


In [137]:
features = ['koi_fpflag_co', 'koi_period', 'koi_time0bk', 'koi_impact', 
                 'koi_duration', 'koi_depth', 'koi_prad', 'koi_teq', 
                 'koi_insol', 'koi_steff', 'koi_slogg', 'koi_srad']
print(features)

['koi_fpflag_co', 'koi_period', 'koi_time0bk', 'koi_impact', 'koi_duration', 'koi_depth', 'koi_prad', 'koi_teq', 'koi_insol', 'koi_steff', 'koi_slogg', 'koi_srad']


In [138]:
X = df[features]
y = df['koi_false_positive']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)
print('Lenght of our Training data:', X_train.shape, '\nLength of our Testing data:', y_test.shape)

Lenght of our Training data: (5304, 12) 
Length of our Testing data: (1326,)


In [139]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [140]:
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy Score = ', accuracy)

precision = precision_score(y_true=y_test, y_pred=y_pred)
print('Precision Score = ', precision)

recall = recall_score(y_true=y_test, y_pred=y_pred)
print('Recall Score = ', accuracy)

f1 = f1_score(y_true=y_test, y_pred=y_pred)
print('F1 Score = ', accuracy)

Accuracy Score =  0.9539969834087482
Precision Score =  0.968677494199536
Recall Score =  0.9539969834087482
F1 Score =  0.9539969834087482


##### Note: simply adding 'koi_fpflag_co' as part of the featuers give s bump up of ~6% in the prediction scores