In [1]:
import pandas as pd
import numpy as np
from numpy import mean
from numpy import std
import keras
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import RepeatedKFold
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense
from keras import layers
from keras import regularizers
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
import time

import warnings
warnings.filterwarnings('ignore')

### Importing Data

Importing dataframe from: https://exoplanetarchive.ipac.caltech.edu/cgi-bin/TblView/nph-tblView?app=ExoTbls&config=cumulative

In [2]:
# Load the Kepler cumulative KOI table into a pandas DataFrame.
# Source: https://exoplanetarchive.ipac.caltech.edu/cgi-bin/TblView/nph-tblView?app=ExoTbls&config=cumulative
#
# The path is written with a forward slash: the former 'Data\c...' literal relied
# on the invalid escape sequence '\c' (a warning on recent Python versions) and
# only resolved on Windows. Forward slashes work on Windows, Linux and macOS.
# skiprows=75 skips the first 75 lines of the export before the column header
# (presumably the archive's preamble -- verify if the file is re-downloaded).
dfraw = pd.read_csv('Data/cumulative_2021.03.02_23.25.21.csv', skiprows=75)
dfraw.head()

Unnamed: 0,kepid,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_disp_prov,koi_period,...,koi_srad_err2,koi_smass,koi_smass_err1,koi_smass_err2,koi_sage,koi_sage_err1,koi_sage_err2,ra,dec,koi_kepmag
0,10797460,CONFIRMED,CANDIDATE,1.0,0,0,0,0,q1_q17_dr25_sup_koi,9.488036,...,-0.061,0.919,0.052,-0.046,,,,291.93423,48.141651,15.347
1,10797460,CONFIRMED,CANDIDATE,0.969,0,0,0,0,q1_q17_dr25_sup_koi,54.418383,...,-0.061,0.919,0.052,-0.046,,,,291.93423,48.141651,15.347
2,10811496,CANDIDATE,CANDIDATE,0.0,0,0,0,0,q1_q17_dr25_sup_koi,19.89914,...,-0.078,0.961,0.11,-0.121,,,,297.00482,48.134129,15.436
3,10848459,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,q1_q17_dr25_sup_koi,1.736952,...,-0.067,0.836,0.093,-0.077,,,,285.53461,48.28521,15.597
4,10854555,CONFIRMED,CANDIDATE,1.0,0,0,0,0,q1_q17_dr25_sup_koi,2.525592,...,-0.133,1.095,0.151,-0.136,,,,288.75488,48.2262,15.509


### Data Exploration

In [3]:
# Print a concise summary of the raw DataFrame (columns, non-null counts, dtypes)
# to get a first look at the data
dfraw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9564 entries, 0 to 9563
Data columns (total 69 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   kepid              9564 non-null   int64  
 1   koi_disposition    9564 non-null   object 
 2   koi_pdisposition   9564 non-null   object 
 3   koi_score          8054 non-null   float64
 4   koi_fpflag_nt      9564 non-null   int64  
 5   koi_fpflag_ss      9564 non-null   int64  
 6   koi_fpflag_co      9564 non-null   int64  
 7   koi_fpflag_ec      9564 non-null   int64  
 8   koi_disp_prov      9564 non-null   object 
 9   koi_period         9564 non-null   float64
 10  koi_period_err1    9110 non-null   float64
 11  koi_period_err2    9110 non-null   float64
 12  koi_time0bk        9564 non-null   float64
 13  koi_time0bk_err1   9110 non-null   float64
 14  koi_time0bk_err2   9110 non-null   float64
 15  koi_time0          9564 non-null   float64
 16  koi_time0_err1     9110 

In the summary above we can see the `_err` columns, which hold the measurement uncertainties of the telescope data. To keep the exploration simple, let's ignore those columns for now:

In [4]:
# Keep only the columns whose name does not contain '_err'
# (i.e. leave the error columns out of the working dataframe)
new_col = list(filter(lambda name: '_err' not in name, dfraw.columns))

df = dfraw.loc[:, new_col]

df.head()

Unnamed: 0,kepid,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_disp_prov,koi_period,...,koi_tce_delivname,koi_steff,koi_slogg,koi_smet,koi_srad,koi_smass,koi_sage,ra,dec,koi_kepmag
0,10797460,CONFIRMED,CANDIDATE,1.0,0,0,0,0,q1_q17_dr25_sup_koi,9.488036,...,q1_q17_dr25_tce,5455.0,4.467,0.14,0.927,0.919,,291.93423,48.141651,15.347
1,10797460,CONFIRMED,CANDIDATE,0.969,0,0,0,0,q1_q17_dr25_sup_koi,54.418383,...,q1_q17_dr25_tce,5455.0,4.467,0.14,0.927,0.919,,291.93423,48.141651,15.347
2,10811496,CANDIDATE,CANDIDATE,0.0,0,0,0,0,q1_q17_dr25_sup_koi,19.89914,...,q1_q17_dr25_tce,5853.0,4.544,-0.18,0.868,0.961,,297.00482,48.134129,15.436
3,10848459,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,q1_q17_dr25_sup_koi,1.736952,...,q1_q17_dr25_tce,5805.0,4.564,-0.52,0.791,0.836,,285.53461,48.28521,15.597
4,10854555,CONFIRMED,CANDIDATE,1.0,0,0,0,0,q1_q17_dr25_sup_koi,2.525592,...,q1_q17_dr25_tce,6031.0,4.438,0.07,1.046,1.095,,288.75488,48.2262,15.509


In [5]:
# Descriptive statistics for every column (numeric and non-numeric),
# transposed so that each variable is shown on its own row
df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
kepid,9564.0,,,,7690628.327373,2653459.080974,757450.0,5556034.25,7906892.0,9873066.5,12935144.0
koi_disposition,9564.0,3.0,FALSE POSITIVE,4840.0,,,,,,,
koi_pdisposition,9564.0,2.0,FALSE POSITIVE,4847.0,,,,,,,
koi_score,8054.0,,,,0.480829,0.476928,0.0,0.0,0.334,0.998,1.0
koi_fpflag_nt,9564.0,,,,0.208595,4.76729,0.0,0.0,0.0,0.0,465.0
koi_fpflag_ss,9564.0,,,,0.232748,0.422605,0.0,0.0,0.0,0.0,1.0
koi_fpflag_co,9564.0,,,,0.197512,0.398142,0.0,0.0,0.0,0.0,1.0
koi_fpflag_ec,9564.0,,,,0.120033,0.325018,0.0,0.0,0.0,0.0,1.0
koi_disp_prov,9564.0,1.0,q1_q17_dr25_sup_koi,9564.0,,,,,,,
koi_period,9564.0,,,,75.671358,1334.744046,0.241843,2.733684,9.752831,40.715178,129995.7784


In [6]:
# Count the missing (NaN) values in each column
df.isna().sum()

kepid                   0
koi_disposition         0
koi_pdisposition        0
koi_score            1510
koi_fpflag_nt           0
koi_fpflag_ss           0
koi_fpflag_co           0
koi_fpflag_ec           0
koi_disp_prov           0
koi_period              0
koi_time0bk             0
koi_time0               0
koi_eccen             363
koi_longp            9564
koi_impact            363
koi_duration            0
koi_depth             363
koi_ror               363
koi_prad              363
koi_teq               363
koi_insol             321
koi_model_snr         363
koi_tce_plnt_num      346
koi_tce_delivname     346
koi_steff             363
koi_slogg             363
koi_smet              386
koi_srad              363
koi_smass             363
koi_sage             9564
ra                      0
dec                     0
koi_kepmag              1
dtype: int64

In [7]:
# Show every row that is an exact duplicate of another row (all copies are flagged)
is_duplicate = df.duplicated(keep=False)
df[is_duplicate]

Unnamed: 0,kepid,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_disp_prov,koi_period,...,koi_tce_delivname,koi_steff,koi_slogg,koi_smet,koi_srad,koi_smass,koi_sage,ra,dec,koi_kepmag


### Data Preparation

As we can see, some features contain NaN values. Let's check which features have NaN values and how many each one has.

In [8]:
# Count the missing (NaN) values in each column
# (same check as in the Data Exploration section, repeated here as the starting
# point for the clean-up)
df.isna().sum()

kepid                   0
koi_disposition         0
koi_pdisposition        0
koi_score            1510
koi_fpflag_nt           0
koi_fpflag_ss           0
koi_fpflag_co           0
koi_fpflag_ec           0
koi_disp_prov           0
koi_period              0
koi_time0bk             0
koi_time0               0
koi_eccen             363
koi_longp            9564
koi_impact            363
koi_duration            0
koi_depth             363
koi_ror               363
koi_prad              363
koi_teq               363
koi_insol             321
koi_model_snr         363
koi_tce_plnt_num      346
koi_tce_delivname     346
koi_steff             363
koi_slogg             363
koi_smet              386
koi_srad              363
koi_smass             363
koi_sage             9564
ra                      0
dec                     0
koi_kepmag              1
dtype: int64

In [9]:
# Total number of NaN values in the whole dataset
# (the first sum() counts per column, the second adds those counts together)
df.isna().sum().sum()

26031

Before treating the NaN values, let's check the percentage of NaN values in each column to see whether any columns can simply be removed.

In [10]:
# Percentage of NaN values per column
null = df.isnull().sum().div(len(df)).mul(100)

# Only show the columns that actually contain NaN values
null.loc[null.gt(0)]

koi_score             15.788373
koi_eccen              3.795483
koi_longp            100.000000
koi_impact             3.795483
koi_depth              3.795483
koi_ror                3.795483
koi_prad               3.795483
koi_teq                3.795483
koi_insol              3.356336
koi_model_snr          3.795483
koi_tce_plnt_num       3.617733
koi_tce_delivname      3.617733
koi_steff              3.795483
koi_slogg              3.795483
koi_smet               4.035968
koi_srad               3.795483
koi_smass              3.795483
koi_sage             100.000000
koi_kepmag             0.010456
dtype: float64

We can drop `koi_sage` and `koi_longp`, since both columns consist of 100% NaN values.

In [11]:
# Drop 'koi_longp' and 'koi_sage' (100% NaN values), plus two string columns
# with classification information and no relevance:
# 'koi_tce_delivname' and 'koi_disp_prov'.
#
# new_col is rebuilt instead of being mutated with list.remove(): the in-place
# removal raised "ValueError: list.remove(x): x not in list" as soon as this
# cell was executed a second time. This version can be re-run safely.
cols_to_drop = ['koi_longp', 'koi_sage', 'koi_tce_delivname', 'koi_disp_prov']
new_col = [column for column in new_col if column not in cols_to_drop]

df = df[new_col]

As we can see above, there are a lot of NaN values. Simply dropping every row that contains one would remove a lot of important data and compromise our model.

To better model the missing values, we use the KNN algorithm to fill them.

## Imputing NaN values with KNN:

##### For KNN = 3 (single layer)

In [12]:
# Numerical features with NaN values; these are the columns handed to the KNNImputer
knn_features = [
    'koi_score', 'koi_smass', 'koi_srad', 'koi_smet', 'koi_slogg',
    'koi_steff', 'koi_tce_plnt_num', 'koi_model_snr', 'koi_insol', 'koi_teq',
    'koi_prad', 'koi_ror', 'koi_depth', 'koi_impact', 'koi_eccen',
]
X_knn = df[knn_features]

In [13]:
# KNN imputer with 3 neighbours, uniform weights and the NaN-aware Euclidean metric
imputer = KNNImputer(n_neighbors=3, weights='uniform', metric='nan_euclidean')

# fit on the dataset and transform it in a single call
Xtrans = imputer.fit_transform(X_knn)

In [14]:
# Wrap the KNN result in a new DataFrame that reuses both the column names and
# the row index of X_knn. Carrying the index over keeps the index-based merge
# further down aligned row-for-row even if rows of df are ever filtered before
# imputation (from_records always produced a fresh 0..n-1 index).
df_aux = pd.DataFrame(Xtrans, columns=X_knn.columns, index=X_knn.index)

# Sanity check: total number of NaN values left after imputation
df_aux.isna().sum().sum()

0

In [15]:
# Every column of df that is not one of the imputed (df_aux) columns
df_aux_1 = df.loc[:, ~df.columns.isin(df_aux.columns)]

#### Joining the results:

In [16]:
# Put the untouched columns and the imputed columns back together,
# matching rows on the index of both dataframes
df1 = df_aux_1.join(df_aux, how='outer')

df1.head()

Unnamed: 0,kepid,koi_disposition,koi_pdisposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_time0,...,koi_steff,koi_tce_plnt_num,koi_model_snr,koi_insol,koi_teq,koi_prad,koi_ror,koi_depth,koi_impact,koi_eccen
0,10797460,CONFIRMED,CANDIDATE,0,0,0,0,9.488036,170.53875,2455003.539,...,5455.0,1.0,35.8,93.59,793.0,2.26,0.022344,615.8,0.146,0.0
1,10797460,CONFIRMED,CANDIDATE,0,0,0,0,54.418383,162.51384,2454995.514,...,5455.0,2.0,25.8,9.11,443.0,2.83,0.027954,874.8,0.586,0.0
2,10811496,CANDIDATE,CANDIDATE,0,0,0,0,19.89914,175.850252,2455008.85,...,5853.0,1.0,76.3,39.3,638.0,14.6,0.154046,10829.0,0.969,0.0
3,10848459,FALSE POSITIVE,FALSE POSITIVE,0,1,0,0,1.736952,170.307565,2455003.308,...,5805.0,1.0,505.6,891.96,1395.0,33.46,0.387394,8079.2,1.276,0.0
4,10854555,CONFIRMED,CANDIDATE,0,0,0,0,2.525592,171.59555,2455004.596,...,6031.0,1.0,40.9,926.16,1406.0,2.75,0.024064,603.3,0.701,0.0


In [17]:
# Count the NaN values that remain in each column of the joined dataframe
df1.isna().sum()

kepid               0
koi_disposition     0
koi_pdisposition    0
koi_fpflag_nt       0
koi_fpflag_ss       0
koi_fpflag_co       0
koi_fpflag_ec       0
koi_period          0
koi_time0bk         0
koi_time0           0
koi_duration        0
ra                  0
dec                 0
koi_kepmag          1
koi_score           0
koi_smass           0
koi_srad            0
koi_smet            0
koi_slogg           0
koi_steff           0
koi_tce_plnt_num    0
koi_model_snr       0
koi_insol           0
koi_teq             0
koi_prad            0
koi_ror             0
koi_depth           0
koi_impact          0
koi_eccen           0
dtype: int64

In [18]:
# Drop every row that still holds a NaN value
# (this removes the remaining NaN of the 'koi_kepmag' feature)
df1 = df1.dropna(axis=0, how='any')

We have now our dataframe free of NaN values:

In [19]:
# Save the KNN=3 result. A forward slash is used because the former 'Data\p...'
# literal relied on the invalid escape sequence '\p' and only resolved on Windows.
df1.to_csv('Data/preprocessed_keppler_data(knn=3).csv', index=False)

##### For KNN = 5 (multilayer)

In [20]:
# Imputer settings for the second run: 5 neighbours this time
knn5_params = {'n_neighbors': 5, 'weights': 'uniform', 'metric': 'nan_euclidean'}
imputer = KNNImputer(**knn5_params)

In [21]:
# Fit the 5-neighbour imputer on the features selected for imputation
imputer.fit(X_knn)

KNNImputer()

In [22]:
# Transform the dataset: the missing values of X_knn are filled in and the
# result is stored in Xtrans
Xtrans = imputer.transform(X_knn)

In [23]:
# Wrap the KNN result in a new DataFrame that reuses both the column names and
# the row index of X_knn. Carrying the index over keeps the index-based merge
# further down aligned row-for-row even if rows of df are ever filtered before
# imputation (from_records always produced a fresh 0..n-1 index).
df_aux = pd.DataFrame(Xtrans, columns=X_knn.columns, index=X_knn.index)

In [24]:
# Preview the first rows of the imputed features to check that everything is correct
df_aux.head()

Unnamed: 0,koi_score,koi_smass,koi_srad,koi_smet,koi_slogg,koi_steff,koi_tce_plnt_num,koi_model_snr,koi_insol,koi_teq,koi_prad,koi_ror,koi_depth,koi_impact,koi_eccen
0,1.0,0.919,0.927,0.14,4.467,5455.0,1.0,35.8,93.59,793.0,2.26,0.022344,615.8,0.146,0.0
1,0.969,0.919,0.927,0.14,4.467,5455.0,2.0,25.8,9.11,443.0,2.83,0.027954,874.8,0.586,0.0
2,0.0,0.961,0.868,-0.18,4.544,5853.0,1.0,76.3,39.3,638.0,14.6,0.154046,10829.0,0.969,0.0
3,0.0,0.836,0.791,-0.52,4.564,5805.0,1.0,505.6,891.96,1395.0,33.46,0.387394,8079.2,1.276,0.0
4,1.0,1.095,1.046,0.07,4.438,6031.0,1.0,40.9,926.16,1406.0,2.75,0.024064,603.3,0.701,0.0


In [25]:
# Sanity check: total number of NaN values left in the imputed features
df_aux.isna().sum().sum()

0

In [26]:
# Every column of df that is not one of the imputed (df_aux) columns
df_aux_1 = df.loc[:, ~df.columns.isin(df_aux.columns)]

#### Joining the KNN result of NaN values substitution: 

In [27]:
# Numerical features with NaN values that were passed to the KNNImputer
# (this re-selects the same column list that X_knn was first built from)
imputed_features = [
    'koi_score', 'koi_smass', 'koi_srad', 'koi_smet', 'koi_slogg',
    'koi_steff', 'koi_tce_plnt_num', 'koi_model_snr', 'koi_insol', 'koi_teq',
    'koi_prad', 'koi_ror', 'koi_depth', 'koi_impact', 'koi_eccen',
]
X_knn = df[imputed_features]

In [28]:
# Put the untouched columns and the imputed columns back together,
# matching rows on the index of both dataframes
df1 = df_aux_1.join(df_aux, how='outer')

df1.head()

Unnamed: 0,kepid,koi_disposition,koi_pdisposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_time0,...,koi_steff,koi_tce_plnt_num,koi_model_snr,koi_insol,koi_teq,koi_prad,koi_ror,koi_depth,koi_impact,koi_eccen
0,10797460,CONFIRMED,CANDIDATE,0,0,0,0,9.488036,170.53875,2455003.539,...,5455.0,1.0,35.8,93.59,793.0,2.26,0.022344,615.8,0.146,0.0
1,10797460,CONFIRMED,CANDIDATE,0,0,0,0,54.418383,162.51384,2454995.514,...,5455.0,2.0,25.8,9.11,443.0,2.83,0.027954,874.8,0.586,0.0
2,10811496,CANDIDATE,CANDIDATE,0,0,0,0,19.89914,175.850252,2455008.85,...,5853.0,1.0,76.3,39.3,638.0,14.6,0.154046,10829.0,0.969,0.0
3,10848459,FALSE POSITIVE,FALSE POSITIVE,0,1,0,0,1.736952,170.307565,2455003.308,...,5805.0,1.0,505.6,891.96,1395.0,33.46,0.387394,8079.2,1.276,0.0
4,10854555,CONFIRMED,CANDIDATE,0,0,0,0,2.525592,171.59555,2455004.596,...,6031.0,1.0,40.9,926.16,1406.0,2.75,0.024064,603.3,0.701,0.0


In [29]:
# Count the NaN values that remain in each column of the joined dataframe
df1.isna().sum()

kepid               0
koi_disposition     0
koi_pdisposition    0
koi_fpflag_nt       0
koi_fpflag_ss       0
koi_fpflag_co       0
koi_fpflag_ec       0
koi_period          0
koi_time0bk         0
koi_time0           0
koi_duration        0
ra                  0
dec                 0
koi_kepmag          1
koi_score           0
koi_smass           0
koi_srad            0
koi_smet            0
koi_slogg           0
koi_steff           0
koi_tce_plnt_num    0
koi_model_snr       0
koi_insol           0
koi_teq             0
koi_prad            0
koi_ror             0
koi_depth           0
koi_impact          0
koi_eccen           0
dtype: int64

As we can see, there is still one NaN value left (in `koi_kepmag`), so let's remove it.

In [30]:
# Drop every row that still holds a NaN value
# (this removes the remaining NaN of the 'koi_kepmag' feature)
df1 = df1.dropna(axis=0, how='any')

In [31]:
# Check that no NaN values are left: the total count should be 0
df1.isna().sum().sum()

0

In [32]:
# Check the dtype of every column (object columns are the remaining string features)
df1.dtypes

kepid                 int64
koi_disposition      object
koi_pdisposition     object
koi_fpflag_nt         int64
koi_fpflag_ss         int64
koi_fpflag_co         int64
koi_fpflag_ec         int64
koi_period          float64
koi_time0bk         float64
koi_time0           float64
koi_duration        float64
ra                  float64
dec                 float64
koi_kepmag          float64
koi_score           float64
koi_smass           float64
koi_srad            float64
koi_smet            float64
koi_slogg           float64
koi_steff           float64
koi_tce_plnt_num    float64
koi_model_snr       float64
koi_insol           float64
koi_teq             float64
koi_prad            float64
koi_ror             float64
koi_depth           float64
koi_impact          float64
koi_eccen           float64
dtype: object

Our dataframe is now free of NaN values.

In [33]:
# Save the KNN=5 result. A forward slash is used because the former 'Data\p...'
# literal relied on the invalid escape sequence '\p' and only resolved on Windows.
df1.to_csv('Data/preprocessed_keppler_data(knn=5).csv', index=False)

### Creating multi-layer target features for multiclass model

Using OneHotEncoding: 

In [34]:
# Apply one-hot encoding to the categorical variable 'koi_disposition'
ohc_features = ['koi_disposition']

# Dense output is requested through `sparse_output` on scikit-learn >= 1.2;
# older releases only accept the former `sparse` keyword, which newer releases
# no longer have. Trying the new spelling first keeps the cell running on both.
try:
    ohc = OneHotEncoder(sparse_output=False)
except TypeError:
    ohc = OneHotEncoder(sparse=False)

encoded = ohc.fit_transform(df1[ohc_features])

# get_feature_names() was replaced by get_feature_names_out(); use whichever
# method the installed scikit-learn provides.
if hasattr(ohc, 'get_feature_names_out'):
    encoded_names = ohc.get_feature_names_out(ohc_features)
else:
    encoded_names = ohc.get_feature_names(ohc_features)

# One column per category, aligned with the rows of df1 through its index
df_ohc = pd.DataFrame(encoded, index=df1.index, columns=encoded_names)

In [35]:
# Display the one-hot encoded 'koi_disposition' columns
df_ohc

Unnamed: 0,koi_disposition_CANDIDATE,koi_disposition_CONFIRMED,koi_disposition_FALSE POSITIVE
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,0.0,1.0,0.0
...,...,...,...
9559,0.0,0.0,1.0
9560,1.0,0.0,0.0
9561,0.0,0.0,1.0
9562,1.0,0.0,0.0


In [36]:
# Replace the categorical column with its one-hot encoded columns:
# remove it from df1, then append the encoded columns side by side
df_without_target = df1.drop(columns=ohc_features)
df_new = pd.concat([df_without_target, df_ohc], axis=1)


In [37]:
# Continue with a copy of df_new under the name df1
df1 = df_new.copy()

# Final columns for the model: every column except 'kepid' and 'koi_disposition'
# NOTE(review): 'koi_pdisposition' is kept and is still a string column here --
# confirm that the modelling step encodes or drops it.
excluded_columns = ('kepid', 'koi_disposition')
final_columns = [name for name in df1.columns if name not in excluded_columns]

# Final dataframe restricted to the final columns
df1 = df1.loc[:, final_columns]

df1

Unnamed: 0,koi_pdisposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_time0,koi_duration,ra,...,koi_insol,koi_teq,koi_prad,koi_ror,koi_depth,koi_impact,koi_eccen,koi_disposition_CANDIDATE,koi_disposition_CONFIRMED,koi_disposition_FALSE POSITIVE
0,CANDIDATE,0,0,0,0,9.488036,170.538750,2455003.539,2.95750,291.93423,...,93.59,793.0,2.26,0.022344,615.8,0.146,0.0,0.0,1.0,0.0
1,CANDIDATE,0,0,0,0,54.418383,162.513840,2454995.514,4.50700,291.93423,...,9.11,443.0,2.83,0.027954,874.8,0.586,0.0,0.0,1.0,0.0
2,CANDIDATE,0,0,0,0,19.899140,175.850252,2455008.850,1.78220,297.00482,...,39.30,638.0,14.60,0.154046,10829.0,0.969,0.0,1.0,0.0,0.0
3,FALSE POSITIVE,0,1,0,0,1.736952,170.307565,2455003.308,2.40641,285.53461,...,891.96,1395.0,33.46,0.387394,8079.2,1.276,0.0,0.0,0.0,1.0
4,CANDIDATE,0,0,0,0,2.525592,171.595550,2455004.596,1.65450,288.75488,...,926.16,1406.0,2.75,0.024064,603.3,0.701,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9559,FALSE POSITIVE,0,1,1,0,0.527699,131.705093,2454964.705,3.22210,297.18875,...,4500.53,2088.0,29.35,0.297633,1579.2,1.252,0.0,0.0,0.0,1.0
9560,CANDIDATE,0,0,0,0,1.739849,133.001270,2454966.001,3.11400,286.50937,...,1585.81,1608.0,0.72,0.006379,48.5,0.043,0.0,1.0,0.0,0.0
9561,FALSE POSITIVE,0,0,1,0,0.681402,132.181750,2454965.182,0.86500,294.16489,...,5713.41,2218.0,1.07,0.009444,103.6,0.147,0.0,0.0,0.0,1.0
9562,CANDIDATE,0,0,0,0,333.486169,153.615010,2454986.615,3.19900,296.76288,...,22.68,557.0,19.30,0.022590,639.1,0.214,0.0,1.0,0.0,0.0


In [39]:
# Save the KNN=5 data with the one-hot encoded target. A forward slash is used
# because the former 'Data\p...' literal relied on the invalid escape sequence
# '\p' and only resolved on Windows.
df1.to_csv('Data/preprocessed_keppler_data(knn=5)(multilayer).csv', index=False)