In [57]:
import numpy as np
import pandas as pd

## Data collection and preparation

After taking a closer look to the dataset and its meaning, we have decided to use the variable `koi_disposition` as our target variable, which is the data observed by humans. We can see that is a categorical variable containing three classes; False Positive, Confirmed, and Candidate. In order to simplify this analysis, we are going to remove the False Positive cases while trying to predict the Confirmed and Candidate cases, transforming our analysis into a binary classification.

We have removed the variables `kepid` and `kepler_name` since they only serve to identify the points and add no information.\
We also removed `koi_score` since this is the conffidence on the machines prediction and this should not be used as predictive variable.\
We also removed `koi_teq_err1` and `koi_teq_err2` since they only contained missing values.\
Finaly we used `kepoi_name` as the index to identify the rows, as it shows the scientific name of the Kepler object of interest.

In [58]:
data = pd.read_csv('exoplanets.csv')

In [59]:
data.head()

Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,10811496,K00753.01,,CANDIDATE,CANDIDATE,0.0,0,0,0,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [60]:
# Drop unnecesary variables
exclude_variables = ["kepid","kepler_name","koi_pdisposition","koi_score","koi_teq_err1","koi_teq_err2"]
df = data.drop(columns=exclude_variables).set_index("kepoi_name")

In [61]:
# Set up target variable
# Use koi_disposition only when it is not a FALSE POSITIVE
df = df.where(df["koi_disposition"] != "FALSE POSITIVE").dropna()
df["koi_disposition"] = df["koi_disposition"].astype("category")

## Data exploration

In [63]:
nrow=len(df)
for col in df:
    var = data[col]
    print(f"Variable name: {col}")
    print(f" - # missing:\t{(var.isnull().sum()/nrow)*100:.2f} %")
    print(f" - # distinct:\t{len(var.drop_duplicates()):.2f}")
    dtype = var.dtype.kind
    if dtype != "O":
        vmin = var.min()
        vmax = var.max()
        median = var.median()
        print(f" - mean:\t{var.mean():.2f}")
        print(f" - median:\t{median:.2f}")
        print(f" - std:\t\t{var.std():.2f}")
        print(f" - range:\t[{vmin:.2f} , {vmax:.2f}]")
    print()

Variable name: koi_disposition
 - # missing:	0.00 %
 - # distinct:	3.00

Variable name: koi_fpflag_nt
 - # missing:	0.00 %
 - # distinct:	3.00
 - mean:	0.21
 - median:	0.00
 - std:		4.77
 - range:	[0.00 , 465.00]

Variable name: koi_fpflag_ss
 - # missing:	0.00 %
 - # distinct:	2.00
 - mean:	0.23
 - median:	0.00
 - std:		0.42
 - range:	[0.00 , 1.00]

Variable name: koi_fpflag_co
 - # missing:	0.00 %
 - # distinct:	2.00
 - mean:	0.20
 - median:	0.00
 - std:		0.40
 - range:	[0.00 , 1.00]

Variable name: koi_fpflag_ec
 - # missing:	0.00 %
 - # distinct:	2.00
 - mean:	0.12
 - median:	0.00
 - std:		0.33
 - range:	[0.00 , 1.00]

Variable name: koi_period
 - # missing:	0.00 %
 - # distinct:	9564.00
 - mean:	75.67
 - median:	9.75
 - std:		1334.74
 - range:	[0.24 , 129995.78]

Variable name: koi_period_err1
 - # missing:	10.02 %
 - # distinct:	3507.00
 - mean:	0.00
 - median:	0.00
 - std:		0.01
 - range:	[0.00 , 0.17]

Variable name: koi_period_err2
 - # missing:	10.02 %
 - # distinct:	3507.00


In [42]:
print(data["koi_disposition"].value_counts())
print(data["koi_pdisposition"].value_counts())

koi_disposition
FALSE POSITIVE    4840
CANDIDATE         2367
CONFIRMED         2357
Name: count, dtype: int64
koi_pdisposition
FALSE POSITIVE    4847
CANDIDATE         4717
Name: count, dtype: int64


In [None]:
def preprocessing(df):
    """
    Perform preprocessing steps:
    1. Drop all missing variables
    """
    return 0
