In [175]:
import numpy as np
import pandas as pd
import xmltodict
import requests
import json
import csv
import matplotlib.pyplot as plt
import seaborn as sns

#Helper function exploring features in the dataset
from functions_iot import summary_feature as s_f

### KEPLER MISSION (KOIs)

In [176]:
#url = "https://exoplanetarchive.ipac.caltech.edu/cgi-bin/nstedAPI/nph-nstedAPI?table=cumulative&format=json"
url = "https://exoplanetarchive.ipac.caltech.edu/cgi-bin/nstedAPI/nph-nstedAPI?table=q1_q17_dr25_koi&format=json"
resp = requests.get(url)

In [177]:
#Exploring response attributes
print(resp.url)
print(resp.history)
print(resp.status_code) # Check if 200 (All OK)
print(resp.headers)
print(resp.headers['content-type']) # Check if 'application/json'

https://exoplanetarchive.ipac.caltech.edu/cgi-bin/nstedAPI/nph-nstedAPI?table=q1_q17_dr25_koi&format=json
[]
200
{'Access-Control-Allow-Origin': '*', 'Content-type': 'text/plain'}
text/plain


In [178]:
json_data = json.loads(resp.content)
df = pd.DataFrame(json_data) # Saving the json data requested from API to a raw data frame
df.head()

Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,...,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra_str,dec_str,koi_kepmag,koi_kepmag_err
0,10811496,K00753.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,...,4.544,0.044,-0.176,0.868,0.233,-0.078,19h48m01.16s,+48d08m02.9s,15.436,
1,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,...,4.564,0.053,-0.168,0.791,0.201,-0.067,19h02m08.31s,+48d17m06.8s,15.597,
2,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,4.438,0.07,-0.21,1.046,0.334,-0.133,19h15m01.17s,+48d13m34.3s,15.509,
3,10872983,K00756.01,Kepler-228 d,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,4.486,0.054,-0.229,0.972,0.315,-0.105,19h45m08.67s,+48d13m28.8s,15.714,
4,10872983,K00756.02,Kepler-228 c,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,4.486,0.054,-0.229,0.972,0.315,-0.105,19h45m08.67s,+48d13m28.8s,15.714,


### Identification Columns
Information

|Feature|Description|Datatype|
|:-:|:-:|:-:|
|`kepid`|Target identification number, as listed in the Kepler input Catalog (KIC)|int64|
|`kepoi_name`|Number used to identify and track a Kepler Object of Interest (KOI)|object|

### Exoplanet Archive Information
Information

|Feature|Description|Datatype|
|:-:|:-:|:-:|
|`kepler_name`|Kepler name in the form "Kepler-N" for identifying the planet|object|
|`koi_disposition`|The category of this KOI from the Exoplanet Archive|object|

### Project Disposition Columns
Information

|Feature|Description|Datatype|
|:-:|:-:|:-:|
|`koi_pdisposition`|The pipeline flag that designates the most probable physical explanation of the KOI|object|
|`koi_score`|A value between 0 and 1 that indicates the confidence in the KOI disposition|float64|
|`koi_fpflag_nt`|A KOI whose light curve is not consistent with that of a transiting planet|int32|
|`koi_fpflag_ss`|A KOI that is observed to have a significant secondary event, transit shape, or out-of-eclipse variability, which indicates that the transit-like event is most likely caused by an eclipsing binary|int32|
|`koi_fpflag_co`|The source of the signal is from a nearby star, as inferred by measuring the centroid location of the image both in and out of transit|int32|
|`koi_fpflag_ec`|The KOI shares the same period and epoch as another object and is judged to be the result of flux contamination in the aperture or electronic crosstalk|int32|

### Transit Properties
Information

|Feature|Description|Datatype|
|:-:|:-:|:-:|
|`koi_period`|The interval between consecutive planetary transits|float64|
|`koi_period_err1`|The interval between consecutive planetary transits (positive uncertanty)|float64|
|`koi_period_err2`|The interval between consecutive planetary transits (negative uncertanty)|float64|
|`koi_time0bk`|The time corresponding to the center of the first detected transit in Barycentric Julian Day (BJD) minus a constant offset of 2,454,833.0 days|float64|
|`koi_time0bk_err1`|The time corresponding to the center of the first detected transit in Barycentric Julian Day (BJD) minus a constant offset of 2,454,833.0 days (positive uncertanty)|float64|
|`koi_time0bk_err2`|The time corresponding to the center of the first detected transit in Barycentric Julian Day (BJD) minus a constant offset of 2,454,833.0 days (negative uncertanty)|float64|
|`koi_impact`|The sky-projected distance between the center of the stellar disc and the center of the planet disc at conjunction, normalized by the stellar radius|float64|
|`koi_impact_err1`|The sky-projected distance between the center of the stellar disc and the center of the planet disc at conjunction, normalized by the stellar radius (positive uncertanty)|float64|
|`koi_impact_err2`|The sky-projected distance between the center of the stellar disc and the center of the planet disc at conjunction, normalized by the stellar radius (negative uncertanty)|float64|
|`koi_duration`|The duration of the observed transits|float64|
|`koi_duration_err1`|The duration of the observed transits (positive uncertanty)|float64|
|`koi_duration_err2`|The duration of the observed transits (negative uncertanty)|float64|
|`koi_depth`|The fraction of stellar flux lost at the minimum of the planetary transit|float64|
|`koi_depth_err1`|The fraction of stellar flux lost at the minimum of the planetary transit (positive uncertanty)|float64|
|`koi_depth_err2`|The fraction of stellar flux lost at the minimum of the planetary transit (negative uncertanty)|float64|
|`koi_prad`|Product of the planet star radius and the stellar radius|float64|
|`koi_prad_err1`|Product of the planet star radius and the stellar radius (positive uncertanty)|float64|
|`koi_prad_err2`|Product of the planet star radius and the stellar radius (negative uncertanty)|float64|
|`koi_teq`|Approximation for the temperature of the planet|float64|
|`koi_teq_err1`|Approximation of the temperature of the planet (positive uncertanty)|object| 
|`koi_teq_err2`|Approximation of the temperature of the planet (negative uncertanty)|object|
|`koi_insol`|Insulation flux: another way to give the equilibrium temperature|float64|
|`koi_insol_err1`|Insulation flux: another way to give the equilibrium temperature (positive uncertanty)|float64|
|`koi_insol_err2`|Insulation flux: another way to give the equilibrium temperature (negative uncertanty)|float64|

### Threshold-Crossing Event (TCE Information)
Information

|Feature|Description|Datatype|
|:-:|:-:|:-:|
|`koi_model_snr`|Transit depth normalized by the mean uncertainty in the flux during the transits|float64|
|`koi_tce_plnt_num`|TCE Planet Number federated to the KOI|int64|
|`koi_tce_delivname`|TCE delivery name corresponding to the TCE data federated to the KOI|object|

### Stellar Parameters
Stellar effective temperature, surface gravity, metallicity, radius, mass, and age should comprise a consistent set. Associated error estimates are 1-σ uncertainties.

|Feature|Description|Datatype|
|:-:|:-:|:-:|
|`koi_steff`|Photospheric temperature of the star|float64|
|`koi_steff_err1`|Photospheric temperature of the star (positive uncertanty)|float64|
|`koi_steff_err2`|Photospheric temperature of the star (negative uncertanty)|float64|
|`koi_slogg`|Base-10 logarithm of the acceleration due to gravity at the surface of the star|float64|
|`koi_slogg_err1`|Base-10 logarithm of the acceleration due to gravity at the surface of the star (positive uncertanty)|float64|
|`koi_slogg_err2`|Base-10 logarithm of the acceleration due to gravity at the surface of the star (negative uncertanty)|float64|
|`koi_srad`|Photospheric radius of the star|float64|
|`koi_srad_err1`|Photospheric radius of the star (positive uncertanty)|float64|
|`koi_srad_err2`|Photospheric radius of the star (negative uncertanty)|float64|

### KIC Parameters
Information

|Feature|Description|Datatype|
|:-:|:-:|:-:|
|`ra_str`|KIC Right Ascension|object|
|`dec_str`|KIC Declination|object|
|`koi_kepmag`| Kepler telescope's band (magnitude)|object| 
|`koi_kepmag_err`| Kepler telescope's band error (magnitude)| object| 

* The description of each feature was taken from [NASA Exoplanet Archive Data Columns in Kepler Objects of Interest Table]()

Exploring Data Frame

In [179]:
df.shape # Getting raw data frame shape

(8054, 50)

In [180]:
df.info() # Getting raw data frame information

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8054 entries, 0 to 8053
Data columns (total 50 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   kepid              8054 non-null   int64  
 1   kepoi_name         8054 non-null   object 
 2   kepler_name        2656 non-null   object 
 3   koi_disposition    8054 non-null   object 
 4   koi_pdisposition   8054 non-null   object 
 5   koi_score          8054 non-null   float64
 6   koi_fpflag_nt      8054 non-null   int64  
 7   koi_fpflag_ss      8054 non-null   int64  
 8   koi_fpflag_co      8054 non-null   int64  
 9   koi_fpflag_ec      8054 non-null   int64  
 10  koi_period         8054 non-null   float64
 11  koi_period_err1    7904 non-null   float64
 12  koi_period_err2    7904 non-null   float64
 13  koi_time0bk        8054 non-null   float64
 14  koi_time0bk_err1   7904 non-null   float64
 15  koi_time0bk_err2   7904 non-null   float64
 16  koi_impact         7995 

In [181]:
#Creating a data frame for visualizing the missing values in raw data frame
missing = pd.concat([df.isnull().sum(), 100 * df.isnull().mean()], axis=1)
missing.columns=['count', '%']
missing.sort_values(by='count', ascending=False)

Unnamed: 0,count,%
koi_kepmag_err,8054,100.0
koi_teq_err2,8054,100.0
koi_teq_err1,8054,100.0
kepler_name,5398,67.022597
koi_steff_err2,162,2.011423
koi_time0bk_err2,150,1.862429
koi_impact_err1,150,1.862429
koi_depth_err2,150,1.862429
koi_depth_err1,150,1.862429
koi_duration_err2,150,1.862429


#### Dropping empty and not useful columns

`koi_kepmag_err`, `koi_teq_err1` and `koi_teq_err2` are completely empty columns, so we can comfortably drop them.

Identification Columns (`kepid` and `kepoi_name`) are identification columns that are not really useful for a multi classification model. Some of them are unique ID's for each KOI. 

As a matter of fact, `kepid` was derived from a ground-based imaging survey of the Kepler field conducted prior to launch. The survey's purpose was to identify stars for the Kepler exoplanet survey by magnitude and color. The full catalog of 13 million sources can be searched at the [MAST archive](https://archive.stsci.edu/kepler/kic10/search.php). The Kepler ID is unique to a target and there is only one Kepler ID per target.

Talking about uniqueness, let's explore which features have all unique values in it.

In [182]:
# Checking which features have unique values
for i in df.columns:
    print(i,':', df[i].is_unique)

kepid : False
kepoi_name : True
kepler_name : False
koi_disposition : False
koi_pdisposition : False
koi_score : False
koi_fpflag_nt : False
koi_fpflag_ss : False
koi_fpflag_co : False
koi_fpflag_ec : False
koi_period : True
koi_period_err1 : False
koi_period_err2 : False
koi_time0bk : False
koi_time0bk_err1 : False
koi_time0bk_err2 : False
koi_impact : False
koi_impact_err1 : False
koi_impact_err2 : False
koi_duration : False
koi_duration_err1 : False
koi_duration_err2 : False
koi_depth : False
koi_depth_err1 : False
koi_depth_err2 : False
koi_prad : False
koi_prad_err1 : False
koi_prad_err2 : False
koi_teq : False
koi_teq_err1 : False
koi_teq_err2 : False
koi_insol : False
koi_insol_err1 : False
koi_insol_err2 : False
koi_model_snr : False
koi_tce_plnt_num : False
koi_tce_delivname : False
koi_steff : False
koi_steff_err1 : False
koi_steff_err2 : False
koi_slogg : False
koi_slogg_err1 : False
koi_slogg_err2 : False
koi_srad : False
koi_srad_err1 : False
koi_srad_err2 : False
ra_str :

We confirm that `kepoi_name` is unique, but we also interestingly see that `koi_period` is also unique. So, let's start exploring this feature firsta and see if we can find relevant information on it.

In [183]:
s_f(df, 'koi_period') #Summary feature 'koi_period'
'''
0 missing values (not equal to positive and negative uncertanties)
need to add uncertanties and handle it as a range
all records non null, we may disregard the ranges or fill the ones with zeros
all unique values
all float64
'''

There are 8054 non-zero values out of 8054 in 'koi_period'
Number of unique values: 8054
Unique Values: [19.89913995  1.73695245  2.52559178 ... 21.3851584  11.42755098
 23.44970494]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_period
Non-Null Count  Dtype  
--------------  -----  
8054 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n0 missing values (not equal to positive and negative uncertanties)\nneed to add uncertanties and handle it as a range\nall records non null, we may disregard the ranges or fill the ones with zeros\nall unique values\nall float64\n'

In [184]:
# Easy drops
df.drop(columns=['koi_kepmag_err', 'koi_teq_err1', 'koi_teq_err2', 'kepid', 'kepoi_name'], inplace=True)

In [185]:
#Creating a data frame for visualizing the missing values in raw data
missing = pd.concat([df.isnull().sum(), 100 * df.isnull().mean()], axis=1)
missing.columns=['count', '%']
missing.sort_values(by='count', ascending=False)

Unnamed: 0,count,%
kepler_name,5398,67.022597
koi_steff_err2,162,2.011423
koi_period_err2,150,1.862429
koi_depth_err1,150,1.862429
koi_duration_err2,150,1.862429
koi_duration_err1,150,1.862429
koi_impact_err2,150,1.862429
koi_impact_err1,150,1.862429
koi_time0bk_err2,150,1.862429
koi_time0bk_err1,150,1.862429


In [186]:
df.columns

Index(['kepler_name', 'koi_disposition', 'koi_pdisposition', 'koi_score',
       'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
       'koi_period', 'koi_period_err1', 'koi_period_err2', 'koi_time0bk',
       'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact', 'koi_impact_err1',
       'koi_impact_err2', 'koi_duration', 'koi_duration_err1',
       'koi_duration_err2', 'koi_depth', 'koi_depth_err1', 'koi_depth_err2',
       'koi_prad', 'koi_prad_err1', 'koi_prad_err2', 'koi_teq', 'koi_insol',
       'koi_insol_err1', 'koi_insol_err2', 'koi_model_snr', 'koi_tce_plnt_num',
       'koi_tce_delivname', 'koi_steff', 'koi_steff_err1', 'koi_steff_err2',
       'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2', 'koi_srad',
       'koi_srad_err1', 'koi_srad_err2', 'ra_str', 'dec_str', 'koi_kepmag'],
      dtype='object')

In [187]:
s_f(df, 'koi_disposition') #Summary feature 'koi_disposition'
'''
This will be the label (3 classes)
non null
need to check class imbalance 
READY FOR EDA 
'''

There are 8054 non-zero values out of 8054 in 'koi_disposition'
Number of unique values: 3
Unique Values: ['FALSE POSITIVE' 'CONFIRMED' 'CANDIDATE']

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_disposition
Non-Null Count  Dtype 
--------------  ----- 
8054 non-null   object
dtypes: object(1)
memory usage: 63.0+ KB


'\nThis will be the label (3 classes)\nnon null\nneed to check class imbalance \nREADY FOR EDA \n'

In [188]:
s_f(df, 'koi_pdisposition') #Summary feature 'koi_pdisposition'
'''
considering dropping Project Disposition Columns
considering one model with and without Project Disposition Columns
non null
binary flag feature
'''

There are 8054 non-zero values out of 8054 in 'koi_pdisposition'
Number of unique values: 2
Unique Values: ['FALSE POSITIVE' 'CANDIDATE']

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_pdisposition
Non-Null Count  Dtype 
--------------  ----- 
8054 non-null   object
dtypes: object(1)
memory usage: 63.0+ KB


'\nconsidering dropping Project Disposition Columns\nconsidering one model with and without Project Disposition Columns\nnon null\nbinary flag feature\n'

In [189]:
s_f(df, 'koi_score') #Summary feature 'koi_score'
'''
considering dropping Project Disposition Columns
considering one model with and without Project Disposition Columns
non null
probabilities from 0 to 1 of koi_pdisposition
'''

There are 4572 non-zero values out of 8054 in 'koi_score'
Number of unique values: 650
Unique Values: [0.    1.    0.871 0.992 0.999 0.996 0.997 0.989 0.959 0.998 0.966 0.758
 0.884 0.983 0.981 0.984 0.978 0.913 0.002 0.909 0.995 0.98  0.971 0.878
 0.632 0.557 0.008 0.974 0.285 0.68  0.994 0.95  0.91  0.939 0.991 0.754
 0.811 0.987 0.957 0.762 0.985 0.942 0.711 0.881 0.014 0.573 0.876 0.848
 0.961 0.975 0.895 0.086 0.242 0.94  0.545 0.993 0.149 0.99  0.964 0.795
 0.752 0.098 0.976 0.894 0.483 0.952 0.866 0.046 0.695 0.821 0.8   0.819
 0.436 0.973 0.934 0.965 0.117 0.96  0.773 0.953 0.704 0.705 0.378 0.988
 0.053 0.635 0.745 0.875 0.949 0.04  0.885 0.927 0.936 0.415 0.986 0.137
 0.047 0.896 0.174 0.982 0.931 0.912 0.037 0.228 0.968 0.945 0.765 0.815
 0.001 0.915 0.929 0.969 0.006 0.92  0.729 0.908 0.829 0.003 0.565 0.938
 0.979 0.889 0.404 0.638 0.552 0.922 0.883 0.944 0.464 0.234 0.243 0.899
 0.951 0.015 0.036 0.305 0.868 0.958 0.808 0.907 0.946 0.802 0.004 0.956
 0.4   0.496 0.013 0.9

'\nconsidering dropping Project Disposition Columns\nconsidering one model with and without Project Disposition Columns\nnon null\nprobabilities from 0 to 1 of koi_pdisposition\n'

In [190]:
s_f(df, 'koi_fpflag_nt') #Summary feature 'koi_fpflag_nt'
'''
considering dropping Project Disposition Columns
considering one model with and without Project Disposition Columns
non null
binary flag feature
'''

There are 995 non-zero values out of 8054 in 'koi_fpflag_nt'
Number of unique values: 2
Unique Values: [0 1]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_fpflag_nt
Non-Null Count  Dtype
--------------  -----
8054 non-null   int64
dtypes: int64(1)
memory usage: 63.0 KB


'\nconsidering dropping Project Disposition Columns\nconsidering one model with and without Project Disposition Columns\nnon null\nbinary flag feature\n'

In [191]:
s_f(df, 'koi_fpflag_ss') #Summary feature 'koi_fpflag_ss'
'''
considering dropping Project Disposition Columns
considering one model with and without Project Disposition Columns
non null
binary flag feature
'''

There are 2145 non-zero values out of 8054 in 'koi_fpflag_ss'
Number of unique values: 2
Unique Values: [1 0]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_fpflag_ss
Non-Null Count  Dtype
--------------  -----
8054 non-null   int64
dtypes: int64(1)
memory usage: 63.0 KB


'\nconsidering dropping Project Disposition Columns\nconsidering one model with and without Project Disposition Columns\nnon null\nbinary flag feature\n'

In [192]:
s_f(df, 'koi_fpflag_co') #Summary feature 'koi_fpflag_co'
'''
considering dropping Project Disposition Columns
considering one model with and without Project Disposition Columns
non null
binary flag feature
'''

There are 1752 non-zero values out of 8054 in 'koi_fpflag_co'
Number of unique values: 2
Unique Values: [0 1]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_fpflag_co
Non-Null Count  Dtype
--------------  -----
8054 non-null   int64
dtypes: int64(1)
memory usage: 63.0 KB


'\nconsidering dropping Project Disposition Columns\nconsidering one model with and without Project Disposition Columns\nnon null\nbinary flag feature\n'

In [193]:
s_f(df, 'koi_fpflag_ec') #Summary feature 'koi_fpflag_ec'
'''
considering dropping Project Disposition Columns
non null
binary flag feature
'''

There are 1087 non-zero values out of 8054 in 'koi_fpflag_ec'
Number of unique values: 2
Unique Values: [0 1]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_fpflag_ec
Non-Null Count  Dtype
--------------  -----
8054 non-null   int64
dtypes: int64(1)
memory usage: 63.0 KB


'\nconsidering dropping Project Disposition Columns\nnon null\nbinary flag feature\n'

In [194]:
s_f(df, 'koi_period_err1') #Summary feature 'koi_period_err1'
'''
142 missing values 
need to add this value to koi_period feature
all float64
'''

There are 8054 non-zero values out of 8054 in 'koi_period_err1'
Number of unique values: 6438
Unique Values: [1.494e-05 2.630e-07 3.761e-06 ... 3.579e-05 8.150e-06 7.193e-05]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_period_err1
Non-Null Count  Dtype  
--------------  -----  
7904 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n142 missing values \nneed to add this value to koi_period feature\nall float64\n'

In [195]:
s_f(df, 'koi_period_err2') #Summary feature 'koi_period_err2'
'''
142 missing values 
need to substract this value to koi_period feature
all float64
'''

There are 8054 non-zero values out of 8054 in 'koi_period_err2'
Number of unique values: 6438
Unique Values: [-1.494e-05 -2.630e-07 -3.761e-06 ... -3.579e-05 -8.150e-06 -7.193e-05]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_period_err2
Non-Null Count  Dtype  
--------------  -----  
7904 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n142 missing values \nneed to substract this value to koi_period feature\nall float64\n'

In [196]:
s_f(df, 'koi_time0bk') #Summary feature 'koi_time0bk'
'''
0 missing values (not equal to positive and negative uncertanties)
need to add uncertanties and handle it as a range
all records non null, we may disregard the ranges or fill the ones with zeros
all float64
'''

There are 8054 non-zero values out of 8054 in 'koi_time0bk'
Number of unique values: 8034
Unique Values: [175.850252 170.307565 171.59555  ... 178.78285  137.613711 182.92295 ]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_time0bk
Non-Null Count  Dtype  
--------------  -----  
8054 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n0 missing values (not equal to positive and negative uncertanties)\nneed to add uncertanties and handle it as a range\nall records non null, we may disregard the ranges or fill the ones with zeros\nall float64\n'

In [197]:
s_f(df, 'koi_time0bk_err1') #Summary feature 'koi_time0bk_err1'
'''
142 missing values 
need to add this value to koi_time0bk feature
all float64
'''

There are 8054 non-zero values out of 8054 in 'koi_time0bk_err1'
Number of unique values: 2375
Unique Values: [0.000581 0.000115 0.00113  ... 0.000888 0.000937 0.00215 ]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_time0bk_err1
Non-Null Count  Dtype  
--------------  -----  
7904 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n142 missing values \nneed to add this value to koi_time0bk feature\nall float64\n'

In [198]:
s_f(df, 'koi_time0bk_err2') #Summary feature 'koi_time0bk_err2'
'''
142 missing values 
need to substract this value to koi_time0bk feature
all float64
'''

There are 8054 non-zero values out of 8054 in 'koi_time0bk_err2'
Number of unique values: 2375
Unique Values: [-0.000581 -0.000115 -0.00113  ... -0.000888 -0.000937 -0.00215 ]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_time0bk_err2
Non-Null Count  Dtype  
--------------  -----  
7904 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n142 missing values \nneed to substract this value to koi_time0bk feature\nall float64\n'

In [199]:
s_f(df, 'koi_impact') #Summary feature 'koi_impact'
'''
51 missing values (not equal to positive and negative uncertanties)
need to add uncertanties and handle it as a range
records with koi_depth values but not uncertanties can be considered as uncertanties zero
all float64
'''

There are 8039 non-zero values out of 8054 in 'koi_impact'
Number of unique values: 1391
Unique Values: [0.969 1.276 0.701 ... 1.475 2.824 1.588]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_impact
Non-Null Count  Dtype  
--------------  -----  
7995 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n51 missing values (not equal to positive and negative uncertanties)\nneed to add uncertanties and handle it as a range\nrecords with koi_depth values but not uncertanties can be considered as uncertanties zero\nall float64\n'

In [200]:
s_f(df, 'koi_impact_err1') #Summary feature 'koi_impact_err1'
'''
142 missing values 
need to add this value to koi_impact feature
all float64
'''

There are 8030 non-zero values out of 8054 in 'koi_impact_err1'
Number of unique values: 1266
Unique Values: [5.126 0.115 0.235 ... 0.548 7.582 6.629]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_impact_err1
Non-Null Count  Dtype  
--------------  -----  
7904 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n142 missing values \nneed to add this value to koi_impact feature\nall float64\n'

In [201]:
s_f(df, 'koi_impact_err2') #Summary feature 'koi_impact_err2'
'''
142 missing values 
need to substract this value to koi_impact feature
all float64
'''

There are 8021 non-zero values out of 8054 in 'koi_impact_err2'
Number of unique values: 872
Unique Values: [-7.700e-02 -9.200e-02 -4.780e-01 -4.280e-01 -5.320e-01 -4.760e-01
 -5.230e-01 -8.000e-03 -1.400e-02 -2.350e-01 -5.210e-01 -3.780e-01
 -3.500e-02 -4.800e-02 -1.150e-01 -5.430e-01 -3.300e-02 -5.200e-02
 -1.810e-01 -2.300e-02 -5.090e-01 -3.290e-01 -5.410e-01 -2.200e-02
 -8.800e-02 -6.400e-02 -4.100e-02 -3.100e-02 -3.910e-01 -6.730e-01
 -5.760e-01 -2.900e-02 -4.080e-01 -4.000e-03 -2.662e+00 -4.300e-02
 -3.570e-01 -3.600e-02 -3.300e-01 -1.200e-01 -5.160e-01 -3.850e-01
 -2.570e-01 -4.190e-01 -5.310e-01 -3.900e-01 -1.000e-02 -4.730e-01
 -5.400e-02 -2.500e-02 -1.900e-02 -1.100e-01 -7.450e-01 -5.190e-01
 -4.420e-01 -5.530e-01 -5.130e-01 -2.040e-01 -1.300e-02 -5.950e-01
 -4.970e-01 -2.960e-01 -8.700e-02 -2.800e-01 -1.017e+01 -1.780e-01
 -6.600e-02 -8.400e-02 -7.000e-03 -1.700e-02 -1.730e-01 -2.130e-01
 -3.800e-01 -4.160e-01 -6.710e-01 -6.230e-01 -5.830e-01 -3.980e-01
 -1.350e-01 -5.370e-0

'\n142 missing values \nneed to substract this value to koi_impact feature\nall float64\n'

In [202]:
s_f(df, 'koi_duration') #Summary feature 'koi_duration'
'''
0 missing values (not equal to positive and negative uncertanties)
need to add uncertanties and handle it as a range
all records non null, we may disregard the ranges or fill the ones with zeros
all float64
'''

There are 8054 non-zero values out of 8054 in 'koi_duration'
Number of unique values: 6982
Unique Values: [1.7822  2.40641 1.6545  ... 4.0819  4.6074  8.6567 ]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_duration
Non-Null Count  Dtype  
--------------  -----  
8054 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n0 missing values (not equal to positive and negative uncertanties)\nneed to add uncertanties and handle it as a range\nall records non null, we may disregard the ranges or fill the ones with zeros\nall float64\n'

In [203]:
s_f(df, 'koi_duration_err1') #Summary feature 'koi_duration_err1'
'''
142 missing values 
need to add this value to koi_duration feature
all float64
'''

There are 8037 non-zero values out of 8054 in 'koi_duration_err1'
Number of unique values: 2153
Unique Values: [0.0341  0.00537 0.042   ... 0.0724  0.0902  0.0751 ]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_duration_err1
Non-Null Count  Dtype  
--------------  -----  
7904 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n142 missing values \nneed to add this value to koi_duration feature\nall float64\n'

In [204]:
s_f(df, 'koi_duration_err2') #Summary feature 'koi_duration_err2'
'''
142 missing values 
need to substract this value to koi_duration feature
all float64
'''

There are 8037 non-zero values out of 8054 in 'koi_duration_err2'
Number of unique values: 2153
Unique Values: [-0.0341  -0.00537 -0.042   ... -0.0724  -0.0902  -0.0751 ]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_duration_err2
Non-Null Count  Dtype  
--------------  -----  
7904 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n142 missing values \nneed to substract this value to koi_duration feature\nall float64\n'

In [205]:
s_f(df, 'koi_depth') #Summary feature 'koi_depth'
'''
51 missing values (not equal to positive and negative uncertanties)
need to add uncertanties and handle it as a range
records with koi_depth values but not uncertanties can be considered as uncertanties zero
all float64
'''

There are 8053 non-zero values out of 8054 in 'koi_depth'
Number of unique values: 6343
Unique Values: [10829.   8079.2   603.3 ...   663.9   500.9   315.6]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_depth
Non-Null Count  Dtype  
--------------  -----  
7995 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n51 missing values (not equal to positive and negative uncertanties)\nneed to add uncertanties and handle it as a range\nrecords with koi_depth values but not uncertanties can be considered as uncertanties zero\nall float64\n'

In [206]:
s_f(df, 'koi_depth_err1') #Summary feature 'koi_depth_err1'
'''
142 missing values 
need to add this value to koi_depth feature
all float64
'''

There are 8043 non-zero values out of 8054 in 'koi_depth_err1'
Number of unique values: 1339
Unique Values: [ 171.    12.8   16.9 ...  575.  5102.    64.1]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_depth_err1
Non-Null Count  Dtype  
--------------  -----  
7904 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n142 missing values \nneed to add this value to koi_depth feature\nall float64\n'

In [207]:
s_f(df, 'koi_depth_err2') #Summary feature 'koi_depth_err2'
'''
142 missing values 
need to substract this value to koi_depth feature
all float64
'''

There are 8043 non-zero values out of 8054 in 'koi_depth_err2'
Number of unique values: 1339
Unique Values: [ -171.    -12.8   -16.9 ...  -575.  -5102.    -64.1]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_depth_err2
Non-Null Count  Dtype  
--------------  -----  
7904 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n142 missing values \nneed to substract this value to koi_depth feature\nall float64\n'

In [208]:
s_f(df, 'koi_prad') #Summary feature 'koi_prad'
'''
51 missing values (equal to positive and negative uncertanties)
need to add uncertanties and handle it as a rnage
all float64
'''

There are 8054 non-zero values out of 8054 in 'koi_prad'
Number of unique values: 2831
Unique Values: [14.6  33.46  2.75 ...  5.89 42.19  4.76]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_prad
Non-Null Count  Dtype  
--------------  -----  
7995 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n51 missing values (equal to positive and negative uncertanties)\nneed to add uncertanties and handle it as a rnage\nall float64\n'

In [209]:
s_f(df, 'koi_prad_err1') #Summary feature 'koi_prad_err1'
'''
51 missing values 
need to add this value to koi_prad feature
all float64
'''

There are 7957 non-zero values out of 8054 in 'koi_prad_err1'
Number of unique values: 1704
Unique Values: [ 3.92  8.5   0.88 ... 30.61 19.62 12.65]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_prad_err1
Non-Null Count  Dtype  
--------------  -----  
7995 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n51 missing values \nneed to add this value to koi_prad feature\nall float64\n'

In [210]:
s_f(df, 'koi_prad_err2') #Summary feature 'koi_prad_err2'
'''
51 missing values 
need to substract this value to koi_prad feature
all float64
'''

There are 7958 non-zero values out of 8054 in 'koi_prad_err2'
Number of unique values: 1492
Unique Values: [ -1.31  -2.83  -0.35 ... -65.95 -45.77  -8.43]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_prad_err2
Non-Null Count  Dtype  
--------------  -----  
7995 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n51 missing values \nneed to substract this value to koi_prad feature\nall float64\n'

In [211]:
s_f(df, 'koi_teq') #Summary feature 'koi_teq'
'''
51 missing values
all float64
READY FOR EDA
'''

There are 8054 non-zero values out of 8054 in 'koi_teq'
Number of unique values: 2413
Unique Values: [ 638. 1395. 1406. ... 1376. 2713. 2337.]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_teq
Non-Null Count  Dtype  
--------------  -----  
7995 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n51 missing values\nall float64\nREADY FOR EDA\n'

In [212]:
s_f(df, 'koi_insol') #Summary feature 'koi_insol'
'''
51 missing values (equal to positive and negative uncertanties)
need to add uncertanties and handle it as a rnage
all float64
'''

There are 8054 non-zero values out of 8054 in 'koi_insol'
Number of unique values: 7202
Unique Values: [ 39.3  891.96 926.16 ... 104.04 308.61 124.31]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_insol
Non-Null Count  Dtype  
--------------  -----  
7995 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n51 missing values (equal to positive and negative uncertanties)\nneed to add uncertanties and handle it as a rnage\nall float64\n'

In [213]:
s_f(df, 'koi_insol_err1') #Summary feature 'koi_insol_err1'
'''
51 missing values
need to add this value to koi_insol feature
all float64
'''

There are 7957 non-zero values out of 8054 in 'koi_insol_err1'
Number of unique values: 6589
Unique Values: [ 31.04 668.95 874.33 ...  51.4  152.48  48.42]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_insol_err1
Non-Null Count  Dtype  
--------------  -----  
7995 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n51 missing values\nneed to add this value to koi_insol feature\nall float64\n'

In [214]:
s_f(df, 'koi_insol_err2') #Summary feature 'koi_insol_err2'
'''
51 missing values 
need to substract this value to koi_insol feature
all float64
'''

There are 7958 non-zero values out of 8054 in 'koi_insol_err2'
Number of unique values: 6158
Unique Values: [ -10.49 -230.35 -314.24 ...   -5.12  -34.01  -33.09]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_insol_err2
Non-Null Count  Dtype  
--------------  -----  
7995 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n51 missing values \nneed to substract this value to koi_insol feature\nall float64\n'

In [215]:
s_f(df, 'koi_model_snr') #Summary feature 'koi_model_snr'
'''
51 missing values
READY FOR EDA
'''

There are 8052 non-zero values out of 8054 in 'koi_model_snr'
Number of unique values: 2694
Unique Values: [  76.3  505.6   40.9 ... 1230.7  117.3  165.5]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_model_snr
Non-Null Count  Dtype  
--------------  -----  
7995 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n51 missing values\nREADY FOR EDA\n'

In [216]:
s_f(df, 'koi_tce_plnt_num') #Summary feature 'koi_tce_plnt_num'
'''
not missing values
values range from 1 to 8
not entirely clear what this feature means
'''

There are 8054 non-zero values out of 8054 in 'koi_tce_plnt_num'
Number of unique values: 8
Unique Values: [1 2 3 5 4 6 7 8]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_tce_plnt_num
Non-Null Count  Dtype
--------------  -----
8054 non-null   int64
dtypes: int64(1)
memory usage: 63.0 KB


'\nnot missing values\nvalues range from 1 to 8\nnot entirely clear what this feature means\n'

In [217]:
s_f(df, 'koi_tce_delivname') #Summary feature 'koi_tce_delivname'
'''
unique value feature, I can drop it
'''

There are 8054 non-zero values out of 8054 in 'koi_tce_delivname'
Number of unique values: 1
Unique Values: ['q1_q17_dr25_tce']

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_tce_delivname
Non-Null Count  Dtype 
--------------  ----- 
8054 non-null   object
dtypes: object(1)
memory usage: 63.0+ KB


'\nunique value feature, I can drop it\n'

In [218]:
s_f(df, 'koi_steff') #Summary feature 'koi_steff'
'''
51 missing values
less than the both negative and positive uncertanties
need to add uncertanties to this value and handle the range
we can assume 0 negative and positive uncertanties where koi_steff values are existent
all float64
'''

There are 8054 non-zero values out of 8054 in 'koi_steff'
Number of unique values: 2288
Unique Values: [5853. 5805. 6031. ... 5867. 6561. 6663.]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_steff
Non-Null Count  Dtype  
--------------  -----  
7995 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n51 missing values\nless than the both negative and positive uncertanties\nneed to add uncertanties to this value and handle the range\nwe can assume 0 negative and positive uncertanties where koi_steff values are existent\nall float64\n'

In [219]:
s_f(df, 'koi_steff_err1') #Summary feature 'koi_steff_err1'
'''
139 missing values less than the negative uncertanty
need to add this value to koi_steff feature
all float64
'''

There are 8045 non-zero values out of 8054 in 'koi_steff_err1'
Number of unique values: 258
Unique Values: [158. 157. 169. 189. 104. 114.  89.  99.  71. 176. 560. 118. 146. 117.
  90.  44. 304. 171. 136. 164. 167. 120. 159. 216. 187. 194.  80. 184.
 115.  98. 168. 172. 105. 153.  83.  54. 183. 133.  84.  82.  74. 113.
  81.  88. 186. 191. 190. 185. 179. 204.  76. 139. 193. 100. 161.  75.
 131. 156. 155. 149.  85.  79. 152.  77. 130. 154. 151. 108. 197.  50.
 267.  78. 215. 170. 201. 174. 182.  nan 112. 119.  51. 111. 162.  60.
 121. 166. 109. 116.  72. 123. 180. 173. 128. 200. 129. 226. 106. 145.
 202.  52. 175. 177. 125. 676. 110. 206. 219. 280. 150. 101.  65.  73.
 165. 181. 163.  87.  57. 228. 148.  40.  68. 328. 160. 103.  58. 107.
 196.  64. 140.  59. 481. 135. 195. 291.  92.  61. 242. 240.  93. 141.
  70. 102. 143. 147.  49. 205.  56. 126. 272. 224.  94. 138. 192. 142.
  97.   0.  69. 198. 122. 221.  86. 235.  62. 134. 203. 199. 209.  67.
 137. 144. 265. 178. 124. 447. 217. 302. 

'\n139 missing values less than the negative uncertanty\nneed to add this value to koi_steff feature\nall float64\n'

In [220]:
s_f(df, 'koi_steff_err2') #Summary feature 'koi_steff_err2'
'''
154 missing values
more than positive uncertanty, we can ssume a 0 negative uncertanty when a positive is existent
need to substract this value to koi_steff feature
all float64
'''

There are 8032 non-zero values out of 8054 in 'koi_steff_err2'
Number of unique values: 359
Unique Values: [ -176.  -174.  -211.  -232.  -115.  -103.   -78.    nan   -89.  -193.
  -301.  -130.  -190.  -117.   -90.   -24.  -457.  -206.  -151.  -200.
  -230.  -120.  -195.  -324.  -267.   -80.  -205.  -104.  -152.   -71.
  -155.  -108.  -191.  -220.   -91.   -74.  -183.  -146.   -73.  -210.
   -76.  -101.   -81.   -64.  -186.  -212.  -185.  -162.  -168.   -84.
  -125.  -164.  -214.  -100.  -179.  -136.  -173.  -172.  -149.   -77.
   -79.   -83.     0.  -144.   -85.  -171.  -158.   -75.  -145.  -241.
  -121.  -326.   -69.  -247.  -259.  -189.  -204.  -182.  -217.  -138.
  -157.  -201.  -111.   -72.  -180.   -67.  -133.  -240.  -209.  -228.
  -160.   -98.  -110.  -116.  -124.   -82.  -153.  -123.  -239.  -105.
  -142.  -166.  -343.   -60.  -119.  -340.  -106.  -194.  -202.   -58.
  -159.  -140.  -169.  -177.  -416.  -135.  -335.  -175.  -406.  -229.
  -147.   -66.   -94.  -163.   -86.  -148

'\n154 missing values\nmore than positive uncertanty, we can ssume a 0 negative uncertanty when a positive is existent\nneed to substract this value to koi_steff feature\nall float64\n'

In [221]:
s_f(df, 'koi_slogg') #Summary feature 'koi_slogg'
'''
139 missing values
need to add its uncertanties and handle the value in ranges
all float64
'''

There are 8054 non-zero values out of 8054 in 'koi_slogg'
Number of unique values: 1438
Unique Values: [4.544 4.564 4.438 ... 4.164 3.676 4.022]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_slogg
Non-Null Count  Dtype  
--------------  -----  
7995 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n139 missing values\nneed to add its uncertanties and handle the value in ranges\nall float64\n'

In [222]:
s_f(df, 'koi_slogg_err1') #Summary feature 'koi_slogg_err1'
'''
139 missing values
need to substract this value to koi_slogg range
all float64
'''

There are 8045 non-zero values out of 8054 in 'koi_slogg_err1'
Number of unique values: 464
Unique Values: [0.044 0.053 0.07  0.054 0.132 0.162 0.04  0.011 0.055 0.136 0.104 0.045
 0.042 0.196 0.013 0.22  0.084 0.707 0.052 0.352 0.195 0.13  0.392 0.058
 0.127 0.038 0.168 0.018 0.099 0.231 0.185 0.21  0.156 0.056 0.022 0.008
 0.512 0.05  0.174 0.09  0.217 0.218 0.048 0.03  0.105 0.091 0.028 0.18
 0.021 0.046 0.027 0.035 0.033 0.098 0.075 0.083 0.092 0.11  0.063 0.077
 0.144 0.639 0.272 0.137 0.74    nan 0.246 0.1   0.227 0.06  0.036 0.065
 0.115 0.264 0.125 0.149 0.126 0.012 0.203 0.101 0.024 0.029 0.085 0.182
 0.039 0.072 0.088 0.02  0.153 0.01  0.252 0.112 0.225 0.258 0.145 0.066
 0.84  0.376 0.032 0.015 0.135 0.08  0.078 0.121 0.034 0.241 0.062 0.124
 0.094 0.076 0.186 0.016 0.116 0.026 0.051 0.014 0.282 0.52  0.087 0.025
 0.315 0.138 0.037 0.832 0.067 0.117 0.3   0.238 0.378 0.093 0.023 0.031
 0.443 0.293 0.049 0.175 0.12  0.917 0.343 0.064 0.188 0.198 0.128 0.071
 0.323 0.266 0.14 

'\n139 missing values\nneed to substract this value to koi_slogg range\nall float64\n'

In [223]:
s_f(df, 'koi_slogg_err2') #Summary feature 'koi_slogg_err2'
'''
139 missing values
need to add this value to koi_slogg range
all float64
'''

There are 8045 non-zero values out of 8054 in 'koi_slogg_err2'
Number of unique values: 300
Unique Values: [-0.176 -0.168 -0.21  -0.229 -0.108 -0.036 -0.06  -0.055 -0.187 -0.085
 -0.04  -0.084 -0.043 -0.098 -0.22  -0.196 -0.303 -0.208 -0.088 -0.105
 -0.159 -0.232 -0.104 -0.212 -0.102 -0.121 -0.099 -0.185 -0.117 -0.216
 -0.224 -0.025 -0.009 -0.128 -0.032 -0.065 -0.2   -0.116 -0.093 -0.087
 -0.192 -0.09  -0.115 -0.039 -0.027 -0.171 -0.195 -0.053 -0.052 -0.021
 -0.063 -0.045 -0.02  -0.028 -0.15  -0.11  -0.016 -0.184 -0.152 -0.071
 -0.048 -0.18  -0.173 -0.125 -0.131 -0.03  -0.188 -0.056 -0.165    nan
 -0.164 -0.1   -0.033 -0.075 -0.066 -0.008 -0.112 -0.182 -0.138 -0.064
 -0.12  -0.072 -0.024 -0.162 -0.114 -0.022 -0.035 -0.169 -0.144 -0.189
 -0.203 -0.137 -0.07  -0.211 -0.145 -0.143 -0.015 -0.094 -0.16  -0.217
 -0.044 -0.136 -0.13  -0.201 -0.175 -0.049 -0.122 -0.077 -0.198 -0.097
 -0.156 -0.135 -0.103 -0.041 -0.161 -0.081 -0.213 -0.221 -0.08  -0.038
 -0.031 -0.095 -0.202 -0.018 -0.179 -0.08

'\n139 missing values\nneed to add this value to koi_slogg range\nall float64\n'

In [224]:
s_f(df, 'koi_srad') #Summary feature 'koi_srad'
'''
51 missing values
need to add this value to koi_srad to put it in a range and handle the range
all float64
'''

There are 8054 non-zero values out of 8054 in 'koi_srad'
Number of unique values: 2075
Unique Values: [0.868 0.791 1.046 ... 2.872 1.338 1.694]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_srad
Non-Null Count  Dtype  
--------------  -----  
7995 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n51 missing values\nneed to add this value to koi_srad to put it in a range and handle the range\nall float64\n'

In [225]:
s_f(df, 'koi_srad_err1') #Summary feature 'koi_srad_err1'
'''
139 missing values
need to add this value to koi_srad range
'''

There are 8046 non-zero values out of 8054 in 'koi_srad_err1'
Number of unique values: 937
Unique Values: [2.3300e-01 2.0100e-01 3.3400e-01 3.1500e-01 1.5700e-01 2.4200e-01
 1.2500e-01 5.4000e-02 4.5000e-02 3.3300e-01 1.0700e-01 3.6000e-02
 2.0800e-01 2.8100e-01 4.3000e-02 3.9000e-02 8.1400e-01 3.0400e-01
 1.0350e+00 2.7400e-01 3.6200e-01 2.2600e-01 2.9400e-01 5.0500e-01
 4.1500e-01 5.1300e-01 1.4100e-01 3.0100e-01 2.3200e-01 9.3000e-02
 1.0200e-01 1.9900e-01 2.8300e-01 1.7800e-01 2.6400e-01 2.4700e-01
 3.5300e-01 3.6300e-01 2.7000e-02 2.0500e-01 4.5300e-01 3.8000e-02
 7.2000e-02 2.9700e-01 1.8200e-01 3.0000e-01 4.4000e-02 1.6300e-01
 2.1100e-01 3.2300e-01 2.4900e-01 2.4600e-01 3.1600e-01 1.0400e-01
 1.7300e-01 1.3000e-01 1.5100e-01 4.9000e-02 5.0000e-02 1.9800e-01
 3.5700e-01 3.5000e-01 4.6000e-02 2.6900e-01 6.7000e-02 3.0000e-02
 7.5000e-02 2.4400e-01 2.2400e-01 5.7000e-02 3.2000e-02 3.3000e-02
 9.1000e-02 1.8100e-01 2.3000e-02 3.3000e-01 2.2700e-01 7.1000e-02
 1.8600e-01 8.8000e-02 

'\n139 missing values\nneed to add this value to koi_srad range\n'

In [226]:
s_f(df, 'koi_srad_err2') #Summary feature 'koi_srad_err2'
'''
139 missing values
need to substract this value to koi_srad range
'''

There are 8046 non-zero values out of 8054 in 'koi_srad_err2'
Number of unique values: 1232
Unique Values: [-0.078 -0.067 -0.133 ... -1.66  -1.969 -0.439]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_srad_err2
Non-Null Count  Dtype  
--------------  -----  
7907 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\n139 missing values\nneed to substract this value to koi_srad range\n'

In [227]:
s_f(df, 'ra_str') #Summary feature 'ra_str'
'''
Date-time stamps
no missing values
Considering date-time objects relevant for the project
%h%m%s
'''

There are 8054 non-zero values out of 8054 in 'ra_str'
Number of unique values: 6852
Unique Values: ['19h48m01.16s' '19h02m08.31s' '19h15m01.17s' ... '19h16m18.61s'
 '19h34m54.73s' '19h09m19.87s']

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: ra_str
Non-Null Count  Dtype 
--------------  ----- 
8054 non-null   object
dtypes: object(1)
memory usage: 63.0+ KB


'\nDate-time stamps\nno missing values\nConsidering date-time objects relevant for the project\n%h%m%s\n'

In [228]:
s_f(df, 'dec_str') #Summary feature 'dec_str'
'''
Date-time stamps
no missing values
Considering date-time objects relevant for the project
%d%m%s
'''

There are 8054 non-zero values out of 8054 in 'dec_str'
Number of unique values: 6873
Unique Values: ['+48d08m02.9s' '+48d17m06.8s' '+48d13m34.3s' ... '+46d00m18.8s'
 '+46d07m44.9s' '+46d12m12.6s']

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: dec_str
Non-Null Count  Dtype 
--------------  ----- 
8054 non-null   object
dtypes: object(1)
memory usage: 63.0+ KB


'\nDate-time stamps\nno missing values\nConsidering date-time objects relevant for the project\n%d%m%s\n'

In [229]:
s_f(df, 'koi_kepmag') #Summary feature 'koi_kepmag'
'''
No missing values
float64
READY FOR EDA
'''

There are 8054 non-zero values out of 8054 in 'koi_kepmag'
Number of unique values: 3562
Unique Values: [15.436 15.597 15.509 ... 13.58  13.759 13.765]

<class 'pandas.core.series.Series'>
RangeIndex: 8054 entries, 0 to 8053
Series name: koi_kepmag
Non-Null Count  Dtype  
--------------  -----  
8053 non-null   float64
dtypes: float64(1)
memory usage: 63.0 KB


'\nNo missing values\nfloat64\nREADY FOR EDA\n'

Using the API

https://exoplanetarchive.ipac.caltech.edu/docs/program_interfaces.html#koi

Table Documentation

https://exoplanetarchive.ipac.caltech.edu/docs/PurposeOfKOITable.html#cumulative

Table Columns

https://exoplanetarchive.ipac.caltech.edu/docs/API_kepcandidate_columns.html

Table

https://exoplanetarchive.ipac.caltech.edu/cgi-bin/TblView/nph-tblView?app=ExoTbls&config=cumulative

https://exoplanetarchive.ipac.caltech.edu/cgi-bin/TblView/nph-tblView?app=ExoTbls&config=koi




