In [44]:
from bs4 import BeautifulSoup
import pandas as pd
import csv

In [45]:
# Load the cumulative KOI export, discard its leading metadata block, and
# turn the remaining rows into a cleaned, numerically typed dataframe.
file_path = 'data/cumulative_2024.03.14_10.24.31.csv'

# Number of leading records (headers and metadata) that precede the table itself
metadata_row_count = 53

with open(file_path, mode='r', newline='') as csv_file:
    csv_rows = csv.reader(csv_file)
    for _ in range(metadata_row_count):
        next(csv_rows)
    # Whatever is left is the table: a row of column names followed by the data
    data_rows_clean = list(csv_rows)

# Build the dataframe from the column-name row and the data rows
header_row = data_rows_clean[0]
record_rows = data_rows_clean[1:]
df_cumulative = pd.DataFrame(record_rows, columns=header_row)

# Encode the disposition labels as integers:
#   koi_disposition  -> false positive 0, candidate 1, confirmed 2
#   koi_pdisposition -> false positive 0, candidate 1
disposition_codes = {'FALSE POSITIVE': 0, 'CANDIDATE': 1, 'CONFIRMED': 2}
pdisposition_codes = {'FALSE POSITIVE': 0, 'CANDIDATE': 1}
df_cumulative['koi_disposition'] = df_cumulative['koi_disposition'].map(disposition_codes)
df_cumulative['koi_pdisposition'] = df_cumulative['koi_pdisposition'].map(pdisposition_codes)

# Every column outside this list is parsed as a number; values that cannot be
# parsed are coerced to NaN.
non_numeric_columns = ['kepid', 'kepoi_name', 'kepler_name', 'koi_disposition', 'koi_pdisposition', 'koi_tce_delivname']
numeric_columns = [name for name in df_cumulative.columns if name not in non_numeric_columns]

for name in numeric_columns:
    df_cumulative[name] = pd.to_numeric(df_cumulative[name], errors='coerce')

# Drop the two koi_teq error columns (treated as empty), then discard rows
# with a missing koi_impact or koi_duration value.
df_cumulative = (
    df_cumulative
    .drop(columns=['koi_teq_err1', 'koi_teq_err2'])
    .dropna(subset=['koi_impact', 'koi_duration'])
)
display(df_cumulative.describe())


Unnamed: 0,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
count,9201.0,9201.0,7995.0,9201.0,9201.0,9201.0,9201.0,9201.0,9110.0,9110.0,...,9081.0,9201.0,9096.0,9096.0,9201.0,9096.0,9096.0,9201.0,9201.0,9200.0
mean,0.799804,0.50125,0.483768,0.195848,0.239322,0.203782,0.1239,74.307951,0.002148,-0.002148,...,-162.265059,4.310157,0.120738,-0.143161,1.728712,0.362292,-0.394806,292.062986,43.808212,14.276052
std,0.86936,0.500026,0.47701,4.85898,0.426693,0.402831,0.329485,1360.538847,0.008236,0.008236,...,72.746348,0.432606,0.132837,0.085477,6.127185,0.93087,2.168213,4.760401,3.602567,1.378187
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.241843,0.0,-0.1725,...,-1762.0,0.047,0.0,-1.207,0.109,0.0,-116.137,279.85272,36.577381,6.966
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.682768,5e-06,-0.000276,...,-198.0,4.218,0.042,-0.196,0.829,0.129,-0.25,288.68259,40.77697,13.46
50%,1.0,1.0,0.371,0.0,0.0,0.0,0.0,9.296746,3.5e-05,-3.5e-05,...,-160.0,4.438,0.07,-0.128,1.0,0.251,-0.111,292.26291,43.680962,14.531
75%,2.0,1.0,0.999,0.0,0.0,0.0,0.0,36.856776,0.000276,-5e-06,...,-114.0,4.543,0.149,-0.088,1.345,0.364,-0.069,295.86533,46.703129,15.326
max,2.0,1.0,1.0,465.0,1.0,1.0,1.0,129995.7784,0.1725,0.0,...,0.0,5.364,1.472,0.0,229.908,33.091,0.0,301.72076,52.33601,20.003


In [46]:
# Write the cleaned KOI dataframe to a CSV file, leaving the row index out
clean_csv_path = 'data/cumulative_clean.csv'
df_cumulative.to_csv(clean_csv_path, index=False)

In [47]:
# Scrape the column reference table from the saved Kepler Objects of Interest
# documentation page, then keep the rows describing columns of df_cumulative.

# Read the page as raw bytes and let BeautifulSoup determine the encoding.
# Text mode would decode with the platform's default encoding, which can
# garble non-ASCII characters such as the '†' marker stripped further down.
with open('data/Data columns in Kepler Objects of Interest Table.html', 'rb') as f:
    html = f.read()
soup = BeautifulSoup(html, 'html.parser')

# "Database Column Name" column
column_name = [cell.text.strip() for cell in soup.find_all('td', class_='name')]

# "Table Label" column: blank cells and cells containing '_str' are skipped
tabel_labels = []
for cell in soup.find_all('td', class_='label'):
    text = cell.text.strip()
    if text != '' and '_str' not in text:
        tabel_labels.append(text)

# "Description" column
description = [cell.text.strip() for cell in soup.find_all('td', class_='description')]

# "Uncertainty" column: walked row by row so that a row without any
# uncertainty cell still contributes a None placeholder.
uncertainty = []
columns = soup.find_all('tr', class_='column')

for row in columns:
    uncertainty_cells = row.find_all('td', class_='uncertainty')
    if not uncertainty_cells:
        uncertainty.append(None)
        continue
    for cell in uncertainty_cells:
        text = cell.text.strip()
        # Blank cells (or a literal '&nbsp;') mean "no uncertainty column"
        uncertainty.append(text if text not in ('', '&nbsp;') else None)

# Create a dataframe
data_dict = {'Database Column Name': column_name,
             'Table Label': tabel_labels,
             'Description': description,
             'Uncertainties Column (positive +) (negative - )': uncertainty}

# The four lists are scraped independently, so check that they line up and
# report the individual lengths if they do not.
scraped_lengths = {label: len(values) for label, values in data_dict.items()}
if len(set(scraped_lengths.values())) > 1:
    raise ValueError(f'Scraped columns have mismatched lengths: {scraped_lengths}')

# Remove the † marker wherever it appears in the scraped text
df_columns = pd.DataFrame(data_dict).replace('[†]', '', regex=True)

# Identify the relevant columns, i.e. those present in the cleaned data.
# The variable keeps its existing spelling because the CSV export cell reads it.
relavent_columns = df_columns[df_columns['Database Column Name'].isin(df_cumulative.columns)]
display(relavent_columns)

Unnamed: 0,Database Column Name,Table Label,Description,Uncertainties Column (positive +) (negative - )
0,kepid,Kepler Identification or KepID,"Target identification number, as listed in the...",
1,kepoi_name,KOI Name,A number used to identify and track a Kepler O...,
2,kepler_name,Kepler Name,"Kepler number name in the form ""Kepler-N,"" plu...",
3,koi_disposition,Exoplanet Archive Disposition,The category of this KOI from the Exoplanet Ar...,
6,koi_pdisposition,Disposition Using Kepler Data,The pipeline flag that designates the most pro...,
7,koi_score,Disposition Score,A value between 0 and 1 that indicates the con...,
8,koi_fpflag_nt,Not Transit-Like Flag,A KOI whose light curve is not consistent with...,
9,koi_fpflag_ss,Stellar Eclipse Flag,A KOI that is observed to have a significant s...,
10,koi_fpflag_co,Centroid Offset Flag,The source of the signal is from a nearby star...,
11,koi_fpflag_ec,Ephemeris Match Indicates Contamination Flag,The KOI shares the same period and epoch as an...,


In [48]:
# Write the column reference rows that matched the cleaned data to a CSV file,
# leaving the row index out
columns_csv_path = 'data/Data columns in Kepler Objects of Interest Table.csv'
relavent_columns.to_csv(columns_csv_path, index=False)

This research has made use of the NASA Exoplanet Archive, which is operated by the California Institute of Technology, under contract with the National Aeronautics and Space Administration under the Exoplanet Exploration Program.

DOI 10.26133/NEA4