In [7]:
from bs4 import BeautifulSoup
import pandas as pd
import csv

In [21]:
# Load the cumulative table from CSV, dropping the header/metadata preamble
# that precedes the column-name row.
file_path = 'data/cumulative_2024.03.14_10.24.31.csv'

# Number of leading rows (headers and metadata) to discard before the
# column-name row. This is specific to the export above; adjust it if a
# different export is loaded.
N_METADATA_LINES = 53

with open(file_path, mode='r', newline='') as csv_file:
    reader = csv.reader(csv_file)
    # Skip the preamble. Passing a default to next() keeps a shorter-than-
    # expected file from raising a bare StopIteration here; the explicit
    # check below reports that situation with a readable message instead.
    for _ in range(N_METADATA_LINES):
        next(reader, None)
    # Everything that remains: the column-name row followed by the data rows.
    data_rows_clean = list(reader)

if not data_rows_clean:
    raise ValueError(
        f"No rows left in {file_path!r} after skipping "
        f"{N_METADATA_LINES} metadata lines"
    )

# Convert to dataframe: the first remaining row supplies the column names.
# csv.reader performs no type conversion, so every cell is a string.
header_row, *record_rows = data_rows_clean
df_cumulative = pd.DataFrame(record_rows, columns=header_row)
df_cumulative.head()

Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,10811496,K00753.01,,CANDIDATE,CANDIDATE,0.0,0,0,0,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [24]:
# Build a glossary of table columns from the saved "Kepler Objects of
# Interest" documentation page, then keep only the entries whose database
# column name is also a column of df_cumulative.

# Parse the saved HTML page
with open('data/Data columns in Kepler Objects of Interest Table.html','r') as f:
    html = f.read()
    soup = BeautifulSoup(html, 'html.parser')

# "Database Column Names" column: text of every <td class="name"> cell
column_name = [cell.text.strip() for cell in soup.find_all('td', class_='name')]

# "Table Label" column: blank cells and labels containing '_str' are dropped
# NOTE(review): presumably this filter keeps the labels aligned with the other
# lists -- confirm against the HTML.
tabel_labels = [
    label
    for label in (cell.text.strip() for cell in soup.find_all('td', class_='label'))
    if label != '' and '_str' not in label
]

# "Description" column
description = [cell.text.strip() for cell in soup.find_all('td', class_='description')]

# "Uncertainty" column: walk the <tr class="column"> rows so that a row with
# no uncertainty cell still contributes a None placeholder.
columns = soup.find_all('tr', class_='column')
uncertainty = []
for row in columns:
    cells = row.find_all('td', class_='uncertainty')
    if cells:
        for cell in cells:
            value = cell.text.strip()
            # Blank or placeholder cells are recorded as missing.
            uncertainty.append(None if value in ('', '&nbsp;') else value)
    else:
        uncertainty.append(None)

# Assemble the glossary; pandas requires all four lists to have equal length.
data_dict = {
    'Database Column Name': column_name,
    'Table Label': tabel_labels,
    'Description': description,
    'Uncertainties Column (positive +) (negative - )': uncertainty,
}

# Strip the dagger footnote marker from every text cell.
df_columns = pd.DataFrame(data_dict).replace('[†]', '', regex=True)

# Keep only glossary rows that describe a column present in df_cumulative.
in_cumulative = df_columns['Database Column Name'].isin(df_cumulative.columns)
relavent_columns = df_columns[in_cumulative]
display(relavent_columns)
print(len(relavent_columns))


Unnamed: 0,Database Column Name,Table Label,Description,Uncertainties Column (positive +) (negative - )
0,kepid,Kepler Identification or KepID,"Target identification number, as listed in the...",
1,kepoi_name,KOI Name,A number used to identify and track a Kepler O...,
2,kepler_name,Kepler Name,"Kepler number name in the form ""Kepler-N,"" plu...",
3,koi_disposition,Exoplanet Archive Disposition,The category of this KOI from the Exoplanet Ar...,
6,koi_pdisposition,Disposition Using Kepler Data,The pipeline flag that designates the most pro...,
7,koi_score,Disposition Score,A value between 0 and 1 that indicates the con...,
8,koi_fpflag_nt,Not Transit-Like Flag,A KOI whose light curve is not consistent with...,
9,koi_fpflag_ss,Stellar Eclipse Flag,A KOI that is observed to have a significant s...,
10,koi_fpflag_co,Centroid Offset Flag,The source of the signal is from a nearby star...,
11,koi_fpflag_ec,Ephemeris Match Indicates Contamination Flag,The KOI shares the same period and epoch as an...,


27


In [6]:
# Save the filtered column glossary as CSV, omitting the DataFrame index.
relavent_columns.to_csv(
    'data/Data columns in Kepler Objects of Interest Table.csv',
    index=False,
)

This research has made use of the NASA Exoplanet Archive, which is operated by the California Institute of Technology, under contract with the National Aeronautics and Space Administration under the Exoplanet Exploration Program.

DOI 10.26133/NEA4