In [24]:
from bs4 import BeautifulSoup
import pandas as pd
import csv

In [25]:
# Load the cumulative KOI export, encode the disposition labels as integers,
# coerce the measurement columns to numeric, and keep only rows with no
# false-positive flag set and with both koi_impact and koi_duration present.
file_path = 'data/cumulative_2024.03.14_10.24.31.csv'

# Number of leading rows (headers and metadata) that precede the row holding
# the column names in this export. Re-check this if the file is re-downloaded.
N_PREAMBLE_ROWS = 53

with open(file_path, mode='r', newline='') as file:
    reader = csv.reader(file)
    # Skip the preamble. csv.reader yields a list for every row (blank lines
    # give []), so a None sentinel can only mean the file ended early.
    for skipped in range(N_PREAMBLE_ROWS):
        if next(reader, None) is None:
            raise ValueError(
                f"{file_path} ended after {skipped} rows; "
                f"expected {N_PREAMBLE_ROWS} preamble rows before the data"
            )
    # Everything that remains: the column-name row followed by the data rows
    data_rows_clean = list(reader)

if not data_rows_clean:
    raise ValueError(f"{file_path} has no column-name row after the preamble")

# Convert to dataframe (first remaining row holds the column names).
# csv.reader returns every field as a string, so all columns start as text.
df_cumulative = pd.DataFrame(data_rows_clean[1:], columns=data_rows_clean[0])

# Encode koi_disposition: false positive -> 0, candidate -> 1, confirmed -> 2.
# Series.map leaves NaN for any label that is not a key of the mapping.
df_cumulative['koi_disposition'] = df_cumulative['koi_disposition'].map(
    {'FALSE POSITIVE': 0, 'CANDIDATE': 1, 'CONFIRMED': 2}
)

# Encode koi_pdisposition: false positive -> 0, candidate -> 1
df_cumulative['koi_pdisposition'] = df_cumulative['koi_pdisposition'].map(
    {'FALSE POSITIVE': 0, 'CANDIDATE': 1}
)

# Convert every remaining column to numeric. The columns listed here are
# identifiers/labels (or were already encoded above) and are left untouched.
non_numeric_columns = ['kepid', 'kepoi_name', 'kepler_name', 'koi_disposition', 'koi_pdisposition', 'koi_tce_delivname']
numeric_columns = [col for col in df_cumulative.columns if col not in non_numeric_columns]

for column in numeric_columns:
    # errors='coerce' turns empty or unparsable fields into NaN instead of raising
    df_cumulative[column] = pd.to_numeric(df_cumulative[column], errors='coerce')

# Remove error columns (every column whose name contains 'err')
columns_to_drop = [column for column in df_cumulative.columns if 'err' in column]
df_cumulative = df_cumulative.drop(columns=columns_to_drop)

# Remove rows that have empty koi_impact or koi_duration values
df_cumulative = df_cumulative.dropna(subset=['koi_impact', 'koi_duration'])

# Keep only rows where every flag equals 0. A flag that is 1 -- or NaN after
# the numeric coercion above -- fails the comparison and removes the row.
flags = ['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec']
df_cumulative = df_cumulative[(df_cumulative[flags] == 0).all(axis=1)]

# Remove flag columns (constant 0 after the filter above)
df_cumulative = df_cumulative.drop(columns=flags)

display(df_cumulative.describe())

Unnamed: 0,koi_disposition,koi_pdisposition,koi_score,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
count,4647.0,4647.0,4082.0,4647.0,4647.0,4647.0,4647.0,4647.0,4647.0,4647.0,4647.0,4647.0,4554.0,4647.0,4647.0,4647.0,4647.0,4647.0,4647.0
mean,1.57564,0.989886,0.907249,86.257842,166.795129,0.547936,4.738035,1166.007015,44.495091,823.628362,725.939742,54.160017,1.377251,5561.298687,4.370596,1.29309,291.352629,44.194748,14.348464
std,0.514776,0.10007,0.227633,1910.111811,63.866021,2.871402,4.011881,6171.726627,1690.711783,473.580513,8285.135379,169.638422,0.778286,687.020039,0.322688,3.532783,4.792697,3.630707,1.262344
min,0.0,0.0,0.0,0.241843,121.119423,0.0,0.052,0.0,0.22,25.0,0.0,0.0,1.0,2703.0,0.114,0.109,280.2066,36.577381,6.974
25%,1.0,1.0,0.961,5.468943,134.09093,0.112,2.361,162.45,1.36,503.0,15.14,11.7,1.0,5223.0,4.275,0.816,287.65262,41.2556,13.615
50%,2.0,1.0,0.998,13.153841,140.6557,0.386,3.565,353.9,2.0,750.0,74.8,19.6,1.0,5662.0,4.45,0.973,291.52951,44.143581,14.621
75%,2.0,1.0,1.0,42.059039,171.93693,0.7475,5.6845,759.4,2.94,1034.0,270.295,38.9,1.0,6007.0,4.551,1.236,295.22879,47.014204,15.3265
max,2.0,1.0,1.0,129995.7784,907.04471,98.6021,54.52,348130.0,109061.0,6285.0,369989.68,4304.3,7.0,10894.0,5.364,152.969,301.72076,52.220341,17.475


In [26]:
# Persist the cleaned KOI table to disk; the row index is not written out.
df_cumulative.to_csv(
    path_or_buf='data/cumulative_clean.csv',
    index=False,
)

In [27]:
# Scrape the column dictionary (database name, table label, description) from
# the saved Kepler Objects of Interest HTML page and keep the entries for
# columns that are present in the cleaned data.

# Read in the html file and soup it.
# Decode explicitly as UTF-8: the 'â€' sequence this cell used to strip is what
# a UTF-8 '†' looks like when decoded as cp1252, so relying on the platform
# default encoding garbled the text on some systems and left the dagger in
# place on others.
with open('data/Data columns in Kepler Objects of Interest Table.html', 'r', encoding='utf-8') as f:
    html = f.read()

soup = BeautifulSoup(html, 'html.parser')

# "Database Column Name" column
column_name = [element.text.strip() for element in soup.find_all('td', class_='name')]

# "Table Label" column. Empty cells and labels containing '_str' are skipped.
# NOTE(review): presumably these have no matching name/description cell, which
# is what keeps the three lists aligned -- confirm against the HTML.
tabel_labels = []
for element in soup.find_all('td', class_='label'):
    text = element.text.strip()
    if text != '' and '_str' not in text:
        tabel_labels.append(text)

# "Description" column
description = [element.text.strip() for element in soup.find_all('td', class_='description')]

# Create a dataframe
data_dict = {'Database Column Name': column_name,
             'Table Label': tabel_labels,
             'Description': description}

# The three lists are zipped positionally, so they must be the same length.
# pandas raises ValueError here too; this message shows which list is off.
scraped_counts = {key: len(values) for key, values in data_dict.items()}
if len(set(scraped_counts.values())) > 1:
    raise ValueError(f"Scraped columns have different lengths: {scraped_counts}")

df_columns = pd.DataFrame(data_dict)

# Remove the † footnote marker. regex=True is needed so the marker is removed
# from inside each string; without it only cells equal to '†' would match.
df_columns = df_columns.replace('†', '', regex=True)

# Identify relevant columns that are in the cleaned data
relavent_columns = df_columns[df_columns['Database Column Name'].isin(df_cumulative.columns)]
display(relavent_columns)

Unnamed: 0,Database Column Name,Table Label,Description
0,kepid,Kepler Identification or KepID,"Target identification number, as listed in the..."
1,kepoi_name,KOI Name,A number used to identify and track a Kepler O...
2,kepler_name,Kepler Name,"Kepler number name in the form ""Kepler-N,"" plu..."
3,koi_disposition,Exoplanet Archive Disposition,The category of this KOI from the Exoplanet Ar...
6,koi_pdisposition,Disposition Using Kepler Data,The pipeline flag that designates the most pro...
7,koi_score,Disposition Score,A value between 0 and 1 that indicates the con...
14,koi_period,Orbital Period (days),The interval between consecutive planetary tra...
15,koi_time0bk,"Transit Epoch (BJD - 2,454,833.0)",The time corresponding to the center of the fi...
19,koi_impact,Impact Parameter,The sky-projected distance between the center ...
20,koi_duration,Transit Duration (hours),The duration of the observed transits. Duratio...


In [29]:
# Save the filtered column dictionary as CSV; the row index is not written out.
relavent_columns.to_csv(
    path_or_buf='data/Data columns in Kepler Objects of Interest Table.csv',
    index=False,
)

This research has made use of the NASA Exoplanet Archive, which is operated by the California Institute of Technology, under contract with the National Aeronautics and Space Administration under the Exoplanet Exploration Program.

DOI 10.26133/NEA4