# How Discovery Methods of Exoplanets Relate to Our Understanding of Their Features
Luna Correia, Stephanie Jacinto, Zoran Shamsi


In [None]:
# Requires altair. If it is not installed, uncomment the next line and run this cell once:
# %pip install altair

Note: you may need to restart the kernel to use updated packages.


In [56]:
# importing
import pandas as pd
import altair as alt

# Path of the CSV that pd.read_csv loads in the next code cell. It is relative,
# so it is resolved against the notebook's working directory.
# NOTE(review): the file name suggests an exoplanet-archive export whose leading
# header block was stripped — confirm provenance and download date.
FILEPATH = "exoplanet_archive_data_sans_header.csv"

In [None]:
# Load the raw table; FILEPATH is defined in the imports/config cell.
exoplanet_data = pd.read_csv(FILEPATH)

# Build the working table in a single pipeline:
#   1. keep only rows whose controversial flag is not 1,
#   2. keep only the columns used in this analysis (glossary below),
#   3. drop rows that have no discovery method,
#   4. collapse repeated (planet, method, year) rows, keeping the first one seen.
#
# Column glossary
#   pl_name           planet name
#   hostname          host star name
#   discoverymethod   discovery method
#   disc_facility     discovery facility (presumably; not described in the original notes)
#   disc_year         discovery year
#   pl_rade           planetary radius, in Earth radii
#     pl_radeerr1       positive error bound
#     pl_radeerr2       negative error bound
#   pl_bmasse         planet mass or mass*sin(i), in Earth masses
#                     (mass*sin(i) is the minimum mass the planet could have given
#                      the inclination - https://en.wikipedia.org/wiki/Minimum_mass)
#     pl_bmasseerr1     upper error bound
#     pl_bmasseerr2     lower error bound
#   pl_orbper         orbital period
#     pl_orbpererr1     upper error bound
#     pl_orbpererr2     lower error bound
#   pl_orbsmax        orbital semi-major axis
#     pl_orbsmaxerr1    upper error bound
#     pl_orbsmaxerr2    lower error bound
#   pl_orbeccen       eccentricity
#     pl_orbeccenerr1   presumably the upper error bound, by analogy with the others
#     pl_orbeccenerr2   presumably the lower error bound, by analogy with the others
# (More columns, e.g. stellar features, could be added here if needed.)
#
# NOTE(review): step 4 keeps whichever duplicate row comes first in the file;
# confirm that this is the parameter set we actually want for each planet.
cut_data = (
    exoplanet_data.loc[
        exoplanet_data["pl_controv_flag"] != 1,
        [
            "pl_name", "hostname",
            "discoverymethod", "disc_facility", "disc_year",
            "pl_rade", "pl_radeerr1", "pl_radeerr2",
            "pl_bmasse", "pl_bmasseerr1", "pl_bmasseerr2",
            "pl_orbper", "pl_orbpererr1", "pl_orbpererr2",
            "pl_orbsmax", "pl_orbsmaxerr1", "pl_orbsmaxerr2",
            "pl_orbeccen", "pl_orbeccenerr1", "pl_orbeccenerr2",
        ],
    ]
    .dropna(subset=["discoverymethod"])
    .drop_duplicates(subset=["pl_name", "discoverymethod", "disc_year"])
)

cut_data.head()



Unnamed: 0,pl_name,hostname,discoverymethod,disc_year,pl_rade,pl_radeerr1,pl_radeerr2,pl_bmasse,pl_bmasseerr1,pl_bmasseerr2,pl_orbper,pl_orbpererr1,pl_orbpererr2,pl_orbsmax,pl_orbsmaxerr1,pl_orbsmaxerr2,pl_orbeccen,pl_orbeccenerr1,pl_orbeccenerr2
0,11 Com b,11 Com,Radial Velocity,2007.0,,,,4914.898486,39.092894,-39.728551,323.21,0.06,-0.05,1.178,0.0,0.0,0.238,0.007,-0.007
3,11 UMi b,11 UMi,Radial Velocity,2009.0,,,,4684.8142,794.575,-794.575,516.21997,3.2,-3.2,1.53,0.07,-0.07,0.08,0.03,-0.03
6,14 And b,14 And,Radial Velocity,2008.0,,,,1131.151301,36.232438,-38.775066,186.76,0.11,-0.12,0.775,0.0,0.0,0.0,,
9,14 Her b,14 Her,Radial Velocity,2002.0,,,,2559.472162,518.69596,-280.642483,1765.0389,1.67709,-1.87256,2.774,0.109,-0.12,0.373,0.008,-0.006
17,16 Cyg B b,16 Cyg B,Radial Velocity,1996.0,,,,565.7374,25.4264,-25.4264,798.5,1.0,-1.0,1.66,0.03,-0.03,0.68,0.02,-0.02


In [None]:
# Box plot of planet mass, one box per discovery method.
#
# While the table still contained duplicate rows it was large enough that
# alt.data_transformers.disable_max_rows() had to be called for the chart to
# render everything. After the duplicates were removed the chart appears to run
# fine without it, so that call is intentionally left out here.
alt.Chart(cut_data).mark_boxplot().encode(
    x=alt.X("pl_bmasse:Q", title="Mass (Earth Masses)"),
    y=alt.Y("discoverymethod:N", title="Discovery Method"),
)

In [52]:

# Per-planet discovery table with a clean integer year column.
# Built as one chain so cut_data itself is never modified:
#   - coerce disc_year to a number (unparseable values become NaN),
#   - drop the rows left without a year,
#   - store the remaining years as integers.
df = (
    cut_data[['pl_name', 'discoverymethod', 'disc_year']]
    .assign(disc_year=lambda frame: pd.to_numeric(frame['disc_year'], errors='coerce'))
    .dropna(subset=['disc_year'])
    .astype({'disc_year': int})
)


In [54]:

# Count discoveries for every (year, discovery method) pair present in df.
yearly_counts = (
    df.groupby(['disc_year', 'discoverymethod'])
    .size()
    .reset_index(name='discoveries')
)

# Restrict the chart to this hand-picked list of methods so the legend stays
# readable; any other discovery method is left out of the plot (it is still
# present in yearly_counts).
major_methods = ['Transit', 'Radial Velocity', 'Microlensing', 'Imaging', 'Astrometry', 'Pulsar Timing']
yearly_counts_major = yearly_counts[yearly_counts['discoverymethod'].isin(major_methods)]

# Line chart of discoveries per year, one line per discovery method.
# Every axis, the legend and each tooltip field carries a human-readable title,
# matching the mass box plot, so the figure can be read on its own without
# knowing the raw column names.
alt.Chart(yearly_counts_major).mark_line(point=True).encode(
    x=alt.X('disc_year:O', title='Year'),
    y=alt.Y('discoveries:Q', title='Number of Discoveries'),
    color=alt.Color('discoverymethod:N', title='Discovery Method'),
    tooltip=[
        alt.Tooltip('disc_year:O', title='Year'),
        alt.Tooltip('discoverymethod:N', title='Discovery Method'),
        alt.Tooltip('discoveries:Q', title='Number of Discoveries'),
    ],
).properties(title='Number of Discoveries per Year by Discovery Method')