# How Discovery Methods of Exoplanets Relate to Our Understanding of Their Features
Luna Correia, Stephanie Jacinto, Zoran Shamsi


In [None]:
# %pip install altair  # uncomment and run once if Altair is not installed, then restart the kernel

Note: you may need to restart the kernel to use updated packages.


In [1]:
# importing
import pandas as pd
import altair as alt

# Path to the exoplanet archive CSV, given relative to the notebook's working directory.
# NOTE(review): the "sans_header" suffix presumably means the archive's leading
# comment/header block was stripped before saving — confirm against the raw download.
FILEPATH = "exoplanet_archive_data_sans_header.csv"

In [None]:
# Load the full archive export into memory.
exoplanet_data = pd.read_csv(FILEPATH)

# Columns kept for the analysis (stellar features could be added here later):
#   pl_name         - planet name
#   hostname        - host star name
#   discoverymethod - discovery method
#   disc_year       - discovery year
#   pl_rade         - planetary radius, in Earth radii
#   pl_bmasse       - planet mass or mass*sin(i), in Earth masses
#                     TODO: figure out what those different mass types mean and how
#                     to differentiate them if it matters.
#                     NOTE(review): the archive is believed to publish a provenance
#                     column (`pl_bmassprov`) for this — confirm it exists in this export.
#   pl_orbper       - orbital period
#   pl_orbsmax      - orbital semi-major axis
#   pl_orbeccen     - orbital eccentricity
relevant_columns = [
    "pl_name",
    "hostname",
    "discoverymethod",
    "disc_year",
    "pl_rade",
    "pl_bmasse",
    "pl_orbper",
    "pl_orbsmax",
    "pl_orbeccen",
]

# Keep every row that is not flagged as controversial (flag value other than 1),
# restricted to the columns listed above.
is_uncontroversial = exoplanet_data["pl_controv_flag"] != 1
cut_data = exoplanet_data.loc[is_uncontroversial, relevant_columns]

cut_data.head()



Unnamed: 0,pl_name,hostname,discoverymethod,disc_year,pl_rade,pl_bmasse,pl_orbper,pl_orbsmax,pl_orbeccen
0,11 Com b,11 Com,Radial Velocity,2007.0,,4914.898486,323.21,1.178,0.238
1,11 Com b,11 Com,Radial Velocity,2007.0,,6165.6,326.03,1.29,0.231
2,11 Com b,11 Com,Radial Velocity,2007.0,,5434.7,,1.21,
3,11 UMi b,11 UMi,Radial Velocity,2009.0,,4684.8142,516.21997,1.53,0.08
4,11 UMi b,11 UMi,Radial Velocity,2009.0,,3432.4,,1.51,


In [3]:
# Altair refuses to render data sets above its default row limit, so the limit is
# lifted here. This is a global setting: it stays in effect for every later chart.
# (May become unnecessary once the rows are cut down.)
alt.data_transformers.disable_max_rows()

# Hand the chart only the two plotted columns, and only rows that have a mass.
# Passing all of cut_data would embed every column of every row in the chart spec.
mass_by_method = cut_data[["discoverymethod", "pl_bmasse"]].dropna(subset=["pl_bmasse"])

# Box plot of planet mass for each discovery method.
# NOTE(review): cut_data appears to hold several rows for the same planet (e.g. one
# per published parameter set — TODO confirm), so heavily studied planets carry more
# weight in these distributions. Consider de-duplicating per planet first.
alt.Chart(mass_by_method).mark_boxplot().encode(
    alt.X("discoverymethod:N", title="Discovery Method"),
    alt.Y("pl_bmasse:Q", title="Planet Mass or Mass*sin(i) [Earth Mass]"),
).properties(title="Planet Mass by Discovery Method")

In [89]:

# Build a discovery table from cut_data:
#   1. keep only the planet name, discovery method and discovery year,
#   2. coerce the year to a number (unparseable values become NaN),
#   3. drop rows that end up without a year,
#   4. store the year as an integer,
#   5. collapse repeated (planet, method, year) rows into a single row.
df = (
    cut_data[['pl_name', 'discoverymethod', 'disc_year']]
    .assign(disc_year=lambda frame: pd.to_numeric(frame['disc_year'], errors='coerce'))
    .dropna(subset=['disc_year'])
    .astype({'disc_year': int})
    .drop_duplicates(subset=['pl_name', 'discoverymethod', 'disc_year'])
)


In [None]:

# Number of rows in `df` for every (discovery year, discovery method) pair.
yearly_counts = (
    df.groupby(['disc_year', 'discoverymethod'])
    .size()
    .reset_index(name='discoveries')
)

# Only the methods listed here are plotted; everything else is left out of the chart.
major_methods = ['Transit', 'Radial Velocity', 'Microlensing', 'Imaging', 'Astrometry', 'Pulsar Timing']
observed_major = yearly_counts[yearly_counts['discoverymethod'].isin(major_methods)]

if observed_major.empty:
    # Nothing to plot; keep the (empty) frame so the chart below still builds.
    yearly_counts_major = observed_major
else:
    # groupby only emits (year, method) pairs that occur at least once. Without
    # zero-filling, a method's line is drawn straight across years in which it had
    # no discoveries, and years with no discoveries at all vanish from the ordinal
    # x axis. Reindex onto the complete year x method grid so those gaps show as 0.
    first_year = int(observed_major['disc_year'].min())
    last_year = int(observed_major['disc_year'].max())
    methods_present = [
        method for method in major_methods
        if method in set(observed_major['discoverymethod'])
    ]
    full_grid = pd.MultiIndex.from_product(
        [range(first_year, last_year + 1), methods_present],
        names=['disc_year', 'discoverymethod'],
    )
    yearly_counts_major = (
        observed_major
        .set_index(['disc_year', 'discoverymethod'])['discoveries']
        .reindex(full_grid, fill_value=0)
        .reset_index()
    )

# One line per discovery method, with a point marker on every year.
alt.Chart(yearly_counts_major).mark_line(point=True).encode(
    x=alt.X('disc_year:O', title='Year'),
    y=alt.Y('discoveries:Q', title='Discoveries'),
    color=alt.Color('discoverymethod:N', title='Discovery Method'),
    tooltip=['disc_year', 'discoverymethod', 'discoveries'],
).properties(title='Number of Discoveries per Year by Discovery Method')

