# Testing Sherlock

Date: May 15 2020

In [5]:
import numpy as np
import pandas as pd
import sherlock

# 1. Load NASA Exoplanet Catalog

In [10]:
# Load the exoplanet catalog through the project helper; the summary printed
# below shows that it comes back as a pandas DataFrame.
catalog = sherlock.get_local_exoarchive()
# Print the index range, column count, dtype breakdown and memory footprint.
catalog.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26329 entries, 0 to 26328
Columns: 296 entries, rowid to st_nspec
dtypes: float64(265), int64(6), object(25)
memory usage: 59.5+ MB


In [11]:
# (rows, columns) of the full catalog.
catalog.shape

(26329, 296)

### Number of unique planets

In [12]:
# Two ways of counting planets, printed one per line:
#   1. the number of distinct planet names in the catalog;
#   2. the sum of default_flag, i.e. the number of rows flagged with 1
#      (assuming the flag only takes the values 0 and 1 -- TODO confirm).
print(
    np.unique(catalog["pl_name"]).size,
    catalog["default_flag"].sum(),
    sep="\n",
)

4154
4154


### Parameters of Interest

In [13]:
# Catalog columns carried into the final dataset.
# NOTE(review): per the NASA Exoplanet Archive naming these are presumably the
# planet radius in Earth radii and the stellar radius -- confirm.
interesting_cols = [
    "pl_rade",
    "st_rad",
]

# 2. [Optional] Apply a mask on planet radius (Rp)

In [26]:
# Radius cut configuration: the catalog column to threshold and the upper
# limit (inclusive) applied to it.
rp_col, rp_limit = 'pl_rade', 4.0

In [27]:
# Per-planet table returned by the project helper for `rp_col`, re-indexed by
# planet name.
# NOTE(review): presumably holds the minimum `rp_col` value per planet --
# inferred from the helper's name; confirm in sherlock.get_min_value.
min_radi_per_pl = sherlock.get_min_value(rp_col, catalog).set_index("pl_name")

# Threshold on the configured column instead of a hard-coded 'pl_rade', so the
# mask keeps following `rp_col` if that setting is changed.
# (Assumes the helper's output keeps the column named `rp_col`, as it does for
# 'pl_rade'.)
rp_mask = min_radi_per_pl[rp_col] <= rp_limit

# Names of the planets that satisfy the radius cut.
selected_planets = min_radi_per_pl[rp_mask].index

In [28]:
# Report how many planets pass the radius cut. The unit suffix is taken from
# the last character of the column name (e.g. 'pl_rade' -> "Re").
print(
    "There are %i planets with Rp <= %0.01f R%s"
    % (selected_planets.size, rp_limit, rp_col[-1])
)

There are 2391 planets with Rp <= 4.0 Re


In [29]:
# Planet names that pass the radius cut. This is exactly `selected_planets`
# from the masking cell above, so reuse it instead of calling
# sherlock.get_min_value a second time and indexing the fresh frame with the
# `rp_mask` left over from the earlier cell.
planet_ids = selected_planets

In [30]:
# Boolean row mask over the catalog: True for every row whose planet name is
# among the planets that passed the radius cut.
planet_mask = catalog["pl_name"].isin(planet_ids)
# Number of catalog rows kept (rows, not planets: a planet can have many rows).
planet_mask.sum()

19644

In [31]:
# All catalog rows belonging to the radius-selected planets.
df_original = catalog.loc[planet_mask]
df_original.shape

(19644, 296)

In [32]:
# Base table with one row per planet, indexed by planet name.
# drop_duplicates keeps the first row seen for each pl_name (pandas default)
# and already returns a new frame, so the previous `catalog.copy(deep=True)`
# -- which was overwritten immediately -- is not needed.
# NOTE(review): this is built from the full `catalog`, not from the
# radius-masked `df_original`; confirm that is intended.
df_final = (
    catalog
    .drop_duplicates("pl_name")
    .set_index("pl_name")
    .drop(columns="default_flag")
)
df_final.shape

(4154, 294)

# 4. [Optional] Retrieve Parameters of Interest with the Minimum-Error Approach

In [33]:
# Rows flagged with default_flag == 1, without the flag column and keyed by
# planet name.
df_aux = (
    catalog.loc[catalog["default_flag"] == 1]
    .drop(columns="default_flag")
    .set_index("pl_name")
)
# Overwrite the matching planets in df_final with their flagged row.
df_final.loc[df_aux.index] = df_aux
df_final.shape

(4154, 294)

In [34]:
# Update df_final through the project helper, passing the columns of interest,
# the full catalog and the current df_final. The recorded outputs show the
# frame going from (4154, 294) before this cell to (936, 296) after it.
# NOTE(review): presumably the helper picks, per planet, the measurement with
# the smallest reported error -- inferred from its name only; confirm in sherlock.
# NOTE(review): the `queries` key "pl_masse" is not one of interesting_cols
# (["pl_rade", "st_rad"]) -- verify whether the helper uses or ignores it.
# Reassigning df_final makes this cell non-idempotent: re-running it feeds the
# already-updated frame back into the helper.
df_final = sherlock.update_columns_with_min_err(
    interesting_cols, 
    catalog, 
    df_final, 
    queries={"pl_masse": "pl_bmassprov == 'Mass'"})

In [35]:
# (rows, columns) of df_final after the update in the previous cell.
df_final.shape

(936, 296)

# 5. Create your final dataset

In [36]:
#np.unique(catalog[mass_mask].dropna(subset=["pl_masse", "pl_masseerr1", "pl_masseerr2"]).pl_name).size

In [37]:
#np.unique(catalog[mass_mask].pl_name).size

In [38]:
# Persist the final table (index included) as CSV in the working directory.
df_final.to_csv(path_or_buf='final_catalog_planets.csv')

In [39]:
# Number of distinct planet names in the final table's index.
len(np.unique(df_final.index))

936