# Clean exoplanet data

This notebook generates a clean dataset from the
[NASA Exoplanet Archive](http://exoplanetarchive.ipac.caltech.edu).
It can also be used as a template for writing data-loading code.

Creator: Mattia Falco

Date: 27/03/2022

In [1]:
import pandas as pd

# Name of the CSV export to load.
archive = 'PS_2022.03.28_05.32.23.csv'

# Anything following a '#' in the file is treated as a comment and not parsed.
data = pd.read_csv(
    archive,
    comment='#',
)

# List the columns that were loaded.
print(data.columns)

Index(['pl_name', 'hostname', 'pl_orbsmax', 'pl_radj', 'pl_massj', 'st_rad',
       'st_mass', 'ra', 'dec', 'sy_dist', 'sy_gaiamag'],
      dtype='object')


In [2]:
# Build the cleaned table in a single chain:
#   1. keep only the rows whose 'sy_dist' value is not NaN,
#   2. keep the first row seen for each 'pl_name' and drop later repeats,
#   3. use 'pl_name' as the index.
has_distance = data['sy_dist'].notna()
data_clean = (
    data[has_distance]
    .drop_duplicates(subset='pl_name')
    .set_index('pl_name')
)

# Display the result as the cell output.
data_clean

Unnamed: 0_level_0,hostname,pl_orbsmax,pl_radj,pl_massj,st_rad,st_mass,ra,dec,sy_dist,sy_gaiamag
pl_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Proxima Cen b,Proxima Cen,0.0485,,,0.14,0.12,217.393466,-62.676182,1.30119,8.95361
eps Eri b,eps Eri,3.5000,,,0.76,0.81,53.228431,-9.458172,3.20260,3.47719
GJ 887 b,GJ 887,0.0680,,,0.47,0.49,346.466827,-35.853069,3.29000,
GJ 887 c,GJ 887,0.1200,,,0.47,0.49,346.466827,-35.853069,3.29000,
Ross 128 b,Ross 128,0.0496,,,0.20,0.17,176.937604,0.799290,3.37454,9.59067
...,...,...,...,...,...,...,...,...,...,...
OGLE-2018-BLG-0677L b,OGLE-2018-BLG-0677L,0.6300,,0.01250,,0.12,268.751125,-32.016531,7580.00000,
OGLE-2018-BLG-1700L b,OGLE-2018-BLG-1700L,2.8000,,4.40000,,0.42,269.956042,-28.528639,7600.00000,
OGLE-2018-BLG-0383L b,OGLE-2018-BLG-0383L,1.8000,,0.02014,,0.10,268.680750,-28.739278,7700.00000,
OGLE-2008-BLG-092L b,OGLE-2008-BLG-092L,15.0000,,0.18000,,0.71,267.540708,-34.673194,8100.00000,


In [None]:
# Write the cleaned table to disk. to_csv defaults to a comma separator and
# writes the index as the first column, so neither needs to be spelled out.
data_clean.to_csv('exo_archive.csv')