### Extract individual spectra from NASA exoplanet archive

In [8]:
import numpy as np
import pandas as pd
from pandas.errors import ParserWarning
import re
from pathlib import Path
import warnings
from typing import List, Dict

In [3]:
# local setup
WDIR = Path().cwd().parent

In [4]:
# load data of all planets with transmission or emission data from exoplanet archive
EMISSION_DATABASE_PATH = WDIR / "data/emissionspec.csv"
TRANSMISSION_DATABASE_PATH = WDIR / "data/transitspec.csv"

df_emission = pd.read_csv(EMISSION_DATABASE_PATH, header=18, index_col=0)
df_transmission = pd.read_csv(TRANSMISSION_DATABASE_PATH, header=26, index_col=0)



In [22]:
print(df_transmission.columns)
print()
print(df_emission.columns)

def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    column_mapping = {
        'plntname': 'Planet Name',
        'centralwavelng': 'Central Wavelength [microns]',
        'bandwidth': 'Band Width [microns]',
        'plntransdep': 'Transit Depth [percentage]',
        'plntransdeperr1': 'Transit Depth Upper Unc. [percentage]',
        'plntransdeperr2': 'Transit Depth Lower Unc. [percentage]',
        'plntransdeplim': 'Transit Depth Limit Flag',
        'plnradj': 'Planet Radius [Jupiter radii]',
        'plnradjerr1': 'Planet Radius Upper Unc. [Jupiter radii]',
        'plnradjerr2': 'Planet Radius Lower Unc. [Jupiter radii]',
        'plnradjlim': 'Planet Radius Limit Flag',
        'plnratror': 'Ratio of Planet to Stellar Radius',
        'plnratrorerr1': 'Ratio of Planet to Stellar Radius Upper Unc.',
        'plnratrorerr2': 'Ratio of Planet to Stellar Radius Lower Unc.',
        'plnratrorlim': 'Ratio of Planet to Stellar Radius Limit Flag',
        'plntranmid': 'Transit Mid-Point [BJD]',
        'plntranmiderr1': 'Transit Mid-Point Upper Unc. [BJD]',
        'plntranmiderr2': 'Transit Mid-Point Lower Unc. [BJD]',
        'plntranmidlim': 'Transit Mid-Point Limit Flag',
        'plntranreflink': 'Reference',
        'facility': 'Facility',
        'instrument': 'Instrument'
    }

    # Rename the columns using the column_mapping
    renamed_df = df.rename(columns=column_mapping)

    return renamed_df

Index(['plntname', 'centralwavelng', 'bandwidth', 'plntransdep',
       'plntransdeperr1', 'plntransdeperr2', 'plntransdeplim', 'plnradj',
       'plnradjerr1', 'plnradjerr2', 'plnradjlim', 'plnratror',
       'plnratrorerr1', 'plnratrorerr2', 'plnratrorlim', 'plntranmid',
       'plntranmiderr1', 'plntranmiderr2', 'plntranmidlim', 'plntranreflink',
       'facility', 'instrument'],
      dtype='object')

Index(['plntname', 'centralwavelng', 'bandwidth', 'especlipdep',
       'especlipdeperr1', 'especlipdeperr2', 'especlipdeplim', 'espbritemp',
       'espbritemperr1', 'espbritemperr2', 'espbritemplim', 'plntreflink',
       'facility', 'instrument'],
      dtype='object')


In [23]:
def add_source_key(data_dict: Dict) -> Dict:
    # Extract the plntranreflink value
    reflink = data_dict.get('reflink', '')

    # Search for the 'authoryear' pattern using a regular expression
    pattern = re.compile(r'>(.+? et al\. \d{4})</a>|>(.+? \d{4})</a>')
    match = pattern.search(reflink)

    if match:
        authoryear = match.group(1) if match.group(1) else match.group(2)
        if 'et al.' in authoryear:
            authoryear = authoryear.replace(' et al. ', '+')

        # Add the 'source' key with the extracted authoryear value
        data_dict['source'] = authoryear.strip()
    else:
        data_dict['source'] = None

    return data_dict

def extract_data(df: pd.DataFrame) -> List[Dict]:
    # Group the DataFrame by unique combinations of planet_name, instrument, facility, and reflink
    grouped = df.groupby(['plntname', 'facility', 'instrument', 'plntranreflink'])

    result = []

    for (planet_name, facility, instrument, reflink), group in grouped:
        # Convert each group to a dictionary
        entry = {
            'planet_name': planet_name,
            'facility': facility,
            'instrument': instrument,
            'reflink': reflink,
            'data': group
        }

        # Add the entry to the result list

        entry = add_source_key(entry)

        result.append(entry)

    return result

split_transmission = extract_data(df_transmission)
# print(split_transmission[0])
print(split_transmission[0]['reflink'])

print(len(split_transmission))

<a refstr=TSIARAS_ET_AL__2016 href=https://ui.adsabs.harvard.edu/abs/2016ApJ...820...99T/abstract target=ref>Tsiaras et al. 2016</a>
346


In [25]:
TRANSMISSION_SPECTRA_PATH = WDIR / "data/transmission"
exts = [".txt"]

def get_txt_spectra_files_in_path(path, ):
    paths = list([
        p for p in Path(path).glob('*') if (p.suffix in exts and p.is_file())
    ])
    return paths

def read_headers(file_paths: List[str]) -> List[Dict]:
    headers = []

    for file_path in file_paths:
        # Read the header lines from the file
        with open(file_path, 'r') as file:
            header_lines = [line.strip() for line in file.readlines() if line.startswith('#')]

        # Create a dictionary with the header lines and filename
        header_dict = {
            'header': header_lines,
            'filename': file_path.name,
            'path': file_path
        }

        # Append the dictionary to the headers list
        headers.append(header_dict)

    return headers

transmission_spectra_files = get_txt_spectra_files_in_path(TRANSMISSION_SPECTRA_PATH)
test = transmission_spectra_files[0:5]
print(test)

transmission_spectra_files_info = read_headers(test)

print(transmission_spectra_files_info)


[WindowsPath('C:/Users/lukas/Documents/git/compla/data/transmission/GJ1214b_transmission_Bean2011.txt'), WindowsPath('C:/Users/lukas/Documents/git/compla/data/transmission/GJ1214b_transmission_Berta2012.txt'), WindowsPath('C:/Users/lukas/Documents/git/compla/data/transmission/GJ1214b_transmission_Kreidberg2014.txt'), WindowsPath('C:/Users/lukas/Documents/git/compla/data/transmission/GJ3470b_transmission_Benneke2019.txt'), WindowsPath('C:/Users/lukas/Documents/git/compla/data/transmission/GJ436b_transmission_Knutson2014.txt')]
[{'header': ['# GJ1214b transmission spectrum', '# This file contains:', '# VLT FORS blue - Table 3 - Bean et al. 2011, ApJ, 743', '# VLT FORS red - Table 4 - Bean et al., 2011, ApJ, 743', '#', '# Analysis details:', '# values in this analysis were taken from Bean et al., 2010, Nature, 468', '# Period = 1.58040481 Â± 1.2E-7d days', '# Inclination = 88.94 degrees', '# a/R* = 14.9749', '#', '# Data history:', '# Converted by H.R. Wakeford from: Bean et al. (2011) in

In [36]:
def get_unique_planet_names(df: pd.DataFrame) -> List[str]:
    unique_planet_names = df['plntname'].unique().tolist()
    return unique_planet_names

# all_planets = get_unique_planet_names(df_transmission)

def unique_planets_and_counts(dict_list: List[Dict]) -> (List[str], List[float]):
    planet_counts = {}

    for item in dict_list:
        planet_name = item['planet_name']
        if planet_name in planet_counts:
            planet_counts[planet_name] += 1
        else:
            planet_counts[planet_name] = 1

    unique_planet_names = list(planet_counts.keys())
    counts = list(planet_counts.values())

    return unique_planet_names, counts

# TODO: write regex that checks for number of star systems: all non-special characters must be equal except for the letter at the last position.

all_planets, counts =unique_planets_and_counts(split_transmission)

print(f"Number of unique planets with transmission spectra: {len(all_planets)}.\n"
      f"\tMedian count: {np.median(counts)}, mean count: {np.median(counts):.1f} +/- {np.std(counts):.1f}, max count: {np.max(counts)} ({all_planets[np.argmax(counts)]}).")


Number of unique planets with transmission spectra: 103.
	Median count: 2.0, mean count: 2.0 +/- 3.8, max count: 21 (GJ 1214 b).


In [45]:
def extract_paper_info_from_ref(input_str):
    input_str = input_str.split('href=http', 1)[-1]

    author_pattern = r"target=ref>\s*([\w]+)\s+et al\."
    year_pattern = r"et al\. (\d{4})"
    journal_pattern = r"abs\/(?:\d+)?([A-Za-z&%]+)\.?"
    journal_number_pattern = r"abs\/\d*[A-Za-z&%]+\.?(\S+?)\/"

    author_match = re.search(author_pattern, input_str)
    year_match = re.search(year_pattern, input_str)
    journal_match = re.search(journal_pattern, input_str)
    journal_number_match = re.search(journal_number_pattern, input_str)

    author = author_match.group(1) if author_match else ""
    year = year_match.group(1) if year_match else ""
    journal = journal_match.group(1) if journal_match else ""
    journal_number = journal_number_match.group(1) if journal_number_match else ""

    # Replace "%26" with "&" for A&A journal
    journal = journal.replace("%26", "&")

    result = {
        'author': author,
        'year': year,
        'journal': journal,
        'journal_number': journal_number,
        'full_reference': f"{author} {year} {journal} {journal_number}".strip()
    }

    return result

test_strs_ref = [
    "<a refstr=WAKEFORD_ET_AL__2013 href=https://ui.adsabs.harvard.edu/abs/2013MNRAS.435.3481W/abstract target=ref> Wakeford et al. 2013</a>",
    "<a refstr=DRAGOMIR_ET_AL__2015 href=https://ui.adsabs.harvard.edu/abs/2015ApJ...814..102D/abstract target=ref>Dragomir et al. 2015</a>",
    "<a refstr=KIRK_ET_AL__2018 href=https://ui.adsabs.harvard.edu/abs/2018MNRAS.474..876K/abstract target=ref>Kirk et al. 2018</a>",
    "<a refstr=SWAIN_ET_AL__2021 href=https://ui.adsabs.harvard.edu/abs/2021arXiv210305657S/abstract target=ref>Swain et al. 2021</a>",
    "<a refstr=FU_ET_AL__2022 href=https://ui.adsabs.harvard.edu/abs/2022arXiv221113761F/abstract target=ref>Fu et al. 2022</a>",
]

test_out = [extract_paper_info_from_ref(t) for t in test_strs_ref]

print(test_out)

[{'author': 'Wakeford', 'year': '2013', 'journal': 'MNRAS', 'journal_number': '435.3481W', 'full_reference': 'Wakeford 2013 MNRAS 435.3481W'}, {'author': 'Dragomir', 'year': '2015', 'journal': 'ApJ', 'journal_number': '..814..102D', 'full_reference': 'Dragomir 2015 ApJ ..814..102D'}, {'author': 'Kirk', 'year': '2018', 'journal': 'MNRAS', 'journal_number': '474..876K', 'full_reference': 'Kirk 2018 MNRAS 474..876K'}, {'author': 'Swain', 'year': '2021', 'journal': 'arXiv', 'journal_number': '210305657S', 'full_reference': 'Swain 2021 arXiv 210305657S'}, {'author': 'Fu', 'year': '2022', 'journal': 'arXiv', 'journal_number': '221113761F', 'full_reference': 'Fu 2022 arXiv 221113761F'}]


In [64]:
def extract_paper_info_from_header(text):
    if "et al." not in text:
        return False

    author_pattern = r"([A-Za-z\.\s]+?)et al\."
    year_pattern = r"(\d{4})"
    journal_pattern = r"(ApJ|ApJL|Nature|Natur|Science|arXiv|MNRAS|AJ|A&A|A%26A)(?:(?:\s*,)|(?=\d{4}))"
    journal_number_pattern = r"(?:,\s*)(\S*?)(?:\s*-)"

    author = re.search(author_pattern, text)
    year = re.search(year_pattern, text)
    journal = re.search(journal_pattern, text)
    journal_number = re.search(journal_number_pattern, text)

    author = author.group(1) if author else ""
    year = year.group(1) if year else ""
    journal = journal.group(1) if journal else ""
    journal_number = journal_number.group(1) if journal_number else ""

    full_reference = f"{author} {year} {journal} {journal_number}".strip()

    return {
        'author': author.strip() ,
        'year': year,
        'journal': journal,
        'journal_number': journal_number,
        'full_reference': full_reference
    }

def extract_paper_info_from_header(text):
    if "et al." not in text:
        return False

    author_pattern = r"([\w\s]+(?:\s+de|\s+van)?[\w\s]+?)(?:,|,)et al\."
    year_pattern = r"(198\d|199\d|200\d|201\d|202[0-3])"
    journal_pattern = r"(ApJ|ApJL|Nature|Natur|Science|arXiv|MNRAS|AJ|A&A|A%26A)(?:(?:\s*,)|(?=\d{4}))"
    journal_number_pattern = r"(?:,\s*)(\S*?)(?:\s*-)"
    go_pattern = r"GO \d{5}"

    text = re.sub(go_pattern, "", text)

    author = re.search(author_pattern, text)
    year = re.search(year_pattern, text)
    journal = re.search(journal_pattern, text)
    journal_number = re.search(journal_number_pattern, text)

    author = author.group(1).strip() if author else ""
    year = year.group(1) if year else ""
    journal = journal.group(1) if journal else ""
    journal_number = journal_number.group(1) if journal_number else ""

    if not author:
        return False

    full_reference = f"{author} {year} {journal} {journal_number}".strip()

    return {
        'author': author,
        'year': year,
        'journal': journal,
        'journal_number': journal_number,
        'full_reference': full_reference
    }

test_strs_header = [
    "# VLT FORS blue - Table 3 - Bean et al. 2011, ApJ, 743",
    "# values in this analysis were taken from Bean et al., 2010, Nature, 468",
    "# WFC3 G141 - GO 13021 - Kreidberg, et al., 2014, Nature - DOI:10.1038/nature12888",
    "# WFC3 G141 - GO 12251 - Berta, et al., 2012, ApJ, 747, 1, 35",
    "# Berta et al. (2012) Table 2",
    "# WFC3 UVIS G280 - GO 15288 - Wakeford et al. 2020, AJ",
    "# WFC3 G141 - GO 14873 - de Wit, Wakeford, Lewis, et al. 2018, Nature Astronomy",
    "# Converted by H.R. Wakeford from: de Wit, Wakeford, Lewis, et al. 2018, Nature Astronomy",
    "# STIS G430L - GO 12473 - Sing et al. 2013, MNRAS",
    "# Re-presented in Sing et al. 2016, Nature",
    "# WFC3 G141 - GO 14468 - Evans et al. 2016, ApJL"
]

test_out = [extract_paper_info_from_header(t) for t in test_strs_header]

print(test_out)


[False, False, False, False, False, False, False, False, False, False, False]


In [None]:
{'first_author': None, 'full_author': None, 'year': None, 'journal': None, 'journal_number': None, 'doi': None, 'full_reference': None}