### Data reading

In [20]:
import pandas as pd
import re
from pathlib import Path
from astroquery import mast
from astroquery.mast import Catalogs

In [11]:
# Project root is taken to be the parent of the current working directory.
WDIR = Path.cwd().parent
EMISSION_DATABASE_PATH = WDIR / "data/emissionspec.csv"
TRANSMISSION_DATABASE_PATH = WDIR / "data/transitspec.csv"


# header=N: row N (0-based) supplies the column names and everything above it is
# ignored; index_col=0 turns the first column into the row index.
# NOTE(review): 18 and 26 presumably skip the preamble of each export — confirm
# against the files if they are re-downloaded.
df_emission = pd.read_csv(EMISSION_DATABASE_PATH, index_col=0, header=18)
df_transmission = pd.read_csv(TRANSMISSION_DATABASE_PATH, index_col=0, header=26)

print(df_emission)
print(df_transmission)

         plntname  centralwavelng  bandwidth  especlipdep  especlipdeperr1  \
rowid                                                                        
1       WASP-80 b           3.600        NaN       0.4550           0.1000   
2       WASP-80 b           4.500        NaN       0.9440           0.0640   
3      HAT-P-13 b           3.600        NaN       0.0662           0.0113   
4      HAT-P-13 b           4.500        NaN       0.1426           0.0130   
5          XO-3 b           4.500        NaN       0.1580           0.0036   
...           ...             ...        ...          ...              ...   
570    WASP-121 b           1.554      0.019       0.1307           0.0062   
571    WASP-121 b           1.573      0.018       0.1388           0.0063   
572    WASP-121 b           1.591      0.019       0.1299           0.0069   
573    WASP-121 b           1.610      0.018       0.1270           0.0064   
574    WASP-121 b           1.628      0.019       0.1286       

In [18]:
EMISSION_SPECTRA_PATH = WDIR / "data/emission"
TRANSMISSION_SPECTRA_PATH = WDIR / "data/transmission"

# File suffixes (including the leading dot) that are treated as spectrum files.
exts = [".txt", ".csv"]


def _find_spectrum_files(root, suffixes):
    """Return the files below ``root`` (searched recursively) whose suffix is in ``suffixes``.

    The result is sorted: ``Path.rglob`` yields entries in arbitrary,
    filesystem-dependent order, and the name matching performed on these lists
    is order-sensitive, so an unsorted list makes the notebook non-reproducible.
    Directories are skipped even if their name ends in one of the suffixes.
    """
    return sorted(
        p for p in Path(root).rglob('*')
        if p.is_file() and p.suffix in suffixes
    )


emission_files = _find_spectrum_files(EMISSION_SPECTRA_PATH, exts)
transmission_files = _find_spectrum_files(TRANSMISSION_SPECTRA_PATH, exts)

# Prefix the bare file name with the spectrum type; these labels are index-aligned
# with emission_files / transmission_files.
emission_file_names = ["EMISSION_" + f.name for f in emission_files]
transmission_file_names = ["TRANSMISSION_" + f.name for f in transmission_files]

print(emission_file_names)

['EMISSION_HAT-P-32b_Emission_WFC3_Spitzer.txt', 'EMISSION_WASP-121b_Emission_WFC3_Spitzer_Evans2017Nature.txt', 'EMISSION_WASP-121b_Emission_WFC3_Spitzer_Evans2019.txt', 'EMISSION_CoRoT2.txt', 'EMISSION_HAT32A.txt', 'EMISSION_HAT7.txt', 'EMISSION_HD189733.txt', 'EMISSION_HD209458.txt', 'EMISSION_KELT7.txt', 'EMISSION_Kepler13A.txt', 'EMISSION_TrES3.txt', 'EMISSION_WASP103.txt', 'EMISSION_WASP12.txt', 'EMISSION_WASP121.txt', 'EMISSION_WASP18.txt', 'EMISSION_WASP33.txt', 'EMISSION_WASP4.txt', 'EMISSION_WASP43.txt', 'EMISSION_WASP76_Edwards.txt', 'EMISSION_WASP76_Fu.txt']


In [27]:
def compare_strings(s1, s2):
    """Return True if the characters of ``s1`` appear, in order, inside ``s2``.

    Whitespace, underscores and hyphens are removed from both strings before
    comparing. The remaining characters of ``s1`` only have to occur in the same
    order in ``s2``; they do not have to be adjacent. Comparison is
    case-sensitive and every character is matched literally, so names
    containing regex metacharacters (``.``, ``+``, ``(`` ...) are handled
    correctly instead of being interpreted as a pattern.

    Parameters
    ----------
    s1 : str
        The string to look for (e.g. a planet name).
    s2 : str
        The string to search in (e.g. a file name).

    Returns
    -------
    bool
        True when the cleaned ``s1`` is a subsequence of the cleaned ``s2``.
        An ``s1`` that is empty after cleaning matches everything.
    """
    # Remove special characters (whitespace, underscores, hyphens) from both strings
    s1_clean = re.sub(r'[-_\s]', '', s1)
    s2_clean = re.sub(r'[-_\s]', '', s2)

    # `ch in remaining` advances the shared iterator up to and including the
    # first occurrence of ch, so each later character must be found after the
    # previous one — a linear-time, literal subsequence test.
    remaining = iter(s2_clean)
    return all(ch in remaining for ch in s1_clean)

def add_file_info_to_df(df, file_list, path_list, verbose=False):
    """Tag every row of ``df`` with the spectrum file that matches its planet name.

    A file matches a row when ``compare_strings(row['plntname'], file)`` is
    true. All matching rows receive the file; when several files match the same
    row, the one that comes last in ``file_list`` wins.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``plntname`` column. It is modified in place: the
        ``filename`` and ``path`` columns are (re)created, initialised to ``''``.
    file_list : list of str
        File labels to match against the planet names.
    path_list : list
        Location of each file, index-aligned with ``file_list``.
    verbose : bool, default False
        Print a line for every file that matched no row.

    Returns
    -------
    (pandas.DataFrame, dict)
        The same ``df`` object, and a dict with keys ``"file"`` and ``"path"``
        listing the files that matched no row.

    Raises
    ------
    ValueError
        If ``df`` has no ``plntname`` column.
    """
    if 'plntname' not in df.columns:
        raise ValueError('The DataFrame must contain a "plntname" column.')

    # Initialize new columns
    df['filename'] = ''
    df['path'] = ''

    planet_names = df['plntname']
    # Test each distinct name once per file rather than once per row.
    distinct_names = planet_names.unique()

    # unassigned files
    unassigned = {"file": [], "path": []}

    for file, path in zip(file_list, path_list):
        matched_names = [name for name in distinct_names if compare_strings(name, file)]
        if matched_names:
            row_mask = planet_names.isin(matched_names)
            df.loc[row_mask, 'filename'] = file
            # Store the real location from path_list, not the display label.
            df.loc[row_mask, 'path'] = str(path)
        else:
            if verbose:
                print(f"No match found for {file}")
            unassigned["file"].append(file)
            unassigned["path"].append(path)

    if len(unassigned['file']) > 0:
        print(f"{len(unassigned['file'])} files of {len(file_list)} unassigned.")
    return df, unassigned

# add_file_info_to_df writes the 'filename'/'path' columns into the frame it is given and
# returns that same frame, so emission_data / transmission_data refer to the same objects
# as df_emission / df_transmission. The second return value lists the files matched to no row.
emission_data, emission_files_unassigned = add_file_info_to_df(df_emission, emission_file_names, emission_files)
transmission_data, transmission_files_unassigned = add_file_info_to_df(df_transmission, transmission_file_names, transmission_files)

17 files of 20 unassigned!
13 files of 55 unassigned!


In [29]:
def similarity_score(s1, s2):
    """Return the fraction of aligned positions at which ``s1`` and ``s2`` agree.

    Whitespace, underscores and hyphens are removed from both strings, then the
    strings are compared character by character from the start. The number of
    equal positions is divided by the length of the shorter cleaned string, so
    the result lies in ``[0, 1]``.

    NOTE(review): the comparison is purely positional, so any prefix present in
    only one of the strings shifts the alignment and lowers the score — confirm
    this is the intended metric for prefixed file names.

    Parameters
    ----------
    s1, s2 : str
        The strings to compare.

    Returns
    -------
    float
        Matching fraction; ``0.0`` when either string is empty after cleaning
        (previously this raised ``ZeroDivisionError``).
    """
    s1_clean = re.sub(r'[-_\s]', '', s1)
    s2_clean = re.sub(r'[-_\s]', '', s2)

    shortest = min(len(s1_clean), len(s2_clean))
    if shortest == 0:
        # Nothing to compare: define the similarity as zero instead of dividing by zero.
        return 0.0

    matched_chars = sum(a == b for a, b in zip(s1_clean, s2_clean))

    return matched_chars / shortest

def add_file_info_to_df(df, file_list, path_list, verbose=False):
    """Assign each file to at most one row of ``df``, exact matches first, then best guess.

    Pass 1: a file is given to the first row whose ``plntname`` is a literal
    substring of the file label. Pass 2: every file still unassigned is given
    to the not-yet-used row with the highest positive ``similarity_score``.
    Files are processed in ``file_list`` order and rows in ``df.index`` order,
    so the outcome is the same on every run.

    NOTE(review): each file is attached to a single row only, even when several
    rows share the same planet name — confirm this one-to-one behaviour is wanted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``plntname`` column. It is modified in place: the
        ``filename`` and ``path`` columns are (re)created, initialised to ``''``.
    file_list : list of str
        File labels to match against the planet names.
    path_list : list
        Location of each file, index-aligned with ``file_list``.
    verbose : bool, default False
        Print one line per fuzzy (pass 2) assignment.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``filename`` and ``path`` filled in.

    Raises
    ------
    ValueError
        If ``df`` has no ``plntname`` column.
    """
    if 'plntname' not in df.columns:
        raise ValueError('The DataFrame must contain a "plntname" column.')

    df['filename'] = ''
    df['path'] = ''

    planet_names = df['plntname']

    # Remember where each file lives so pass 2 can store the path as well.
    path_by_file = {}
    for file, path in zip(file_list, path_list):
        path_by_file.setdefault(file, path)

    unassigned_files = set(file_list)
    unassigned_rows = set(df.index)

    # Pass 1: literal substring match, first matching row only.
    for file, path in zip(file_list, path_list):
        for index, planet in planet_names.items():
            if planet in file:
                df.at[index, 'filename'] = file
                df.at[index, 'path'] = str(path)
                unassigned_files.discard(file)
                unassigned_rows.discard(index)
                break

    # Pass 2: greedy best-similarity match. dict.fromkeys keeps file_list order
    # while visiting each distinct label once; iterating the set directly would
    # make the greedy result depend on string hashing.
    for file in dict.fromkeys(file_list):
        if file not in unassigned_files:
            continue

        best_match_index = None
        best_similarity = 0
        for index in df.index:
            if index not in unassigned_rows:
                continue
            similarity = similarity_score(planet_names.loc[index], file)
            if similarity > best_similarity:
                best_similarity = similarity
                best_match_index = index

        if best_match_index is not None:
            df.at[best_match_index, 'filename'] = file
            df.at[best_match_index, 'path'] = str(path_by_file.get(file, file))
            unassigned_rows.discard(best_match_index)
            if verbose:
                print(f"Fuzzy match: {file} -> {planet_names.loc[best_match_index]} ({best_similarity:.2f})")

    return df

# The add_file_info_to_df defined in this cell returns only the frame (no unassigned-file
# dict). It resets the 'filename'/'path' columns on df_emission / df_transmission before
# refilling them, so assignments made by an earlier call are discarded.
emission_data = add_file_info_to_df(df_emission, emission_file_names, emission_files)
transmission_data = add_file_info_to_df(df_transmission, transmission_file_names, transmission_files)

In [30]:
# Show the emission table returned by add_file_info_to_df.
print(emission_data)

         plntname  centralwavelng  bandwidth  especlipdep  especlipdeperr1  \
rowid                                                                        
1       WASP-80 b           3.600        NaN       0.4550           0.1000   
2       WASP-80 b           4.500        NaN       0.9440           0.0640   
3      HAT-P-13 b           3.600        NaN       0.0662           0.0113   
4      HAT-P-13 b           4.500        NaN       0.1426           0.0130   
5          XO-3 b           4.500        NaN       0.1580           0.0036   
...           ...             ...        ...          ...              ...   
570    WASP-121 b           1.554      0.019       0.1307           0.0062   
571    WASP-121 b           1.573      0.018       0.1388           0.0063   
572    WASP-121 b           1.591      0.019       0.1299           0.0069   
573    WASP-121 b           1.610      0.018       0.1270           0.0064   
574    WASP-121 b           1.628      0.019       0.1286       