In [1]:
import os
import aiohttp
import asyncio
from bs4 import BeautifulSoup

"""
This will download all the files in the FOLDER of BASE SUFFIX. If there are also folders, not really sure what is would do, lol
"""


# Base URL of the page
BASE_URL = "https://pds-geosciences.wustl.edu"
BASE_SUFFIX = "/lro/lro-l-dlre-4-rdr-v1/lrodlr_1002/data/2023/202305/20230501"

# Directory to save downloaded files
DOWNLOAD_DIR = "/media/mglos/HDD1_8TB1/DIVINER_JUPYTER"

# File suffixes to download
FILE_SUFFIXES = {".zip", ".lbl", ".xml"}

# Maximum concurrent downloads
MAX_DOWNLOADS = 5

# Ensure the download directory exists
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

async def download_file(session, url, save_path, progress):
    """Download a single file asynchronously."""
    try:
        async with session.get(url) as response:
            if response.status == 200:
                with open(save_path, "wb") as file:
                    async for chunk in response.content.iter_chunked(8192):
                        file.write(chunk)
                progress["completed"] += 1
                print(f"Downloaded: {save_path} ({progress['completed']}/{progress['total']})")
            else:
                print(f"Failed to download: {url} (status: {response.status})")
    except Exception as e:
        print(f"Error downloading {url}: {e}")

async def scrape_and_download():
    """Scrape the page, find files with the specified suffixes, and download them."""
    async with aiohttp.ClientSession() as session:
        response = await session.get(BASE_URL + BASE_SUFFIX)
        if response.status == 200:
            soup = BeautifulSoup(await response.text(), "html.parser")
            links = [
                link.get("href") for link in soup.find_all("a")
                if link.get("href") and any(link.get("href").endswith(suffix) for suffix in FILE_SUFFIXES)
            ]

            # Progress tracking
            progress = {"completed": 0, "total": len(links)}
            print(f"Found {progress['total']} files to download.")

            # Semaphore for limiting concurrent downloads
            semaphore = asyncio.Semaphore(MAX_DOWNLOADS)

            async def limited_download(href):
                async with semaphore:
                    full_url = f"{BASE_URL}{href}"
                    save_path = os.path.join(DOWNLOAD_DIR, href.split("/")[-1])
                    await download_file(session, full_url, save_path, progress)

            # Run downloads asynchronously
            tasks = [limited_download(href) for href in links]
            await asyncio.gather(*tasks)
        else:
            print(f"Failed to access {BASE_URL + BASE_SUFFIX} (status: {response.status})")

# Run the async function
# await scrape_and_download()

In [2]:
from collections import defaultdict
import zipfile
from tqdm import tqdm

DOWNLOAD_DIR = "/media/mglos/HDD1_8TB1/DIVINER_JUPYTER"
files = os.listdir(DOWNLOAD_DIR)
data_groups = defaultdict(dict)
for file in files:
    key = ''.join(file.split(".")[:-1])
    file_format = file.split(".")[-1]
    data_groups[key.lower()][file_format.lower()] = os.path.join(DOWNLOAD_DIR, file)


### Extract the zipfiles, necessary only once
# Extract the zipfiles in which the tab files are stored
for base_name, file_dict in tqdm(data_groups.items(), desc="Extracting zip files"):
    filename = file_dict.get("zip")
    if filename is None:
        continue

    with zipfile.ZipFile(filename, 'r') as z:
        for zip_file in z.filelist:
            with open(os.path.join(DOWNLOAD_DIR, zip_file.filename.lower()), 'wb') as f:
                f.write(z.open(zip_file).read())

Extracting zip files: 100%|██████████| 144/144 [03:11<00:00,  1.33s/it]


In [3]:
from lxml import etree
import pandas as pd
import numpy as np

def parse_pds4_metadata(xml_path):
    """Extract field metadata from the PDS4 XML file."""
    tree = etree.parse(xml_path)
    root = tree.getroot()

    # Define namespace for XML parsing
    ns = {"pds": "http://pds.nasa.gov/pds4/pds/v1"}

    # Extract field metadata from the XML
    fields = []
    for field in root.xpath(".//pds:Field_Character", namespaces=ns):
        invalid_constant = field.find(".//pds:invalid_constant", namespaces=ns)
        unknown_constant = field.find(".//pds:unknown_constant", namespaces=ns)

        fields.append({
            "name": field.findtext("pds:name", namespaces=ns).strip(),
            "field_length": int(field.findtext("pds:field_length", namespaces=ns)),
            "field_location": int(field.findtext("pds:field_location", namespaces=ns)),
            "special_constants": {
                "invalid": invalid_constant.text if invalid_constant is not None else None,
                "unknown": unknown_constant.text if unknown_constant is not None else None,
            },
        })
    return fields

def load_tab_file(tab_path, fields):
    """Load the .tab file using extracted metadata."""
    # Extract column names and calculate dynamic widths
    col_names = [field["name"] for field in fields]
    col_locations = [field["field_location"] for field in fields]

    # Calculate column widths based on locations
    col_widths = [b - a for a, b in zip(col_locations[:-1], col_locations[1:])]
    col_widths.append(fields[-1]["field_length"])  # Add the width of the last column

    special_constants = {
        field["name"]: (field["special_constants"]["invalid"], field["special_constants"]["unknown"])
        for field in fields if field["special_constants"]["invalid"] or field["special_constants"]["unknown"]
    }

    # Load the .tab file
    df = pd.read_fwf(tab_path, widths=col_widths, names=col_names, skiprows=3)

    # Replace special constants with NaN
    for col, constants in special_constants.items():
        invalid, unknown = constants
        to_replace = []
        if invalid is not None:
            to_replace.append(float(invalid))
        if unknown is not None:
            to_replace.append(float(unknown))
        df[col] = df[col].replace(to_replace, np.nan)

    # Clean up column names and extra rows
    df.columns = df.columns.str.strip()
    df = df.dropna(how="all")  # Remove rows with all NaN values

    return df.replace([-9999, -9998, -9999., -9998.], np.nan)

In [4]:
from lxml import etree
import pandas as pd
import numpy as np


def parse_pds4_metadata(xml_path):
    """Extract field metadata from the PDS4 XML file."""
    tree = etree.parse(xml_path)
    root = tree.getroot()

    # Define namespace for XML parsing
    ns = {"pds": "http://pds.nasa.gov/pds4/pds/v1"}

    # Extract field metadata from the XML
    fields = []
    for field in root.xpath(".//pds:Field_Character", namespaces=ns):
        invalid_constant = field.find(".//pds:invalid_constant", namespaces=ns)
        unknown_constant = field.find(".//pds:unknown_constant", namespaces=ns)

        fields.append({
            "name": field.findtext("pds:name", namespaces=ns).strip(),
            "field_length": int(field.findtext("pds:field_length", namespaces=ns)),
            "field_location": int(field.findtext("pds:field_location", namespaces=ns)),
            "special_constants": {
                "invalid": invalid_constant.text if invalid_constant is not None else None,
                "unknown": unknown_constant.text if unknown_constant is not None else None,
            },
        })
    return fields


def load_tab_file(tab_path, fields):
    """Load the .tab file using extracted metadata."""
    # Extract column names and calculate dynamic widths
    col_names = [field["name"] for field in fields]
    col_locations = [field["field_location"] for field in fields]

    # Calculate column widths based on locations
    col_widths = [b - a for a, b in zip(col_locations[:-1], col_locations[1:])]
    col_widths.append(fields[-1]["field_length"])  # Add the width of the last column

    # Extract special constants (invalid and unknown)
    special_constants = {
        field["name"]: [
            float(field["special_constants"]["invalid"]) if field["special_constants"]["invalid"] else None,
            float(field["special_constants"]["unknown"]) if field["special_constants"]["unknown"] else None
        ]
        for field in fields if field["special_constants"]["invalid"] or field["special_constants"]["unknown"]
    }

    # Load the .tab file and skip incorrect header rows
    df = pd.read_fwf(tab_path, widths=col_widths, names=col_names, skiprows=4)

    # Replace special constants with NaN
    for col, constants in special_constants.items():
        invalid, unknown = constants
        if col in df.columns:
            df[col] = df[col].replace([invalid, unknown], np.nan)

    # Clean up column names and ensure proper formatting
    df.columns = df.columns.str.strip()

    # Fix string columns like 'date' and 'utc' (strip quotes and clean up)
    for col in ['date', 'utc']:
        if col in df.columns:
            df[col] = df[col].astype(str).str.strip().str.replace('"', '', regex=True)

    # Drop rows with all NaN values and reset index
    df = df.dropna(how="all").reset_index(drop=True)

    return df


Try to read, parse, plot?, interpret? data ...

In [5]:
from pds4_tools import read
import pandas as pd

sample = data_groups[list(data_groups.keys())[0]]

# File paths
xml_path = sample['xml']  # Update with your XML file path
tab_path = sample['tab']  # Update with your TAB file path

# Parse metadata from the XML file
fields_metadata = parse_pds4_metadata(xml_path)

# Load the .tab file into a DataFrame
dataframe = load_tab_file(tab_path, fields_metadata)

# Inspect the DataFrame
dataframe.head()


Unnamed: 0,date,utc,jdate,orbit,sundist,sunlat,sunlon,sclk,sclat,sclon,...,tb,clat,clon,cemis,csunzen,csunazi,cloctime,qca,qge,qmi
0,"01-May-2023,","06:30:00.084,","2460065.770834301,",62321,"1.00899,","0.14611,","49.11551,","0704615400.03276,","-61.71028,","352.57893,",...,"-9999.0,","-61.79865,","352.44656,","2.94114,","75.08370,","24.57522,","8.22194,",8,12,0
1,"01-May-2023,","06:30:00.084,","2460065.770834301,",62321,"1.00899,","0.14611,","49.11551,","0704615400.03276,","-61.71028,","352.57893,",...,"-9999.0,","-61.79731,","352.46204,","2.80101,","75.07671,","27.45296,","8.22306,",8,12,0
2,"01-May-2023,","06:30:00.084,","2460065.770834301,",62321,"1.00899,","0.14611,","49.11551,","0704615400.03276,","-61.71028,","352.57893,",...,"-9999.0,","-61.79597,","352.47748,","2.66870,","75.06973,","30.62492,","8.22389,",8,12,0
3,"01-May-2023,","06:30:00.084,","2460065.770834301,",62321,"1.00899,","0.14611,","49.11551,","0704615400.03276,","-61.71028,","352.57893,",...,"-9999.0,","-61.79464,","352.49286,","2.54602,","75.06278,","34.09689,","8.22500,",8,12,0
4,"01-May-2023,","06:30:00.084,","2460065.770834301,",62321,"1.00899,","0.14611,","49.11551,","0704615400.03276,","-61.71028,","352.57893,",...,"-9999.0,","-61.79324,","352.50894,","2.42861,","75.05550,","38.09282,","8.22611,",8,12,0


In [6]:
print(fields_metadata)

[{'name': 'date', 'field_length': 11, 'field_location': 2, 'special_constants': {'invalid': '-9998', 'unknown': '-9999'}}, {'name': 'utc', 'field_length': 12, 'field_location': 17, 'special_constants': {'invalid': '-9998', 'unknown': '-9999'}}, {'name': 'jdate', 'field_length': 17, 'field_location': 32, 'special_constants': {'invalid': '-9998', 'unknown': '-9999'}}, {'name': 'orbit', 'field_length': 5, 'field_location': 51, 'special_constants': {'invalid': '-9998', 'unknown': '-9999'}}, {'name': 'sundist', 'field_length': 7, 'field_location': 58, 'special_constants': {'invalid': '-9998', 'unknown': '-9999'}}, {'name': 'sunlat', 'field_length': 8, 'field_location': 67, 'special_constants': {'invalid': '-9998', 'unknown': '-9999'}}, {'name': 'sunlon', 'field_length': 9, 'field_location': 77, 'special_constants': {'invalid': '-9998', 'unknown': '-9999'}}, {'name': 'sclk', 'field_length': 16, 'field_location': 88, 'special_constants': {'invalid': '-9998', 'unknown': '-9999'}}, {'name': 'sc

In [10]:
import spiceypy as spice

kernels = [
    "pck00008.tpc",
    "lro_frames_2014049_v01.tf",
    "lro_dlre_frames_2010132_v04.tf",
    "naif0012.tls",
    "lro_clkcor_2023193_v00.tsc",
    "de421.bsp",
    "moon_pa_de421_1900-2050.bpc",
]

# Load kernels
for kernel in kernels:
    spice.furnsh(kernel)

SpiceNOSUCHFILE: 
================================================================================

Toolkit version: CSPICE_N0067

SPICE(NOSUCHFILE) --

The attempt to load "pck00008.tpc" by the routine FURNSH failed. It could not be located.

furnsh_c --> FURNSH --> ZZLDKER

================================================================================