In [None]:
# default_exp pds.indexes

# PDS Indexes

> Support tools to work with PDS index files. 

In [None]:
# hide
from nbdev.showdoc import show_doc

In [None]:
# export
import copy
import logging
from dataclasses import dataclass
from pathlib import Path
from urllib.parse import urlsplit, urlunsplit

import pandas as pd
import pvl
from dateutil import parser
from planetarypy import utils
from planetarypy.pds.ctx_index import CTXIndex
from planetarypy.config import config
from tqdm.auto import tqdm

try:
    # 3.6 compatibility
    from importlib_resources import path as resource_path
except ModuleNotFoundError:
    from importlib.resources import path as resource_path

logger = logging.getLogger(__name__)

storage_root = Path(config.storage_root)

In [None]:
# export
# Registry of indexes whose label URL is not stored in the config file.
# Keys are "<mission>.<instrument>" strings (the format produced by
# `Index.instrument_key`); values are classes that `Index.set_url`
# instantiates without arguments and queries for `latest_index_label_url`.
dynamic_urls = {
    'mro.ctx' : CTXIndex
}

In [None]:
# hide
storage_root

Path('/home/maye/big_drive/planetary_data')

In [None]:
# export
class PVLColumn:
    """Manages just one of the columns in a table that is described via PVL.

    Parameters
    ----------
    pvlobj : mapping
        The PVL ``COLUMN`` object. It is indexed for ``NAME``, ``START_BYTE``
        and ``BYTES`` and queried via ``.get()`` for the optional ``ITEMS``,
        ``ITEM_BYTES`` and ``ITEM_OFFSET`` keywords.
    """

    def __init__(self, pvlobj):
        self.pvlobj = pvlobj

    @property
    def name(self):
        "Value of the NAME keyword."
        return self.pvlobj["NAME"]

    @property
    def name_as_list(self):
        "needs to return a list for consistency for cases when it's an array."
        if self.items is None:
            return [self.name]
        # Array columns get one 1-based, suffixed name per item: NAME_1, NAME_2, ...
        return [f"{self.name}_{i + 1}" for i in range(self.items)]

    @property
    def start(self):
        "Decrease by one as Python is 0-indexed."
        return self.pvlobj["START_BYTE"] - 1

    @property
    def stop(self):
        "Exclusive end offset of the column: `start` plus the BYTES keyword."
        return self.start + self.pvlobj["BYTES"]

    @property
    def items(self):
        "Value of the ITEMS keyword, or None when the column is not an array."
        return self.pvlobj.get("ITEMS")

    @property
    def item_bytes(self):
        "Value of the ITEM_BYTES keyword, or None when absent."
        return self.pvlobj.get("ITEM_BYTES")

    @property
    def item_offset(self):
        "Value of the ITEM_OFFSET keyword, or None when absent."
        return self.pvlobj.get("ITEM_OFFSET")

    @property
    def colspecs(self):
        """Half-open (start, stop) slice bounds for this column.

        Returns
        -------
        tuple or list of tuple
            A single ``(start, stop)`` tuple for a scalar column, or a list
            with one ``(start, stop)`` tuple per item for an array column.
        """
        if self.items is None:
            return (self.start, self.stop)
        first = self.start
        width = self.item_bytes
        step = self.item_offset
        if step is None:
            # Without this fallback an array column lacking ITEM_OFFSET failed
            # with a TypeError (None * int).
            # NOTE(review): assumes items are contiguous when ITEM_OFFSET is
            # omitted, i.e. the offset defaults to ITEM_BYTES - confirm against
            # the PDS3 standard for delimited ASCII tables.
            step = width
        return [(first + step * i, first + step * i + width) for i in range(self.items)]

    def decode(self, linedata):
        """Cut this column's value(s) out of one line of table data.

        Parameters
        ----------
        linedata : str
            One line of the table file.

        Returns
        -------
        str or list of str
            The slice for a scalar column, or one slice per item for an array
            column.
        """
        if self.items is None:
            start, stop = self.colspecs
            return linedata[start:stop]
        return [linedata[start:stop] for start, stop in self.colspecs]

    def __repr__(self):
        return self.pvlobj.__repr__()

In [None]:
# export
class IndexLabel(object):
    """Support working with label files of PDS Index tables.

    Parameters
    ----------
    labelpath : str, pathlib.Path
        Path to the labelfile for a PDS Indexfile. The actual table should reside in the same
        folder to be automatically parsed when calling the `read_index_data` method.

    Raises
    ------
    IndexError
        If the label contains no pointer entry, i.e. no key starting with "^".
    """

    def __init__(self, labelpath):
        self.path = Path(labelpath)
        # Search for the table name pointer and store key and file name.
        # Iterating the parsed label is expected to yield (key, value) pairs.
        pointer = next(
            (entry for entry in self.pvl_lbl if entry[0].startswith("^")), None
        )
        if pointer is None:
            # IndexError is what the previous list lookup raised; the type is
            # kept so existing callers keep working, only the message is new.
            raise IndexError(f"No '^' table pointer found in label {self.path}")
        pointer_key, target = pointer[0], pointer[1]
        self.tablename = pointer_key[1:]  # strip the leading "^"
        if isinstance(target, (list, tuple)):
            # Pointers may be written as ("FILE.TAB", offset); only the file
            # name is needed to locate the table next to the label.
            target = target[0]
        self.index_name = target

    @property
    def index_path(self):
        """Path of the table file, expected in the same folder as the label.

        Falls back to the lower-cased file name when the name given in the
        label does not exist on disk. A warning is emitted for the fallback
        and another one if that path does not exist either; the (possibly
        non-existing) path is returned in any case.
        """
        # Imported once up front so both warning sites below can rely on it.
        import warnings

        p = self.path.parent / self.index_name
        if not p.exists():
            warnings.warn("Fudging to lower case.")
            p = self.path.parent / self.index_name.lower()
        if not p.exists():
            warnings.warn("`index_path` still doesn't exist.")
        return p

    @property
    def pvl_lbl(self):
        "The parsed label; the file is re-read on every access."
        return pvl.load(str(self.path))

    @property
    def table(self):
        "The label object that the table pointer refers to."
        return self.pvl_lbl[self.tablename]

    @property
    def pvl_columns(self):
        "List of all COLUMN objects of the table."
        return self.table.getlist("COLUMN")

    @property
    def columns_dic(self):
        "Mapping of column NAME to its COLUMN object."
        return {col["NAME"]: col for col in self.pvl_columns}

    @property
    def colnames(self):
        """Read the columns in an PDS index label file.

        The label file for the PDS indices describes the content
        of the index files. Array columns contribute one name per item.
        """
        colnames = []
        for col in self.pvl_columns:
            colnames.extend(PVLColumn(col).name_as_list)
        return colnames

    @property
    def colspecs(self):
        "Flat list of (start, stop) tuples, one per entry of `colnames`."
        colspecs = []
        for column in self.pvl_columns:
            pvlcol = PVLColumn(column)
            if pvlcol.items is None:
                # scalar column: a single (start, stop) tuple
                colspecs.append(pvlcol.colspecs)
            else:
                # array column: already a list of tuples, one per item
                colspecs.extend(pvlcol.colspecs)
        return colspecs

    def read_index_data(self, convert_times=True):
        """Read the table at `index_path` via `index_to_df`.

        Parameters
        ----------
        convert_times : bool
            Passed through to `index_to_df`.
        """
        return index_to_df(self.index_path, self, convert_times=convert_times)

In [None]:
# export
def index_to_df(indexpath, label, convert_times=True):
    """Read a fixed-width PDS index table into a pandas DataFrame.

    The column names and byte ranges are taken from the given label object,
    so any table described by such a label can be read.

    Parameters
    ----------
    indexpath : str or pathlib.Path
        The path to the index TAB file.
    label : pdstools.IndexLabel object
        Label object providing the column names and the column widths via its
        attributes 'colnames' and 'colspecs'.
    convert_times : bool
        If True, columns with "TIME" in their name are converted to datetime,
        except those that also contain "COUNT" and the column "LOCAL_TIME".

    Returns
    -------
    pandas.DataFrame
    """
    table = pd.read_fwf(
        Path(indexpath),
        header=None,
        names=label.colnames,
        colspecs=label.colspecs,
    )
    if not convert_times:
        return table

    time_columns = [
        name
        for name in table.columns
        # LOCAL_TIME is deliberately left unconverted
        if "TIME" in name and "COUNT" not in name and name != "LOCAL_TIME"
    ]
    for name in time_columns:
        print(f"Converting times for column {name}.")
        try:
            converted = pd.to_datetime(table[name])
        except ValueError:
            # Second attempt with the explicit NASA format; values that still
            # do not parse are coerced to NaT.
            converted = pd.to_datetime(
                table[name], format=utils.nasa_dt_format_with_ms, errors="coerce"
            )
        table[name] = converted
    print("Done.")
    return table

In [None]:
# export
class Index:
    """Index manager class.

    Parameters
    ----------
    key : str
        Nested key in form of mission.instrument.indexes.index_name
    url : str, optional
        URL to index. If not given, will be read from config object.
    timestamp : str
        Timestamp in ISO time format yy-mm-ddTHH:MM:SS.
        This is read from the config object. Its value is the time of the last
        download.
    """

    def __init__(self, key, url=None):
        self.key = key if key.startswith('mission') else "missions."+key
        self.set_url(url)
        self.timestamp = config.get_value(self.key)["timestamp"]
        self.new_timestamp = None  # filled by needs_download()

    def set_url(self, url):
        try:
            self.url = config.get_value(self.key)["url"] if url is None else url
        except KeyError:
            self.url = dynamic_urls[self.instrument_key]().latest_index_label_url
        
    @property
    def needs_download(self):
        """Determine if the index needs to be downloaded.

        Download shall happen when
        (1) no local timestamp was stored or
        (2) when the remote timestamp is newer.

        Parameters
        ----------
        index : indices.Index (namedtuple)
            Index holding the timestamp attribute read from the config file

        Returns
        -------
        bool
            Boolean indicating if download shall happen.
        """
        remote_timestamp = utils.get_remote_timestamp(self.url)
        self.new_timestamp = remote_timestamp
        if self.timestamp:
            if remote_timestamp > parser.parse(self.timestamp):
                return True
        else:
            # also return True when the timestamp is not valid
            return True
        # all other cases no D/L required
        return False

    @property
    def key_tokens(self):
        return self.key.split(".")

    @property
    def mission(self):
        return self.key_tokens[1]

    @property
    def mission_key(self):
        return '.'.join(self.key_tokens[1:2])

    @property
    def instrument(self):
        return self.key_tokens[2]

    @property
    def instrument_key(self):
        return '.'.join(self.key_tokens[1:3])

    @property
    def index_name(self):
        "str: Examples: EDR, RDR, moon_summary"
        return self.key_tokens[3]

    @property
    def label_filename(self):
        return Path(self.url.split("/")[-1])

    @property
    def isupper(self):
        return self.label_filename.suffix.isupper()

    @property
    def table_filename(self):
        new_suffix = ".TAB" if self.isupper else ".tab"
        return self.label_filename.with_suffix(new_suffix)

    @property
    def label_path(self):
        return Path(urlsplit(self.url).path)

    @property
    def table_path(self):
        return self.label_path.with_name(self.table_filename.name)

    @property
    def table_url(self):
        tokens = urlsplit(self.url)
        return urlunsplit(
            tokens._replace(
                path=str(self.label_path.with_name(self.table_filename.name))
            )
        )

    @property
    def local_dir(self):
        p = storage_root / str(self.key).replace(".", "/")
        p.mkdir(parents=True, exist_ok=True)
        return p

    @property
    def local_table_path(self):
        return self.local_dir / self.table_filename

    @property
    def local_label_path(self):
        return self.local_dir / self.label_filename

    @property
    def local_hdf_path(self):
        return self.local_table_path.with_suffix(".hdf")

    @property
    def df(self):
        return pd.read_hdf(self.local_hdf_path)

    def download(self, local_dir="", convert_to_hdf=True, force_update=False):
        """Wrapping URLs for downloading PDS indices and their label files.

        Parameters
        ----------
        key : str, optional
            Period-separated key into the available index files, e.g. cassini.uvis.moon_summary
        label_url : str, optional
            Alternative to using the index system, the user can provide the URL to a label
            for an index. The table file has to be in the same folder, as usual.
        local_dir: str, pathlib.Path, optional
            Path for local storage. Default: current directory and filename from URL
        convert_to_hdf : bool
            Switch to convert the index automatically to a faster loading HDF file
        """
        if not local_dir:
            local_dir = self.local_dir
        # check timestamp
        if not self.needs_download and not force_update:
            print("Stored index is up-to-date.")
            return
        label_url = self.url
        logger.info("Downloading %s." % label_url)
        local_label_path, _ = utils.download(label_url, local_dir)
        logger.info("Downloading %s.", self.table_url)
        local_data_path, _ = utils.download(self.table_url, local_dir)
        self.update_timestamp()
        if convert_to_hdf is True:
            self.convert_to_hdf()
            print(f"Downloaded and converted to pandas HDF:\n{self.local_hdf_path}")
        else:
            print(f"Downloaded {local_label_path} and {local_data_path}")

    def update_timestamp(self):
        # Note: the config object writes itself out after setting any value
        config.set_value(f"{self.key}.timestamp", self.new_timestamp.isoformat())

    def convert_to_hdf(self):
        label = IndexLabel(self.local_label_path)
        df = label.read_index_data()
        df.to_hdf(self.local_hdf_path, "df")

    def __str__(self):
        s = f"Key: {self.key}\n"
        s += f"URL: {self.url}\n"
        s += f"Timestamp: {self.timestamp}\n"
        return s

    def __repr__(self):
        return self.__str__()

In [None]:
key = 'missions.cassini.iss.indexes.moon_summary'

In [None]:
index = Index(key)

In [None]:
index

Key: missions.cassini.iss.indexes.moon_summary
URL: https://pds-rings.seti.org/holdings/metadata/COISS_2xxx/COISS_2999/COISS_2999_moon_summary.lbl
Timestamp: 2021-07-20T11:00:52.975329

In [None]:
index.mission_key

'cassini'

In [None]:
index.local_dir

Path('/home/maye/big_drive/planetary_data/missions/cassini/iss/indexes/moon_summary')

In [None]:
index.needs_download

False

In [None]:
index.download()

Stored index is up-to-date.


In [None]:
index.download(force_update=True)

COISS_2999_moon_summary.lbl: 0.00B [00:00, ?B/s]

COISS_2999_moon_summary.tab: 0.00B [00:00, ?B/s]

Done.
Downloaded and converted to pandas HDF:
/home/maye/big_drive/planetary_data/missions/cassini/iss/indexes/moon_summary/COISS_2999_moon_summary.hdf


In [None]:
# export
def decode_line(linedata, labelpath):
    """Print every column of one table line, split according to its label.

    For each COLUMN described in the label, the column name and the matching
    slice (or list of slices for array columns) of `linedata` are printed.

    Parameters
    ----------
    linedata : str
        One line of a .tab data file
    labelpath : str or pathlib.Path
        Path to the appropriate label that describes the data.
    """
    column_defs = IndexLabel(labelpath).pvl_columns
    for pvlcol in map(PVLColumn, column_defs):
        print(pvlcol.name, pvlcol.decode(linedata))

In [None]:
# export
def find_mixed_type_cols(df, fix=True):
    """For a given dataframe, find the columns that are of mixed type.

    Tool to help with the performance warning when trying to save a pandas DataFrame as a HDF.
    When a column changes datatype somewhere, pickling occurs, slowing down the reading process of the HDF file.

    The name of every mixed-type column is printed as it is found.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to be searched for mixed data-types. Modified in place when
        `fix` is set.
    fix : bool
        Switch to control if NaN values in these problem columns should be replaced by the string 'UNKNOWN'

    Returns
    -------
    List of column names that have data type changes within themselves.
    """
    result = []
    for col in df.columns:
        # More than one distinct Python type among the values means the column
        # is mixed. An empty column has zero types and is never reported.
        if df[col].map(type).nunique() > 1:
            print(col)
            result.append(col)
    if fix:
        for col in result:
            # Assign back instead of `df[col].fillna(..., inplace=True)`: the
            # chained in-place form only changes a temporary under pandas
            # Copy-on-Write and would leave `df` untouched.
            df[col] = df[col].fillna("UNKNOWN")
    return result

In [None]:
# export
def fix_hirise_edrcumindex(infname, outfname):
    """Fix HiRISE EDRCUMINDEX.

    The HiRISE EDRCUMINDEX has some broken lines where the SCAN_EXPOSURE_DURATION is of format
    F10.4 instead of the defined F9.4.
    This function simply replaces those incidences with one less decimal fraction, so 20000.0000
    becomes 20000.000.

    Only the SCAN_EXPOSURE_DURATION field itself is rewritten; other fields of
    the line are copied verbatim, even if they contain the same text. Lines
    with too few comma-separated fields are copied unchanged.

    Parameters
    ----------
    infname : str
        Path to broken EDRCUMINDEX.TAB
    outfname : str
        Path where to store the fixed TAB file
    """
    exposure_field = 21  # 0-based position of SCAN_EXPOSURE_DURATION in a line
    field_width = 9  # width of the defined F9.4 format
    with open(infname) as f, open(outfname, "w") as newf:
        for line in tqdm(f):
            fields = line.split(",")
            if len(fields) > exposure_field:
                exp = fields[exposure_field]
                if float(exp) > 9999.999:
                    # Truncate just this field. A textual replace on the whole
                    # line would also alter any other field with equal content.
                    fields[exposure_field] = exp[:field_width]
                    line = ",".join(fields)
            newf.write(line)

In [None]:
index.convert_to_hdf()

Done.


In [None]:
index.df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 385719 entries, 0 to 385718
Data columns (total 35 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   VOLUME_ID                             385719 non-null  object 
 1   FILE_SPECIFICATION_NAME               385719 non-null  object 
 2   OPUS_ID                               385719 non-null  object 
 3   TARGET_NAME                           385719 non-null  object 
 4   MINIMUM_PLANETOCENTRIC_LATITUDE       385719 non-null  float64
 5   MAXIMUM_PLANETOCENTRIC_LATITUDE       385719 non-null  float64
 6   MINIMUM_PLANETOGRAPHIC_LATITUDE       385719 non-null  float64
 7   MAXIMUM_PLANETOGRAPHIC_LATITUDE       385719 non-null  float64
 8   MINIMUM_IAU_LONGITUDE                 385719 non-null  float64
 9   MAXIMUM_IAU_LONGITUDE                 385719 non-null  float64
 10  MINIMUM_LOCAL_HOUR_ANGLE              385719 non-null  float64
 11  

In [None]:
index

Key: missions.cassini.iss.indexes.moon_summary
URL: https://pds-rings.seti.org/holdings/metadata/COISS_2xxx/COISS_2999/COISS_2999_moon_summary.lbl
Timestamp: 2019-06-08T16:28:22

In [None]:
index.key_tokens[:2]

['missions', 'cassini']

In [None]:
index.mission

'cassini'

In [None]:
index.mission_key

'missions.cassini'

In [None]:
index.instrument_key

'missions.cassini.iss'

In [None]:
config.get_value(index.instrument_key).get('indexer')

In [None]:
index = Index('mro.ctx')

Scraping latest release page ...
Scraping volumes page ...


KeyError: 'timestamp'

In [None]:
getattr(pds, 'ctx_index')

AttributeError: module 'planetarypy.pds' has no attribute 'ctx_index'

In [None]:
from planetarypy.pds import ctx_index

In [None]:
ctx_index.