Adding ability to download a given label URL

Using the CTX scraper in pdstools/scraper, one can get the URL for the latest label file, which can then be passed to pdstools/download().
michaelaye · May 14, 2019 · 65e3794 · 65e3794
1 parent cdb0034
commit 65e3794
Show file tree

Hide file tree

Showing 2 changed files with 61 additions and 49 deletions.
diff --git a/planetpy/pdstools/indices.py b/planetpy/pdstools/indices.py
@@ -13,6 +13,7 @@
 from tqdm import tqdm
 
 from .. import utils
+from .scraper import CTXIndex
 
 try:
     from importlib_resources import read_text
@@ -57,18 +58,25 @@ def replace_url_suffix(url, new_suffix=".tab"):
     )
 
 
-def download(key, local_dir=".", convert_to_hdf=True):
+def download(key=None, label_url=None, local_dir=".", convert_to_hdf=True):
     """Wrapping URLs for downloading PDS indices and their label files.
 
     Parameters
     ==========
-    key : str
+    key : str, optional
         Colon-separated key into the available index files, e.g. cassini:uvis:moon_summary
-    localpath: str, pathlib.Path, optional
+    label_url : str, optional
+        Alternative to using the index system, the user can provide the URL to a label
+        for an index. The table file has to be in the same folder, as usual.
+    local_dir: str, pathlib.Path, optional
         Path for local storage. Default: current directory and filename from URL
     """
-    mission, instr, index = key.split(":")
-    label_url = indices_urls[mission][instr][index]
+    if label_url is None:
+        if key is not None:
+            mission, instr, index = key.split(":")
+            label_url = indices_urls[mission][instr][index]
+        else:
+            raise SyntaxError("One of key or label_url needs to be given.")
     logger.info("Downloading %s." % label_url)
     local_label_path, _ = utils.download(label_url, local_dir)
     data_url = replace_url_suffix(label_url)
@@ -315,47 +323,3 @@ def fix_hirise_edrcumindex(infname, outfname):
                     _ = newf.write(line.replace(exp, exp[:9]))
                 else:
                     _ = newf.write(line)
-
-
-# TODO:
-# if not labelpath.exists():
-#     df = pd.read_csv(indexpath, header=None)
-
-
-# FIXME
-def convert_indexfiles_to_hdf(folder):
-    """Convert all indexfiles to an HDF database.
-
-    Search for .tab files in `folder`, read them into a dataframe,
-    concat to large dataframe at the end and store as HDF file.
-
-    Parameters
-    ----------
-    folder : str or pathlib.Path
-        Folder in where to search for .tab files
-    labelpath : str or pathlb.Path
-    """
-    indexdir = Path(folder)
-    # TODO: make it work for .TAB as well
-    indexfiles = list(indexdir.glob("*.tab"))
-    bucket = []
-    if PROGRESSBAR_EXISTS:
-        bar = progressbar.ProgressBar(max_value=len(indexfiles))
-    for i, indexfile in enumerate(indexfiles):
-        # convert times later, more performant
-        df = index_to_df(indexfile, convert_times=False)
-        df["index_fname"] = str(indexfile)
-        bucket.append(df)
-        if bar:
-            bar.update(i)
-    totalindex = pd.concat(bucket, ignore_index=True)
-    # Converting timestrings to datetimes
-    print("Converting times...")
-    for column in [i for i in totalindex.columns if "TIME" in i]:
-        totalindex[column] = pd.to_datetime(
-            totalindex[column].map(utils.nasa_datetime_to_iso)
-        )
-    # TODO: Clean up old iss references
-    savepath = indexdir / "iss_totalindex.hdf"
-    totalindex.to_hdf(savepath, "df")
-    print(f"Created pandas HDF index database file here:\n{savepath}")
diff --git a/planetpy/pdstools/scraper.py b/planetpy/pdstools/scraper.py
@@ -0,0 +1,48 @@
+from pathlib import Path
+from string import Template
+
+import pandas as pd
+
+
+class CTXIndex:
+    """Scraper that locates the latest CTX index label on the PDS imaging node.
+
+    All properties perform their web requests on every access; nothing is
+    cached on the instance.
+    """
+
+    volumes_url = "https://pds-imaging.jpl.nasa.gov/volumes/mro.html"
+    release_url_template = Template(
+        "https://pds-imaging.jpl.nasa.gov/volumes/mro/release${release}.html"
+    )
+    volume_url_template = Template(
+        "https://pds-imaging.jpl.nasa.gov/data/mro/mars_reconnaissance_orbiter/ctx/mrox_${volume}/"
+    )
+
+    @property
+    def web_tables_list(self):
+        """list of pandas.DataFrame: all HTML tables found on the volumes page."""
+        print("Scraping volumes page ...")
+        return pd.read_html(self.volumes_url)
+
+    @property
+    def release_number(self):
+        """str: last whitespace-separated token of the newest release entry."""
+        tables = self.web_tables_list
+        # The last item of last table looks like "Release XX"
+        return tables[-1].iloc[-1, 0].split()[-1]
+
+    @property
+    def release_url(self):
+        """str: URL of the release page for `release_number`."""
+        return self.release_url_template.substitute(release=self.release_number)
+
+    @property
+    def latest_volume_url(self):
+        """str: URL of the newest volume listed on the latest release page.
+
+        Raises
+        ------
+        ValueError
+            If no volume number can be parsed from the last row of the table.
+        """
+        print("Scraping latest release page ...")
+        tables = pd.read_html(self.release_url)
+        # get last row of 4th table
+        row = tables[3].iloc[-1]
+        number = None
+        # Walk the row left to right, remembering the most recent number; the
+        # first cell without a .split() method (e.g. a NaN filler) ends the scan.
+        for elem in row.values:
+            try:
+                number = int(elem.split()[-1])
+            except AttributeError:
+                break
+        if number is None:
+            # Fail loudly instead of returning a bogus ".../mrox_None/" URL.
+            raise ValueError(
+                "Could not find a volume number on the latest release page."
+            )
+        return self.volume_url_template.substitute(volume=number)
+
+    @property
+    def latest_index_label_url(self):
+        """str: URL of ``index/cumindex.lbl`` inside the latest volume.
+
+        Built with string operations on purpose: ``pathlib.Path`` collapses the
+        ``//`` after the URL scheme and returns a path object, not a URL string.
+        """
+        return self.latest_volume_url.rstrip("/") + "/index/cumindex.lbl"