Skip to content

Commit

Permalink
Adding ability to download a given label URL
Browse files Browse the repository at this point in the history
Using the CTX scraper in pdstools/scraper, one can get the URL for the latest label file, which can then be passed to pdstools/download().
  • Loading branch information
michaelaye committed May 14, 2019
1 parent cdb0034 commit 65e3794
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 49 deletions.
62 changes: 13 additions & 49 deletions planetpy/pdstools/indices.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from tqdm import tqdm

from .. import utils
from .scraper import CTXIndex

try:
from importlib_resources import read_text
Expand Down Expand Up @@ -57,18 +58,25 @@ def replace_url_suffix(url, new_suffix=".tab"):
)


def download(key, local_dir=".", convert_to_hdf=True):
def download(key=None, label_url=None, local_dir=".", convert_to_hdf=True):
"""Wrapping URLs for downloading PDS indices and their label files.
Parameters
==========
key : str
key : str, optional
Colon-separated key into the available index files, e.g. cassini:uvis:moon_summary
localpath: str, pathlib.Path, optional
label_url : str, optional
Alternative to using the index system, the user can provide the URL to a label
for an index. The table file has to be in the same folder, as usual.
local_dir: str, pathlib.Path, optional
Path for local storage. Default: current directory and filename from URL
"""
mission, instr, index = key.split(":")
label_url = indices_urls[mission][instr][index]
if label_url is None:
if key is not None:
mission, instr, index = key.split(":")
label_url = indices_urls[mission][instr][index]
else:
raise SyntaxError("One of key or label_url needs to be given.")
logger.info("Downloading %s." % label_url)
local_label_path, _ = utils.download(label_url, local_dir)
data_url = replace_url_suffix(label_url)
Expand Down Expand Up @@ -315,47 +323,3 @@ def fix_hirise_edrcumindex(infname, outfname):
_ = newf.write(line.replace(exp, exp[:9]))
else:
_ = newf.write(line)


# TODO:
# if not labelpath.exists():
# df = pd.read_csv(indexpath, header=None)


# FIXME
def convert_indexfiles_to_hdf(folder):
    """Convert all index files in a folder into one HDF database.

    Searches `folder` for ``*.tab`` files, reads each of them into a
    dataframe with `index_to_df`, concatenates them into one large
    dataframe and stores that as ``iss_totalindex.hdf`` inside `folder`.
    A column ``index_fname`` records which file each row came from.

    Parameters
    ----------
    folder : str or pathlib.Path
        Folder in which to search for .tab files. The resulting HDF file
        is written into the same folder.

    Notes
    -----
    If no ``*.tab`` file is found, `pd.concat` is called with an empty
    list, which pandas rejects with a ``ValueError``.
    """
    indexdir = Path(folder)
    # TODO: make it work for .TAB as well
    indexfiles = list(indexdir.glob("*.tab"))
    bucket = []
    # `bar` has to be bound in every case: it is checked inside the loop
    # below even when the optional progressbar package is not available.
    bar = None
    if PROGRESSBAR_EXISTS:
        bar = progressbar.ProgressBar(max_value=len(indexfiles))
    for i, indexfile in enumerate(indexfiles):
        # convert times later, more performant
        df = index_to_df(indexfile, convert_times=False)
        df["index_fname"] = str(indexfile)
        bucket.append(df)
        if bar is not None:
            bar.update(i)
    totalindex = pd.concat(bucket, ignore_index=True)
    # Converting timestrings to datetimes
    print("Converting times...")
    time_columns = [col for col in totalindex.columns if "TIME" in col]
    for column in time_columns:
        totalindex[column] = pd.to_datetime(
            totalindex[column].map(utils.nasa_datetime_to_iso)
        )
    # TODO: Clean up old iss references
    savepath = indexdir / "iss_totalindex.hdf"
    # Pass the key by keyword; positional use is deprecated in newer pandas.
    totalindex.to_hdf(savepath, key="df")
    print(f"Created pandas HDF index database file here:\n{savepath}")
48 changes: 48 additions & 0 deletions planetpy/pdstools/scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from pathlib import Path
from string import Template

import pandas as pd


class CTXIndex:
    """Scraper that locates the latest CTX index label on the PDS imaging node.

    All values are exposed as properties and are computed on access; nothing
    is cached, so every access of a scraping property triggers a new
    download of the respective web page.
    """

    # Page listing the MRO releases.
    volumes_url = "https://pds-imaging.jpl.nasa.gov/volumes/mro.html"
    # Page of one specific release; needs the release number.
    release_url_template = Template(
        "https://pds-imaging.jpl.nasa.gov/volumes/mro/release${release}.html"
    )
    # Root folder of one CTX volume; needs the volume number.
    volume_url_template = Template(
        "https://pds-imaging.jpl.nasa.gov/data/mro/mars_reconnaissance_orbiter/ctx/mrox_${volume}/"
    )

    @property
    def web_tables_list(self):
        """Return the tables `pandas.read_html` finds on the volumes page."""
        print("Scraping volumes page ...")
        return pd.read_html(self.volumes_url)

    @property
    def release_number(self):
        """Return the latest release number as scraped from the volumes page."""
        tables = self.web_tables_list
        # The last item of last table looks like "Release XX"
        return tables[-1].iloc[-1, 0].split()[-1]

    @property
    def release_url(self):
        """Return the URL of the page for the latest release."""
        return self.release_url_template.substitute(release=self.release_number)

    @property
    def latest_volume_url(self):
        """Return the URL of the latest CTX volume folder.

        Raises
        ------
        ValueError
            If no volume number could be read from the release page.
        """
        print("Scraping latest release page ...")
        tables = pd.read_html(self.release_url)
        # get last row of 4th table
        row = tables[3].iloc[-1]
        number = None
        # first number that is NAN breaks the loop over last row of table
        for elem in row.values:
            try:
                number = int(elem.split()[-1])
            except AttributeError:
                # NaN is a float and has no `.split`
                break
        if number is None:
            # Without this guard the template would yield ".../mrox_None/".
            raise ValueError(
                "Could not determine the latest CTX volume number from "
                f"{self.release_url}"
            )
        return self.volume_url_template.substitute(volume=number)

    @property
    def latest_index_label_url(self):
        """Return the URL (str) of the cumulative index label of the latest volume.

        The URL is assembled by string operations: pathlib would collapse
        the ``//`` after the scheme and use backslashes on Windows, both of
        which produce an invalid URL.
        """
        return self.latest_volume_url.rstrip("/") + "/index/cumindex.lbl"

0 comments on commit 65e3794

Please sign in to comment.