Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unify download for antenna models / proposal tables / shower library with additional fallback servers #673

Merged
merged 14 commits into from
Jun 14, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 5 additions & 18 deletions NuRadioMC/EvtGen/proposal_table_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,27 +89,14 @@ def download_proposal_tables(config_file, tables_path=None):
tables_path = proposal_func._ProposalFunctions__tables_path

# does not exist yet -> download file
import requests
from NuRadioReco.utilities.dataservers import download_from_dataserver
proposal_version = proposal.__version__
URL = f'https://rnog-data.zeuthen.desy.de/proposal_tables/v{proposal_version}/{get_compiler()}/{config_file}.tar.gz'
remote_path = f'proposal_tables/v{proposal_version}/{get_compiler()}/{config_file}.tar.gz'
target_path = f"{tables_path}/{config_file}.tar.gz"

folder = tables_path #os.path.dirname(tables_path)
if not os.path.exists(folder):
os.makedirs(folder)
logger.warning(
"downloading pre-calculated proposal tables for {} from {}. This can take a while...".format(config_file, URL))
r = requests.get(URL)
if r.status_code != requests.codes.ok:
logger.error("error in download of proposal tables")
raise IOError

with open(f"{tables_path}/{config_file}.tar.gz", "wb") as code:
code.write(r.content)
logger.warning("...download finished.")
logger.warning(f"...unpacking archive to {tables_path}")
shutil.unpack_archive(f"{tables_path}/{config_file}.tar.gz", tables_path)
os.remove(f"{tables_path}/{config_file}.tar.gz")

"downloading pre-calculated proposal tables for {}. This can take a while...".format(config_file))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

did you forget to delete this line?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I intended to leave it here as is. The logger message command starts the line above, so just this line starting with a string looks odd here.

download_from_dataserver(remote_path, target_path, unpack_tarball=True)


if __name__ == "__main__":
Expand Down
14 changes: 4 additions & 10 deletions NuRadioMC/SignalGen/ARZ/ARZ.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,23 +375,17 @@ def __check_and_get_library(self):
if("{:d}.{:d}".format(*self._version) in lib_hashs.keys()):
if(sha1.hexdigest() != lib_hashs["{:d}.{:d}".format(*self._version)]):
logger.warning("shower library {} has changed on the server. downloading newest version...".format(self._version))
os.remove(path)
download_file = True
else:
logger.warning("no hash sum of {} available, skipping up-to-date check".format(os.path.basename(path)))
if not download_file:
return True
else:
import requests
URL = 'https://rnog-data.zeuthen.desy.de/shower_library/library_v{:d}.{:d}.pkl'.format(*self._version)
from NuRadioReco.utilities.dataservers import download_from_dataserver

logger.info("downloading shower library {} from {}. This can take a while...".format(self._version, URL))
r = requests.get(URL)
if (r.status_code != requests.codes.ok):
logger.error("error in download of antenna model")
raise IOError("error in download of antenna model")
with open(path, "wb") as code:
code.write(r.content)
logger.info("...download finished.")
remote_path = 'shower_library/library_v{:d}.{:d}.pkl'.format(*self._version)
download_from_dataserver(remote_path, path)

def __set_model_parameters(self, arz_version='ARZ2020'):
"""
Expand Down
23 changes: 6 additions & 17 deletions NuRadioReco/detector/antennapattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -536,7 +536,6 @@ def save_preprocessed_WIPLD_forARA(path):
np.angle(H_theta[mask][i]) / units.deg,
np.angle(H_phi[mask][i]) / units.deg))


def get_pickle_antenna_response(path):
"""
opens and return the pickle file containing the preprocessed WIPL-D antenna simulation
Expand Down Expand Up @@ -577,29 +576,19 @@ def get_pickle_antenna_response(path):
if sha1.hexdigest() != antenna_hashs[os.path.basename(path)]:
logger.warning("antenna model {} has changed on the server. downloading newest version...".format(
os.path.basename(path)))
os.remove(path) # remove outdated file
download_file = True
else:
logger.warning("no hash sum of {} available, skipping up-to-date check".format(os.path.basename(path)))

if download_file:
# does not exist yet -> download file
import requests
from NuRadioReco.utilities.dataservers import download_from_dataserver

antenna_pattern_name = os.path.splitext(os.path.basename(path))[0]
URL = 'https://rnog-data.zeuthen.desy.de/AntennaModels/{name}/{name}.pkl'.format(
name=antenna_pattern_name)

folder = os.path.dirname(path)
if not os.path.exists(folder):
os.makedirs(folder)
logger.info(
"downloading antenna pattern {} from {}. This can take a while...".format(antenna_pattern_name, URL))
r = requests.get(URL)
if r.status_code != requests.codes.ok:
logger.error("error in download of antenna model")
raise IOError
with open(path, "wb") as code:
code.write(r.content)
logger.warning("...download finished.")
remote_path = 'AntennaModels/{name}/{name}.pkl'.format(name=antenna_pattern_name)

download_from_dataserver(remote_path, path)

# # does not exist yet -> precalculating WIPLD simulations from raw WIPLD output
# preprocess_WIPLD(path)
Expand Down
115 changes: 115 additions & 0 deletions NuRadioReco/utilities/dataservers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import requests
import os
import filelock
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

filelock claims it requires python >= 3.8, is that a problem?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well the tests download without problems. We require 3.6 for NuRadioMC currently. Do you know alternatives? Or would you pin filelock to v3.4.1, the last release before they dropped Python 3.6 support?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess pip should do the right thing and automatically download an older version if needed (and it looks like the old versions still exist on pypi), so I think this should be fine.

import logging
import shutil
from glob import glob

logger = logging.getLogger('NuRadioReco.dataservers')

# Primary (DESY) server first, fallback mirror(s) after it.
dataservers = ["https://rnog-data.zeuthen.desy.de", "https://rno-g.uchicago.edu/data/desy-mirror"]


def get_available_dataservers_by_responsetime(dataservers=dataservers):
    """
    Probe each dataserver with a small index request and rank the responsive ones.

    Parameters
    ----------
    dataservers: list of str
        Server base URLs to probe (default: module-level `dataservers`).

    Returns
    -------
    ranked_dataservers: list of str
        The subset of servers that answered, ordered by ascending response time.
    """
    response_times = []
    available_dataservers = []

    for dataserver in dataservers:
        # request the index of the shower_library directory, because it is short
        testdir = f"{dataserver}/shower_library/"
        try:
            response = requests.get(testdir, timeout=5)
            response.raise_for_status()
        except requests.exceptions.RequestException:
            # server unreachable, timed out, or returned an error status -> skip it
            # (narrowed from a bare `except:`, which would also swallow e.g. KeyboardInterrupt)
            continue
        response_times.append(response.elapsed)
        available_dataservers.append(dataserver)

    # sort the responsive servers by their measured response time (fastest first)
    ranked_dataservers = [x for _, x in sorted(zip(response_times, available_dataservers))]
    return ranked_dataservers

def get_available_dataservers_by_timezone(dataservers=dataservers):
    """
    Rank the dataservers by proximity of their timezone to the local timezone.

    Looks up each server's timezone via a GeoIP database query of its IP
    address and orders the servers by the circular difference between the
    server's UTC offset and the local UTC offset, closest first.

    Parameters
    ----------
    dataservers: list of str
        Server base URLs (default: module-level `dataservers`).

    Returns
    -------
    ranked_dataservers: list of str
        The servers ordered by increasing timezone distance.
    """
    import socket
    import pytz
    from datetime import datetime
    from urllib.parse import urlparse
    from geolite2 import geolite2

    geo = geolite2.reader()

    naive = datetime.utcnow()
    utcoffset_local = naive.astimezone().utcoffset().total_seconds() / 3600
    server_offsets = []
    for dataserver in dataservers:
        # gethostbyname() expects a bare hostname; the server entries are full
        # URLs ("https://..."), so strip the scheme/path before resolving
        hostname = urlparse(dataserver).hostname
        dataserver_ip = socket.gethostbyname(hostname)
        dataserver_timezone = geo.get(dataserver_ip)["location"]["time_zone"]
        timezone = pytz.timezone(dataserver_timezone)
        utcoffset_server = timezone.localize(naive).utcoffset().total_seconds() / 3600

        # circular distance (in hours) on the 24h clock between the two UTC
        # offsets; a plain `% 12` would rank e.g. a -1h difference as 11h away
        diff = abs(utcoffset_local - utcoffset_server) % 24
        server_offsets.append(min(diff, 24 - diff))

    ranked_dataservers = [x for _, x in sorted(zip(server_offsets, dataservers))]
    return ranked_dataservers

def download_from_dataserver(remote_path, target_path, unpack_tarball=True, dataservers=dataservers, try_ordered=False):
    """
    Download `remote_path` to `target_path` from the list of NuRadio dataservers.

    The servers are tried in order and the first successful download wins.
    A file lock next to `target_path` serializes concurrent downloads of the
    same file (e.g. when many batch jobs start at once).

    Parameters
    ----------
    remote_path: str
        Path of the file relative to the dataserver root.
    target_path: str
        Local path (including filename) where the download is stored.
    unpack_tarball: bool (default: True)
        If True and `target_path` ends in ".tar.gz", unpack the archive into
        the target directory and delete the tarball afterwards.
    dataservers: list of str
        Server base URLs to try (default: module-level `dataservers`).
    try_ordered: bool (default: False)
        If True, rank the servers by response time before downloading.

    Raises
    ------
    IOError
        If the file could not be downloaded from any of the servers.
    """
    folder = os.path.dirname(target_path)
    if not os.path.exists(folder):
        os.makedirs(folder)

    lockfile = target_path + ".lock"
    lock = filelock.FileLock(lockfile)

    logger.warning(f"Assuring no other process is downloading. Will wait until {lockfile} is unlocked.")
    with lock:
        # another instance may have completed the download while we waited for the lock
        if os.path.isfile(target_path):
            logger.warning(f"{target_path} already exists. Maybe download was already completed by another instance?")
            return
        elif unpack_tarball and (len(glob(folder + "/*.dat")) > 0):  # just check if any .dat files present (similar to NuRadioProposal.py)
            logger.warning(f"{folder} contains .dat files. Maybe download was already completed by another instance?")
            return

        if try_ordered:
            dataservers = get_available_dataservers_by_responsetime(dataservers)

        requests_status = requests.codes["not_found"]
        for dataserver in dataservers:
            URL = f'{dataserver}/{remote_path}'

            logger.warning(
                "downloading file {} from {}. This can take a while...".format(target_path, URL))

            try:
                r = requests.get(URL)
                r.raise_for_status()
                requests_status = r.status_code
                break
            except requests.exceptions.HTTPError:
                logger.warning(f"HTTP Error for {dataserver}. Does the file {remote_path} exist on the server?")
            except requests.exceptions.ConnectionError:
                logger.warning(f"Error Connecting to {dataserver}. Maybe you don't have internet... or the server is down?")
            except requests.exceptions.Timeout:
                logger.warning(f"Timeout Error for {dataserver}.")
            except requests.exceptions.RequestException as err:
                # lazy %-formatting so the exception is actually rendered
                # (an f-string message with `err` as extra positional arg would
                # make the logging formatter fail internally)
                logger.warning("An unusual error for %s occurred: %s", dataserver, err)

            # only reached if this server failed (the success path breaks out above)
            logger.warning("problem downloading file {} from {}. Let's see if there is another server.".format(target_path, URL))

        if requests_status != requests.codes["ok"]:
            logger.error(f"error in download of file {target_path}. Tried all servers in {dataservers} without success.")
            raise IOError(f"error in download of file {target_path}")

        with open(target_path, "wb") as code:
            code.write(r.content)
        logger.warning("...download finished.")

        if unpack_tarball and target_path.endswith(".tar.gz"):
            target_dir = os.path.dirname(target_path)
            logger.warning(f"...unpacking archive to {target_dir}")
            shutil.unpack_archive(target_path, target_dir)
            os.remove(target_path)
3 changes: 3 additions & 0 deletions changelog.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ please update the categories "new features" and "bugfixes" before a pull request

version 2.3.0-dev
new features:
- Added download utility unifying download of antenna models/proposal tables/shower
library with possibility to add/change fallback-server(s). Added Chicago
server as fallback.
- Added new detector class for RNO-G (+db interface) which uses a mongo-db as source.
The new class allows import/export via compressed json files and has buffering machinery.
It comes with a class to handle responses of the different signal chain components easily.
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ toml = ">=0.10.2"
uproot = "4.1.1"
importlib-metadata = {version = ">=4.8.1", python = "<3.8"}
numba = "*"
filelock = "*"

[tool.poetry.dev-dependencies]
Sphinx = "*"
Expand Down