From 667853d3f819e7c4ea50ab8060fbad98a7d9e581 Mon Sep 17 00:00:00 2001 From: Dave Dykstra <2129743+DrDaveD@users.noreply.github.com> Date: Thu, 9 Apr 2020 16:28:29 -0500 Subject: [PATCH 1/7] read caches list from wlcg-wpad servers by default --- explanation.md | 4 +- setup.py | 7 +- stashcp/__init__.py | 200 +++++++++++++++++++++++--------- stashcp/caches.json | 10 -- stashcp/opensciencegrid.org.pub | 10 ++ 5 files changed, 161 insertions(+), 70 deletions(-) delete mode 100755 stashcp/caches.json create mode 100644 stashcp/opensciencegrid.org.pub diff --git a/explanation.md b/explanation.md index fc4de88..77568de 100644 --- a/explanation.md +++ b/explanation.md @@ -1,3 +1,5 @@ +WARNING: this is out of date. It is only left for historical reasons + ## Usage ``` @@ -139,4 +141,4 @@ It is recommended that `$timeout` not be set to 1 second, as tests showed that d * No record of whether the file was new to the cache it was pulled from or not -* Does not currently allow for file renaming. \ No newline at end of file +* Does not currently allow for file renaming. diff --git a/setup.py b/setup.py index 17acaa5..a34863d 100644 --- a/setup.py +++ b/setup.py @@ -152,17 +152,14 @@ # # If using Python 2.6 or earlier, then these have to be included in # MANIFEST.in as well. - package_data={'stashcp': ['caches.json'],}, - #package_data={ # Optional - # '': ['bin/caches.json'], - #}, + package_data={'stashcp': ['opensciencegrid.org.pub'],}, # Although 'package_data' is the preferred approach, in some case you may # need to place data files outside of your packages. See: # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # # In this case, 'data_file' will be installed into '/my_data' - data_files=[('share/stashcache/', ['stashcp/caches.json'])], + data_files=[('share/stashcache/', ['stashcp/opensciencegrid.org.pub'])], # To provide executable scripts, use entry points in preference to the # "scripts" keyword. Entry points provide cross-platform support and allow diff --git a/stashcp/__init__.py b/stashcp/__init__.py index 4754f73..2222ca9 100755 --- a/stashcp/__init__.py +++ b/stashcp/__init__.py @@ -12,6 +12,7 @@ import socket import random import shutil +import hashlib from urlparse import urlparse try: @@ -41,6 +42,9 @@ # Global variable for the location of the caches.json file caches_json_location = None +# Global variable for the name of a pre-configured cache list +cache_list_name = None + # Global variable for the location of the token to use for reading / writing token_location = None @@ -559,73 +563,78 @@ def get_ips(name): # always prefer IPv4 return ipv4s + ipv6s +# Return best stashcache and set nearest_cache_list global def get_best_stashcache(): global nearest_cache_list - # Check if the user provided a caches json file location - if caches_json_location and os.path.exists(caches_json_location): - cache_files = [ caches_json_location ] - else: - prefix = os.environ.get("OSG_LOCATION", "/") - cache_files = [os.path.join(prefix, "etc/stashcache/caches.json"), - os.path.join(prefix, "usr/share/stashcache/caches.json"), - os.path.join(prefix, "usr/local/share/stashcache/caches.json")] - if resource_filename: - try: - cache_files.append(resource_filename(__name__, 'caches.json')) - except IOError as ioe: - logging.debug("Unable to retrieve caches.json using resource string, trying other locations") - - for cache_file in cache_files: - if os.path.isfile(cache_file): - with open(cache_file, 'r') as f: - caches_list = json.loads(f.read()) - logging.debug("Loaded caches list from %s", cache_file) - break - else: - logging.error("Unable to find caches.json in %r", cache_files) - return None - - # Format the caches for the GeoIP query - caches_string = "" - usable_caches = [] - for cache in caches_list: - if cache['status'] == 0: - continue - usable_caches.append(cache) - parsed_url = urlparse(cache['name']) - caches_string = "%s,%s" % (caches_string, parsed_url.hostname) - caches_list = usable_caches - # Remove the first comma - caches_string = caches_string[1:] - # Use the geo ip service on the WLCG Web Proxy Auto Discovery machines geo_ip_sites = ["wlcg-wpad.cern.ch", "wlcg-wpad.fnal.gov"] - # Append text before caches string - append_text = "api/v1.0/geo/stashcp" - # Headers for the HTTP request headers = {'Cache-control': 'max-age=0', 'User-Agent': user_agent } # Randomize the geo ip sites random.shuffle(geo_ip_sites) - order_str = '' + + api_text = '' + + caches_list = [] + + # Check if the user provided a caches json file location + if caches_json_location and os.path.exists(caches_json_location): + # Use geo ip api on caches in provided json file + try: + with open(caches_json_location, 'r') as f: + caches_list = json.loads(f.read()) + logging.debug("Loaded caches list from %s", caches_json_location) + except: + logging.error("Unable to open or parse json in %s: %s", + caches_json_location, str(sys.exc_info()[1])) + return None + + # Format the caches for the GeoIP query + caches_string = "" + usable_caches = [] + for cache in caches_list: + if 'status' in cache and cache['status'] == 0: + continue + if 'name' in cache: + usable_caches.append(cache['name']) + parsed_url = urlparse(cache['name']) + caches_string = "%s,%s" % (caches_string, parsed_url.hostname) + if len(usable_caches) == 0: + logging.error("No cache names found in %s without zero status", caches_json_location) + return None + + caches_list = usable_caches + + # Remove the first comma + caches_string = caches_string[1:] + + api_text = "api/v1.0/geo/stashcp/" + caches_string + + else: + # Use stashservers.dat api + api_text = "stashservers.dat" + if cache_list_name != None: + api_text += '?list=' + cache_list_name + + responselines = [] i = 0 - while order_str == '' and i < len(geo_ip_sites): + while len(responselines) == 0 and i < len(geo_ip_sites): cur_site = geo_ip_sites[i] headers['Host'] = cur_site + logging.debug("Trying server site of %s", cur_site) for ip in get_ips(cur_site): - logging.debug("Trying geoip site of: %s [%s]", cur_site, ip) - final_url = "http://%s/%s/%s" % (ip, append_text, caches_string) - logging.debug("Querying for closest cache: %s", final_url) + final_url = "http://%s/%s" % (ip, api_text) + logging.debug("Querying %s", final_url) try: # Make the request req = urllib2.Request(final_url, headers=headers) response = urllib2.urlopen(req, timeout=10) if response.getcode() == 200: logging.debug("Got OK code 200 from %s", cur_site) - order_str = response.read() + responselines = response.read().split('\n') response.close() break response.close() @@ -633,26 +642,105 @@ def get_best_stashcache(): logging.debug("URL error: %s", str(e)) except Exception, e: logging.debug("Error: %s", str(e)) - i+=1 + i+=1 + + order_str = '' + if len(responselines) > 0: + order_str = responselines[0] if order_str == '': + if len(caches_list) == 0: + logging.error("unable to get list of caches") + return None # Unable to find a geo_ip server to use, return random choice from caches! - minsite = random.choice(caches_list)['name'] - random.shuffle(caches_list) - nearest_cache_list = [cache['name'] for cache in caches_list] + nearest_cache_list = caches_list + random.shuffle(nearest_cache_list) + minsite = nearest_cache_list[0] logging.warning("Unable to use Geoip to find closest cache! Returning random cache %s", minsite) - logging.debug("Ordered list of nearest caches: %s", str(nearest_cache_list)) + logging.debug("Randomized list of nearest caches: %s", str(nearest_cache_list)) return minsite else: # The order string should be something like: # 3,1,2 ordered_list = order_str.strip().split(",") logging.debug("Got order %s", str(ordered_list)) - minsite = caches_list[int(ordered_list[0])-1]['name'] + + if len(caches_list) == 0: + # Used the stashservers.dat api + + if len(responselines) < 8: + logging.error("stashservers response too short, less than 8 lines") + return None + hashname = responselines[4][-5:] + if hashname != "-sha1": + logging.error("stashservers response does not have sha1 hash: %s", hashname) + return None + hashedtext = '\n'.join(responselines[1:5]) + '\n' + hash = hashlib.sha1(hashedtext).hexdigest() + if responselines[6] != hash: + logging.debug("stashservers hash %s does not match expected hash %s", responselines[6], hash) + logging.debug("hashed text:\n%s", hashedtext) + logging.error("stashservers response hash does not match expected hash") + return None + + if not os.path.exists("/usr/bin/openssl"): + logging.debug("openssl not installed, skipping signature check") + else: + sig = '\n'.join(responselines[7:]) + + # Look for the OSG cvmfs public key to verify signature + prefix = os.environ.get("OSG_LOCATION", "/") + osgpub = 'opensciencegrid.org.pub' + pubkey_files = ['/etc/cvmfs/keys/opensciencegrid.org/' + osgpub, + os.path.join(prefix, "etc/stashcache", osgpub), + os.path.join(prefix, "usr/share/stashcache", osgpub)] + if resource_filename: + try: + pubkey_files.append(resource_filename(__name__, osgpub)) + except IOError as ioe: + logging.debug("Unable to retrieve caches.json using resource string, trying other locations") + + for pubkey_file in pubkey_files: + if os.path.isfile(pubkey_file): + break + else: + logging.error("Unable to find osg cvmfs key in %r", pubkey_files) + return None + + cmd = "/usr/bin/openssl rsautl -verify -pubin -inkey " + pubkey_file + logging.debug("Running %s", cmd) + p = subprocess.Popen(cmd, shell=True, + stdin=subprocess.PIPE, stdout=subprocess.PIPE) + p.stdin.write(sig) + p.stdin.close() + decryptedhash = p.stdout.read() + p.stdout.close() + if hash != decryptedhash: + logging.debug("stashservers hash %s does not match decrypted signature %s", hash, decryptedhash) + logging.error("stashservers signature does not verify") + return None + logging.debug("Signature matched") + + lists = responselines[4].split(';') + logging.debug("Cache lists: %s", lists) + + if cache_list_name == None: + caches = lists[0].split('=')[1] + else: + for l in lists: + n=len(cache_list_name)+1 + if l[0:n] == cache_list_name + '=': + caches = l[n:] + break + caches_list = caches.split(',') + for i in range(len(caches_list)): + caches_list[i] = 'root://' + caches_list[i] + + minsite = caches_list[int(ordered_list[0])-1] nearest_cache_list = [] for ordered_index in ordered_list: - nearest_cache_list.append(caches_list[int(ordered_index)-1]['name']) + nearest_cache_list.append(caches_list[int(ordered_index)-1]) logging.debug("Returning closest cache: %s", minsite) logging.debug("Ordered list of nearest caches: %s", str(nearest_cache_list)) @@ -663,6 +751,7 @@ def main(): global nearest_cache global nearest_cache_list global caches_json_location + global cache_list_name global token_location usage = "usage: %prog [options] source destination" @@ -671,7 +760,9 @@ def main(): parser.add_option('-r', dest='recursive', action='store_true', help='recursively copy') parser.add_option('--closest', action='store_true', help="Return the closest cache and exit") parser.add_option('-c', '--cache', dest='cache', help="Cache to use") - parser.add_option('-j', '--caches-json', dest='caches_json', help="The JSON file containing the list of caches", + parser.add_option('-j', '--caches-json', dest='caches_json', help="A JSON file containing the list of caches", + default=None) + parser.add_option('-n', '--cache-list-name', dest='cache_list_name', help="Name of pre-configured cache list to use", default=None) parser.add_option('--methods', dest='methods', help="Comma separated list of methods to try, in order. Default: cvmfs,xrootd,http", default="cvmfs,xrootd,http") parser.add_option('-t', '--token', dest='token', help="Token file to use for reading and/or writing") @@ -691,6 +782,7 @@ def main(): caches_json_location = os.environ['CACHES_JSON'] else: caches_json_location = args.caches_json + cache_list_name = args.cache_list_name if args.closest: print get_best_stashcache() sys.exit(0) diff --git a/stashcp/caches.json b/stashcp/caches.json deleted file mode 100755 index 2f0a372..0000000 --- a/stashcp/caches.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - {"name":"root://its-condor-xrootd1.syr.edu", "status":1, "longitude":-72.9333, "latitude":40.8167}, - {"name":"root://sc-cache.chtc.wisc.edu", "status":1, "longitude":-89.4012, "latitude":43.0731}, - {"name":"root://osg-new-york-stashcache.nrp.internet2.edu", "status":1, "longitude":-74.006, "latitude":40.7143}, - {"name":"root://osg-kansas-city-stashcache.nrp.internet2.edu", "status":1, "longitude":-94.5986, "latitude":39.1024}, - {"name":"root://osg-chicago-stashcache.nrp.internet2.edu", "status":1, "longitude":-87.6181, "latitude":41.8858}, - {"name":"root://fiona.uvalight.net", "status":1, "longitude":4.8995, "latitude":52.3824}, - {"name":"root://stashcache.t2.ucsd.edu", "status":1, "longitude":-117.2359, "latitude":32.8807}, - {"name":"root://osg-gftp.pace.gatech.edu", "status":1, "longitude":-84.398, "latitude":33.7763} -] diff --git a/stashcp/opensciencegrid.org.pub b/stashcp/opensciencegrid.org.pub new file mode 100644 index 0000000..3fbbbd1 --- /dev/null +++ b/stashcp/opensciencegrid.org.pub @@ -0,0 +1,10 @@ +-----BEGIN PUBLIC KEY----- +MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAqQGYXTp9cRcMbGeDoijB +gKNTCEpIWB7XcqIHVXJjfxEkycQXMyZkB7O0CvV3UmmY2K7CQqTnd9ddcApn7BqQ +/7QGP0H1jfXLfqVdwnhyjIHxmV2x8GIHRHFA0wE+DadQwoi1G0k0SNxOVS5qbdeV +yiyKsoU4JSqy5l2tK3K/RJE4htSruPCrRCK3xcN5nBeZK5gZd+/ufPIG+hd78kjQ +Dy3YQXwmEPm7kAZwIsEbMa0PNkp85IDkdR1GpvRvDMCRmUaRHrQUPBwPIjs0akL+ +qoTxJs9k6quV0g3Wd8z65s/k5mEZ+AnHHI0+0CL3y80wnuLSBYmw05YBtKyoa1Fb +FQIDAQAB +-----END PUBLIC KEY----- + From 40c839c62bd79df23f8effdc521f01d902d78749 Mon Sep 17 00:00:00 2001 From: Dave Dykstra <2129743+DrDaveD@users.noreply.github.com> Date: Tue, 14 Apr 2020 16:30:42 -0500 Subject: [PATCH 2/7] add comments explaining format and design choices for stashservers api --- stashcp/__init__.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/stashcp/__init__.py b/stashcp/__init__.py index 2222ca9..3de2adc 100755 --- a/stashcp/__init__.py +++ b/stashcp/__init__.py @@ -668,6 +668,34 @@ def get_best_stashcache(): if len(caches_list) == 0: # Used the stashservers.dat api + # After the geo order of the selected server list on line zero, + # the rest of the response is in .cvmfswhitelist format. + # This is done to avoid using https for every request on the + # wlcg-wpad servers and takes advantage of conveniently + # existing infrastructure. + # The format contains the following lines: + # 1. Creation date stamp, e.g. 20200414170005. For debugging + # only. + # 2. Expiration date stamp, e.g. E20200421170005. cvmfs clients + # check this to avoid replay attacks, but for this api that + # is not much of a risk so it is ignored. + # 3. "Repository" name, e.g. Nstash-servers. cvmfs clients + # also check this but it is not important here. + # 4. With cvmfs the 4th line has a repository fingerprint, but + # for this api it instead contains a semi-colon separated list + # of named server lists. Each server list is of the form + # name=servers where servers is comma-separated. Ends with + # "hash=-sha1" because cvmfs_server expects the hash name + # to be there. e.g. + # xroot=stashcache.t2.ucsd.edu,sg-gftp.pace.gatech.edu;xroots=xrootd-local.unl.edu,stashcache.t2.ucsd.edu;hash=-sha1 + # 5. A two-dash separator, i.e "--" + # 6. The sha1 hash of lines 1 through 4. + # 7. The signature, i.e. an RSA encryption of the hash that can + # be decrypted by the OSG cvmfs public key. Contains binary + # information so it may contain a variable number of newlines + # which would have caused it to have been split into multiple + # response "lines". + if len(responselines) < 8: logging.error("stashservers response too short, less than 8 lines") return None @@ -683,7 +711,12 @@ def get_best_stashcache(): logging.error("stashservers response hash does not match expected hash") return None + # Call out to /usr/bin/openssl if present, in order to avoid + # python dependency on a crypto package. if not os.path.exists("/usr/bin/openssl"): + # The signature check isn't critical to be done everywhere; + # any tampering will likely to be caught somewhere and + # investigated. Usually openssl is present. logging.debug("openssl not installed, skipping signature check") else: sig = '\n'.join(responselines[7:]) From 66aff89d2eab081da0678ff1fc10f962f523aca6 Mon Sep 17 00:00:00 2001 From: Dave Dykstra <2129743+DrDaveD@users.noreply.github.com> Date: Tue, 21 Apr 2020 12:08:17 -0500 Subject: [PATCH 3/7] add back caches.json in docs/configs --- docs/configs/caches.json | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 docs/configs/caches.json diff --git a/docs/configs/caches.json b/docs/configs/caches.json new file mode 100644 index 0000000..5dd6533 --- /dev/null +++ b/docs/configs/caches.json @@ -0,0 +1,10 @@ +[ + {"name":"root://its-condor-xrootd1.syr.edu", "status":1}, + {"name":"root://sc-cache.chtc.wisc.edu", "status":1}, + {"name":"root://osg-new-york-stashcache.nrp.internet2.edu", "status":1}, + {"name":"root://osg-kansas-city-stashcache.nrp.internet2.edu", "status":1}, + {"name":"root://osg-chicago-stashcache.nrp.internet2.edu", "status":1}, + {"name":"root://fiona.uvalight.net", "status":1, "longitude":4.8995}, + {"name":"root://stashcache.t2.ucsd.edu", "status":1}, + {"name":"root://osg-gftp.pace.gatech.edu", "status":1} +] From d0bd6a9fd419ded2c015ec768dd8d5000ad3ee87 Mon Sep 17 00:00:00 2001 From: Dave Dykstra <2129743+DrDaveD@users.noreply.github.com> Date: Mon, 27 Apr 2020 15:57:07 -0500 Subject: [PATCH 4/7] one more cleanup --- docs/configs/caches.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/configs/caches.json b/docs/configs/caches.json index 5dd6533..865f051 100644 --- a/docs/configs/caches.json +++ b/docs/configs/caches.json @@ -4,7 +4,7 @@ {"name":"root://osg-new-york-stashcache.nrp.internet2.edu", "status":1}, {"name":"root://osg-kansas-city-stashcache.nrp.internet2.edu", "status":1}, {"name":"root://osg-chicago-stashcache.nrp.internet2.edu", "status":1}, - {"name":"root://fiona.uvalight.net", "status":1, "longitude":4.8995}, + {"name":"root://fiona.uvalight.net", "status":1}, {"name":"root://stashcache.t2.ucsd.edu", "status":1}, {"name":"root://osg-gftp.pace.gatech.edu", "status":1} ] From 726e2609495e32ada9fcb3ba2010cdf2d052eb26 Mon Sep 17 00:00:00 2001 From: Dave Dykstra <2129743+DrDaveD@users.noreply.github.com> Date: Wed, 6 May 2020 14:29:42 -0500 Subject: [PATCH 5/7] move json loading and stashserver loading into functions --- stashcp/__init__.py | 275 +++++++++++++++++++++++--------------------- 1 file changed, 145 insertions(+), 130 deletions(-) diff --git a/stashcp/__init__.py b/stashcp/__init__.py index 3de2adc..c7fa5ee 100755 --- a/stashcp/__init__.py +++ b/stashcp/__init__.py @@ -563,6 +563,138 @@ def get_ips(name): # always prefer IPv4 return ipv4s + ipv6s + +# Return list of cache URLs +def get_json_caches(caches_json_location): + try: + with open(caches_json_location, 'r') as f: + caches_list = json.loads(f.read()) + logging.debug("Loaded caches list from %s", caches_json_location) + except: + logging.error("Unable to open or parse json in %s: %s", + caches_json_location, str(sys.exc_info()[1])) + return None + + usable_caches = [] + for cache in caches_list: + if 'status' in cache and cache['status'] == 0: + continue + if 'name' in cache: + usable_caches.append(cache['name']) + if len(usable_caches) == 0: + logging.error("No cache names found in %s without zero status", caches_json_location) + return None + + return usable_caches + + +# Return list of caches as root:// URLs +def get_stashservers_caches(responselines): + + # After the geo order of the selected server list on line zero, + # the rest of the response is in .cvmfswhitelist format. + # This is done to avoid using https for every request on the + # wlcg-wpad servers and takes advantage of conveniently + # existing infrastructure. + # The format contains the following lines: + # 1. Creation date stamp, e.g. 20200414170005. For debugging + # only. + # 2. Expiration date stamp, e.g. E20200421170005. cvmfs clients + # check this to avoid replay attacks, but for this api that + # is not much of a risk so it is ignored. + # 3. "Repository" name, e.g. Nstash-servers. cvmfs clients + # also check this but it is not important here. + # 4. With cvmfs the 4th line has a repository fingerprint, but + # for this api it instead contains a semi-colon separated list + # of named server lists. Each server list is of the form + # name=servers where servers is comma-separated. Ends with + # "hash=-sha1" because cvmfs_server expects the hash name + # to be there. e.g. + # xroot=stashcache.t2.ucsd.edu,sg-gftp.pace.gatech.edu;xroots=xrootd-local.unl.edu,stashcache.t2.ucsd.edu;hash=-sha1 + # 5. A two-dash separator, i.e "--" + # 6. The sha1 hash of lines 1 through 4. + # 7. The signature, i.e. an RSA encryption of the hash that can + # be decrypted by the OSG cvmfs public key. Contains binary + # information so it may contain a variable number of newlines + # which would have caused it to have been split into multiple + # response "lines". + + if len(responselines) < 8: + logging.error("stashservers response too short, less than 8 lines") + return None + hashname = responselines[4][-5:] + if hashname != "-sha1": + logging.error("stashservers response does not have sha1 hash: %s", hashname) + return None + hashedtext = '\n'.join(responselines[1:5]) + '\n' + hash = hashlib.sha1(hashedtext).hexdigest() + if responselines[6] != hash: + logging.debug("stashservers hash %s does not match expected hash %s", responselines[6], hash) + logging.debug("hashed text:\n%s", hashedtext) + logging.error("stashservers response hash does not match expected hash") + return None + + # Call out to /usr/bin/openssl if present, in order to avoid + # python dependency on a crypto package. + if not os.path.exists("/usr/bin/openssl"): + # The signature check isn't critical to be done everywhere; + # any tampering will likely to be caught somewhere and + # investigated. Usually openssl is present. + logging.debug("openssl not installed, skipping signature check") + else: + sig = '\n'.join(responselines[7:]) + + # Look for the OSG cvmfs public key to verify signature + prefix = os.environ.get("OSG_LOCATION", "/") + osgpub = 'opensciencegrid.org.pub' + pubkey_files = ['/etc/cvmfs/keys/opensciencegrid.org/' + osgpub, + os.path.join(prefix, "etc/stashcache", osgpub), + os.path.join(prefix, "usr/share/stashcache", osgpub)] + if resource_filename: + try: + pubkey_files.append(resource_filename(__name__, osgpub)) + except IOError as ioe: + logging.debug("Unable to retrieve caches.json using resource string, trying other locations") + + for pubkey_file in pubkey_files: + if os.path.isfile(pubkey_file): + break + else: + logging.error("Unable to find osg cvmfs key in %r", pubkey_files) + return None + + cmd = "/usr/bin/openssl rsautl -verify -pubin -inkey " + pubkey_file + logging.debug("Running %s", cmd) + p = subprocess.Popen(cmd, shell=True, + stdin=subprocess.PIPE, stdout=subprocess.PIPE) + p.stdin.write(sig) + p.stdin.close() + decryptedhash = p.stdout.read() + p.stdout.close() + if hash != decryptedhash: + logging.debug("stashservers hash %s does not match decrypted signature %s", hash, decryptedhash) + logging.error("stashservers signature does not verify") + return None + logging.debug("Signature matched") + + lists = responselines[4].split(';') + logging.debug("Cache lists: %s", lists) + + if cache_list_name == None: + caches = lists[0].split('=')[1] + else: + for l in lists: + n=len(cache_list_name)+1 + if l[0:n] == cache_list_name + '=': + caches = l[n:] + break + caches_list = caches.split(',') + for i in range(len(caches_list)): + caches_list[i] = 'root://' + caches_list[i] + + return caches_list + + # Return best stashcache and set nearest_cache_list global def get_best_stashcache(): global nearest_cache_list @@ -576,48 +708,29 @@ def get_best_stashcache(): # Randomize the geo ip sites random.shuffle(geo_ip_sites) - api_text = '' + api_text = "" caches_list = [] # Check if the user provided a caches json file location - if caches_json_location and os.path.exists(caches_json_location): - # Use geo ip api on caches in provided json file - try: - with open(caches_json_location, 'r') as f: - caches_list = json.loads(f.read()) - logging.debug("Loaded caches list from %s", caches_json_location) - except: - logging.error("Unable to open or parse json in %s: %s", - caches_json_location, str(sys.exc_info()[1])) + if caches_json_location: + if not os.path.exists(caches_json_location): + logging.error(caches_json_location + " does not exist") return None - - # Format the caches for the GeoIP query + # Use geo ip api on caches in provided json file + caches_list = get_json_caches(caches_json_location) caches_string = "" - usable_caches = [] for cache in caches_list: - if 'status' in cache and cache['status'] == 0: - continue - if 'name' in cache: - usable_caches.append(cache['name']) - parsed_url = urlparse(cache['name']) - caches_string = "%s,%s" % (caches_string, parsed_url.hostname) - if len(usable_caches) == 0: - logging.error("No cache names found in %s without zero status", caches_json_location) - return None - - caches_list = usable_caches - + parsed_url = urlparse(cache) + caches_string = "%s,%s" % (caches_string, parsed_url.hostname) # Remove the first comma caches_string = caches_string[1:] - api_text = "api/v1.0/geo/stashcp/" + caches_string - else: # Use stashservers.dat api api_text = "stashservers.dat" if cache_list_name != None: - api_text += '?list=' + cache_list_name + api_text += "?list=" + cache_list_name responselines = [] i = 0 @@ -644,11 +757,11 @@ def get_best_stashcache(): logging.debug("Error: %s", str(e)) i+=1 - order_str = '' + order_str = "" if len(responselines) > 0: order_str = responselines[0] - if order_str == '': + if order_str == "": if len(caches_list) == 0: logging.error("unable to get list of caches") return None @@ -667,107 +780,9 @@ def get_best_stashcache(): if len(caches_list) == 0: # Used the stashservers.dat api - - # After the geo order of the selected server list on line zero, - # the rest of the response is in .cvmfswhitelist format. - # This is done to avoid using https for every request on the - # wlcg-wpad servers and takes advantage of conveniently - # existing infrastructure. - # The format contains the following lines: - # 1. Creation date stamp, e.g. 20200414170005. For debugging - # only. - # 2. Expiration date stamp, e.g. E20200421170005. cvmfs clients - # check this to avoid replay attacks, but for this api that - # is not much of a risk so it is ignored. - # 3. "Repository" name, e.g. Nstash-servers. cvmfs clients - # also check this but it is not important here. - # 4. With cvmfs the 4th line has a repository fingerprint, but - # for this api it instead contains a semi-colon separated list - # of named server lists. Each server list is of the form - # name=servers where servers is comma-separated. Ends with - # "hash=-sha1" because cvmfs_server expects the hash name - # to be there. e.g. - # xroot=stashcache.t2.ucsd.edu,sg-gftp.pace.gatech.edu;xroots=xrootd-local.unl.edu,stashcache.t2.ucsd.edu;hash=-sha1 - # 5. A two-dash separator, i.e "--" - # 6. The sha1 hash of lines 1 through 4. - # 7. The signature, i.e. an RSA encryption of the hash that can - # be decrypted by the OSG cvmfs public key. Contains binary - # information so it may contain a variable number of newlines - # which would have caused it to have been split into multiple - # response "lines". - - if len(responselines) < 8: - logging.error("stashservers response too short, less than 8 lines") - return None - hashname = responselines[4][-5:] - if hashname != "-sha1": - logging.error("stashservers response does not have sha1 hash: %s", hashname) + caches_list = get_stashservers_caches(responselines) + if caches_list is None: return None - hashedtext = '\n'.join(responselines[1:5]) + '\n' - hash = hashlib.sha1(hashedtext).hexdigest() - if responselines[6] != hash: - logging.debug("stashservers hash %s does not match expected hash %s", responselines[6], hash) - logging.debug("hashed text:\n%s", hashedtext) - logging.error("stashservers response hash does not match expected hash") - return None - - # Call out to /usr/bin/openssl if present, in order to avoid - # python dependency on a crypto package. - if not os.path.exists("/usr/bin/openssl"): - # The signature check isn't critical to be done everywhere; - # any tampering will likely to be caught somewhere and - # investigated. Usually openssl is present. - logging.debug("openssl not installed, skipping signature check") - else: - sig = '\n'.join(responselines[7:]) - - # Look for the OSG cvmfs public key to verify signature - prefix = os.environ.get("OSG_LOCATION", "/") - osgpub = 'opensciencegrid.org.pub' - pubkey_files = ['/etc/cvmfs/keys/opensciencegrid.org/' + osgpub, - os.path.join(prefix, "etc/stashcache", osgpub), - os.path.join(prefix, "usr/share/stashcache", osgpub)] - if resource_filename: - try: - pubkey_files.append(resource_filename(__name__, osgpub)) - except IOError as ioe: - logging.debug("Unable to retrieve caches.json using resource string, trying other locations") - - for pubkey_file in pubkey_files: - if os.path.isfile(pubkey_file): - break - else: - logging.error("Unable to find osg cvmfs key in %r", pubkey_files) - return None - - cmd = "/usr/bin/openssl rsautl -verify -pubin -inkey " + pubkey_file - logging.debug("Running %s", cmd) - p = subprocess.Popen(cmd, shell=True, - stdin=subprocess.PIPE, stdout=subprocess.PIPE) - p.stdin.write(sig) - p.stdin.close() - decryptedhash = p.stdout.read() - p.stdout.close() - if hash != decryptedhash: - logging.debug("stashservers hash %s does not match decrypted signature %s", hash, decryptedhash) - logging.error("stashservers signature does not verify") - return None - logging.debug("Signature matched") - - lists = responselines[4].split(';') - logging.debug("Cache lists: %s", lists) - - if cache_list_name == None: - caches = lists[0].split('=')[1] - else: - for l in lists: - n=len(cache_list_name)+1 - if l[0:n] == cache_list_name + '=': - caches = l[n:] - break - caches_list = caches.split(',') - for i in range(len(caches_list)): - caches_list[i] = 'root://' + caches_list[i] minsite = caches_list[int(ordered_list[0])-1] From 20288731ef3286c462cf990620271e5bfc916087 Mon Sep 17 00:00:00 2001 From: Dave Dykstra <2129743+DrDaveD@users.noreply.github.com> Date: Wed, 6 May 2020 14:52:37 -0500 Subject: [PATCH 6/7] add --list-names option to print pre-configured list names --- stashcp/__init__.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/stashcp/__init__.py b/stashcp/__init__.py index c7fa5ee..c736d6b 100755 --- a/stashcp/__init__.py +++ b/stashcp/__init__.py @@ -48,6 +48,9 @@ # Global variable for the location of the token to use for reading / writing token_location = None +# Global variable to print names of cache lists +print_cache_list_names = False + TIMEOUT = 300 DIFF = TIMEOUT * 10 @@ -680,6 +683,15 @@ def get_stashservers_caches(responselines): lists = responselines[4].split(';') logging.debug("Cache lists: %s", lists) + if print_cache_list_names: + names = "" + # skip hash at the end + for l in lists[0:-1]: + names = names + ',' + l.split('=')[0] + # skip leading comma + print(names[1:]) + sys.exit(0) + if cache_list_name == None: caches = lists[0].split('=')[1] else: @@ -812,6 +824,7 @@ def main(): default=None) parser.add_option('-n', '--cache-list-name', dest='cache_list_name', help="Name of pre-configured cache list to use", default=None) + parser.add_option('--list-names', dest='list_names', action='store_true', help="Return the names of pre-configured cache lists and exit (first one is default for -n)") parser.add_option('--methods', dest='methods', help="Comma separated list of methods to try, in order. Default: cvmfs,xrootd,http", default="cvmfs,xrootd,http") parser.add_option('-t', '--token', dest='token', help="Token file to use for reading and/or writing") args,opts=parser.parse_args() @@ -826,12 +839,19 @@ def main(): else: logger.setLevel(logging.WARNING) + if args.list_names: + global print_cache_list_names + print_cache_list_names = True + get_best_stashcache() + # does not return + if 'CACHES_JSON' in os.environ: caches_json_location = os.environ['CACHES_JSON'] else: caches_json_location = args.caches_json + cache_list_name = args.cache_list_name - if args.closest: + if args.closest or args.list_names: print get_best_stashcache() sys.exit(0) From 5423192f20303859b2f8ebfc9ed1c7523a550046 Mon Sep 17 00:00:00 2001 From: Dave Dykstra <2129743+DrDaveD@users.noreply.github.com> Date: Wed, 6 May 2020 15:51:39 -0500 Subject: [PATCH 7/7] print usage if no parameters are given --- stashcp/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/stashcp/__init__.py b/stashcp/__init__.py index c736d6b..dc04e57 100755 --- a/stashcp/__init__.py +++ b/stashcp/__init__.py @@ -856,7 +856,9 @@ def main(): sys.exit(0) if len(opts) != 2: - parser.error('Source and Destination must be specified on command line') + logging.error('Source and Destination must be specified on command line') + parser.print_help() + sys.exit(1) else: source=opts[0] destination=opts[1]