From 5054e882194adae4b76681e78c45d41ae2c2f0f7 Mon Sep 17 00:00:00 2001 From: Matthew Horton Date: Sat, 4 Sep 2021 15:36:54 -0700 Subject: [PATCH 01/12] Allow `PBar` to accept any kwargs (e.g. those used by `tqdm`) --- pymatgen/util/sequence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pymatgen/util/sequence.py b/pymatgen/util/sequence.py index 995996e082d..22261859d02 100644 --- a/pymatgen/util/sequence.py +++ b/pymatgen/util/sequence.py @@ -28,7 +28,7 @@ class PBarSafe: Progress bar. """ - def __init__(self, total): + def __init__(self, total, **kwargs): """ Args: total (): Total value. From 5cd5fe7bb66f0ca3ae662ebdbcaf192ccc76badb Mon Sep 17 00:00:00 2001 From: Matthew Horton Date: Sat, 4 Sep 2021 15:37:58 -0700 Subject: [PATCH 02/12] Update OptimadeRester to allow querying multiple providers --- pymatgen/ext/optimade.py | 233 ++++++++++++++++++++++------ pymatgen/ext/tests/test_optimade.py | 8 +- 2 files changed, 191 insertions(+), 50 deletions(-) diff --git a/pymatgen/ext/optimade.py b/pymatgen/ext/optimade.py index 6fd1f5d87a9..13aacdfb357 100644 --- a/pymatgen/ext/optimade.py +++ b/pymatgen/ext/optimade.py @@ -3,14 +3,25 @@ """ from collections import namedtuple -from typing import Dict +from typing import Dict, Union, List, Optional +from urllib.parse import urlparse +import logging import requests +import sys from pymatgen.core.periodic_table import DummySpecies from pymatgen.core.structure import Structure from pymatgen.util.sequence import PBar +# TODO: importing optimade-python-tool's data structures will make more sense +Provider = namedtuple("Provider", ["name", "base_url", "description", "homepage", "prefix"]) + +_logger = logging.getLogger(__name__) +_handler = logging.StreamHandler(sys.stdout) +_logger.addHandler(_handler) +_logger.setLevel(logging.WARNING) + class OptimadeRester: """ @@ -43,7 +54,7 @@ class OptimadeRester: "tcod": "https://www.crystallography.net/tcod/optimade", } - def __init__(self, alias_or_structure_resource_url="mp"): + def __init__(self, aliases_or_resource_urls: Optional[Union[str, List[str]]] = None, timeout=5): """ OPTIMADE is an effort to provide a standardized interface to retrieve information from many different materials science databases. @@ -66,19 +77,67 @@ def __init__(self, alias_or_structure_resource_url="mp"): To refresh this list of aliases, generated from the current list of OPTIMADE providers at optimade.org, call the refresh_aliases() method. + This interface is maintained by @mkhorton, please contact him directly with bug reports + or open an Issue in the pymatgen repository. 
+ Args: - alias_or_structure_resource_url: the alias or structure resource URL + aliases_or_resource_urls: the alias or structure resource URL or a list of + aliases or resource URLs, if providing the resource URL directly it should not + be an index, this interface can only currently access the "v1/structures" + information from the specified resource URL + timeout: number of seconds before an attempted request is abandoned, a good + timeout is useful when querying many providers, some of which may be offline """ # TODO: maybe we should use the nice pydantic models from optimade-python-tools # for response validation, and use the Lark parser for filter validation self.session = requests.Session() - self._timeout = 10 # seconds + self._timeout = timeout # seconds + + if isinstance(aliases_or_resource_urls, str): + aliases_or_resource_urls = [aliases_or_resource_urls] + + # this stores a dictionary with keys provider id (in the same format as the aliases) + # and values as the corresponding URL + self.resources = {} + + if not aliases_or_resource_urls: + aliases_or_resource_urls = list(self.aliases.keys()) + _logger.warning( + "Connecting to all known OPTIMADE providers, this will be slow. Please connect to only the " + f"OPTIMADE providers you want to query. Choose from: {', '.join(self.aliases.keys())}" + ) + + for alias_or_resource_url in aliases_or_resource_urls: + + if alias_or_resource_url in self.aliases: + self.resources[alias_or_resource_url] = self.aliases[alias_or_resource_url] + + elif self._validate_provider(alias_or_resource_url): + + # TODO: unclear what the key should be here, the "prefix" is for the root provider, + # may need to walk back to the index for the given provider to find the correct identifier + + self.resources[alias_or_resource_url] = alias_or_resource_url + + else: + _logger.warning(f"The following is not a known alias or valid url: {alias_or_resource_url}") + + self._providers = {url: self._validate_provider(provider_url=url) for url in self.resources.values()} + + def __repr__(self): + return f"OptimadeRester connected to: {', '.join(self.resources.values())}" - if alias_or_structure_resource_url in self.aliases: - self.resource = self.aliases[alias_or_structure_resource_url] - else: - self.resource = alias_or_structure_resource_url + def __str__(self): + return self.describe() + + def describe(self): + """ + Provides human-readable information about the resources being searched by the OptimadeRester. + """ + provider_text = "\n".join(map(str, (provider for provider in self._providers.values() if provider))) + description = f"OptimadeRester connected to:\n{provider_text}" + return description @staticmethod def _build_filter( @@ -117,12 +176,7 @@ def _build_filter( return " AND ".join(filters) def get_structures( - self, - elements=None, - nelements=None, - nsites=None, - chemical_formula_anonymous=None, - chemical_formula_hill=None, + self, elements=None, nelements=None, nsites=None, chemical_formula_anonymous=None, chemical_formula_hill=None, ) -> Dict[str, Structure]: """ Retrieve structures from the OPTIMADE database. 
@@ -160,28 +214,52 @@ def get_structures_with_filter(self, optimade_filter: str) -> Dict[str, Structur Returns: Dict of Structures keyed by that database's id system """ - fields = "response_fields=lattice_vectors,cartesian_site_positions,species,species_at_sites" + all_structures = {} + + for identifier, resource in self.resources.items(): - url = f"{self.resource}/v1/structures?filter={optimade_filter}&fields={fields}" + fields = "response_fields=lattice_vectors,cartesian_site_positions,species,species_at_sites" - json = self.session.get(url, timeout=self._timeout).json() + url = f"{resource}/v1/structures?filter={optimade_filter}&fields={fields}" - structures = self._get_structures_from_resource(json) + try: - if "next" in json["links"] and json["links"]["next"]: - pbar = PBar(total=json["meta"].get("data_returned")) - while "next" in json["links"] and json["links"]["next"]: - json = self.session.get(json["links"]["next"], timeout=self._timeout).json() - structures.update(self._get_structures_from_resource(json)) - pbar.update(len(structures)) + json = self.session.get(url, timeout=self._timeout).json() - return structures + structures = self._get_structures_from_resource(json, url) + + pbar = PBar(total=json["meta"].get("data_returned", 0), desc=identifier, initial=len(structures)) + + # TODO: check spec for `more_data_available` boolean, may simplify this conditional + if ("links" in json) and ("next" in json["links"]) and (json["links"]["next"]): + while "next" in json["links"] and json["links"]["next"]: + next_link = json["links"]["next"] + if isinstance(next_link, dict) and "href" in next_link: + next_link = next_link["href"] + json = self.session.get(next_link, timeout=self._timeout).json() + additional_strcutures = self._get_structures_from_resource(json, url) + structures.update(additional_strcutures) + pbar.update(len(additional_strcutures)) + + if structures: + + all_structures[identifier] = structures + + except Exception as exc: + + # TODO: manually inspect failures to either (a) correct a bug or (b) raise more appropriate error + + _logger.warning(f"Could not retrieve required information from provider ({identifier}): {exc}") + + return all_structures @staticmethod - def _get_structures_from_resource(json): + def _get_structures_from_resource(json, url): structures = {} + exceptions = set() + def _sanitize_symbol(symbol): if symbol == "vacancy": symbol = DummySpecies("X_vacancy", oxidation_state=None) @@ -219,30 +297,83 @@ def _get_comp(sp_dict): coords_are_cartesian=True, ) structures[data["id"]] = structure - except Exception: - pass + + except Exception as exc: + if str(exc) not in exceptions: + exceptions.add(str(exc)) + + if exceptions: + _logger.warning(f'Failed to parse returned data for {url}: {", ".join(exceptions)}') return structures - def refresh_aliases(self, providers_url="https://providers.optimade.org/providers.json"): + def _validate_provider(self, provider_url) -> Optional[Provider]: """ - Updates available OPTIMADE structure resources based on the current list of OPTIMADE - providers. + Checks that a given URL is indeed an OPTIMADE provider, + returning None if it is not a provider, or the provider + prefix if it is. 
+ + TODO: careful reading of OPTIMADE specification required + TODO: add better exception handling, intentionally permissive currently """ - json = self.session.get(url=providers_url, timeout=self._timeout).json() - providers_from_url = { - entry["id"]: entry["attributes"]["base_url"] for entry in json["data"] if entry["attributes"]["base_url"] - } - providers = {} - for provider, link in providers_from_url.items(): + def is_url(url): + """ + Basic URL validation thanks to https://stackoverflow.com/a/52455972 + """ try: - providers[provider] = self.session.get(f"{link}/v1/links", timeout=self._timeout).json() - except Exception as exc: - print(f"Failed to parse {provider} at {link}: {exc}") + result = urlparse(url) + return all([result.scheme, result.netloc]) + except ValueError: + return False + + if not is_url(provider_url): + _logger.warning(f"An invalid url was supplied: {provider_url}") + return None + + try: + provider_info_json = self.session.get(f"{provider_url}/v1/info", timeout=self._timeout).json() + except Exception as exc: + _logger.warning(f"Failed to parse {provider_url}: {exc}") + return None + + try: + return Provider( + name=provider_info_json["meta"]["provider"]["name"], + base_url=provider_url, + description=provider_info_json["meta"]["provider"]["description"], + homepage=provider_info_json["meta"]["provider"].get("homepage"), + prefix=provider_info_json["meta"]["provider"]["prefix"], + ) + except Exception as exc: + _logger.warning(f"Failed to extract required information from {provider_url}: {exc}") + return None + + def _parse_provider(self, provider, provider_url) -> Dict[str, Provider]: + """ + Used internally to update the list of providers or to + check a given URL is valid. + + It does not raise exceptions but will instead _logger.warning and provide + an empty dictionary in the case of invalid data. + + In future, when the specification is sufficiently well adopted, + we might be more strict here. - # TODO: importing optimade-python-tool's data structures will make more sense - Provider = namedtuple("Provider", ["name", "base_url", "description", "homepage"]) + Args: + provider: the provider prefix + provider_url: An OPTIMADE provider URL + + Returns: + A dictionary of keys (in format of "provider.database") to + Provider objects. + """ + + try: + provider_link_json = self.session.get(f"{provider_url}/v1/links", timeout=self._timeout).json() + except Exception as exc: + _logger.warning(f"Failed to parse {provider_url}: {exc}") + return {} def _parse_provider_link(provider, provider_link_json): """No validation attempted.""" @@ -257,6 +388,7 @@ def _parse_provider_link(provider, provider_link_json): base_url=link["attributes"]["base_url"], description=link["attributes"]["description"], homepage=link["attributes"].get("homepage"), + prefix=link["attributes"].get("prefix"), ) except Exception: # print(f"Failed to parse {provider}: {exc}") @@ -264,12 +396,25 @@ def _parse_provider_link(provider, provider_link_json): pass return ps + return _parse_provider_link(provider, provider_link_json) + + def refresh_aliases(self, providers_url="https://providers.optimade.org/providers.json"): + """ + Updates available OPTIMADE structure resources based on the current list of OPTIMADE + providers. 
+ """ + json = self.session.get(url=providers_url, timeout=self._timeout).json() + providers_from_url = { + entry["id"]: entry["attributes"]["base_url"] for entry in json["data"] if entry["attributes"]["base_url"] + } + structure_providers = {} - for provider, provider_link_json in providers.items(): - structure_providers.update(_parse_provider_link(provider, provider_link_json)) + for provider, provider_link in providers_from_url.items(): + structure_providers.update(self._parse_provider(provider, provider_link)) self.aliases = {alias: provider.base_url for alias, provider in structure_providers.items()} + # TODO: revisit context manager logic here and in MPRester def __enter__(self): """ Support for "with" context. diff --git a/pymatgen/ext/tests/test_optimade.py b/pymatgen/ext/tests/test_optimade.py index c377993ff1d..fce575b3123 100644 --- a/pymatgen/ext/tests/test_optimade.py +++ b/pymatgen/ext/tests/test_optimade.py @@ -13,7 +13,7 @@ def test_get_structures_mp(self): structs = optimade.get_structures(elements=["Ga", "N"], nelements=2) - test_struct = next(iter(structs.values())) + test_struct = next(iter(structs["mp"].values())) self.assertEqual([str(el) for el in test_struct.types_of_species], ["Ga", "N"]) @@ -23,7 +23,7 @@ def test_get_structures_mcloud_2dstructures(self): structs = optimade.get_structures(elements=["B", "N"], nelements=2) - test_struct = next(iter(structs.values())) + test_struct = next(iter(structs["mcloud.2dstructures"].values())) self.assertEqual([str(el) for el in test_struct.types_of_species], ["B", "N"]) @@ -33,7 +33,3 @@ def test_update_aliases(self): optimade.refresh_aliases() self.assertIn("mp", optimade.aliases) - - from pprint import pprint - - pprint(optimade.aliases) From 039e752efb443e74d382acadf192010a981e5aac Mon Sep 17 00:00:00 2001 From: Matthew Horton Date: Sat, 4 Sep 2021 15:41:50 -0700 Subject: [PATCH 03/12] Correct typo --- pymatgen/ext/optimade.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pymatgen/ext/optimade.py b/pymatgen/ext/optimade.py index 13aacdfb357..7d591c45326 100644 --- a/pymatgen/ext/optimade.py +++ b/pymatgen/ext/optimade.py @@ -237,9 +237,9 @@ def get_structures_with_filter(self, optimade_filter: str) -> Dict[str, Structur if isinstance(next_link, dict) and "href" in next_link: next_link = next_link["href"] json = self.session.get(next_link, timeout=self._timeout).json() - additional_strcutures = self._get_structures_from_resource(json, url) - structures.update(additional_strcutures) - pbar.update(len(additional_strcutures)) + additional_structures = self._get_structures_from_resource(json, url) + structures.update(additional_structures) + pbar.update(len(additional_structures)) if structures: From ef03743e564d226ed14da793967045680d68a02a Mon Sep 17 00:00:00 2001 From: Matthew Horton Date: Sat, 4 Sep 2021 16:03:24 -0700 Subject: [PATCH 04/12] Use `urljoin`, be more permissive with missing metadata --- pymatgen/ext/optimade.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/pymatgen/ext/optimade.py b/pymatgen/ext/optimade.py index 7d591c45326..fb1e12fdf8d 100644 --- a/pymatgen/ext/optimade.py +++ b/pymatgen/ext/optimade.py @@ -4,7 +4,7 @@ from collections import namedtuple from typing import Dict, Union, List, Optional -from urllib.parse import urlparse +from urllib.parse import urlparse, urljoin import logging import requests @@ -220,7 +220,7 @@ def get_structures_with_filter(self, optimade_filter: str) -> Dict[str, Structur fields = 
"response_fields=lattice_vectors,cartesian_site_positions,species,species_at_sites" - url = f"{resource}/v1/structures?filter={optimade_filter}&fields={fields}" + url = urljoin(resource, f"/v1/structures?filter={optimade_filter}&fields={fields}") try: @@ -249,7 +249,9 @@ def get_structures_with_filter(self, optimade_filter: str) -> Dict[str, Structur # TODO: manually inspect failures to either (a) correct a bug or (b) raise more appropriate error - _logger.warning(f"Could not retrieve required information from provider ({identifier}): {exc}") + _logger.warning( + f"Could not retrieve required information from provider {identifier} and url {url}: {exc}" + ) return all_structures @@ -332,21 +334,21 @@ def is_url(url): return None try: - provider_info_json = self.session.get(f"{provider_url}/v1/info", timeout=self._timeout).json() + provider_info_json = self.session.get(urljoin(provider_url, "/v1/info"), timeout=self._timeout).json() except Exception as exc: _logger.warning(f"Failed to parse {provider_url}: {exc}") return None try: return Provider( - name=provider_info_json["meta"]["provider"]["name"], + name=provider_info_json["meta"].get("provider", {}).get("name", "Unknown"), base_url=provider_url, - description=provider_info_json["meta"]["provider"]["description"], - homepage=provider_info_json["meta"]["provider"].get("homepage"), - prefix=provider_info_json["meta"]["provider"]["prefix"], + description=provider_info_json["meta"].get("provider", {}).get("description", "Unknown"), + homepage=provider_info_json["meta"].get("provider", {}).get("homepage"), + prefix=provider_info_json["meta"].get("provider", {}).get("prefix", "Unknown"), ) except Exception as exc: - _logger.warning(f"Failed to extract required information from {provider_url}: {exc}") + _logger.warning(f"Failed to extract required information from {urljoin(provider_url, '/v1/info')}: {exc}") return None def _parse_provider(self, provider, provider_url) -> Dict[str, Provider]: @@ -370,7 +372,7 @@ def _parse_provider(self, provider, provider_url) -> Dict[str, Provider]: """ try: - provider_link_json = self.session.get(f"{provider_url}/v1/links", timeout=self._timeout).json() + provider_link_json = self.session.get(urljoin(provider_url, "/v1/links"), timeout=self._timeout).json() except Exception as exc: _logger.warning(f"Failed to parse {provider_url}: {exc}") return {} From 415db1be0925ff8cbd367297eb206fa7325553e9 Mon Sep 17 00:00:00 2001 From: Matthew Horton Date: Sat, 4 Sep 2021 16:05:47 -0700 Subject: [PATCH 05/12] Change import order --- pymatgen/ext/optimade.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pymatgen/ext/optimade.py b/pymatgen/ext/optimade.py index fb1e12fdf8d..45410453bed 100644 --- a/pymatgen/ext/optimade.py +++ b/pymatgen/ext/optimade.py @@ -2,13 +2,13 @@ Optimade support. 
""" +import logging +import sys from collections import namedtuple from typing import Dict, Union, List, Optional from urllib.parse import urlparse, urljoin -import logging import requests -import sys from pymatgen.core.periodic_table import DummySpecies from pymatgen.core.structure import Structure From 59cc2e5fc597d8b9a29d32ac15ba0fc9a7ec560c Mon Sep 17 00:00:00 2001 From: Matthew Horton Date: Sat, 4 Sep 2021 16:13:22 -0700 Subject: [PATCH 06/12] Make error message more clear --- pymatgen/ext/optimade.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pymatgen/ext/optimade.py b/pymatgen/ext/optimade.py index 45410453bed..a47021634d9 100644 --- a/pymatgen/ext/optimade.py +++ b/pymatgen/ext/optimade.py @@ -121,7 +121,7 @@ def __init__(self, aliases_or_resource_urls: Optional[Union[str, List[str]]] = N self.resources[alias_or_resource_url] = alias_or_resource_url else: - _logger.warning(f"The following is not a known alias or valid url: {alias_or_resource_url}") + _logger.error(f"The following is not a known alias or a valid url: {alias_or_resource_url}") self._providers = {url: self._validate_provider(provider_url=url) for url in self.resources.values()} @@ -220,7 +220,7 @@ def get_structures_with_filter(self, optimade_filter: str) -> Dict[str, Structur fields = "response_fields=lattice_vectors,cartesian_site_positions,species,species_at_sites" - url = urljoin(resource, f"/v1/structures?filter={optimade_filter}&fields={fields}") + url = urljoin(resource, f"v1/structures?filter={optimade_filter}&fields={fields}") try: @@ -249,7 +249,7 @@ def get_structures_with_filter(self, optimade_filter: str) -> Dict[str, Structur # TODO: manually inspect failures to either (a) correct a bug or (b) raise more appropriate error - _logger.warning( + _logger.error( f"Could not retrieve required information from provider {identifier} and url {url}: {exc}" ) @@ -305,7 +305,7 @@ def _get_comp(sp_dict): exceptions.add(str(exc)) if exceptions: - _logger.warning(f'Failed to parse returned data for {url}: {", ".join(exceptions)}') + _logger.error(f'Failed to parse returned data for {url}: {", ".join(exceptions)}') return structures @@ -334,9 +334,9 @@ def is_url(url): return None try: - provider_info_json = self.session.get(urljoin(provider_url, "/v1/info"), timeout=self._timeout).json() + provider_info_json = self.session.get(urljoin(provider_url, "v1/info"), timeout=self._timeout).json() except Exception as exc: - _logger.warning(f"Failed to parse {provider_url}: {exc}") + _logger.warning(f"Failed to parse {urljoin(provider_url, 'v1/info')} when validating: {exc}") return None try: @@ -348,7 +348,7 @@ def is_url(url): prefix=provider_info_json["meta"].get("provider", {}).get("prefix", "Unknown"), ) except Exception as exc: - _logger.warning(f"Failed to extract required information from {urljoin(provider_url, '/v1/info')}: {exc}") + _logger.warning(f"Failed to extract required information from {urljoin(provider_url, 'v1/info')}: {exc}") return None def _parse_provider(self, provider, provider_url) -> Dict[str, Provider]: @@ -372,9 +372,9 @@ def _parse_provider(self, provider, provider_url) -> Dict[str, Provider]: """ try: - provider_link_json = self.session.get(urljoin(provider_url, "/v1/links"), timeout=self._timeout).json() + provider_link_json = self.session.get(urljoin(provider_url, "v1/links"), timeout=self._timeout).json() except Exception as exc: - _logger.warning(f"Failed to parse {provider_url}: {exc}") + _logger.error(f"Failed to parse {urljoin(provider_url, 
'v1/links')} when following links: {exc}") return {} def _parse_provider_link(provider, provider_link_json): From d370714a0287b1caeafb4732e0fdd9ac0d8568a5 Mon Sep 17 00:00:00 2001 From: Matthew Horton Date: Sat, 4 Sep 2021 16:23:35 -0700 Subject: [PATCH 07/12] Automatically quote URLs --- pymatgen/ext/optimade.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/pymatgen/ext/optimade.py b/pymatgen/ext/optimade.py index a47021634d9..3bd80822cd2 100644 --- a/pymatgen/ext/optimade.py +++ b/pymatgen/ext/optimade.py @@ -6,7 +6,7 @@ import sys from collections import namedtuple from typing import Dict, Union, List, Optional -from urllib.parse import urlparse, urljoin +from urllib.parse import urlparse, urljoin, quote import requests @@ -220,7 +220,7 @@ def get_structures_with_filter(self, optimade_filter: str) -> Dict[str, Structur fields = "response_fields=lattice_vectors,cartesian_site_positions,species,species_at_sites" - url = urljoin(resource, f"v1/structures?filter={optimade_filter}&fields={fields}") + url = quote(urljoin(resource, f"v1/structures?filter={optimade_filter}&fields={fields}"), safe=":/") try: @@ -334,9 +334,13 @@ def is_url(url): return None try: - provider_info_json = self.session.get(urljoin(provider_url, "v1/info"), timeout=self._timeout).json() + provider_info_json = self.session.get( + quote(urljoin(provider_url, "v1/info"), safe=":/"), timeout=self._timeout + ).json() except Exception as exc: - _logger.warning(f"Failed to parse {urljoin(provider_url, 'v1/info')} when validating: {exc}") + _logger.warning( + f"Failed to parse {quote(urljoin(provider_url, 'v1/info'), safe=':/')} when validating: {exc}" + ) return None try: @@ -348,7 +352,9 @@ def is_url(url): prefix=provider_info_json["meta"].get("provider", {}).get("prefix", "Unknown"), ) except Exception as exc: - _logger.warning(f"Failed to extract required information from {urljoin(provider_url, 'v1/info')}: {exc}") + _logger.warning( + f"Failed to extract required information from {quote(urljoin(provider_url, 'v1/info'), safe=':/')}: {exc}" + ) return None def _parse_provider(self, provider, provider_url) -> Dict[str, Provider]: @@ -372,9 +378,13 @@ def _parse_provider(self, provider, provider_url) -> Dict[str, Provider]: """ try: - provider_link_json = self.session.get(urljoin(provider_url, "v1/links"), timeout=self._timeout).json() + provider_link_json = self.session.get( + quote(urljoin(provider_url, "v1/links"), safe=":/"), timeout=self._timeout + ).json() except Exception as exc: - _logger.error(f"Failed to parse {urljoin(provider_url, 'v1/links')} when following links: {exc}") + _logger.error( + f"Failed to parse {quote(urljoin(provider_url, 'v1/links'), safe=':/')} when following links: {exc}" + ) return {} def _parse_provider_link(provider, provider_link_json): From d7a8867dffddf93495b7fa6c9f842e24c3892515 Mon Sep 17 00:00:00 2001 From: Matthew Horton Date: Sat, 4 Sep 2021 16:26:46 -0700 Subject: [PATCH 08/12] Revert "Automatically quote URLs" This reverts commit d370714a0287b1caeafb4732e0fdd9ac0d8568a5. 
--- pymatgen/ext/optimade.py | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/pymatgen/ext/optimade.py b/pymatgen/ext/optimade.py index 3bd80822cd2..a47021634d9 100644 --- a/pymatgen/ext/optimade.py +++ b/pymatgen/ext/optimade.py @@ -6,7 +6,7 @@ import sys from collections import namedtuple from typing import Dict, Union, List, Optional -from urllib.parse import urlparse, urljoin, quote +from urllib.parse import urlparse, urljoin import requests @@ -220,7 +220,7 @@ def get_structures_with_filter(self, optimade_filter: str) -> Dict[str, Structur fields = "response_fields=lattice_vectors,cartesian_site_positions,species,species_at_sites" - url = quote(urljoin(resource, f"v1/structures?filter={optimade_filter}&fields={fields}"), safe=":/") + url = urljoin(resource, f"v1/structures?filter={optimade_filter}&fields={fields}") try: @@ -334,13 +334,9 @@ def is_url(url): return None try: - provider_info_json = self.session.get( - quote(urljoin(provider_url, "v1/info"), safe=":/"), timeout=self._timeout - ).json() + provider_info_json = self.session.get(urljoin(provider_url, "v1/info"), timeout=self._timeout).json() except Exception as exc: - _logger.warning( - f"Failed to parse {quote(urljoin(provider_url, 'v1/info'), safe=':/')} when validating: {exc}" - ) + _logger.warning(f"Failed to parse {urljoin(provider_url, 'v1/info')} when validating: {exc}") return None try: @@ -352,9 +348,7 @@ def is_url(url): prefix=provider_info_json["meta"].get("provider", {}).get("prefix", "Unknown"), ) except Exception as exc: - _logger.warning( - f"Failed to extract required information from {quote(urljoin(provider_url, 'v1/info'), safe=':/')}: {exc}" - ) + _logger.warning(f"Failed to extract required information from {urljoin(provider_url, 'v1/info')}: {exc}") return None def _parse_provider(self, provider, provider_url) -> Dict[str, Provider]: @@ -378,13 +372,9 @@ def _parse_provider(self, provider, provider_url) -> Dict[str, Provider]: """ try: - provider_link_json = self.session.get( - quote(urljoin(provider_url, "v1/links"), safe=":/"), timeout=self._timeout - ).json() + provider_link_json = self.session.get(urljoin(provider_url, "v1/links"), timeout=self._timeout).json() except Exception as exc: - _logger.error( - f"Failed to parse {quote(urljoin(provider_url, 'v1/links'), safe=':/')} when following links: {exc}" - ) + _logger.error(f"Failed to parse {urljoin(provider_url, 'v1/links')} when following links: {exc}") return {} def _parse_provider_link(provider, provider_link_json): From b88057e06d0a1233087a2da6c6f42ee3fb7f5e84 Mon Sep 17 00:00:00 2001 From: Matthew Horton Date: Sat, 4 Sep 2021 16:32:08 -0700 Subject: [PATCH 09/12] Tidy up --- pymatgen/ext/optimade.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pymatgen/ext/optimade.py b/pymatgen/ext/optimade.py index a47021634d9..c803421b735 100644 --- a/pymatgen/ext/optimade.py +++ b/pymatgen/ext/optimade.py @@ -334,9 +334,10 @@ def is_url(url): return None try: - provider_info_json = self.session.get(urljoin(provider_url, "v1/info"), timeout=self._timeout).json() + url = urljoin(provider_url, "v1/info") + provider_info_json = self.session.get(url, timeout=self._timeout).json() except Exception as exc: - _logger.warning(f"Failed to parse {urljoin(provider_url, 'v1/info')} when validating: {exc}") + _logger.warning(f"Failed to parse {url} when validating: {exc}") return None try: @@ -348,7 +349,7 @@ def is_url(url): prefix=provider_info_json["meta"].get("provider", {}).get("prefix", 
"Unknown"), ) except Exception as exc: - _logger.warning(f"Failed to extract required information from {urljoin(provider_url, 'v1/info')}: {exc}") + _logger.warning(f"Failed to extract required information from {url}: {exc}") return None def _parse_provider(self, provider, provider_url) -> Dict[str, Provider]: @@ -372,9 +373,10 @@ def _parse_provider(self, provider, provider_url) -> Dict[str, Provider]: """ try: - provider_link_json = self.session.get(urljoin(provider_url, "v1/links"), timeout=self._timeout).json() + url = urljoin(provider_url, "v1/links") + provider_link_json = self.session.get(url, timeout=self._timeout).json() except Exception as exc: - _logger.error(f"Failed to parse {urljoin(provider_url, 'v1/links')} when following links: {exc}") + _logger.error(f"Failed to parse {url} when following links: {exc}") return {} def _parse_provider_link(provider, provider_link_json): From 38b7e4692b850145c95bfdeec975831918180902 Mon Sep 17 00:00:00 2001 From: Matthew Horton Date: Sat, 4 Sep 2021 16:34:51 -0700 Subject: [PATCH 10/12] Use os.path.join instead of urljoin --- pymatgen/ext/optimade.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pymatgen/ext/optimade.py b/pymatgen/ext/optimade.py index c803421b735..a4fd78614a8 100644 --- a/pymatgen/ext/optimade.py +++ b/pymatgen/ext/optimade.py @@ -5,8 +5,9 @@ import logging import sys from collections import namedtuple +from os.path import join from typing import Dict, Union, List, Optional -from urllib.parse import urlparse, urljoin +from urllib.parse import urlparse import requests @@ -220,7 +221,7 @@ def get_structures_with_filter(self, optimade_filter: str) -> Dict[str, Structur fields = "response_fields=lattice_vectors,cartesian_site_positions,species,species_at_sites" - url = urljoin(resource, f"v1/structures?filter={optimade_filter}&fields={fields}") + url = join(resource, f"v1/structures?filter={optimade_filter}&fields={fields}") try: @@ -334,7 +335,7 @@ def is_url(url): return None try: - url = urljoin(provider_url, "v1/info") + url = join(provider_url, "v1/info") provider_info_json = self.session.get(url, timeout=self._timeout).json() except Exception as exc: _logger.warning(f"Failed to parse {url} when validating: {exc}") @@ -373,7 +374,7 @@ def _parse_provider(self, provider, provider_url) -> Dict[str, Provider]: """ try: - url = urljoin(provider_url, "v1/links") + url = join(provider_url, "v1/links") provider_link_json = self.session.get(url, timeout=self._timeout).json() except Exception as exc: _logger.error(f"Failed to parse {url} when following links: {exc}") From 7e7c5545c784e4b4d7cc9e2963c204e34340dc85 Mon Sep 17 00:00:00 2001 From: Matthew Horton Date: Sun, 5 Sep 2021 10:47:18 -0700 Subject: [PATCH 11/12] Add ability to retrieve SNLs --- pymatgen/ext/optimade.py | 104 +++++++++++++++++++++++++++++++++------ 1 file changed, 88 insertions(+), 16 deletions(-) diff --git a/pymatgen/ext/optimade.py b/pymatgen/ext/optimade.py index a4fd78614a8..1439970fbed 100644 --- a/pymatgen/ext/optimade.py +++ b/pymatgen/ext/optimade.py @@ -13,6 +13,7 @@ from pymatgen.core.periodic_table import DummySpecies from pymatgen.core.structure import Structure +from pymatgen.util.provenance import StructureNL from pymatgen.util.sequence import PBar # TODO: importing optimade-python-tool's data structures will make more sense @@ -78,9 +79,6 @@ def __init__(self, aliases_or_resource_urls: Optional[Union[str, List[str]]] = N To refresh this list of aliases, generated from the current list of OPTIMADE providers at 
optimade.org, call the refresh_aliases() method. - This interface is maintained by @mkhorton, please contact him directly with bug reports - or open an Issue in the pymatgen repository. - Args: aliases_or_resource_urls: the alias or structure resource URL or a list of aliases or resource URLs, if providing the resource URL directly it should not @@ -178,9 +176,9 @@ def _build_filter( def get_structures( self, elements=None, nelements=None, nsites=None, chemical_formula_anonymous=None, chemical_formula_hill=None, - ) -> Dict[str, Structure]: + ) -> Dict[str, Dict[str, Structure]]: """ - Retrieve structures from the OPTIMADE database. + Retrieve Structures from OPTIMADE providers. Not all functionality of OPTIMADE is currently exposed in this convenience method. To use a custom filter, call get_structures_with_filter(). @@ -192,7 +190,7 @@ def get_structures( chemical_formula_anonymous: Anonymous chemical formula chemical_formula_hill: Chemical formula following Hill convention - Returns: Dict of Structures keyed by that database's id system + Returns: Dict of (Dict Structures keyed by that database's id system) keyed by provider """ optimade_filter = self._build_filter( @@ -205,7 +203,40 @@ def get_structures( return self.get_structures_with_filter(optimade_filter) - def get_structures_with_filter(self, optimade_filter: str) -> Dict[str, Structure]: + def get_snls( + self, elements=None, nelements=None, nsites=None, chemical_formula_anonymous=None, chemical_formula_hill=None, + ) -> Dict[str, Dict[str, StructureNL]]: + """ + Retrieve StructureNL from OPTIMADE providers. + + A StructureNL is an object provided by pymatgen which combines Structure with + associated metadata, such as the URL is was downloaded from and any additional namespaced + data. + + Not all functionality of OPTIMADE is currently exposed in this convenience method. To + use a custom filter, call get_structures_with_filter(). + + Args: + elements: List of elements + nelements: Number of elements, e.g. 4 or [2, 5] for the range >=2 and <=5 + nsites: Number of sites, e.g. 4 or [2, 5] for the range >=2 and <=5 + chemical_formula_anonymous: Anonymous chemical formula + chemical_formula_hill: Chemical formula following Hill convention + + Returns: Dict of (Dict of StructureNLs keyed by that database's id system) keyed by provider + """ + + optimade_filter = self._build_filter( + elements=elements, + nelements=nelements, + nsites=nsites, + chemical_formula_anonymous=chemical_formula_anonymous, + chemical_formula_hill=chemical_formula_hill, + ) + + return self.get_snls_with_filter(optimade_filter) + + def get_structures_with_filter(self, optimade_filter: str) -> Dict[str, Dict[str, Structure]]: """ Get structures satisfying a given OPTIMADE filter. @@ -215,8 +246,26 @@ def get_structures_with_filter(self, optimade_filter: str) -> Dict[str, Structur Returns: Dict of Structures keyed by that database's id system """ + all_snls = self.get_snls_with_filter(optimade_filter) all_structures = {} + for identifier, snls_dict in all_snls.items(): + all_structures[identifier] = {k: snl.structure for k, snl in snls_dict.items()} + + return all_structures + + def get_snls_with_filter(self, optimade_filter: str) -> Dict[str, Dict[str, StructureNL]]: + """ + Get structures satisfying a given OPTIMADE filter. 
+ + Args: + filter: An OPTIMADE-compliant filter + + Returns: Dict of Structures keyed by that database's id system + """ + + all_snls = {} + for identifier, resource in self.resources.items(): fields = "response_fields=lattice_vectors,cartesian_site_positions,species,species_at_sites" @@ -227,7 +276,7 @@ def get_structures_with_filter(self, optimade_filter: str) -> Dict[str, Structur json = self.session.get(url, timeout=self._timeout).json() - structures = self._get_structures_from_resource(json, url) + structures = self._get_snls_from_resource(json, url, identifier) pbar = PBar(total=json["meta"].get("data_returned", 0), desc=identifier, initial=len(structures)) @@ -238,13 +287,13 @@ def get_structures_with_filter(self, optimade_filter: str) -> Dict[str, Structur if isinstance(next_link, dict) and "href" in next_link: next_link = next_link["href"] json = self.session.get(next_link, timeout=self._timeout).json() - additional_structures = self._get_structures_from_resource(json, url) + additional_structures = self._get_snls_from_resource(json, url, identifier) structures.update(additional_structures) pbar.update(len(additional_structures)) if structures: - all_structures[identifier] = structures + all_snls[identifier] = structures except Exception as exc: @@ -254,12 +303,12 @@ def get_structures_with_filter(self, optimade_filter: str) -> Dict[str, Structur f"Could not retrieve required information from provider {identifier} and url {url}: {exc}" ) - return all_structures + return all_snls @staticmethod - def _get_structures_from_resource(json, url): + def _get_snls_from_resource(json, url, identifier) -> Dict[str, StructureNL]: - structures = {} + snls = {} exceptions = set() @@ -279,6 +328,7 @@ def _get_comp(sp_dict): for data in json["data"]: # TODO: check the spec! and remove this try/except (are all providers following spec?) + # e.g. can check data["type"] == "structures" try: # e.g. COD @@ -288,7 +338,19 @@ def _get_comp(sp_dict): coords=data["attributes"]["cartesian_site_positions"], coords_are_cartesian=True, ) - structures[data["id"]] = structure + namespaced_data = {k: v for k, v in data.items() if k.startswith("_")} + + # TODO: follow `references` to add reference information here + snl = StructureNL( + structure, + authors={}, + history=[{"name": identifier, "url": url, "description": {"id": data["id"]}}], + data={"_optimade": namespaced_data}, + ) + + snls[data["id"]] = snl + + # TODO: bare exception, remove... 
except Exception: try: @@ -299,7 +361,17 @@ def _get_comp(sp_dict): coords=data["attributes"]["cartesian_site_positions"], coords_are_cartesian=True, ) - structures[data["id"]] = structure + namespaced_data = {k: v for k, v in data["attributes"].items() if k.startswith("_")} + + # TODO: follow `references` to add reference information here + snl = StructureNL( + structure, + authors={}, + history=[{"name": identifier, "url": url, "description": {"id": data["id"]}}], + data={"_optimade": namespaced_data}, + ) + + snls[data["id"]] = snl except Exception as exc: if str(exc) not in exceptions: @@ -308,7 +380,7 @@ def _get_comp(sp_dict): if exceptions: _logger.error(f'Failed to parse returned data for {url}: {", ".join(exceptions)}') - return structures + return snls def _validate_provider(self, provider_url) -> Optional[Provider]: """ From f94e8d5c32dc3671ca5aeee3066ac5a2401ecd73 Mon Sep 17 00:00:00 2001 From: Matthew Horton Date: Sun, 5 Sep 2021 11:00:21 -0700 Subject: [PATCH 12/12] Add `retry` --- pymatgen/ext/optimade.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/pymatgen/ext/optimade.py b/pymatgen/ext/optimade.py index 1439970fbed..9eb2ff63516 100644 --- a/pymatgen/ext/optimade.py +++ b/pymatgen/ext/optimade.py @@ -10,6 +10,7 @@ from urllib.parse import urlparse import requests +from retrying import retry from pymatgen.core.periodic_table import DummySpecies from pymatgen.core.structure import Structure @@ -138,6 +139,14 @@ def describe(self): description = f"OptimadeRester connected to:\n{provider_text}" return description + @retry(stop_max_attempt_number=3, wait_random_min=1000, wait_random_max=2000) + def _get_json(self, url): + """ + Retrieves JSON, will attempt to (politely) try again on failure subject to a + random delay and a maximum number of attempts. 
+ """ + return self.session.get(url, timeout=self._timeout).json() + @staticmethod def _build_filter( elements=None, nelements=None, nsites=None, chemical_formula_anonymous=None, chemical_formula_hill=None @@ -274,7 +283,7 @@ def get_snls_with_filter(self, optimade_filter: str) -> Dict[str, Dict[str, Stru try: - json = self.session.get(url, timeout=self._timeout).json() + json = self._get_json(url) structures = self._get_snls_from_resource(json, url, identifier) @@ -286,7 +295,7 @@ def get_snls_with_filter(self, optimade_filter: str) -> Dict[str, Dict[str, Stru next_link = json["links"]["next"] if isinstance(next_link, dict) and "href" in next_link: next_link = next_link["href"] - json = self.session.get(next_link, timeout=self._timeout).json() + json = self._get_json(next_link) additional_structures = self._get_snls_from_resource(json, url, identifier) structures.update(additional_structures) pbar.update(len(additional_structures)) @@ -408,7 +417,7 @@ def is_url(url): try: url = join(provider_url, "v1/info") - provider_info_json = self.session.get(url, timeout=self._timeout).json() + provider_info_json = self._get_json(url) except Exception as exc: _logger.warning(f"Failed to parse {url} when validating: {exc}") return None @@ -447,7 +456,7 @@ def _parse_provider(self, provider, provider_url) -> Dict[str, Provider]: try: url = join(provider_url, "v1/links") - provider_link_json = self.session.get(url, timeout=self._timeout).json() + provider_link_json = self._get_json(url) except Exception as exc: _logger.error(f"Failed to parse {url} when following links: {exc}") return {} @@ -480,7 +489,7 @@ def refresh_aliases(self, providers_url="https://providers.optimade.org/provider Updates available OPTIMADE structure resources based on the current list of OPTIMADE providers. """ - json = self.session.get(url=providers_url, timeout=self._timeout).json() + json = self._get_json(providers_url) providers_from_url = { entry["id"]: entry["attributes"]["base_url"] for entry in json["data"] if entry["attributes"]["base_url"] }