Skip to content

Commit

Permalink
Merge 5cd5fe7 into e9760e2
Browse files Browse the repository at this point in the history
  • Loading branch information
mkhorton committed Sep 4, 2021
2 parents e9760e2 + 5cd5fe7 commit 96af87c
Show file tree
Hide file tree
Showing 3 changed files with 192 additions and 51 deletions.
233 changes: 189 additions & 44 deletions pymatgen/ext/optimade.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,25 @@
"""

from collections import namedtuple
from typing import Dict
from typing import Dict, Union, List, Optional
from urllib.parse import urlparse

import logging
import requests
import sys

from pymatgen.core.periodic_table import DummySpecies
from pymatgen.core.structure import Structure
from pymatgen.util.sequence import PBar

# Lightweight record describing a single OPTIMADE provider; instances are built
# elsewhere in this module from the provider metadata returned over HTTP.
# TODO: importing optimade-python-tools' data structures will make more sense
Provider = namedtuple("Provider", ["name", "base_url", "description", "homepage", "prefix"])

# Module-level logger: messages are written to stdout and the logger level is WARNING.
# NOTE(review): the handler is attached at import time — confirm this is intended,
# since applications that configure logging themselves may see duplicated output.
_logger = logging.getLogger(__name__)
_handler = logging.StreamHandler(sys.stdout)
_logger.addHandler(_handler)
_logger.setLevel(logging.WARNING)


class OptimadeRester:
"""
Expand Down Expand Up @@ -43,7 +54,7 @@ class OptimadeRester:
"tcod": "https://www.crystallography.net/tcod/optimade",
}

def __init__(self, alias_or_structure_resource_url="mp"):
def __init__(self, aliases_or_resource_urls: Optional[Union[str, List[str]]] = None, timeout=5):
"""
OPTIMADE is an effort to provide a standardized interface to retrieve information
from many different materials science databases.
Expand All @@ -66,19 +77,67 @@ def __init__(self, alias_or_structure_resource_url="mp"):
To refresh this list of aliases, generated from the current list of OPTIMADE providers
at optimade.org, call the refresh_aliases() method.
This interface is maintained by @mkhorton, please contact him directly with bug reports
or open an Issue in the pymatgen repository.
Args:
alias_or_structure_resource_url: the alias or structure resource URL
aliases_or_resource_urls: the alias or structure resource URL or a list of
aliases or resource URLs, if providing the resource URL directly it should not
be an index, this interface can only currently access the "v1/structures"
information from the specified resource URL
timeout: number of seconds before an attempted request is abandoned, a good
timeout is useful when querying many providers, some of which may be offline
"""

# TODO: maybe we should use the nice pydantic models from optimade-python-tools
# for response validation, and use the Lark parser for filter validation
self.session = requests.Session()
self._timeout = 10 # seconds
self._timeout = timeout # seconds

if isinstance(aliases_or_resource_urls, str):
aliases_or_resource_urls = [aliases_or_resource_urls]

# this stores a dictionary with keys provider id (in the same format as the aliases)
# and values as the corresponding URL
self.resources = {}

if not aliases_or_resource_urls:
aliases_or_resource_urls = list(self.aliases.keys())
_logger.warning(
"Connecting to all known OPTIMADE providers, this will be slow. Please connect to only the "
f"OPTIMADE providers you want to query. Choose from: {', '.join(self.aliases.keys())}"
)

for alias_or_resource_url in aliases_or_resource_urls:

if alias_or_resource_url in self.aliases:
self.resources[alias_or_resource_url] = self.aliases[alias_or_resource_url]

elif self._validate_provider(alias_or_resource_url):

# TODO: unclear what the key should be here, the "prefix" is for the root provider,
# may need to walk back to the index for the given provider to find the correct identifier

self.resources[alias_or_resource_url] = alias_or_resource_url

else:
_logger.warning(f"The following is not a known alias or valid url: {alias_or_resource_url}")

self._providers = {url: self._validate_provider(provider_url=url) for url in self.resources.values()}

def __repr__(self):
    """Return a one-line summary listing the URL of every connected resource."""
    connected_urls = ", ".join(self.resources.values())
    return "OptimadeRester connected to: " + connected_urls

if alias_or_structure_resource_url in self.aliases:
self.resource = self.aliases[alias_or_structure_resource_url]
else:
self.resource = alias_or_structure_resource_url
def __str__(self):
    """Return the human-readable description produced by ``describe``."""
    summary = self.describe()
    return summary

def describe(self):
    """
    Provides human-readable information about the resources being searched by the OptimadeRester.

    Returns:
        A header line followed by the string form of each entry in
        ``self._providers``, one per line; falsy entries (e.g. ``None``)
        are left out.
    """
    lines = [str(info) for info in self._providers.values() if info]
    return "OptimadeRester connected to:\n" + "\n".join(lines)

@staticmethod
def _build_filter(
Expand Down Expand Up @@ -117,12 +176,7 @@ def _build_filter(
return " AND ".join(filters)

def get_structures(
self,
elements=None,
nelements=None,
nsites=None,
chemical_formula_anonymous=None,
chemical_formula_hill=None,
self, elements=None, nelements=None, nsites=None, chemical_formula_anonymous=None, chemical_formula_hill=None,
) -> Dict[str, Structure]:
"""
Retrieve structures from the OPTIMADE database.
Expand Down Expand Up @@ -160,28 +214,52 @@ def get_structures_with_filter(self, optimade_filter: str) -> Dict[str, Structur
Returns: Dict of Structures keyed by that database's id system
"""

fields = "response_fields=lattice_vectors,cartesian_site_positions,species,species_at_sites"
all_structures = {}

for identifier, resource in self.resources.items():

url = f"{self.resource}/v1/structures?filter={optimade_filter}&fields={fields}"
fields = "response_fields=lattice_vectors,cartesian_site_positions,species,species_at_sites"

json = self.session.get(url, timeout=self._timeout).json()
url = f"{resource}/v1/structures?filter={optimade_filter}&fields={fields}"

structures = self._get_structures_from_resource(json)
try:

if "next" in json["links"] and json["links"]["next"]:
pbar = PBar(total=json["meta"].get("data_returned"))
while "next" in json["links"] and json["links"]["next"]:
json = self.session.get(json["links"]["next"], timeout=self._timeout).json()
structures.update(self._get_structures_from_resource(json))
pbar.update(len(structures))
json = self.session.get(url, timeout=self._timeout).json()

return structures
structures = self._get_structures_from_resource(json, url)

pbar = PBar(total=json["meta"].get("data_returned", 0), desc=identifier, initial=len(structures))

# TODO: check spec for `more_data_available` boolean, may simplify this conditional
if ("links" in json) and ("next" in json["links"]) and (json["links"]["next"]):
while "next" in json["links"] and json["links"]["next"]:
next_link = json["links"]["next"]
if isinstance(next_link, dict) and "href" in next_link:
next_link = next_link["href"]
json = self.session.get(next_link, timeout=self._timeout).json()
additional_strcutures = self._get_structures_from_resource(json, url)
structures.update(additional_strcutures)
pbar.update(len(additional_strcutures))

if structures:

all_structures[identifier] = structures

except Exception as exc:

# TODO: manually inspect failures to either (a) correct a bug or (b) raise more appropriate error

_logger.warning(f"Could not retrieve required information from provider ({identifier}): {exc}")

return all_structures

@staticmethod
def _get_structures_from_resource(json):
def _get_structures_from_resource(json, url):

structures = {}

exceptions = set()

def _sanitize_symbol(symbol):
if symbol == "vacancy":
symbol = DummySpecies("X_vacancy", oxidation_state=None)
Expand Down Expand Up @@ -219,30 +297,83 @@ def _get_comp(sp_dict):
coords_are_cartesian=True,
)
structures[data["id"]] = structure
except Exception:
pass

except Exception as exc:
if str(exc) not in exceptions:
exceptions.add(str(exc))

if exceptions:
_logger.warning(f'Failed to parse returned data for {url}: {", ".join(exceptions)}')

return structures

def refresh_aliases(self, providers_url="https://providers.optimade.org/providers.json"):
def _validate_provider(self, provider_url) -> Optional[Provider]:
"""
Updates available OPTIMADE structure resources based on the current list of OPTIMADE
providers.
Checks that a given URL is indeed an OPTIMADE provider,
returning None if it is not a provider, or a Provider
namedtuple describing the provider if it is.
TODO: careful reading of OPTIMADE specification required
TODO: add better exception handling, intentionally permissive currently
"""
json = self.session.get(url=providers_url, timeout=self._timeout).json()
providers_from_url = {
entry["id"]: entry["attributes"]["base_url"] for entry in json["data"] if entry["attributes"]["base_url"]
}

providers = {}
for provider, link in providers_from_url.items():
def is_url(url):
"""
Basic URL validation thanks to https://stackoverflow.com/a/52455972
"""
try:
providers[provider] = self.session.get(f"{link}/v1/links", timeout=self._timeout).json()
except Exception as exc:
print(f"Failed to parse {provider} at {link}: {exc}")
result = urlparse(url)
return all([result.scheme, result.netloc])
except ValueError:
return False

if not is_url(provider_url):
_logger.warning(f"An invalid url was supplied: {provider_url}")
return None

try:
provider_info_json = self.session.get(f"{provider_url}/v1/info", timeout=self._timeout).json()
except Exception as exc:
_logger.warning(f"Failed to parse {provider_url}: {exc}")
return None

try:
return Provider(
name=provider_info_json["meta"]["provider"]["name"],
base_url=provider_url,
description=provider_info_json["meta"]["provider"]["description"],
homepage=provider_info_json["meta"]["provider"].get("homepage"),
prefix=provider_info_json["meta"]["provider"]["prefix"],
)
except Exception as exc:
_logger.warning(f"Failed to extract required information from {provider_url}: {exc}")
return None

def _parse_provider(self, provider, provider_url) -> Dict[str, Provider]:
"""
Used internally to update the list of providers or to
check a given URL is valid.
It does not raise exceptions but will instead log a warning and return
an empty dictionary in the case of invalid data.
In future, when the specification is sufficiently well adopted,
we might be more strict here.
# TODO: importing optimade-python-tool's data structures will make more sense
Provider = namedtuple("Provider", ["name", "base_url", "description", "homepage"])
Args:
provider: the provider prefix
provider_url: An OPTIMADE provider URL
Returns:
A dictionary of keys (in format of "provider.database") to
Provider objects.
"""

try:
provider_link_json = self.session.get(f"{provider_url}/v1/links", timeout=self._timeout).json()
except Exception as exc:
_logger.warning(f"Failed to parse {provider_url}: {exc}")
return {}

def _parse_provider_link(provider, provider_link_json):
"""No validation attempted."""
Expand All @@ -257,19 +388,33 @@ def _parse_provider_link(provider, provider_link_json):
base_url=link["attributes"]["base_url"],
description=link["attributes"]["description"],
homepage=link["attributes"].get("homepage"),
prefix=link["attributes"].get("prefix"),
)
except Exception:
# print(f"Failed to parse {provider}: {exc}")
# Not all providers parse yet.
pass
return ps

return _parse_provider_link(provider, provider_link_json)

def refresh_aliases(self, providers_url="https://providers.optimade.org/providers.json"):
"""
Updates available OPTIMADE structure resources based on the current list of OPTIMADE
providers.
"""
json = self.session.get(url=providers_url, timeout=self._timeout).json()
providers_from_url = {
entry["id"]: entry["attributes"]["base_url"] for entry in json["data"] if entry["attributes"]["base_url"]
}

structure_providers = {}
for provider, provider_link_json in providers.items():
structure_providers.update(_parse_provider_link(provider, provider_link_json))
for provider, provider_link in providers_from_url.items():
structure_providers.update(self._parse_provider(provider, provider_link))

self.aliases = {alias: provider.base_url for alias, provider in structure_providers.items()}

# TODO: revisit context manager logic here and in MPRester
def __enter__(self):
"""
Support for "with" context.
Expand Down
8 changes: 2 additions & 6 deletions pymatgen/ext/tests/test_optimade.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def test_get_structures_mp(self):

structs = optimade.get_structures(elements=["Ga", "N"], nelements=2)

test_struct = next(iter(structs.values()))
test_struct = next(iter(structs["mp"].values()))

self.assertEqual([str(el) for el in test_struct.types_of_species], ["Ga", "N"])

Expand All @@ -23,7 +23,7 @@ def test_get_structures_mcloud_2dstructures(self):

structs = optimade.get_structures(elements=["B", "N"], nelements=2)

test_struct = next(iter(structs.values()))
test_struct = next(iter(structs["mcloud.2dstructures"].values()))

self.assertEqual([str(el) for el in test_struct.types_of_species], ["B", "N"])

Expand All @@ -33,7 +33,3 @@ def test_update_aliases(self):
optimade.refresh_aliases()

self.assertIn("mp", optimade.aliases)

from pprint import pprint

pprint(optimade.aliases)
2 changes: 1 addition & 1 deletion pymatgen/util/sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class PBarSafe:
Progress bar.
"""

def __init__(self, total):
def __init__(self, total, **kwargs):
"""
Args:
total (): Total value.
Expand Down

0 comments on commit 96af87c

Please sign in to comment.