Skip to content

Commit

Permalink
Add new synthesis recipes to API. (#257)
Browse files Browse the repository at this point in the history
* Add new synthesis recipes schema.

* [WIP] add models to synthesis recipes and implement query classes

* [WIP] add query class for synthesis-type, experimental operations, and paragraph keywords (half-completed).

* [WIP] add script to convert dataset from the public repo to MP database.

* Change synthesis type and operations into enum type.

* Add experimental conditions query class.

* Only keep one API endpoint for all recipe calls.

* Fix ellipsis function for removing heading characters.

* Remove debugging print statement.

* Return total number of hits.

* Add adaptor that converts synpro collections.

* Allow min/max value to be set as None.

* Handle cases when aggregate returns zero docs.

* Let MongoDB return all highlights and handle char limits by ourselves.

* Use str for targets_formula/precursors_formula

* Fix mypy and comment ensure_index calls

* Add docstrings and comments to data adaptors.
  • Loading branch information
hhaoyan committed Jun 3, 2021
1 parent a4085cb commit 80f6599
Show file tree
Hide file tree
Showing 12 changed files with 759 additions and 139 deletions.
26 changes: 2 additions & 24 deletions src/mp_api/routes/synthesis/client.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,8 @@
from typing import List

from mp_api.core.client import BaseRester
from mp_api.routes.synthesis.models import SynthesisDoc
from mp_api.routes.synthesis.models import SynthesisRecipe


class SynthesisRester(BaseRester):

suffix = "synthesis"
document_model = SynthesisDoc # type: ignore
primary_key = "task_id"

def search_synthesis_text(self, keywords: List[str]):
"""
Search synthesis recipe text.
Arguments:
keywords (List[str]): List of search keywords
Returns:
synthesis_docs ([SynthesisDoc]): List of synthesis documents
"""

keyword_string = ",".join(keywords)

synthesis_docs = self._query_resource(
criteria={"keywords": keyword_string}, suburl="text_search", use_document_model=True,
)

return synthesis_docs
document_model = SynthesisRecipe
12 changes: 3 additions & 9 deletions src/mp_api/routes/synthesis/client.pyi
Original file line number Diff line number Diff line change
@@ -1,14 +1,8 @@
from typing import List, Optional
from mp_api.routes.synthesis.models import SynthesisDoc
from typing import List
from mp_api.routes.synthesis.models import SynthesisRecipe


class SynthesisRester:

def get_document_by_id(
self,
document_id: str,
fields: Optional[List[str]] = None,
monty_decode: bool = True,
version: Optional[str] = None,
) -> SynthesisDoc:
def query_text(self, keywords: List[str]) -> SynthesisRecipe:
...
77 changes: 77 additions & 0 deletions src/mp_api/routes/synthesis/data_adaptor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""
This script converts synthesis recipes data fetched directly
from the public repo of synthesis recipes
(https://github.com/CederGroupHub/text-mined-synthesis_public)
into MP compatible formats.
"""
import json
import sys

from pymatgen.core import Composition
from pymatgen.core.composition import CompositionError


def string2comp(x):
    """Build a pymatgen Composition from the leading part of a formula string.

    The formula is cut at the first '·' separator and only the text before
    it is parsed; everything after the separator is ignored.
    """
    # TODO: if a material contains multiple parts, this function
    # only takes the first part. This is not the optimal solution,
    # and should be resolved in the future (e.g. by summing the
    # Compositions of all parts).
    leading_part, _, _ = x.partition('·')
    return Composition(leading_part)


def convert_recipe(recipe):
    """Convert one public-repo recipe dict in place and return it.

    The 'targets_string' entry is replaced by 'targets_formula' (Composition
    JSON dicts) and 'targets_formula_s' (reduced formula strings). The same
    two representations are added for precursors under 'precursors_formula'
    and 'precursors_formula_s'.

    Raises:
        CompositionError, ValueError: re-raised (after printing a message)
            when any target formula cannot be parsed. Precursors that fail
            to parse are reported and skipped instead.
    """
    raw_targets = recipe['targets_string']
    try:
        parsed_targets = [string2comp(target) for target in raw_targets]
    except (CompositionError, ValueError):
        print('Cannot process materials: ', raw_targets)
        raise

    recipe['targets_formula'] = [
        json.loads(comp.to_json()) for comp in parsed_targets
    ]
    recipe['targets_formula_s'] = [
        comp.reduced_formula for comp in parsed_targets
    ]
    del recipe['targets_string']

    formula_dicts = []
    reduced_formulas = []
    recipe['precursors_formula'] = formula_dicts
    recipe['precursors_formula_s'] = reduced_formulas
    for precursor in recipe['precursors']:
        raw_formula = precursor['material_formula']
        try:
            comp = string2comp(raw_formula)
        except (CompositionError, ValueError):
            # A bad precursor only drops that precursor, not the recipe.
            print('Cannot process precursor material: ', raw_formula)
            continue
        formula_dicts.append(json.loads(comp.to_json()))
        reduced_formulas.append(comp.reduced_formula)

    return recipe


def convert_json_public_repo(src_json, dst_json):
    """
    Convert the public synthesis recipes dataset (in a json file)
    into a format as json file which can be imported into the MP database.

    Args:
        src_json: Path of the input JSON file. It must contain a
            'reactions' list and a 'release_date' entry.
        dst_json: Path the converted list of recipes is written to.

    Recipes whose conversion raises CompositionError, ValueError or
    IndexError are left out of the output; the number of such recipes is
    printed when it is non-zero.
    """
    # Formulas contain non-ASCII characters such as '·', so the file must
    # not be decoded with a platform-dependent default encoding.
    with open(src_json, encoding='utf-8') as f:
        data = json.load(f)
    recipes = data['reactions']

    print('Loaded %s recipes, version %s' % (len(recipes), data['release_date']))

    converted = []
    skipped = 0
    for recipe in recipes:
        try:
            convert_recipe(recipe)
        except (CompositionError, ValueError, IndexError):
            # Best-effort conversion: drop the recipe but keep count so the
            # loss is visible to whoever runs the script.
            skipped += 1
            continue
        converted.append(recipe)

    print('Converted %d recipes' % (len(converted),))
    if skipped:
        print('Skipped %d recipes that could not be converted' % (skipped,))
    with open(dst_json, 'w', encoding='utf-8') as f:
        json.dump(converted, f)


if __name__ == '__main__':
    # Usage: <script> SRC_JSON DST_JSON
    convert_json_public_repo(sys.argv[1], sys.argv[2])
169 changes: 169 additions & 0 deletions src/mp_api/routes/synthesis/data_adaptor_synpro.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
"""
This script converts synthesis recipes data fetched directly
from Ceder Group Synthesis Mining team MongoDB into MP compatible
formats.
"""
import json
import os
import re

from pymatgen.core.composition import CompositionError, Composition
from pymongo import MongoClient
from tqdm import tqdm


def convert_value(val):
    """Normalise one operation-condition value dict.

    Reads the 'min', 'max', 'values' and 'units' entries of ``val`` and
    returns a new dict with 'min_value' / 'max_value' (float, or None when
    the source entry is None), 'values' (list of floats) and 'units' (str).
    """
    converted = {}
    for source_key, target_key in (('min', 'min_value'), ('max', 'max_value')):
        bound = val[source_key]
        converted[target_key] = None if bound is None else float(bound)
    converted['values'] = list(map(float, val['values']))
    converted['units'] = str(val['units'])
    return converted


def convert_conditions(cond, op_type):
    """Convert the conditions dict of a single operation.

    'temperature' and 'time' entries are converted for every operation
    type. The 'environment' list is only read for two operation types:

    * 'HeatingOperation': its non-blank, stripped entries become
      'heating_atmosphere'.
    * 'MixingOperation': entry 0 becomes 'mixing_media' and entry 1 becomes
      'mixing_device' (each stripped, or None when blank).
    """
    temperatures = [convert_value(entry) for entry in cond['temperature']]
    durations = [convert_value(entry) for entry in cond['time']]

    atmosphere = []
    device = None
    media = None
    if op_type == 'HeatingOperation':
        stripped_entries = (entry.strip() for entry in cond['environment'])
        atmosphere = [entry for entry in stripped_entries if entry]
    elif op_type == 'MixingOperation':
        environment = cond['environment']
        device = environment[1].strip() or None
        media = environment[0].strip() or None

    return {
        'heating_temperature': temperatures,
        'heating_time': durations,
        'heating_atmosphere': atmosphere,
        'mixing_device': device,
        'mixing_media': media,
    }


# Every operation type passed through convert_op() is recorded here; main()
# prints the set once the conversion has finished.
all_posible_ops = set()


def convert_op(op):
    """Convert one operation dict and record its type in all_posible_ops.

    Returns a dict with the operation 'type', its 'token' (taken from the
    source 'string' entry) and the converted 'conditions' (built from the
    source 'attributes' entry).
    """
    op_type = op['type']
    all_posible_ops.add(op_type)
    converted = {'type': op_type, 'token': op['string']}
    converted['conditions'] = convert_conditions(op['attributes'], op_type)
    return converted


def convert_mat_value(val):
    """Normalise one value dict from a material's variable specification.

    Returns a new dict whose 'values' is a list of floats and whose
    'min_value' / 'max_value' are floats, or None when the source entry
    is None.
    """
    def optional_float(raw):
        return None if raw is None else float(raw)

    return {
        'values': list(map(float, val['values'])),
        'min_value': optional_float(val['min_value']),
        'max_value': optional_float(val['max_value']),
    }


def _optional_str(value):
    """Return str(value), or None when value is None or stringifies to ''."""
    # str(None) is the truthy string 'None', so None must be checked
    # before stringifying; otherwise a missing value is stored as 'None'.
    if value is None:
        return None
    return str(value) or None


def convert_material(mat):
    """Convert materials dictionaries.

    Args:
        mat: Source material dict. The entries read are 'material_string',
            'material_name', 'material_formula', 'phase', 'is_acronym',
            'composition', 'amounts_vars', 'elements_vars', 'additives'
            and 'oxygen_deficiency'.

    Returns:
        A new dict with the same keys. String fields are passed through
        str(); 'phase' and 'oxygen_deficiency' become None when the source
        value is None or empty; blank strings are dropped from
        'elements_vars' lists and from 'additives'; composition amounts and
        element amounts are stored as strings.
    """
    return {
        'material_string': str(mat['material_string']),
        'material_name': str(mat['material_name']),
        'material_formula': str(mat['material_formula']),
        'phase': _optional_str(mat['phase']),
        'is_acronym': bool(mat['is_acronym']),
        'composition': [{
            'formula': str(x['formula']),
            'amount': str(x['amount']),
            'elements': {str(y): str(z) for y, z in x['elements'].items()}
        } for x in mat['composition']],
        'amounts_vars': {x: convert_mat_value(y) for x, y in mat['amounts_vars'].items()},
        'elements_vars': {
            x: [str(z.strip()) for z in y if z.strip()]
            for x, y in mat['elements_vars'].items()
        },
        'additives': [str(x.strip()) for x in mat['additives'] if x.strip()],
        'oxygen_deficiency': _optional_str(mat['oxygen_deficiency']),
    }


def get_material_formula(mat):
    """Convert string material formulas into pymatgen Compositions.

    The 'material_formula' string is parsed first, after removing any
    '·<n>H2O' suffix. If that fails, the Composition is rebuilt by summing
    the per-part element amounts listed under mat['composition'].

    Raises:
        CompositionError: if the formula cannot be parsed and
            mat['composition'] has no parts to fall back on.
        ValueError: if an element amount in the fallback data is not
            convertible to float.
    """
    formula = re.sub(r'·\d*H2O', '', mat['material_formula'])
    try:
        return Composition(formula)
    except (CompositionError, ValueError):
        # NOTE(review): pymatgen releases differ in whether an invalid
        # formula raises CompositionError or ValueError; both are treated
        # as "formula unusable, try the structured data". Confirm against
        # the pinned pymatgen version.
        pass

    total = None
    for part in mat['composition']:
        part_comp = Composition(
            {element: float(amount) for element, amount in part['elements'].items()}
        )
        total = part_comp if total is None else total + part_comp

    if total is None:
        # Returning None here would only defer the failure to the first
        # attribute access on the result; raise an error the callers
        # already handle instead.
        raise CompositionError(
            'Cannot build a composition for material: %s' % (mat['material_formula'],)
        )
    return total


def target_comps(doc):
    """Parse every non-blank entry of doc['targets_string'] into a Composition.

    Entries that are empty or whitespace-only are skipped, as are entries
    for which Composition raises CompositionError or ValueError.
    """
    compositions = []
    non_blank_targets = (raw for raw in doc['targets_string'] if raw.strip())
    for raw_target in non_blank_targets:
        try:
            parsed = Composition(raw_target)
        except (CompositionError, ValueError):
            continue
        compositions.append(parsed)
    return compositions


def precursor_comps(doc):
    """Build a Composition for each entry of doc['precursors'].

    Each precursor is passed to get_material_formula(); precursors for
    which that call raises CompositionError or ValueError are left out of
    the returned list.
    """
    compositions = []
    for precursor in doc['precursors']:
        try:
            parsed = get_material_formula(precursor)
        except (CompositionError, ValueError):
            continue
        compositions.append(parsed)
    return compositions


def convert_one(doc):
    """Convert an entire synthesis recipe.

    Args:
        doc: Source recipe document. The entries read are 'doi',
            'ext_paragraph', 'synthesis_type', 'reaction_string',
            'reaction' (with 'left' and 'right' lists), 'targets_string',
            'target', 'precursors' and, optionally, 'operations'.

    Returns:
        A new dict holding the converted recipe. Target and precursor
        formulas appear both as Composition JSON dicts ('*_formula') and
        as reduced formula strings ('*_formula_s').
    """
    # Parse the formulas once and derive both representations from the
    # same Composition objects instead of re-parsing for each output key.
    targets = target_comps(doc)
    precursors = precursor_comps(doc)

    def reaction_side(parts):
        # Amount and material of each reaction term are stored as strings.
        return [{
            'amount': str(part['amount']),
            'material': str(part['material'])
        } for part in parts]

    return {
        'doi': str(doc['doi']),
        'paragraph_string': ' '.join(doc['ext_paragraph']),
        'synthesis_type': str(doc['synthesis_type']),
        'reaction_string': str(doc['reaction_string']),
        'reaction': {
            'left_side': reaction_side(doc['reaction']['left']),
            'right_side': reaction_side(doc['reaction']['right']),
        },
        'targets_formula': [json.loads(x.to_json()) for x in targets],
        'target': convert_material(doc['target']),
        'targets_formula_s': [x.reduced_formula for x in targets],
        'precursors_formula_s': [x.reduced_formula for x in precursors],
        'precursors_formula': [json.loads(x.to_json()) for x in precursors],
        'precursors': [convert_material(x) for x in doc['precursors']],
        # Documents without an 'operations' entry get an empty list.
        'operations': [convert_op(x) for x in doc.get('operations', [])]
    }


def main():
    """
    Dump the Reactions_Solid_State and Reactions_Sol_Gel collections of the
    SynPro database as one converted json file.

    The MongoDB connection string is read from the SYNPRO_URI environment
    variable and the result is written to 'synthesis_recipes.json' in the
    current working directory. The set of operation types encountered
    during conversion is printed at the end.
    """
    synpro_db = MongoClient(os.environ['SYNPRO_URI']).SynPro

    synthesis_recipes = []
    source_collections = (
        synpro_db.Reactions_Solid_State,
        synpro_db.Reactions_Sol_Gel,
    )
    for collection in source_collections:
        synthesis_recipes.extend(
            convert_one(item) for item in tqdm(collection.find())
        )

    with open('synthesis_recipes.json', 'w') as f:
        json.dump(synthesis_recipes, f)

    print('All possible operation types', all_posible_ops)


if __name__ == '__main__':
    main()
24 changes: 0 additions & 24 deletions src/mp_api/routes/synthesis/models.py

This file was deleted.

18 changes: 18 additions & 0 deletions src/mp_api/routes/synthesis/models/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from mp_api.routes.synthesis.models.core import (
SynthesisRecipe,
SynthesisTypeEnum,
)
from mp_api.routes.synthesis.models.materials import (
Component,
ExtractedMaterial,
)
from mp_api.routes.synthesis.models.operations import (
Value,
Conditions,
Operation,
OperationTypeEnum,
)
from mp_api.routes.synthesis.models.reaction import (
FormulaPart,
ReactionFormula,
)
Loading

0 comments on commit 80f6599

Please sign in to comment.