-
Notifications
You must be signed in to change notification settings - Fork 34
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add new synthesis recipes to API. (#257)
* Add new synthesis recipes schema.
* [WIP] Add models to synthesis recipes and implement query classes.
* [WIP] Add query class for synthesis-type, experimental operations, and paragraph keywords (half-completed).
* [WIP] Add script to convert dataset from the public repo to MP database.
* Change synthesis type and operations into enum type.
* Add experimental conditions query class.
* Only keep one API endpoint for all recipe calls.
* Fix ellipsis function for removing heading characters.
* Remove debugging print statement.
* Return total number of hits.
* Add adaptor that converts synpro collections.
* Allow min/max value to be set as None.
* Handle cases when aggregate returns zero docs.
* Let MongoDB return all highlights and handle char limits by ourselves.
* Use str for targets_formula/precursors_formula.
* Fix mypy and comment ensure_index calls.
* Add docstrings and comments to data adaptors.
- Loading branch information
Showing
12 changed files
with
759 additions
and
139 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,30 +1,8 @@ | ||
from typing import List | ||
|
||
from mp_api.core.client import BaseRester | ||
from mp_api.routes.synthesis.models import SynthesisDoc | ||
from mp_api.routes.synthesis.models import SynthesisRecipe | ||
|
||
|
||
class SynthesisRester(BaseRester): | ||
|
||
suffix = "synthesis" | ||
document_model = SynthesisDoc # type: ignore | ||
primary_key = "task_id" | ||
|
||
def search_synthesis_text(self, keywords: List[str]): | ||
""" | ||
Search synthesis recipe text. | ||
Arguments: | ||
keywords (List[str]): List of search keywords | ||
Returns: | ||
synthesis_docs ([SynthesisDoc]): List of synthesis documents | ||
""" | ||
|
||
keyword_string = ",".join(keywords) | ||
|
||
synthesis_docs = self._query_resource( | ||
criteria={"keywords": keyword_string}, suburl="text_search", use_document_model=True, | ||
) | ||
|
||
return synthesis_docs | ||
document_model = SynthesisRecipe |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,8 @@ | ||
from typing import List, Optional | ||
from mp_api.routes.synthesis.models import SynthesisDoc | ||
from typing import List | ||
from mp_api.routes.synthesis.models import SynthesisRecipe | ||
|
||
|
||
class SynthesisRester: | ||
|
||
def get_document_by_id( | ||
self, | ||
document_id: str, | ||
fields: Optional[List[str]] = None, | ||
monty_decode: bool = True, | ||
version: Optional[str] = None, | ||
) -> SynthesisDoc: | ||
def query_text(self, keywords: List[str]) -> SynthesisRecipe: | ||
... |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
""" | ||
This script converts synthesis recipes data fetched directly | ||
from the public repo of synthesis recipes | ||
(https://github.com/CederGroupHub/text-mined-synthesis_public) | ||
into MP compatible formats. | ||
""" | ||
import json | ||
import sys | ||
|
||
from pymatgen.core import Composition | ||
from pymatgen.core.composition import CompositionError | ||
|
||
|
||
def string2comp(x):
    """Build a pymatgen Composition from a material formula string.

    A formula may consist of several parts joined by '·'; only the text
    before the first '·' is parsed, the remaining parts are ignored.
    """
    # TODO: multi-part materials are truncated to their first part. This is
    # not the optimal solution and should be resolved in the future.
    leading_part, _, _ = x.partition('·')
    return Composition(leading_part)
|
||
|
||
def convert_recipe(recipe): | ||
"""Convert an entire synthesis recipe.""" | ||
targets_string = recipe['targets_string'] | ||
try: | ||
target_comps = [string2comp(x) for x in targets_string] | ||
except (CompositionError, ValueError): | ||
print('Cannot process materials: ', targets_string) | ||
raise | ||
|
||
recipe['targets_formula'] = [json.loads(x.to_json()) for x in target_comps] | ||
recipe['targets_formula_s'] = [x.reduced_formula for x in target_comps] | ||
del recipe['targets_string'] | ||
|
||
recipe['precursors_formula'] = [] | ||
recipe['precursors_formula_s'] = [] | ||
for precursor in recipe['precursors']: | ||
try: | ||
comp = string2comp(precursor['material_formula']) | ||
except (CompositionError, ValueError): | ||
print('Cannot process precursor material: ', precursor['material_formula']) | ||
continue | ||
recipe['precursors_formula'].append(json.loads(comp.to_json())) | ||
recipe['precursors_formula_s'].append(comp.reduced_formula) | ||
|
||
return recipe | ||
|
||
|
||
def convert_json_public_repo(src_json, dst_json):
    """
    Convert the public synthesis recipes dataset (in a json file)
    into a format as json file which can be imported into the MP database.

    Arguments:
        src_json: Path of the input json file. It must contain a
            ``reactions`` list and a ``release_date`` entry.
        dst_json: Path of the json file to write the converted recipes to.

    Recipes for which ``convert_recipe`` raises CompositionError, ValueError
    or IndexError are left out of the output; their number is reported.
    """
    # The dataset contains non-ASCII formula characters (e.g. '·'), so do not
    # depend on the platform's default encoding when reading it.
    with open(src_json, encoding='utf-8') as f:
        data = json.load(f)
    recipes = data['reactions']

    print('Loaded %s recipes, version %s' % (len(recipes), data['release_date']))

    converted = []
    skipped = 0
    for recipe in recipes:
        try:
            convert_recipe(recipe)
        except (CompositionError, ValueError, IndexError):
            # Best effort: drop recipes that cannot be converted, but keep
            # count so the loss is visible instead of silent.
            skipped += 1
        else:
            converted.append(recipe)

    print('Converted %d recipes' % (len(converted),))
    if skipped:
        print('Skipped %d recipes that could not be converted' % (skipped,))
    with open(dst_json, 'w', encoding='utf-8') as f:
        json.dump(converted, f)


if __name__ == '__main__':
    convert_json_public_repo(sys.argv[1], sys.argv[2])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
""" | ||
This script converts synthesis recipes data fetched directly | ||
from Ceder Group Synthesis Mining team MongoDB into MP compatible | ||
formats. | ||
""" | ||
import json | ||
import os | ||
import re | ||
|
||
from pymatgen.core.composition import CompositionError, Composition | ||
from pymongo import MongoClient | ||
from tqdm import tqdm | ||
|
||
|
||
def convert_value(val):
    """Normalise one value entry of an operation's conditions.

    Reads ``min``, ``max``, ``values`` and ``units`` from ``val`` and returns
    a dict with ``min_value``/``max_value`` (float, or None when the input is
    None), ``values`` (list of floats) and ``units`` (str).
    """
    converted = {}
    lower = val['min']
    converted['min_value'] = None if lower is None else float(lower)
    upper = val['max']
    converted['max_value'] = None if upper is None else float(upper)
    converted['values'] = list(map(float, val['values']))
    converted['units'] = str(val['units'])
    return converted
|
||
|
||
def _environment_entry(environment, index):
    """Return the stripped environment entry at ``index``, or None if it is missing or blank."""
    if index >= len(environment):
        return None
    return environment[index].strip() or None


def convert_conditions(cond, op_type):
    """Convert conditions dictionaries.

    Arguments:
        cond: Conditions dict with ``temperature`` and ``time`` lists and,
            for heating and mixing operations, an ``environment`` list of
            strings.
        op_type: Operation type name. Only 'HeatingOperation' and
            'MixingOperation' read the environment entries.

    Returns:
        Dict with ``heating_temperature``, ``heating_time``,
        ``heating_atmosphere``, ``mixing_device`` and ``mixing_media``.
        Fields that do not apply to ``op_type`` are ``[]`` (atmosphere) or
        None (mixing device/media).
    """
    heating_atmosphere = []
    mixing_device = None
    mixing_media = None

    if op_type == 'HeatingOperation':
        # Every non-blank environment entry counts as an atmosphere.
        heating_atmosphere = [x.strip() for x in cond['environment'] if x.strip()]
    elif op_type == 'MixingOperation':
        # Entry 0 is the media and entry 1 the device. A short list yields
        # None instead of an IndexError, so one incomplete record cannot
        # abort the whole conversion.
        environment = cond['environment']
        mixing_media = _environment_entry(environment, 0)
        mixing_device = _environment_entry(environment, 1)

    return {
        'heating_temperature': [convert_value(x) for x in cond['temperature']],
        'heating_time': [convert_value(x) for x in cond['time']],
        'heating_atmosphere': heating_atmosphere,
        'mixing_device': mixing_device,
        'mixing_media': mixing_media,
    }
|
||
|
||
# Collects every operation type passed through convert_op().
all_posible_ops = set()


def convert_op(op):
    """Convert one operation dictionary.

    Records the operation's ``type`` in ``all_posible_ops`` and returns a
    dict holding the type, the operation's ``string`` (as ``token``) and the
    converted ``attributes`` (as ``conditions``).
    """
    op_type = op['type']
    all_posible_ops.add(op_type)
    converted = {'type': op_type, 'token': op['string']}
    converted['conditions'] = convert_conditions(op['attributes'], op_type)
    return converted
|
||
|
||
def convert_mat_value(val):
    """Normalise one variable-value entry of a material.

    Returns a dict with ``values`` (list of floats) and
    ``min_value``/``max_value`` (float, or None when the input is None).
    """
    converted = {'values': list(map(float, val['values']))}
    for bound in ('min_value', 'max_value'):
        raw = val[bound]
        converted[bound] = None if raw is None else float(raw)
    return converted
|
||
|
||
def _optional_str(value):
    """Return ``str(value)``, or None when ``value`` is None or stringifies to ''."""
    if value is None:
        return None
    return str(value) or None


def convert_material(mat):
    """Convert materials dictionaries.

    Arguments:
        mat: Material dict with the keys ``material_string``,
            ``material_name``, ``material_formula``, ``phase``,
            ``is_acronym``, ``composition``, ``amounts_vars``,
            ``elements_vars``, ``additives`` and ``oxygen_deficiency``.

    Returns:
        Dict with the same keys and normalised value types. ``phase`` and
        ``oxygen_deficiency`` are None when missing (None) or empty; blank
        entries are dropped from ``elements_vars`` and ``additives``.
    """
    return {
        'material_string': str(mat['material_string']),
        'material_name': str(mat['material_name']),
        'material_formula': str(mat['material_formula']),
        # str(None) is the truthy string 'None', so `str(x) or None` would
        # store 'None' for a missing phase; check for None explicitly.
        'phase': _optional_str(mat['phase']),
        'is_acronym': bool(mat['is_acronym']),
        'composition': [{
            'formula': str(x['formula']),
            'amount': str(x['amount']),
            'elements': {str(y): str(z) for y, z in x['elements'].items()}
        } for x in mat['composition']],
        'amounts_vars': {x: convert_mat_value(y) for x, y in mat['amounts_vars'].items()},
        'elements_vars': {x: [str(z.strip()) for z in y if z.strip()] for x, y in mat['elements_vars'].items()},
        'additives': [str(x.strip()) for x in mat['additives'] if x.strip()],
        'oxygen_deficiency': _optional_str(mat['oxygen_deficiency']),
    }
|
||
|
||
def get_material_formula(mat):
    """Convert string material formulas into pymatgen Compositions.

    The ``material_formula`` string is parsed first, after removing any
    '·<n>H2O' part. If that fails, the result is the sum of the compositions
    built from the ``elements`` of each entry in ``mat['composition']``.

    Returns:
        A Composition, or None when the formula cannot be parsed and
        ``mat['composition']`` is empty.

    Errors raised while building the fallback (e.g. a non-numeric element
    amount) propagate to the caller.
    """
    formula = re.sub(r'·\d*H2O', '', mat['material_formula'])
    try:
        return Composition(formula)
    except (CompositionError, ValueError):
        # The other parsers in this file treat both exception types as
        # "cannot parse"; do the same here so the per-element fallback is
        # tried in either case.
        pass

    total = None
    for comp in mat['composition']:
        part = Composition({x: float(y) for x, y in comp['elements'].items()})
        total = part if total is None else total + part
    return total
|
||
|
||
def target_comps(doc):
    """Parse the entries of ``doc['targets_string']`` into Compositions.

    Blank entries and entries that cannot be parsed are left out of the
    returned list.
    """
    parsed = []
    for raw in doc['targets_string']:
        if raw.strip():
            try:
                comp = Composition(raw)
            except (CompositionError, ValueError):
                continue
            parsed.append(comp)
    return parsed
|
||
|
||
def precursor_comps(doc):
    """Find all precursor material formulas and convert them into Composition.

    Precursors whose formula cannot be converted are left out of the result.
    """
    result = []
    for x in doc['precursors']:
        try:
            comp = get_material_formula(x)
        except (CompositionError, ValueError):
            continue
        # get_material_formula returns None when it has nothing to build a
        # composition from; callers access Composition attributes on every
        # element, so a None entry must not be passed on.
        if comp is not None:
            result.append(comp)
    return result
|
||
|
||
def convert_one(doc):
    """Convert an entire synthesis recipe.

    Arguments:
        doc: Source recipe document. ``operations`` is optional; the other
            keys read here are required.

    Returns:
        Dict with the recipe's DOI, paragraph text, synthesis type, reaction,
        target/precursor materials with their parsed formulas, and the
        converted operations.
    """
    # Parse the formulas once; each list feeds two output fields.
    targets = target_comps(doc)
    precursors = precursor_comps(doc)

    return {
        'doi': str(doc['doi']),
        'paragraph_string': ' '.join(doc['ext_paragraph']),
        'synthesis_type': str(doc['synthesis_type']),
        'reaction_string': str(doc['reaction_string']),
        'reaction': {
            'left_side': [{
                'amount': str(x['amount']),
                'material': str(x['material'])
            } for x in doc['reaction']['left']],
            'right_side': [{
                'amount': str(x['amount']),
                'material': str(x['material'])
            } for x in doc['reaction']['right']],
        },
        'targets_formula': [json.loads(x.to_json()) for x in targets],
        'target': convert_material(doc['target']),
        'targets_formula_s': [x.reduced_formula for x in targets],
        'precursors_formula_s': [x.reduced_formula for x in precursors],
        'precursors_formula': [json.loads(x.to_json()) for x in precursors],
        'precursors': [convert_material(x) for x in doc['precursors']],
        'operations': [convert_op(x) for x in doc.get('operations', [])]
    }
|
||
|
||
def main():
    """
    Convert the Reactions_Solid_State/Reactions_Sol_Gel collection in
    Ceder Group database into a json file which can be imported into the MP database.

    The MongoDB URI is read from the ``SYNPRO_URI`` environment variable and
    the result is written to ``synthesis_recipes.json`` in the current
    directory.
    """
    synthesis_recipes = []

    # Use the client as a context manager so the connection is closed even
    # when a document fails to convert.
    with MongoClient(os.environ['SYNPRO_URI']) as client:
        synpro_db = client.SynPro
        for collection_name in ('Reactions_Solid_State', 'Reactions_Sol_Gel'):
            for item in tqdm(synpro_db[collection_name].find()):
                synthesis_recipes.append(convert_one(item))

    with open('synthesis_recipes.json', 'w') as f:
        json.dump(synthesis_recipes, f)

    print('All possible operation types', all_posible_ops)


if __name__ == '__main__':
    main()
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
from mp_api.routes.synthesis.models.core import ( | ||
SynthesisRecipe, | ||
SynthesisTypeEnum, | ||
) | ||
from mp_api.routes.synthesis.models.materials import ( | ||
Component, | ||
ExtractedMaterial, | ||
) | ||
from mp_api.routes.synthesis.models.operations import ( | ||
Value, | ||
Conditions, | ||
Operation, | ||
OperationTypeEnum, | ||
) | ||
from mp_api.routes.synthesis.models.reaction import ( | ||
FormulaPart, | ||
ReactionFormula, | ||
) |
Oops, something went wrong.