From fc1b5e1552677e880a08bfa90ff0580401458bdd Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 27 Apr 2018 14:31:19 -0700 Subject: [PATCH 01/97] start cli with add_tasks command --- emmet/scripts/__init__.py | 0 emmet/scripts/emmet.py | 76 +++++++++++++++++++++++++++++++++++++++ setup.py | 10 ++++-- 3 files changed, 84 insertions(+), 2 deletions(-) create mode 100644 emmet/scripts/__init__.py create mode 100644 emmet/scripts/emmet.py diff --git a/emmet/scripts/__init__.py b/emmet/scripts/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py new file mode 100644 index 0000000000..3522c87c12 --- /dev/null +++ b/emmet/scripts/emmet.py @@ -0,0 +1,76 @@ +import click, os +from atomate.vasp.database import VaspCalcDb + +@click.group() +def cli(): + pass + +@cli.command() +@click.option('--source_db_file', default="source.json", help='source db file') +@click.option('--target_db_file', default="target.json", help='target db file') +@click.option('--tag', default=None, help='only insert tasks with specific tag') +@click.option('--insert/--no-insert', default=False, help='actually execute task addition') +def add_tasks(source_db_file, target_db_file, tag, insert): + """Retrieve tasks from source and add to target""" + + def get_subdir(dn): + return dn.rsplit(os.sep, 1)[-1] + + if not os.path.exists(source_db_file): + print(source_db_file, 'not found!') + return + source = VaspCalcDb.from_db_file(source_db_file, admin=True) # '../config/db.json' + print('connected to source db with', source.collection.count(), 'tasks') + + if not os.path.exists(target_db_file): + print(target_db_file, 'not found!') + return + target = VaspCalcDb.from_db_file(target_db_file, admin=True) # 'db_atomate.json' + print('connected to target db with', target.collection.count(), 'tasks') + + tags = [tag] + if tag is None: + tags = [t for t in source.collection.distinct('tags') if t is not None] + print(len(tags), 'tags in source collection') + + for t in tags: + + print('tag:', t) + query = {'tags': t} + source_count = source.collection.count(query) + print('source:', source_count, 'tasks out of', source.collection.count()) + print('target:', target.collection.count(query), 'tasks out of', target.collection.count()) + + # skip tasks with task_id existing in target (have to be a string [mp-*, mvc-*]) + source_task_ids = source.collection.find(query).distinct('task_id') + source_mp_task_ids = [task_id for task_id in source_task_ids if isinstance(task_id, str)] + skip_task_ids = target.collection.find({'task_id': {'$in': source_mp_task_ids}}).distinct('task_id') + print('skip', len(skip_task_ids), 'existing MP task ids out of', len(source_mp_task_ids)) + + query.update({'task_id': {'$nin': skip_task_ids}}) + already_inserted_subdirs = [get_subdir(dn) for dn in target.collection.find(query).distinct('dir_name')] + subdirs = [get_subdir(dn) for dn in source.collection.find(query).distinct('dir_name') if get_subdir(dn) not in already_inserted_subdirs] + print(len(subdirs), 'candidate tasks to insert') + if len(subdirs) < 1: + continue + + if not insert: + print('add --insert flag to actually add tasks to production') + continue + + for subdir in subdirs: + subdir_query = {'dir_name': {'$regex': '/{}$'.format(subdir)}} + doc = target.collection.find_one(subdir_query, {'task_id': 1}) + if doc: + print(subdir, 'already inserted as', doc['task_id']) + continue + + source_task_id = source.collection.find_one(subdir_query, {'task_id': 1})['task_id'] + 
print('retrieve', source_task_id, 'for', subdir) + task_doc = source.retrieve_task(source_task_id) + + if isinstance(task_doc['task_id'], int): + c = target.db.counter.find_one_and_update({"_id": "taskid"}, {"$inc": {"c": 1}}, return_document=ReturnDocument.AFTER)["c"] + task_doc['task_id'] = 'mp-{}'.format(c) + + target.insert_task(task_doc, use_gridfs=True) diff --git a/setup.py b/setup.py index 360464490d..2e76da10e2 100644 --- a/setup.py +++ b/setup.py @@ -17,12 +17,13 @@ author_email='matproj-develop@googlegroups.com', license='modified BSD', packages=find_packages(), + include_package_data=True, package_data={}, zip_safe=False, install_requires=[ 'atomate', 'pymatgen>=2018.4.20','maggma','monty', 'six', 'pydash', 'tqdm', 'matminer', - 'prettyplotlib', "pybtex" + 'prettyplotlib', "pybtex", "Click" ], classifiers=["Programming Language :: Python :: 3", "Programming Language :: Python :: 3.6", @@ -34,5 +35,10 @@ 'Topic :: Other/Nonlisted Topic', 'Topic :: Scientific/Engineering'], test_suite='nose.collector', - tests_require=['nose'] + tests_require=['nose'], + py_modules=['emmet'], + entry_points=''' + [console_scripts] + emmet=emmet.scripts.emmet:cli + ''', ) From 6d6d8f31783920c90031030088adf354f0c7903e Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 10 May 2018 13:44:12 -0700 Subject: [PATCH 02/97] save progress on add_tasks cli --- emmet/scripts/emmet.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 3522c87c12..d083c72691 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,5 +1,6 @@ import click, os from atomate.vasp.database import VaspCalcDb +from pymongo.collection import ReturnDocument @click.group() def cli(): @@ -13,6 +14,8 @@ def cli(): def add_tasks(source_db_file, target_db_file, tag, insert): """Retrieve tasks from source and add to target""" + exclude = {'tags': {'$ne': 'deprecated'}} + def get_subdir(dn): return dn.rsplit(os.sep, 1)[-1] @@ -28,32 +31,40 @@ def get_subdir(dn): target = VaspCalcDb.from_db_file(target_db_file, admin=True) # 'db_atomate.json' print('connected to target db with', target.collection.count(), 'tasks') + indexes = ['task_id', 'tags', 'dir_name'] + for index in indexes: + for db in [source, target]: + keys = [k.rsplit('_', 1)[0] for k in db.collection.index_information().keys()] + if index not in keys: + db.collection.ensure_index(index) + print('ensured index', index) + tags = [tag] if tag is None: - tags = [t for t in source.collection.distinct('tags') if t is not None] + tags = [t for t in source.collection.find(exclude).distinct('tags') if t is not None] print(len(tags), 'tags in source collection') for t in tags: - print('tag:', t) - query = {'tags': t} + print('### {} ###'.format(t)) + query = {'$and': [{'tags': t}, exclude]} source_count = source.collection.count(query) - print('source:', source_count, 'tasks out of', source.collection.count()) - print('target:', target.collection.count(query), 'tasks out of', target.collection.count()) + print('source / target:', source_count, '/', target.collection.count(query)) # skip tasks with task_id existing in target (have to be a string [mp-*, mvc-*]) source_task_ids = source.collection.find(query).distinct('task_id') source_mp_task_ids = [task_id for task_id in source_task_ids if isinstance(task_id, str)] skip_task_ids = target.collection.find({'task_id': {'$in': source_mp_task_ids}}).distinct('task_id') - print('skip', len(skip_task_ids), 'existing MP task ids out of', 
len(source_mp_task_ids)) + if len(skip_task_ids): + print('skip', len(skip_task_ids), 'existing MP task ids out of', len(source_mp_task_ids)) query.update({'task_id': {'$nin': skip_task_ids}}) already_inserted_subdirs = [get_subdir(dn) for dn in target.collection.find(query).distinct('dir_name')] subdirs = [get_subdir(dn) for dn in source.collection.find(query).distinct('dir_name') if get_subdir(dn) not in already_inserted_subdirs] - print(len(subdirs), 'candidate tasks to insert') if len(subdirs) < 1: continue + print(len(subdirs), 'candidate tasks to insert') if not insert: print('add --insert flag to actually add tasks to production') continue From 7f80f9b9bcbb8c25114054dcf3ac6e9529b317b3 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 12 Jun 2018 16:42:44 -0700 Subject: [PATCH 03/97] cli: add_wflows subcommand --- emmet/scripts/emmet.py | 356 +++++++++++++++++++++++++++++++++++++--- emmet/vasp/materials.py | 5 +- 2 files changed, 337 insertions(+), 24 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index d083c72691..40926cb241 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,17 +1,27 @@ -import click, os -from atomate.vasp.database import VaspCalcDb +import click, os, yaml, sys, logging, operator +from collections import Counter +from pymongo import MongoClient from pymongo.collection import ReturnDocument +from pymatgen.analysis.structure_prediction.volume_predictor import DLSVolumePredictor +from pymatgen import Structure +from fireworks import LaunchPad +from atomate.vasp.database import VaspCalcDb +from atomate.vasp.workflows.presets.core import wf_structure_optimization +from atomate.vasp.database import VaspCalcDb +from atomate.vasp.powerups import add_trackers, add_tags, add_additional_fields_to_taskdocs +from emmet.vasp.materials import group_structures, get_sg +from emmet.vasp.task_tagger import task_type +from log4mongo.handlers import MongoHandler @click.group() def cli(): pass @cli.command() -@click.option('--source_db_file', default="source.json", help='source db file') @click.option('--target_db_file', default="target.json", help='target db file') @click.option('--tag', default=None, help='only insert tasks with specific tag') @click.option('--insert/--no-insert', default=False, help='actually execute task addition') -def add_tasks(source_db_file, target_db_file, tag, insert): +def add_tasks(target_db_file, tag, insert): """Retrieve tasks from source and add to target""" exclude = {'tags': {'$ne': 'deprecated'}} @@ -19,11 +29,9 @@ def add_tasks(source_db_file, target_db_file, tag, insert): def get_subdir(dn): return dn.rsplit(os.sep, 1)[-1] - if not os.path.exists(source_db_file): - print(source_db_file, 'not found!') - return - source = VaspCalcDb.from_db_file(source_db_file, admin=True) # '../config/db.json' - print('connected to source db with', source.collection.count(), 'tasks') + lpad = LaunchPad.auto_load() + source = lpad.db.tasks + print('connected to source db with', source.count(), 'tasks') if not os.path.exists(target_db_file): print(target_db_file, 'not found!') @@ -31,28 +39,22 @@ def get_subdir(dn): target = VaspCalcDb.from_db_file(target_db_file, admin=True) # 'db_atomate.json' print('connected to target db with', target.collection.count(), 'tasks') - indexes = ['task_id', 'tags', 'dir_name'] - for index in indexes: - for db in [source, target]: - keys = [k.rsplit('_', 1)[0] for k in db.collection.index_information().keys()] - if index not in keys: - db.collection.ensure_index(index) - print('ensured 
index', index) + ensure_indexes(['task_id', 'tags', 'dir_name'], [source, target.collection]) tags = [tag] if tag is None: - tags = [t for t in source.collection.find(exclude).distinct('tags') if t is not None] + tags = [t for t in source.find(exclude).distinct('tags') if t is not None] print(len(tags), 'tags in source collection') for t in tags: print('### {} ###'.format(t)) query = {'$and': [{'tags': t}, exclude]} - source_count = source.collection.count(query) + source_count = source.count(query) print('source / target:', source_count, '/', target.collection.count(query)) # skip tasks with task_id existing in target (have to be a string [mp-*, mvc-*]) - source_task_ids = source.collection.find(query).distinct('task_id') + source_task_ids = source.find(query).distinct('task_id') source_mp_task_ids = [task_id for task_id in source_task_ids if isinstance(task_id, str)] skip_task_ids = target.collection.find({'task_id': {'$in': source_mp_task_ids}}).distinct('task_id') if len(skip_task_ids): @@ -60,7 +62,7 @@ def get_subdir(dn): query.update({'task_id': {'$nin': skip_task_ids}}) already_inserted_subdirs = [get_subdir(dn) for dn in target.collection.find(query).distinct('dir_name')] - subdirs = [get_subdir(dn) for dn in source.collection.find(query).distinct('dir_name') if get_subdir(dn) not in already_inserted_subdirs] + subdirs = [get_subdir(dn) for dn in source.find(query).distinct('dir_name') if get_subdir(dn) not in already_inserted_subdirs] if len(subdirs) < 1: continue @@ -76,7 +78,7 @@ def get_subdir(dn): print(subdir, 'already inserted as', doc['task_id']) continue - source_task_id = source.collection.find_one(subdir_query, {'task_id': 1})['task_id'] + source_task_id = source.find_one(subdir_query, {'task_id': 1})['task_id'] print('retrieve', source_task_id, 'for', subdir) task_doc = source.retrieve_task(source_task_id) @@ -85,3 +87,315 @@ def get_subdir(dn): task_doc['task_id'] = 'mp-{}'.format(c) target.insert_task(task_doc, use_gridfs=True) + + +@cli.command() +@click.argument('list_of_structures', type=click.File('rb')) +@click.option('-a', '--alt_tasks_db_file', type=click.Path(exists=True), help='config file for alternative tasks collection') +@click.option('--tag', default=None, help='only include structures with specific tag') +@click.option('--insert/--no-insert', default=False, help='actually execute workflow addition') +@click.option('--clear-logs/--no-clear-logs', default=False, help='clear MongoDB logs collection') +def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): + """add workflows for list of structures / SNLs (YAML config or JSON list of pymatgen structures""" + + exclude = {'about.remarks': {'$ne': 'DEPRECATED'}} + + if not insert: + print('DRY RUN! 
Add --insert flag to actually add workflows') + + try: + snl_db_config = yaml.load(list_of_structures) + snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) + snl_db = snl_db_conn[snl_db_config['db']] + snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) + snl_coll = snl_db[snl_db_config['collection']] + except Exception as ex: + print(ex) + # NOTE WIP might change it to use add_snls first, and then add_wflows based on SNL collection only + # TODO load pymatgen structures from JSON file into MongoDB collection + # TODO also fake-tag them, add SNL info + snl_coll = None + print('to be implemented') + return + print('# SNLs:\t', snl_coll.count(exclude)) + + lpad = LaunchPad.auto_load() + + logger = logging.getLogger('add_wflows') + mongo_handler = MongoHandler( + host=lpad.host, port=lpad.port, database_name=lpad.name, collection='add_wflows_logs', + username=lpad.username, password=lpad.password, authentication_db=lpad.name + ) + logger.addHandler(mongo_handler) + ensure_indexes(['level', 'snl_id', 'formula'], [mongo_handler.collection]) + if clear_logs: + mongo_handler.collection.drop() + + if alt_tasks_db_file is not None: + target = VaspCalcDb.from_db_file(alt_tasks_db_file, admin=True) + tasks_coll = target.collection + else: + tasks_coll = lpad.db.tasks + print('# tasks:', tasks_coll.count()) + + structure_keys = ['snl_id', 'lattice', 'sites', 'charge', 'about._materialsproject.task_id'] + NO_POTCARS = ['Po', 'At', 'Rn', 'Fr', 'Ra', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr'] + base_query = {'is_ordered': True, 'is_valid': True, 'nsites': {'$lt': 200}, 'sites.label': {'$nin': ['He', 'Ar', 'Ne']}} # exclude no electroneg elements + task_base_query = {'_mpworks_meta': {'$exists': 0}} + vp = DLSVolumePredictor() + + ensure_indexes(['snl_id', 'reduced_cell_formula', 'about.remarks', 'sites.label'], [snl_coll]) + + tags = [tag] + if tag is None: + tags = dict( + (t, snl_coll.count({'$and': [{'about.remarks': t}, exclude]})) + for t in snl_coll.find(exclude).distinct('about.remarks') if t is not None + ) + tags = sorted(tags.items(), key=operator.itemgetter(1), reverse=True) + print(len(tags), 'tags in source collection') + + canonical_task_structures = {} + grouped_workflow_structures = {} + canonical_workflow_structures = {} + + for tag, ndocs in tags: + query = {'$and': [{'about.remarks': tag}, exclude]} + query.update(base_query) + + # TODO WIP will be removed + if tag == 'new_ordered_icsd_2017': + #TODO for new_ordered_icsd_2017: docs = db.icsd.find(query, {'snl': 1, 'formula_reduced_abc': 1, 'icsd_id': 1, 'elements': 1}) + print(tag, 'TODO implement db.icsd as snl_coll') + continue + elif tag == 'pre-atomate production': + # TODO scan last + continue + + print('aggregate', ndocs, 'structures for', tag, '...') + structure_groups = snl_coll.aggregate([ + {'$match': query}, {'$group': { + '_id': '$reduced_cell_formula', + 'structures': {'$push': dict((k.split('.')[-1], '${}'.format(k)) for k in structure_keys)} + }} + ], allowDiskUse=True, batchSize=50) + + print('loop formulas for', tag, '...') + counter = Counter() + structures, canonical_structures = {}, {} + + for idx_group, group in enumerate(structure_groups): + + counter['formulas'] += 1 + formula = group['_id'] + if formula not in structures: + structures[formula] = {} + if formula not in canonical_structures: + canonical_structures[formula] = {} + if idx_group and not idx_group%1000: + print(idx_group, '...') + + for dct in group['structures']: + if 
mongo_handler.collection.find_one({'level': 'WARNING', 'formula': formula, 'snl_id': dct['snl_id']}): + continue # already checked + counter['structures'] += 1 + s = Structure.from_dict(dct) + s.snl_id = dct['snl_id'] + s.task_id = dct.get('task_id') + s.remove_oxidation_states() + try: + sgnum = get_sg(s) + except Exception as ex: + s.to(fmt='json', filename='sgnum-{}.json'.format(s.snl_id)) + print(str(ex)) + sys.exit(0) + if sgnum not in structures[formula]: + structures[formula][sgnum] = [] + structures[formula][sgnum].append(s) + + for sgnum, slist in structures[formula].items(): + for g in group_structures(slist): + if sgnum not in canonical_structures[formula]: + canonical_structures[formula][sgnum] = [] + canonical_structures[formula][sgnum].append(g[0]) + if len(g) > 1: + for s in g[1:]: + logger.warning('duplicate structure', extra={ + 'formula': formula, 'snl_id': s.snl_id, 'canonical_snl_id': g[0].snl_id + }) + + if not canonical_structures[formula]: + continue + #print(sum([len(x) for x in canonical_structures[formula].values()]), 'canonical structure(s) for', formula) + + if formula not in canonical_workflow_structures: + canonical_workflow_structures[formula], grouped_workflow_structures[formula] = {}, {} + workflows = lpad.workflows.find({'metadata.formula_pretty': formula}, {'metadata.structure': 1, 'nodes': 1, 'parent_links': 1}) + if workflows.count() > 0: + workflow_structures = {} + for wf in workflows: + s = Structure.from_dict(wf['metadata']['structure']) + s.remove_oxidation_states() + sgnum = get_sg(s) + if sgnum in canonical_structures[formula]: + if sgnum not in workflow_structures: + workflow_structures[sgnum] = [] + s.fw_id = [n for n in wf['nodes'] if str(n) not in wf['parent_links']][0] # first node = SO firework + workflow_structures[sgnum].append(s) + if workflow_structures: + for sgnum, slist in workflow_structures.items(): + grouped_workflow_structures[formula][sgnum] = [g for g in group_structures(slist)] + canonical_workflow_structures[formula][sgnum] = [g[0] for g in grouped_workflow_structures[formula][sgnum]] + #print(sum([len(x) for x in canonical_workflow_structures[formula].values()]), 'canonical workflow structure(s) for', formula) + + for idx_canonical, (sgnum, slist) in enumerate(canonical_structures[formula].items()): + + for struc in slist: + + try: + struct = vp.get_predicted_structure(struc) + struct.snl_id, struct.task_id = struc.snl_id, struc.task_id + except Exception as ex: + print('Structure for SNL', struc.snl_id, '--> VP error: use original structure!') + print(ex) + struct = struc + + if not structures_match(struct, struc): + print('Structure for SNL', struc.snl_id, '--> VP mismatch: use original structure!') + struct = struc + + wf_found, readd_wf = False, False + if sgnum in canonical_workflow_structures[formula] and canonical_workflow_structures[formula][sgnum]: + for sidx, s in enumerate(canonical_workflow_structures[formula][sgnum]): + if structures_match(struct, s): + msg = 'Structure for SNL {} already added in WF {}'.format(struct.snl_id, s.fw_id) + print(msg) + if struct.task_id is not None: + task_query = {'task_id': struct.task_id} + task_query.update(task_base_query) + task = tasks_coll.find_one(task_query, ['input.structure']) + if task: + s_task = Structure.from_dict(task['input']['structure']) + s_task.remove_oxidation_states() + if not structures_match(struct, s_task): + msg = ' --> ERROR: Structure for SNL {} does not match {}'.format(struct.snl_id, struct.task_id) + print(msg) + logger.error(msg, extra={ + 
'formula': formula, 'snl_id': struct.snl_id, 'error': 'SNL-TASK structure mismatch' + }) + counter['snl-task_mismatch'] += 1 + else: + msg = ' --> OK: workflow resulted in matching task {}'.format(struct.task_id) + print(msg) + logger.warning(msg, extra={ + 'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id + }) + else: + print(' --> did not find task', struct.task_id, 'for WF', s.fw_id) + fw_ids = [x.fw_id for x in grouped_workflow_structures[formula][sgnum][sidx]] + fws = lpad.fireworks.find({'fw_id': {'$in': fw_ids}}, ['fw_id', 'spec._tasks']) + fw_found = False + for fw in fws: + if fw['spec']['_tasks'][5]['additional_fields'].get('task_id') == struct.task_id: + msg = ' --> OK: workflow {} will result in intended task-id {}'.format(fw['fw_id'], struct.task_id) + print(msg) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id}) + fw_found = True + break + if not fw_found: + print(' --> no WF with enforced task-id', struct.task_id, '-> re-add workflow') + readd_wf = True + break + else: + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id}) + wf_found = True + break + + if wf_found: + continue + + # need to check tasks b/c not every task is guaranteed to have a workflow (e.g. VASP dir parsing) + if not readd_wf: + try: + if formula not in canonical_task_structures: + canonical_task_structures[formula] = {} + task_query = {'formula_pretty': formula} + task_query.update(task_base_query) + tasks = tasks_coll.find(task_query, {'input.structure': 1, 'task_id': 1, 'orig_inputs': 1}) + if tasks.count() > 0: + task_structures = {} + for task in tasks: + task_label = task_type(task['orig_inputs'], include_calc_type=False) + if task_label == "Structure Optimization": + s = Structure.from_dict(task['input']['structure']) + sg = get_sg(s) + if sg in canonical_structures[formula]: + if sg not in task_structures: + task_structures[sg] = [] + s.task_id = task['task_id'] + task_structures[sg].append(s) + if task_structures: + for sg, slist in task_structures.items(): + canonical_task_structures[formula][sg] = [g[0] for g in group_structures(slist)] + #print(sum([len(x) for x in canonical_task_structures[formula].values()]), 'canonical task structure(s) for', formula) + + matched_task_ids = [] + if sgnum in canonical_task_structures[formula] and canonical_task_structures[formula][sgnum]: + for s in canonical_task_structures[formula][sgnum]: + if structures_match(struct, s): + print('Structure for SNL', struct.snl_id, 'already added in task', s.task_id) + matched_task_ids.append(s.task_id) + if struct.task_id is not None and matched_task_ids and struct.task_id not in matched_task_ids: + print(' --> ERROR: task', struct.task_id, 'not in', matched_task_ids) + raise ValueError + if matched_task_ids: + logger.warning('matched task ids', extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id(s)': matched_task_ids}) + continue + except ValueError as ex: + counter['unmatched_task_id'] += 1 + continue + + msg = 'Structure for SNL {} --> ADD WORKFLOW'.format(struct.snl_id) + if struct.task_id is not None: + msg += ' --> enforcing task-id {}'.format(struct.task_id) + print(msg) + + no_potcars = set(NO_POTCARS) & set(struct.composition.elements) + if len(no_potcars) > 0: + msg = 'Structure for SNL {} --> NO POTCARS: {}'.format(struct.snl_id, no_potcars) + print(msg) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'no_potcars': no_potcars}) + continue + + try: + wf = 
wf_structure_optimization(struct, c={'ADD_MODIFY_INCAR': True}) + wf = add_trackers(wf) + wf = add_tags(wf, [tag]) + if struct.task_id is not None: + wf = add_additional_fields_to_taskdocs(wf, update_dict={'task_id': struct.task_id}) + #if struct.icsd_id is not None: + # wf = add_additional_fields_to_taskdocs(wf, update_dict={'icsd_id': struct.icsd_id}) + except: + msg = 'Structure for SNL {} --> SKIP: Could not make workflow'.format(struct.snl_id) + print(msg) + logger.error(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'error': 'could not make workflow'}) + continue + + if insert: + old_new = lpad.add_wf(wf) + logger.warning('workflow added', extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': list(old_new.values())[0]}) + counter['add(ed)'] += 1 + + print(counter) + + +def structures_match(s1, s2): + return bool(len(list(group_structures([s1, s2]))) == 1) + +def ensure_indexes(indexes, colls): + for index in indexes: + for coll in colls: + keys = [k.rsplit('_', 1)[0] for k in coll.index_information().keys()] + if index not in keys: + coll.ensure_index(index) + print('ensured index', index, 'on', coll.full_name) diff --git a/emmet/vasp/materials.py b/emmet/vasp/materials.py index 3a8a41a843..e0ff39b325 100644 --- a/emmet/vasp/materials.py +++ b/emmet/vasp/materials.py @@ -274,6 +274,8 @@ def ensure_indicies(self): self.materials.ensure_index("task_ids") self.materials.ensure_index(self.materials.lu_field) +def get_sg(struc): + return struc.get_space_group_info(symprec=0.1)[1] def structure_metadata(structure): """ @@ -319,9 +321,6 @@ def group_structures(structures, ltol=0.2, stol=0.3, angle_tol=5, separate_mag_o allow_subset=False, comparator=ElementComparator()) - def get_sg(struc): - return struc.get_space_group_info(symprec=0.1)[1] - def get_mag_ordering(struc): return CollinearMagneticStructureAnalyzer(struc).ordering.value From f2ed91ec5f90f7d9caea8fe0de4583a69ecca2bb Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 14 Jun 2018 14:20:43 -0700 Subject: [PATCH 04/97] cli: CursorNotFound, electroneg query --- emmet/scripts/emmet.py | 424 ++++++++++++++++++++++------------------- 1 file changed, 223 insertions(+), 201 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 40926cb241..5920cc9adb 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,6 +1,7 @@ import click, os, yaml, sys, logging, operator -from collections import Counter +from collections import Counter, OrderedDict from pymongo import MongoClient +from pymongo.errors import CursorNotFound from pymongo.collection import ReturnDocument from pymatgen.analysis.structure_prediction.volume_predictor import DLSVolumePredictor from pymatgen import Structure @@ -140,20 +141,29 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): structure_keys = ['snl_id', 'lattice', 'sites', 'charge', 'about._materialsproject.task_id'] NO_POTCARS = ['Po', 'At', 'Rn', 'Fr', 'Ra', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr'] - base_query = {'is_ordered': True, 'is_valid': True, 'nsites': {'$lt': 200}, 'sites.label': {'$nin': ['He', 'Ar', 'Ne']}} # exclude no electroneg elements + no_electroneg = ['He', 'He0+', 'Ar', 'Ar0+', 'Ne', 'Ne0+'] + base_query = {'is_ordered': True, 'is_valid': True, 'nsites': {'$lt': 200}, 'sites.label': {'$nin': no_electroneg}} task_base_query = {'_mpworks_meta': {'$exists': 0}} vp = DLSVolumePredictor() - ensure_indexes(['snl_id', 'reduced_cell_formula', 'about.remarks', 'sites.label'], [snl_coll]) + 
ensure_indexes(['snl_id', 'reduced_cell_formula', 'about.remarks', 'sites.label', 'nsites', 'nelements'], [snl_coll]) - tags = [tag] + tags = [] if tag is None: - tags = dict( - (t, snl_coll.count({'$and': [{'about.remarks': t}, exclude]})) - for t in snl_coll.find(exclude).distinct('about.remarks') if t is not None - ) + query = dict(exclude) + query.update(base_query) + remarks = filter(None, snl_coll.find(query).distinct('about.remarks')) + for t in remarks: + query = {'$and': [{'about.remarks': t}, exclude]} + query.update(base_query) + tags.append((t, snl_coll.count(query))) tags = sorted(tags.items(), key=operator.itemgetter(1), reverse=True) - print(len(tags), 'tags in source collection') + print(len(tags), 'tags in source collection => TOP10:') + print('\n'.join(['{} ({})'.format(*t) for t in tags[:10]])) + else: + query = {'$and': [{'about.remarks': tag}, exclude]} + query.update(base_query) + tags = [(tag, snl_coll.count(query))] canonical_task_structures = {} grouped_workflow_structures = {} @@ -174,217 +184,229 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): print('aggregate', ndocs, 'structures for', tag, '...') structure_groups = snl_coll.aggregate([ - {'$match': query}, {'$group': { + {'$match': query}, {'$sort': OrderedDict([('nelements', 1), ('nsites', 1)])}, + {'$group': { '_id': '$reduced_cell_formula', 'structures': {'$push': dict((k.split('.')[-1], '${}'.format(k)) for k in structure_keys)} }} - ], allowDiskUse=True, batchSize=50) + ], allowDiskUse=True, batchSize=1) print('loop formulas for', tag, '...') counter = Counter() structures, canonical_structures = {}, {} - for idx_group, group in enumerate(structure_groups): - - counter['formulas'] += 1 - formula = group['_id'] - if formula not in structures: - structures[formula] = {} - if formula not in canonical_structures: - canonical_structures[formula] = {} - if idx_group and not idx_group%1000: - print(idx_group, '...') - - for dct in group['structures']: - if mongo_handler.collection.find_one({'level': 'WARNING', 'formula': formula, 'snl_id': dct['snl_id']}): - continue # already checked - counter['structures'] += 1 - s = Structure.from_dict(dct) - s.snl_id = dct['snl_id'] - s.task_id = dct.get('task_id') - s.remove_oxidation_states() - try: - sgnum = get_sg(s) - except Exception as ex: - s.to(fmt='json', filename='sgnum-{}.json'.format(s.snl_id)) - print(str(ex)) - sys.exit(0) - if sgnum not in structures[formula]: - structures[formula][sgnum] = [] - structures[formula][sgnum].append(s) - - for sgnum, slist in structures[formula].items(): - for g in group_structures(slist): - if sgnum not in canonical_structures[formula]: - canonical_structures[formula][sgnum] = [] - canonical_structures[formula][sgnum].append(g[0]) - if len(g) > 1: - for s in g[1:]: - logger.warning('duplicate structure', extra={ - 'formula': formula, 'snl_id': s.snl_id, 'canonical_snl_id': g[0].snl_id - }) - - if not canonical_structures[formula]: - continue - #print(sum([len(x) for x in canonical_structures[formula].values()]), 'canonical structure(s) for', formula) - - if formula not in canonical_workflow_structures: - canonical_workflow_structures[formula], grouped_workflow_structures[formula] = {}, {} - workflows = lpad.workflows.find({'metadata.formula_pretty': formula}, {'metadata.structure': 1, 'nodes': 1, 'parent_links': 1}) - if workflows.count() > 0: - workflow_structures = {} - for wf in workflows: - s = Structure.from_dict(wf['metadata']['structure']) - s.remove_oxidation_states() - sgnum = get_sg(s) 
- if sgnum in canonical_structures[formula]: - if sgnum not in workflow_structures: - workflow_structures[sgnum] = [] - s.fw_id = [n for n in wf['nodes'] if str(n) not in wf['parent_links']][0] # first node = SO firework - workflow_structures[sgnum].append(s) - if workflow_structures: - for sgnum, slist in workflow_structures.items(): - grouped_workflow_structures[formula][sgnum] = [g for g in group_structures(slist)] - canonical_workflow_structures[formula][sgnum] = [g[0] for g in grouped_workflow_structures[formula][sgnum]] - #print(sum([len(x) for x in canonical_workflow_structures[formula].values()]), 'canonical workflow structure(s) for', formula) - - for idx_canonical, (sgnum, slist) in enumerate(canonical_structures[formula].items()): - - for struc in slist: - + try: + for idx_group, group in enumerate(structure_groups): + + counter['formulas'] += 1 + formula = group['_id'] + if formula not in structures: + structures[formula] = {} + if formula not in canonical_structures: + canonical_structures[formula] = {} + if idx_group and not idx_group%1000: + print(idx_group, '...') + + for dct in group['structures']: + if mongo_handler.collection.find_one({'level': 'WARNING', 'formula': formula, 'snl_id': dct['snl_id']}): + continue # already checked + mongo_handler.collection.remove({'level': 'ERROR', 'formula': formula, 'snl_id': dct['snl_id']}) # avoid dups + counter['structures'] += 1 + s = Structure.from_dict(dct) + s.snl_id = dct['snl_id'] + s.task_id = dct.get('task_id') + s.remove_oxidation_states() try: - struct = vp.get_predicted_structure(struc) - struct.snl_id, struct.task_id = struc.snl_id, struc.task_id + sgnum = get_sg(s) except Exception as ex: - print('Structure for SNL', struc.snl_id, '--> VP error: use original structure!') - print(ex) - struct = struc - - if not structures_match(struct, struc): - print('Structure for SNL', struc.snl_id, '--> VP mismatch: use original structure!') - struct = struc - - wf_found, readd_wf = False, False - if sgnum in canonical_workflow_structures[formula] and canonical_workflow_structures[formula][sgnum]: - for sidx, s in enumerate(canonical_workflow_structures[formula][sgnum]): - if structures_match(struct, s): - msg = 'Structure for SNL {} already added in WF {}'.format(struct.snl_id, s.fw_id) - print(msg) - if struct.task_id is not None: - task_query = {'task_id': struct.task_id} - task_query.update(task_base_query) - task = tasks_coll.find_one(task_query, ['input.structure']) - if task: - s_task = Structure.from_dict(task['input']['structure']) - s_task.remove_oxidation_states() - if not structures_match(struct, s_task): - msg = ' --> ERROR: Structure for SNL {} does not match {}'.format(struct.snl_id, struct.task_id) - print(msg) - logger.error(msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, 'error': 'SNL-TASK structure mismatch' - }) - counter['snl-task_mismatch'] += 1 - else: - msg = ' --> OK: workflow resulted in matching task {}'.format(struct.task_id) - print(msg) - logger.warning(msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id - }) - else: - print(' --> did not find task', struct.task_id, 'for WF', s.fw_id) - fw_ids = [x.fw_id for x in grouped_workflow_structures[formula][sgnum][sidx]] - fws = lpad.fireworks.find({'fw_id': {'$in': fw_ids}}, ['fw_id', 'spec._tasks']) - fw_found = False - for fw in fws: - if fw['spec']['_tasks'][5]['additional_fields'].get('task_id') == struct.task_id: - msg = ' --> OK: workflow {} will result in intended task-id {}'.format(fw['fw_id'], 
struct.task_id) - print(msg) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id}) - fw_found = True - break - if not fw_found: - print(' --> no WF with enforced task-id', struct.task_id, '-> re-add workflow') - readd_wf = True - break - else: - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id}) - wf_found = True - break - - if wf_found: + s.to(fmt='json', filename='sgnum-{}.json'.format(s.snl_id)) + msg = 'SNL {}: {}'.format(s.snl_id, ex) + logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'error': str(ex)}) continue + if sgnum not in structures[formula]: + structures[formula][sgnum] = [] + structures[formula][sgnum].append(s) + + for sgnum, slist in structures[formula].items(): + for g in group_structures(slist): + if sgnum not in canonical_structures[formula]: + canonical_structures[formula][sgnum] = [] + canonical_structures[formula][sgnum].append(g[0]) + if len(g) > 1: + for s in g[1:]: + logger.warning('duplicate structure', extra={ + 'formula': formula, 'snl_id': s.snl_id, 'canonical_snl_id': g[0].snl_id + }) + + if not canonical_structures[formula]: + continue + canonical_structures_list = [x for sublist in canonical_structures[formula].values() for x in sublist] + + if formula not in canonical_workflow_structures: + canonical_workflow_structures[formula], grouped_workflow_structures[formula] = {}, {} + workflows = lpad.workflows.find({'metadata.formula_pretty': formula}, {'metadata.structure': 1, 'nodes': 1, 'parent_links': 1}) + if workflows.count() > 0: + workflow_structures = {} + for wf in workflows: + s = Structure.from_dict(wf['metadata']['structure']) + s.remove_oxidation_states() + sgnum = get_sg(s) + if sgnum in canonical_structures[formula]: + if sgnum not in workflow_structures: + workflow_structures[sgnum] = [] + s.fw_id = [n for n in wf['nodes'] if str(n) not in wf['parent_links']][0] # first node = SO firework + workflow_structures[sgnum].append(s) + if workflow_structures: + for sgnum, slist in workflow_structures.items(): + grouped_workflow_structures[formula][sgnum] = [g for g in group_structures(slist)] + canonical_workflow_structures[formula][sgnum] = [g[0] for g in grouped_workflow_structures[formula][sgnum]] + #print(sum([len(x) for x in canonical_workflow_structures[formula].values()]), 'canonical workflow structure(s) for', formula) + + for idx_canonical, (sgnum, slist) in enumerate(canonical_structures[formula].items()): + + for struc in slist: - # need to check tasks b/c not every task is guaranteed to have a workflow (e.g. 
VASP dir parsing) - if not readd_wf: try: - if formula not in canonical_task_structures: - canonical_task_structures[formula] = {} - task_query = {'formula_pretty': formula} - task_query.update(task_base_query) - tasks = tasks_coll.find(task_query, {'input.structure': 1, 'task_id': 1, 'orig_inputs': 1}) - if tasks.count() > 0: - task_structures = {} - for task in tasks: - task_label = task_type(task['orig_inputs'], include_calc_type=False) - if task_label == "Structure Optimization": - s = Structure.from_dict(task['input']['structure']) - sg = get_sg(s) - if sg in canonical_structures[formula]: - if sg not in task_structures: - task_structures[sg] = [] - s.task_id = task['task_id'] - task_structures[sg].append(s) - if task_structures: - for sg, slist in task_structures.items(): - canonical_task_structures[formula][sg] = [g[0] for g in group_structures(slist)] - #print(sum([len(x) for x in canonical_task_structures[formula].values()]), 'canonical task structure(s) for', formula) - - matched_task_ids = [] - if sgnum in canonical_task_structures[formula] and canonical_task_structures[formula][sgnum]: - for s in canonical_task_structures[formula][sgnum]: - if structures_match(struct, s): - print('Structure for SNL', struct.snl_id, 'already added in task', s.task_id) - matched_task_ids.append(s.task_id) - if struct.task_id is not None and matched_task_ids and struct.task_id not in matched_task_ids: - print(' --> ERROR: task', struct.task_id, 'not in', matched_task_ids) - raise ValueError - if matched_task_ids: - logger.warning('matched task ids', extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id(s)': matched_task_ids}) - continue - except ValueError as ex: - counter['unmatched_task_id'] += 1 - continue + struct = vp.get_predicted_structure(struc) + struct.snl_id, struct.task_id = struc.snl_id, struc.task_id + except Exception as ex: + print('Structure for SNL', struc.snl_id, '--> VP error: use original structure!') + print(ex) + struct = struc + + if not structures_match(struct, struc): + print('Structure for SNL', struc.snl_id, '--> VP mismatch: use original structure!') + struct = struc + + wf_found, readd_wf = False, False + if sgnum in canonical_workflow_structures[formula] and canonical_workflow_structures[formula][sgnum]: + for sidx, s in enumerate(canonical_workflow_structures[formula][sgnum]): + if structures_match(struct, s): + msg = 'Structure for SNL {} already added in WF {}'.format(struct.snl_id, s.fw_id) + print(msg) + if struct.task_id is not None: + task_query = {'task_id': struct.task_id} + task_query.update(task_base_query) + task = tasks_coll.find_one(task_query, ['input.structure']) + if task: + s_task = Structure.from_dict(task['input']['structure']) + s_task.remove_oxidation_states() + if not structures_match(struct, s_task): + msg = ' --> ERROR: Structure for SNL {} does not match {}'.format(struct.snl_id, struct.task_id) + print(msg) + logger.error(msg, extra={ + 'formula': formula, 'snl_id': struct.snl_id, 'error': 'SNL-TASK structure mismatch' + }) + counter['snl-task_mismatch'] += 1 + else: + msg = ' --> OK: workflow resulted in matching task {}'.format(struct.task_id) + print(msg) + logger.warning(msg, extra={ + 'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id + }) + else: + print(' --> did not find task', struct.task_id, 'for WF', s.fw_id) + fw_ids = [x.fw_id for x in grouped_workflow_structures[formula][sgnum][sidx]] + fws = lpad.fireworks.find({'fw_id': {'$in': fw_ids}}, ['fw_id', 'spec._tasks']) + fw_found = False + for fw in fws: 
+ if fw['spec']['_tasks'][5]['additional_fields'].get('task_id') == struct.task_id: + msg = ' --> OK: workflow {} will result in intended task-id {}'.format(fw['fw_id'], struct.task_id) + print(msg) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id}) + fw_found = True + break + if not fw_found: + print(' --> no WF with enforced task-id', struct.task_id, '-> re-add workflow') + readd_wf = True + break + else: + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id}) + wf_found = True + break - msg = 'Structure for SNL {} --> ADD WORKFLOW'.format(struct.snl_id) - if struct.task_id is not None: - msg += ' --> enforcing task-id {}'.format(struct.task_id) - print(msg) + if wf_found: + continue - no_potcars = set(NO_POTCARS) & set(struct.composition.elements) - if len(no_potcars) > 0: - msg = 'Structure for SNL {} --> NO POTCARS: {}'.format(struct.snl_id, no_potcars) - print(msg) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'no_potcars': no_potcars}) - continue + # need to check tasks b/c not every task is guaranteed to have a workflow (e.g. VASP dir parsing) + if not readd_wf: + try: + if formula not in canonical_task_structures: + canonical_task_structures[formula] = {} + task_query = {'formula_pretty': formula} + task_query.update(task_base_query) + tasks = tasks_coll.find(task_query, {'input.structure': 1, 'task_id': 1, 'orig_inputs': 1}) + if tasks.count() > 0: + task_structures = {} + for task in tasks: + task_label = task_type(task['orig_inputs'], include_calc_type=False) + if task_label == "Structure Optimization": + s = Structure.from_dict(task['input']['structure']) + sg = get_sg(s) + if sg in canonical_structures[formula]: + if sg not in task_structures: + task_structures[sg] = [] + s.task_id = task['task_id'] + task_structures[sg].append(s) + if task_structures: + for sg, slist in task_structures.items(): + canonical_task_structures[formula][sg] = [g[0] for g in group_structures(slist)] + #print(sum([len(x) for x in canonical_task_structures[formula].values()]), 'canonical task structure(s) for', formula) + + matched_task_ids = [] + if sgnum in canonical_task_structures[formula] and canonical_task_structures[formula][sgnum]: + for s in canonical_task_structures[formula][sgnum]: + if structures_match(struct, s): + print('Structure for SNL', struct.snl_id, 'already added in task', s.task_id) + matched_task_ids.append(s.task_id) + if struct.task_id is not None and matched_task_ids and struct.task_id not in matched_task_ids: + print(' --> ERROR: task', struct.task_id, 'not in', matched_task_ids) + raise ValueError + if matched_task_ids: + logger.warning('matched task ids', extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id(s)': matched_task_ids}) + continue + except ValueError as ex: + counter['unmatched_task_id'] += 1 + continue - try: - wf = wf_structure_optimization(struct, c={'ADD_MODIFY_INCAR': True}) - wf = add_trackers(wf) - wf = add_tags(wf, [tag]) + msg = 'Structure for SNL {} --> ADD WORKFLOW'.format(struct.snl_id) if struct.task_id is not None: - wf = add_additional_fields_to_taskdocs(wf, update_dict={'task_id': struct.task_id}) - #if struct.icsd_id is not None: - # wf = add_additional_fields_to_taskdocs(wf, update_dict={'icsd_id': struct.icsd_id}) - except: - msg = 'Structure for SNL {} --> SKIP: Could not make workflow'.format(struct.snl_id) + msg += ' --> enforcing task-id {}'.format(struct.task_id) print(msg) - logger.error(msg, extra={'formula': 
formula, 'snl_id': struct.snl_id, 'error': 'could not make workflow'}) - continue - if insert: - old_new = lpad.add_wf(wf) - logger.warning('workflow added', extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': list(old_new.values())[0]}) - counter['add(ed)'] += 1 + no_potcars = set(NO_POTCARS) & set(struct.composition.elements) + if len(no_potcars) > 0: + msg = 'Structure for SNL {} --> NO POTCARS: {}'.format(struct.snl_id, no_potcars) + print(msg) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'no_potcars': no_potcars}) + continue + + try: + wf = wf_structure_optimization(struct, c={'ADD_MODIFY_INCAR': True}) + wf = add_trackers(wf) + wf = add_tags(wf, [tag]) + if struct.task_id is not None: + wf = add_additional_fields_to_taskdocs(wf, update_dict={'task_id': struct.task_id}) + #if struct.icsd_id is not None: + # wf = add_additional_fields_to_taskdocs(wf, update_dict={'icsd_id': struct.icsd_id}) + except: + msg = 'Structure for SNL {} --> SKIP: Could not make workflow'.format(struct.snl_id) + print(msg) + logger.error(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'error': 'could not make workflow'}) + continue + + if insert: + old_new = lpad.add_wf(wf) + logger.warning('workflow added', extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': list(old_new.values())[0]}) + counter['add(ed)'] += 1 + + except CursorNotFound as ex: + print(ex) + sites_elements = [ + (len(set([e.symbol for e in x.composition.elements])), x.num_sites) + for x in canonical_structures_list + ] + print(len(canonical_structures_list), 'canonical structure(s) for', formula, sites_elements) print(counter) From 08e56f6117f9731fe0373ebd45b03478de733a78 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 15 Jun 2018 16:57:35 -0700 Subject: [PATCH 05/97] cli: multiple tasks collections, enforce/clean-up task_ids --- emmet/scripts/emmet.py | 181 ++++++++++++++++++++++++++++------------- 1 file changed, 123 insertions(+), 58 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 5920cc9adb..6f553d6c54 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -54,16 +54,27 @@ def get_subdir(dn): source_count = source.count(query) print('source / target:', source_count, '/', target.collection.count(query)) - # skip tasks with task_id existing in target (have to be a string [mp-*, mvc-*]) - source_task_ids = source.find(query).distinct('task_id') - source_mp_task_ids = [task_id for task_id in source_task_ids if isinstance(task_id, str)] - skip_task_ids = target.collection.find({'task_id': {'$in': source_mp_task_ids}}).distinct('task_id') + # skip tasks with task_id existing in target and with matching dir_name (have to be a string [mp-*, mvc-*]) + nr_source_mp_tasks, skip_task_ids = 0, [] + for doc in source.find(query, ['task_id', 'dir_name']): + if isinstance(doc['task_id'], str): + nr_source_mp_tasks += 1 + task_query = {'task_id': doc['task_id'], 'dir_name': doc['dir_name']} + if target.collection.count(task_query): + skip_task_ids.append(doc['task_id']) if len(skip_task_ids): - print('skip', len(skip_task_ids), 'existing MP task ids out of', len(source_mp_task_ids)) + print('skip', len(skip_task_ids), 'existing MP task ids out of', nr_source_mp_tasks) query.update({'task_id': {'$nin': skip_task_ids}}) already_inserted_subdirs = [get_subdir(dn) for dn in target.collection.find(query).distinct('dir_name')] - subdirs = [get_subdir(dn) for dn in source.find(query).distinct('dir_name') if get_subdir(dn) not in already_inserted_subdirs] + 
subdirs = [] + for doc in source.find(query, ['dir_name', 'task_id', 'retired_task_id']): + subdir = get_subdir(doc['dir_name']) + if subdir not in already_inserted_subdirs or 'retired_task_id' in doc: + entry = {'subdir': subdir} + if 'retired_task_id' in doc: + entry.update({'task_id': doc['task_id']}) + subdirs.append(entry) if len(subdirs) < 1: continue @@ -72,11 +83,20 @@ def get_subdir(dn): print('add --insert flag to actually add tasks to production') continue - for subdir in subdirs: - subdir_query = {'dir_name': {'$regex': '/{}$'.format(subdir)}} + for subdir_doc in subdirs: + subdir_query = {'dir_name': {'$regex': '/{}$'.format(subdir_doc['subdir'])}} doc = target.collection.find_one(subdir_query, {'task_id': 1}) if doc: - print(subdir, 'already inserted as', doc['task_id']) + print(subdir_doc['subdir'], 'already inserted as', doc['task_id']) + if 'task_id' in subdir_doc and subdir_doc['task_id'] != doc['task_id']: + target.collection.remove({'task_id': subdir_doc['task_id']}) + target.collection.update( + {'task_id': doc['task_id']}, { + '$set': {'task_id': subdir_doc['task_id'], 'retired_task_id': doc['task_id']}, + '$addToSet': {'tags': t} + } + ) + print('replaced task_id', doc['task_id'], 'with', subdir_doc['task_id']) continue source_task_id = source.find_one(subdir_query, {'task_id': 1})['task_id'] @@ -118,7 +138,7 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): snl_coll = None print('to be implemented') return - print('# SNLs:\t', snl_coll.count(exclude)) + print(snl_coll.count(exclude), 'SNLs in', snl_coll.full_name) lpad = LaunchPad.auto_load() @@ -132,12 +152,13 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): if clear_logs: mongo_handler.collection.drop() - if alt_tasks_db_file is not None: + tasks_collections = OrderedDict() + tasks_collections[lpad.db.tasks.full_name] = lpad.db.tasks + if alt_tasks_db_file is not None: # TODO multiple alt_task_db_files? 
target = VaspCalcDb.from_db_file(alt_tasks_db_file, admin=True) - tasks_coll = target.collection - else: - tasks_coll = lpad.db.tasks - print('# tasks:', tasks_coll.count()) + tasks_collections[target.collection.full_name] = target.collection + for full_name, tasks_coll in tasks_collections.items(): + print(tasks_coll.count(), 'tasks in', full_name) structure_keys = ['snl_id', 'lattice', 'sites', 'charge', 'about._materialsproject.task_id'] NO_POTCARS = ['Po', 'At', 'Rn', 'Fr', 'Ra', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr'] @@ -169,6 +190,41 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): grouped_workflow_structures = {} canonical_workflow_structures = {} + def load_canonical_task_structures(formula, full_name): + if full_name not in canonical_task_structures: + canonical_task_structures[full_name] = {} + if formula not in canonical_task_structures[full_name]: + canonical_task_structures[full_name][formula] = {} + task_query = {'formula_pretty': formula} + task_query.update(task_base_query) + tasks = tasks_collections[full_name].find(task_query, {'input.structure': 1, 'task_id': 1, 'orig_inputs': 1}) + if tasks.count() > 0: + task_structures = {} + for task in tasks: + task_label = task_type(task['orig_inputs'], include_calc_type=False) + if task_label == "Structure Optimization": + s = Structure.from_dict(task['input']['structure']) + sg = get_sg(s) + if sg in canonical_structures[formula]: + if sg not in task_structures: + task_structures[sg] = [] + s.task_id = task['task_id'] + task_structures[sg].append(s) + if task_structures: + for sg, slist in task_structures.items(): + canonical_task_structures[full_name][formula][sg] = [g[0] for g in group_structures(slist)] + #print(sum([len(x) for x in canonical_task_structures[full_name][formula].values()]), 'canonical task structure(s) for', formula) + + def find_matching_canonical_task_structures(formula, struct, full_name): + matched_task_ids = [] + if sgnum in canonical_task_structures[full_name][formula] and canonical_task_structures[full_name][formula][sgnum]: + for s in canonical_task_structures[full_name][formula][sgnum]: + if structures_match(struct, s): + print('Structure for SNL', struct.snl_id, 'already added in task', s.task_id, 'in', full_name) + matched_task_ids.append(s.task_id) + return matched_task_ids + + for tag, ndocs in tags: query = {'$and': [{'about.remarks': tag}, exclude]} query.update(base_query) @@ -278,7 +334,7 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): print('Structure for SNL', struc.snl_id, '--> VP mismatch: use original structure!') struct = struc - wf_found, readd_wf = False, False + wf_found = False if sgnum in canonical_workflow_structures[formula] and canonical_workflow_structures[formula][sgnum]: for sidx, s in enumerate(canonical_workflow_structures[formula][sgnum]): if structures_match(struct, s): @@ -287,7 +343,10 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): if struct.task_id is not None: task_query = {'task_id': struct.task_id} task_query.update(task_base_query) - task = tasks_coll.find_one(task_query, ['input.structure']) + for full_name in reversed(tasks_collections): + task = tasks_collections[full_name].find_one(task_query, ['input.structure']) + if task: + break if task: s_task = Structure.from_dict(task['input']['structure']) s_task.remove_oxidation_states() @@ -317,9 +376,38 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): fw_found = True 
break if not fw_found: - print(' --> no WF with enforced task-id', struct.task_id, '-> re-add workflow') - readd_wf = True - break + print(' --> no WF with enforced task-id', struct.task_id) + fw = lpad.fireworks.find_one({'fw_id': s.fw_id}, {'state': 1}) + print(' -->', s.fw_id, fw['state']) + if fw['state'] == 'COMPLETED': + # the task is in lpad.db.tasks with different integer task_id + # => find task => overwrite task_id => add_tasks will pick it up + full_name = list(tasks_collections.keys())[0] + load_canonical_task_structures(formula, full_name) + matched_task_ids = find_matching_canonical_task_structures(formula, struct, full_name) + if len(matched_task_ids) == 1: + tasks_collections[full_name].update( + {'task_id': matched_task_ids[0]}, { + '$set': {'task_id': struct.task_id, 'retired_task_id': matched_task_ids[0]}, + '$addToSet': {'tags': tag} + } + ) + print(' --> replaced task_id', matched_task_ids[0], 'with', struct.task_id, 'in', full_name) + elif matched_task_ids: + msg = ' --> ERROR: multiple tasks {} for completed WF {}'.format(matched_task_ids, s.fw_id) + print(msg) + logger.error(msg, extra={ + 'formula': formula, 'snl_id': struct.snl_id, 'error': 'Multiple tasks for Completed WF' + }) + else: + msg = ' --> ERROR: task for completed WF {} does not exist!?'.format(s.fw_id) + print(msg) + logger.error(msg, extra={ + 'formula': formula, 'snl_id': struct.snl_id, 'error': 'Task for Completed WF missing' + }) + else: + # update WF to include task_id as additional_field + sys.exit(0) else: logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id}) wf_found = True @@ -329,45 +417,22 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): continue # need to check tasks b/c not every task is guaranteed to have a workflow (e.g. 
VASP dir parsing) - if not readd_wf: - try: - if formula not in canonical_task_structures: - canonical_task_structures[formula] = {} - task_query = {'formula_pretty': formula} - task_query.update(task_base_query) - tasks = tasks_coll.find(task_query, {'input.structure': 1, 'task_id': 1, 'orig_inputs': 1}) - if tasks.count() > 0: - task_structures = {} - for task in tasks: - task_label = task_type(task['orig_inputs'], include_calc_type=False) - if task_label == "Structure Optimization": - s = Structure.from_dict(task['input']['structure']) - sg = get_sg(s) - if sg in canonical_structures[formula]: - if sg not in task_structures: - task_structures[sg] = [] - s.task_id = task['task_id'] - task_structures[sg].append(s) - if task_structures: - for sg, slist in task_structures.items(): - canonical_task_structures[formula][sg] = [g[0] for g in group_structures(slist)] - #print(sum([len(x) for x in canonical_task_structures[formula].values()]), 'canonical task structure(s) for', formula) - - matched_task_ids = [] - if sgnum in canonical_task_structures[formula] and canonical_task_structures[formula][sgnum]: - for s in canonical_task_structures[formula][sgnum]: - if structures_match(struct, s): - print('Structure for SNL', struct.snl_id, 'already added in task', s.task_id) - matched_task_ids.append(s.task_id) - if struct.task_id is not None and matched_task_ids and struct.task_id not in matched_task_ids: - print(' --> ERROR: task', struct.task_id, 'not in', matched_task_ids) - raise ValueError - if matched_task_ids: - logger.warning('matched task ids', extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id(s)': matched_task_ids}) - continue - except ValueError as ex: - counter['unmatched_task_id'] += 1 + try: + matched_task_ids = OrderedDict() + for full_name in reversed(tasks_collections): + load_canonical_task_structures(formula, full_name) + matched_task_ids[full_name] = find_matching_canonical_task_structures(formula, struct, full_name) + if struct.task_id is not None and matched_task_ids[full_name] and struct.task_id not in matched_task_ids[full_name]: + print(' --> ERROR: task', struct.task_id, 'not in', matched_task_ids[full_name]) + raise ValueError + if matched_task_ids[full_name]: + break + if any(matched_task_ids.values()): + logger.warning('matched task ids', extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id(s)': matched_task_ids}) continue + except ValueError as ex: + counter['unmatched_task_id'] += 1 + continue msg = 'Structure for SNL {} --> ADD WORKFLOW'.format(struct.snl_id) if struct.task_id is not None: From ea38779cbb493a5f812bc360f55553915e160f8a Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 20 Jun 2018 11:48:39 -0700 Subject: [PATCH 06/97] cli: save progress --- emmet/scripts/emmet.py | 106 +++++++++++++++++++++++++---------------- 1 file changed, 65 insertions(+), 41 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 6f553d6c54..062517a822 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,4 +1,5 @@ -import click, os, yaml, sys, logging, operator +import click, os, yaml, sys, logging, operator, json +from datetime import datetime from collections import Counter, OrderedDict from pymongo import MongoClient from pymongo.errors import CursorNotFound @@ -27,12 +28,15 @@ def add_tasks(target_db_file, tag, insert): exclude = {'tags': {'$ne': 'deprecated'}} + if not insert: + print('DRY RUN: add --insert flag to actually add tasks to production') + def get_subdir(dn): return dn.rsplit(os.sep, 1)[-1] lpad = 
LaunchPad.auto_load() - source = lpad.db.tasks - print('connected to source db with', source.count(), 'tasks') + source = VaspCalcDb(lpad.host, lpad.port, lpad.name, 'tasks', lpad.username, lpad.password) + print('connected to source db with', source.collection.count(), 'tasks') if not os.path.exists(target_db_file): print(target_db_file, 'not found!') @@ -40,26 +44,26 @@ def get_subdir(dn): target = VaspCalcDb.from_db_file(target_db_file, admin=True) # 'db_atomate.json' print('connected to target db with', target.collection.count(), 'tasks') - ensure_indexes(['task_id', 'tags', 'dir_name'], [source, target.collection]) + ensure_indexes(['task_id', 'tags', 'dir_name', 'retired_task_id'], [source.collection, target.collection]) tags = [tag] if tag is None: - tags = [t for t in source.find(exclude).distinct('tags') if t is not None] + tags = [t for t in source.collection.find(exclude).distinct('tags') if t is not None] print(len(tags), 'tags in source collection') for t in tags: print('### {} ###'.format(t)) query = {'$and': [{'tags': t}, exclude]} - source_count = source.count(query) + source_count = source.collection.count(query) print('source / target:', source_count, '/', target.collection.count(query)) # skip tasks with task_id existing in target and with matching dir_name (have to be a string [mp-*, mvc-*]) nr_source_mp_tasks, skip_task_ids = 0, [] - for doc in source.find(query, ['task_id', 'dir_name']): + for doc in source.collection.find(query, ['task_id', 'dir_name']): if isinstance(doc['task_id'], str): nr_source_mp_tasks += 1 - task_query = {'task_id': doc['task_id'], 'dir_name': doc['dir_name']} + task_query = {'task_id': doc['task_id'], '$or': [{'dir_name': doc['dir_name']}, {'_mpworks_meta': {'$exists': 0}}]} if target.collection.count(task_query): skip_task_ids.append(doc['task_id']) if len(skip_task_ids): @@ -68,7 +72,7 @@ def get_subdir(dn): query.update({'task_id': {'$nin': skip_task_ids}}) already_inserted_subdirs = [get_subdir(dn) for dn in target.collection.find(query).distinct('dir_name')] subdirs = [] - for doc in source.find(query, ['dir_name', 'task_id', 'retired_task_id']): + for doc in source.collection.find(query, ['dir_name', 'task_id', 'retired_task_id']): subdir = get_subdir(doc['dir_name']) if subdir not in already_inserted_subdirs or 'retired_task_id' in doc: entry = {'subdir': subdir} @@ -79,9 +83,6 @@ def get_subdir(dn): continue print(len(subdirs), 'candidate tasks to insert') - if not insert: - print('add --insert flag to actually add tasks to production') - continue for subdir_doc in subdirs: subdir_query = {'dir_name': {'$regex': '/{}$'.format(subdir_doc['subdir'])}} @@ -89,25 +90,46 @@ def get_subdir(dn): if doc: print(subdir_doc['subdir'], 'already inserted as', doc['task_id']) if 'task_id' in subdir_doc and subdir_doc['task_id'] != doc['task_id']: - target.collection.remove({'task_id': subdir_doc['task_id']}) - target.collection.update( - {'task_id': doc['task_id']}, { - '$set': {'task_id': subdir_doc['task_id'], 'retired_task_id': doc['task_id']}, - '$addToSet': {'tags': t} - } - ) + if insert: + target.collection.remove({'task_id': subdir_doc['task_id']}) + target.collection.update( + {'task_id': doc['task_id']}, { + '$set': {'task_id': subdir_doc['task_id'], 'retired_task_id': doc['task_id'], 'last_updated': datetime.utcnow()}, + '$addToSet': {'tags': t} + } + ) print('replaced task_id', doc['task_id'], 'with', subdir_doc['task_id']) continue - source_task_id = source.find_one(subdir_query, {'task_id': 1})['task_id'] - print('retrieve', 
source_task_id, 'for', subdir) + source_task_id = source.collection.find_one(subdir_query, {'task_id': 1})['task_id'] + print('retrieve', source_task_id, 'for', subdir_doc['subdir']) task_doc = source.retrieve_task(source_task_id) if isinstance(task_doc['task_id'], int): - c = target.db.counter.find_one_and_update({"_id": "taskid"}, {"$inc": {"c": 1}}, return_document=ReturnDocument.AFTER)["c"] - task_doc['task_id'] = 'mp-{}'.format(c) + if insert: + c = target.db.counter.find_one_and_update({"_id": "taskid"}, {"$inc": {"c": 1}}, return_document=ReturnDocument.AFTER)["c"] + task_doc['task_id'] = 'mp-{}'.format(c) + else: + task = target.collection.find_one({'task_id': task_doc['task_id']}, ['orig_inputs', 'output.structure']) + if task: + task_label = task_type(task['orig_inputs'], include_calc_type=False) + if task_label == "Structure Optimization": + s1 = Structure.from_dict(task['output']['structure']) + s2 = Structure.from_dict(task_doc['output']['structure']) + if structures_match(s1, s2): + if insert: + target.collection.remove({'task_id': task_doc['task_id']}) + print('INFO: removed old task!') + else: + print('ERROR: structures do not match!') + #json.dump({'old': s1.as_dict(), 'new': s2.as_dict()}, open('{}.json'.format(task_doc['task_id']), 'w')) + continue + else: + print('ERROR: not a SO task!') + continue - target.insert_task(task_doc, use_gridfs=True) + if insert: + target.insert_task(task_doc, use_gridfs=True) @cli.command() @@ -169,7 +191,7 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): ensure_indexes(['snl_id', 'reduced_cell_formula', 'about.remarks', 'sites.label', 'nsites', 'nelements'], [snl_coll]) - tags = [] + tags = OrderedDict() if tag is None: query = dict(exclude) query.update(base_query) @@ -177,14 +199,14 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): for t in remarks: query = {'$and': [{'about.remarks': t}, exclude]} query.update(base_query) - tags.append((t, snl_coll.count(query))) - tags = sorted(tags.items(), key=operator.itemgetter(1), reverse=True) + tags[t] = snl_coll.count(query) + tags = OrderedDict((el[0], el[1]) for el in sorted(tags.items(), key=operator.itemgetter(1), reverse=True)) print(len(tags), 'tags in source collection => TOP10:') - print('\n'.join(['{} ({})'.format(*t) for t in tags[:10]])) + print('\n'.join(['{} ({})'.format(k, v) for k, v in list(tags.items())[:10]])) else: query = {'$and': [{'about.remarks': tag}, exclude]} query.update(base_query) - tags = [(tag, snl_coll.count(query))] + tags = OrderedDict((tag, snl_coll.count(query))) canonical_task_structures = {} grouped_workflow_structures = {} @@ -225,7 +247,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): return matched_task_ids - for tag, ndocs in tags: + for tag, ndocs in tags.items(): query = {'$and': [{'about.remarks': tag}, exclude]} query.update(base_query) @@ -316,7 +338,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): for sgnum, slist in workflow_structures.items(): grouped_workflow_structures[formula][sgnum] = [g for g in group_structures(slist)] canonical_workflow_structures[formula][sgnum] = [g[0] for g in grouped_workflow_structures[formula][sgnum]] - #print(sum([len(x) for x in canonical_workflow_structures[formula].values()]), 'canonical workflow structure(s) for', formula) + print(sum([len(x) for x in canonical_workflow_structures[formula].values()]), 'canonical workflow structure(s) for', formula) for idx_canonical, (sgnum, slist) in 
enumerate(canonical_structures[formula].items()): @@ -388,7 +410,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if len(matched_task_ids) == 1: tasks_collections[full_name].update( {'task_id': matched_task_ids[0]}, { - '$set': {'task_id': struct.task_id, 'retired_task_id': matched_task_ids[0]}, + '$set': {'task_id': struct.task_id, 'retired_task_id': matched_task_ids[0], 'last_updated': datetime.utcnow()}, '$addToSet': {'tags': tag} } ) @@ -406,8 +428,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): 'formula': formula, 'snl_id': struct.snl_id, 'error': 'Task for Completed WF missing' }) else: - # update WF to include task_id as additional_field - sys.exit(0) + print(' --> TODO: update {} WF to include task_id as additional_field'.format(fw['state'], s.fw_id)) else: logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id}) wf_found = True @@ -428,17 +449,15 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if matched_task_ids[full_name]: break if any(matched_task_ids.values()): - logger.warning('matched task ids', extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id(s)': matched_task_ids}) + logger.warning('matched task ids', extra={ + 'formula': formula, 'snl_id': struct.snl_id, + 'task_id(s)': dict((k.replace('.', '#'), v) for k, v in matched_task_ids.items()) + }) continue except ValueError as ex: counter['unmatched_task_id'] += 1 continue - msg = 'Structure for SNL {} --> ADD WORKFLOW'.format(struct.snl_id) - if struct.task_id is not None: - msg += ' --> enforcing task-id {}'.format(struct.task_id) - print(msg) - no_potcars = set(NO_POTCARS) & set(struct.composition.elements) if len(no_potcars) > 0: msg = 'Structure for SNL {} --> NO POTCARS: {}'.format(struct.snl_id, no_potcars) @@ -460,9 +479,14 @@ def find_matching_canonical_task_structures(formula, struct, full_name): logger.error(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'error': 'could not make workflow'}) continue + msg = 'Structure for SNL {} --> ADD WORKFLOW'.format(struct.snl_id) + if struct.task_id is not None: + msg += ' --> enforcing task-id {}'.format(struct.task_id) + print(msg) + if insert: old_new = lpad.add_wf(wf) - logger.warning('workflow added', extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': list(old_new.values())[0]}) + #logger.warning('workflow added', extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': list(old_new.values())[0]}) counter['add(ed)'] += 1 except CursorNotFound as ex: From 609ea7e1ed2c9fb4d1f5089094b951a77cd93101 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 20 Jun 2018 15:25:02 -0700 Subject: [PATCH 07/97] cli: resolve/cleanup task_id errors --- emmet/scripts/emmet.py | 63 +++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 062517a822..4ff07642b9 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -6,7 +6,7 @@ from pymongo.collection import ReturnDocument from pymatgen.analysis.structure_prediction.volume_predictor import DLSVolumePredictor from pymatgen import Structure -from fireworks import LaunchPad +from fireworks import LaunchPad, Workflow from atomate.vasp.database import VaspCalcDb from atomate.vasp.workflows.presets.core import wf_structure_optimization from atomate.vasp.database import VaspCalcDb @@ -186,7 +186,7 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): 
NO_POTCARS = ['Po', 'At', 'Rn', 'Fr', 'Ra', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr'] no_electroneg = ['He', 'He0+', 'Ar', 'Ar0+', 'Ne', 'Ne0+'] base_query = {'is_ordered': True, 'is_valid': True, 'nsites': {'$lt': 200}, 'sites.label': {'$nin': no_electroneg}} - task_base_query = {'_mpworks_meta': {'$exists': 0}} + task_base_query = {'tags': {'$ne': 'deprecated'}, '_mpworks_meta': {'$exists': 0}} vp = DLSVolumePredictor() ensure_indexes(['snl_id', 'reduced_cell_formula', 'about.remarks', 'sites.label', 'nsites', 'nelements'], [snl_coll]) @@ -206,7 +206,7 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): else: query = {'$and': [{'about.remarks': tag}, exclude]} query.update(base_query) - tags = OrderedDict((tag, snl_coll.count(query))) + tags[tag] = snl_coll.count(query) canonical_task_structures = {} grouped_workflow_structures = {} @@ -299,6 +299,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): except Exception as ex: s.to(fmt='json', filename='sgnum-{}.json'.format(s.snl_id)) msg = 'SNL {}: {}'.format(s.snl_id, ex) + print(msg) logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'error': str(ex)}) continue if sgnum not in structures[formula]: @@ -338,7 +339,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): for sgnum, slist in workflow_structures.items(): grouped_workflow_structures[formula][sgnum] = [g for g in group_structures(slist)] canonical_workflow_structures[formula][sgnum] = [g[0] for g in grouped_workflow_structures[formula][sgnum]] - print(sum([len(x) for x in canonical_workflow_structures[formula].values()]), 'canonical workflow structure(s) for', formula) + #print(sum([len(x) for x in canonical_workflow_structures[formula].values()]), 'canonical workflow structure(s) for', formula) for idx_canonical, (sgnum, slist) in enumerate(canonical_structures[formula].items()): @@ -374,10 +375,10 @@ def find_matching_canonical_task_structures(formula, struct, full_name): s_task.remove_oxidation_states() if not structures_match(struct, s_task): msg = ' --> ERROR: Structure for SNL {} does not match {}'.format(struct.snl_id, struct.task_id) + msg += ' --> CLEANUP: remove task_id from SNL' print(msg) - logger.error(msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, 'error': 'SNL-TASK structure mismatch' - }) + snl_coll.update({'snl_id': struct.snl_id}, {'$unset': {'about._materialsproject.task_id': 1}}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id}) counter['snl-task_mismatch'] += 1 else: msg = ' --> OK: workflow resulted in matching task {}'.format(struct.task_id) @@ -422,13 +423,15 @@ def find_matching_canonical_task_structures(formula, struct, full_name): 'formula': formula, 'snl_id': struct.snl_id, 'error': 'Multiple tasks for Completed WF' }) else: - msg = ' --> ERROR: task for completed WF {} does not exist!?'.format(s.fw_id) + msg = ' --> ERROR: task for completed WF {} does not exist!'.format(s.fw_id) + msg += ' --> CLEANUP: delete {} WF and re-add/run to enforce task-id {}'.format(fw['state'], struct.task_id) print(msg) - logger.error(msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, 'error': 'Task for Completed WF missing' - }) + lpad.delete_wf(s.fw_id) + break else: - print(' --> TODO: update {} WF to include task_id as additional_field'.format(fw['state'], s.fw_id)) + print(' --> CLEANUP: delete {} WF and re-add to include task_id as additional_field'.format(fw['state'], s.fw_id)) + 
lpad.delete_wf(s.fw_id) + break else: logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id}) wf_found = True @@ -438,24 +441,20 @@ def find_matching_canonical_task_structures(formula, struct, full_name): continue # need to check tasks b/c not every task is guaranteed to have a workflow (e.g. VASP dir parsing) - try: - matched_task_ids = OrderedDict() - for full_name in reversed(tasks_collections): - load_canonical_task_structures(formula, full_name) - matched_task_ids[full_name] = find_matching_canonical_task_structures(formula, struct, full_name) - if struct.task_id is not None and matched_task_ids[full_name] and struct.task_id not in matched_task_ids[full_name]: - print(' --> ERROR: task', struct.task_id, 'not in', matched_task_ids[full_name]) - raise ValueError - if matched_task_ids[full_name]: - break - if any(matched_task_ids.values()): - logger.warning('matched task ids', extra={ - 'formula': formula, 'snl_id': struct.snl_id, - 'task_id(s)': dict((k.replace('.', '#'), v) for k, v in matched_task_ids.items()) - }) - continue - except ValueError as ex: - counter['unmatched_task_id'] += 1 + msg, matched_task_ids = '', OrderedDict() + for full_name in reversed(tasks_collections): + load_canonical_task_structures(formula, full_name) + matched_task_ids[full_name] = find_matching_canonical_task_structures(formula, struct, full_name) + if struct.task_id is not None and matched_task_ids[full_name] and struct.task_id not in matched_task_ids[full_name]: + msg = ' --> WARNING: task {} not in {}'.format(struct.task_id, matched_task_ids[full_name]) + print(msg) + if matched_task_ids[full_name]: + break + if any(matched_task_ids.values()): + logger.warning('matched task ids' + msg, extra={ + 'formula': formula, 'snl_id': struct.snl_id, + 'task_id(s)': dict((k.replace('.', '#'), v) for k, v in matched_task_ids.items()) + }) continue no_potcars = set(NO_POTCARS) & set(struct.composition.elements) @@ -486,7 +485,9 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if insert: old_new = lpad.add_wf(wf) - #logger.warning('workflow added', extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': list(old_new.values())[0]}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': list(old_new.values())[0]}) + else: + logger.error(msg + ' --> DRY RUN', extra={'formula': formula, 'snl_id': struct.snl_id}) counter['add(ed)'] += 1 except CursorNotFound as ex: From 1fa945b161a5a278d5305b2f1c7536390d05734a Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 21 Jun 2018 14:41:07 -0700 Subject: [PATCH 08/97] cli: add report subcommand --- emmet/scripts/emmet.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 4ff07642b9..7455af9acd 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -511,3 +511,43 @@ def ensure_indexes(indexes, colls): if index not in keys: coll.ensure_index(index) print('ensured index', index, 'on', coll.full_name) + + +@cli.command() +@click.option('--tag', default=None, help='only include structures with specific tag') +def report(tag): + """generate a report of calculations status""" + + lpad = LaunchPad.auto_load() + states = ['COMPLETED', 'FIZZLED', 'READY', 'RUNNING'] + + tags = [tag] + if tag is None: + tags = [t for t in lpad.workflows.distinct('metadata.tags') if t is not None] + print(len(tags), 'tags in workflows collection') + + from prettytable import PrettyTable + table = 
PrettyTable() + table.field_names = ['tag', 'workflows'] + states + ['% FIZZLED', 'progress'] + + for t in tags: + wflows = lpad.workflows.find({'metadata.tags': t}, {'state': 1}) + counter = Counter([wf['state'] for wf in wflows]) + total = sum(v for k, v in counter.items() if k in states) + tc, progress = t, '-' + if counter['COMPLETED'] + counter['FIZZLED'] != total: + tc = "\033[1;34m{}\033[0m".format(t) + progress = (counter['COMPLETED'] + counter['FIZZLED']) / total * 100. + progress = '{:.0f}%'.format(progress) + entry = [tc, total] + [counter[state] for state in states] + fizzled = counter['FIZZLED'] / total + percent_fizzled = "\033[1;31m{:.0f}%\033[0m".format(fizzled*100.) \ + if fizzled > 0.2 else '{:.0f}%'.format(fizzled*100.) + entry.append(percent_fizzled) + entry.append(progress) + table.add_row(entry) + + table.sortby = 'workflows' + table.reversesort = True + table.align['tag'] = 'r' + print(table) From 6f06894feb9ca4693937ebc1a35577fdd8dfa59a Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 22 Jun 2018 12:15:45 -0700 Subject: [PATCH 09/97] cli: progress on report --- emmet/scripts/emmet.py | 46 ++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 7455af9acd..eb1250d63e 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -13,7 +13,7 @@ from atomate.vasp.powerups import add_trackers, add_tags, add_additional_fields_to_taskdocs from emmet.vasp.materials import group_structures, get_sg from emmet.vasp.task_tagger import task_type -from log4mongo.handlers import MongoHandler +from log4mongo.handlers import MongoHandler, MongoFormatter @click.group() def cli(): @@ -167,12 +167,12 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): logger = logging.getLogger('add_wflows') mongo_handler = MongoHandler( host=lpad.host, port=lpad.port, database_name=lpad.name, collection='add_wflows_logs', - username=lpad.username, password=lpad.password, authentication_db=lpad.name + username=lpad.username, password=lpad.password, authentication_db=lpad.name, formatter=MyMongoFormatter() ) logger.addHandler(mongo_handler) - ensure_indexes(['level', 'snl_id', 'formula'], [mongo_handler.collection]) if clear_logs: mongo_handler.collection.drop() + ensure_indexes(['level', 'message', 'snl_id', 'formula', 'tag'], [mongo_handler.collection]) tasks_collections = OrderedDict() tasks_collections[lpad.db.tasks.full_name] = lpad.db.tasks @@ -300,7 +300,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): s.to(fmt='json', filename='sgnum-{}.json'.format(s.snl_id)) msg = 'SNL {}: {}'.format(s.snl_id, ex) print(msg) - logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'error': str(ex)}) + logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'tag': tag, 'error': str(ex)}) continue if sgnum not in structures[formula]: structures[formula][sgnum] = [] @@ -314,7 +314,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if len(g) > 1: for s in g[1:]: logger.warning('duplicate structure', extra={ - 'formula': formula, 'snl_id': s.snl_id, 'canonical_snl_id': g[0].snl_id + 'formula': formula, 'snl_id': s.snl_id, 'tag': tag, 'canonical_snl_id': g[0].snl_id }) if not canonical_structures[formula]: @@ -378,13 +378,13 @@ def find_matching_canonical_task_structures(formula, struct, full_name): msg += ' --> CLEANUP: remove task_id from SNL' print(msg) snl_coll.update({'snl_id': struct.snl_id}, 
{'$unset': {'about._materialsproject.task_id': 1}}) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id, 'tag': tag}) counter['snl-task_mismatch'] += 1 else: msg = ' --> OK: workflow resulted in matching task {}'.format(struct.task_id) print(msg) logger.warning(msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id + 'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id, 'tag': tag }) else: print(' --> did not find task', struct.task_id, 'for WF', s.fw_id) @@ -395,7 +395,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if fw['spec']['_tasks'][5]['additional_fields'].get('task_id') == struct.task_id: msg = ' --> OK: workflow {} will result in intended task-id {}'.format(fw['fw_id'], struct.task_id) print(msg) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id, 'tag': tag}) fw_found = True break if not fw_found: @@ -420,7 +420,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): msg = ' --> ERROR: multiple tasks {} for completed WF {}'.format(matched_task_ids, s.fw_id) print(msg) logger.error(msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, 'error': 'Multiple tasks for Completed WF' + 'formula': formula, 'snl_id': struct.snl_id, 'tag': tag, 'error': 'Multiple tasks for Completed WF' }) else: msg = ' --> ERROR: task for completed WF {} does not exist!'.format(s.fw_id) @@ -433,7 +433,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): lpad.delete_wf(s.fw_id) break else: - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id, 'tag': tag}) wf_found = True break @@ -452,7 +452,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): break if any(matched_task_ids.values()): logger.warning('matched task ids' + msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, + 'formula': formula, 'snl_id': struct.snl_id, 'tag': tag, 'task_id(s)': dict((k.replace('.', '#'), v) for k, v in matched_task_ids.items()) }) continue @@ -461,7 +461,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if len(no_potcars) > 0: msg = 'Structure for SNL {} --> NO POTCARS: {}'.format(struct.snl_id, no_potcars) print(msg) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'no_potcars': no_potcars}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tag': tag, 'error': no_potcars}) continue try: @@ -475,7 +475,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): except: msg = 'Structure for SNL {} --> SKIP: Could not make workflow'.format(struct.snl_id) print(msg) - logger.error(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'error': 'could not make workflow'}) + logger.error(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tag': tag, 'error': 'could not make workflow'}) continue msg = 'Structure for SNL {} --> ADD WORKFLOW'.format(struct.snl_id) @@ -485,9 +485,9 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if insert: old_new = lpad.add_wf(wf) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 
'fw_id': list(old_new.values())[0]}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tag': tag, 'fw_id': list(old_new.values())[0]}) else: - logger.error(msg + ' --> DRY RUN', extra={'formula': formula, 'snl_id': struct.snl_id}) + logger.error(msg + ' --> DRY RUN', extra={'formula': formula, 'snl_id': struct.snl_id, 'tag': tag}) counter['add(ed)'] += 1 except CursorNotFound as ex: @@ -512,6 +512,17 @@ def ensure_indexes(indexes, colls): coll.ensure_index(index) print('ensured index', index, 'on', coll.full_name) +class MyMongoFormatter(logging.Formatter): + KEEP_KEYS = ['timestamp', 'level', 'message', 'formula', 'snl_id', 'tag', 'error', 'canonical_snl_id', 'fw_id', 'task_id', 'task_id(s)'] + + def format(self, record): + mongoformatter = MongoFormatter() + document = mongoformatter.format(record) + for k in list(document.keys()): + if k not in self.KEEP_KEYS: + document.pop(k) + return document + @cli.command() @click.option('--tag', default=None, help='only include structures with specific tag') @@ -528,10 +539,11 @@ def report(tag): from prettytable import PrettyTable table = PrettyTable() - table.field_names = ['tag', 'workflows'] + states + ['% FIZZLED', 'progress'] + table.field_names = ['tag', 'SNLs', 'workflows'] + states + ['% FIZZLED', 'progress'] for t in tags: wflows = lpad.workflows.find({'metadata.tags': t}, {'state': 1}) + nr_snls = lpad.db.add_wflows_logs.count({'tag': t}) counter = Counter([wf['state'] for wf in wflows]) total = sum(v for k, v in counter.items() if k in states) tc, progress = t, '-' @@ -539,7 +551,7 @@ def report(tag): tc = "\033[1;34m{}\033[0m".format(t) progress = (counter['COMPLETED'] + counter['FIZZLED']) / total * 100. progress = '{:.0f}%'.format(progress) - entry = [tc, total] + [counter[state] for state in states] + entry = [tc, nr_snls, total] + [counter[state] for state in states] fizzled = counter['FIZZLED'] / total percent_fizzled = "\033[1;31m{:.0f}%\033[0m".format(fizzled*100.) \ if fizzled > 0.2 else '{:.0f}%'.format(fizzled*100.) 
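The report subcommand introduced in PATCH 09 above derives its per-tag progress and "% FIZZLED" columns from a Counter of workflow states pulled via lpad.workflows.find({'metadata.tags': t}, {'state': 1}). A minimal, self-contained sketch of that calculation follows; the state counts are made up for illustration and do not come from a live LaunchPad query:

    # Sketch only: sample counts stand in for the Counter the CLI builds
    # from the FireWorks workflows collection.
    from collections import Counter

    states = ['COMPLETED', 'FIZZLED', 'READY', 'RUNNING']
    counter = Counter({'COMPLETED': 70, 'FIZZLED': 20, 'READY': 5, 'RUNNING': 5})

    total = sum(v for k, v in counter.items() if k in states)
    progress = (counter['COMPLETED'] + counter['FIZZLED']) / total * 100. if total else 0.
    fizzled = counter['FIZZLED'] / total if total else 0.
    print('{:.0f}% done, {:.0f}% fizzled'.format(progress, fizzled * 100.))

In the patch above, a tag is highlighted in the table whenever completed plus fizzled workflows do not yet account for all workflows in the listed states, i.e. some are still READY or RUNNING; the fizzled percentage is additionally colored when it exceeds 20%.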
From 063caf8a7f9aa750b6db8f74876c1f6185a4cf6c Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 22 Jun 2018 17:17:31 -0700 Subject: [PATCH 10/97] cli: more progress on report etc --- emmet/scripts/emmet.py | 152 +++++++++++++++++++++++------------------ 1 file changed, 85 insertions(+), 67 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index eb1250d63e..f305db0798 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,4 +1,4 @@ -import click, os, yaml, sys, logging, operator, json +import click, os, yaml, sys, logging, json from datetime import datetime from collections import Counter, OrderedDict from pymongo import MongoClient @@ -15,6 +15,10 @@ from emmet.vasp.task_tagger import task_type from log4mongo.handlers import MongoHandler, MongoFormatter +if 'FW_CONFIG_FILE' not in os.environ: + print('Please set FW_CONFIG_FILE!') + sys.exit(0) + @click.group() def cli(): pass @@ -133,36 +137,34 @@ def get_subdir(dn): @cli.command() -@click.argument('list_of_structures', type=click.File('rb')) -@click.option('-a', '--alt_tasks_db_file', type=click.Path(exists=True), help='config file for alternative tasks collection') +@click.option('--add_snls_db', type=click.Path(exists=True), help='config file for additional SNLs collection') +@click.option('--add_tasks_db', type=click.Path(exists=True), help='config file for additional tasks collection') @click.option('--tag', default=None, help='only include structures with specific tag') @click.option('--insert/--no-insert', default=False, help='actually execute workflow addition') @click.option('--clear-logs/--no-clear-logs', default=False, help='clear MongoDB logs collection') -def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): - """add workflows for list of structures / SNLs (YAML config or JSON list of pymatgen structures""" +@click.option('--max-structures', default=1000, help='set max structures for tags to scan') +def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structures): + """add workflows based on tags in SNL collection""" exclude = {'about.remarks': {'$ne': 'DEPRECATED'}} if not insert: print('DRY RUN! 
Add --insert flag to actually add workflows') - try: - snl_db_config = yaml.load(list_of_structures) + lpad = LaunchPad.auto_load() + + # TODO use add_snls first, and then add_wflows based on SNL collection + snl_collections = [lpad.db.snls] + if add_snls_db is not None: + snl_db_config = yaml.load(open(add_snls_db, 'r')) snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) snl_db = snl_db_conn[snl_db_config['db']] snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) - snl_coll = snl_db[snl_db_config['collection']] - except Exception as ex: - print(ex) - # NOTE WIP might change it to use add_snls first, and then add_wflows based on SNL collection only - # TODO load pymatgen structures from JSON file into MongoDB collection - # TODO also fake-tag them, add SNL info - snl_coll = None - print('to be implemented') - return - print(snl_coll.count(exclude), 'SNLs in', snl_coll.full_name) + snl_collections.append(snl_db[snl_db_config['collection']]) - lpad = LaunchPad.auto_load() + ensure_indexes(['snl_id', 'reduced_cell_formula', 'about.remarks', 'sites.label', 'nsites', 'nelements'], snl_collections) + for snl_coll in snl_collections: + print(snl_coll.count(exclude), 'SNLs in', snl_coll.full_name) logger = logging.getLogger('add_wflows') mongo_handler = MongoHandler( @@ -172,12 +174,12 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): logger.addHandler(mongo_handler) if clear_logs: mongo_handler.collection.drop() - ensure_indexes(['level', 'message', 'snl_id', 'formula', 'tag'], [mongo_handler.collection]) + ensure_indexes(['level', 'message', 'snl_id', 'formula', 'tags'], [mongo_handler.collection]) tasks_collections = OrderedDict() tasks_collections[lpad.db.tasks.full_name] = lpad.db.tasks - if alt_tasks_db_file is not None: # TODO multiple alt_task_db_files? - target = VaspCalcDb.from_db_file(alt_tasks_db_file, admin=True) + if add_tasks_db is not None: # TODO multiple alt_task_db_files? 
+ target = VaspCalcDb.from_db_file(add_tasks_db, admin=True) tasks_collections[target.collection.full_name] = target.collection for full_name, tasks_coll in tasks_collections.items(): print(tasks_coll.count(), 'tasks in', full_name) @@ -189,24 +191,40 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): task_base_query = {'tags': {'$ne': 'deprecated'}, '_mpworks_meta': {'$exists': 0}} vp = DLSVolumePredictor() - ensure_indexes(['snl_id', 'reduced_cell_formula', 'about.remarks', 'sites.label', 'nsites', 'nelements'], [snl_coll]) - tags = OrderedDict() if tag is None: + all_tags = OrderedDict() query = dict(exclude) query.update(base_query) - remarks = filter(None, snl_coll.find(query).distinct('about.remarks')) - for t in remarks: - query = {'$and': [{'about.remarks': t}, exclude]} - query.update(base_query) - tags[t] = snl_coll.count(query) - tags = OrderedDict((el[0], el[1]) for el in sorted(tags.items(), key=operator.itemgetter(1), reverse=True)) - print(len(tags), 'tags in source collection => TOP10:') - print('\n'.join(['{} ({})'.format(k, v) for k, v in list(tags.items())[:10]])) + for snl_coll in snl_collections: + remarks = filter(None, snl_coll.find(query).distinct('about.remarks')) + for t in remarks: + query = {'$and': [{'about.remarks': t}, exclude]} + query.update(base_query) + if t not in all_tags: + all_tags[t] = [snl_coll.count(query), snl_coll] + else: + print('tag -', t, '- already in', all_tags[t][-1].full_name) + sorted_tags = sorted(all_tags.items(), key=lambda x: x[1][0]) + for item in sorted_tags: + to_scan = item[1][0] - lpad.db.add_wflows_logs.count({'tags': item[0]}) + if item[1][0] < max_structures and to_scan: + tags[item[0]] = [item[1][0], to_scan, item[1][-1]] else: query = {'$and': [{'about.remarks': tag}, exclude]} query.update(base_query) - tags[tag] = snl_coll.count(query) + for snl_coll in snl_collections: + cnt = snl_coll.count(query) + if cnt: + to_scan = cnt - lpad.db.add_wflows_logs.count({'tags': tag}) + tags[tag] = [cnt, to_scan, snl_coll] + break + + if not tags: + print('nothing to scan') + return + print(len(tags), 'tags to scan in source SNL collections:') + print('\n'.join(['{} {} ({}) --> {} TO SCAN'.format(v[2].full_name, k, v[0], v[1]) for k, v in tags.items()])) canonical_task_structures = {} grouped_workflow_structures = {} @@ -247,21 +265,17 @@ def find_matching_canonical_task_structures(formula, struct, full_name): return matched_task_ids - for tag, ndocs in tags.items(): + for tag, value in tags.items(): query = {'$and': [{'about.remarks': tag}, exclude]} query.update(base_query) - # TODO WIP will be removed - if tag == 'new_ordered_icsd_2017': + if tag == 'new_ordered_icsd_2017': # TODO WIP will be removed #TODO for new_ordered_icsd_2017: docs = db.icsd.find(query, {'snl': 1, 'formula_reduced_abc': 1, 'icsd_id': 1, 'elements': 1}) - print(tag, 'TODO implement db.icsd as snl_coll') - continue - elif tag == 'pre-atomate production': - # TODO scan last + print(tag, 'TODO implement db.icsd as snl_coll -> add_snls?') continue - print('aggregate', ndocs, 'structures for', tag, '...') - structure_groups = snl_coll.aggregate([ + print('aggregate', value[0], 'structures for', tag, '...') + structure_groups = value[-1].aggregate([ {'$match': query}, {'$sort': OrderedDict([('nelements', 1), ('nsites', 1)])}, {'$group': { '_id': '$reduced_cell_formula', @@ -286,7 +300,9 @@ def find_matching_canonical_task_structures(formula, struct, full_name): print(idx_group, '...') for dct in group['structures']: - if 
mongo_handler.collection.find_one({'level': 'WARNING', 'formula': formula, 'snl_id': dct['snl_id']}): + q = {'level': 'WARNING', 'formula': formula, 'snl_id': dct['snl_id']} + if mongo_handler.collection.find_one(q): + lpad.db.add_wflows_logs.update(q, {'$addToSet': {'tags': tag}}) continue # already checked mongo_handler.collection.remove({'level': 'ERROR', 'formula': formula, 'snl_id': dct['snl_id']}) # avoid dups counter['structures'] += 1 @@ -300,7 +316,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): s.to(fmt='json', filename='sgnum-{}.json'.format(s.snl_id)) msg = 'SNL {}: {}'.format(s.snl_id, ex) print(msg) - logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'tag': tag, 'error': str(ex)}) + logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'tags': [tag], 'error': str(ex)}) continue if sgnum not in structures[formula]: structures[formula][sgnum] = [] @@ -314,7 +330,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if len(g) > 1: for s in g[1:]: logger.warning('duplicate structure', extra={ - 'formula': formula, 'snl_id': s.snl_id, 'tag': tag, 'canonical_snl_id': g[0].snl_id + 'formula': formula, 'snl_id': s.snl_id, 'tags': [tag], 'canonical_snl_id': g[0].snl_id }) if not canonical_structures[formula]: @@ -377,14 +393,14 @@ def find_matching_canonical_task_structures(formula, struct, full_name): msg = ' --> ERROR: Structure for SNL {} does not match {}'.format(struct.snl_id, struct.task_id) msg += ' --> CLEANUP: remove task_id from SNL' print(msg) - snl_coll.update({'snl_id': struct.snl_id}, {'$unset': {'about._materialsproject.task_id': 1}}) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id, 'tag': tag}) + value[-1].update({'snl_id': struct.snl_id}, {'$unset': {'about._materialsproject.task_id': 1}}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id, 'tags': [tag]}) counter['snl-task_mismatch'] += 1 else: msg = ' --> OK: workflow resulted in matching task {}'.format(struct.task_id) print(msg) logger.warning(msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id, 'tag': tag + 'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id, 'tags': [tag] }) else: print(' --> did not find task', struct.task_id, 'for WF', s.fw_id) @@ -395,7 +411,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if fw['spec']['_tasks'][5]['additional_fields'].get('task_id') == struct.task_id: msg = ' --> OK: workflow {} will result in intended task-id {}'.format(fw['fw_id'], struct.task_id) print(msg) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id, 'tag': tag}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id, 'tags': [tag]}) fw_found = True break if not fw_found: @@ -420,7 +436,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): msg = ' --> ERROR: multiple tasks {} for completed WF {}'.format(matched_task_ids, s.fw_id) print(msg) logger.error(msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, 'tag': tag, 'error': 'Multiple tasks for Completed WF' + 'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'error': 'Multiple tasks for Completed WF' }) else: msg = ' --> ERROR: task for completed WF {} does not exist!'.format(s.fw_id) @@ -433,7 +449,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): 
lpad.delete_wf(s.fw_id) break else: - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id, 'tag': tag}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id, 'tags': [tag]}) wf_found = True break @@ -452,7 +468,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): break if any(matched_task_ids.values()): logger.warning('matched task ids' + msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, 'tag': tag, + 'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'task_id(s)': dict((k.replace('.', '#'), v) for k, v in matched_task_ids.items()) }) continue @@ -461,7 +477,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if len(no_potcars) > 0: msg = 'Structure for SNL {} --> NO POTCARS: {}'.format(struct.snl_id, no_potcars) print(msg) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tag': tag, 'error': no_potcars}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'error': no_potcars}) continue try: @@ -475,7 +491,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): except: msg = 'Structure for SNL {} --> SKIP: Could not make workflow'.format(struct.snl_id) print(msg) - logger.error(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tag': tag, 'error': 'could not make workflow'}) + logger.error(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'error': 'could not make workflow'}) continue msg = 'Structure for SNL {} --> ADD WORKFLOW'.format(struct.snl_id) @@ -485,17 +501,17 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if insert: old_new = lpad.add_wf(wf) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tag': tag, 'fw_id': list(old_new.values())[0]}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'fw_id': list(old_new.values())[0]}) else: - logger.error(msg + ' --> DRY RUN', extra={'formula': formula, 'snl_id': struct.snl_id, 'tag': tag}) + logger.error(msg + ' --> DRY RUN', extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag]}) counter['add(ed)'] += 1 except CursorNotFound as ex: print(ex) - sites_elements = [ + sites_elements = set([ (len(set([e.symbol for e in x.composition.elements])), x.num_sites) for x in canonical_structures_list - ] + ]) print(len(canonical_structures_list), 'canonical structure(s) for', formula, sites_elements) print(counter) @@ -513,7 +529,7 @@ def ensure_indexes(indexes, colls): print('ensured index', index, 'on', coll.full_name) class MyMongoFormatter(logging.Formatter): - KEEP_KEYS = ['timestamp', 'level', 'message', 'formula', 'snl_id', 'tag', 'error', 'canonical_snl_id', 'fw_id', 'task_id', 'task_id(s)'] + KEEP_KEYS = ['timestamp', 'level', 'message', 'formula', 'snl_id', 'tags', 'error', 'canonical_snl_id', 'fw_id', 'task_id', 'task_id(s)'] def format(self, record): mongoformatter = MongoFormatter() @@ -535,31 +551,33 @@ def report(tag): tags = [tag] if tag is None: tags = [t for t in lpad.workflows.distinct('metadata.tags') if t is not None] - print(len(tags), 'tags in workflows collection') + tags += [t for t in lpad.db.add_wflows_logs.distinct('tags') if t is not None and t not in tags] + print(len(tags), 'tags in WFs and logs collections') from prettytable import PrettyTable table = PrettyTable() - table.field_names = ['tag', 'SNLs', 'workflows'] + states + ['% FIZZLED', 'progress'] + 
table.field_names = ['Tag', 'SNLs', 'WFs2Add', 'WFs'] + states + ['% FIZZLED', 'Progress'] for t in tags: wflows = lpad.workflows.find({'metadata.tags': t}, {'state': 1}) - nr_snls = lpad.db.add_wflows_logs.count({'tag': t}) + nr_snls = lpad.db.add_wflows_logs.count({'tags': t}) + wflows_to_add = lpad.db.add_wflows_logs.count({'tags': t, 'level': 'ERROR', 'error': {'$exists': 0}}) counter = Counter([wf['state'] for wf in wflows]) total = sum(v for k, v in counter.items() if k in states) tc, progress = t, '-' - if counter['COMPLETED'] + counter['FIZZLED'] != total: + if wflows_to_add or counter['COMPLETED'] + counter['FIZZLED'] != total: tc = "\033[1;34m{}\033[0m".format(t) - progress = (counter['COMPLETED'] + counter['FIZZLED']) / total * 100. + progress = (counter['COMPLETED'] + counter['FIZZLED']) / total * 100. if total else 0. progress = '{:.0f}%'.format(progress) - entry = [tc, nr_snls, total] + [counter[state] for state in states] - fizzled = counter['FIZZLED'] / total + entry = [tc, nr_snls, wflows_to_add, total] + [counter[state] for state in states] + fizzled = counter['FIZZLED'] / total if total else 0. percent_fizzled = "\033[1;31m{:.0f}%\033[0m".format(fizzled*100.) \ if fizzled > 0.2 else '{:.0f}%'.format(fizzled*100.) entry.append(percent_fizzled) entry.append(progress) table.add_row(entry) - table.sortby = 'workflows' + table.sortby = 'SNLs' table.reversesort = True - table.align['tag'] = 'r' + table.align['Tag'] = 'r' print(table) From 5f9e9d6ca8bc7489d7d6e66b15b1c009ffb61bb3 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 27 Jun 2018 11:01:24 -0700 Subject: [PATCH 11/97] cli: skip-all-scanned, catch another sgnum --- emmet/scripts/emmet.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index f305db0798..41ac7600bd 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -143,7 +143,8 @@ def get_subdir(dn): @click.option('--insert/--no-insert', default=False, help='actually execute workflow addition') @click.option('--clear-logs/--no-clear-logs', default=False, help='clear MongoDB logs collection') @click.option('--max-structures', default=1000, help='set max structures for tags to scan') -def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structures): +@click.option('--skip-all-scanned/--no-skip-all-scanned', default=False, help='skip all already scanned structures incl. 
WFs2Add/Errors') +def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structures, skip_all_scanned): """add workflows based on tags in SNL collection""" exclude = {'about.remarks': {'$ne': 'DEPRECATED'}} @@ -244,7 +245,14 @@ def load_canonical_task_structures(formula, full_name): task_label = task_type(task['orig_inputs'], include_calc_type=False) if task_label == "Structure Optimization": s = Structure.from_dict(task['input']['structure']) - sg = get_sg(s) + try: + sgnum = get_sg(s) + except Exception as ex: + s.to(fmt='json', filename='sgnum-{}.json'.format(s.snl_id)) + msg = 'SNL {}: {}'.format(s.snl_id, ex) + print(msg) + logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'tags': [tag], 'error': str(ex)}) + continue if sg in canonical_structures[formula]: if sg not in task_structures: task_structures[sg] = [] @@ -304,7 +312,10 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if mongo_handler.collection.find_one(q): lpad.db.add_wflows_logs.update(q, {'$addToSet': {'tags': tag}}) continue # already checked - mongo_handler.collection.remove({'level': 'ERROR', 'formula': formula, 'snl_id': dct['snl_id']}) # avoid dups + q['level'] = 'ERROR' + if skip_all_scanned and mongo_handler.collection.find_one(q): + continue + mongo_handler.collection.remove(q) # avoid dups counter['structures'] += 1 s = Structure.from_dict(dct) s.snl_id = dct['snl_id'] From 75b145c437a8a2b26c1b8a5662bdc57d3dbbf26a Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 27 Jun 2018 16:56:57 -0700 Subject: [PATCH 12/97] cli: add ensure_meta --- emmet/scripts/emmet.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 41ac7600bd..db4428314d 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -23,6 +23,33 @@ def cli(): pass + +@cli.command() +@click.argument('snls_db', type=click.Path(exists=True)) +def ensure_meta(snls_db): + """ensure meta-data fields are set in SNL collection""" + + snl_db_config = yaml.load(open(snls_db, 'r')) + snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) + snl_db = snl_db_conn[snl_db_config['db']] + snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) + snl_coll = snl_db[snl_db_config['collection']] + print(snl_coll.count(), 'SNLs in', snl_coll.full_name) + + for idx, doc in enumerate(snl_coll.find({}, structure_keys)): + if idx and not idx%1000: + print(idx, '...') + struct = Structure.from_dict(doc) + d = {'formula_pretty': struct.composition.reduced_formula} + d['nelements'] = len(set(struct.composition.elements)) + d['nsites'] = len(struct) + d['is_ordered'] = struct.is_ordered + d['is_valid'] = struct.is_valid() + snl_coll.update({'snl_id': doc['snl_id']}, {'$set': d}) + + ensure_indexes(['snl_id', 'formula_pretty', 'nelements', 'nsites', 'is_ordered', 'is_valid'], [snl_coll]) + + @cli.command() @click.option('--target_db_file', default="target.json", help='target db file') @click.option('--tag', default=None, help='only insert tasks with specific tag') From ef17bdf749a6e7170d918d99ae5cd6c4549ef1f4 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 27 Jun 2018 16:58:16 -0700 Subject: [PATCH 13/97] cli: some global definitions --- emmet/scripts/emmet.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index db4428314d..d237d50cf2 100644 --- a/emmet/scripts/emmet.py +++ 
b/emmet/scripts/emmet.py @@ -19,6 +19,12 @@ print('Please set FW_CONFIG_FILE!') sys.exit(0) +exclude = {'about.remarks': {'$nin': ['DEPRECATED', 'deprecated']}} +no_electroneg = ['He', 'He0+', 'Ar', 'Ar0+', 'Ne', 'Ne0+'] +base_query = {'is_ordered': True, 'is_valid': True, 'nsites': {'$lt': 200}, 'sites.label': {'$nin': no_electroneg}} +task_base_query = {'tags': {'$nin': ['DEPRECATED', 'deprecated']}, '_mpworks_meta': {'$exists': 0}} +structure_keys = ['snl_id', 'lattice', 'sites', 'charge', 'about._materialsproject.task_id'] + @click.group() def cli(): pass @@ -57,8 +63,6 @@ def ensure_meta(snls_db): def add_tasks(target_db_file, tag, insert): """Retrieve tasks from source and add to target""" - exclude = {'tags': {'$ne': 'deprecated'}} - if not insert: print('DRY RUN: add --insert flag to actually add tasks to production') @@ -174,8 +178,6 @@ def get_subdir(dn): def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structures, skip_all_scanned): """add workflows based on tags in SNL collection""" - exclude = {'about.remarks': {'$ne': 'DEPRECATED'}} - if not insert: print('DRY RUN! Add --insert flag to actually add workflows') @@ -212,11 +214,7 @@ def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structure for full_name, tasks_coll in tasks_collections.items(): print(tasks_coll.count(), 'tasks in', full_name) - structure_keys = ['snl_id', 'lattice', 'sites', 'charge', 'about._materialsproject.task_id'] NO_POTCARS = ['Po', 'At', 'Rn', 'Fr', 'Ra', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr'] - no_electroneg = ['He', 'He0+', 'Ar', 'Ar0+', 'Ne', 'Ne0+'] - base_query = {'is_ordered': True, 'is_valid': True, 'nsites': {'$lt': 200}, 'sites.label': {'$nin': no_electroneg}} - task_base_query = {'tags': {'$ne': 'deprecated'}, '_mpworks_meta': {'$exists': 0}} vp = DLSVolumePredictor() tags = OrderedDict() From dd6afb76da434e3ae0c0b2aa3f92e02b41df4582 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 27 Jun 2018 16:59:02 -0700 Subject: [PATCH 14/97] cli: start add_snls --- emmet/scripts/emmet.py | 94 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 1 deletion(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index d237d50cf2..1eee57794e 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,4 +1,5 @@ -import click, os, yaml, sys, logging, json +import click, os, yaml, sys, logging, json, tarfile +from fnmatch import fnmatch from datetime import datetime from collections import Counter, OrderedDict from pymongo import MongoClient @@ -6,6 +7,7 @@ from pymongo.collection import ReturnDocument from pymatgen.analysis.structure_prediction.volume_predictor import DLSVolumePredictor from pymatgen import Structure +from pymatgen.util.provenance import StructureNL, Author from fireworks import LaunchPad, Workflow from atomate.vasp.database import VaspCalcDb from atomate.vasp.workflows.presets.core import wf_structure_optimization @@ -617,3 +619,93 @@ def report(tag): table.reversesort = True table.align['Tag'] = 'r' print(table) + + +@cli.command() +@click.argument('archive', type=click.Path(exists=True)) +@click.option('--add_snls_dbs', '-a', multiple=True, type=click.Path(exists=True), help='config files for additional SNLs collections') +def add_snls(archive, add_snls_dbs): + """add structures from archive of structure files (CIF, POSCAR, ...) to (local) SNLs collection""" + # TODO assign task_ids to structures? 
+ + lpad = LaunchPad.auto_load() + snl_collections = [lpad.db.snls] + if add_snls_dbs: + for add_snls_db in add_snls_dbs: + snl_db_config = yaml.load(open(add_snls_db, 'r')) + snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) + snl_db = snl_db_conn[snl_db_config['db']] + snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) + snl_collections.append(snl_db[snl_db_config['collection']]) + for snl_coll in snl_collections: + print(snl_coll.count(), 'SNLs in', snl_coll.full_name) + + fname, ext = os.path.splitext(os.path.basename(archive)) + tag, sec_ext = fname.rsplit('.', 1) if '.' in fname else fname, '' + if sec_ext: + ext = ''.join([sec_ext, ext]) + exts = ['tar.gz', '.tgz'] + if ext not in exts: + print(ext, 'not supported (yet)! Please use one of', exts) + return + + meta_path = '{}.yaml'.format(tag) + if not os.path.exists(meta_path): + print('Please include meta info in', meta_path) + return + with open(meta_path, 'r') as f: + meta = yaml.load(f) + meta['authors'] = [Author.parse_author(a) for a in meta['authors']] + + exclude = {'about.remarks': {'$ne': 'DEPRECATED'}} + + snls = [] + tar = tarfile.open(archive, 'r:gz') + for member in tar.getmembers(): + if os.path.basename(member.name).startswith('.'): + continue + f = tar.extractfile(member) + if f: + print(member.name) + contents = f.read().decode('utf-8') + fname = member.name.lower() + if fnmatch(fname, "*.cif*") or fnmatch(fname, "*.mcif*"): + fmt = 'cif' + elif fnmatch(fname, "*.json*") or fnmatch(fname, "*.mson*"): + fmt = 'json' + else: + print('reading', fname, 'not supported (yet)') + continue + + try: + struct = Structure.from_str(contents, fmt=fmt) + except Exception as ex: + print(ex) + break #continue + + formula = struct.composition.reduced_formula + query = {'$and': [{'formula_pretty': formula}, exclude]} + query.update(base_query) + + for snl_coll in snl_collections: + snl_groups = snl_coll.aggregate([ + {'$match': query}, {'$sort': OrderedDict([('nelements', 1), ('nsites', 1)])}, + {'$group': { + '_id': '$formula_pretty', + 'snls': {'$push': dict((k.split('.')[-1], '${}'.format(k)) for k in structure_keys)} + }} + ], allowDiskUse=True, batchSize=1) + return + + snls.append(StructureNL( + struct, authors, references=references.strip(), remarks=[tag] + )) + print(len(snls)) + +# snls.append(snl.as_dict()) +# if snls: +# print('add', len(snls), 'SNLs') +# result = target.db.snls.insert_many(snls) +# print('#SNLs inserted:', len(result.inserted_ids)) +# else: +# print('no SNLs to insert') From 931cff109b785e5cdda57f48e77558eba7f060df Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 27 Jun 2018 16:59:22 -0700 Subject: [PATCH 15/97] cli: fix sgnum error catch --- emmet/scripts/emmet.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 1eee57794e..cf530a1632 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -275,10 +275,10 @@ def load_canonical_task_structures(formula, full_name): try: sgnum = get_sg(s) except Exception as ex: - s.to(fmt='json', filename='sgnum-{}.json'.format(s.snl_id)) - msg = 'SNL {}: {}'.format(s.snl_id, ex) + s.to(fmt='json', filename='sgnum_{}.json'.format(task['task_id'])) + msg = 'SNL {}: {}'.format(task['task_id'], ex) print(msg) - logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'tags': [tag], 'error': str(ex)}) + logger.error(msg, extra={'formula': formula, 'task_id': task['task_id'], 'tags': [tag], 'error': 
str(ex)}) continue if sg in canonical_structures[formula]: if sg not in task_structures: From 50866a8d97a2fc166f05b800724e25afc7807c44 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 27 Jun 2018 16:59:52 -0700 Subject: [PATCH 16/97] cli.report: minor table update --- emmet/scripts/emmet.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index cf530a1632..cc54179c4c 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -613,7 +613,8 @@ def report(tag): if fizzled > 0.2 else '{:.0f}%'.format(fizzled*100.) entry.append(percent_fizzled) entry.append(progress) - table.add_row(entry) + if any(entry[2:-2]): + table.add_row(entry) table.sortby = 'SNLs' table.reversesort = True From 718d1878c81e0e1f26d6fcab45002bb1c9b4b17c Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 27 Jun 2018 17:06:35 -0700 Subject: [PATCH 17/97] cli: codacy fixes --- emmet/scripts/emmet.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index cc54179c4c..bd5b9ed55f 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,4 +1,4 @@ -import click, os, yaml, sys, logging, json, tarfile +import click, os, yaml, sys, logging, tarfile from fnmatch import fnmatch from datetime import datetime from collections import Counter, OrderedDict @@ -11,7 +11,6 @@ from fireworks import LaunchPad, Workflow from atomate.vasp.database import VaspCalcDb from atomate.vasp.workflows.presets.core import wf_structure_optimization -from atomate.vasp.database import VaspCalcDb from atomate.vasp.powerups import add_trackers, add_tags, add_additional_fields_to_taskdocs from emmet.vasp.materials import group_structures, get_sg from emmet.vasp.task_tagger import task_type @@ -37,7 +36,7 @@ def cli(): def ensure_meta(snls_db): """ensure meta-data fields are set in SNL collection""" - snl_db_config = yaml.load(open(snls_db, 'r')) + snl_db_config = yaml.safe_load(open(snls_db, 'r')) snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) snl_db = snl_db_conn[snl_db_config['db']] snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) @@ -188,7 +187,7 @@ def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structure # TODO use add_snls first, and then add_wflows based on SNL collection snl_collections = [lpad.db.snls] if add_snls_db is not None: - snl_db_config = yaml.load(open(add_snls_db, 'r')) + snl_db_config = yaml.safe_load(open(add_snls_db, 'r')) snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) snl_db = snl_db_conn[snl_db_config['db']] snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) @@ -273,7 +272,7 @@ def load_canonical_task_structures(formula, full_name): if task_label == "Structure Optimization": s = Structure.from_dict(task['input']['structure']) try: - sgnum = get_sg(s) + sg = get_sg(s) except Exception as ex: s.to(fmt='json', filename='sgnum_{}.json'.format(task['task_id'])) msg = 'SNL {}: {}'.format(task['task_id'], ex) @@ -483,7 +482,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): lpad.delete_wf(s.fw_id) break else: - print(' --> CLEANUP: delete {} WF and re-add to include task_id as additional_field'.format(fw['state'], s.fw_id)) + print(' --> CLEANUP: delete {} WF and re-add to include task_id as additional_field'.format(fw['state'])) lpad.delete_wf(s.fw_id) break else: @@ -633,7 
+632,7 @@ def add_snls(archive, add_snls_dbs): snl_collections = [lpad.db.snls] if add_snls_dbs: for add_snls_db in add_snls_dbs: - snl_db_config = yaml.load(open(add_snls_db, 'r')) + snl_db_config = yaml.safe_load(open(add_snls_db, 'r')) snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) snl_db = snl_db_conn[snl_db_config['db']] snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) @@ -655,7 +654,7 @@ def add_snls(archive, add_snls_dbs): print('Please include meta info in', meta_path) return with open(meta_path, 'r') as f: - meta = yaml.load(f) + meta = yaml.safe_load(f) meta['authors'] = [Author.parse_author(a) for a in meta['authors']] exclude = {'about.remarks': {'$ne': 'DEPRECATED'}} From 7776857eee78094147ec8c9aae9e67c48462c19b Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 28 Jun 2018 16:23:21 -0700 Subject: [PATCH 18/97] cli: add_snls working --- emmet/scripts/emmet.py | 140 +++++++++++++++++++++++++++-------------- 1 file changed, 92 insertions(+), 48 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index bd5b9ed55f..d36e266484 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -26,6 +26,25 @@ task_base_query = {'tags': {'$nin': ['DEPRECATED', 'deprecated']}, '_mpworks_meta': {'$exists': 0}} structure_keys = ['snl_id', 'lattice', 'sites', 'charge', 'about._materialsproject.task_id'] +def aggregate_by_formula(coll, q, key='reduced_cell_formula'): + query = {'$and': [q, exclude]} + query.update(base_query) + return coll.aggregate([ + {'$match': query}, {'$sort': OrderedDict([('nelements', 1), ('nsites', 1)])}, + {'$group': { + '_id': '${}'.format(key), + 'structures': {'$push': dict((k.split('.')[-1], '${}'.format(k)) for k in structure_keys)} + }} + ], allowDiskUse=True, batchSize=1) + +def get_meta_from_structure(struct): + d = {'formula_pretty': struct.composition.reduced_formula} + d['nelements'] = len(set(struct.composition.elements)) + d['nsites'] = len(struct) + d['is_ordered'] = struct.is_ordered + d['is_valid'] = struct.is_valid() + return d + @click.group() def cli(): pass @@ -47,18 +66,13 @@ def ensure_meta(snls_db): if idx and not idx%1000: print(idx, '...') struct = Structure.from_dict(doc) - d = {'formula_pretty': struct.composition.reduced_formula} - d['nelements'] = len(set(struct.composition.elements)) - d['nsites'] = len(struct) - d['is_ordered'] = struct.is_ordered - d['is_valid'] = struct.is_valid() - snl_coll.update({'snl_id': doc['snl_id']}, {'$set': d}) + snl_coll.update({'snl_id': doc['snl_id']}, {'$set': get_meta_from_structure(struct)}) ensure_indexes(['snl_id', 'formula_pretty', 'nelements', 'nsites', 'is_ordered', 'is_valid'], [snl_coll]) @cli.command() -@click.option('--target_db_file', default="target.json", help='target db file') +@click.argument('target_db_file', type=click.Path(exists=True)) @click.option('--tag', default=None, help='only insert tasks with specific tag') @click.option('--insert/--no-insert', default=False, help='actually execute task addition') def add_tasks(target_db_file, tag, insert): @@ -74,10 +88,7 @@ def get_subdir(dn): source = VaspCalcDb(lpad.host, lpad.port, lpad.name, 'tasks', lpad.username, lpad.password) print('connected to source db with', source.collection.count(), 'tasks') - if not os.path.exists(target_db_file): - print(target_db_file, 'not found!') - return - target = VaspCalcDb.from_db_file(target_db_file, admin=True) # 'db_atomate.json' + target = VaspCalcDb.from_db_file(target_db_file, admin=True) 
print('connected to target db with', target.collection.count(), 'tasks') ensure_indexes(['task_id', 'tags', 'dir_name', 'retired_task_id'], [source.collection, target.collection]) @@ -184,7 +195,6 @@ def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structure lpad = LaunchPad.auto_load() - # TODO use add_snls first, and then add_wflows based on SNL collection snl_collections = [lpad.db.snls] if add_snls_db is not None: snl_db_config = yaml.safe_load(open(add_snls_db, 'r')) @@ -300,22 +310,17 @@ def find_matching_canonical_task_structures(formula, struct, full_name): for tag, value in tags.items(): - query = {'$and': [{'about.remarks': tag}, exclude]} - query.update(base_query) if tag == 'new_ordered_icsd_2017': # TODO WIP will be removed #TODO for new_ordered_icsd_2017: docs = db.icsd.find(query, {'snl': 1, 'formula_reduced_abc': 1, 'icsd_id': 1, 'elements': 1}) print(tag, 'TODO implement db.icsd as snl_coll -> add_snls?') continue + if skip_all_scanned and not value[1]: + continue + print('aggregate', value[0], 'structures for', tag, '...') - structure_groups = value[-1].aggregate([ - {'$match': query}, {'$sort': OrderedDict([('nelements', 1), ('nsites', 1)])}, - {'$group': { - '_id': '$reduced_cell_formula', - 'structures': {'$push': dict((k.split('.')[-1], '${}'.format(k)) for k in structure_keys)} - }} - ], allowDiskUse=True, batchSize=1) + structure_groups = aggregate_by_formula(value[-1], {'about.remarks': tag}) print('loop formulas for', tag, '...') counter = Counter() @@ -350,7 +355,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): try: sgnum = get_sg(s) except Exception as ex: - s.to(fmt='json', filename='sgnum-{}.json'.format(s.snl_id)) + s.to(fmt='json', filename='sgnum_{}.json'.format(s.snl_id)) msg = 'SNL {}: {}'.format(s.snl_id, ex) print(msg) logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'tags': [tag], 'error': str(ex)}) @@ -624,10 +629,14 @@ def report(tag): @cli.command() @click.argument('archive', type=click.Path(exists=True)) @click.option('--add_snls_dbs', '-a', multiple=True, type=click.Path(exists=True), help='config files for additional SNLs collections') -def add_snls(archive, add_snls_dbs): +@click.option('--insert/--no-insert', default=False, help='actually execute SNL insertion') +def add_snls(archive, add_snls_dbs, insert): """add structures from archive of structure files (CIF, POSCAR, ...) to (local) SNLs collection""" # TODO assign task_ids to structures? + if not insert: + print('DRY RUN! Add --insert flag to actually add SNLs') + lpad = LaunchPad.auto_load() snl_collections = [lpad.db.snls] if add_snls_dbs: @@ -641,7 +650,7 @@ def add_snls(archive, add_snls_dbs): print(snl_coll.count(), 'SNLs in', snl_coll.full_name) fname, ext = os.path.splitext(os.path.basename(archive)) - tag, sec_ext = fname.rsplit('.', 1) if '.' in fname else fname, '' + tag, sec_ext = fname.rsplit('.', 1) if '.' 
in fname else [fname, ''] if sec_ext: ext = ''.join([sec_ext, ext]) exts = ['tar.gz', '.tgz'] @@ -666,7 +675,6 @@ def add_snls(archive, add_snls_dbs): continue f = tar.extractfile(member) if f: - print(member.name) contents = f.read().decode('utf-8') fname = member.name.lower() if fnmatch(fname, "*.cif*") or fnmatch(fname, "*.mcif*"): @@ -684,28 +692,64 @@ def add_snls(archive, add_snls_dbs): break #continue formula = struct.composition.reduced_formula - query = {'$and': [{'formula_pretty': formula}, exclude]} - query.update(base_query) + sg = get_sg(struct) + struct_added = False for snl_coll in snl_collections: - snl_groups = snl_coll.aggregate([ - {'$match': query}, {'$sort': OrderedDict([('nelements', 1), ('nsites', 1)])}, - {'$group': { - '_id': '$formula_pretty', - 'snls': {'$push': dict((k.split('.')[-1], '${}'.format(k)) for k in structure_keys)} - }} - ], allowDiskUse=True, batchSize=1) - return - - snls.append(StructureNL( - struct, authors, references=references.strip(), remarks=[tag] - )) - print(len(snls)) - -# snls.append(snl.as_dict()) -# if snls: -# print('add', len(snls), 'SNLs') -# result = target.db.snls.insert_many(snls) -# print('#SNLs inserted:', len(result.inserted_ids)) -# else: -# print('no SNLs to insert') + try: + group = aggregate_by_formula(snl_coll, {'formula_pretty': formula}, key='formula_pretty').next() # only one formula + except StopIteration: + continue + + structures = [] + for dct in group['structures']: + s = Structure.from_dict(dct) + s.snl_id = dct['snl_id'] + s.remove_oxidation_states() + try: + sgnum = get_sg(s) + except Exception as ex: + s.to(fmt='json', filename='sgnum_{}.json'.format(s.snl_id)) + print('SNL {}: {}'.format(s.snl_id, ex)) + continue + if sgnum == sg: + structures.append(s) + + if not structures: + continue + + canonical_structures = [] + for g in group_structures(structures): + canonical_structures.append(g[0]) + + if not canonical_structures: + continue + + for s in canonical_structures: + if structures_match(struct, s): + print('Structure from', member.name, 'already added as SNL', s.snl_id, 'in', snl_coll.full_name) + struct_added = True + break + + if struct_added: + break + + if struct_added: + continue + + print('append SNL for structure from', member.name) + snl_dct = StructureNL(struct, meta['authors'], references=meta.get('references', '').strip(), projects=[tag]).as_dict() + snl_dct.update(get_meta_from_structure(struct)) + prefix = snl_collections[0].database.name + index = max([int(snl_id[len(prefix)+1:]) for snl_id in snl_collections[0].distinct('snl_id')]) + len(snls) + 1 + snl_dct['snl_id'] = '{}-{}'.format(prefix, index) + snls.append(snl_dct) + + if snls: + print('add', len(snls), 'SNLs') + if insert: + result = snl_collections[0].insert_many(snls) + print('#SNLs inserted:', len(result.inserted_ids)) + else: + print('no SNLs to insert') + From 89c95831fc5526849d343314dba78ae29e856f1f Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 28 Jun 2018 16:28:40 -0700 Subject: [PATCH 19/97] cli: fix codacy issues --- emmet/scripts/emmet.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index d36e266484..43d5e14421 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -8,7 +8,7 @@ from pymatgen.analysis.structure_prediction.volume_predictor import DLSVolumePredictor from pymatgen import Structure from pymatgen.util.provenance import StructureNL, Author -from fireworks import LaunchPad, Workflow +from fireworks import LaunchPad from 
atomate.vasp.database import VaspCalcDb from atomate.vasp.workflows.presets.core import wf_structure_optimization from atomate.vasp.powerups import add_trackers, add_tags, add_additional_fields_to_taskdocs @@ -530,8 +530,8 @@ def find_matching_canonical_task_structures(formula, struct, full_name): wf = add_additional_fields_to_taskdocs(wf, update_dict={'task_id': struct.task_id}) #if struct.icsd_id is not None: # wf = add_additional_fields_to_taskdocs(wf, update_dict={'icsd_id': struct.icsd_id}) - except: - msg = 'Structure for SNL {} --> SKIP: Could not make workflow'.format(struct.snl_id) + except Exception as ex: + msg = 'Structure for SNL {} --> SKIP: Could not make workflow --> {}'.format(struct.snl_id, str(ex)) print(msg) logger.error(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'error': 'could not make workflow'}) continue From b7b80c7fbda9518da47361034aae2675486ad639 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 28 Jun 2018 17:55:57 -0700 Subject: [PATCH 20/97] cli: include projects in distinct tags --- emmet/scripts/emmet.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 43d5e14421..c0d24eb356 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -203,7 +203,7 @@ def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structure snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) snl_collections.append(snl_db[snl_db_config['collection']]) - ensure_indexes(['snl_id', 'reduced_cell_formula', 'about.remarks', 'sites.label', 'nsites', 'nelements'], snl_collections) + ensure_indexes(['snl_id', 'reduced_cell_formula', 'about.remarks', 'about.projects', 'sites.label', 'nsites', 'nelements'], snl_collections) for snl_coll in snl_collections: print(snl_coll.count(exclude), 'SNLs in', snl_coll.full_name) @@ -234,12 +234,12 @@ def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structure query = dict(exclude) query.update(base_query) for snl_coll in snl_collections: - remarks = filter(None, snl_coll.find(query).distinct('about.remarks')) - for t in remarks: - query = {'$and': [{'about.remarks': t}, exclude]} - query.update(base_query) + remarks_projects = snl_coll.distinct('about.projects', query) + snl_coll.distinct('about.remarks', query) + for t in set(remarks_projects): + q = {'$and': [{'$or': [{'about.remarks': t}, {'about.projects': t}]}, exclude]} + q.update(base_query) if t not in all_tags: - all_tags[t] = [snl_coll.count(query), snl_coll] + all_tags[t] = [snl_coll.count(q), snl_coll] else: print('tag -', t, '- already in', all_tags[t][-1].full_name) sorted_tags = sorted(all_tags.items(), key=lambda x: x[1][0]) @@ -248,7 +248,7 @@ def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structure if item[1][0] < max_structures and to_scan: tags[item[0]] = [item[1][0], to_scan, item[1][-1]] else: - query = {'$and': [{'about.remarks': tag}, exclude]} + query = {'$and': [{'$or': [{'about.remarks': tag}, {'about.projects': tag}]}, exclude]} query.update(base_query) for snl_coll in snl_collections: cnt = snl_coll.count(query) @@ -261,6 +261,8 @@ def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structure print('nothing to scan') return print(len(tags), 'tags to scan in source SNL collections:') + if tag is None: + print('[with < {} structures to scan]'.format(max_structures)) print('\n'.join(['{} {} ({}) --> {} TO SCAN'.format(v[2].full_name, k, v[0], v[1]) for 
k, v in tags.items()])) canonical_task_structures = {} From 7e9e6ac3b9d1c0133f22fc39a07e131da447491c Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 29 Jun 2018 16:31:46 -0700 Subject: [PATCH 21/97] more progress with cli --- emmet/scripts/emmet.py | 112 +++++++++++++++++++++++++---------------- 1 file changed, 70 insertions(+), 42 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index c0d24eb356..475bc28c05 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -25,10 +25,20 @@ base_query = {'is_ordered': True, 'is_valid': True, 'nsites': {'$lt': 200}, 'sites.label': {'$nin': no_electroneg}} task_base_query = {'tags': {'$nin': ['DEPRECATED', 'deprecated']}, '_mpworks_meta': {'$exists': 0}} structure_keys = ['snl_id', 'lattice', 'sites', 'charge', 'about._materialsproject.task_id'] +aggregation_keys = ['reduced_cell_formula', 'formula_pretty'] -def aggregate_by_formula(coll, q, key='reduced_cell_formula'): +def aggregate_by_formula(coll, q, key=None): query = {'$and': [q, exclude]} query.update(base_query) + if key is None: + for k in aggregation_keys: + q = {k: {'$exists': 1}} + q.update(base_query) + if coll.count(q): + key = k + break + if key is None: + raise ValueError('could not find aggregation keys', aggregation_keys, 'in', coll.full_name) return coll.aggregate([ {'$match': query}, {'$sort': OrderedDict([('nelements', 1), ('nsites', 1)])}, {'$group': { @@ -75,8 +85,8 @@ def ensure_meta(snls_db): @click.argument('target_db_file', type=click.Path(exists=True)) @click.option('--tag', default=None, help='only insert tasks with specific tag') @click.option('--insert/--no-insert', default=False, help='actually execute task addition') -def add_tasks(target_db_file, tag, insert): - """Retrieve tasks from source and add to target""" +def copy_tasks(target_db_file, tag, insert): + """Retrieve tasks from source and copy to target task collection""" if not insert: print('DRY RUN: add --insert flag to actually add tasks to production') @@ -180,14 +190,14 @@ def get_subdir(dn): @cli.command() -@click.option('--add_snls_db', type=click.Path(exists=True), help='config file for additional SNLs collection') -@click.option('--add_tasks_db', type=click.Path(exists=True), help='config file for additional tasks collection') +@click.option('--add_snls_dbs', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to scan') +@click.option('--add_tasks_db', type=click.Path(exists=True), help='config file for additional tasks collection to scan') @click.option('--tag', default=None, help='only include structures with specific tag') @click.option('--insert/--no-insert', default=False, help='actually execute workflow addition') -@click.option('--clear-logs/--no-clear-logs', default=False, help='clear MongoDB logs collection') -@click.option('--max-structures', default=1000, help='set max structures for tags to scan') +@click.option('--clear-logs/--no-clear-logs', default=False, help='clear MongoDB logs collection for specific tag') +@click.option('--max-structures', '-m', default=1000, help='set max structures for tags to scan') @click.option('--skip-all-scanned/--no-skip-all-scanned', default=False, help='skip all already scanned structures incl. 
WFs2Add/Errors') -def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structures, skip_all_scanned): +def add_wflows(add_snls_dbs, add_tasks_db, tag, insert, clear_logs, max_structures, skip_all_scanned): """add workflows based on tags in SNL collection""" if not insert: @@ -196,14 +206,14 @@ def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structure lpad = LaunchPad.auto_load() snl_collections = [lpad.db.snls] - if add_snls_db is not None: - snl_db_config = yaml.safe_load(open(add_snls_db, 'r')) - snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) - snl_db = snl_db_conn[snl_db_config['db']] - snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) - snl_collections.append(snl_db[snl_db_config['collection']]) - - ensure_indexes(['snl_id', 'reduced_cell_formula', 'about.remarks', 'about.projects', 'sites.label', 'nsites', 'nelements'], snl_collections) + if add_snls_dbs is not None: + for snl_db_config in yaml.load_all(open(add_snls_dbs, 'r')): + snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) + snl_db = snl_db_conn[snl_db_config['db']] + snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) + snl_collections.append(snl_db[snl_db_config['collection']]) + + ensure_indexes(['snl_id', 'reduced_cell_formula', 'formula_pretty', 'about.remarks', 'about.projects', 'sites.label', 'nsites', 'nelements'], snl_collections) for snl_coll in snl_collections: print(snl_coll.count(exclude), 'SNLs in', snl_coll.full_name) @@ -213,8 +223,8 @@ def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structure username=lpad.username, password=lpad.password, authentication_db=lpad.name, formatter=MyMongoFormatter() ) logger.addHandler(mongo_handler) - if clear_logs: - mongo_handler.collection.drop() + if clear_logs and tag is not None: + mongo_handler.collection.remove({'tags': tag}) ensure_indexes(['level', 'message', 'snl_id', 'formula', 'tags'], [mongo_handler.collection]) tasks_collections = OrderedDict() @@ -234,14 +244,22 @@ def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structure query = dict(exclude) query.update(base_query) for snl_coll in snl_collections: - remarks_projects = snl_coll.distinct('about.projects', query) + snl_coll.distinct('about.remarks', query) - for t in set(remarks_projects): + print('collecting tags from', snl_coll.full_name, '...') + projects = snl_coll.distinct('about.projects', query) + remarks = snl_coll.distinct('about.remarks', query) + projects_remarks = projects + if len(remarks) < 100: + projects_remarks += remarks + else: + print('too many remarks in', snl_coll.full_name, '({})'.format(len(remarks))) + for t in set(projects_remarks): q = {'$and': [{'$or': [{'about.remarks': t}, {'about.projects': t}]}, exclude]} q.update(base_query) if t not in all_tags: all_tags[t] = [snl_coll.count(q), snl_coll] else: print('tag -', t, '- already in', all_tags[t][-1].full_name) + print('sort and analyze tags ...') sorted_tags = sorted(all_tags.items(), key=lambda x: x[1][0]) for item in sorted_tags: to_scan = item[1][0] - lpad.db.add_wflows_logs.count({'tags': item[0]}) @@ -313,16 +331,11 @@ def find_matching_canonical_task_structures(formula, struct, full_name): for tag, value in tags.items(): - if tag == 'new_ordered_icsd_2017': # TODO WIP will be removed - #TODO for new_ordered_icsd_2017: docs = db.icsd.find(query, {'snl': 1, 'formula_reduced_abc': 1, 'icsd_id': 1, 'elements': 
1}) - print(tag, 'TODO implement db.icsd as snl_coll -> add_snls?') - continue - if skip_all_scanned and not value[1]: continue print('aggregate', value[0], 'structures for', tag, '...') - structure_groups = aggregate_by_formula(value[-1], {'about.remarks': tag}) + structure_groups = aggregate_by_formula(value[-1], {'$or': [{'about.remarks': tag}, {'about.projects': tag}]}) print('loop formulas for', tag, '...') counter = Counter() @@ -530,8 +543,6 @@ def find_matching_canonical_task_structures(formula, struct, full_name): wf = add_tags(wf, [tag]) if struct.task_id is not None: wf = add_additional_fields_to_taskdocs(wf, update_dict={'task_id': struct.task_id}) - #if struct.icsd_id is not None: - # wf = add_additional_fields_to_taskdocs(wf, update_dict={'icsd_id': struct.icsd_id}) except Exception as ex: msg = 'Structure for SNL {} --> SKIP: Could not make workflow --> {}'.format(struct.snl_id, str(ex)) print(msg) @@ -557,6 +568,9 @@ def find_matching_canonical_task_structures(formula, struct, full_name): for x in canonical_structures_list ]) print(len(canonical_structures_list), 'canonical structure(s) for', formula, sites_elements) + if tag is not None: + print('trying again ...') + add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structures, True) print(counter) @@ -596,11 +610,16 @@ def report(tag): if tag is None: tags = [t for t in lpad.workflows.distinct('metadata.tags') if t is not None] tags += [t for t in lpad.db.add_wflows_logs.distinct('tags') if t is not None and t not in tags] + all_tags = [] + for t in tags: + all_tags.append((t, lpad.db.add_wflows_logs.count({'tags': t}))) + tags = [t[0] for t in sorted(all_tags, key=lambda x: x[1], reverse=True)] print(len(tags), 'tags in WFs and logs collections') from prettytable import PrettyTable table = PrettyTable() table.field_names = ['Tag', 'SNLs', 'WFs2Add', 'WFs'] + states + ['% FIZZLED', 'Progress'] + sums = ['total'] + [0] * (len(table.field_names)-1) for t in tags: wflows = lpad.workflows.find({'metadata.tags': t}, {'state': 1}) @@ -615,22 +634,27 @@ def report(tag): progress = '{:.0f}%'.format(progress) entry = [tc, nr_snls, wflows_to_add, total] + [counter[state] for state in states] fizzled = counter['FIZZLED'] / total if total else 0. + if progress != '-': + fizzled = counter['FIZZLED'] / counter['COMPLETED'] if counter['COMPLETED'] else 0. percent_fizzled = "\033[1;31m{:.0f}%\033[0m".format(fizzled*100.) \ if fizzled > 0.2 else '{:.0f}%'.format(fizzled*100.) entry.append(percent_fizzled) entry.append(progress) + for idx, e in enumerate(entry): + if isinstance(e, int): + sums[idx] += e if any(entry[2:-2]): table.add_row(entry) - table.sortby = 'SNLs' - table.reversesort = True + if tag is None: + table.add_row(['\033[1;32m{}\033[0m'.format(s if s else '-') for s in sums]) table.align['Tag'] = 'r' print(table) @cli.command() @click.argument('archive', type=click.Path(exists=True)) -@click.option('--add_snls_dbs', '-a', multiple=True, type=click.Path(exists=True), help='config files for additional SNLs collections') +@click.option('--add_snls_dbs', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to check against') @click.option('--insert/--no-insert', default=False, help='actually execute SNL insertion') def add_snls(archive, add_snls_dbs, insert): """add structures from archive of structure files (CIF, POSCAR, ...) 
to (local) SNLs collection""" @@ -641,9 +665,8 @@ def add_snls(archive, add_snls_dbs, insert): lpad = LaunchPad.auto_load() snl_collections = [lpad.db.snls] - if add_snls_dbs: - for add_snls_db in add_snls_dbs: - snl_db_config = yaml.safe_load(open(add_snls_db, 'r')) + if add_snls_dbs is not None: + for snl_db_config in yaml.load_all(open(add_snls_dbs, 'r')): snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) snl_db = snl_db_conn[snl_db_config['db']] snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) @@ -661,14 +684,14 @@ def add_snls(archive, add_snls_dbs, insert): return meta_path = '{}.yaml'.format(tag) + meta = None if not os.path.exists(meta_path): - print('Please include meta info in', meta_path) - return - with open(meta_path, 'r') as f: - meta = yaml.safe_load(f) - meta['authors'] = [Author.parse_author(a) for a in meta['authors']] - - exclude = {'about.remarks': {'$ne': 'DEPRECATED'}} + meta = {'authors': ['Materials Project ']} + print(meta_path, 'not found. Using', meta) + else: + with open(meta_path, 'r') as f: + meta = yaml.safe_load(f) + meta['authors'] = [Author.parse_author(a) for a in meta['authors']] snls = [] tar = tarfile.open(archive, 'r:gz') @@ -693,13 +716,18 @@ def add_snls(archive, add_snls_dbs, insert): print(ex) break #continue + if not (struct.is_ordered and struct.is_valid()): + print('Structure from', member.name, 'not ordered and valid!') + continue + formula = struct.composition.reduced_formula sg = get_sg(struct) struct_added = False for snl_coll in snl_collections: try: - group = aggregate_by_formula(snl_coll, {'formula_pretty': formula}, key='formula_pretty').next() # only one formula + q = {'$or': [{k: formula} for k in aggregation_keys]} + group = aggregate_by_formula(snl_coll, q).next() # only one formula except StopIteration: continue From d63021bb0167ec3b1318823cebdccf13fca9b77d Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 29 Jun 2018 17:20:07 -0700 Subject: [PATCH 22/97] cli.add_snls: support bson and TransformedStructure --- emmet/scripts/emmet.py | 78 +++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 31 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 475bc28c05..02c17e4f40 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,4 +1,4 @@ -import click, os, yaml, sys, logging, tarfile +import click, os, yaml, sys, logging, tarfile, bson, gzip from fnmatch import fnmatch from datetime import datetime from collections import Counter, OrderedDict @@ -7,6 +7,7 @@ from pymongo.collection import ReturnDocument from pymatgen.analysis.structure_prediction.volume_predictor import DLSVolumePredictor from pymatgen import Structure +from pymatgen.alchemy.materials import TransformedStructure from pymatgen.util.provenance import StructureNL, Author from fireworks import LaunchPad from atomate.vasp.database import VaspCalcDb @@ -678,7 +679,7 @@ def add_snls(archive, add_snls_dbs, insert): tag, sec_ext = fname.rsplit('.', 1) if '.' in fname else [fname, ''] if sec_ext: ext = ''.join([sec_ext, ext]) - exts = ['tar.gz', '.tgz'] + exts = ['tar.gz', '.tgz', 'bson.gz'] if ext not in exts: print(ext, 'not supported (yet)! 
Please use one of', exts) return @@ -693,37 +694,46 @@ def add_snls(archive, add_snls_dbs, insert): meta = yaml.safe_load(f) meta['authors'] = [Author.parse_author(a) for a in meta['authors']] - snls = [] - tar = tarfile.open(archive, 'r:gz') - for member in tar.getmembers(): - if os.path.basename(member.name).startswith('.'): - continue - f = tar.extractfile(member) - if f: - contents = f.read().decode('utf-8') - fname = member.name.lower() - if fnmatch(fname, "*.cif*") or fnmatch(fname, "*.mcif*"): - fmt = 'cif' - elif fnmatch(fname, "*.json*") or fnmatch(fname, "*.mson*"): - fmt = 'json' - else: - print('reading', fname, 'not supported (yet)') + input_structures = [] + if ext == 'bson.gz': + for idx, doc in enumerate(bson.decode_file_iter(gzip.open(archive))): + if idx and not idx%1000: + print(idx, '...') + input_structures.append(TransformedStructure.from_dict(doc['structure'])) + else: + tar = tarfile.open(archive, 'r:gz') + for member in tar.getmembers(): + if os.path.basename(member.name).startswith('.'): continue + f = tar.extractfile(member) + if f: + contents = f.read().decode('utf-8') + fname = member.name.lower() + if fnmatch(fname, "*.cif*") or fnmatch(fname, "*.mcif*"): + fmt = 'cif' + elif fnmatch(fname, "*.json*") or fnmatch(fname, "*.mson*"): + fmt = 'json' + else: + print('reading', fname, 'not supported (yet)') + continue + try: + input_structures.append(Structure.from_str(contents, fmt=fmt)) + except Exception as ex: + print(ex) + break #continue - try: - struct = Structure.from_str(contents, fmt=fmt) - except Exception as ex: - print(ex) - break #continue + print(len(input_structures), 'structure(s) loaded.') - if not (struct.is_ordered and struct.is_valid()): - print('Structure from', member.name, 'not ordered and valid!') - continue + snls = [] + for struct in input_structures: formula = struct.composition.reduced_formula sg = get_sg(struct) - struct_added = False + if not (struct.is_ordered and struct.is_valid()): + print('Structure for', formula, sg, 'not ordered and valid!') + continue + struct_added = False for snl_coll in snl_collections: try: q = {'$or': [{k: formula} for k in aggregation_keys]} @@ -757,7 +767,7 @@ def add_snls(archive, add_snls_dbs, insert): for s in canonical_structures: if structures_match(struct, s): - print('Structure from', member.name, 'already added as SNL', s.snl_id, 'in', snl_coll.full_name) + print('Structure for', formula, sg, 'already added as SNL', s.snl_id, 'in', snl_coll.full_name) struct_added = True break @@ -767,12 +777,18 @@ def add_snls(archive, add_snls_dbs, insert): if struct_added: continue - print('append SNL for structure from', member.name) - snl_dct = StructureNL(struct, meta['authors'], references=meta.get('references', '').strip(), projects=[tag]).as_dict() - snl_dct.update(get_meta_from_structure(struct)) prefix = snl_collections[0].database.name index = max([int(snl_id[len(prefix)+1:]) for snl_id in snl_collections[0].distinct('snl_id')]) + len(snls) + 1 - snl_dct['snl_id'] = '{}-{}'.format(prefix, index) + snl_id = '{}-{}'.format(prefix, index) + print('append SNL for structure with', formula, sg, 'as', snl_id) + references = meta.get('references', '').strip() + if isinstance(struct, TransformedStructure): + snl = struct.to_snl(meta['authors'], references=references, projects=[tag]) + else: + snl = StructureNL(struct, meta['authors'], references=references, projects=[tag]) + snl_dct = snl.as_dict() + snl_dct.update(get_meta_from_structure(struct)) + snl_dct['snl_id'] = snl_id snls.append(snl_dct) if snls: From 
511d1c53031e33eeadb83c1523b3e728020ff2fd Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 6 Jul 2018 10:43:54 -0700 Subject: [PATCH 23/97] cli: Trafos, bug fixes, insert_snls --- emmet/scripts/emmet.py | 159 ++++++++++++++++++++++++----------------- 1 file changed, 95 insertions(+), 64 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 02c17e4f40..43c75ad402 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -22,8 +22,8 @@ sys.exit(0) exclude = {'about.remarks': {'$nin': ['DEPRECATED', 'deprecated']}} -no_electroneg = ['He', 'He0+', 'Ar', 'Ar0+', 'Ne', 'Ne0+'] -base_query = {'is_ordered': True, 'is_valid': True, 'nsites': {'$lt': 200}, 'sites.label': {'$nin': no_electroneg}} +skip_labels = ['He', 'He0+', 'Ar', 'Ar0+', 'Ne', 'Ne0+', 'D', 'D+'] +base_query = {'is_ordered': True, 'is_valid': True, 'nsites': {'$lt': 200}, 'sites.label': {'$nin': skip_labels}} task_base_query = {'tags': {'$nin': ['DEPRECATED', 'deprecated']}, '_mpworks_meta': {'$exists': 0}} structure_keys = ['snl_id', 'lattice', 'sites', 'charge', 'about._materialsproject.task_id'] aggregation_keys = ['reduced_cell_formula', 'formula_pretty'] @@ -367,7 +367,13 @@ def find_matching_canonical_task_structures(formula, struct, full_name): s = Structure.from_dict(dct) s.snl_id = dct['snl_id'] s.task_id = dct.get('task_id') - s.remove_oxidation_states() + try: + s.remove_oxidation_states() + except Exception as ex: + msg = 'SNL {}: {}'.format(s.snl_id, ex) + print(msg) + logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'tags': [tag], 'error': str(ex)}) + continue try: sgnum = get_sg(s) except Exception as ex: @@ -571,7 +577,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): print(len(canonical_structures_list), 'canonical structure(s) for', formula, sites_elements) if tag is not None: print('trying again ...') - add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structures, True) + add_wflows(add_snls_dbs, add_tasks_db, tag, insert, clear_logs, max_structures, True) print(counter) @@ -675,6 +681,17 @@ def add_snls(archive, add_snls_dbs, insert): for snl_coll in snl_collections: print(snl_coll.count(), 'SNLs in', snl_coll.full_name) + def insert_snls(snls_list): + if snls_list: + print('add', len(snls_list), 'SNLs') + if insert: + result = snl_collections[0].insert_many(snls_list) + print('#SNLs inserted:', len(result.inserted_ids)) + snls_list.clear() + else: + print('no SNLs to insert') + + fname, ext = os.path.splitext(os.path.basename(archive)) tag, sec_ext = fname.rsplit('.', 1) if '.' 
in fname else [fname, ''] if sec_ext: @@ -699,6 +716,9 @@ def add_snls(archive, add_snls_dbs, insert): for idx, doc in enumerate(bson.decode_file_iter(gzip.open(archive))): if idx and not idx%1000: print(idx, '...') + elements = set([specie['element'] for site in doc['structure']['sites'] for specie in site['species']]) + if any([bool(l in elements) for l in skip_labels]): + continue input_structures.append(TransformedStructure.from_dict(doc['structure'])) else: tar = tarfile.open(archive, 'r:gz') @@ -724,78 +744,89 @@ def add_snls(archive, add_snls_dbs, insert): print(len(input_structures), 'structure(s) loaded.') - snls = [] - for struct in input_structures: + snls, index = [], None + for idx, istruct in enumerate(input_structures): - formula = struct.composition.reduced_formula + struct = istruct.final_structure if isinstance(istruct, TransformedStructure) else istruct + formula = struct.composition.reduced_formula + try: sg = get_sg(struct) - if not (struct.is_ordered and struct.is_valid()): - print('Structure for', formula, sg, 'not ordered and valid!') + except Exception as ex: + struct.to(fmt='json', filename='sgnum_{}_{}.json'.format(tag, formula)) + print('Structure for {}: {}'.format(formula, ex)) + continue + if not (struct.is_ordered and struct.is_valid()): + print('Structure for', formula, sg, 'not ordered and valid!') + continue + try: + struct.remove_oxidation_states() + except Exception as ex: + print(struct.sites) + print(ex) + print('Structure for', formula, sg, 'error in remove_oxidation_states!') + sys.exit(0) #continue + + struct_added = False + for snl_coll in snl_collections: + try: + q = {'$or': [{k: formula} for k in aggregation_keys]} + group = aggregate_by_formula(snl_coll, q).next() # only one formula + except StopIteration: continue - struct_added = False - for snl_coll in snl_collections: + structures = [] + for dct in group['structures']: + s = Structure.from_dict(dct) + s.snl_id = dct['snl_id'] + s.remove_oxidation_states() try: - q = {'$or': [{k: formula} for k in aggregation_keys]} - group = aggregate_by_formula(snl_coll, q).next() # only one formula - except StopIteration: - continue - - structures = [] - for dct in group['structures']: - s = Structure.from_dict(dct) - s.snl_id = dct['snl_id'] - s.remove_oxidation_states() - try: - sgnum = get_sg(s) - except Exception as ex: - s.to(fmt='json', filename='sgnum_{}.json'.format(s.snl_id)) - print('SNL {}: {}'.format(s.snl_id, ex)) - continue - if sgnum == sg: - structures.append(s) - - if not structures: + sgnum = get_sg(s) + except Exception as ex: + s.to(fmt='json', filename='sgnum_{}.json'.format(s.snl_id)) + print('SNL {}: {}'.format(s.snl_id, ex)) continue + if sgnum == sg: + structures.append(s) - canonical_structures = [] - for g in group_structures(structures): - canonical_structures.append(g[0]) + if not structures: + continue - if not canonical_structures: - continue + canonical_structures = [] + for g in group_structures(structures): + canonical_structures.append(g[0]) - for s in canonical_structures: - if structures_match(struct, s): - print('Structure for', formula, sg, 'already added as SNL', s.snl_id, 'in', snl_coll.full_name) - struct_added = True - break + if not canonical_structures: + continue - if struct_added: + for s in canonical_structures: + if structures_match(struct, s): + print('Structure for', formula, sg, 'already added as SNL', s.snl_id, 'in', snl_coll.full_name) + struct_added = True break if struct_added: - continue + break - prefix = snl_collections[0].database.name - index = 
max([int(snl_id[len(prefix)+1:]) for snl_id in snl_collections[0].distinct('snl_id')]) + len(snls) + 1 - snl_id = '{}-{}'.format(prefix, index) - print('append SNL for structure with', formula, sg, 'as', snl_id) - references = meta.get('references', '').strip() - if isinstance(struct, TransformedStructure): - snl = struct.to_snl(meta['authors'], references=references, projects=[tag]) - else: - snl = StructureNL(struct, meta['authors'], references=references, projects=[tag]) - snl_dct = snl.as_dict() - snl_dct.update(get_meta_from_structure(struct)) - snl_dct['snl_id'] = snl_id - snls.append(snl_dct) - - if snls: - print('add', len(snls), 'SNLs') - if insert: - result = snl_collections[0].insert_many(snls) - print('#SNLs inserted:', len(result.inserted_ids)) - else: - print('no SNLs to insert') + if struct_added: + continue + + prefix = snl_collections[0].database.name + if index is None: + index = max([int(snl_id[len(prefix)+1:]) for snl_id in snl_collections[0].distinct('snl_id')]) + 1 + else: + index += 1 + snl_id = '{}-{}'.format(prefix, index) + print('append SNL for structure with', formula, sg, 'as', snl_id) + references = meta.get('references', '').strip() + if isinstance(istruct, TransformedStructure): + snl = istruct.to_snl(meta['authors'], references=references, projects=[tag]) + else: + snl = StructureNL(istruct, meta['authors'], references=references, projects=[tag]) + snl_dct = snl.as_dict() + snl_dct.update(get_meta_from_structure(struct)) + snl_dct['snl_id'] = snl_id + snls.append(snl_dct) + + if idx and not idx%100 or idx == len(input_structures)-1: + insert_snls(snls) From 0a97d882b84506a2fd32a1c9622a99375dd8ade2 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 6 Jul 2018 14:43:36 -0700 Subject: [PATCH 24/97] cli: include tags from all collections --- emmet/scripts/emmet.py | 482 ++++++++++++++++++++--------------------- 1 file changed, 241 insertions(+), 241 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 43c75ad402..000af25502 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -257,24 +257,22 @@ def add_wflows(add_snls_dbs, add_tasks_db, tag, insert, clear_logs, max_structur q = {'$and': [{'$or': [{'about.remarks': t}, {'about.projects': t}]}, exclude]} q.update(base_query) if t not in all_tags: - all_tags[t] = [snl_coll.count(q), snl_coll] - else: - print('tag -', t, '- already in', all_tags[t][-1].full_name) + all_tags[t] = [] + all_tags[t].append([snl_coll.count(q), snl_coll]) print('sort and analyze tags ...') - sorted_tags = sorted(all_tags.items(), key=lambda x: x[1][0]) + sorted_tags = sorted(all_tags.items(), key=lambda x: x[1][0][0]) for item in sorted_tags: - to_scan = item[1][0] - lpad.db.add_wflows_logs.count({'tags': item[0]}) - if item[1][0] < max_structures and to_scan: - tags[item[0]] = [item[1][0], to_scan, item[1][-1]] + total = sum([x[0] for x in item[1]]) + to_scan = total - lpad.db.add_wflows_logs.count({'tags': item[0]}) + if total < max_structures and to_scan: + tags[item[0]] = [total, to_scan, [x[-1] for x in item[1]]] else: query = {'$and': [{'$or': [{'about.remarks': tag}, {'about.projects': tag}]}, exclude]} query.update(base_query) - for snl_coll in snl_collections: - cnt = snl_coll.count(query) - if cnt: - to_scan = cnt - lpad.db.add_wflows_logs.count({'tags': tag}) - tags[tag] = [cnt, to_scan, snl_coll] - break + total = sum([snl_coll.count(query) for snl_coll in snl_collections]) + if total: + to_scan = total - lpad.db.add_wflows_logs.count({'tags': tag}) + tags[tag] = [total, to_scan, 
snl_collections] if not tags: print('nothing to scan') @@ -282,7 +280,7 @@ def add_wflows(add_snls_dbs, add_tasks_db, tag, insert, clear_logs, max_structur print(len(tags), 'tags to scan in source SNL collections:') if tag is None: print('[with < {} structures to scan]'.format(max_structures)) - print('\n'.join(['{} {} ({}) --> {} TO SCAN'.format(v[2].full_name, k, v[0], v[1]) for k, v in tags.items()])) + print('\n'.join(['{} ({}) --> {} TO SCAN'.format(k, v[0], v[1]) for k, v in tags.items()])) canonical_task_structures = {} grouped_workflow_structures = {} @@ -335,251 +333,253 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if skip_all_scanned and not value[1]: continue - print('aggregate', value[0], 'structures for', tag, '...') - structure_groups = aggregate_by_formula(value[-1], {'$or': [{'about.remarks': tag}, {'about.projects': tag}]}) - - print('loop formulas for', tag, '...') - counter = Counter() - structures, canonical_structures = {}, {} + print(value[0], 'structures for', tag, '...') + for coll in value[-1]: + print('aggregate structures in', coll.full_name, '...') + structure_groups = aggregate_by_formula(coll, {'$or': [{'about.remarks': tag}, {'about.projects': tag}]}) - try: - for idx_group, group in enumerate(structure_groups): - - counter['formulas'] += 1 - formula = group['_id'] - if formula not in structures: - structures[formula] = {} - if formula not in canonical_structures: - canonical_structures[formula] = {} - if idx_group and not idx_group%1000: - print(idx_group, '...') - - for dct in group['structures']: - q = {'level': 'WARNING', 'formula': formula, 'snl_id': dct['snl_id']} - if mongo_handler.collection.find_one(q): - lpad.db.add_wflows_logs.update(q, {'$addToSet': {'tags': tag}}) - continue # already checked - q['level'] = 'ERROR' - if skip_all_scanned and mongo_handler.collection.find_one(q): - continue - mongo_handler.collection.remove(q) # avoid dups - counter['structures'] += 1 - s = Structure.from_dict(dct) - s.snl_id = dct['snl_id'] - s.task_id = dct.get('task_id') - try: - s.remove_oxidation_states() - except Exception as ex: - msg = 'SNL {}: {}'.format(s.snl_id, ex) - print(msg) - logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'tags': [tag], 'error': str(ex)}) - continue - try: - sgnum = get_sg(s) - except Exception as ex: - s.to(fmt='json', filename='sgnum_{}.json'.format(s.snl_id)) - msg = 'SNL {}: {}'.format(s.snl_id, ex) - print(msg) - logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'tags': [tag], 'error': str(ex)}) - continue - if sgnum not in structures[formula]: - structures[formula][sgnum] = [] - structures[formula][sgnum].append(s) - - for sgnum, slist in structures[formula].items(): - for g in group_structures(slist): - if sgnum not in canonical_structures[formula]: - canonical_structures[formula][sgnum] = [] - canonical_structures[formula][sgnum].append(g[0]) - if len(g) > 1: - for s in g[1:]: - logger.warning('duplicate structure', extra={ - 'formula': formula, 'snl_id': s.snl_id, 'tags': [tag], 'canonical_snl_id': g[0].snl_id - }) + print('loop formulas for', tag, '...') + counter = Counter() + structures, canonical_structures = {}, {} - if not canonical_structures[formula]: - continue - canonical_structures_list = [x for sublist in canonical_structures[formula].values() for x in sublist] - - if formula not in canonical_workflow_structures: - canonical_workflow_structures[formula], grouped_workflow_structures[formula] = {}, {} - workflows = 
lpad.workflows.find({'metadata.formula_pretty': formula}, {'metadata.structure': 1, 'nodes': 1, 'parent_links': 1}) - if workflows.count() > 0: - workflow_structures = {} - for wf in workflows: - s = Structure.from_dict(wf['metadata']['structure']) + try: + for idx_group, group in enumerate(structure_groups): + + counter['formulas'] += 1 + formula = group['_id'] + if formula not in structures: + structures[formula] = {} + if formula not in canonical_structures: + canonical_structures[formula] = {} + if idx_group and not idx_group%1000: + print(idx_group, '...') + + for dct in group['structures']: + q = {'level': 'WARNING', 'formula': formula, 'snl_id': dct['snl_id']} + if mongo_handler.collection.find_one(q): + lpad.db.add_wflows_logs.update(q, {'$addToSet': {'tags': tag}}) + continue # already checked + q['level'] = 'ERROR' + if skip_all_scanned and mongo_handler.collection.find_one(q): + continue + mongo_handler.collection.remove(q) # avoid dups + counter['structures'] += 1 + s = Structure.from_dict(dct) + s.snl_id = dct['snl_id'] + s.task_id = dct.get('task_id') + try: s.remove_oxidation_states() - sgnum = get_sg(s) - if sgnum in canonical_structures[formula]: - if sgnum not in workflow_structures: - workflow_structures[sgnum] = [] - s.fw_id = [n for n in wf['nodes'] if str(n) not in wf['parent_links']][0] # first node = SO firework - workflow_structures[sgnum].append(s) - if workflow_structures: - for sgnum, slist in workflow_structures.items(): - grouped_workflow_structures[formula][sgnum] = [g for g in group_structures(slist)] - canonical_workflow_structures[formula][sgnum] = [g[0] for g in grouped_workflow_structures[formula][sgnum]] - #print(sum([len(x) for x in canonical_workflow_structures[formula].values()]), 'canonical workflow structure(s) for', formula) - - for idx_canonical, (sgnum, slist) in enumerate(canonical_structures[formula].items()): - - for struc in slist: - + except Exception as ex: + msg = 'SNL {}: {}'.format(s.snl_id, ex) + print(msg) + logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'tags': [tag], 'error': str(ex)}) + continue try: - struct = vp.get_predicted_structure(struc) - struct.snl_id, struct.task_id = struc.snl_id, struc.task_id + sgnum = get_sg(s) except Exception as ex: - print('Structure for SNL', struc.snl_id, '--> VP error: use original structure!') - print(ex) - struct = struc - - if not structures_match(struct, struc): - print('Structure for SNL', struc.snl_id, '--> VP mismatch: use original structure!') - struct = struc - - wf_found = False - if sgnum in canonical_workflow_structures[formula] and canonical_workflow_structures[formula][sgnum]: - for sidx, s in enumerate(canonical_workflow_structures[formula][sgnum]): - if structures_match(struct, s): - msg = 'Structure for SNL {} already added in WF {}'.format(struct.snl_id, s.fw_id) - print(msg) - if struct.task_id is not None: - task_query = {'task_id': struct.task_id} - task_query.update(task_base_query) - for full_name in reversed(tasks_collections): - task = tasks_collections[full_name].find_one(task_query, ['input.structure']) + s.to(fmt='json', filename='sgnum_{}.json'.format(s.snl_id)) + msg = 'SNL {}: {}'.format(s.snl_id, ex) + print(msg) + logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'tags': [tag], 'error': str(ex)}) + continue + if sgnum not in structures[formula]: + structures[formula][sgnum] = [] + structures[formula][sgnum].append(s) + + for sgnum, slist in structures[formula].items(): + for g in group_structures(slist): + if sgnum not in 
canonical_structures[formula]: + canonical_structures[formula][sgnum] = [] + canonical_structures[formula][sgnum].append(g[0]) + if len(g) > 1: + for s in g[1:]: + logger.warning('duplicate structure', extra={ + 'formula': formula, 'snl_id': s.snl_id, 'tags': [tag], 'canonical_snl_id': g[0].snl_id + }) + + if not canonical_structures[formula]: + continue + canonical_structures_list = [x for sublist in canonical_structures[formula].values() for x in sublist] + + if formula not in canonical_workflow_structures: + canonical_workflow_structures[formula], grouped_workflow_structures[formula] = {}, {} + workflows = lpad.workflows.find({'metadata.formula_pretty': formula}, {'metadata.structure': 1, 'nodes': 1, 'parent_links': 1}) + if workflows.count() > 0: + workflow_structures = {} + for wf in workflows: + s = Structure.from_dict(wf['metadata']['structure']) + s.remove_oxidation_states() + sgnum = get_sg(s) + if sgnum in canonical_structures[formula]: + if sgnum not in workflow_structures: + workflow_structures[sgnum] = [] + s.fw_id = [n for n in wf['nodes'] if str(n) not in wf['parent_links']][0] # first node = SO firework + workflow_structures[sgnum].append(s) + if workflow_structures: + for sgnum, slist in workflow_structures.items(): + grouped_workflow_structures[formula][sgnum] = [g for g in group_structures(slist)] + canonical_workflow_structures[formula][sgnum] = [g[0] for g in grouped_workflow_structures[formula][sgnum]] + #print(sum([len(x) for x in canonical_workflow_structures[formula].values()]), 'canonical workflow structure(s) for', formula) + + for idx_canonical, (sgnum, slist) in enumerate(canonical_structures[formula].items()): + + for struc in slist: + + try: + struct = vp.get_predicted_structure(struc) + struct.snl_id, struct.task_id = struc.snl_id, struc.task_id + except Exception as ex: + print('Structure for SNL', struc.snl_id, '--> VP error: use original structure!') + print(ex) + struct = struc + + if not structures_match(struct, struc): + print('Structure for SNL', struc.snl_id, '--> VP mismatch: use original structure!') + struct = struc + + wf_found = False + if sgnum in canonical_workflow_structures[formula] and canonical_workflow_structures[formula][sgnum]: + for sidx, s in enumerate(canonical_workflow_structures[formula][sgnum]): + if structures_match(struct, s): + msg = 'Structure for SNL {} already added in WF {}'.format(struct.snl_id, s.fw_id) + print(msg) + if struct.task_id is not None: + task_query = {'task_id': struct.task_id} + task_query.update(task_base_query) + for full_name in reversed(tasks_collections): + task = tasks_collections[full_name].find_one(task_query, ['input.structure']) + if task: + break if task: - break - if task: - s_task = Structure.from_dict(task['input']['structure']) - s_task.remove_oxidation_states() - if not structures_match(struct, s_task): - msg = ' --> ERROR: Structure for SNL {} does not match {}'.format(struct.snl_id, struct.task_id) - msg += ' --> CLEANUP: remove task_id from SNL' - print(msg) - value[-1].update({'snl_id': struct.snl_id}, {'$unset': {'about._materialsproject.task_id': 1}}) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id, 'tags': [tag]}) - counter['snl-task_mismatch'] += 1 - else: - msg = ' --> OK: workflow resulted in matching task {}'.format(struct.task_id) - print(msg) - logger.warning(msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id, 'tags': [tag] - }) - else: - print(' --> did not find task', struct.task_id, 'for WF', 
s.fw_id) - fw_ids = [x.fw_id for x in grouped_workflow_structures[formula][sgnum][sidx]] - fws = lpad.fireworks.find({'fw_id': {'$in': fw_ids}}, ['fw_id', 'spec._tasks']) - fw_found = False - for fw in fws: - if fw['spec']['_tasks'][5]['additional_fields'].get('task_id') == struct.task_id: - msg = ' --> OK: workflow {} will result in intended task-id {}'.format(fw['fw_id'], struct.task_id) + s_task = Structure.from_dict(task['input']['structure']) + s_task.remove_oxidation_states() + if not structures_match(struct, s_task): + msg = ' --> ERROR: Structure for SNL {} does not match {}'.format(struct.snl_id, struct.task_id) + msg += ' --> CLEANUP: remove task_id from SNL' print(msg) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id, 'tags': [tag]}) - fw_found = True - break - if not fw_found: - print(' --> no WF with enforced task-id', struct.task_id) - fw = lpad.fireworks.find_one({'fw_id': s.fw_id}, {'state': 1}) - print(' -->', s.fw_id, fw['state']) - if fw['state'] == 'COMPLETED': - # the task is in lpad.db.tasks with different integer task_id - # => find task => overwrite task_id => add_tasks will pick it up - full_name = list(tasks_collections.keys())[0] - load_canonical_task_structures(formula, full_name) - matched_task_ids = find_matching_canonical_task_structures(formula, struct, full_name) - if len(matched_task_ids) == 1: - tasks_collections[full_name].update( - {'task_id': matched_task_ids[0]}, { - '$set': {'task_id': struct.task_id, 'retired_task_id': matched_task_ids[0], 'last_updated': datetime.utcnow()}, - '$addToSet': {'tags': tag} - } - ) - print(' --> replaced task_id', matched_task_ids[0], 'with', struct.task_id, 'in', full_name) - elif matched_task_ids: - msg = ' --> ERROR: multiple tasks {} for completed WF {}'.format(matched_task_ids, s.fw_id) + coll.update({'snl_id': struct.snl_id}, {'$unset': {'about._materialsproject.task_id': 1}}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id, 'tags': [tag]}) + counter['snl-task_mismatch'] += 1 + else: + msg = ' --> OK: workflow resulted in matching task {}'.format(struct.task_id) + print(msg) + logger.warning(msg, extra={ + 'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id, 'tags': [tag] + }) + else: + print(' --> did not find task', struct.task_id, 'for WF', s.fw_id) + fw_ids = [x.fw_id for x in grouped_workflow_structures[formula][sgnum][sidx]] + fws = lpad.fireworks.find({'fw_id': {'$in': fw_ids}}, ['fw_id', 'spec._tasks']) + fw_found = False + for fw in fws: + if fw['spec']['_tasks'][5]['additional_fields'].get('task_id') == struct.task_id: + msg = ' --> OK: workflow {} will result in intended task-id {}'.format(fw['fw_id'], struct.task_id) print(msg) - logger.error(msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'error': 'Multiple tasks for Completed WF' - }) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id, 'tags': [tag]}) + fw_found = True + break + if not fw_found: + print(' --> no WF with enforced task-id', struct.task_id) + fw = lpad.fireworks.find_one({'fw_id': s.fw_id}, {'state': 1}) + print(' -->', s.fw_id, fw['state']) + if fw['state'] == 'COMPLETED': + # the task is in lpad.db.tasks with different integer task_id + # => find task => overwrite task_id => add_tasks will pick it up + full_name = list(tasks_collections.keys())[0] + load_canonical_task_structures(formula, full_name) + matched_task_ids = 
find_matching_canonical_task_structures(formula, struct, full_name) + if len(matched_task_ids) == 1: + tasks_collections[full_name].update( + {'task_id': matched_task_ids[0]}, { + '$set': {'task_id': struct.task_id, 'retired_task_id': matched_task_ids[0], 'last_updated': datetime.utcnow()}, + '$addToSet': {'tags': tag} + } + ) + print(' --> replaced task_id', matched_task_ids[0], 'with', struct.task_id, 'in', full_name) + elif matched_task_ids: + msg = ' --> ERROR: multiple tasks {} for completed WF {}'.format(matched_task_ids, s.fw_id) + print(msg) + logger.error(msg, extra={ + 'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'error': 'Multiple tasks for Completed WF' + }) + else: + msg = ' --> ERROR: task for completed WF {} does not exist!'.format(s.fw_id) + msg += ' --> CLEANUP: delete {} WF and re-add/run to enforce task-id {}'.format(fw['state'], struct.task_id) + print(msg) + lpad.delete_wf(s.fw_id) + break else: - msg = ' --> ERROR: task for completed WF {} does not exist!'.format(s.fw_id) - msg += ' --> CLEANUP: delete {} WF and re-add/run to enforce task-id {}'.format(fw['state'], struct.task_id) - print(msg) + print(' --> CLEANUP: delete {} WF and re-add to include task_id as additional_field'.format(fw['state'])) lpad.delete_wf(s.fw_id) break - else: - print(' --> CLEANUP: delete {} WF and re-add to include task_id as additional_field'.format(fw['state'])) - lpad.delete_wf(s.fw_id) - break - else: - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id, 'tags': [tag]}) - wf_found = True + else: + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id, 'tags': [tag]}) + wf_found = True + break + + if wf_found: + continue + + # need to check tasks b/c not every task is guaranteed to have a workflow (e.g. VASP dir parsing) + msg, matched_task_ids = '', OrderedDict() + for full_name in reversed(tasks_collections): + load_canonical_task_structures(formula, full_name) + matched_task_ids[full_name] = find_matching_canonical_task_structures(formula, struct, full_name) + if struct.task_id is not None and matched_task_ids[full_name] and struct.task_id not in matched_task_ids[full_name]: + msg = ' --> WARNING: task {} not in {}'.format(struct.task_id, matched_task_ids[full_name]) + print(msg) + if matched_task_ids[full_name]: break + if any(matched_task_ids.values()): + logger.warning('matched task ids' + msg, extra={ + 'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], + 'task_id(s)': dict((k.replace('.', '#'), v) for k, v in matched_task_ids.items()) + }) + continue - if wf_found: - continue - - # need to check tasks b/c not every task is guaranteed to have a workflow (e.g. 
VASP dir parsing) - msg, matched_task_ids = '', OrderedDict() - for full_name in reversed(tasks_collections): - load_canonical_task_structures(formula, full_name) - matched_task_ids[full_name] = find_matching_canonical_task_structures(formula, struct, full_name) - if struct.task_id is not None and matched_task_ids[full_name] and struct.task_id not in matched_task_ids[full_name]: - msg = ' --> WARNING: task {} not in {}'.format(struct.task_id, matched_task_ids[full_name]) + no_potcars = set(NO_POTCARS) & set(struct.composition.elements) + if len(no_potcars) > 0: + msg = 'Structure for SNL {} --> NO POTCARS: {}'.format(struct.snl_id, no_potcars) print(msg) - if matched_task_ids[full_name]: - break - if any(matched_task_ids.values()): - logger.warning('matched task ids' + msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], - 'task_id(s)': dict((k.replace('.', '#'), v) for k, v in matched_task_ids.items()) - }) - continue - - no_potcars = set(NO_POTCARS) & set(struct.composition.elements) - if len(no_potcars) > 0: - msg = 'Structure for SNL {} --> NO POTCARS: {}'.format(struct.snl_id, no_potcars) - print(msg) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'error': no_potcars}) - continue + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'error': no_potcars}) + continue + + try: + wf = wf_structure_optimization(struct, c={'ADD_MODIFY_INCAR': True}) + wf = add_trackers(wf) + wf = add_tags(wf, [tag]) + if struct.task_id is not None: + wf = add_additional_fields_to_taskdocs(wf, update_dict={'task_id': struct.task_id}) + except Exception as ex: + msg = 'Structure for SNL {} --> SKIP: Could not make workflow --> {}'.format(struct.snl_id, str(ex)) + print(msg) + logger.error(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'error': 'could not make workflow'}) + continue - try: - wf = wf_structure_optimization(struct, c={'ADD_MODIFY_INCAR': True}) - wf = add_trackers(wf) - wf = add_tags(wf, [tag]) + msg = 'Structure for SNL {} --> ADD WORKFLOW'.format(struct.snl_id) if struct.task_id is not None: - wf = add_additional_fields_to_taskdocs(wf, update_dict={'task_id': struct.task_id}) - except Exception as ex: - msg = 'Structure for SNL {} --> SKIP: Could not make workflow --> {}'.format(struct.snl_id, str(ex)) + msg += ' --> enforcing task-id {}'.format(struct.task_id) print(msg) - logger.error(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'error': 'could not make workflow'}) - continue - - msg = 'Structure for SNL {} --> ADD WORKFLOW'.format(struct.snl_id) - if struct.task_id is not None: - msg += ' --> enforcing task-id {}'.format(struct.task_id) - print(msg) - - if insert: - old_new = lpad.add_wf(wf) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'fw_id': list(old_new.values())[0]}) - else: - logger.error(msg + ' --> DRY RUN', extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag]}) - counter['add(ed)'] += 1 - except CursorNotFound as ex: - print(ex) - sites_elements = set([ - (len(set([e.symbol for e in x.composition.elements])), x.num_sites) - for x in canonical_structures_list - ]) - print(len(canonical_structures_list), 'canonical structure(s) for', formula, sites_elements) - if tag is not None: - print('trying again ...') - add_wflows(add_snls_dbs, add_tasks_db, tag, insert, clear_logs, max_structures, True) - - print(counter) + if insert: + old_new = lpad.add_wf(wf) + logger.warning(msg, 
extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'fw_id': list(old_new.values())[0]}) + else: + logger.error(msg + ' --> DRY RUN', extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag]}) + counter['add(ed)'] += 1 + + except CursorNotFound as ex: + print(ex) + sites_elements = set([ + (len(set([e.symbol for e in x.composition.elements])), x.num_sites) + for x in canonical_structures_list + ]) + print(len(canonical_structures_list), 'canonical structure(s) for', formula, sites_elements) + if tag is not None: + print('trying again ...') + add_wflows(add_snls_dbs, add_tasks_db, tag, insert, clear_logs, max_structures, True) + + print(counter) def structures_match(s1, s2): From 1d40b10838912b32e726bc62f9dc413297b81214 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 6 Jul 2018 16:41:32 -0700 Subject: [PATCH 25/97] cli: multiple collections for tag flag --- emmet/scripts/emmet.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 000af25502..b9e2e52dec 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -269,10 +269,11 @@ def add_wflows(add_snls_dbs, add_tasks_db, tag, insert, clear_logs, max_structur else: query = {'$and': [{'$or': [{'about.remarks': tag}, {'about.projects': tag}]}, exclude]} query.update(base_query) - total = sum([snl_coll.count(query) for snl_coll in snl_collections]) + cnts = [snl_coll.count(query) for snl_coll in snl_collections] + total = sum(cnts) if total: to_scan = total - lpad.db.add_wflows_logs.count({'tags': tag}) - tags[tag] = [total, to_scan, snl_collections] + tags[tag] = [total, to_scan, [snl_coll for idx, snl_coll in enumerate(snl_collections) if cnts[idx]]] if not tags: print('nothing to scan') From fee49e6983347f814f05769cd7a47eb77225ae2c Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 10 Jul 2018 10:27:23 -0700 Subject: [PATCH 26/97] cli: exclude query bugfix --- emmet/scripts/emmet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index b9e2e52dec..beb4bcd783 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -106,13 +106,13 @@ def get_subdir(dn): tags = [tag] if tag is None: - tags = [t for t in source.collection.find(exclude).distinct('tags') if t is not None] + tags = [t for t in source.collection.find(task_base_query).distinct('tags') if t is not None] print(len(tags), 'tags in source collection') for t in tags: print('### {} ###'.format(t)) - query = {'$and': [{'tags': t}, exclude]} + query = {'$and': [{'tags': t}, task_base_query]} source_count = source.collection.count(query) print('source / target:', source_count, '/', target.collection.count(query)) From 410034f2efc07bc746d26a73964a21b87e689cd1 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 17 Jul 2018 13:48:11 -0700 Subject: [PATCH 27/97] cli: skip_all_scanned fix, fizzled rate --- emmet/scripts/emmet.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index beb4bcd783..5be0bff2f0 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -263,7 +263,10 @@ def add_wflows(add_snls_dbs, add_tasks_db, tag, insert, clear_logs, max_structur sorted_tags = sorted(all_tags.items(), key=lambda x: x[1][0][0]) for item in sorted_tags: total = sum([x[0] for x in item[1]]) - to_scan = total - lpad.db.add_wflows_logs.count({'tags': item[0]}) + q = {'tags': item[0]} + if not skip_all_scanned: + 
q['level'] = 'WARNING' + to_scan = total - lpad.db.add_wflows_logs.count(q) if total < max_structures and to_scan: tags[item[0]] = [total, to_scan, [x[-1] for x in item[1]]] else: @@ -272,7 +275,10 @@ def add_wflows(add_snls_dbs, add_tasks_db, tag, insert, clear_logs, max_structur cnts = [snl_coll.count(query) for snl_coll in snl_collections] total = sum(cnts) if total: - to_scan = total - lpad.db.add_wflows_logs.count({'tags': tag}) + q = {'tags': tag} + if not skip_all_scanned: + q['level'] = 'WARNING' + to_scan = total - lpad.db.add_wflows_logs.count(q) tags[tag] = [total, to_scan, [snl_coll for idx, snl_coll in enumerate(snl_collections) if cnts[idx]]] if not tags: @@ -362,6 +368,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): continue # already checked q['level'] = 'ERROR' if skip_all_scanned and mongo_handler.collection.find_one(q): + lpad.db.add_wflows_logs.update(q, {'$addToSet': {'tags': tag}}) continue mongo_handler.collection.remove(q) # avoid dups counter['structures'] += 1 @@ -642,8 +649,8 @@ def report(tag): progress = '{:.0f}%'.format(progress) entry = [tc, nr_snls, wflows_to_add, total] + [counter[state] for state in states] fizzled = counter['FIZZLED'] / total if total else 0. - if progress != '-': - fizzled = counter['FIZZLED'] / counter['COMPLETED'] if counter['COMPLETED'] else 0. + if progress != '-' and bool(counter['COMPLETED'] + counter['FIZZLED']): + fizzled = counter['FIZZLED'] / (counter['COMPLETED'] + counter['FIZZLED']) percent_fizzled = "\033[1;31m{:.0f}%\033[0m".format(fizzled*100.) \ if fizzled > 0.2 else '{:.0f}%'.format(fizzled*100.) entry.append(percent_fizzled) From 4c035c3bbdd78bc8bbf4656825e5b2b44f1ea3f7 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 15 Aug 2018 10:43:55 -0700 Subject: [PATCH 28/97] cli: insert continue, in-progress flag --- emmet/scripts/emmet.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 5be0bff2f0..8ac85e7129 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -141,6 +141,8 @@ def get_subdir(dn): continue print(len(subdirs), 'candidate tasks to insert') + if not insert: + continue for subdir_doc in subdirs: subdir_query = {'dir_name': {'$regex': '/{}$'.format(subdir_doc['subdir'])}} @@ -615,7 +617,8 @@ def format(self, record): @cli.command() @click.option('--tag', default=None, help='only include structures with specific tag') -def report(tag): +@click.option('--in-progress/--no-in-progress', default=False, help='show in-progress only') +def report(tag, in_progress): """generate a report of calculations status""" lpad = LaunchPad.auto_load() @@ -647,6 +650,8 @@ def report(tag): tc = "\033[1;34m{}\033[0m".format(t) progress = (counter['COMPLETED'] + counter['FIZZLED']) / total * 100. if total else 0. progress = '{:.0f}%'.format(progress) + elif in_progress: + continue entry = [tc, nr_snls, wflows_to_add, total] + [counter[state] for state in states] fizzled = counter['FIZZLED'] / total if total else 0. 
if progress != '-' and bool(counter['COMPLETED'] + counter['FIZZLED']): From 9db0478ee555fc1640f0f5a41197a4ef9b9f8973 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 15 Aug 2018 10:46:04 -0700 Subject: [PATCH 29/97] add data maintenance scripts --- emmet/scripts/garden_to_hpss.sh | 12 +++++ emmet/scripts/hpss_to_mpdrive.sh | 77 ++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) create mode 100755 emmet/scripts/garden_to_hpss.sh create mode 100755 emmet/scripts/hpss_to_mpdrive.sh diff --git a/emmet/scripts/garden_to_hpss.sh b/emmet/scripts/garden_to_hpss.sh new file mode 100755 index 0000000000..c139184e15 --- /dev/null +++ b/emmet/scripts/garden_to_hpss.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +for block_dir in `find $1 -maxdepth 1 -type d -name "block_*"`; do + echo $block_dir + subdir=`basename $block_dir` + if [ ! -e ${subdir}.tar.gz ]; then + tar -czvf ${subdir}.tar.gz ${block_dir} + fi + hsi cput ${subdir}.tar.gz : garden/${subdir}.tar.gz + [[ $? -ne 0 ]] && echo "not removing ${block_dir}" && continue + rm -rv $block_dir && rm -v ${subdir}.tar.gz +done diff --git a/emmet/scripts/hpss_to_mpdrive.sh b/emmet/scripts/hpss_to_mpdrive.sh new file mode 100755 index 0000000000..7f6a6da177 --- /dev/null +++ b/emmet/scripts/hpss_to_mpdrive.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +# $(find $dir -name 'INCAR.orig*' -printf '%h ') +dirs=`awk -F/ '{print $1}' $1 | sort -u` +hpss_missing="blocks_missing_in_hpss.txt" + +stage_dir="rclone_to_mp_drive" +[[ ! -d $stage_dir ]] && mkdir $stage_dir +[[ ! -e $hpss_missing ]] && touch $hpss_missing + +for dir in $dirs; do + [[ ! -e ${dir}.tar.gz ]] && echo "skip ${dir}" && continue # TODO remove + + files=`grep "^$dir" $1` + extract="${dir}.extract" + grep -q "$dir" $hpss_missing + [[ $? -eq 0 ]] && continue + + [[ -d $stage_dir/$dir ]] && rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir + + missing_paths="${dir}.paths" + echo $files | tr ' ' '\n' | sort -u > ${dir}.files + rclone lsf -R --files-only mp-drive:calculations/garden/$dir | sed "s:^:$dir/:g" | sed 's:.tar.gz::g' | sort -u > ${dir}.rclone_lsf + for f in $(comm --check-order -23 ${dir}.files ${dir}.rclone_lsf); do # launch dirs missing in mp-drive + launch_dir_tar="${stage_dir}/${f}.tar.gz" + if [[ ! -f $launch_dir_tar || ! -s $launch_dir_tar ]]; then + echo $f >> $missing_paths + elif [ -d $f ]; then + rm -rv $f + fi + done + + for f in $(comm --check-order -12 ${dir}.files ${dir}.rclone_lsf | tr '\n' ' '); do # already cloned launch dirs -> cleanup + launch_dir_tar="${stage_dir}/${f}.tar.gz" + [[ -d $f ]] && rm -rv $f + [[ -e $launch_dir_tar ]] && rm -v $launch_dir_tar + done + rm -v ${dir}.files ${dir}.rclone_lsf + + [[ ! -e $missing_paths ]] && continue + + if [ ! -e ${dir}.tar.gz ] || [ ! -s ${dir}.tar.gz ]; then + hsi -q "get garden/${dir}.tar.gz" + [[ $? -ne 0 ]] && echo ${dir} >> $hpss_missing && continue + fi + ls -ltrh ${dir}.tar.gz + + if [ ! -e ${dir}.tar_list ] || [ ! -s ${dir}.tar_list ]; then + echo "make ${dir}.tar_list ..." + tar -tzvf ${dir}.tar.gz | grep ^d | grep -v -e '/relax1/' -e '/relax2/' | awk {'print $6'} 2>&1 | tee ${dir}.tar_list + fi + + paths=`cat $missing_paths` + for f in $paths; do + [[ ! -d $f ]] && grep $f ${dir}.tar_list >> $extract + done + + if [ -e $extract ] && [ -s $extract ]; then + echo "extract" `wc -l $extract` + tar -xvzf ${dir}.tar.gz --files-from $extract + fi + rm -v $extract + + for f in $paths; do + launch_dir_tar="${stage_dir}/${f}.tar.gz" + echo $launch_dir_tar ... 
+ mkdir -p `dirname $launch_dir_tar` + tar_code=$(tar -czf $launch_dir_tar -C `dirname $f` `basename $f`) + [[ $tar_code -ne 0 ]] && echo 'problem with launch dir tar!' && break + ls -ltrh $launch_dir_tar + [[ -d $f ]] && rm -r $f + done + rm -v $missing_paths + + rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir + +done From 455e94fb21de1a347228066dd4c50489ead29658 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 15 Aug 2018 13:48:22 -0700 Subject: [PATCH 30/97] cli parse subcommand --- emmet/scripts/emmet.py | 68 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 8ac85e7129..2af685db8c 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -10,7 +10,9 @@ from pymatgen.alchemy.materials import TransformedStructure from pymatgen.util.provenance import StructureNL, Author from fireworks import LaunchPad +from fireworks.fw_config import FW_BLOCK_FORMAT from atomate.vasp.database import VaspCalcDb +from atomate.vasp.drones import VaspDrone from atomate.vasp.workflows.presets.core import wf_structure_optimization from atomate.vasp.powerups import add_trackers, add_tags, add_additional_fields_to_taskdocs from emmet.vasp.materials import group_structures, get_sg @@ -82,6 +84,9 @@ def ensure_meta(snls_db): ensure_indexes(['snl_id', 'formula_pretty', 'nelements', 'nsites', 'is_ordered', 'is_valid'], [snl_coll]) +def get_subdir(dn): + return dn.rsplit(os.sep, 1)[-1] + @cli.command() @click.argument('target_db_file', type=click.Path(exists=True)) @click.option('--tag', default=None, help='only insert tasks with specific tag') @@ -92,9 +97,6 @@ def copy_tasks(target_db_file, tag, insert): if not insert: print('DRY RUN: add --insert flag to actually add tasks to production') - def get_subdir(dn): - return dn.rsplit(os.sep, 1)[-1] - lpad = LaunchPad.auto_load() source = VaspCalcDb(lpad.host, lpad.port, lpad.name, 'tasks', lpad.username, lpad.password) print('connected to source db with', source.collection.count(), 'tasks') @@ -843,3 +845,63 @@ def insert_snls(snls_list): if idx and not idx%100 or idx == len(input_structures)-1: insert_snls(snls) + +@cli.command() +@click.argument('base_path', type=click.Path(exists=True)) +@click.option('--insert/--no-insert', default=False, help='actually execute task insertion') +def parse(base_path, insert): + """parse VASP output directories in base_path into tasks and tag""" + if not insert: + print('DRY RUN: add --insert flag to actually insert tasks') + + lpad = LaunchPad.auto_load() + target = VaspCalcDb(lpad.host, lpad.port, lpad.name, 'tasks', lpad.username, lpad.password) + print('connected to target db with', target.collection.count(), 'tasks') + base_path_split = base_path.split(os.sep) + tag = base_path_split[-1] if base_path_split[-1] else base_path_split[-2] + drone = VaspDrone(parse_dos='auto', additional_fields={'tags': [tag]}) + already_inserted_subdirs = [get_subdir(dn) for dn in target.collection.find({'tags': tag}).distinct('dir_name')] + print(len(already_inserted_subdirs), 'VASP directories already inserted for', tag) + + def get_timestamp_dir(prefix='launcher'): + time_now = datetime.utcnow().strftime(FW_BLOCK_FORMAT) + return '_'.join([prefix, time_now]) + + def get_vasp_dirs(): + for root, dirs, files in os.walk(base_path): + # TODO ignore relax1/2 subdirs if INCAR.orig found + if any(f.startswith("INCAR") for f in files): + if insert: + root_split = os.path.realpath(root).split(os.sep) + idx = 
len(base_path_split) + if not root_split[idx-1].startswith('block_'): + rootdir = os.sep.join(root_split[:idx]) + block = get_timestamp_dir(prefix='block') + block_dir = os.sep.join(root_split[:idx-1] + [block]) + os.rename(rootdir, block_dir) + os.symlink(block_dir, rootdir) + print(rootdir, '->', block_dir) + subdir = os.sep.join(root_split) + if not root_split[-1].startswith('launcher_'): + launch = get_timestamp_dir() + launch_dir = os.sep.join(root_split[:-1] + [launch]) + os.rename(subdir, launch_dir) + os.symlink(launch_dir, subdir) + print(subdir, '->', launch_dir) + yield launch_dir + else: + yield subdir + else: + yield root + + for vaspdir in get_vasp_dirs(): + subdir = get_subdir(vaspdir) + if subdir not in already_inserted_subdirs: + print(vaspdir) + try: + task_doc = drone.assimilate(vaspdir) + except Exception as ex: + print(str(ex)) + continue + if insert: + target.insert_task(task_doc, use_gridfs=True) From 7c8cdb14e16444f42e51a30124e419533c30d3b6 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 10 Sep 2018 15:40:47 -0700 Subject: [PATCH 31/97] cli report: add --to-csv option --- emmet/scripts/emmet.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 2af685db8c..38f91163c3 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,4 +1,4 @@ -import click, os, yaml, sys, logging, tarfile, bson, gzip +import click, os, yaml, sys, logging, tarfile, bson, gzip, csv from fnmatch import fnmatch from datetime import datetime from collections import Counter, OrderedDict @@ -620,7 +620,8 @@ def format(self, record): @cli.command() @click.option('--tag', default=None, help='only include structures with specific tag') @click.option('--in-progress/--no-in-progress', default=False, help='show in-progress only') -def report(tag, in_progress): +@click.option('--to-csv/--no-to-csv', default=False, help='save report as CSV') +def report(tag, in_progress, to_csv): """generate a report of calculations status""" lpad = LaunchPad.auto_load() @@ -649,7 +650,7 @@ def report(tag, in_progress): total = sum(v for k, v in counter.items() if k in states) tc, progress = t, '-' if wflows_to_add or counter['COMPLETED'] + counter['FIZZLED'] != total: - tc = "\033[1;34m{}\033[0m".format(t) + tc = "\033[1;34m{}\033[0m".format(t) if not to_csv else t progress = (counter['COMPLETED'] + counter['FIZZLED']) / total * 100. if total else 0. progress = '{:.0f}%'.format(progress) elif in_progress: @@ -658,8 +659,8 @@ def report(tag, in_progress): fizzled = counter['FIZZLED'] / total if total else 0. if progress != '-' and bool(counter['COMPLETED'] + counter['FIZZLED']): fizzled = counter['FIZZLED'] / (counter['COMPLETED'] + counter['FIZZLED']) - percent_fizzled = "\033[1;31m{:.0f}%\033[0m".format(fizzled*100.) \ - if fizzled > 0.2 else '{:.0f}%'.format(fizzled*100.) + sfmt = "\033[1;31m{:.0f}%\033[0m" if (not to_csv and fizzled > 0.2) else '{:.0f}%' + percent_fizzled = sfmt.format(fizzled*100.) 
entry.append(percent_fizzled) entry.append(progress) for idx, e in enumerate(entry): @@ -669,10 +670,19 @@ def report(tag, in_progress): table.add_row(entry) if tag is None: - table.add_row(['\033[1;32m{}\033[0m'.format(s if s else '-') for s in sums]) + sfmt = '{}' if to_csv else '\033[1;32m{}\033[0m' + table.add_row([sfmt.format(s if s else '-') for s in sums]) table.align['Tag'] = 'r' print(table) + if to_csv: + with open('emmet_report.csv', 'w') as csv_file: + writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) + writer.writerow(table._field_names) + options = table._get_options({}) + for row in table._get_rows(options): + writer.writerow(row) + @cli.command() @click.argument('archive', type=click.Path(exists=True)) From e4636d4ac8b5a5e9e553907a7ec0f26c342e38e7 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 17 Sep 2018 14:24:36 -0700 Subject: [PATCH 32/97] cli: add launcher_paths --- emmet/scripts/launcher_paths.py | 39 +++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 emmet/scripts/launcher_paths.py diff --git a/emmet/scripts/launcher_paths.py b/emmet/scripts/launcher_paths.py new file mode 100644 index 0000000000..fc7625bbf2 --- /dev/null +++ b/emmet/scripts/launcher_paths.py @@ -0,0 +1,39 @@ +import json +from atomate.vasp.database import VaspCalcDb + +target_db_file = '../dbfiles/db_atomate.json' +target = VaspCalcDb.from_db_file(target_db_file, admin=True) +print('connected to target db with', target.collection.count(), 'tasks') +print(target.db.materials.count(), 'materials') + +splits = ['block_', 'aflow_'] +mpids = json.load(open('KRao_Li_FullList.txt', 'r')) +print(len(mpids), 'mpids') +query = {'task_id': {'$in': mpids}} + +# {'mp-1002': [{'task_id': ..., 'task_type': ..., 'launcher_path': ...}, ...], ...} +out = {} + +for idx, doc in enumerate(target.db.materials.find(query, {'task_id': 1, 'blessed_tasks': 1})): + mp_id = doc['task_id'] + out[mp_id] = [] + print(idx, mp_id) + for task_type, task_id in doc['blessed_tasks'].items(): + dir_name = target.collection.find_one({'task_id': task_id}, {'dir_name': 1})['dir_name'] + if 'maarten_piezo' in dir_name: + continue + for s in splits: + ds = dir_name.split(s) + if len(ds) == 2: + launcher = s + ds[-1] + print(task_id, task_type, launcher) + out[mp_id].append({'task_id': task_id, 'task_type': task_type, 'launcher_path': launcher}) + break + +with open('launcher_paths.json', 'w') as f: + json.dump(out, f) + +with open('launcher_paths.txt', 'w') as f: + for mp_id, tasks in out.items(): + for task in tasks: + f.write(task['launcher_path']+'\n') From c2e763a7683321b03dbc127b0dd0c7cc872a5073 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 17 Sep 2018 17:43:43 -0700 Subject: [PATCH 33/97] cli.copy_tasks: also copy SNLs --- emmet/scripts/emmet.py | 50 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 38f91163c3..65b5cec444 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -111,6 +111,16 @@ def copy_tasks(target_db_file, tag, insert): tags = [t for t in source.collection.find(task_base_query).distinct('tags') if t is not None] print(len(tags), 'tags in source collection') + def insert_snls(snls_list): + if snls_list: + print('copy', len(snls_list), 'SNLs') + if insert: + result = target.db.snls.insert_many(snls_list) + print('#SNLs inserted:', len(result.inserted_ids)) + snls_list.clear() + else: + print('no SNLs to insert') + 
for t in tags: print('### {} ###'.format(t)) @@ -118,6 +128,44 @@ def copy_tasks(target_db_file, tag, insert): source_count = source.collection.count(query) print('source / target:', source_count, '/', target.collection.count(query)) + # get list of SNLs to copy over + # only need to check tagged SNLs in source and target; dup-check across SNL collections already done in add_snls + # also only need to check about.projects; add_snls adds tag to about.projects and not remarks + snls = lpad.db.snls.find({'about.projects': t}) + nr_snls = snls.count() + if nr_snls < target.db.snls.count({'about.projects': t}): + snls_to_copy, index, prefix = [], None, 'snl' + for idx, doc in enumerate(snls): + snl = StructureNL.from_dict(doc) + formula = snl.structure.composition.reduced_formula + snl_copied = False + try: + q = {'about.projects': t, '$or': [{k: formula} for k in aggregation_keys]} + group = aggregate_by_formula(target.db.snls, q).next() # only one formula + for dct in group['structures']: + existing_structure = Structure.from_dict(dct) + if structures_match(snl.structure, existing_structure): + snl_copied = True + print('SNL', doc['snl_id'], 'already added.') + break + except StopIteration: + pass + if snl_copied: + continue + snl_dct = snl.as_dict() + if index is None: + index = max([int(snl_id[len(prefix)+1:]) for snl_id in target.db.snls.distinct('snl_id')]) + 1 + else: + index += 1 + snl_id = '{}-{}'.format(prefix, index) + snl_dct['snl_id'] = snl_id + snl_dct.update(get_meta_from_structure(snl.structure)) + snls_to_copy.append(snl_dct) + if idx and not idx%100 or idx == nr_snls-1: + insert_snls(snls_to_copy) + else: + print('SNLs already copied.') + # skip tasks with task_id existing in target and with matching dir_name (have to be a string [mp-*, mvc-*]) nr_source_mp_tasks, skip_task_ids = 0, [] for doc in source.collection.find(query, ['task_id', 'dir_name']): @@ -712,7 +760,7 @@ def insert_snls(snls_list): if insert: result = snl_collections[0].insert_many(snls_list) print('#SNLs inserted:', len(result.inserted_ids)) - snls_list.clear() + snls_list.clear() else: print('no SNLs to insert') From 6384056a7159331d2baa251cf3fb5993db813e72 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 17 Sep 2018 17:44:01 -0700 Subject: [PATCH 34/97] cli.parse: only insert task if successful --- emmet/scripts/emmet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 65b5cec444..f636cc81c8 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -961,5 +961,5 @@ def get_vasp_dirs(): except Exception as ex: print(str(ex)) continue - if insert: + if insert and task_doc['state'] == 'successful': target.insert_task(task_doc, use_gridfs=True) From cc70059c8249fa42becadc907e35f3a96d25af55 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 18 Sep 2018 16:40:49 -0700 Subject: [PATCH 35/97] cli: separate add_snls for load/parse --- emmet/scripts/emmet.py | 92 ++++++++++++++++++++++++------------------ 1 file changed, 52 insertions(+), 40 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index f636cc81c8..efe5999860 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -243,14 +243,14 @@ def insert_snls(snls_list): @cli.command() -@click.option('--add_snls_dbs', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to scan') +@click.option('--add_snlcolls', '-a', type=click.Path(exists=True), help='YAML config file with 
multiple documents defining additional SNLs collections to scan') @click.option('--add_tasks_db', type=click.Path(exists=True), help='config file for additional tasks collection to scan') @click.option('--tag', default=None, help='only include structures with specific tag') @click.option('--insert/--no-insert', default=False, help='actually execute workflow addition') @click.option('--clear-logs/--no-clear-logs', default=False, help='clear MongoDB logs collection for specific tag') @click.option('--max-structures', '-m', default=1000, help='set max structures for tags to scan') @click.option('--skip-all-scanned/--no-skip-all-scanned', default=False, help='skip all already scanned structures incl. WFs2Add/Errors') -def add_wflows(add_snls_dbs, add_tasks_db, tag, insert, clear_logs, max_structures, skip_all_scanned): +def add_wflows(add_snlcolls, add_tasks_db, tag, insert, clear_logs, max_structures, skip_all_scanned): """add workflows based on tags in SNL collection""" if not insert: @@ -259,8 +259,8 @@ def add_wflows(add_snls_dbs, add_tasks_db, tag, insert, clear_logs, max_structur lpad = LaunchPad.auto_load() snl_collections = [lpad.db.snls] - if add_snls_dbs is not None: - for snl_db_config in yaml.load_all(open(add_snls_dbs, 'r')): + if add_snlcolls is not None: + for snl_db_config in yaml.load_all(open(add_snlcolls, 'r')): snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) snl_db = snl_db_conn[snl_db_config['db']] snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) @@ -637,7 +637,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): print(len(canonical_structures_list), 'canonical structure(s) for', formula, sites_elements) if tag is not None: print('trying again ...') - add_wflows(add_snls_dbs, add_tasks_db, tag, insert, clear_logs, max_structures, True) + add_wflows(add_snlcolls, add_tasks_db, tag, insert, clear_logs, max_structures, True) print(counter) @@ -734,37 +734,15 @@ def report(tag, in_progress, to_csv): @cli.command() @click.argument('archive', type=click.Path(exists=True)) -@click.option('--add_snls_dbs', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to check against') +@click.option('--add_snlcolls', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to check against') @click.option('--insert/--no-insert', default=False, help='actually execute SNL insertion') -def add_snls(archive, add_snls_dbs, insert): +def load(archive, add_snlcolls, insert): """add structures from archive of structure files (CIF, POSCAR, ...) to (local) SNLs collection""" # TODO assign task_ids to structures? if not insert: print('DRY RUN! 
Add --insert flag to actually add SNLs') - lpad = LaunchPad.auto_load() - snl_collections = [lpad.db.snls] - if add_snls_dbs is not None: - for snl_db_config in yaml.load_all(open(add_snls_dbs, 'r')): - snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) - snl_db = snl_db_conn[snl_db_config['db']] - snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) - snl_collections.append(snl_db[snl_db_config['collection']]) - for snl_coll in snl_collections: - print(snl_coll.count(), 'SNLs in', snl_coll.full_name) - - def insert_snls(snls_list): - if snls_list: - print('add', len(snls_list), 'SNLs') - if insert: - result = snl_collections[0].insert_many(snls_list) - print('#SNLs inserted:', len(result.inserted_ids)) - snls_list.clear() - else: - print('no SNLs to insert') - - fname, ext = os.path.splitext(os.path.basename(archive)) tag, sec_ext = fname.rsplit('.', 1) if '.' in fname else [fname, ''] if sec_ext: @@ -774,16 +752,6 @@ def insert_snls(snls_list): print(ext, 'not supported (yet)! Please use one of', exts) return - meta_path = '{}.yaml'.format(tag) - meta = None - if not os.path.exists(meta_path): - meta = {'authors': ['Materials Project ']} - print(meta_path, 'not found. Using', meta) - else: - with open(meta_path, 'r') as f: - meta = yaml.safe_load(f) - meta['authors'] = [Author.parse_author(a) for a in meta['authors']] - input_structures = [] if ext == 'bson.gz': for idx, doc in enumerate(bson.decode_file_iter(gzip.open(archive))): @@ -816,6 +784,42 @@ def insert_snls(snls_list): break #continue print(len(input_structures), 'structure(s) loaded.') + add_snls(tag, input_structures, add_snlcolls, insert) + + +def add_snls(tag, input_structures, add_snlcolls, insert): + """add structures to (local) SNLs collection""" + + meta_path = '{}.yaml'.format(tag) + meta = None + if not os.path.exists(meta_path): + meta = {'authors': ['Materials Project ']} + print(meta_path, 'not found. 
Using', meta) + else: + with open(meta_path, 'r') as f: + meta = yaml.safe_load(f) + meta['authors'] = [Author.parse_author(a) for a in meta['authors']] + + lpad = LaunchPad.auto_load() + snl_collections = [lpad.db.snls] + if add_snlcolls is not None: + for snl_db_config in yaml.load_all(open(add_snlcolls, 'r')): + snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) + snl_db = snl_db_conn[snl_db_config['db']] + snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) + snl_collections.append(snl_db[snl_db_config['collection']]) + for snl_coll in snl_collections: + print(snl_coll.count(), 'SNLs in', snl_coll.full_name) + + def insert_snls(snls_list): + if snls_list: + print('add', len(snls_list), 'SNLs') + if insert: + result = snl_collections[0].insert_many(snls_list) + print('#SNLs inserted:', len(result.inserted_ids)) + snls_list.clear() + else: + print('no SNLs to insert') snls, index = [], None for idx, istruct in enumerate(input_structures): @@ -906,8 +910,9 @@ def insert_snls(snls_list): @cli.command() @click.argument('base_path', type=click.Path(exists=True)) +@click.option('--add_snlcolls', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to scan') @click.option('--insert/--no-insert', default=False, help='actually execute task insertion') -def parse(base_path, insert): +def parse(base_path, add_snlcolls, insert): """parse VASP output directories in base_path into tasks and tag""" if not insert: print('DRY RUN: add --insert flag to actually insert tasks') @@ -952,6 +957,7 @@ def get_vasp_dirs(): else: yield root + input_structures = [] for vaspdir in get_vasp_dirs(): subdir = get_subdir(vaspdir) if subdir not in already_inserted_subdirs: @@ -963,3 +969,9 @@ def get_vasp_dirs(): continue if insert and task_doc['state'] == 'successful': target.insert_task(task_doc, use_gridfs=True) + s = Structure.from_dict(task_doc['input']['structure']) + input_structures.append(s) + + print('add SNLs for', len(input_structures), 'structures') + add_snls(tag, input_structures, add_snlcolls, insert) + From 3bb9b5721072cdc5c639880656d44791a6139381 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 18 Sep 2018 16:41:12 -0700 Subject: [PATCH 36/97] cli.copy: only if SNLs available --- emmet/scripts/emmet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index efe5999860..271e1d6cd5 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -133,7 +133,7 @@ def insert_snls(snls_list): # also only need to check about.projects; add_snls adds tag to about.projects and not remarks snls = lpad.db.snls.find({'about.projects': t}) nr_snls = snls.count() - if nr_snls < target.db.snls.count({'about.projects': t}): + if nr_snls and nr_snls < target.db.snls.count({'about.projects': t}): snls_to_copy, index, prefix = [], None, 'snl' for idx, doc in enumerate(snls): snl = StructureNL.from_dict(doc) @@ -164,7 +164,7 @@ def insert_snls(snls_list): if idx and not idx%100 or idx == nr_snls-1: insert_snls(snls_to_copy) else: - print('SNLs already copied.') + print('SNLs not available or already copied.') # skip tasks with task_id existing in target and with matching dir_name (have to be a string [mp-*, mvc-*]) nr_source_mp_tasks, skip_task_ids = 0, [] From f5f762260096c62c5971698e1ed060ebf83b11fe Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 18 Sep 2018 19:23:49 -0700 Subject: [PATCH 37/97] cli: 
minor subcommand renames --- emmet/scripts/emmet.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 271e1d6cd5..2fd514c4e9 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -91,8 +91,8 @@ def get_subdir(dn): @click.argument('target_db_file', type=click.Path(exists=True)) @click.option('--tag', default=None, help='only insert tasks with specific tag') @click.option('--insert/--no-insert', default=False, help='actually execute task addition') -def copy_tasks(target_db_file, tag, insert): - """Retrieve tasks from source and copy to target task collection""" +def copy(target_db_file, tag, insert): + """Retrieve tasks from source and copy to target task collection (incl. SNLs if available)""" if not insert: print('DRY RUN: add --insert flag to actually add tasks to production') @@ -250,7 +250,7 @@ def insert_snls(snls_list): @click.option('--clear-logs/--no-clear-logs', default=False, help='clear MongoDB logs collection for specific tag') @click.option('--max-structures', '-m', default=1000, help='set max structures for tags to scan') @click.option('--skip-all-scanned/--no-skip-all-scanned', default=False, help='skip all already scanned structures incl. WFs2Add/Errors') -def add_wflows(add_snlcolls, add_tasks_db, tag, insert, clear_logs, max_structures, skip_all_scanned): +def wflows(add_snlcolls, add_tasks_db, tag, insert, clear_logs, max_structures, skip_all_scanned): """add workflows based on tags in SNL collection""" if not insert: @@ -637,7 +637,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): print(len(canonical_structures_list), 'canonical structure(s) for', formula, sites_elements) if tag is not None: print('trying again ...') - add_wflows(add_snlcolls, add_tasks_db, tag, insert, clear_logs, max_structures, True) + wflows(add_snlcolls, add_tasks_db, tag, insert, clear_logs, max_structures, True) print(counter) From b2408b2ada5ad61707fef8f16104f6a84f1d7b98 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 19 Sep 2018 15:15:48 -0700 Subject: [PATCH 38/97] cli: copy/make_snls flags, untar launchers --- emmet/scripts/emmet.py | 178 +++++++++++++++++++++++------------------ 1 file changed, 102 insertions(+), 76 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 2fd514c4e9..23bfc55d52 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,4 +1,4 @@ -import click, os, yaml, sys, logging, tarfile, bson, gzip, csv +import click, os, yaml, sys, logging, tarfile, bson, gzip, csv, tarfile from fnmatch import fnmatch from datetime import datetime from collections import Counter, OrderedDict @@ -91,7 +91,8 @@ def get_subdir(dn): @click.argument('target_db_file', type=click.Path(exists=True)) @click.option('--tag', default=None, help='only insert tasks with specific tag') @click.option('--insert/--no-insert', default=False, help='actually execute task addition') -def copy(target_db_file, tag, insert): +@click.option('--copy-snls/--no-copy-snls', default=False, help='also copy SNLs') +def copy(target_db_file, tag, insert, copy_snls): """Retrieve tasks from source and copy to target task collection (incl. 
SNLs if available)""" if not insert: @@ -131,40 +132,42 @@ def insert_snls(snls_list): # get list of SNLs to copy over # only need to check tagged SNLs in source and target; dup-check across SNL collections already done in add_snls # also only need to check about.projects; add_snls adds tag to about.projects and not remarks - snls = lpad.db.snls.find({'about.projects': t}) - nr_snls = snls.count() - if nr_snls and nr_snls < target.db.snls.count({'about.projects': t}): - snls_to_copy, index, prefix = [], None, 'snl' - for idx, doc in enumerate(snls): - snl = StructureNL.from_dict(doc) - formula = snl.structure.composition.reduced_formula - snl_copied = False - try: - q = {'about.projects': t, '$or': [{k: formula} for k in aggregation_keys]} - group = aggregate_by_formula(target.db.snls, q).next() # only one formula - for dct in group['structures']: - existing_structure = Structure.from_dict(dct) - if structures_match(snl.structure, existing_structure): - snl_copied = True - print('SNL', doc['snl_id'], 'already added.') - break - except StopIteration: - pass - if snl_copied: - continue - snl_dct = snl.as_dict() - if index is None: - index = max([int(snl_id[len(prefix)+1:]) for snl_id in target.db.snls.distinct('snl_id')]) + 1 - else: - index += 1 - snl_id = '{}-{}'.format(prefix, index) - snl_dct['snl_id'] = snl_id - snl_dct.update(get_meta_from_structure(snl.structure)) - snls_to_copy.append(snl_dct) - if idx and not idx%100 or idx == nr_snls-1: - insert_snls(snls_to_copy) - else: - print('SNLs not available or already copied.') + # TODO only need to copy if author not Materials Project!? + if copy_snls: + snls = lpad.db.snls.find({'about.projects': t}) + nr_snls = snls.count() + if nr_snls: + snls_to_copy, index, prefix = [], None, 'snl' + for idx, doc in enumerate(snls): + snl = StructureNL.from_dict(doc) + formula = snl.structure.composition.reduced_formula + snl_copied = False + try: + q = {'about.projects': t, '$or': [{k: formula} for k in aggregation_keys]} + group = aggregate_by_formula(target.db.snls, q).next() # only one formula + for dct in group['structures']: + existing_structure = Structure.from_dict(dct) + if structures_match(snl.structure, existing_structure): + snl_copied = True + print('SNL', doc['snl_id'], 'already added.') + break + except StopIteration: + pass + if snl_copied: + continue + snl_dct = snl.as_dict() + if index is None: + index = max([int(snl_id[len(prefix)+1:]) for snl_id in target.db.snls.distinct('snl_id')]) + 1 + else: + index += 1 + snl_id = '{}-{}'.format(prefix, index) + snl_dct['snl_id'] = snl_id + snl_dct.update(get_meta_from_structure(snl.structure)) + snls_to_copy.append(snl_dct) + if idx and not idx%100 or idx == nr_snls-1: + insert_snls(snls_to_copy) + else: + print('No SNLs available for', t) # skip tasks with task_id existing in target and with matching dir_name (have to be a string [mp-*, mvc-*]) nr_source_mp_tasks, skip_task_ids = 0, [] @@ -912,7 +915,8 @@ def insert_snls(snls_list): @click.argument('base_path', type=click.Path(exists=True)) @click.option('--add_snlcolls', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to scan') @click.option('--insert/--no-insert', default=False, help='actually execute task insertion') -def parse(base_path, add_snlcolls, insert): +@click.option('--make-snls/--no-make-snls', default=False, help='also create SNLs for parsed tasks') +def parse(base_path, add_snlcolls, insert, make_snls): """parse VASP output directories in base_path into 
tasks and tag""" if not insert: print('DRY RUN: add --insert flag to actually insert tasks') @@ -930,48 +934,70 @@ def get_timestamp_dir(prefix='launcher'): time_now = datetime.utcnow().strftime(FW_BLOCK_FORMAT) return '_'.join([prefix, time_now]) - def get_vasp_dirs(): - for root, dirs, files in os.walk(base_path): + def get_symlinked_path(root): + root_split = os.path.realpath(root).split(os.sep) + idx = len(base_path_split) + if not root_split[idx-1].startswith('block_'): + rootdir = os.sep.join(root_split[:idx]) + block = get_timestamp_dir(prefix='block') + block_dir = os.sep.join(root_split[:idx-1] + [block]) + if insert: + os.rename(rootdir, block_dir) + os.symlink(block_dir, rootdir) + print(rootdir, '->', block_dir) + subdir = os.sep.join(root_split) + if not root_split[-1].startswith('launcher_'): + launch = get_timestamp_dir() + launch_dir = os.path.join(os.path.realpath(os.sep.join(root_split[:-1])), launch) + if insert: + os.rename(subdir, launch_dir) + os.symlink(launch_dir, subdir) + print(subdir, '->', launch_dir) + return launch_dir + else: + return os.path.realpath(subdir) + + def contains_vasp_dirs(list_of_files): + for f in list_of_files: + if f.startswith("INCAR"): + return True + + def get_vasp_dirs(scan_path): + # NOTE os.walk followlinks=False by default, as intended here + for root, dirs, files in os.walk(scan_path): # TODO ignore relax1/2 subdirs if INCAR.orig found - if any(f.startswith("INCAR") for f in files): - if insert: - root_split = os.path.realpath(root).split(os.sep) - idx = len(base_path_split) - if not root_split[idx-1].startswith('block_'): - rootdir = os.sep.join(root_split[:idx]) - block = get_timestamp_dir(prefix='block') - block_dir = os.sep.join(root_split[:idx-1] + [block]) - os.rename(rootdir, block_dir) - os.symlink(block_dir, rootdir) - print(rootdir, '->', block_dir) - subdir = os.sep.join(root_split) - if not root_split[-1].startswith('launcher_'): - launch = get_timestamp_dir() - launch_dir = os.sep.join(root_split[:-1] + [launch]) - os.rename(subdir, launch_dir) - os.symlink(launch_dir, subdir) - print(subdir, '->', launch_dir) - yield launch_dir - else: - yield subdir - else: - yield root + if contains_vasp_dirs(files): + yield get_symlinked_path(root) + else: + for f in files: + if f.endswith('.tar.gz'): + cwd = os.path.realpath(root) + path = os.path.join(cwd, f) + with tarfile.open(path, 'r:gz') as tf: + tf.extractall(cwd) + os.remove(path) + for vaspdir in get_vasp_dirs(path.replace('.tar.gz', '')): + yield vaspdir + input_structures = [] - for vaspdir in get_vasp_dirs(): + for vaspdir in get_vasp_dirs(base_path): subdir = get_subdir(vaspdir) if subdir not in already_inserted_subdirs: - print(vaspdir) - try: - task_doc = drone.assimilate(vaspdir) - except Exception as ex: - print(str(ex)) - continue - if insert and task_doc['state'] == 'successful': - target.insert_task(task_doc, use_gridfs=True) - s = Structure.from_dict(task_doc['input']['structure']) - input_structures.append(s) - - print('add SNLs for', len(input_structures), 'structures') - add_snls(tag, input_structures, add_snlcolls, insert) + print('vaspdir:', vaspdir) + if insert: + try: + task_doc = drone.assimilate(vaspdir) + except Exception as ex: + print(str(ex)) + continue + if task_doc['state'] == 'successful': + target.insert_task(task_doc, use_gridfs=True) + if make_snls: + s = Structure.from_dict(task_doc['input']['structure']) + input_structures.append(s) + + if insert and make_snls: + print('add SNLs for', len(input_structures), 'structures') + add_snls(tag, 
input_structures, add_snlcolls, insert) From 622db525293378d135c57c0fc5b61e8c6f5034b5 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 21 Sep 2018 12:42:19 -0700 Subject: [PATCH 39/97] cli: copy orig inputs if necessary --- emmet/scripts/emmet.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 23bfc55d52..d8101b718c 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,4 +1,6 @@ import click, os, yaml, sys, logging, tarfile, bson, gzip, csv, tarfile +from shutil import copyfile +from glob import glob from fnmatch import fnmatch from datetime import datetime from collections import Counter, OrderedDict @@ -980,6 +982,7 @@ def get_vasp_dirs(scan_path): yield vaspdir + inputs = ['INCAR', 'KPOINTS', 'POTCAR', 'POSCAR'] input_structures = [] for vaspdir in get_vasp_dirs(base_path): subdir = get_subdir(vaspdir) @@ -987,6 +990,12 @@ def get_vasp_dirs(scan_path): print('vaspdir:', vaspdir) if insert: try: + for inp in inputs: + input_path = os.path.join(vaspdir, inp) + orig_path = input_path + '.orig' + if not glob(orig_path+'*'): + copyfile(input_path, orig_path) + print('cp', input_path, '->', orig_path) task_doc = drone.assimilate(vaspdir) except Exception as ex: print(str(ex)) From e9a5307f9afad343240584ba9ed5be538cf6410e Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 24 Sep 2018 14:00:55 -0700 Subject: [PATCH 40/97] cli.parse: remove empty dirs --- emmet/scripts/emmet.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index d8101b718c..34225bac06 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,5 +1,5 @@ import click, os, yaml, sys, logging, tarfile, bson, gzip, csv, tarfile -from shutil import copyfile +from shutil import copyfile, rmtree from glob import glob from fnmatch import fnmatch from datetime import datetime @@ -998,7 +998,11 @@ def get_vasp_dirs(scan_path): print('cp', input_path, '->', orig_path) task_doc = drone.assimilate(vaspdir) except Exception as ex: - print(str(ex)) + err = str(ex) + print(err) + if err == 'No VASP files found!': + rmtree(vaspdir) + print('removed', vaspdir) continue if task_doc['state'] == 'successful': target.insert_task(task_doc, use_gridfs=True) From 94f18f704d00d2e0c7b52a44d938c084da775509 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 24 Sep 2018 14:01:12 -0700 Subject: [PATCH 41/97] cli.parse: deal with DocumentTooLarge --- emmet/scripts/emmet.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 34225bac06..cd27661908 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -7,6 +7,7 @@ from pymongo import MongoClient from pymongo.errors import CursorNotFound from pymongo.collection import ReturnDocument +from pymongo.errors import DocumentTooLarge from pymatgen.analysis.structure_prediction.volume_predictor import DLSVolumePredictor from pymatgen import Structure from pymatgen.alchemy.materials import TransformedStructure @@ -1005,7 +1006,19 @@ def get_vasp_dirs(scan_path): print('removed', vaspdir) continue if task_doc['state'] == 'successful': - target.insert_task(task_doc, use_gridfs=True) + try: + target.insert_task(task_doc, use_gridfs=True) + except DocumentTooLarge as ex: + print(str(ex)) + print('remove normalmode_eigenvecs and retry ...') + task_doc['calcs_reversed'][0]['output'].pop('normalmode_eigenvecs') + try: + target.insert_task(task_doc, 
use_gridfs=True) + except DocumentTooLarge as ex: + print(str(ex)) + print('also remove force_constants and retry ...') + task_doc['calcs_reversed'][0]['output'].pop('force_constants') + target.insert_task(task_doc, use_gridfs=True) if make_snls: s = Structure.from_dict(task_doc['input']['structure']) input_structures.append(s) From c907204c8e869c776ef95f94a299585d791ff1a5 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 24 Sep 2018 16:18:08 -0700 Subject: [PATCH 42/97] cli.parse: minor rearrange --- emmet/scripts/emmet.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index cd27661908..b511e22471 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -990,17 +990,16 @@ def get_vasp_dirs(scan_path): if subdir not in already_inserted_subdirs: print('vaspdir:', vaspdir) if insert: + for inp in inputs: + input_path = os.path.join(vaspdir, inp) + orig_path = input_path + '.orig' + if not glob(orig_path+'*'): + copyfile(input_path, orig_path) + print('cp', input_path, '->', orig_path) try: - for inp in inputs: - input_path = os.path.join(vaspdir, inp) - orig_path = input_path + '.orig' - if not glob(orig_path+'*'): - copyfile(input_path, orig_path) - print('cp', input_path, '->', orig_path) task_doc = drone.assimilate(vaspdir) except Exception as ex: err = str(ex) - print(err) if err == 'No VASP files found!': rmtree(vaspdir) print('removed', vaspdir) From f6dd746bd5df5aff41b31ea9a6321be18b8ea8ff Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 25 Sep 2018 14:30:32 -0700 Subject: [PATCH 43/97] cli.setup: add log4mongo, prettytable --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e5243cefa7..6a05ac67a7 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ zip_safe=False, install_requires=[ 'atomate', 'pymatgen>=2018.4.20','maggma','monty', - 'six', 'pydash', 'tqdm', 'matminer', + 'six', 'pydash', 'tqdm', 'matminer', 'log4mongo', 'prettytable', 'prettyplotlib', 'pybtex', 'Click', 'networkx', 'sumo', ], classifiers=["Programming Language :: Python :: 3", From f3bd2797f9441cb9dfcbaff3b53d971e2fb68d7d Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 25 Sep 2018 14:54:02 -0700 Subject: [PATCH 44/97] cli.parse: ensure trailing slash in base_path --- emmet/scripts/emmet.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index b511e22471..7eaf52e047 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -927,8 +927,10 @@ def parse(base_path, add_snlcolls, insert, make_snls): lpad = LaunchPad.auto_load() target = VaspCalcDb(lpad.host, lpad.port, lpad.name, 'tasks', lpad.username, lpad.password) print('connected to target db with', target.collection.count(), 'tasks') + base_path = os.path.join(base_path, '') base_path_split = base_path.split(os.sep) tag = base_path_split[-1] if base_path_split[-1] else base_path_split[-2] + idx = len(base_path_split) drone = VaspDrone(parse_dos='auto', additional_fields={'tags': [tag]}) already_inserted_subdirs = [get_subdir(dn) for dn in target.collection.find({'tags': tag}).distinct('dir_name')] print(len(already_inserted_subdirs), 'VASP directories already inserted for', tag) @@ -939,7 +941,6 @@ def get_timestamp_dir(prefix='launcher'): def get_symlinked_path(root): root_split = os.path.realpath(root).split(os.sep) - idx = len(base_path_split) if not root_split[idx-1].startswith('block_'): rootdir = 
os.sep.join(root_split[:idx]) block = get_timestamp_dir(prefix='block') From 7df703eb208adfd004cb6e54e1e729e46b18fc2a Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 25 Sep 2018 18:46:56 -0700 Subject: [PATCH 45/97] cli.parse: use multiprocessing pool --- emmet/scripts/emmet.py | 254 +++++++++++++++++++++++++---------------- 1 file changed, 156 insertions(+), 98 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 7eaf52e047..b1dead5fcb 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,9 +1,9 @@ -import click, os, yaml, sys, logging, tarfile, bson, gzip, csv, tarfile +import click, os, yaml, sys, logging, tarfile, bson, gzip, csv, tarfile, itertools, multiprocessing from shutil import copyfile, rmtree from glob import glob from fnmatch import fnmatch from datetime import datetime -from collections import Counter, OrderedDict +from collections import Counter, OrderedDict, deque from pymongo import MongoClient from pymongo.errors import CursorNotFound from pymongo.collection import ReturnDocument @@ -21,6 +21,7 @@ from emmet.vasp.materials import group_structures, get_sg from emmet.vasp.task_tagger import task_type from log4mongo.handlers import MongoHandler, MongoFormatter +from prettytable import PrettyTable if 'FW_CONFIG_FILE' not in os.environ: print('Please set FW_CONFIG_FILE!') @@ -61,6 +62,124 @@ def get_meta_from_structure(struct): d['is_valid'] = struct.is_valid() return d +# a utility function to get us a slice of an iterator, as an iterator +# when working with iterators maximum lazyness is preferred +def iterator_slice(iterator, length): + iterator = iter(iterator) + while True: + res = tuple(itertools.islice(iterator, length)) + if not res: + break + yield res + +def get_subdir(dn): + return dn.rsplit(os.sep, 1)[-1] + +def get_timestamp_dir(prefix='launcher'): + time_now = datetime.utcnow().strftime(FW_BLOCK_FORMAT) + return '_'.join([prefix, time_now]) + +def contains_vasp_dirs(list_of_files): + for f in list_of_files: + if f.startswith("INCAR"): + return True + +def get_symlinked_path(root, base_path_index): + root_split = os.path.realpath(root).split(os.sep) + if not root_split[base_path_index-1].startswith('block_'): + rootdir = os.sep.join(root_split[:base_path_index]) + block = get_timestamp_dir(prefix='block') + block_dir = os.sep.join(root_split[:base_path_index-1] + [block]) + if insert: + os.rename(rootdir, block_dir) + os.symlink(block_dir, rootdir) + print(rootdir, '->', block_dir) + subdir = os.sep.join(root_split) + if not root_split[-1].startswith('launcher_'): + launch = get_timestamp_dir() + launch_dir = os.path.join(os.path.realpath(os.sep.join(root_split[:-1])), launch) + if insert: + os.rename(subdir, launch_dir) + os.symlink(launch_dir, subdir) + print(subdir, '->', launch_dir) + return launch_dir + else: + return os.path.realpath(subdir) + +def get_vasp_dirs(scan_path, base_path, max_dirs): + base_path_split = base_path.split(os.sep) + base_path_index = len(base_path_split) + # NOTE os.walk followlinks=False by default, as intended here + counter = 0 + for root, dirs, files in os.walk(scan_path): + # TODO ignore relax1/2 subdirs if INCAR.orig found + if contains_vasp_dirs(files): + yield get_symlinked_path(root, base_path_index) + counter += 1 + if counter >= max_dirs: + break + else: + for f in files: + if f.endswith('.tar.gz'): + cwd = os.path.realpath(root) + path = os.path.join(cwd, f) + with tarfile.open(path, 'r:gz') as tf: + tf.extractall(cwd) + os.remove(path) + for vaspdir in 
get_vasp_dirs(path.replace('.tar.gz', ''), base_path, max_dirs): + yield vaspdir + counter += 1 + if counter >= max_dirs: + break + +def parse_vasp_dirs(vaspdirs, insert, drone, already_inserted_subdirs): + name = multiprocessing.current_process().name + print(name, 'starting') + lpad = LaunchPad.auto_load() + target = VaspCalcDb(lpad.host, lpad.port, lpad.name, 'tasks', lpad.username, lpad.password) + print(name, 'connected to target db with', target.collection.count(), 'tasks') + for vaspdir in vaspdirs: + if get_subdir(vaspdir) in already_inserted_subdirs: + print(name, vaspdir, 'already parsed') + continue + print(name, 'vaspdir:', vaspdir) + #poscar_path = os.path.join(vaspdir, 'POSCAR.relax2.gz') + #s = Structure.from_file(poscar_path) + #nelements = len(s.composition.elements) + #if nelements > 1: + # print(name, ' -> SKIP (#elements > 1)') + # continue + if insert: + for inp in ['INCAR', 'KPOINTS', 'POTCAR', 'POSCAR']: + input_path = os.path.join(vaspdir, inp) + orig_path = input_path + '.orig' + if not glob(orig_path+'*'): + copyfile(input_path, orig_path) + print(name, 'cp', input_path, '->', orig_path) + try: + task_doc = drone.assimilate(vaspdir) + except Exception as ex: + err = str(ex) + if err == 'No VASP files found!': + rmtree(vaspdir) + print(name, 'removed', vaspdir) + continue + if task_doc['state'] == 'successful': + try: + target.insert_task(task_doc, use_gridfs=True) + except DocumentTooLarge as ex: + print(name, 'remove normalmode_eigenvecs and retry ...') + task_doc['calcs_reversed'][0]['output'].pop('normalmode_eigenvecs') + try: + target.insert_task(task_doc, use_gridfs=True) + except DocumentTooLarge as ex: + print(name, 'also remove force_constants and retry ...') + task_doc['calcs_reversed'][0]['output'].pop('force_constants') + target.insert_task(task_doc, use_gridfs=True) + nr_vaspdirs = len(vaspdirs) + print(name, 'processed', nr_vaspdirs, 'VASP directories') + return nr_vaspdirs + @click.group() def cli(): pass @@ -87,9 +206,6 @@ def ensure_meta(snls_db): ensure_indexes(['snl_id', 'formula_pretty', 'nelements', 'nsites', 'is_ordered', 'is_valid'], [snl_coll]) -def get_subdir(dn): - return dn.rsplit(os.sep, 1)[-1] - @cli.command() @click.argument('target_db_file', type=click.Path(exists=True)) @click.option('--tag', default=None, help='only insert tasks with specific tag') @@ -691,7 +807,6 @@ def report(tag, in_progress, to_csv): tags = [t[0] for t in sorted(all_tags, key=lambda x: x[1], reverse=True)] print(len(tags), 'tags in WFs and logs collections') - from prettytable import PrettyTable table = PrettyTable() table.field_names = ['Tag', 'SNLs', 'WFs2Add', 'WFs'] + states + ['% FIZZLED', 'Progress'] sums = ['total'] + [0] * (len(table.field_names)-1) @@ -919,7 +1034,9 @@ def insert_snls(snls_list): @click.option('--add_snlcolls', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to scan') @click.option('--insert/--no-insert', default=False, help='actually execute task insertion') @click.option('--make-snls/--no-make-snls', default=False, help='also create SNLs for parsed tasks') -def parse(base_path, add_snlcolls, insert, make_snls): +@click.option('--nproc', '-n', type=int, default=1, help='number of processes for parallel parsing') +@click.option('--max-dirs', '-m', type=int, default=10, help='maximum number of directories to parse') +def parse(base_path, add_snlcolls, insert, make_snls, nproc, max_dirs): """parse VASP output directories in base_path into tasks and tag""" if not 
insert: print('DRY RUN: add --insert flag to actually insert tasks') @@ -930,100 +1047,41 @@ def parse(base_path, add_snlcolls, insert, make_snls): base_path = os.path.join(base_path, '') base_path_split = base_path.split(os.sep) tag = base_path_split[-1] if base_path_split[-1] else base_path_split[-2] - idx = len(base_path_split) drone = VaspDrone(parse_dos='auto', additional_fields={'tags': [tag]}) already_inserted_subdirs = [get_subdir(dn) for dn in target.collection.find({'tags': tag}).distinct('dir_name')] print(len(already_inserted_subdirs), 'VASP directories already inserted for', tag) - def get_timestamp_dir(prefix='launcher'): - time_now = datetime.utcnow().strftime(FW_BLOCK_FORMAT) - return '_'.join([prefix, time_now]) - - def get_symlinked_path(root): - root_split = os.path.realpath(root).split(os.sep) - if not root_split[idx-1].startswith('block_'): - rootdir = os.sep.join(root_split[:idx]) - block = get_timestamp_dir(prefix='block') - block_dir = os.sep.join(root_split[:idx-1] + [block]) - if insert: - os.rename(rootdir, block_dir) - os.symlink(block_dir, rootdir) - print(rootdir, '->', block_dir) - subdir = os.sep.join(root_split) - if not root_split[-1].startswith('launcher_'): - launch = get_timestamp_dir() - launch_dir = os.path.join(os.path.realpath(os.sep.join(root_split[:-1])), launch) - if insert: - os.rename(subdir, launch_dir) - os.symlink(launch_dir, subdir) - print(subdir, '->', launch_dir) - return launch_dir - else: - return os.path.realpath(subdir) - - def contains_vasp_dirs(list_of_files): - for f in list_of_files: - if f.startswith("INCAR"): - return True - - def get_vasp_dirs(scan_path): - # NOTE os.walk followlinks=False by default, as intended here - for root, dirs, files in os.walk(scan_path): - # TODO ignore relax1/2 subdirs if INCAR.orig found - if contains_vasp_dirs(files): - yield get_symlinked_path(root) + chunk_size = 100 + if nproc > 1 and max_dirs <= chunk_size: + nproc = 1 + print('max_dirs =', max_dirs, 'but chunk size =', chunk_size, '-> parsing sequentially') + pool = multiprocessing.Pool(processes=nproc) + iterator_vaspdirs = get_vasp_dirs(base_path, base_path, max_dirs) + iterator = iterator_slice(iterator_vaspdirs, chunk_size) # process in chunks + queue = deque() + total_nr_vaspdirs_parsed = 0 + while iterator or queue: + try: + args = [next(iterator), insert, drone, already_inserted_subdirs] + queue.append(pool.apply_async(parse_vasp_dirs, args)) + except (StopIteration, TypeError): + iterator = None + while queue and (len(queue) >= pool._processes or not iterator): + process = queue.pop() + process.wait(1) + if not process.ready(): + queue.append(process) else: - for f in files: - if f.endswith('.tar.gz'): - cwd = os.path.realpath(root) - path = os.path.join(cwd, f) - with tarfile.open(path, 'r:gz') as tf: - tf.extractall(cwd) - os.remove(path) - for vaspdir in get_vasp_dirs(path.replace('.tar.gz', '')): - yield vaspdir - - - inputs = ['INCAR', 'KPOINTS', 'POTCAR', 'POSCAR'] - input_structures = [] - for vaspdir in get_vasp_dirs(base_path): - subdir = get_subdir(vaspdir) - if subdir not in already_inserted_subdirs: - print('vaspdir:', vaspdir) - if insert: - for inp in inputs: - input_path = os.path.join(vaspdir, inp) - orig_path = input_path + '.orig' - if not glob(orig_path+'*'): - copyfile(input_path, orig_path) - print('cp', input_path, '->', orig_path) - try: - task_doc = drone.assimilate(vaspdir) - except Exception as ex: - err = str(ex) - if err == 'No VASP files found!': - rmtree(vaspdir) - print('removed', vaspdir) - continue - 
if task_doc['state'] == 'successful': - try: - target.insert_task(task_doc, use_gridfs=True) - except DocumentTooLarge as ex: - print(str(ex)) - print('remove normalmode_eigenvecs and retry ...') - task_doc['calcs_reversed'][0]['output'].pop('normalmode_eigenvecs') - try: - target.insert_task(task_doc, use_gridfs=True) - except DocumentTooLarge as ex: - print(str(ex)) - print('also remove force_constants and retry ...') - task_doc['calcs_reversed'][0]['output'].pop('force_constants') - target.insert_task(task_doc, use_gridfs=True) - if make_snls: - s = Structure.from_dict(task_doc['input']['structure']) - input_structures.append(s) - - if insert and make_snls: - print('add SNLs for', len(input_structures), 'structures') - add_snls(tag, input_structures, add_snlcolls, insert) + total_nr_vaspdirs_parsed += process.get() + pool.close() + print('DONE:', total_nr_vaspdirs_parsed, 'parsed') + + #input_structures = [] + # if make_snls: + # s = Structure.from_dict(task_doc['input']['structure']) + # input_structures.append(s) + + #if insert and make_snls: + # print('add SNLs for', len(input_structures), 'structures') + # add_snls(tag, input_structures, add_snlcolls, insert) From 8d6cff99028d81df571b7f89b026135abc3178f6 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 7 Dec 2018 16:33:12 +0100 Subject: [PATCH 46/97] Initial commit --- .gitignore | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++++ LICENSE | 21 +++++++++++ README.md | 2 ++ 3 files changed, 127 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..894a44cc06 --- /dev/null +++ b/.gitignore @@ -0,0 +1,104 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000..8c40a5f90a --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Materials Project + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000000..9045646106 --- /dev/null +++ b/README.md @@ -0,0 +1,2 @@ +# mp-nomad +Disseminate raw MP calculations through NoMaD From 601b520880941dbb20886dbd8018bb4e9325ce7a Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 7 Dec 2018 16:35:45 +0100 Subject: [PATCH 47/97] first steps with google drive api --- .gitignore | 4 ++ retrieve_mpraw_data.py | 113 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 117 insertions(+) create mode 100644 retrieve_mpraw_data.py diff --git a/.gitignore b/.gitignore index 894a44cc06..0fe7e91a59 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,7 @@ venv.bak/ # mypy .mypy_cache/ + +token.json +credentials.json +mpraw/* diff --git a/retrieve_mpraw_data.py b/retrieve_mpraw_data.py new file mode 100644 index 0000000000..b38f721026 --- /dev/null +++ b/retrieve_mpraw_data.py @@ -0,0 +1,113 @@ +from __future__ import print_function +import io, os +from googleapiclient.discovery import build +from httplib2 import Http +from oauth2client import file, client, tools +from googleapiclient.http import MediaIoBaseDownload +from pprint import pprint + +# If modifying these scopes, delete the file token.json. 
+# see https://developers.google.com/identity/protocols/googlescopes#drivev3 +SCOPES = 'https://www.googleapis.com/auth/drive' +OUTDIR = 'mpraw' +CHUNKSIZE = 5*1024*1024 # 5MB + +def download_file(service, file_id): + request = service.files().get_media(fileId=file_id) + fh = io.BytesIO() + downloader = MediaIoBaseDownload(fh, request, chunksize=CHUNKSIZE) + done = False + while done is False: + status, done = downloader.next_chunk() + print("Download {:d}%.".format(int(status.progress() * 100))) + return fh.getvalue() + +def main(): + """Shows basic usage of the Drive v3 API. + Prints the names and ids of the first 10 files the user has access to. + """ + # The file token.json stores the user's access and refresh tokens, and is + # created automatically when the authorization flow completes for the first + # time. + store = file.Storage('token.json') + creds = store.get() + if not creds or creds.invalid: + flow = client.flow_from_clientsecrets('credentials.json', SCOPES) + creds = tools.run_flow(flow, store) + service = build('drive', 'v3', http=creds.authorize(Http())) + + # Call the Drive v3 API + # https://developers.google.com/drive/api/v3/search-parameters#fn1 + # TODO older launcher directories don't have prefix + # TODO also cover non-b/l hierarchy + block_page_token = None + garden_id = os.environ.get('MPDRIVE_GARDEN_ID') + if garden_id: + block_query = "'{}' in parents and name contains 'block_'".format(garden_id) + else: + print('MPDRIVE_GARDEN_ID not set!') + return + + while True: + block_response = service.files().list( + q=block_query, spaces='drive', pageToken=block_page_token, + fields='nextPageToken, files(id, name)', pageSize=2 + ).execute() + + for block in block_response['files']: + print(block['name']) + block_dir = os.path.join(OUTDIR, block['name']) + if not os.path.exists(block_dir): + os.makedirs(block_dir) + + block_page_token = block_response.get('nextPageToken', None) + if block_page_token is None: + break # done with blocks + + # recurse into the block to retrieve launch_dir's + launcher_page_token = None + launcher_query = "'{}' in parents".format(block['id']) + + while True: + launcher_response = service.files().list( + q=launcher_query, spaces='drive', pageToken=launcher_page_token, + fields='nextPageToken, files(id, name, modifiedTime, size)', + pageSize=10 + ).execute() + + for launcher in launcher_response['files']: + # TODO 'size' doesn't exist if launcher is another dir + # due to non-reservation mode production + if int(launcher['size']) < 50: + service.files().delete(fileId=launcher['id']).execute() + print('removed', launcher['name']) + else: + # download (incl. block) + #pprint(launcher) + path = os.path.join(block_dir, launcher['name']) + print(path) + if not os.path.exists(path): + content = download_file(service, launcher['id']) + with open(path, 'wb') as f: + f.write(content) + print(path, 'downloaded.') + + launcher_page_token = launcher_response.get('nextPageToken', None) + if launcher_page_token is None: + break # done with launchers in current block + + # search for launchers in block again, and rm block if empty dir + launcher_response = service.files().list( + q=launcher_query, spaces='drive', pageSize=1 + ).execute() + if not launcher_response['files']: + service.files().delete(fileId=block['id']).execute() + print('removed', block['name']) + + break # blocks loop TODO remove + + # TODO in production, subscribe to watch garden directory? 
+ # https://developers.google.com/drive/api/v3/reference/files/watch + +if __name__ == '__main__': + main() From ec4f55adad78e6f5f6b013dd9287c89b072d76cf Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 7 Dec 2018 16:36:14 +0100 Subject: [PATCH 48/97] add reqs --- requirements.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000..7c69c3704e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,12 @@ +cachetools==3.0.0 +certifi==2018.10.15 +google-api-python-client==1.7.5 +google-auth==1.6.1 +google-auth-httplib2==0.0.3 +httplib2==0.12.0 +oauth2client==4.1.3 +pyasn1==0.4.4 +pyasn1-modules==0.2.2 +rsa==4.0 +six==1.11.0 +uritemplate==3.0.0 From 0f3101a39d192b0b9610691a874eae9a0bdc6bc2 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 7 Dec 2018 16:41:52 +0100 Subject: [PATCH 49/97] use tqdm for download progress --- retrieve_mpraw_data.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/retrieve_mpraw_data.py b/retrieve_mpraw_data.py index b38f721026..6a2a3423b5 100644 --- a/retrieve_mpraw_data.py +++ b/retrieve_mpraw_data.py @@ -5,21 +5,23 @@ from oauth2client import file, client, tools from googleapiclient.http import MediaIoBaseDownload from pprint import pprint +from tqdm import tqdm # If modifying these scopes, delete the file token.json. # see https://developers.google.com/identity/protocols/googlescopes#drivev3 SCOPES = 'https://www.googleapis.com/auth/drive' OUTDIR = 'mpraw' -CHUNKSIZE = 5*1024*1024 # 5MB +CHUNKSIZE = 1024*1024 # 5MB def download_file(service, file_id): request = service.files().get_media(fileId=file_id) fh = io.BytesIO() downloader = MediaIoBaseDownload(fh, request, chunksize=CHUNKSIZE) done = False - while done is False: - status, done = downloader.next_chunk() - print("Download {:d}%.".format(int(status.progress() * 100))) + with tqdm(total=100) as pbar: + while done is False: + status, done = downloader.next_chunk() + pbar.update(int(status.progress() * 100)) return fh.getvalue() def main(): From 0a10393d01ca4f42d862d0767325bdd3fac7b0d9 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 7 Dec 2018 16:46:45 +0100 Subject: [PATCH 50/97] update reqs --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 7c69c3704e..9e850ed138 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,5 @@ pyasn1==0.4.4 pyasn1-modules==0.2.2 rsa==4.0 six==1.11.0 +tqdm==4.28.1 uritemplate==3.0.0 From c560068b868e2e72bb2ed17772010ea54e607114 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 10 Dec 2018 07:56:13 -0800 Subject: [PATCH 51/97] cli: correct chunk_size --- emmet/scripts/emmet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index b1dead5fcb..240aa4b969 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,4 +1,4 @@ -import click, os, yaml, sys, logging, tarfile, bson, gzip, csv, tarfile, itertools, multiprocessing +import click, os, yaml, sys, logging, tarfile, bson, gzip, csv, tarfile, itertools, multiprocessing, math from shutil import copyfile, rmtree from glob import glob from fnmatch import fnmatch @@ -1051,7 +1051,7 @@ def parse(base_path, add_snlcolls, insert, make_snls, nproc, max_dirs): already_inserted_subdirs = [get_subdir(dn) for dn in target.collection.find({'tags': tag}).distinct('dir_name')] print(len(already_inserted_subdirs), 'VASP directories 
already inserted for', tag) - chunk_size = 100 + chunk_size = math.ceil(max_dirs/nproc) if nproc > 1 and max_dirs <= chunk_size: nproc = 1 print('max_dirs =', max_dirs, 'but chunk size =', chunk_size, '-> parsing sequentially') From 34a917d9cd002884d883d5d553ab98f2467ab364 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 10 Dec 2018 07:56:57 -0800 Subject: [PATCH 52/97] cli.copy: better task-id --- emmet/scripts/emmet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 240aa4b969..e6a08af74d 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -339,8 +339,8 @@ def insert_snls(snls_list): if isinstance(task_doc['task_id'], int): if insert: - c = target.db.counter.find_one_and_update({"_id": "taskid"}, {"$inc": {"c": 1}}, return_document=ReturnDocument.AFTER)["c"] - task_doc['task_id'] = 'mp-{}'.format(c) + next_tid = max([int(tid[len('mp')+1:]) for tid in target.collection.distinct('task_id')]) + 1 + task_doc['task_id'] = 'mp-{}'.format(next_tid) else: task = target.collection.find_one({'task_id': task_doc['task_id']}, ['orig_inputs', 'output.structure']) if task: From 982726bb7fbc0ca8ff7a27cccf4b3bb183f21d08 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 18 Dec 2018 04:40:13 -0800 Subject: [PATCH 53/97] cli: start gdrive subcommand --- .gitignore | 6 ++- emmet/scripts/emmet.py | 70 +++++++++++++++++++++++++++++++++ emmet/scripts/launcher_paths.py | 39 ------------------ 3 files changed, 75 insertions(+), 40 deletions(-) delete mode 100644 emmet/scripts/launcher_paths.py diff --git a/.gitignore b/.gitignore index ffc341b871..9826898353 100644 --- a/.gitignore +++ b/.gitignore @@ -105,4 +105,8 @@ ENV/ .DS_Store # PyCharm -.idea \ No newline at end of file +.idea + +# GDrive +emmet/scripts/credentials.json +emmet/scripts/token.json diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index e6a08af74d..b866f6da90 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -22,6 +22,9 @@ from emmet.vasp.task_tagger import task_type from log4mongo.handlers import MongoHandler, MongoFormatter from prettytable import PrettyTable +from googleapiclient.discovery import build +from httplib2 import Http +from oauth2client import file, client, tools if 'FW_CONFIG_FILE' not in os.environ: print('Please set FW_CONFIG_FILE!') @@ -33,6 +36,7 @@ task_base_query = {'tags': {'$nin': ['DEPRECATED', 'deprecated']}, '_mpworks_meta': {'$exists': 0}} structure_keys = ['snl_id', 'lattice', 'sites', 'charge', 'about._materialsproject.task_id'] aggregation_keys = ['reduced_cell_formula', 'formula_pretty'] +SCOPES = 'https://www.googleapis.com/auth/drive' def aggregate_by_formula(coll, q, key=None): query = {'$and': [q, exclude]} @@ -1085,3 +1089,69 @@ def parse(base_path, add_snlcolls, insert, make_snls, nproc, max_dirs): # print('add SNLs for', len(input_structures), 'structures') # add_snls(tag, input_structures, add_snlcolls, insert) +@cli.command() +@click.argument('target_db_file', type=click.Path(exists=True)) +def gdrive(target_db_file): + """sync launch directories for target task DB to Google Drive""" + target = VaspCalcDb.from_db_file(target_db_file, admin=True) + print('connected to target db with', target.collection.count(), 'tasks') + print(target.db.materials.count(), 'materials') + + store = file.Storage('token.json') + creds = store.get() + if not creds or creds.invalid: + flow = client.flow_from_clientsecrets('credentials.json', SCOPES) + creds = tools.run_flow(flow, store) + 
service = build('drive', 'v3', http=creds.authorize(Http())) + garden_id = os.environ.get('MPDRIVE_GARDEN_ID') + if not garden_id: + print('MPDRIVE_GARDEN_ID not set!') + return + + query = {} + materials = target.db.materials.find(query, {'task_id': 1, 'blessed_tasks': 1}) + blessed_tasks = dict((doc['task_id'], doc['blessed_tasks']) for doc in materials) + nr_blessed_tasks = sum([len(l) for l in blessed_tasks.values()]) + print(nr_blessed_tasks, 'tasks to sync') + + batch = service.new_batch_http_request() + splits = ['block_', 'aflow_'] + nr_tasks_processed = 0 + for mpid, tasks in blessed_tasks.items(): + for task_type, task_id in tasks.items(): + if task_type == 'GGA Structure Optimization': # TODO remove + if len(batch._order) == 100: + print('execute batch request ...') + batch.execute() + batch = service.new_batch_http_request() + dir_name = target.collection.find_one({'task_id': task_id}, {'dir_name': 1})['dir_name'] + if '_2011-' not in dir_name and '_2012-' not in dir_name: # TODO remove + continue + for s in splits: + ds = dir_name.split(s) + if len(ds) == 2: + block_launcher = s + ds[-1] + print(mpid, task_id, block_launcher) + block, launcher = block_launcher.rsplit(os.sep, 1) + query = "name = '{}.tar.gz'".format(launcher) + response = service.files().list( + q=query, spaces='drive', fields='files(id, name, size)', pageSize=1 + ).execute() + files = response['files'] + if files: + if int(files[0]['size']) < 50: + batch.add(service.files().delete(fileId=files[0]['id'])) + print('TODO: re-upload', files[0]['name']) + else: + print('to upload') + nr_tasks_processed += 1 + break + else: + print(mpid, task_id, ': could not split', dir_name) + return + + if len(batch._order) > 0: + print('execute final batch request ...') + batch.execute() + print(nr_tasks_processed) + diff --git a/emmet/scripts/launcher_paths.py b/emmet/scripts/launcher_paths.py deleted file mode 100644 index fc7625bbf2..0000000000 --- a/emmet/scripts/launcher_paths.py +++ /dev/null @@ -1,39 +0,0 @@ -import json -from atomate.vasp.database import VaspCalcDb - -target_db_file = '../dbfiles/db_atomate.json' -target = VaspCalcDb.from_db_file(target_db_file, admin=True) -print('connected to target db with', target.collection.count(), 'tasks') -print(target.db.materials.count(), 'materials') - -splits = ['block_', 'aflow_'] -mpids = json.load(open('KRao_Li_FullList.txt', 'r')) -print(len(mpids), 'mpids') -query = {'task_id': {'$in': mpids}} - -# {'mp-1002': [{'task_id': ..., 'task_type': ..., 'launcher_path': ...}, ...], ...} -out = {} - -for idx, doc in enumerate(target.db.materials.find(query, {'task_id': 1, 'blessed_tasks': 1})): - mp_id = doc['task_id'] - out[mp_id] = [] - print(idx, mp_id) - for task_type, task_id in doc['blessed_tasks'].items(): - dir_name = target.collection.find_one({'task_id': task_id}, {'dir_name': 1})['dir_name'] - if 'maarten_piezo' in dir_name: - continue - for s in splits: - ds = dir_name.split(s) - if len(ds) == 2: - launcher = s + ds[-1] - print(task_id, task_type, launcher) - out[mp_id].append({'task_id': task_id, 'task_type': task_type, 'launcher_path': launcher}) - break - -with open('launcher_paths.json', 'w') as f: - json.dump(out, f) - -with open('launcher_paths.txt', 'w') as f: - for mp_id, tasks in out.items(): - for task in tasks: - f.write(task['launcher_path']+'\n') From fa65fc8fe68c6d660d13d011940dc39f3ae7d08f Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 18 Dec 2018 04:40:42 -0800 Subject: [PATCH 54/97] cli: parse bugfix --- emmet/scripts/emmet.py | 13 
+++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index b866f6da90..90dcc3cd75 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -88,9 +88,10 @@ def contains_vasp_dirs(list_of_files): if f.startswith("INCAR"): return True -def get_symlinked_path(root, base_path_index): +def get_symlinked_path(root, base_path_index, insert): root_split = os.path.realpath(root).split(os.sep) - if not root_split[base_path_index-1].startswith('block_'): + if base_path_index != len(root_split) and \ + not root_split[base_path_index-1].startswith('block_'): rootdir = os.sep.join(root_split[:base_path_index]) block = get_timestamp_dir(prefix='block') block_dir = os.sep.join(root_split[:base_path_index-1] + [block]) @@ -110,7 +111,7 @@ def get_symlinked_path(root, base_path_index): else: return os.path.realpath(subdir) -def get_vasp_dirs(scan_path, base_path, max_dirs): +def get_vasp_dirs(scan_path, base_path, max_dirs, insert): base_path_split = base_path.split(os.sep) base_path_index = len(base_path_split) # NOTE os.walk followlinks=False by default, as intended here @@ -118,7 +119,7 @@ def get_vasp_dirs(scan_path, base_path, max_dirs): for root, dirs, files in os.walk(scan_path): # TODO ignore relax1/2 subdirs if INCAR.orig found if contains_vasp_dirs(files): - yield get_symlinked_path(root, base_path_index) + yield get_symlinked_path(root, base_path_index, insert) counter += 1 if counter >= max_dirs: break @@ -130,7 +131,7 @@ def get_vasp_dirs(scan_path, base_path, max_dirs): with tarfile.open(path, 'r:gz') as tf: tf.extractall(cwd) os.remove(path) - for vaspdir in get_vasp_dirs(path.replace('.tar.gz', ''), base_path, max_dirs): + for vaspdir in get_vasp_dirs(path.replace('.tar.gz', ''), base_path, max_dirs, insert): yield vaspdir counter += 1 if counter >= max_dirs: @@ -1060,7 +1061,7 @@ def parse(base_path, add_snlcolls, insert, make_snls, nproc, max_dirs): nproc = 1 print('max_dirs =', max_dirs, 'but chunk size =', chunk_size, '-> parsing sequentially') pool = multiprocessing.Pool(processes=nproc) - iterator_vaspdirs = get_vasp_dirs(base_path, base_path, max_dirs) + iterator_vaspdirs = get_vasp_dirs(base_path, base_path, max_dirs, insert) iterator = iterator_slice(iterator_vaspdirs, chunk_size) # process in chunks queue = deque() total_nr_vaspdirs_parsed = 0 From 3dead80747bc1791f4ca4caa694a76e6e3c766a8 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 19 Dec 2018 08:00:46 -0800 Subject: [PATCH 55/97] cli.parse: fix orig copy --- emmet/scripts/emmet.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 90dcc3cd75..8802d22f25 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -157,8 +157,9 @@ def parse_vasp_dirs(vaspdirs, insert, drone, already_inserted_subdirs): if insert: for inp in ['INCAR', 'KPOINTS', 'POTCAR', 'POSCAR']: input_path = os.path.join(vaspdir, inp) - orig_path = input_path + '.orig' - if not glob(orig_path+'*'): + if not glob(input_path+'.orig*'): + input_path = glob(input_path+'*')[0] + orig_path = input_path.replace(inp, inp+'.orig') copyfile(input_path, orig_path) print(name, 'cp', input_path, '->', orig_path) try: From c916d3c2c1e8d5325f3c46e839be8fada83e1b3a Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 19 Dec 2018 08:01:37 -0800 Subject: [PATCH 56/97] cli: progress on gdrive cmd --- emmet/scripts/emmet.py | 128 ++++++++++++++++++++++--------- emmet/scripts/hpss_to_mpdrive.sh | 18 +++-- 2 files 
changed, 102 insertions(+), 44 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 8802d22f25..b10eb3c2d2 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -25,6 +25,7 @@ from googleapiclient.discovery import build from httplib2 import Http from oauth2client import file, client, tools +from googleapiclient.http import MediaFileUpload if 'FW_CONFIG_FILE' not in os.environ: print('Please set FW_CONFIG_FILE!') @@ -1091,6 +1092,17 @@ def parse(base_path, add_snlcolls, insert, make_snls, nproc, max_dirs): # print('add SNLs for', len(input_structures), 'structures') # add_snls(tag, input_structures, add_snlcolls, insert) +def upload_archive(path, name, service, parent=None): + media = MediaFileUpload(path, mimetype='application/gzip', resumable=True) + body = {'name': name, 'parents': [parent]} + request = service.files().create(media_body=media, body=body) + response = None + while response is None: + status, response = request.next_chunk() + if status: + print("Uploaded %d%%." % int(status.progress() * 100)) + print("Upload Complete!") + @cli.command() @click.argument('target_db_file', type=click.Path(exists=True)) def gdrive(target_db_file): @@ -1111,49 +1123,89 @@ def gdrive(target_db_file): return query = {} - materials = target.db.materials.find(query, {'task_id': 1, 'blessed_tasks': 1}) - blessed_tasks = dict((doc['task_id'], doc['blessed_tasks']) for doc in materials) - nr_blessed_tasks = sum([len(l) for l in blessed_tasks.values()]) - print(nr_blessed_tasks, 'tasks to sync') + blessed_task_ids = [ + task_id for doc in target.db.materials.find(query, {'task_id': 1, 'blessed_tasks': 1}) + for task_type, task_id in doc['blessed_tasks'].items() + ] + print(len(blessed_task_ids), 'blessed tasks.') + + dir_names = [] + for task in target.collection.find({'task_id': {'$in': blessed_task_ids}}, {'dir_name': 1}): + dir_name = task['dir_name'] + if '2011-' in dir_name or '2012-' in dir_name: # TODO remove + dir_names.append(dir_name) + dir_names.sort() + print(len(dir_names), 'launcher directories to sync.') - batch = service.new_batch_http_request() splits = ['block_', 'aflow_'] nr_tasks_processed = 0 - for mpid, tasks in blessed_tasks.items(): - for task_type, task_id in tasks.items(): - if task_type == 'GGA Structure Optimization': # TODO remove - if len(batch._order) == 100: - print('execute batch request ...') - batch.execute() - batch = service.new_batch_http_request() - dir_name = target.collection.find_one({'task_id': task_id}, {'dir_name': 1})['dir_name'] - if '_2011-' not in dir_name and '_2012-' not in dir_name: # TODO remove - continue - for s in splits: - ds = dir_name.split(s) - if len(ds) == 2: - block_launcher = s + ds[-1] - print(mpid, task_id, block_launcher) - block, launcher = block_launcher.rsplit(os.sep, 1) - query = "name = '{}.tar.gz'".format(launcher) - response = service.files().list( - q=query, spaces='drive', fields='files(id, name, size)', pageSize=1 - ).execute() - files = response['files'] - if files: - if int(files[0]['size']) < 50: - batch.add(service.files().delete(fileId=files[0]['id'])) - print('TODO: re-upload', files[0]['name']) + prev = None + outfile = open('launcher_paths.txt', 'w') + stage_dir = '/project/projectdirs/matgen/garden/rclone_to_mp_drive' + + for dir_name in dir_names: + + for s in splits: + ds = dir_name.split(s) + if len(ds) == 2: + block_launcher = s + ds[-1] + block_launcher_split = block_launcher.split(os.sep) + #if prev is not None and block_launcher_split[0] != prev \ + # and 
block_launcher_split[0] != 'aflow_engines-mag_special': + # return # TODO remove + + print(block_launcher) + archive_name = '{}.tar.gz'.format(block_launcher_split[-1]) + query = "name = '{}'".format(archive_name) + response = service.files().list( + q=query, spaces='drive', fields='files(id, name, size, parents)', pageSize=1 + ).execute() + files = response['files'] + archive_path = os.path.join(stage_dir, block_launcher + '.tar.gz') + if files: + if int(files[0]['size']) < 50: + service.files().delete(fileId=files[0]['id']) + if os.path.exists(archive_path): + parent = files[0]['parents'][0] + #upload_archive(archive_path, archive_name, service, parent=parent) + #return # TODO remove else: - print('to upload') - nr_tasks_processed += 1 - break + print('TODO: get from HPSS') + outfile.write(block_launcher + '\n') + else: + print('OK:', files[0]) else: - print(mpid, task_id, ': could not split', dir_name) - return + if os.path.exists(archive_path): + # make directories + parents = [garden_id] + for folder in block_launcher_split[:-1]: + query = "name = '{}'".format(folder) + response = service.files().list( + q=query, spaces='drive', fields='files(id, name)', pageSize=1 + ).execute() + if not response['files']: + print('create dir ...', folder) + body = { + 'name': folder, + 'mimeType': "application/vnd.google-apps.folder", + 'parents': [parents[-1]] + } + gdrive_folder = service.files().create(body=body).execute() + parents.append(gdrive_folder['id']) + else: + parents.append(response['files'][0]['id']) + + #upload_archive(archive_path, archive_name, service, parent=parents[-1]) + else: + print('TODO: get from HPSS') + outfile.write(block_launcher + '\n') + nr_tasks_processed += 1 + prev = block_launcher_split[0] + break + else: + print('could not split', dir_name) + return - if len(batch._order) > 0: - print('execute final batch request ...') - batch.execute() print(nr_tasks_processed) + outfile.close() diff --git a/emmet/scripts/hpss_to_mpdrive.sh b/emmet/scripts/hpss_to_mpdrive.sh index 7f6a6da177..2f5e10b7ef 100755 --- a/emmet/scripts/hpss_to_mpdrive.sh +++ b/emmet/scripts/hpss_to_mpdrive.sh @@ -9,18 +9,20 @@ stage_dir="rclone_to_mp_drive" [[ ! -e $hpss_missing ]] && touch $hpss_missing for dir in $dirs; do - [[ ! -e ${dir}.tar.gz ]] && echo "skip ${dir}" && continue # TODO remove + #[[ ! -e ${dir}.tar.gz ]] && echo "skip ${dir}" && continue # TODO remove files=`grep "^$dir" $1` extract="${dir}.extract" grep -q "$dir" $hpss_missing [[ $? -eq 0 ]] && continue - [[ -d $stage_dir/$dir ]] && rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir + #[[ -d $stage_dir/$dir ]] && rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir - missing_paths="${dir}.paths" echo $files | tr ' ' '\n' | sort -u > ${dir}.files rclone lsf -R --files-only mp-drive:calculations/garden/$dir | sed "s:^:$dir/:g" | sed 's:.tar.gz::g' | sort -u > ${dir}.rclone_lsf + + missing_paths="${dir}.paths" + [[ -e $missing_paths ]] && rm -v $missing_paths for f in $(comm --check-order -23 ${dir}.files ${dir}.rclone_lsf); do # launch dirs missing in mp-drive launch_dir_tar="${stage_dir}/${f}.tar.gz" if [[ ! -f $launch_dir_tar || ! -s $launch_dir_tar ]]; then @@ -48,9 +50,11 @@ for dir in $dirs; do if [ ! -e ${dir}.tar_list ] || [ ! -s ${dir}.tar_list ]; then echo "make ${dir}.tar_list ..." tar -tzvf ${dir}.tar.gz | grep ^d | grep -v -e '/relax1/' -e '/relax2/' | awk {'print $6'} 2>&1 | tee ${dir}.tar_list + [[ $? 
-ne 0 ]] && exit fi paths=`cat $missing_paths` + [[ -e $extract ]] && rm -v $extract for f in $paths; do [[ ! -d $f ]] && grep $f ${dir}.tar_list >> $extract done @@ -58,6 +62,7 @@ for dir in $dirs; do if [ -e $extract ] && [ -s $extract ]; then echo "extract" `wc -l $extract` tar -xvzf ${dir}.tar.gz --files-from $extract + [[ $? -ne 0 ]] && rm -v $extract && exit fi rm -v $extract @@ -66,12 +71,13 @@ for dir in $dirs; do echo $launch_dir_tar ... mkdir -p `dirname $launch_dir_tar` tar_code=$(tar -czf $launch_dir_tar -C `dirname $f` `basename $f`) - [[ $tar_code -ne 0 ]] && echo 'problem with launch dir tar!' && break + [[ $tar_code -ne 0 ]] && echo 'problem with launch dir tar!' && exit ls -ltrh $launch_dir_tar - [[ -d $f ]] && rm -r $f + #[[ -d $f ]] && rm -rf $f done rm -v $missing_paths - rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir + #rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir + #rm -v ${dir}.tar.gz done From af61614481c529ee7403b85e171e70befbef3834 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 20 Dec 2018 05:30:26 -0800 Subject: [PATCH 57/97] cli: progress on gdrive sync --- emmet/scripts/emmet.py | 131 +++++++++++++++++++++-------------------- 1 file changed, 67 insertions(+), 64 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index b10eb3c2d2..4d3a6bc1d4 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1129,83 +1129,86 @@ def gdrive(target_db_file): ] print(len(blessed_task_ids), 'blessed tasks.') + splits = ['block_', 'aflow_engines-'] dir_names = [] for task in target.collection.find({'task_id': {'$in': blessed_task_ids}}, {'dir_name': 1}): dir_name = task['dir_name'] - if '2011-' in dir_name or '2012-' in dir_name: # TODO remove - dir_names.append(dir_name) + # aflow_engines-mag_special + if '2011-' in dir_name and 'block_2011-10-07-08-57-17-804213' in dir_name: # TODO remove + for s in splits: + ds = dir_name.split(s) + if len(ds) == 2: + block_launcher = s + ds[-1] + dir_names.append(block_launcher) + break + else: + print('could not split', dir_name) + return + dir_names.sort() print(len(dir_names), 'launcher directories to sync.') - splits = ['block_', 'aflow_'] nr_tasks_processed = 0 prev = None outfile = open('launcher_paths.txt', 'w') stage_dir = '/project/projectdirs/matgen/garden/rclone_to_mp_drive' - for dir_name in dir_names: - - for s in splits: - ds = dir_name.split(s) - if len(ds) == 2: - block_launcher = s + ds[-1] - block_launcher_split = block_launcher.split(os.sep) - #if prev is not None and block_launcher_split[0] != prev \ - # and block_launcher_split[0] != 'aflow_engines-mag_special': - # return # TODO remove - - print(block_launcher) - archive_name = '{}.tar.gz'.format(block_launcher_split[-1]) - query = "name = '{}'".format(archive_name) - response = service.files().list( - q=query, spaces='drive', fields='files(id, name, size, parents)', pageSize=1 - ).execute() - files = response['files'] - archive_path = os.path.join(stage_dir, block_launcher + '.tar.gz') - if files: - if int(files[0]['size']) < 50: - service.files().delete(fileId=files[0]['id']) - if os.path.exists(archive_path): - parent = files[0]['parents'][0] - #upload_archive(archive_path, archive_name, service, parent=parent) - #return # TODO remove - else: - print('TODO: get from HPSS') - outfile.write(block_launcher + '\n') - else: - print('OK:', files[0]) + for idx, dir_name in enumerate(dir_names): + block_launcher_split = dir_name.split(os.sep) + #if prev is not None and prev != 
block_launcher_split[0]: # TODO remove + # break + print(idx, dir_name) + archive_name = '{}.tar.gz'.format(block_launcher_split[-1]) + query = "name = '{}'".format(archive_name) + response = service.files().list( + q=query, spaces='drive', fields='files(id, name, size, parents)' + ).execute() + files = response['files'] + archive_path = os.path.join(stage_dir, dir_name + '.tar.gz') + if files: + if len(files) > 1: + # duplicate uploads - delete all and re-upload + for f in files: + print('removing', f['name'], '...') + service.files().delete(fileId=f['id']).execute() + print('TODO: rerun to upload!') + elif int(files[0]['size']) < 50: + service.files().delete(fileId=files[0]['id']).execute() + if os.path.exists(archive_path): + parent = files[0]['parents'][0] + upload_archive(archive_path, archive_name, service, parent=parent) else: - if os.path.exists(archive_path): - # make directories - parents = [garden_id] - for folder in block_launcher_split[:-1]: - query = "name = '{}'".format(folder) - response = service.files().list( - q=query, spaces='drive', fields='files(id, name)', pageSize=1 - ).execute() - if not response['files']: - print('create dir ...', folder) - body = { - 'name': folder, - 'mimeType': "application/vnd.google-apps.folder", - 'parents': [parents[-1]] - } - gdrive_folder = service.files().create(body=body).execute() - parents.append(gdrive_folder['id']) - else: - parents.append(response['files'][0]['id']) - - #upload_archive(archive_path, archive_name, service, parent=parents[-1]) - else: - print('TODO: get from HPSS') - outfile.write(block_launcher + '\n') - nr_tasks_processed += 1 - prev = block_launcher_split[0] - break + print('TODO: get from HPSS') + outfile.write(dir_name + '\n') + else: + print('OK:', files[0]) else: - print('could not split', dir_name) - return + if os.path.exists(archive_path): + # make directories + parents = [garden_id] + for folder in block_launcher_split[:-1]: + query = "name = '{}'".format(folder) + response = service.files().list( + q=query, spaces='drive', fields='files(id, name)', pageSize=1 + ).execute() + if not response['files']: + print('create dir ...', folder) + body = { + 'name': folder, + 'mimeType': "application/vnd.google-apps.folder", + 'parents': [parents[-1]] + } + gdrive_folder = service.files().create(body=body).execute() + parents.append(gdrive_folder['id']) + else: + parents.append(response['files'][0]['id']) + + upload_archive(archive_path, archive_name, service, parent=parents[-1]) + else: + print('TODO: get from HPSS') + outfile.write(dir_name + '\n') + nr_tasks_processed += 1 + prev = block_launcher_split[0] print(nr_tasks_processed) outfile.close() - From 757534f89ba653023ceec0aa84368af370326355 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 20 Dec 2018 05:34:34 -0800 Subject: [PATCH 58/97] cli: save hpss_to_mpdrive --- emmet/scripts/hpss_to_mpdrive.sh | 36 +++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/emmet/scripts/hpss_to_mpdrive.sh b/emmet/scripts/hpss_to_mpdrive.sh index 2f5e10b7ef..486957eadf 100755 --- a/emmet/scripts/hpss_to_mpdrive.sh +++ b/emmet/scripts/hpss_to_mpdrive.sh @@ -16,7 +16,7 @@ for dir in $dirs; do grep -q "$dir" $hpss_missing [[ $? 
-eq 0 ]] && continue - #[[ -d $stage_dir/$dir ]] && rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir + [[ -d $stage_dir/$dir ]] && rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir echo $files | tr ' ' '\n' | sort -u > ${dir}.files rclone lsf -R --files-only mp-drive:calculations/garden/$dir | sed "s:^:$dir/:g" | sed 's:.tar.gz::g' | sort -u > ${dir}.rclone_lsf @@ -26,16 +26,16 @@ for dir in $dirs; do for f in $(comm --check-order -23 ${dir}.files ${dir}.rclone_lsf); do # launch dirs missing in mp-drive launch_dir_tar="${stage_dir}/${f}.tar.gz" if [[ ! -f $launch_dir_tar || ! -s $launch_dir_tar ]]; then - echo $f >> $missing_paths - elif [ -d $f ]; then - rm -rv $f - fi + echo $f >> $missing_paths + elif [ -d $f ]; then + rm -rv $f + fi done for f in $(comm --check-order -12 ${dir}.files ${dir}.rclone_lsf | tr '\n' ' '); do # already cloned launch dirs -> cleanup launch_dir_tar="${stage_dir}/${f}.tar.gz" [[ -d $f ]] && rm -rv $f - [[ -e $launch_dir_tar ]] && rm -v $launch_dir_tar + [[ -e $launch_dir_tar ]] && rm -v $launch_dir_tar done rm -v ${dir}.files ${dir}.rclone_lsf @@ -61,8 +61,17 @@ for dir in $dirs; do if [ -e $extract ] && [ -s $extract ]; then echo "extract" `wc -l $extract` - tar -xvzf ${dir}.tar.gz --files-from $extract - [[ $? -ne 0 ]] && rm -v $extract && exit + if tar -xvzf ${dir}.tar.gz --files-from $extract; then + echo 'extract OK' + else + rm -v $extract + echo 'problem with extract!' + continue + fi + else + echo 'nothing to extract' + rm -v $extract + continue fi rm -v $extract @@ -70,14 +79,17 @@ for dir in $dirs; do launch_dir_tar="${stage_dir}/${f}.tar.gz" echo $launch_dir_tar ... mkdir -p `dirname $launch_dir_tar` - tar_code=$(tar -czf $launch_dir_tar -C `dirname $f` `basename $f`) - [[ $tar_code -ne 0 ]] && echo 'problem with launch dir tar!' && exit - ls -ltrh $launch_dir_tar + if tar -czf $launch_dir_tar -C `dirname $f` `basename $f`; then + ls -ltrh $launch_dir_tar + else + echo 'problem with launch dir tar!' + continue + fi #[[ -d $f ]] && rm -rf $f done rm -v $missing_paths - #rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir + rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir #rm -v ${dir}.tar.gz done From 9352efca725abd12ff27eb0dd3909d2da1ac797a Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 20 Dec 2018 16:58:31 +0100 Subject: [PATCH 59/97] retrieve from gdrive and compare to NoMaD works --- retrieve_mpraw_data.py | 116 ++++++++++++++++++++++------------------- 1 file changed, 61 insertions(+), 55 deletions(-) diff --git a/retrieve_mpraw_data.py b/retrieve_mpraw_data.py index 6a2a3423b5..7360afa514 100644 --- a/retrieve_mpraw_data.py +++ b/retrieve_mpraw_data.py @@ -1,22 +1,22 @@ from __future__ import print_function -import io, os +import io, os, sys from googleapiclient.discovery import build from httplib2 import Http from oauth2client import file, client, tools from googleapiclient.http import MediaIoBaseDownload -from pprint import pprint from tqdm import tqdm +import requests # If modifying these scopes, delete the file token.json. 
# see https://developers.google.com/identity/protocols/googlescopes#drivev3 SCOPES = 'https://www.googleapis.com/auth/drive' -OUTDIR = 'mpraw' -CHUNKSIZE = 1024*1024 # 5MB +OUTDIR = '/nomad/nomadlab/mpraw' +NOMAD_REPO = 'http://backend-repository-nomad.esc:8111/repo/search/calculations_oldformat?query={}' def download_file(service, file_id): request = service.files().get_media(fileId=file_id) fh = io.BytesIO() - downloader = MediaIoBaseDownload(fh, request, chunksize=CHUNKSIZE) + downloader = MediaIoBaseDownload(fh, request) done = False with tqdm(total=100) as pbar: while done is False: @@ -24,6 +24,53 @@ def download_file(service, file_id): pbar.update(int(status.progress() * 100)) return fh.getvalue() +full_launcher_path = [] + +def recurse(service, folder_id): + page_token = None + query = "'{}' in parents".format(folder_id) + while True: + response = service.files().list( + q=query, spaces='drive', pageToken=page_token, + fields='nextPageToken, files(id, name, modifiedTime, size)', + pageSize=50 + ).execute() + + for launcher in response['files']: + if '.tar.gz' in launcher['name']: + print(launcher) + launcher_name = launcher['name'].replace('.tar.gz', '') + full_launcher_path.append(launcher_name) + nomad_query='repository_main_file_uri="{}"'.format(launcher_name) + #nomad_query='alltarget repository_uri.split="{}"'.format(','.join(full_launcher_path)) # TODO + print(nomad_query) + resp = requests.get(NOMAD_REPO.format(nomad_query)).json() + if 'meta' in resp: + path = os.path.join(*full_launcher_path) + '.tar.gz' + if resp['meta']['total_hits'] < 1: # calculation not found in NoMaD repo + print('Retrieve', path, '...') + if not os.path.exists(path): + os.makedirs(path) + #content = download_file(service, launcher['id']) + #with open(path, 'wb') as f: + # f.write(content) + print('... DONE.') + else: + print(path, 'found in NoMaD repo:') + for d in resp['data']: + print('\t', d['attributes']['repository_uri']) + else: + raise Exception(resp['errors'][0]['detail']) + else: + full_launcher_path.append(launcher['name']) + recurse(service, launcher['id']) + + del full_launcher_path[-1:] + + page_token = response.get('nextPageToken', None) + if page_token is None: + break # done with launchers in current block + def main(): """Shows basic usage of the Drive v3 API. Prints the names and ids of the first 10 files the user has access to. 
@@ -45,7 +92,8 @@ def main(): block_page_token = None garden_id = os.environ.get('MPDRIVE_GARDEN_ID') if garden_id: - block_query = "'{}' in parents and name contains 'block_'".format(garden_id) + #block_query = "'{}' in parents and name contains 'block_'".format(garden_id) + block_query = "'{}' in parents and name contains 'block_2011-10-07-08-57-17-804213'".format(garden_id) else: print('MPDRIVE_GARDEN_ID not set!') return @@ -53,60 +101,18 @@ def main(): while True: block_response = service.files().list( q=block_query, spaces='drive', pageToken=block_page_token, - fields='nextPageToken, files(id, name)', pageSize=2 + fields='nextPageToken, files(id, name)', pageSize=10 ).execute() for block in block_response['files']: print(block['name']) - block_dir = os.path.join(OUTDIR, block['name']) - if not os.path.exists(block_dir): - os.makedirs(block_dir) - - block_page_token = block_response.get('nextPageToken', None) - if block_page_token is None: - break # done with blocks - - # recurse into the block to retrieve launch_dir's - launcher_page_token = None - launcher_query = "'{}' in parents".format(block['id']) - - while True: - launcher_response = service.files().list( - q=launcher_query, spaces='drive', pageToken=launcher_page_token, - fields='nextPageToken, files(id, name, modifiedTime, size)', - pageSize=10 - ).execute() - - for launcher in launcher_response['files']: - # TODO 'size' doesn't exist if launcher is another dir - # due to non-reservation mode production - if int(launcher['size']) < 50: - service.files().delete(fileId=launcher['id']).execute() - print('removed', launcher['name']) - else: - # download (incl. block) - #pprint(launcher) - path = os.path.join(block_dir, launcher['name']) - print(path) - if not os.path.exists(path): - content = download_file(service, launcher['id']) - with open(path, 'wb') as f: - f.write(content) - print(path, 'downloaded.') - - launcher_page_token = launcher_response.get('nextPageToken', None) - if launcher_page_token is None: - break # done with launchers in current block - - # search for launchers in block again, and rm block if empty dir - launcher_response = service.files().list( - q=launcher_query, spaces='drive', pageSize=1 - ).execute() - if not launcher_response['files']: - service.files().delete(fileId=block['id']).execute() - print('removed', block['name']) + full_launcher_path.clear() + full_launcher_path.append(block['name']) + recurse(service, block['id']) - break # blocks loop TODO remove + block_page_token = block_response.get('nextPageToken', None) + if block_page_token is None: + break # done with blocks # TODO in production, subscribe to watch garden directory? 
# https://developers.google.com/drive/api/v3/reference/files/watch From a252b78b2fa1f4dc8400a101dbe0a88cebc0d4be Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 8 Jan 2019 16:41:01 -0800 Subject: [PATCH 60/97] cli.copy: add overview table --- emmet/scripts/emmet.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 4d3a6bc1d4..59d0cbb365 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -248,12 +248,17 @@ def insert_snls(snls_list): else: print('no SNLs to insert') + table = PrettyTable() + table.field_names = ['Tag', 'Source', 'Target', 'Skipped', 'Insert'] + sums = ['total'] + [0] * (len(table.field_names)-1) + for t in tags: - print('### {} ###'.format(t)) + print('- {}'.format(t)) + row = [t] query = {'$and': [{'tags': t}, task_base_query]} source_count = source.collection.count(query) - print('source / target:', source_count, '/', target.collection.count(query)) + row += [source_count, target.collection.count(query)] # get list of SNLs to copy over # only need to check tagged SNLs in source and target; dup-check across SNL collections already done in add_snls @@ -303,8 +308,9 @@ def insert_snls(snls_list): task_query = {'task_id': doc['task_id'], '$or': [{'dir_name': doc['dir_name']}, {'_mpworks_meta': {'$exists': 0}}]} if target.collection.count(task_query): skip_task_ids.append(doc['task_id']) - if len(skip_task_ids): - print('skip', len(skip_task_ids), 'existing MP task ids out of', nr_source_mp_tasks) + #if len(skip_task_ids): + # print('skip', len(skip_task_ids), 'existing MP task ids out of', nr_source_mp_tasks) + row.append(len(skip_task_ids)) query.update({'task_id': {'$nin': skip_task_ids}}) already_inserted_subdirs = [get_subdir(dn) for dn in target.collection.find(query).distinct('dir_name')] @@ -319,7 +325,11 @@ def insert_snls(snls_list): if len(subdirs) < 1: continue - print(len(subdirs), 'candidate tasks to insert') + row.append(len(subdirs)) + table.add_row(row) + for idx, e in enumerate(row): + if isinstance(e, int): + sums[idx] += e if not insert: continue @@ -370,6 +380,12 @@ def insert_snls(snls_list): if insert: target.insert_task(task_doc, use_gridfs=True) + table.align['Tag'] = 'r' + if tag is None: + sfmt = '\033[1;32m{}\033[0m' + table.add_row([sfmt.format(s if s else '-') for s in sums]) + print(table) + @cli.command() @click.option('--add_snlcolls', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to scan') From 0d501260e54236331c42bc0869d28111cbc1b7ce Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 9 Jan 2019 14:58:31 -0800 Subject: [PATCH 61/97] cli.gdrive: better scanning --- emmet/scripts/emmet.py | 86 +++++++++++++++++++++++++++++++++++------- 1 file changed, 72 insertions(+), 14 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 59d0cbb365..44be35c615 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1138,6 +1138,58 @@ def gdrive(target_db_file): print('MPDRIVE_GARDEN_ID not set!') return + launcher_paths = [] + full_launcher_path = [] + + def recurse(service, folder_id): + page_token = None + query = "'{}' in parents".format(folder_id) + while True: + response = service.files().list( + q=query, spaces='drive', pageToken=page_token, + fields='nextPageToken, files(id, name, modifiedTime, size)', + ).execute() + + for launcher in response['files']: + if '.json' not in launcher['name']: + if '.tar.gz' in 
launcher['name']: + launcher_name = launcher['name'].replace('.tar.gz', '') + full_launcher_path.append(launcher_name) + launcher_paths.append(os.path.join(*full_launcher_path)) + else: + full_launcher_path.append(launcher['name']) + recurse(service, launcher['id']) + + del full_launcher_path[-1:] + + page_token = response.get('nextPageToken', None) + if page_token is None: + break # done with launchers in current block + + block_page_token = None + sample_block = 'block_2012-0' #'block_2011-10-07-08-57-17-804213' + block_query = "'{}' in parents".format(garden_id) if sample_block is None \ + else "'{}' in parents and name contains '{}'".format(garden_id, sample_block) + + while True: + block_response = service.files().list( + q=block_query, spaces='drive', pageToken=block_page_token, + fields='nextPageToken, files(id, name)' + ).execute() + + for block in block_response['files']: + print(block['name']) + full_launcher_path.clear() + full_launcher_path.append(block['name']) + recurse(service, block['id']) + + block_page_token = block_response.get('nextPageToken', None) + if block_page_token is None: + break # done with blocks + + launcher_paths.sort() + print(len(launcher_paths), 'launcher directories in GDrive') + query = {} blessed_task_ids = [ task_id for doc in target.db.materials.find(query, {'task_id': 1, 'blessed_tasks': 1}) @@ -1145,24 +1197,30 @@ def gdrive(target_db_file): ] print(len(blessed_task_ids), 'blessed tasks.') - splits = ['block_', 'aflow_engines-'] - dir_names = [] + nr_launchers_sync = 0 + outfile = open('launcher_paths.txt', 'w') + splits = ['block_', 'aflow_engines-', 'launcher_'] for task in target.collection.find({'task_id': {'$in': blessed_task_ids}}, {'dir_name': 1}): dir_name = task['dir_name'] # aflow_engines-mag_special - if '2011-' in dir_name and 'block_2011-10-07-08-57-17-804213' in dir_name: # TODO remove - for s in splits: - ds = dir_name.split(s) - if len(ds) == 2: - block_launcher = s + ds[-1] - dir_names.append(block_launcher) - break - else: - print('could not split', dir_name) - return + if sample_block is not None and sample_block not in dir_name: + continue - dir_names.sort() - print(len(dir_names), 'launcher directories to sync.') + for s in splits: + ds = dir_name.split(s) + if len(ds) == 2: + block_launcher = s + ds[-1] + if dir_name not in launcher_paths: + nr_launchers_sync += 1 + outfile.write(block_launcher + '\n') + break + else: + print('could not split', dir_name) + return + + outfile.close() + print(nr_launchers_sync, 'launchers to sync') + return nr_tasks_processed = 0 prev = None From de620cb94bdd5d87e019e335bc275712aa6a1e5a Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 10 Jan 2019 12:41:31 -0800 Subject: [PATCH 62/97] cli.report: fix status columns --- emmet/scripts/emmet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 44be35c615..6b42abe23c 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -818,7 +818,7 @@ def report(tag, in_progress, to_csv): """generate a report of calculations status""" lpad = LaunchPad.auto_load() - states = ['COMPLETED', 'FIZZLED', 'READY', 'RUNNING'] + states = ['READY', 'RESERVED', 'RUNNING', 'FIZZLED', 'COMPLETED'] tags = [tag] if tag is None: From ab07d90643fb4ac8b4723fe840061c31f52b3e4a Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 10 Jan 2019 12:42:01 -0800 Subject: [PATCH 63/97] cli.wflows: add current year to future wflows --- emmet/scripts/emmet.py | 3 ++- 1 file changed, 2 insertions(+), 1 
deletion(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 6b42abe23c..d91c7da4c9 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -38,6 +38,7 @@ structure_keys = ['snl_id', 'lattice', 'sites', 'charge', 'about._materialsproject.task_id'] aggregation_keys = ['reduced_cell_formula', 'formula_pretty'] SCOPES = 'https://www.googleapis.com/auth/drive' +current_year = int(datetime.today().year) def aggregate_by_formula(coll, q, key=None): query = {'$and': [q, exclude]} @@ -752,7 +753,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): try: wf = wf_structure_optimization(struct, c={'ADD_MODIFY_INCAR': True}) wf = add_trackers(wf) - wf = add_tags(wf, [tag]) + wf = add_tags(wf, [tag, 'mp_{}'.format(current_year)]) if struct.task_id is not None: wf = add_additional_fields_to_taskdocs(wf, update_dict={'task_id': struct.task_id}) except Exception as ex: From 32e54ccd7a5d68d314dc5a44d3dba4311792f005 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 10 Jan 2019 15:23:18 -0800 Subject: [PATCH 64/97] cli.copy: avoid accidentally copying tasks w/o year tag --- emmet/scripts/emmet.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index d91c7da4c9..93ce3fb5e1 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -39,6 +39,7 @@ aggregation_keys = ['reduced_cell_formula', 'formula_pretty'] SCOPES = 'https://www.googleapis.com/auth/drive' current_year = int(datetime.today().year) +year_tags = ['mp_{}'.format(y) for y in range(2018, current_year+1)] def aggregate_by_formula(coll, q, key=None): query = {'$and': [q, exclude]} @@ -69,7 +70,7 @@ def get_meta_from_structure(struct): return d # a utility function to get us a slice of an iterator, as an iterator -# when working with iterators maximum lazyness is preferred +# when working with iterators maximum lazyness is preferred def iterator_slice(iterator, length): iterator = iter(iterator) while True: @@ -89,7 +90,7 @@ def contains_vasp_dirs(list_of_files): for f in list_of_files: if f.startswith("INCAR"): return True - + def get_symlinked_path(root, base_path_index, insert): root_split = os.path.realpath(root).split(os.sep) if base_path_index != len(root_split) and \ @@ -234,9 +235,12 @@ def copy(target_db_file, tag, insert, copy_snls): ensure_indexes(['task_id', 'tags', 'dir_name', 'retired_task_id'], [source.collection, target.collection]) + # don't accidentally copy tasks without year tag + task_base_query['tags']['$in'] = year_tags + tags = [tag] if tag is None: - tags = [t for t in source.collection.find(task_base_query).distinct('tags') if t is not None] + tags = [t for t in source.collection.find(task_base_query).distinct('tags') if t is not None and t not in year_tags] print(len(tags), 'tags in source collection') def insert_snls(snls_list): @@ -753,7 +757,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): try: wf = wf_structure_optimization(struct, c={'ADD_MODIFY_INCAR': True}) wf = add_trackers(wf) - wf = add_tags(wf, [tag, 'mp_{}'.format(current_year)]) + wf = add_tags(wf, [tag, year_tags[-1]]) if struct.task_id is not None: wf = add_additional_fields_to_taskdocs(wf, update_dict={'task_id': struct.task_id}) except Exception as ex: @@ -823,7 +827,7 @@ def report(tag, in_progress, to_csv): tags = [tag] if tag is None: - tags = [t for t in lpad.workflows.distinct('metadata.tags') if t is not None] + tags = [t for t in lpad.workflows.distinct('metadata.tags') if 
t is not None and t not in year_tags] tags += [t for t in lpad.db.add_wflows_logs.distinct('tags') if t is not None and t not in tags] all_tags = [] for t in tags: @@ -1138,7 +1142,7 @@ def gdrive(target_db_file): if not garden_id: print('MPDRIVE_GARDEN_ID not set!') return - + launcher_paths = [] full_launcher_path = [] From fe00279b923f2c838c78b7351c83dc918b8f4603 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 14 Jan 2019 13:56:55 -0800 Subject: [PATCH 65/97] cli.gdrive: add block filter option --- emmet/scripts/emmet.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 93ce3fb5e1..2170673493 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1126,7 +1126,8 @@ def upload_archive(path, name, service, parent=None): @cli.command() @click.argument('target_db_file', type=click.Path(exists=True)) -def gdrive(target_db_file): +@click.option('--block-filter', '-f', help='block filter substring (e.g. block_2017-)') +def gdrive(target_db_file, block_filter): """sync launch directories for target task DB to Google Drive""" target = VaspCalcDb.from_db_file(target_db_file, admin=True) print('connected to target db with', target.collection.count(), 'tasks') @@ -1172,9 +1173,8 @@ def recurse(service, folder_id): break # done with launchers in current block block_page_token = None - sample_block = 'block_2012-0' #'block_2011-10-07-08-57-17-804213' - block_query = "'{}' in parents".format(garden_id) if sample_block is None \ - else "'{}' in parents and name contains '{}'".format(garden_id, sample_block) + block_query = "'{}' in parents".format(garden_id) if block_filter is None \ + else "'{}' in parents and name contains '{}'".format(garden_id, block_filter) while True: block_response = service.files().list( @@ -1203,19 +1203,19 @@ def recurse(service, folder_id): print(len(blessed_task_ids), 'blessed tasks.') nr_launchers_sync = 0 - outfile = open('launcher_paths.txt', 'w') + outfile = open('launcher_paths_{}.txt'.format(block_filter), 'w') splits = ['block_', 'aflow_engines-', 'launcher_'] for task in target.collection.find({'task_id': {'$in': blessed_task_ids}}, {'dir_name': 1}): dir_name = task['dir_name'] # aflow_engines-mag_special - if sample_block is not None and sample_block not in dir_name: + if block_filter is not None and block_filter not in dir_name: continue for s in splits: ds = dir_name.split(s) if len(ds) == 2: block_launcher = s + ds[-1] - if dir_name not in launcher_paths: + if block_launcher not in launcher_paths: nr_launchers_sync += 1 outfile.write(block_launcher + '\n') break From 490a1fd6c23872805598ac02771df68e81ed5d84 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 14 Jan 2019 21:16:35 -0800 Subject: [PATCH 66/97] cli.parse: better block/launcher organization --- emmet/scripts/emmet.py | 73 ++++++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 24 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 2170673493..12c6a66f72 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -91,34 +91,52 @@ def contains_vasp_dirs(list_of_files): if f.startswith("INCAR"): return True +def clean_path(path): + return os.path.join(os.path.abspath(os.path.realpath(path)), '') # trailing slash + +def make_block(base_path): + block = get_timestamp_dir(prefix='block') + block_dir = os.path.join(base_path, block) + os.mkdir(block_dir) + print('created', block_dir) + return block_dir + def get_symlinked_path(root, 
base_path_index, insert): - root_split = os.path.realpath(root).split(os.sep) - if base_path_index != len(root_split) and \ - not root_split[base_path_index-1].startswith('block_'): - rootdir = os.sep.join(root_split[:base_path_index]) - block = get_timestamp_dir(prefix='block') - block_dir = os.sep.join(root_split[:base_path_index-1] + [block]) - if insert: - os.rename(rootdir, block_dir) - os.symlink(block_dir, rootdir) - print(rootdir, '->', block_dir) - subdir = os.sep.join(root_split) + """organize directory in block_*/launcher_* via symbolic links""" + root_split = root.split(os.sep) + base_path = os.sep.join(root_split[:base_path_index]) + + if not root_split[base_path_index].startswith('block_'): + all_blocks = glob(os.path.join(base_path, 'block_*/')) + if all_blocks: + block_dir = max(all_blocks, key=os.path.getmtime) # last-modified block + nr_launchers = len(glob(os.path.join(block_dir, 'launcher_*/'))) + if nr_launchers > 300: # start new block + block_dir = make_block(base_path) + else: + block_dir = make_block(base_path) + else: + block_dir = os.sep.join(root_split[:base_path_index+1]) + if not root_split[-1].startswith('launcher_'): - launch = get_timestamp_dir() - launch_dir = os.path.join(os.path.realpath(os.sep.join(root_split[:-1])), launch) - if insert: - os.rename(subdir, launch_dir) - os.symlink(launch_dir, subdir) - print(subdir, '->', launch_dir) - return launch_dir + launch = get_timestamp_dir(prefix='launcher') + launch_dir = os.path.join(block_dir, launch) else: - return os.path.realpath(subdir) + launch_dir = os.sep.join(block_dir, root_split[-1]) + + if insert: + os.rename(root, launch_dir) + os.symlink(launch_dir, root) + print(root, '->', launch_dir) + return launch_dir def get_vasp_dirs(scan_path, base_path, max_dirs, insert): - base_path_split = base_path.split(os.sep) - base_path_index = len(base_path_split) - # NOTE os.walk followlinks=False by default, as intended here + scan_path = clean_path(scan_path) + base_path = clean_path(base_path) + base_path_index = len(base_path.split(os.sep))-1 # account for abspath counter = 0 + + # NOTE os.walk followlinks=False by default, as intended here for root, dirs, files in os.walk(scan_path): # TODO ignore relax1/2 subdirs if INCAR.orig found if contains_vasp_dirs(files): @@ -140,12 +158,14 @@ def get_vasp_dirs(scan_path, base_path, max_dirs, insert): if counter >= max_dirs: break + def parse_vasp_dirs(vaspdirs, insert, drone, already_inserted_subdirs): name = multiprocessing.current_process().name print(name, 'starting') lpad = LaunchPad.auto_load() target = VaspCalcDb(lpad.host, lpad.port, lpad.name, 'tasks', lpad.username, lpad.password) print(name, 'connected to target db with', target.collection.count(), 'tasks') + for vaspdir in vaspdirs: if get_subdir(vaspdir) in already_inserted_subdirs: print(name, vaspdir, 'already parsed') @@ -173,6 +193,7 @@ def parse_vasp_dirs(vaspdirs, insert, drone, already_inserted_subdirs): rmtree(vaspdir) print(name, 'removed', vaspdir) continue + if task_doc['state'] == 'successful': try: target.insert_task(task_doc, use_gridfs=True) @@ -185,6 +206,7 @@ def parse_vasp_dirs(vaspdirs, insert, drone, already_inserted_subdirs): print(name, 'also remove force_constants and retry ...') task_doc['calcs_reversed'][0]['output'].pop('force_constants') target.insert_task(task_doc, use_gridfs=True) + nr_vaspdirs = len(vaspdirs) print(name, 'processed', nr_vaspdirs, 'VASP directories') return nr_vaspdirs @@ -536,7 +558,6 @@ def find_matching_canonical_task_structures(formula, struct, 
full_name): matched_task_ids.append(s.task_id) return matched_task_ids - for tag, value in tags.items(): if skip_all_scanned and not value[1]: @@ -1075,7 +1096,7 @@ def parse(base_path, add_snlcolls, insert, make_snls, nproc, max_dirs): base_path = os.path.join(base_path, '') base_path_split = base_path.split(os.sep) tag = base_path_split[-1] if base_path_split[-1] else base_path_split[-2] - drone = VaspDrone(parse_dos='auto', additional_fields={'tags': [tag]}) + drone = VaspDrone(parse_dos='auto', additional_fields={'tags': [tag, year_tags[-1]]}) already_inserted_subdirs = [get_subdir(dn) for dn in target.collection.find({'tags': tag}).distinct('dir_name')] print(len(already_inserted_subdirs), 'VASP directories already inserted for', tag) @@ -1083,11 +1104,13 @@ def parse(base_path, add_snlcolls, insert, make_snls, nproc, max_dirs): if nproc > 1 and max_dirs <= chunk_size: nproc = 1 print('max_dirs =', max_dirs, 'but chunk size =', chunk_size, '-> parsing sequentially') + pool = multiprocessing.Pool(processes=nproc) iterator_vaspdirs = get_vasp_dirs(base_path, base_path, max_dirs, insert) iterator = iterator_slice(iterator_vaspdirs, chunk_size) # process in chunks queue = deque() total_nr_vaspdirs_parsed = 0 + while iterator or queue: try: args = [next(iterator), insert, drone, already_inserted_subdirs] @@ -1101,6 +1124,7 @@ def parse(base_path, add_snlcolls, insert, make_snls, nproc, max_dirs): queue.append(process) else: total_nr_vaspdirs_parsed += process.get() + pool.close() print('DONE:', total_nr_vaspdirs_parsed, 'parsed') @@ -1232,6 +1256,7 @@ def recurse(service, folder_id): outfile = open('launcher_paths.txt', 'w') stage_dir = '/project/projectdirs/matgen/garden/rclone_to_mp_drive' + for idx, dir_name in enumerate(dir_names): block_launcher_split = dir_name.split(os.sep) #if prev is not None and prev != block_launcher_split[0]: # TODO remove From ef82ffbb50ddb32044eb1bd7224c10319b49b744 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 14 Jan 2019 21:17:22 -0800 Subject: [PATCH 67/97] cli.parse: improve orig files logic --- emmet/scripts/emmet.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 12c6a66f72..de183b0b00 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -171,20 +171,20 @@ def parse_vasp_dirs(vaspdirs, insert, drone, already_inserted_subdirs): print(name, vaspdir, 'already parsed') continue print(name, 'vaspdir:', vaspdir) - #poscar_path = os.path.join(vaspdir, 'POSCAR.relax2.gz') - #s = Structure.from_file(poscar_path) - #nelements = len(s.composition.elements) - #if nelements > 1: - # print(name, ' -> SKIP (#elements > 1)') - # continue + if insert: - for inp in ['INCAR', 'KPOINTS', 'POTCAR', 'POSCAR']: - input_path = os.path.join(vaspdir, inp) - if not glob(input_path+'.orig*'): - input_path = glob(input_path+'*')[0] - orig_path = input_path.replace(inp, inp+'.orig') - copyfile(input_path, orig_path) - print(name, 'cp', input_path, '->', orig_path) + try: + for inp in ['INCAR', 'KPOINTS', 'POTCAR', 'POSCAR']: + input_path = os.path.join(vaspdir, inp) + if not glob(input_path+'.orig*'): + input_path = glob(input_path+'*')[0] + orig_path = input_path.replace(inp, inp+'.orig') + copyfile(input_path, orig_path) + print(name, 'cp', input_path, '->', orig_path) + except Exception as ex: + print(str(ex)) + continue + try: task_doc = drone.assimilate(vaspdir) except Exception as ex: From e77dbf9b1e736bb7d1b9541af469134557064ac7 Mon Sep 17 00:00:00 2001 
From: Patrick Huck Date: Tue, 15 Jan 2019 11:26:12 -0800 Subject: [PATCH 68/97] cli.parse: fix get_vasp_dirs --- emmet/scripts/emmet.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index de183b0b00..c5c161cb23 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -109,6 +109,7 @@ def get_symlinked_path(root, base_path_index, insert): if not root_split[base_path_index].startswith('block_'): all_blocks = glob(os.path.join(base_path, 'block_*/')) if all_blocks: + # TODO: getmtime doesn't get last created block_dir = max(all_blocks, key=os.path.getmtime) # last-modified block nr_launchers = len(glob(os.path.join(block_dir, 'launcher_*/'))) if nr_launchers > 300: # start new block @@ -121,13 +122,13 @@ def get_symlinked_path(root, base_path_index, insert): if not root_split[-1].startswith('launcher_'): launch = get_timestamp_dir(prefix='launcher') launch_dir = os.path.join(block_dir, launch) + if insert: + os.rename(root, launch_dir) + os.symlink(launch_dir, root) + print(root, '->', launch_dir) else: - launch_dir = os.sep.join(block_dir, root_split[-1]) + launch_dir = os.path.join(block_dir, root_split[-1]) - if insert: - os.rename(root, launch_dir) - os.symlink(launch_dir, root) - print(root, '->', launch_dir) return launch_dir def get_vasp_dirs(scan_path, base_path, max_dirs, insert): From 23f9b285b4b94a1d6ef894dd40139f9f15f045c5 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 15 Jan 2019 12:26:23 -0800 Subject: [PATCH 69/97] cli.parse: better way to make new block --- emmet/scripts/emmet.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index c5c161cb23..070eefdf58 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -109,10 +109,11 @@ def get_symlinked_path(root, base_path_index, insert): if not root_split[base_path_index].startswith('block_'): all_blocks = glob(os.path.join(base_path, 'block_*/')) if all_blocks: - # TODO: getmtime doesn't get last created - block_dir = max(all_blocks, key=os.path.getmtime) # last-modified block - nr_launchers = len(glob(os.path.join(block_dir, 'launcher_*/'))) - if nr_launchers > 300: # start new block + for block_dir in all_blocks: + nr_launchers = len(glob(os.path.join(block_dir, 'launcher_*/'))) + if nr_launchers < 300: + break # found an existing block with < 300 launchers + else: block_dir = make_block(base_path) else: block_dir = make_block(base_path) From ef653c1142bd181470870597e3d3afb6ca75fa36 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 15 Jan 2019 14:19:12 -0800 Subject: [PATCH 70/97] garden_to_hpss: pack in archives, access hpss from user account --- emmet/scripts/garden_to_hpss.sh | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/emmet/scripts/garden_to_hpss.sh b/emmet/scripts/garden_to_hpss.sh index c139184e15..2462b83933 100755 --- a/emmet/scripts/garden_to_hpss.sh +++ b/emmet/scripts/garden_to_hpss.sh @@ -1,12 +1,15 @@ #!/bin/bash +[[ ! -d $1/archives ]] && mkdir -v $1/archives + for block_dir in `find $1 -maxdepth 1 -type d -name "block_*"`; do echo $block_dir subdir=`basename $block_dir` - if [ ! -e ${subdir}.tar.gz ]; then - tar -czvf ${subdir}.tar.gz ${block_dir} + if [ ! -e $1/archives/${subdir}.tar.gz ]; then + tar -czvf $1/archives/${subdir}.tar.gz -C $1 $subdir fi - hsi cput ${subdir}.tar.gz : garden/${subdir}.tar.gz - [[ $? 
-ne 0 ]] && echo "not removing ${block_dir}" && continue - rm -rv $block_dir && rm -v ${subdir}.tar.gz + hsi -l matcomp cput $1/archives/${subdir}.tar.gz : garden/${subdir}.tar.gz + flag=$? + [[ $flag -ne 0 ]] && echo "not removing ${subdir}.tar.gz (flag=$flag)" && continue + rm -v $1/archives/${subdir}.tar.gz done From e47c6c658b023110290230ede39f97ade43a60d6 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 15 Jan 2019 15:19:45 -0800 Subject: [PATCH 71/97] garden_to_hpss: add fail safes --- emmet/scripts/garden_to_hpss.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/emmet/scripts/garden_to_hpss.sh b/emmet/scripts/garden_to_hpss.sh index 2462b83933..eac04c477f 100755 --- a/emmet/scripts/garden_to_hpss.sh +++ b/emmet/scripts/garden_to_hpss.sh @@ -7,9 +7,19 @@ for block_dir in `find $1 -maxdepth 1 -type d -name "block_*"`; do subdir=`basename $block_dir` if [ ! -e $1/archives/${subdir}.tar.gz ]; then tar -czvf $1/archives/${subdir}.tar.gz -C $1 $subdir + flag=$? + if [ $flag -ne 0 ]; then + echo "error with ${subdir}.tar.gz (flag=$flag)" + rm -v $1/archives/${subdir}.tar.gz + continue + fi fi hsi -l matcomp cput $1/archives/${subdir}.tar.gz : garden/${subdir}.tar.gz flag=$? - [[ $flag -ne 0 ]] && echo "not removing ${subdir}.tar.gz (flag=$flag)" && continue + if [ $flag -ne 0 ]; then + echo "error with hsi transfer for ${subdir}.tar.gz (flag=$flag)" + exit + fi rm -v $1/archives/${subdir}.tar.gz + rm -rv $block_dir done From bef604a0b09a236d2486b4c3281a9f1ff17eb484 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 15 Jan 2019 19:46:38 -0800 Subject: [PATCH 72/97] cli.gdrive: fix splits --- emmet/scripts/emmet.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 070eefdf58..4777688297 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1230,10 +1230,9 @@ def recurse(service, folder_id): nr_launchers_sync = 0 outfile = open('launcher_paths_{}.txt'.format(block_filter), 'w') - splits = ['block_', 'aflow_engines-', 'launcher_'] + splits = ['block_', 'res_1_aflow_engines-', 'aflow_engines-'] for task in target.collection.find({'task_id': {'$in': blessed_task_ids}}, {'dir_name': 1}): dir_name = task['dir_name'] - # aflow_engines-mag_special if block_filter is not None and block_filter not in dir_name: continue From 7bfe7e443fe0715a379d2a9e5ab0b0d742a3d23c Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 15 Jan 2019 19:47:07 -0800 Subject: [PATCH 73/97] garden_to_hpss: ok to force rm --- emmet/scripts/garden_to_hpss.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emmet/scripts/garden_to_hpss.sh b/emmet/scripts/garden_to_hpss.sh index eac04c477f..56abd5f0d9 100755 --- a/emmet/scripts/garden_to_hpss.sh +++ b/emmet/scripts/garden_to_hpss.sh @@ -21,5 +21,5 @@ for block_dir in `find $1 -maxdepth 1 -type d -name "block_*"`; do exit fi rm -v $1/archives/${subdir}.tar.gz - rm -rv $block_dir + rm -rfv $block_dir done From 22f98a85226b8eb513d7c7babfa147a4cc9867b5 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 16 Jan 2019 13:03:30 -0800 Subject: [PATCH 74/97] cli.copy: fix year tags before copying --- emmet/scripts/emmet.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 4777688297..796652f06b 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -259,14 +259,37 @@ def copy(target_db_file, tag, insert, copy_snls): 
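    # the copy logic below aborts early if any source task matching the requested
    # tags is missing an mp_<year> tag, so untagged tasks cannot slip into production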
ensure_indexes(['task_id', 'tags', 'dir_name', 'retired_task_id'], [source.collection, target.collection]) - # don't accidentally copy tasks without year tag - task_base_query['tags']['$in'] = year_tags - tags = [tag] if tag is None: tags = [t for t in source.collection.find(task_base_query).distinct('tags') if t is not None and t not in year_tags] print(len(tags), 'tags in source collection') + # fix year tags before copying tasks + counter = Counter() + source_tasks = source.collection.find( + {'$and': [{'tags': {'$in': tags}}, {'tags': {'$nin': year_tags}}]}, {'_id': 0, 'dir_name': 1} + ) + source_tasks_to_fix = source_tasks.count() + if source_tasks_to_fix > 0: + print(source_tasks_to_fix, 'source tasks are missing a year tag!') + print('ERROR: Aborting since this needs testing') + return + + for idx, doc in enumerate(source_tasks): + print(idx, doc['dir_name']) + # check whether I copied it over to production already -> add tag for previous year + # anything not copied is tagged with the current year + prod_task = target.collection.find_one({'dir_name': doc['dir_name']}, {'dir_name': 1, 'tags': 1}) + year_tag = year_tags[-1] + if prod_task: + for t in prod_task['tags']: + if t in year_tags: + year_tag = t + print(year_tag) + #r = source.collection.update({'dir_name': doc['dir_name']}, {'$addToSet': {'tags': year_tag}}) + #counter[year_tag] += r['nModified'] + #print(counter) + def insert_snls(snls_list): if snls_list: print('copy', len(snls_list), 'SNLs') From 525ecf73af0c8703485fc6c8b4675f988a732315 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 16 Jan 2019 16:17:17 -0800 Subject: [PATCH 75/97] cli: new subcommand find --- emmet/scripts/emmet.py | 60 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 796652f06b..86f6e715f9 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -439,6 +439,66 @@ def insert_snls(snls_list): print(table) +@cli.command() +@click.argument('email') +@click.option('--add_snlcolls', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to scan') +@click.option('--add_tasks_db', type=click.Path(exists=True), help='config file for additional tasks collection to scan') +def find(email, add_snlcolls, add_tasks_db): + """checks status of calculations by submitter or author email in SNLs""" + lpad = LaunchPad.auto_load() + + snl_collections = [lpad.db.snls] + if add_snlcolls is not None: + for snl_db_config in yaml.load_all(open(add_snlcolls, 'r')): + snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) + snl_db = snl_db_conn[snl_db_config['db']] + snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) + snl_collections.append(snl_db[snl_db_config['collection']]) + for snl_coll in snl_collections: + print(snl_coll.count(exclude), 'SNLs in', snl_coll.full_name) + + tasks_collections = OrderedDict() + tasks_collections[lpad.db.tasks.full_name] = lpad.db.tasks + if add_tasks_db is not None: # TODO multiple alt_task_db_files? 
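        # sketch for the TODO above, assuming click's multiple=True option flag:
        # each extra db.json-style file could be loaded with VaspCalcDb.from_db_file()
        # and registered in tasks_collections under its collection.full_name,
        # mirroring the single-file handling on the next lines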
+ target = VaspCalcDb.from_db_file(add_tasks_db, admin=True) + tasks_collections[target.collection.full_name] = target.collection + for full_name, tasks_coll in tasks_collections.items(): + print(tasks_coll.count(), 'tasks in', full_name) + + #ensure_indexes(['snl_id', 'about.remarks', 'submitter_email', 'about.authors.email'], snl_collections) + ensure_indexes(['snl_id', 'fw_id'], [lpad.db.add_wflows_logs]) + ensure_indexes(['fw_id'], [lpad.fireworks]) + ensure_indexes(['launch_id'], [lpad.launches]) + ensure_indexes(['dir_name', 'task_id'], tasks_collections.values()) + + snl_ids = [] + query = {'$or': [{'submitter_email': email}, {'about.authors.email': email}]} + query.update(exclude) + for snl_coll in snl_collections: + snl_ids.extend(snl_coll.distinct('snl_id', query)) + print(len(snl_ids), 'SNLs') + + fw_ids = lpad.db.add_wflows_logs.distinct('fw_id', {'snl_id': {'$in': snl_ids}}) + print(len(fw_ids), 'FWs') + + launch_ids = lpad.fireworks.distinct('launches', {'fw_id': {'$in': fw_ids}}) + print(len(launch_ids), 'launches') + + launches = lpad.launches.find({'launch_id': {'$in': launch_ids}}, {'launch_dir': 1}) + subdirs = [get_subdir(launch['launch_dir']) for launch in launches] + print(len(subdirs), 'launch directories') + + for full_name, tasks_coll in tasks_collections.items(): + print(full_name) + for subdir in subdirs: + subdir_query = {'dir_name': {'$regex': '/{}$'.format(subdir)}} + task = tasks_coll.find_one(subdir_query, {'task_id': 1}) + if task: + print(task['task_id']) + else: + print(subdir, 'not found') + + @cli.command() @click.option('--add_snlcolls', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to scan') @click.option('--add_tasks_db', type=click.Path(exists=True), help='config file for additional tasks collection to scan') From a458e087f380b096f27c69e24a99f6a222204599 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 17 Jan 2019 00:39:04 -0800 Subject: [PATCH 76/97] hpss: use htar --- emmet/scripts/garden_to_hpss.sh | 25 ++++++++++--------------- emmet/scripts/targz_to_htar.sh | 30 ++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 15 deletions(-) create mode 100755 emmet/scripts/targz_to_htar.sh diff --git a/emmet/scripts/garden_to_hpss.sh b/emmet/scripts/garden_to_hpss.sh index 56abd5f0d9..b6f485166e 100755 --- a/emmet/scripts/garden_to_hpss.sh +++ b/emmet/scripts/garden_to_hpss.sh @@ -1,25 +1,20 @@ #!/bin/bash -[[ ! -d $1/archives ]] && mkdir -v $1/archives +cd $1 && pwd for block_dir in `find $1 -maxdepth 1 -type d -name "block_*"`; do echo $block_dir - subdir=`basename $block_dir` - if [ ! -e $1/archives/${subdir}.tar.gz ]; then - tar -czvf $1/archives/${subdir}.tar.gz -C $1 $subdir - flag=$? - if [ $flag -ne 0 ]; then - echo "error with ${subdir}.tar.gz (flag=$flag)" - rm -v $1/archives/${subdir}.tar.gz - continue - fi - fi - hsi -l matcomp cput $1/archives/${subdir}.tar.gz : garden/${subdir}.tar.gz + chmod -Rv ug+rw $block_dir + [[ $? -ne 0 ]] && echo 'error in chmod' && exit + find $block_dir -type f -not -name "*.gz" -exec pigz -9v {} \; + [[ $? -ne 0 ]] && echo "error in pigz" && exit + block=`basename $block_dir` + htar -cvf garden/${block}.tar $block flag=$? 
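  # htar creates garden/${block}.tar directly in HPSS, so no local ${block}.tar.gz
  # staging copy is kept; listing the archive (e.g. htar -tf garden/${block}.tar)
  # is one way to double-check it before the block directory is eventually removed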
if [ $flag -ne 0 ]; then - echo "error with hsi transfer for ${subdir}.tar.gz (flag=$flag)" + echo "error with htar (flag=$flag)" exit fi - rm -v $1/archives/${subdir}.tar.gz - rm -rfv $block_dir + #rm -rfv $block_dir + break done diff --git a/emmet/scripts/targz_to_htar.sh b/emmet/scripts/targz_to_htar.sh new file mode 100755 index 0000000000..c0a40ec502 --- /dev/null +++ b/emmet/scripts/targz_to_htar.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# NOTE make sure matcomp is first entry in ~/.netrc! +[[ ! -e garden.txt ]] && hsi -P -l matcomp ls -1 "garden/*.tar.gz" > garden.txt + +while read block_tar_gz; do + block=`basename ${block_tar_gz%%.tar.gz}` + echo $block + if [ ! -e ${block}.tar.gz ]; then + hsi -q -l matcomp get garden/${block}.tar.gz + [[ $? -ne 0 ]] && echo 'error in hsi get' && exit + fi + if [ ! -d ${block} ]; then + tar -xvzf ${block}.tar.gz + [[ $? -ne 0 ]] && echo 'error in tar -x' && exit + fi + chmod -Rv ug+rw ${block} + [[ $? -ne 0 ]] && echo 'error in chmod' && exit + find ${block} -type f -not -name "*.gz" -exec pigz -9v {} \; + [[ $? -ne 0 ]] && echo "error in pigz" && exit + htar -cvf garden/${block}.tar ${block} + [[ $? -ne 0 ]] && echo 'error in htar -c' && exit + hsi -q -l matcomp rm garden/${block}.tar.gz + [[ $? -ne 0 ]] && echo 'error in htar rm' && exit + rm -rv ${block} + rm -v ${block}.tar.gz + break # TODO remove +done < garden.txt + + From 9ea8a991d35392a29348f528db638a24ca402884 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 17 Jan 2019 13:38:43 -0800 Subject: [PATCH 77/97] htar ready: garden_to_hpss & targz_to_htar --- emmet/scripts/garden_to_hpss.sh | 13 ++++--------- emmet/scripts/targz_to_htar.sh | 20 ++++++++------------ 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/emmet/scripts/garden_to_hpss.sh b/emmet/scripts/garden_to_hpss.sh index b6f485166e..64da6cfcc1 100755 --- a/emmet/scripts/garden_to_hpss.sh +++ b/emmet/scripts/garden_to_hpss.sh @@ -4,17 +4,12 @@ cd $1 && pwd for block_dir in `find $1 -maxdepth 1 -type d -name "block_*"`; do echo $block_dir - chmod -Rv ug+rw $block_dir + find $block_dir -not -perm -660 -exec chmod -v g+rw {} \; [[ $? -ne 0 ]] && echo 'error in chmod' && exit find $block_dir -type f -not -name "*.gz" -exec pigz -9v {} \; [[ $? -ne 0 ]] && echo "error in pigz" && exit block=`basename $block_dir` - htar -cvf garden/${block}.tar $block - flag=$? - if [ $flag -ne 0 ]; then - echo "error with htar (flag=$flag)" - exit - fi - #rm -rfv $block_dir - break + htar -M 5000000 -cvf garden/${block}.tar $block + [[ $? -ne 0 ]] && echo "error with htar" && exit + rm -rfv $block_dir done diff --git a/emmet/scripts/targz_to_htar.sh b/emmet/scripts/targz_to_htar.sh index c0a40ec502..67c0eeeab4 100755 --- a/emmet/scripts/targz_to_htar.sh +++ b/emmet/scripts/targz_to_htar.sh @@ -1,30 +1,26 @@ #!/bin/bash # NOTE make sure matcomp is first entry in ~/.netrc! -[[ ! -e garden.txt ]] && hsi -P -l matcomp ls -1 "garden/*.tar.gz" > garden.txt +cd $1 && pwd +hsi -P -l matcomp ls -1 "garden/*.tar.gz" > garden.txt while read block_tar_gz; do block=`basename ${block_tar_gz%%.tar.gz}` echo $block - if [ ! -e ${block}.tar.gz ]; then - hsi -q -l matcomp get garden/${block}.tar.gz - [[ $? -ne 0 ]] && echo 'error in hsi get' && exit - fi - if [ ! -d ${block} ]; then - tar -xvzf ${block}.tar.gz - [[ $? -ne 0 ]] && echo 'error in tar -x' && exit - fi - chmod -Rv ug+rw ${block} + hsi -q -l matcomp cget garden/${block}.tar.gz + [[ $? -ne 0 ]] && echo 'error in hsi cget' && exit + tar --skip-old-files -xvzf ${block}.tar.gz + [[ $? 
-ne 0 ]] && echo 'error in tar -x' && exit + find $block -not -perm -660 -exec chmod -v g+rw {} \; [[ $? -ne 0 ]] && echo 'error in chmod' && exit find ${block} -type f -not -name "*.gz" -exec pigz -9v {} \; [[ $? -ne 0 ]] && echo "error in pigz" && exit - htar -cvf garden/${block}.tar ${block} + htar -M 5000000 -cvf garden/${block}.tar ${block} [[ $? -ne 0 ]] && echo 'error in htar -c' && exit hsi -q -l matcomp rm garden/${block}.tar.gz [[ $? -ne 0 ]] && echo 'error in htar rm' && exit rm -rv ${block} rm -v ${block}.tar.gz - break # TODO remove done < garden.txt From fed1f3c4c6cf044255115dd65d97553a8a22c402 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 17 Jan 2019 15:18:16 -0800 Subject: [PATCH 78/97] cli.gdrive: fix block_filter logic --- emmet/scripts/emmet.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 86f6e715f9..bf5304e867 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1316,14 +1316,14 @@ def recurse(service, folder_id): splits = ['block_', 'res_1_aflow_engines-', 'aflow_engines-'] for task in target.collection.find({'task_id': {'$in': blessed_task_ids}}, {'dir_name': 1}): dir_name = task['dir_name'] - if block_filter is not None and block_filter not in dir_name: - continue - for s in splits: ds = dir_name.split(s) if len(ds) == 2: block_launcher = s + ds[-1] - if block_launcher not in launcher_paths: + if block_launcher not in launcher_paths and ( + block_filter is None or \ + (block_filter is not None and block_launcher.startswith(block_filter)) + ): nr_launchers_sync += 1 outfile.write(block_launcher + '\n') break From 04baacef0f266b6a00ef67e1116d4b3d6e4c0bea Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 17 Jan 2019 16:13:02 -0800 Subject: [PATCH 79/97] hpss_to_mpdrive ready for htar --- emmet/scripts/hpss_to_mpdrive.sh | 76 +++++++++----------------------- 1 file changed, 21 insertions(+), 55 deletions(-) diff --git a/emmet/scripts/hpss_to_mpdrive.sh b/emmet/scripts/hpss_to_mpdrive.sh index 486957eadf..cb82a0574e 100755 --- a/emmet/scripts/hpss_to_mpdrive.sh +++ b/emmet/scripts/hpss_to_mpdrive.sh @@ -1,27 +1,23 @@ #!/bin/bash -# $(find $dir -name 'INCAR.orig*' -printf '%h ') -dirs=`awk -F/ '{print $1}' $1 | sort -u` -hpss_missing="blocks_missing_in_hpss.txt" +input=$PWD/launcher_paths.txt +[[ ! -e $input ]] && echo $input missing && exit +dirs=`awk -F/ '{print $1}' $input | sort -u` -stage_dir="rclone_to_mp_drive" -[[ ! -d $stage_dir ]] && mkdir $stage_dir -[[ ! -e $hpss_missing ]] && touch $hpss_missing +cd $1 && pwd +stage_dir=rclone_to_mp_drive +[[ ! -d $stage_dir ]] && mkdir -pv $stage_dir for dir in $dirs; do - #[[ ! -e ${dir}.tar.gz ]] && echo "skip ${dir}" && continue # TODO remove - - files=`grep "^$dir" $1` - extract="${dir}.extract" - grep -q "$dir" $hpss_missing - [[ $? 
-eq 0 ]] && continue - - [[ -d $stage_dir/$dir ]] && rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir + echo $dir + files=`grep "^$dir" $input` echo $files | tr ' ' '\n' | sort -u > ${dir}.files + wc -l ${dir}.files rclone lsf -R --files-only mp-drive:calculations/garden/$dir | sed "s:^:$dir/:g" | sed 's:.tar.gz::g' | sort -u > ${dir}.rclone_lsf + wc -l ${dir}.rclone_lsf - missing_paths="${dir}.paths" + missing_paths=${dir}.paths [[ -e $missing_paths ]] && rm -v $missing_paths for f in $(comm --check-order -23 ${dir}.files ${dir}.rclone_lsf); do # launch dirs missing in mp-drive launch_dir_tar="${stage_dir}/${f}.tar.gz" @@ -39,57 +35,27 @@ for dir in $dirs; do done rm -v ${dir}.files ${dir}.rclone_lsf - [[ ! -e $missing_paths ]] && continue - - if [ ! -e ${dir}.tar.gz ] || [ ! -s ${dir}.tar.gz ]; then - hsi -q "get garden/${dir}.tar.gz" - [[ $? -ne 0 ]] && echo ${dir} >> $hpss_missing && continue - fi - ls -ltrh ${dir}.tar.gz - - if [ ! -e ${dir}.tar_list ] || [ ! -s ${dir}.tar_list ]; then - echo "make ${dir}.tar_list ..." - tar -tzvf ${dir}.tar.gz | grep ^d | grep -v -e '/relax1/' -e '/relax2/' | awk {'print $6'} 2>&1 | tee ${dir}.tar_list - [[ $? -ne 0 ]] && exit - fi - - paths=`cat $missing_paths` - [[ -e $extract ]] && rm -v $extract - for f in $paths; do - [[ ! -d $f ]] && grep $f ${dir}.tar_list >> $extract - done + [[ ! -e $missing_paths ]] && echo nothing missing on GDrive!? && exit #continue + wc -l $missing_paths - if [ -e $extract ] && [ -s $extract ]; then - echo "extract" `wc -l $extract` - if tar -xvzf ${dir}.tar.gz --files-from $extract; then - echo 'extract OK' - else - rm -v $extract - echo 'problem with extract!' - continue - fi - else - echo 'nothing to extract' - rm -v $extract - continue - fi - rm -v $extract + #htar -xvf garden/${dir}.tar -L $missing_paths + #[[ $? -ne 0 ]] && echo missing paths not found in HPSS!? && exit #continue + ls -ltrhd ${dir} - for f in $paths; do + for f in `cat $missing_paths`; do launch_dir_tar="${stage_dir}/${f}.tar.gz" echo $launch_dir_tar ... mkdir -p `dirname $launch_dir_tar` - if tar -czf $launch_dir_tar -C `dirname $f` `basename $f`; then + if tar --use-compress-program="pigz -9rv" -cf $launch_dir_tar -C `dirname $f` `basename $f`; then ls -ltrh $launch_dir_tar else echo 'problem with launch dir tar!' - continue + rm -v $launch_dir_tar + exit fi - #[[ -d $f ]] && rm -rf $f + [[ -d $f ]] && rm -rv $f done rm -v $missing_paths rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir - #rm -v ${dir}.tar.gz - done From be147e4b57b6c41d1b1ac7776d89dfc8a959912a Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 17 Jan 2019 16:13:28 -0800 Subject: [PATCH 80/97] targz_to_htar: use pigz --- emmet/scripts/targz_to_htar.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emmet/scripts/targz_to_htar.sh b/emmet/scripts/targz_to_htar.sh index 67c0eeeab4..ed62b40516 100755 --- a/emmet/scripts/targz_to_htar.sh +++ b/emmet/scripts/targz_to_htar.sh @@ -9,7 +9,7 @@ while read block_tar_gz; do echo $block hsi -q -l matcomp cget garden/${block}.tar.gz [[ $? -ne 0 ]] && echo 'error in hsi cget' && exit - tar --skip-old-files -xvzf ${block}.tar.gz + tar -I pigz --skip-old-files -xvf ${block}.tar.gz [[ $? -ne 0 ]] && echo 'error in tar -x' && exit find $block -not -perm -660 -exec chmod -v g+rw {} \; [[ $? 
-ne 0 ]] && echo 'error in chmod' && exit From 4589ca96301dde7ce31641975abd228807884222 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 17 Jan 2019 17:23:42 -0800 Subject: [PATCH 81/97] targz_to_htar: speed improvement on chmod --- emmet/scripts/targz_to_htar.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emmet/scripts/targz_to_htar.sh b/emmet/scripts/targz_to_htar.sh index ed62b40516..b923350d32 100755 --- a/emmet/scripts/targz_to_htar.sh +++ b/emmet/scripts/targz_to_htar.sh @@ -11,7 +11,7 @@ while read block_tar_gz; do [[ $? -ne 0 ]] && echo 'error in hsi cget' && exit tar -I pigz --skip-old-files -xvf ${block}.tar.gz [[ $? -ne 0 ]] && echo 'error in tar -x' && exit - find $block -not -perm -660 -exec chmod -v g+rw {} \; + parallel -0m 'chmod -v g+rw {}' :::: <(find $block -not -perm -660 -print0) [[ $? -ne 0 ]] && echo 'error in chmod' && exit find ${block} -type f -not -name "*.gz" -exec pigz -9v {} \; [[ $? -ne 0 ]] && echo "error in pigz" && exit From 5495ba78a8fe3edc60f4b82582004d39cb215d46 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 28 Jan 2019 14:15:54 -0800 Subject: [PATCH 82/97] cli: add fw_id for intended mp-ids --- emmet/scripts/emmet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index bf5304e867..c33fed3f60 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -791,7 +791,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if fw['spec']['_tasks'][5]['additional_fields'].get('task_id') == struct.task_id: msg = ' --> OK: workflow {} will result in intended task-id {}'.format(fw['fw_id'], struct.task_id) print(msg) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id, 'tags': [tag]}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id, 'fw_id': fw['fw_id'], 'tags': [tag]}) fw_found = True break if not fw_found: From 54f103533178e85d8c5fe448efff56cc6878aa28 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 28 Jan 2019 14:16:10 -0800 Subject: [PATCH 83/97] cli: whitespace fix --- emmet/scripts/emmet.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index c33fed3f60..6df758a21c 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1321,9 +1321,9 @@ def recurse(service, folder_id): if len(ds) == 2: block_launcher = s + ds[-1] if block_launcher not in launcher_paths and ( - block_filter is None or \ - (block_filter is not None and block_launcher.startswith(block_filter)) - ): + block_filter is None or \ + (block_filter is not None and block_launcher.startswith(block_filter)) + ): nr_launchers_sync += 1 outfile.write(block_launcher + '\n') break From 23524f41cd7092b578063b0ff2275a0018a35c02 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 28 Jan 2019 14:16:51 -0800 Subject: [PATCH 84/97] cli: minor hpss/garden scripts update --- emmet/scripts/garden_to_hpss.sh | 2 +- emmet/scripts/targz_to_htar.sh | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/emmet/scripts/garden_to_hpss.sh b/emmet/scripts/garden_to_hpss.sh index 64da6cfcc1..0a9655c3d9 100755 --- a/emmet/scripts/garden_to_hpss.sh +++ b/emmet/scripts/garden_to_hpss.sh @@ -4,7 +4,7 @@ cd $1 && pwd for block_dir in `find $1 -maxdepth 1 -type d -name "block_*"`; do echo $block_dir - find $block_dir -not -perm -660 -exec chmod -v g+rw {} \; + parallel -0m 'chmod -v 
g+rw {}' :::: <(find $block_dir -not -perm -660 -print0) [[ $? -ne 0 ]] && echo 'error in chmod' && exit find $block_dir -type f -not -name "*.gz" -exec pigz -9v {} \; [[ $? -ne 0 ]] && echo "error in pigz" && exit diff --git a/emmet/scripts/targz_to_htar.sh b/emmet/scripts/targz_to_htar.sh index b923350d32..812085ee88 100755 --- a/emmet/scripts/targz_to_htar.sh +++ b/emmet/scripts/targz_to_htar.sh @@ -1,8 +1,11 @@ #!/bin/bash # NOTE make sure matcomp is first entry in ~/.netrc! -cd $1 && pwd -hsi -P -l matcomp ls -1 "garden/*.tar.gz" > garden.txt +indir=$1 +year=$2 +garden=garden_${year}.txt +cd $indir && pwd +hsi -P -l matcomp ls -1 "garden/block_${year}*.tar.gz" > $garden while read block_tar_gz; do block=`basename ${block_tar_gz%%.tar.gz}` @@ -11,6 +14,10 @@ while read block_tar_gz; do [[ $? -ne 0 ]] && echo 'error in hsi cget' && exit tar -I pigz --skip-old-files -xvf ${block}.tar.gz [[ $? -ne 0 ]] && echo 'error in tar -x' && exit + [[ -d garden_pauling_files/$block ]] && mv -vi garden_pauling_files/$block . + [[ -d garden_cori/$block ]] && mv -vi garden_cori/$block . + [[ -d garden_JulAug2018/$block ]] && mv -vi garden_JulAug2018/$block . + [[ -d garden_Jul2018/$block ]] && mv -vi garden_Jul2018/$block . parallel -0m 'chmod -v g+rw {}' :::: <(find $block -not -perm -660 -print0) [[ $? -ne 0 ]] && echo 'error in chmod' && exit find ${block} -type f -not -name "*.gz" -exec pigz -9v {} \; @@ -21,6 +28,6 @@ while read block_tar_gz; do [[ $? -ne 0 ]] && echo 'error in htar rm' && exit rm -rv ${block} rm -v ${block}.tar.gz -done < garden.txt +done < $garden From 30e6c18f1cedc2021dab50e00821233e28598d37 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 28 Jan 2019 16:18:23 -0800 Subject: [PATCH 85/97] setup: remove py_modules --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 6a05ac67a7..4f78e444b3 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,6 @@ 'Topic :: Scientific/Engineering'], test_suite='nose.collector', tests_require=['nose'], - py_modules=['emmet'], entry_points=''' [console_scripts] emmet=emmet.scripts.emmet:cli From c4a5553108faa7b45f67c3bc10c6fc5f04771f5d Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 29 Jan 2019 14:00:15 -0800 Subject: [PATCH 86/97] cli: add bandstructure subcommand --- emmet/scripts/emmet.py | 52 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 6df758a21c..67cf3bb14d 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -16,8 +16,8 @@ from fireworks.fw_config import FW_BLOCK_FORMAT from atomate.vasp.database import VaspCalcDb from atomate.vasp.drones import VaspDrone -from atomate.vasp.workflows.presets.core import wf_structure_optimization -from atomate.vasp.powerups import add_trackers, add_tags, add_additional_fields_to_taskdocs +from atomate.vasp.workflows.presets.core import wf_structure_optimization, wf_bandstructure +from atomate.vasp.powerups import add_trackers, add_tags, add_additional_fields_to_taskdocs, add_wf_metadata from emmet.vasp.materials import group_structures, get_sg from emmet.vasp.task_tagger import task_type from log4mongo.handlers import MongoHandler, MongoFormatter @@ -498,6 +498,54 @@ def find(email, add_snlcolls, add_tasks_db): else: print(subdir, 'not found') +@cli.command() +@click.argument('target_db_file', type=click.Path(exists=True)) +@click.option('--insert/--no-insert', default=False, help='actually execute workflow addition') +def 
bandstructure(target_db_file, insert): + """add workflows for bandstructure based on materials collection""" + lpad = LaunchPad.auto_load() + source = VaspCalcDb(lpad.host, lpad.port, lpad.name, 'tasks', lpad.username, lpad.password) + print('connected to source db with', source.collection.count(), 'tasks') + target = VaspCalcDb.from_db_file(target_db_file, admin=True) + print('connected to target db with', target.collection.count(), 'tasks') + materials = target.db["materials.core"] + ensure_indexes(['task_id'], [materials]) + ensure_indexes(['metadata.task_id'], [lpad.workflows]) + print(materials.count(), 'core materials') + + all_mat_ids = set(materials.distinct('task_id')) + existing_mat_ids = set(filter(None, lpad.workflows.distinct('metadata.task_id'))) + mat_ids = all_mat_ids.symmetric_difference(existing_mat_ids) + print(len(mat_ids), 'bandstructure workflows to add') + + wflows = [] + for mat_id in mat_ids: + structure = Structure.from_dict(materials.find_one({'task_id': mat_id}, {'structure': 1})['structure']) + dir_name = target.collection.find_one({'task_id': mat_id}, {'dir_name': 1})['dir_name'] + subdir = get_subdir(dir_name) + subdir_query = {'dir_name': {'$regex': '/{}$'.format(subdir)}} + source_task = source.collection.find_one(subdir_query, {'tags': 1}) + if not source_task: + print('source task not found -> TODO') + break + + # bandstructure task has this year's tag (remove other year tags from source_task) + tags = [t for t in source_task['tags'] if t not in year_tags] + tags.append(year_tags[-1]) + + wf = wf_bandstructure(structure, c={'ADD_MODIFY_INCAR': True}) # TODO non-SO bandstructure workflow -> Alex + wf = add_trackers(wf) + wf = add_tags(wf, tags) + wf = add_wf_metadata(wf, structure) + wf.metadata["task_id"] = mat_id + wflows.append(wf) + print(wf.as_dict()) + break + + if insert: + lpad.bulk_add_wfs(wflows) + + @cli.command() @click.option('--add_snlcolls', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to scan') From 23ddab4cd1b7931b7ee4e2acb613985457ebd6f7 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 29 Jan 2019 15:20:17 -0800 Subject: [PATCH 87/97] cli.report: add all states --- emmet/scripts/emmet.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 67cf3bb14d..bb9f299926 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -12,7 +12,7 @@ from pymatgen import Structure from pymatgen.alchemy.materials import TransformedStructure from pymatgen.util.provenance import StructureNL, Author -from fireworks import LaunchPad +from fireworks import LaunchPad, Firework from fireworks.fw_config import FW_BLOCK_FORMAT from atomate.vasp.database import VaspCalcDb from atomate.vasp.drones import VaspDrone @@ -977,7 +977,8 @@ def report(tag, in_progress, to_csv): """generate a report of calculations status""" lpad = LaunchPad.auto_load() - states = ['READY', 'RESERVED', 'RUNNING', 'FIZZLED', 'COMPLETED'] + states = Firework.STATE_RANKS + states = sorted(states, key=states.get) tags = [tag] if tag is None: From b5650951664b9b46855942ac075f6954e83adb14 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 29 Jan 2019 15:22:28 -0800 Subject: [PATCH 88/97] cli: skip VolumePredictor --- emmet/scripts/emmet.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index bb9f299926..e77876130d 100644 --- 
a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -789,17 +789,18 @@ def find_matching_canonical_task_structures(formula, struct, full_name): for struc in slist: - try: - struct = vp.get_predicted_structure(struc) - struct.snl_id, struct.task_id = struc.snl_id, struc.task_id - except Exception as ex: - print('Structure for SNL', struc.snl_id, '--> VP error: use original structure!') - print(ex) - struct = struc - - if not structures_match(struct, struc): - print('Structure for SNL', struc.snl_id, '--> VP mismatch: use original structure!') - struct = struc + #try: + # struct = vp.get_predicted_structure(struc) + # struct.snl_id, struct.task_id = struc.snl_id, struc.task_id + #except Exception as ex: + # print('Structure for SNL', struc.snl_id, '--> VP error: use original structure!') + # print(ex) + # struct = struc + + #if not structures_match(struct, struc): + # print('Structure for SNL', struc.snl_id, '--> VP mismatch: use original structure!') + # struct = struc + struct = struc wf_found = False if sgnum in canonical_workflow_structures[formula] and canonical_workflow_structures[formula][sgnum]: From 0227a9eb4436cc92ba2e0820f013941462dcd94c Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 5 Feb 2019 11:53:50 -0800 Subject: [PATCH 89/97] cli.gdrive: fix store init --- emmet/scripts/emmet.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index e77876130d..a67bed480c 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1292,10 +1292,13 @@ def gdrive(target_db_file, block_filter): print('connected to target db with', target.collection.count(), 'tasks') print(target.db.materials.count(), 'materials') - store = file.Storage('token.json') - creds = store.get() + creds, store = None, None + if os.path.exists('token.json'): + store = file.Storage('token.json') + creds = store.get() if not creds or creds.invalid: flow = client.flow_from_clientsecrets('credentials.json', SCOPES) + store = file.Storage('token.json') creds = tools.run_flow(flow, store) service = build('drive', 'v3', http=creds.authorize(Http())) garden_id = os.environ.get('MPDRIVE_GARDEN_ID') From eb128bd60880a9df6d2d8f9a4e4aaad56a047301 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 5 Feb 2019 11:54:53 -0800 Subject: [PATCH 90/97] cli minor hpss scripts update --- emmet/scripts/hpss_to_mpdrive.sh | 10 ++++++---- emmet/scripts/targz_to_htar.sh | 2 ++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/emmet/scripts/hpss_to_mpdrive.sh b/emmet/scripts/hpss_to_mpdrive.sh index cb82a0574e..7bc7689793 100755 --- a/emmet/scripts/hpss_to_mpdrive.sh +++ b/emmet/scripts/hpss_to_mpdrive.sh @@ -1,6 +1,6 @@ #!/bin/bash -input=$PWD/launcher_paths.txt +input=$2 [[ ! -e $input ]] && echo $input missing && exit dirs=`awk -F/ '{print $1}' $input | sort -u` @@ -35,14 +35,15 @@ for dir in $dirs; do done rm -v ${dir}.files ${dir}.rclone_lsf - [[ ! -e $missing_paths ]] && echo nothing missing on GDrive!? && exit #continue + [[ ! -e $missing_paths ]] && echo nothing missing on GDrive!? && continue wc -l $missing_paths - #htar -xvf garden/${dir}.tar -L $missing_paths - #[[ $? -ne 0 ]] && echo missing paths not found in HPSS!? && exit #continue + htar -xvf garden/${dir}.tar `cat $missing_paths | tr '\n' ' '` ls -ltrhd ${dir} + [[ $? -ne 0 ]] && echo missing paths not found in HPSS!? && continue for f in `cat $missing_paths`; do + [[ ! -e $f ]] && echo $f not found in HPSS!? 
&& continue launch_dir_tar="${stage_dir}/${f}.tar.gz" echo $launch_dir_tar ... mkdir -p `dirname $launch_dir_tar` @@ -58,4 +59,5 @@ for dir in $dirs; do rm -v $missing_paths rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir + find $dir -type d -empty -print -delete done diff --git a/emmet/scripts/targz_to_htar.sh b/emmet/scripts/targz_to_htar.sh index 812085ee88..3c33754719 100755 --- a/emmet/scripts/targz_to_htar.sh +++ b/emmet/scripts/targz_to_htar.sh @@ -18,6 +18,8 @@ while read block_tar_gz; do [[ -d garden_cori/$block ]] && mv -vi garden_cori/$block . [[ -d garden_JulAug2018/$block ]] && mv -vi garden_JulAug2018/$block . [[ -d garden_Jul2018/$block ]] && mv -vi garden_Jul2018/$block . + [[ -d garden_Aug14-16_2018/$block ]] && mv -vi garden_Aug14-16_2018/$block . + [[ -d garden_Aug2018/$block ]] && mv -vi garden_Aug2018/$block . parallel -0m 'chmod -v g+rw {}' :::: <(find $block -not -perm -660 -print0) [[ $? -ne 0 ]] && echo 'error in chmod' && exit find ${block} -type f -not -name "*.gz" -exec pigz -9v {} \; From 63ab2f0411da93289ee555814c9fe9b1f70bf633 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 5 Feb 2019 11:55:14 -0800 Subject: [PATCH 91/97] cli: add update_hpss_archive.sh --- emmet/scripts/update_hpss_archive.sh | 44 ++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100755 emmet/scripts/update_hpss_archive.sh diff --git a/emmet/scripts/update_hpss_archive.sh b/emmet/scripts/update_hpss_archive.sh new file mode 100755 index 0000000000..31896c2c12 --- /dev/null +++ b/emmet/scripts/update_hpss_archive.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +indir=/project/projectdirs/matgen/garden/hpss_to_mpdrive/raw +cd $indir && pwd + +#for block in $(find . -maxdepth 1 -type d -name "block_2011*" -exec basename {} \;); do +#for block in $(cat hpss_update_2013.txt); do +for block_targz in $(ls block_201*.tar.gz); do + tar -I pigz --skip-old-files -xvf ${block_targz} + [[ $? -ne 0 ]] && echo "error in tar -x" && exit + block=${block_targz%%.tar.gz} + echo $block + [[ ! -d $block ]] && echo $block does not exist && exit + find $block -type d -empty -print -delete + [[ ! -d $block ]] && echo $block only contained empty directories && exit + + parallel -0m 'chmod -v g+rw {}' :::: <(find $block -not -perm -660 -print0) + [[ $? -ne 0 ]] && echo 'error in chmod' && exit + find $block -type f -not -name "*.gz" -exec pigz -9v {} \; + [[ $? -ne 0 ]] && echo "error in pigz" && exit + + htar -vtf garden/${block}.tar | awk '{ print $7 }' | sort -u > ${block}.tar.idx + [[ $? -ne 0 ]] && echo "error in htar -t" && exit + find $block -type f | sort -u > ${block}.idx + + comm -13 ${block}.tar.idx ${block}.idx > ${block}.missing + if [ -s ${block}.missing ]; then + nfiles=$(wc -l ${block}.missing | awk '{ print $1}') + echo need syncing of $nfiles files + htar -xvf garden/${block}.tar + [[ $? -ne 0 ]] && echo "error in htar -x" && exit + hsi -q -l matcomp mv garden/${block}.tar garden/${block}.tar.bkp + hsi -q -l matcomp mv garden/${block}.tar.idx garden/${block}.tar.idx.bkp + htar -M 5000000 -cvf garden/${block}.tar ${block} + [[ $? -ne 0 ]] && echo "error in htar -c" && exit + hsi -q -l matcomp rm garden/${block}.tar*.bkp + [[ $? 
-ne 0 ]] && echo 'error in htar rm' && exit + else + echo all files already in HTAR archive + fi + rm -rv ${block} + rm -v ${block}.tar.idx ${block}.idx ${block}.missing + rm -v ${block_targz} +done From f4c60869283d13fbb040d6587c5623ff6b0e6eb0 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 5 Feb 2019 11:55:58 -0800 Subject: [PATCH 92/97] cli: add sbatch scripts --- emmet/scripts/sbatch/submit_garden_to_hpss.txt | 14 ++++++++++++++ emmet/scripts/sbatch/submit_hpss_MatProj.script | 16 ++++++++++++++++ .../scripts/sbatch/submit_hpss_to_mpdrive.script | 13 +++++++++++++ emmet/scripts/sbatch/submit_restore_MatProj.txt | 16 ++++++++++++++++ emmet/scripts/sbatch/submit_rsync.script | 13 +++++++++++++ emmet/scripts/sbatch/submit_targz_to_htar.script | 15 +++++++++++++++ .../sbatch/submit_update_hpss_archive.script | 11 +++++++++++ 7 files changed, 98 insertions(+) create mode 100644 emmet/scripts/sbatch/submit_garden_to_hpss.txt create mode 100644 emmet/scripts/sbatch/submit_hpss_MatProj.script create mode 100644 emmet/scripts/sbatch/submit_hpss_to_mpdrive.script create mode 100644 emmet/scripts/sbatch/submit_restore_MatProj.txt create mode 100644 emmet/scripts/sbatch/submit_rsync.script create mode 100644 emmet/scripts/sbatch/submit_targz_to_htar.script create mode 100644 emmet/scripts/sbatch/submit_update_hpss_archive.script diff --git a/emmet/scripts/sbatch/submit_garden_to_hpss.txt b/emmet/scripts/sbatch/submit_garden_to_hpss.txt new file mode 100644 index 0000000000..4014113384 --- /dev/null +++ b/emmet/scripts/sbatch/submit_garden_to_hpss.txt @@ -0,0 +1,14 @@ +#!/bin/bash -l +#SBATCH --qos=xfer +#SBATCH --time=48:00:00 +#SBATCH --job-name=garden_to_hpss +#SBATCH --licenses=SCRATCH +#SBATCH --mail-user=phuck@lbl.gov +#SBATCH --mail-type=ALL +#SBATCH --output=garden_to_hpss-%j.out +#SBATCH --error=garden_to_hpss-%j.error +#SBATCH --mem=10GB + +script=$HOME/mp_prod/codes/emmet/emmet/scripts/garden_to_hpss.sh +indir=/project/projectdirs/matgen/garden/hpss_to_mpdrive/raw/ +$script $indir diff --git a/emmet/scripts/sbatch/submit_hpss_MatProj.script b/emmet/scripts/sbatch/submit_hpss_MatProj.script new file mode 100644 index 0000000000..dcb854abfe --- /dev/null +++ b/emmet/scripts/sbatch/submit_hpss_MatProj.script @@ -0,0 +1,16 @@ +#!/bin/bash -l +#SBATCH --qos=xfer +#SBATCH --time=48:00:00 +#SBATCH --job-name=hpss_MatProj +#SBATCH --licenses=SCRATCH +#SBATCH --mail-user=phuck@lbl.gov +#SBATCH --mail-type=ALL +#SBATCH --output=hpss_MatProj-%j.out +#SBATCH --error=hpss_MatProj-%j.error +#SBATCH --mem=10GB + +while read line; do + echo $line + hsi -q -l matcomp ls -1 ${line}.idx + [[ $? 
-ne 0 ]] && htar -Xvf $line +done < hpss_MatProj_2014.txt diff --git a/emmet/scripts/sbatch/submit_hpss_to_mpdrive.script b/emmet/scripts/sbatch/submit_hpss_to_mpdrive.script new file mode 100644 index 0000000000..b22041d5da --- /dev/null +++ b/emmet/scripts/sbatch/submit_hpss_to_mpdrive.script @@ -0,0 +1,13 @@ +#!/bin/bash -l +#SBATCH --qos=xfer +#SBATCH --time=19:00:00 +#SBATCH --job-name=hpss_to_mpdrive +#SBATCH --licenses=SCRATCH +#SBATCH --mail-user=phuck@lbl.gov +#SBATCH --mail-type=ALL +#SBATCH --output=hpss_to_mpdrive-%j.out +#SBATCH --error=hpss_to_mpdrive-%j.error + +indir=/project/projectdirs/matgen/garden/hpss_to_mpdrive +input=/global/homes/h/huck/mp_prod/workdir/emmet_gdrive/launcher_paths_block_2019.txt +~/mp_prod/codes/emmet/emmet/scripts/hpss_to_mpdrive.sh $indir $input diff --git a/emmet/scripts/sbatch/submit_restore_MatProj.txt b/emmet/scripts/sbatch/submit_restore_MatProj.txt new file mode 100644 index 0000000000..e616942e64 --- /dev/null +++ b/emmet/scripts/sbatch/submit_restore_MatProj.txt @@ -0,0 +1,16 @@ +#!/bin/bash -l +#SBATCH --qos=xfer +#SBATCH --time=48:00:00 +#SBATCH --job-name=restore_matproj +#SBATCH --licenses=SCRATCH +#SBATCH --mail-user=phuck@lbl.gov +#SBATCH --mail-type=ALL +#SBATCH --output=restore_matproj-%j.out +#SBATCH --error=restore_matproj-%j.error + +outdir=/project/projectdirs/matgen/garden/hpss_to_mpdrive/raw/ +archive=/home/projects/MatProj/GARDEN/2012-Jul-Aug.tar + +cd $outdir && pwd +htar -xvf $archive +echo DONE diff --git a/emmet/scripts/sbatch/submit_rsync.script b/emmet/scripts/sbatch/submit_rsync.script new file mode 100644 index 0000000000..e08eaef033 --- /dev/null +++ b/emmet/scripts/sbatch/submit_rsync.script @@ -0,0 +1,13 @@ +#!/bin/bash -l +#SBATCH --qos=xfer +#SBATCH --time=48:00:00 +#SBATCH --job-name=rsync +#SBATCH --licenses=SCRATCH +#SBATCH --mail-user=phuck@lbl.gov +#SBATCH --mail-type=ALL +#SBATCH --output=rsync-%j.out +#SBATCH --error=rsync-%j.error + +indir=/project/projectdirs/matgen/garden/control_blocks +outdir=/project/projectdirs/matgen/garden/hpss_to_mpdrive/raw/ +rsync --remove-source-files -av $indir/block_* $outdir diff --git a/emmet/scripts/sbatch/submit_targz_to_htar.script b/emmet/scripts/sbatch/submit_targz_to_htar.script new file mode 100644 index 0000000000..953da4efa1 --- /dev/null +++ b/emmet/scripts/sbatch/submit_targz_to_htar.script @@ -0,0 +1,15 @@ +#!/bin/bash -l +#SBATCH --qos=xfer +#SBATCH --time=48:00:00 +#SBATCH --job-name=targz_to_htar +#SBATCH --licenses=SCRATCH +#SBATCH --mail-user=phuck@lbl.gov +#SBATCH --mail-type=ALL +#SBATCH --output=targz_to_htar-%j.out +#SBATCH --error=targz_to_htar-%j.error +#SBATCH --mem=10GB + +targz_to_htar=$HOME/mp_prod/codes/emmet/emmet/scripts/targz_to_htar.sh +indir=/project/projectdirs/matgen/garden/hpss_to_mpdrive/raw/ +year=2019 +$targz_to_htar $indir $year diff --git a/emmet/scripts/sbatch/submit_update_hpss_archive.script b/emmet/scripts/sbatch/submit_update_hpss_archive.script new file mode 100644 index 0000000000..ad9b2da997 --- /dev/null +++ b/emmet/scripts/sbatch/submit_update_hpss_archive.script @@ -0,0 +1,11 @@ +#!/bin/bash -l +#SBATCH --qos=xfer +#SBATCH --time=06:30:00 +#SBATCH --job-name=update_hpss_archive +#SBATCH --licenses=SCRATCH +#SBATCH --mail-user=phuck@lbl.gov +#SBATCH --mail-type=ALL +#SBATCH --output=update_hpss_archive-%j.out +#SBATCH --error=update_hpss_archive-%j.error + +~/mp_prod/codes/emmet/emmet/scripts/update_hpss_archive.sh From 44fa0b5434daf4b17f893385816ffb82752a96b3 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 5 Feb 
2019 21:31:36 +0100 Subject: [PATCH 93/97] prepare for emmet merge --- .gitignore | 108 ------------------ LICENSE | 21 ---- README.md | 2 - .../emmet/scripts/retrieve_mpraw_data.py | 0 requirements.txt | 13 --- 5 files changed, 144 deletions(-) delete mode 100644 .gitignore delete mode 100644 LICENSE delete mode 100644 README.md rename retrieve_mpraw_data.py => emmet/emmet/scripts/retrieve_mpraw_data.py (100%) delete mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 0fe7e91a59..0000000000 --- a/.gitignore +++ /dev/null @@ -1,108 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# pyenv -.python-version - -# celery beat schedule file -celerybeat-schedule - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ - -token.json -credentials.json -mpraw/* diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 8c40a5f90a..0000000000 --- a/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2018 Materials Project - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
diff --git a/README.md b/README.md deleted file mode 100644 index 9045646106..0000000000 --- a/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# mp-nomad -Disseminate raw MP calculations through NoMaD diff --git a/retrieve_mpraw_data.py b/emmet/emmet/scripts/retrieve_mpraw_data.py similarity index 100% rename from retrieve_mpraw_data.py rename to emmet/emmet/scripts/retrieve_mpraw_data.py diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 9e850ed138..0000000000 --- a/requirements.txt +++ /dev/null @@ -1,13 +0,0 @@ -cachetools==3.0.0 -certifi==2018.10.15 -google-api-python-client==1.7.5 -google-auth==1.6.1 -google-auth-httplib2==0.0.3 -httplib2==0.12.0 -oauth2client==4.1.3 -pyasn1==0.4.4 -pyasn1-modules==0.2.2 -rsa==4.0 -six==1.11.0 -tqdm==4.28.1 -uritemplate==3.0.0 From 065f58e501652dce15c291aa05a9a4e37dc700bd Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 5 Feb 2019 12:35:19 -0800 Subject: [PATCH 94/97] cli: fix mpnomad location --- emmet/{emmet => }/scripts/retrieve_mpraw_data.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename emmet/{emmet => }/scripts/retrieve_mpraw_data.py (100%) diff --git a/emmet/emmet/scripts/retrieve_mpraw_data.py b/emmet/scripts/retrieve_mpraw_data.py similarity index 100% rename from emmet/emmet/scripts/retrieve_mpraw_data.py rename to emmet/scripts/retrieve_mpraw_data.py From 4bad927cef4b20aa095aaffa2e97d253a6b6a5c9 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 5 Feb 2019 12:46:23 -0800 Subject: [PATCH 95/97] cli: retire retrieve_mpraw_data --- emmet/scripts/emmet.py | 43 +++++++++- emmet/scripts/retrieve_mpraw_data.py | 121 --------------------------- 2 files changed, 41 insertions(+), 123 deletions(-) delete mode 100644 emmet/scripts/retrieve_mpraw_data.py diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index a67bed480c..ffe4a24216 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,4 +1,4 @@ -import click, os, yaml, sys, logging, tarfile, bson, gzip, csv, tarfile, itertools, multiprocessing, math +import click, os, yaml, sys, logging, tarfile, bson, gzip, csv, tarfile, itertools, multiprocessing, math, io, requests from shutil import copyfile, rmtree from glob import glob from fnmatch import fnmatch @@ -25,7 +25,8 @@ from googleapiclient.discovery import build from httplib2 import Http from oauth2client import file, client, tools -from googleapiclient.http import MediaFileUpload +from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload +from tqdm import tqdm if 'FW_CONFIG_FILE' not in os.environ: print('Please set FW_CONFIG_FILE!') @@ -40,6 +41,8 @@ SCOPES = 'https://www.googleapis.com/auth/drive' current_year = int(datetime.today().year) year_tags = ['mp_{}'.format(y) for y in range(2018, current_year+1)] +NOMAD_OUTDIR = '/nomad/nomadlab/mpraw' +NOMAD_REPO = 'http://backend-repository-nomad.esc:8111/repo/search/calculations_oldformat?query={}' def aggregate_by_formula(coll, q, key=None): query = {'$and': [q, exclude]} @@ -1283,6 +1286,17 @@ def upload_archive(path, name, service, parent=None): print("Uploaded %d%%." 
% int(status.progress() * 100)) print("Upload Complete!") +def download_file(service, file_id): + request = service.files().get_media(fileId=file_id) + fh = io.BytesIO() + downloader = MediaIoBaseDownload(fh, request) + done = False + with tqdm(total=100) as pbar: + while done is False: + status, done = downloader.next_chunk() + pbar.update(int(status.progress() * 100)) + return fh.getvalue() + @cli.command() @click.argument('target_db_file', type=click.Path(exists=True)) @click.option('--block-filter', '-f', help='block filter substring (e.g. block_2017-)') @@ -1324,6 +1338,28 @@ def recurse(service, folder_id): launcher_name = launcher['name'].replace('.tar.gz', '') full_launcher_path.append(launcher_name) launcher_paths.append(os.path.join(*full_launcher_path)) + + # TODO NoMaD integration + #nomad_query='repository_main_file_uri="{}"'.format(launcher_name) + ##nomad_query='alltarget repository_uri.split="{}"'.format(','.join(full_launcher_path)) # TODO + #print(nomad_query) + #resp = requests.get(NOMAD_REPO.format(nomad_query)).json() + #if 'meta' in resp: + # path = os.path.join(*full_launcher_path) + '.tar.gz' + # if resp['meta']['total_hits'] < 1: # calculation not found in NoMaD repo + # print('Retrieve', path, '...') + # if not os.path.exists(path): + # os.makedirs(path) + # #content = download_file(service, launcher['id']) + # #with open(path, 'wb') as f: + # # f.write(content) + # print('... DONE.') + # else: + # print(path, 'found in NoMaD repo:') + # for d in resp['data']: + # print('\t', d['attributes']['repository_uri']) + #else: + # raise Exception(resp['errors'][0]['detail']) else: full_launcher_path.append(launcher['name']) recurse(service, launcher['id']) @@ -1334,6 +1370,9 @@ def recurse(service, folder_id): if page_token is None: break # done with launchers in current block + + # TODO older launcher directories don't have prefix + # TODO also cover non-b/l hierarchy block_page_token = None block_query = "'{}' in parents".format(garden_id) if block_filter is None \ else "'{}' in parents and name contains '{}'".format(garden_id, block_filter) diff --git a/emmet/scripts/retrieve_mpraw_data.py b/emmet/scripts/retrieve_mpraw_data.py deleted file mode 100644 index 7360afa514..0000000000 --- a/emmet/scripts/retrieve_mpraw_data.py +++ /dev/null @@ -1,121 +0,0 @@ -from __future__ import print_function -import io, os, sys -from googleapiclient.discovery import build -from httplib2 import Http -from oauth2client import file, client, tools -from googleapiclient.http import MediaIoBaseDownload -from tqdm import tqdm -import requests - -# If modifying these scopes, delete the file token.json. 
-# see https://developers.google.com/identity/protocols/googlescopes#drivev3 -SCOPES = 'https://www.googleapis.com/auth/drive' -OUTDIR = '/nomad/nomadlab/mpraw' -NOMAD_REPO = 'http://backend-repository-nomad.esc:8111/repo/search/calculations_oldformat?query={}' - -def download_file(service, file_id): - request = service.files().get_media(fileId=file_id) - fh = io.BytesIO() - downloader = MediaIoBaseDownload(fh, request) - done = False - with tqdm(total=100) as pbar: - while done is False: - status, done = downloader.next_chunk() - pbar.update(int(status.progress() * 100)) - return fh.getvalue() - -full_launcher_path = [] - -def recurse(service, folder_id): - page_token = None - query = "'{}' in parents".format(folder_id) - while True: - response = service.files().list( - q=query, spaces='drive', pageToken=page_token, - fields='nextPageToken, files(id, name, modifiedTime, size)', - pageSize=50 - ).execute() - - for launcher in response['files']: - if '.tar.gz' in launcher['name']: - print(launcher) - launcher_name = launcher['name'].replace('.tar.gz', '') - full_launcher_path.append(launcher_name) - nomad_query='repository_main_file_uri="{}"'.format(launcher_name) - #nomad_query='alltarget repository_uri.split="{}"'.format(','.join(full_launcher_path)) # TODO - print(nomad_query) - resp = requests.get(NOMAD_REPO.format(nomad_query)).json() - if 'meta' in resp: - path = os.path.join(*full_launcher_path) + '.tar.gz' - if resp['meta']['total_hits'] < 1: # calculation not found in NoMaD repo - print('Retrieve', path, '...') - if not os.path.exists(path): - os.makedirs(path) - #content = download_file(service, launcher['id']) - #with open(path, 'wb') as f: - # f.write(content) - print('... DONE.') - else: - print(path, 'found in NoMaD repo:') - for d in resp['data']: - print('\t', d['attributes']['repository_uri']) - else: - raise Exception(resp['errors'][0]['detail']) - else: - full_launcher_path.append(launcher['name']) - recurse(service, launcher['id']) - - del full_launcher_path[-1:] - - page_token = response.get('nextPageToken', None) - if page_token is None: - break # done with launchers in current block - -def main(): - """Shows basic usage of the Drive v3 API. - Prints the names and ids of the first 10 files the user has access to. - """ - # The file token.json stores the user's access and refresh tokens, and is - # created automatically when the authorization flow completes for the first - # time. 
- store = file.Storage('token.json') - creds = store.get() - if not creds or creds.invalid: - flow = client.flow_from_clientsecrets('credentials.json', SCOPES) - creds = tools.run_flow(flow, store) - service = build('drive', 'v3', http=creds.authorize(Http())) - - # Call the Drive v3 API - # https://developers.google.com/drive/api/v3/search-parameters#fn1 - # TODO older launcher directories don't have prefix - # TODO also cover non-b/l hierarchy - block_page_token = None - garden_id = os.environ.get('MPDRIVE_GARDEN_ID') - if garden_id: - #block_query = "'{}' in parents and name contains 'block_'".format(garden_id) - block_query = "'{}' in parents and name contains 'block_2011-10-07-08-57-17-804213'".format(garden_id) - else: - print('MPDRIVE_GARDEN_ID not set!') - return - - while True: - block_response = service.files().list( - q=block_query, spaces='drive', pageToken=block_page_token, - fields='nextPageToken, files(id, name)', pageSize=10 - ).execute() - - for block in block_response['files']: - print(block['name']) - full_launcher_path.clear() - full_launcher_path.append(block['name']) - recurse(service, block['id']) - - block_page_token = block_response.get('nextPageToken', None) - if block_page_token is None: - break # done with blocks - - # TODO in production, subscribe to watch garden directory? - # https://developers.google.com/drive/api/v3/reference/files/watch - -if __name__ == '__main__': - main() From bcb24d486b1d01c0e75e8862b8f3ec12beb5e7b6 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 5 Feb 2019 22:49:55 +0100 Subject: [PATCH 96/97] cli: don't import DLSVolumePredictor --- emmet/scripts/emmet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index ffe4a24216..e3bbdd83e2 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -8,7 +8,7 @@ from pymongo.errors import CursorNotFound from pymongo.collection import ReturnDocument from pymongo.errors import DocumentTooLarge -from pymatgen.analysis.structure_prediction.volume_predictor import DLSVolumePredictor +#from pymatgen.analysis.structure_prediction.volume_predictor import DLSVolumePredictor from pymatgen import Structure from pymatgen.alchemy.materials import TransformedStructure from pymatgen.util.provenance import StructureNL, Author @@ -597,7 +597,7 @@ def wflows(add_snlcolls, add_tasks_db, tag, insert, clear_logs, max_structures, print(tasks_coll.count(), 'tasks in', full_name) NO_POTCARS = ['Po', 'At', 'Rn', 'Fr', 'Ra', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr'] - vp = DLSVolumePredictor() + #vp = DLSVolumePredictor() tags = OrderedDict() if tag is None: From 42f0e80492d0ecbdbc6b02c73ed7927f3039c708 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 5 Feb 2019 22:50:18 +0100 Subject: [PATCH 97/97] cli: better launchpad autoload --- emmet/scripts/emmet.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index e3bbdd83e2..fb27fc5493 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -28,9 +28,11 @@ from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload from tqdm import tqdm -if 'FW_CONFIG_FILE' not in os.environ: - print('Please set FW_CONFIG_FILE!') - sys.exit(0) +def get_lpad(): + if 'FW_CONFIG_FILE' not in os.environ: + print('Please set FW_CONFIG_FILE!') + sys.exit(0) + return LaunchPad.auto_load() exclude = {'about.remarks': {'$nin': ['DEPRECATED', 'deprecated']}} skip_labels = ['He', 
'He0+', 'Ar', 'Ar0+', 'Ne', 'Ne0+', 'D', 'D+'] @@ -167,7 +169,7 @@ def get_vasp_dirs(scan_path, base_path, max_dirs, insert): def parse_vasp_dirs(vaspdirs, insert, drone, already_inserted_subdirs): name = multiprocessing.current_process().name print(name, 'starting') - lpad = LaunchPad.auto_load() + lpad = get_lpad() target = VaspCalcDb(lpad.host, lpad.port, lpad.name, 'tasks', lpad.username, lpad.password) print(name, 'connected to target db with', target.collection.count(), 'tasks') @@ -253,7 +255,7 @@ def copy(target_db_file, tag, insert, copy_snls): if not insert: print('DRY RUN: add --insert flag to actually add tasks to production') - lpad = LaunchPad.auto_load() + lpad = get_lpad() source = VaspCalcDb(lpad.host, lpad.port, lpad.name, 'tasks', lpad.username, lpad.password) print('connected to source db with', source.collection.count(), 'tasks') @@ -448,7 +450,7 @@ def insert_snls(snls_list): @click.option('--add_tasks_db', type=click.Path(exists=True), help='config file for additional tasks collection to scan') def find(email, add_snlcolls, add_tasks_db): """checks status of calculations by submitter or author email in SNLs""" - lpad = LaunchPad.auto_load() + lpad = get_lpad() snl_collections = [lpad.db.snls] if add_snlcolls is not None: @@ -506,7 +508,7 @@ def find(email, add_snlcolls, add_tasks_db): @click.option('--insert/--no-insert', default=False, help='actually execute workflow addition') def bandstructure(target_db_file, insert): """add workflows for bandstructure based on materials collection""" - lpad = LaunchPad.auto_load() + lpad = get_lpad() source = VaspCalcDb(lpad.host, lpad.port, lpad.name, 'tasks', lpad.username, lpad.password) print('connected to source db with', source.collection.count(), 'tasks') target = VaspCalcDb.from_db_file(target_db_file, admin=True) @@ -564,7 +566,7 @@ def wflows(add_snlcolls, add_tasks_db, tag, insert, clear_logs, max_structures, if not insert: print('DRY RUN! Add --insert flag to actually add workflows') - lpad = LaunchPad.auto_load() + lpad = get_lpad() snl_collections = [lpad.db.snls] if add_snlcolls is not None: @@ -980,7 +982,7 @@ def format(self, record): def report(tag, in_progress, to_csv): """generate a report of calculations status""" - lpad = LaunchPad.auto_load() + lpad = get_lpad() states = Firework.STATE_RANKS states = sorted(states, key=states.get) @@ -1108,7 +1110,7 @@ def add_snls(tag, input_structures, add_snlcolls, insert): meta = yaml.safe_load(f) meta['authors'] = [Author.parse_author(a) for a in meta['authors']] - lpad = LaunchPad.auto_load() + lpad = get_lpad() snl_collections = [lpad.db.snls] if add_snlcolls is not None: for snl_db_config in yaml.load_all(open(add_snlcolls, 'r')): @@ -1228,7 +1230,7 @@ def parse(base_path, add_snlcolls, insert, make_snls, nproc, max_dirs): if not insert: print('DRY RUN: add --insert flag to actually insert tasks') - lpad = LaunchPad.auto_load() + lpad = get_lpad() target = VaspCalcDb(lpad.host, lpad.port, lpad.name, 'tasks', lpad.username, lpad.password) print('connected to target db with', target.collection.count(), 'tasks') base_path = os.path.join(base_path, '')
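
Note on this last change: the practical effect of moving from a module-level FW_CONFIG_FILE check to get_lpad() is that importing emmet.scripts.emmet (and running `emmet --help`) no longer requires a FireWorks config; the LaunchPad is only auto-loaded once a command that actually touches the database calls get_lpad(). A minimal, self-contained sketch of that pattern follows; the config path under mp_prod is illustrative and not taken from these patches.

    # sketch of the deferred LaunchPad autoload pattern from PATCH 97 (paths illustrative)
    import os
    import sys

    from fireworks import LaunchPad

    def get_lpad():
        # refuse to guess a database: bail out unless FireWorks knows its config
        if 'FW_CONFIG_FILE' not in os.environ:
            print('Please set FW_CONFIG_FILE!')
            sys.exit(0)
        return LaunchPad.auto_load()

    if __name__ == '__main__':
        # hypothetical usage: point FireWorks at a config, then connect lazily
        os.environ.setdefault('FW_CONFIG_FILE',
                              os.path.expanduser('~/mp_prod/config/FW_config.yaml'))
        lpad = get_lpad()
        print('connected to', lpad.name)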