From fc1b5e1552677e880a08bfa90ff0580401458bdd Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 27 Apr 2018 14:31:19 -0700 Subject: [PATCH 01/97] start cli with add_tasks command --- emmet/scripts/__init__.py | 0 emmet/scripts/emmet.py | 76 +++++++++++++++++++++++++++++++++++++++ setup.py | 10 ++++-- 3 files changed, 84 insertions(+), 2 deletions(-) create mode 100644 emmet/scripts/__init__.py create mode 100644 emmet/scripts/emmet.py diff --git a/emmet/scripts/__init__.py b/emmet/scripts/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py new file mode 100644 index 0000000000..3522c87c12 --- /dev/null +++ b/emmet/scripts/emmet.py @@ -0,0 +1,76 @@ +import click, os +from atomate.vasp.database import VaspCalcDb + +@click.group() +def cli(): + pass + +@cli.command() +@click.option('--source_db_file', default="source.json", help='source db file') +@click.option('--target_db_file', default="target.json", help='target db file') +@click.option('--tag', default=None, help='only insert tasks with specific tag') +@click.option('--insert/--no-insert', default=False, help='actually execute task addition') +def add_tasks(source_db_file, target_db_file, tag, insert): + """Retrieve tasks from source and add to target""" + + def get_subdir(dn): + return dn.rsplit(os.sep, 1)[-1] + + if not os.path.exists(source_db_file): + print(source_db_file, 'not found!') + return + source = VaspCalcDb.from_db_file(source_db_file, admin=True) # '../config/db.json' + print('connected to source db with', source.collection.count(), 'tasks') + + if not os.path.exists(target_db_file): + print(target_db_file, 'not found!') + return + target = VaspCalcDb.from_db_file(target_db_file, admin=True) # 'db_atomate.json' + print('connected to target db with', target.collection.count(), 'tasks') + + tags = [tag] + if tag is None: + tags = [t for t in source.collection.distinct('tags') if t is not None] + print(len(tags), 'tags in source collection') + + for t in tags: + + print('tag:', t) + query = {'tags': t} + source_count = source.collection.count(query) + print('source:', source_count, 'tasks out of', source.collection.count()) + print('target:', target.collection.count(query), 'tasks out of', target.collection.count()) + + # skip tasks with task_id existing in target (have to be a string [mp-*, mvc-*]) + source_task_ids = source.collection.find(query).distinct('task_id') + source_mp_task_ids = [task_id for task_id in source_task_ids if isinstance(task_id, str)] + skip_task_ids = target.collection.find({'task_id': {'$in': source_mp_task_ids}}).distinct('task_id') + print('skip', len(skip_task_ids), 'existing MP task ids out of', len(source_mp_task_ids)) + + query.update({'task_id': {'$nin': skip_task_ids}}) + already_inserted_subdirs = [get_subdir(dn) for dn in target.collection.find(query).distinct('dir_name')] + subdirs = [get_subdir(dn) for dn in source.collection.find(query).distinct('dir_name') if get_subdir(dn) not in already_inserted_subdirs] + print(len(subdirs), 'candidate tasks to insert') + if len(subdirs) < 1: + continue + + if not insert: + print('add --insert flag to actually add tasks to production') + continue + + for subdir in subdirs: + subdir_query = {'dir_name': {'$regex': '/{}$'.format(subdir)}} + doc = target.collection.find_one(subdir_query, {'task_id': 1}) + if doc: + print(subdir, 'already inserted as', doc['task_id']) + continue + + source_task_id = source.collection.find_one(subdir_query, {'task_id': 1})['task_id'] + 
print('retrieve', source_task_id, 'for', subdir) + task_doc = source.retrieve_task(source_task_id) + + if isinstance(task_doc['task_id'], int): + c = target.db.counter.find_one_and_update({"_id": "taskid"}, {"$inc": {"c": 1}}, return_document=ReturnDocument.AFTER)["c"] + task_doc['task_id'] = 'mp-{}'.format(c) + + target.insert_task(task_doc, use_gridfs=True) diff --git a/setup.py b/setup.py index 360464490d..2e76da10e2 100644 --- a/setup.py +++ b/setup.py @@ -17,12 +17,13 @@ author_email='matproj-develop@googlegroups.com', license='modified BSD', packages=find_packages(), + include_package_data=True, package_data={}, zip_safe=False, install_requires=[ 'atomate', 'pymatgen>=2018.4.20','maggma','monty', 'six', 'pydash', 'tqdm', 'matminer', - 'prettyplotlib', "pybtex" + 'prettyplotlib', "pybtex", "Click" ], classifiers=["Programming Language :: Python :: 3", "Programming Language :: Python :: 3.6", @@ -34,5 +35,10 @@ 'Topic :: Other/Nonlisted Topic', 'Topic :: Scientific/Engineering'], test_suite='nose.collector', - tests_require=['nose'] + tests_require=['nose'], + py_modules=['emmet'], + entry_points=''' + [console_scripts] + emmet=emmet.scripts.emmet:cli + ''', ) From 6d6d8f31783920c90031030088adf354f0c7903e Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 10 May 2018 13:44:12 -0700 Subject: [PATCH 02/97] save progress on add_tasks cli --- emmet/scripts/emmet.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 3522c87c12..d083c72691 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,5 +1,6 @@ import click, os from atomate.vasp.database import VaspCalcDb +from pymongo.collection import ReturnDocument @click.group() def cli(): @@ -13,6 +14,8 @@ def cli(): def add_tasks(source_db_file, target_db_file, tag, insert): """Retrieve tasks from source and add to target""" + exclude = {'tags': {'$ne': 'deprecated'}} + def get_subdir(dn): return dn.rsplit(os.sep, 1)[-1] @@ -28,32 +31,40 @@ def get_subdir(dn): target = VaspCalcDb.from_db_file(target_db_file, admin=True) # 'db_atomate.json' print('connected to target db with', target.collection.count(), 'tasks') + indexes = ['task_id', 'tags', 'dir_name'] + for index in indexes: + for db in [source, target]: + keys = [k.rsplit('_', 1)[0] for k in db.collection.index_information().keys()] + if index not in keys: + db.collection.ensure_index(index) + print('ensured index', index) + tags = [tag] if tag is None: - tags = [t for t in source.collection.distinct('tags') if t is not None] + tags = [t for t in source.collection.find(exclude).distinct('tags') if t is not None] print(len(tags), 'tags in source collection') for t in tags: - print('tag:', t) - query = {'tags': t} + print('### {} ###'.format(t)) + query = {'$and': [{'tags': t}, exclude]} source_count = source.collection.count(query) - print('source:', source_count, 'tasks out of', source.collection.count()) - print('target:', target.collection.count(query), 'tasks out of', target.collection.count()) + print('source / target:', source_count, '/', target.collection.count(query)) # skip tasks with task_id existing in target (have to be a string [mp-*, mvc-*]) source_task_ids = source.collection.find(query).distinct('task_id') source_mp_task_ids = [task_id for task_id in source_task_ids if isinstance(task_id, str)] skip_task_ids = target.collection.find({'task_id': {'$in': source_mp_task_ids}}).distinct('task_id') - print('skip', len(skip_task_ids), 'existing MP task ids out of', 
len(source_mp_task_ids)) + if len(skip_task_ids): + print('skip', len(skip_task_ids), 'existing MP task ids out of', len(source_mp_task_ids)) query.update({'task_id': {'$nin': skip_task_ids}}) already_inserted_subdirs = [get_subdir(dn) for dn in target.collection.find(query).distinct('dir_name')] subdirs = [get_subdir(dn) for dn in source.collection.find(query).distinct('dir_name') if get_subdir(dn) not in already_inserted_subdirs] - print(len(subdirs), 'candidate tasks to insert') if len(subdirs) < 1: continue + print(len(subdirs), 'candidate tasks to insert') if not insert: print('add --insert flag to actually add tasks to production') continue From 7f80f9b9bcbb8c25114054dcf3ac6e9529b317b3 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 12 Jun 2018 16:42:44 -0700 Subject: [PATCH 03/97] cli: add_wflows subcommand --- emmet/scripts/emmet.py | 356 +++++++++++++++++++++++++++++++++++++--- emmet/vasp/materials.py | 5 +- 2 files changed, 337 insertions(+), 24 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index d083c72691..40926cb241 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,17 +1,27 @@ -import click, os -from atomate.vasp.database import VaspCalcDb +import click, os, yaml, sys, logging, operator +from collections import Counter +from pymongo import MongoClient from pymongo.collection import ReturnDocument +from pymatgen.analysis.structure_prediction.volume_predictor import DLSVolumePredictor +from pymatgen import Structure +from fireworks import LaunchPad +from atomate.vasp.database import VaspCalcDb +from atomate.vasp.workflows.presets.core import wf_structure_optimization +from atomate.vasp.database import VaspCalcDb +from atomate.vasp.powerups import add_trackers, add_tags, add_additional_fields_to_taskdocs +from emmet.vasp.materials import group_structures, get_sg +from emmet.vasp.task_tagger import task_type +from log4mongo.handlers import MongoHandler @click.group() def cli(): pass @cli.command() -@click.option('--source_db_file', default="source.json", help='source db file') @click.option('--target_db_file', default="target.json", help='target db file') @click.option('--tag', default=None, help='only insert tasks with specific tag') @click.option('--insert/--no-insert', default=False, help='actually execute task addition') -def add_tasks(source_db_file, target_db_file, tag, insert): +def add_tasks(target_db_file, tag, insert): """Retrieve tasks from source and add to target""" exclude = {'tags': {'$ne': 'deprecated'}} @@ -19,11 +29,9 @@ def add_tasks(source_db_file, target_db_file, tag, insert): def get_subdir(dn): return dn.rsplit(os.sep, 1)[-1] - if not os.path.exists(source_db_file): - print(source_db_file, 'not found!') - return - source = VaspCalcDb.from_db_file(source_db_file, admin=True) # '../config/db.json' - print('connected to source db with', source.collection.count(), 'tasks') + lpad = LaunchPad.auto_load() + source = lpad.db.tasks + print('connected to source db with', source.count(), 'tasks') if not os.path.exists(target_db_file): print(target_db_file, 'not found!') @@ -31,28 +39,22 @@ def get_subdir(dn): target = VaspCalcDb.from_db_file(target_db_file, admin=True) # 'db_atomate.json' print('connected to target db with', target.collection.count(), 'tasks') - indexes = ['task_id', 'tags', 'dir_name'] - for index in indexes: - for db in [source, target]: - keys = [k.rsplit('_', 1)[0] for k in db.collection.index_information().keys()] - if index not in keys: - db.collection.ensure_index(index) - print('ensured 
index', index) + ensure_indexes(['task_id', 'tags', 'dir_name'], [source, target.collection]) tags = [tag] if tag is None: - tags = [t for t in source.collection.find(exclude).distinct('tags') if t is not None] + tags = [t for t in source.find(exclude).distinct('tags') if t is not None] print(len(tags), 'tags in source collection') for t in tags: print('### {} ###'.format(t)) query = {'$and': [{'tags': t}, exclude]} - source_count = source.collection.count(query) + source_count = source.count(query) print('source / target:', source_count, '/', target.collection.count(query)) # skip tasks with task_id existing in target (have to be a string [mp-*, mvc-*]) - source_task_ids = source.collection.find(query).distinct('task_id') + source_task_ids = source.find(query).distinct('task_id') source_mp_task_ids = [task_id for task_id in source_task_ids if isinstance(task_id, str)] skip_task_ids = target.collection.find({'task_id': {'$in': source_mp_task_ids}}).distinct('task_id') if len(skip_task_ids): @@ -60,7 +62,7 @@ def get_subdir(dn): query.update({'task_id': {'$nin': skip_task_ids}}) already_inserted_subdirs = [get_subdir(dn) for dn in target.collection.find(query).distinct('dir_name')] - subdirs = [get_subdir(dn) for dn in source.collection.find(query).distinct('dir_name') if get_subdir(dn) not in already_inserted_subdirs] + subdirs = [get_subdir(dn) for dn in source.find(query).distinct('dir_name') if get_subdir(dn) not in already_inserted_subdirs] if len(subdirs) < 1: continue @@ -76,7 +78,7 @@ def get_subdir(dn): print(subdir, 'already inserted as', doc['task_id']) continue - source_task_id = source.collection.find_one(subdir_query, {'task_id': 1})['task_id'] + source_task_id = source.find_one(subdir_query, {'task_id': 1})['task_id'] print('retrieve', source_task_id, 'for', subdir) task_doc = source.retrieve_task(source_task_id) @@ -85,3 +87,315 @@ def get_subdir(dn): task_doc['task_id'] = 'mp-{}'.format(c) target.insert_task(task_doc, use_gridfs=True) + + +@cli.command() +@click.argument('list_of_structures', type=click.File('rb')) +@click.option('-a', '--alt_tasks_db_file', type=click.Path(exists=True), help='config file for alternative tasks collection') +@click.option('--tag', default=None, help='only include structures with specific tag') +@click.option('--insert/--no-insert', default=False, help='actually execute workflow addition') +@click.option('--clear-logs/--no-clear-logs', default=False, help='clear MongoDB logs collection') +def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): + """add workflows for list of structures / SNLs (YAML config or JSON list of pymatgen structures""" + + exclude = {'about.remarks': {'$ne': 'DEPRECATED'}} + + if not insert: + print('DRY RUN! 
Add --insert flag to actually add workflows') + + try: + snl_db_config = yaml.load(list_of_structures) + snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) + snl_db = snl_db_conn[snl_db_config['db']] + snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) + snl_coll = snl_db[snl_db_config['collection']] + except Exception as ex: + print(ex) + # NOTE WIP might change it to use add_snls first, and then add_wflows based on SNL collection only + # TODO load pymatgen structures from JSON file into MongoDB collection + # TODO also fake-tag them, add SNL info + snl_coll = None + print('to be implemented') + return + print('# SNLs:\t', snl_coll.count(exclude)) + + lpad = LaunchPad.auto_load() + + logger = logging.getLogger('add_wflows') + mongo_handler = MongoHandler( + host=lpad.host, port=lpad.port, database_name=lpad.name, collection='add_wflows_logs', + username=lpad.username, password=lpad.password, authentication_db=lpad.name + ) + logger.addHandler(mongo_handler) + ensure_indexes(['level', 'snl_id', 'formula'], [mongo_handler.collection]) + if clear_logs: + mongo_handler.collection.drop() + + if alt_tasks_db_file is not None: + target = VaspCalcDb.from_db_file(alt_tasks_db_file, admin=True) + tasks_coll = target.collection + else: + tasks_coll = lpad.db.tasks + print('# tasks:', tasks_coll.count()) + + structure_keys = ['snl_id', 'lattice', 'sites', 'charge', 'about._materialsproject.task_id'] + NO_POTCARS = ['Po', 'At', 'Rn', 'Fr', 'Ra', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr'] + base_query = {'is_ordered': True, 'is_valid': True, 'nsites': {'$lt': 200}, 'sites.label': {'$nin': ['He', 'Ar', 'Ne']}} # exclude no electroneg elements + task_base_query = {'_mpworks_meta': {'$exists': 0}} + vp = DLSVolumePredictor() + + ensure_indexes(['snl_id', 'reduced_cell_formula', 'about.remarks', 'sites.label'], [snl_coll]) + + tags = [tag] + if tag is None: + tags = dict( + (t, snl_coll.count({'$and': [{'about.remarks': t}, exclude]})) + for t in snl_coll.find(exclude).distinct('about.remarks') if t is not None + ) + tags = sorted(tags.items(), key=operator.itemgetter(1), reverse=True) + print(len(tags), 'tags in source collection') + + canonical_task_structures = {} + grouped_workflow_structures = {} + canonical_workflow_structures = {} + + for tag, ndocs in tags: + query = {'$and': [{'about.remarks': tag}, exclude]} + query.update(base_query) + + # TODO WIP will be removed + if tag == 'new_ordered_icsd_2017': + #TODO for new_ordered_icsd_2017: docs = db.icsd.find(query, {'snl': 1, 'formula_reduced_abc': 1, 'icsd_id': 1, 'elements': 1}) + print(tag, 'TODO implement db.icsd as snl_coll') + continue + elif tag == 'pre-atomate production': + # TODO scan last + continue + + print('aggregate', ndocs, 'structures for', tag, '...') + structure_groups = snl_coll.aggregate([ + {'$match': query}, {'$group': { + '_id': '$reduced_cell_formula', + 'structures': {'$push': dict((k.split('.')[-1], '${}'.format(k)) for k in structure_keys)} + }} + ], allowDiskUse=True, batchSize=50) + + print('loop formulas for', tag, '...') + counter = Counter() + structures, canonical_structures = {}, {} + + for idx_group, group in enumerate(structure_groups): + + counter['formulas'] += 1 + formula = group['_id'] + if formula not in structures: + structures[formula] = {} + if formula not in canonical_structures: + canonical_structures[formula] = {} + if idx_group and not idx_group%1000: + print(idx_group, '...') + + for dct in group['structures']: + if 
mongo_handler.collection.find_one({'level': 'WARNING', 'formula': formula, 'snl_id': dct['snl_id']}): + continue # already checked + counter['structures'] += 1 + s = Structure.from_dict(dct) + s.snl_id = dct['snl_id'] + s.task_id = dct.get('task_id') + s.remove_oxidation_states() + try: + sgnum = get_sg(s) + except Exception as ex: + s.to(fmt='json', filename='sgnum-{}.json'.format(s.snl_id)) + print(str(ex)) + sys.exit(0) + if sgnum not in structures[formula]: + structures[formula][sgnum] = [] + structures[formula][sgnum].append(s) + + for sgnum, slist in structures[formula].items(): + for g in group_structures(slist): + if sgnum not in canonical_structures[formula]: + canonical_structures[formula][sgnum] = [] + canonical_structures[formula][sgnum].append(g[0]) + if len(g) > 1: + for s in g[1:]: + logger.warning('duplicate structure', extra={ + 'formula': formula, 'snl_id': s.snl_id, 'canonical_snl_id': g[0].snl_id + }) + + if not canonical_structures[formula]: + continue + #print(sum([len(x) for x in canonical_structures[formula].values()]), 'canonical structure(s) for', formula) + + if formula not in canonical_workflow_structures: + canonical_workflow_structures[formula], grouped_workflow_structures[formula] = {}, {} + workflows = lpad.workflows.find({'metadata.formula_pretty': formula}, {'metadata.structure': 1, 'nodes': 1, 'parent_links': 1}) + if workflows.count() > 0: + workflow_structures = {} + for wf in workflows: + s = Structure.from_dict(wf['metadata']['structure']) + s.remove_oxidation_states() + sgnum = get_sg(s) + if sgnum in canonical_structures[formula]: + if sgnum not in workflow_structures: + workflow_structures[sgnum] = [] + s.fw_id = [n for n in wf['nodes'] if str(n) not in wf['parent_links']][0] # first node = SO firework + workflow_structures[sgnum].append(s) + if workflow_structures: + for sgnum, slist in workflow_structures.items(): + grouped_workflow_structures[formula][sgnum] = [g for g in group_structures(slist)] + canonical_workflow_structures[formula][sgnum] = [g[0] for g in grouped_workflow_structures[formula][sgnum]] + #print(sum([len(x) for x in canonical_workflow_structures[formula].values()]), 'canonical workflow structure(s) for', formula) + + for idx_canonical, (sgnum, slist) in enumerate(canonical_structures[formula].items()): + + for struc in slist: + + try: + struct = vp.get_predicted_structure(struc) + struct.snl_id, struct.task_id = struc.snl_id, struc.task_id + except Exception as ex: + print('Structure for SNL', struc.snl_id, '--> VP error: use original structure!') + print(ex) + struct = struc + + if not structures_match(struct, struc): + print('Structure for SNL', struc.snl_id, '--> VP mismatch: use original structure!') + struct = struc + + wf_found, readd_wf = False, False + if sgnum in canonical_workflow_structures[formula] and canonical_workflow_structures[formula][sgnum]: + for sidx, s in enumerate(canonical_workflow_structures[formula][sgnum]): + if structures_match(struct, s): + msg = 'Structure for SNL {} already added in WF {}'.format(struct.snl_id, s.fw_id) + print(msg) + if struct.task_id is not None: + task_query = {'task_id': struct.task_id} + task_query.update(task_base_query) + task = tasks_coll.find_one(task_query, ['input.structure']) + if task: + s_task = Structure.from_dict(task['input']['structure']) + s_task.remove_oxidation_states() + if not structures_match(struct, s_task): + msg = ' --> ERROR: Structure for SNL {} does not match {}'.format(struct.snl_id, struct.task_id) + print(msg) + logger.error(msg, extra={ + 
'formula': formula, 'snl_id': struct.snl_id, 'error': 'SNL-TASK structure mismatch' + }) + counter['snl-task_mismatch'] += 1 + else: + msg = ' --> OK: workflow resulted in matching task {}'.format(struct.task_id) + print(msg) + logger.warning(msg, extra={ + 'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id + }) + else: + print(' --> did not find task', struct.task_id, 'for WF', s.fw_id) + fw_ids = [x.fw_id for x in grouped_workflow_structures[formula][sgnum][sidx]] + fws = lpad.fireworks.find({'fw_id': {'$in': fw_ids}}, ['fw_id', 'spec._tasks']) + fw_found = False + for fw in fws: + if fw['spec']['_tasks'][5]['additional_fields'].get('task_id') == struct.task_id: + msg = ' --> OK: workflow {} will result in intended task-id {}'.format(fw['fw_id'], struct.task_id) + print(msg) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id}) + fw_found = True + break + if not fw_found: + print(' --> no WF with enforced task-id', struct.task_id, '-> re-add workflow') + readd_wf = True + break + else: + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id}) + wf_found = True + break + + if wf_found: + continue + + # need to check tasks b/c not every task is guaranteed to have a workflow (e.g. VASP dir parsing) + if not readd_wf: + try: + if formula not in canonical_task_structures: + canonical_task_structures[formula] = {} + task_query = {'formula_pretty': formula} + task_query.update(task_base_query) + tasks = tasks_coll.find(task_query, {'input.structure': 1, 'task_id': 1, 'orig_inputs': 1}) + if tasks.count() > 0: + task_structures = {} + for task in tasks: + task_label = task_type(task['orig_inputs'], include_calc_type=False) + if task_label == "Structure Optimization": + s = Structure.from_dict(task['input']['structure']) + sg = get_sg(s) + if sg in canonical_structures[formula]: + if sg not in task_structures: + task_structures[sg] = [] + s.task_id = task['task_id'] + task_structures[sg].append(s) + if task_structures: + for sg, slist in task_structures.items(): + canonical_task_structures[formula][sg] = [g[0] for g in group_structures(slist)] + #print(sum([len(x) for x in canonical_task_structures[formula].values()]), 'canonical task structure(s) for', formula) + + matched_task_ids = [] + if sgnum in canonical_task_structures[formula] and canonical_task_structures[formula][sgnum]: + for s in canonical_task_structures[formula][sgnum]: + if structures_match(struct, s): + print('Structure for SNL', struct.snl_id, 'already added in task', s.task_id) + matched_task_ids.append(s.task_id) + if struct.task_id is not None and matched_task_ids and struct.task_id not in matched_task_ids: + print(' --> ERROR: task', struct.task_id, 'not in', matched_task_ids) + raise ValueError + if matched_task_ids: + logger.warning('matched task ids', extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id(s)': matched_task_ids}) + continue + except ValueError as ex: + counter['unmatched_task_id'] += 1 + continue + + msg = 'Structure for SNL {} --> ADD WORKFLOW'.format(struct.snl_id) + if struct.task_id is not None: + msg += ' --> enforcing task-id {}'.format(struct.task_id) + print(msg) + + no_potcars = set(NO_POTCARS) & set(struct.composition.elements) + if len(no_potcars) > 0: + msg = 'Structure for SNL {} --> NO POTCARS: {}'.format(struct.snl_id, no_potcars) + print(msg) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'no_potcars': no_potcars}) + continue + + try: + wf = 
wf_structure_optimization(struct, c={'ADD_MODIFY_INCAR': True}) + wf = add_trackers(wf) + wf = add_tags(wf, [tag]) + if struct.task_id is not None: + wf = add_additional_fields_to_taskdocs(wf, update_dict={'task_id': struct.task_id}) + #if struct.icsd_id is not None: + # wf = add_additional_fields_to_taskdocs(wf, update_dict={'icsd_id': struct.icsd_id}) + except: + msg = 'Structure for SNL {} --> SKIP: Could not make workflow'.format(struct.snl_id) + print(msg) + logger.error(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'error': 'could not make workflow'}) + continue + + if insert: + old_new = lpad.add_wf(wf) + logger.warning('workflow added', extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': list(old_new.values())[0]}) + counter['add(ed)'] += 1 + + print(counter) + + +def structures_match(s1, s2): + return bool(len(list(group_structures([s1, s2]))) == 1) + +def ensure_indexes(indexes, colls): + for index in indexes: + for coll in colls: + keys = [k.rsplit('_', 1)[0] for k in coll.index_information().keys()] + if index not in keys: + coll.ensure_index(index) + print('ensured index', index, 'on', coll.full_name) diff --git a/emmet/vasp/materials.py b/emmet/vasp/materials.py index 3a8a41a843..e0ff39b325 100644 --- a/emmet/vasp/materials.py +++ b/emmet/vasp/materials.py @@ -274,6 +274,8 @@ def ensure_indicies(self): self.materials.ensure_index("task_ids") self.materials.ensure_index(self.materials.lu_field) +def get_sg(struc): + return struc.get_space_group_info(symprec=0.1)[1] def structure_metadata(structure): """ @@ -319,9 +321,6 @@ def group_structures(structures, ltol=0.2, stol=0.3, angle_tol=5, separate_mag_o allow_subset=False, comparator=ElementComparator()) - def get_sg(struc): - return struc.get_space_group_info(symprec=0.1)[1] - def get_mag_ordering(struc): return CollinearMagneticStructureAnalyzer(struc).ordering.value From f2ed91ec5f90f7d9caea8fe0de4583a69ecca2bb Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 14 Jun 2018 14:20:43 -0700 Subject: [PATCH 04/97] cli: CursorNotFound, electroneg query --- emmet/scripts/emmet.py | 424 ++++++++++++++++++++++------------------- 1 file changed, 223 insertions(+), 201 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 40926cb241..5920cc9adb 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,6 +1,7 @@ import click, os, yaml, sys, logging, operator -from collections import Counter +from collections import Counter, OrderedDict from pymongo import MongoClient +from pymongo.errors import CursorNotFound from pymongo.collection import ReturnDocument from pymatgen.analysis.structure_prediction.volume_predictor import DLSVolumePredictor from pymatgen import Structure @@ -140,20 +141,29 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): structure_keys = ['snl_id', 'lattice', 'sites', 'charge', 'about._materialsproject.task_id'] NO_POTCARS = ['Po', 'At', 'Rn', 'Fr', 'Ra', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr'] - base_query = {'is_ordered': True, 'is_valid': True, 'nsites': {'$lt': 200}, 'sites.label': {'$nin': ['He', 'Ar', 'Ne']}} # exclude no electroneg elements + no_electroneg = ['He', 'He0+', 'Ar', 'Ar0+', 'Ne', 'Ne0+'] + base_query = {'is_ordered': True, 'is_valid': True, 'nsites': {'$lt': 200}, 'sites.label': {'$nin': no_electroneg}} task_base_query = {'_mpworks_meta': {'$exists': 0}} vp = DLSVolumePredictor() - ensure_indexes(['snl_id', 'reduced_cell_formula', 'about.remarks', 'sites.label'], [snl_coll]) + 
ensure_indexes(['snl_id', 'reduced_cell_formula', 'about.remarks', 'sites.label', 'nsites', 'nelements'], [snl_coll]) - tags = [tag] + tags = [] if tag is None: - tags = dict( - (t, snl_coll.count({'$and': [{'about.remarks': t}, exclude]})) - for t in snl_coll.find(exclude).distinct('about.remarks') if t is not None - ) + query = dict(exclude) + query.update(base_query) + remarks = filter(None, snl_coll.find(query).distinct('about.remarks')) + for t in remarks: + query = {'$and': [{'about.remarks': t}, exclude]} + query.update(base_query) + tags.append((t, snl_coll.count(query))) tags = sorted(tags.items(), key=operator.itemgetter(1), reverse=True) - print(len(tags), 'tags in source collection') + print(len(tags), 'tags in source collection => TOP10:') + print('\n'.join(['{} ({})'.format(*t) for t in tags[:10]])) + else: + query = {'$and': [{'about.remarks': tag}, exclude]} + query.update(base_query) + tags = [(tag, snl_coll.count(query))] canonical_task_structures = {} grouped_workflow_structures = {} @@ -174,217 +184,229 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): print('aggregate', ndocs, 'structures for', tag, '...') structure_groups = snl_coll.aggregate([ - {'$match': query}, {'$group': { + {'$match': query}, {'$sort': OrderedDict([('nelements', 1), ('nsites', 1)])}, + {'$group': { '_id': '$reduced_cell_formula', 'structures': {'$push': dict((k.split('.')[-1], '${}'.format(k)) for k in structure_keys)} }} - ], allowDiskUse=True, batchSize=50) + ], allowDiskUse=True, batchSize=1) print('loop formulas for', tag, '...') counter = Counter() structures, canonical_structures = {}, {} - for idx_group, group in enumerate(structure_groups): - - counter['formulas'] += 1 - formula = group['_id'] - if formula not in structures: - structures[formula] = {} - if formula not in canonical_structures: - canonical_structures[formula] = {} - if idx_group and not idx_group%1000: - print(idx_group, '...') - - for dct in group['structures']: - if mongo_handler.collection.find_one({'level': 'WARNING', 'formula': formula, 'snl_id': dct['snl_id']}): - continue # already checked - counter['structures'] += 1 - s = Structure.from_dict(dct) - s.snl_id = dct['snl_id'] - s.task_id = dct.get('task_id') - s.remove_oxidation_states() - try: - sgnum = get_sg(s) - except Exception as ex: - s.to(fmt='json', filename='sgnum-{}.json'.format(s.snl_id)) - print(str(ex)) - sys.exit(0) - if sgnum not in structures[formula]: - structures[formula][sgnum] = [] - structures[formula][sgnum].append(s) - - for sgnum, slist in structures[formula].items(): - for g in group_structures(slist): - if sgnum not in canonical_structures[formula]: - canonical_structures[formula][sgnum] = [] - canonical_structures[formula][sgnum].append(g[0]) - if len(g) > 1: - for s in g[1:]: - logger.warning('duplicate structure', extra={ - 'formula': formula, 'snl_id': s.snl_id, 'canonical_snl_id': g[0].snl_id - }) - - if not canonical_structures[formula]: - continue - #print(sum([len(x) for x in canonical_structures[formula].values()]), 'canonical structure(s) for', formula) - - if formula not in canonical_workflow_structures: - canonical_workflow_structures[formula], grouped_workflow_structures[formula] = {}, {} - workflows = lpad.workflows.find({'metadata.formula_pretty': formula}, {'metadata.structure': 1, 'nodes': 1, 'parent_links': 1}) - if workflows.count() > 0: - workflow_structures = {} - for wf in workflows: - s = Structure.from_dict(wf['metadata']['structure']) - s.remove_oxidation_states() - sgnum = get_sg(s) 
- if sgnum in canonical_structures[formula]: - if sgnum not in workflow_structures: - workflow_structures[sgnum] = [] - s.fw_id = [n for n in wf['nodes'] if str(n) not in wf['parent_links']][0] # first node = SO firework - workflow_structures[sgnum].append(s) - if workflow_structures: - for sgnum, slist in workflow_structures.items(): - grouped_workflow_structures[formula][sgnum] = [g for g in group_structures(slist)] - canonical_workflow_structures[formula][sgnum] = [g[0] for g in grouped_workflow_structures[formula][sgnum]] - #print(sum([len(x) for x in canonical_workflow_structures[formula].values()]), 'canonical workflow structure(s) for', formula) - - for idx_canonical, (sgnum, slist) in enumerate(canonical_structures[formula].items()): - - for struc in slist: - + try: + for idx_group, group in enumerate(structure_groups): + + counter['formulas'] += 1 + formula = group['_id'] + if formula not in structures: + structures[formula] = {} + if formula not in canonical_structures: + canonical_structures[formula] = {} + if idx_group and not idx_group%1000: + print(idx_group, '...') + + for dct in group['structures']: + if mongo_handler.collection.find_one({'level': 'WARNING', 'formula': formula, 'snl_id': dct['snl_id']}): + continue # already checked + mongo_handler.collection.remove({'level': 'ERROR', 'formula': formula, 'snl_id': dct['snl_id']}) # avoid dups + counter['structures'] += 1 + s = Structure.from_dict(dct) + s.snl_id = dct['snl_id'] + s.task_id = dct.get('task_id') + s.remove_oxidation_states() try: - struct = vp.get_predicted_structure(struc) - struct.snl_id, struct.task_id = struc.snl_id, struc.task_id + sgnum = get_sg(s) except Exception as ex: - print('Structure for SNL', struc.snl_id, '--> VP error: use original structure!') - print(ex) - struct = struc - - if not structures_match(struct, struc): - print('Structure for SNL', struc.snl_id, '--> VP mismatch: use original structure!') - struct = struc - - wf_found, readd_wf = False, False - if sgnum in canonical_workflow_structures[formula] and canonical_workflow_structures[formula][sgnum]: - for sidx, s in enumerate(canonical_workflow_structures[formula][sgnum]): - if structures_match(struct, s): - msg = 'Structure for SNL {} already added in WF {}'.format(struct.snl_id, s.fw_id) - print(msg) - if struct.task_id is not None: - task_query = {'task_id': struct.task_id} - task_query.update(task_base_query) - task = tasks_coll.find_one(task_query, ['input.structure']) - if task: - s_task = Structure.from_dict(task['input']['structure']) - s_task.remove_oxidation_states() - if not structures_match(struct, s_task): - msg = ' --> ERROR: Structure for SNL {} does not match {}'.format(struct.snl_id, struct.task_id) - print(msg) - logger.error(msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, 'error': 'SNL-TASK structure mismatch' - }) - counter['snl-task_mismatch'] += 1 - else: - msg = ' --> OK: workflow resulted in matching task {}'.format(struct.task_id) - print(msg) - logger.warning(msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id - }) - else: - print(' --> did not find task', struct.task_id, 'for WF', s.fw_id) - fw_ids = [x.fw_id for x in grouped_workflow_structures[formula][sgnum][sidx]] - fws = lpad.fireworks.find({'fw_id': {'$in': fw_ids}}, ['fw_id', 'spec._tasks']) - fw_found = False - for fw in fws: - if fw['spec']['_tasks'][5]['additional_fields'].get('task_id') == struct.task_id: - msg = ' --> OK: workflow {} will result in intended task-id {}'.format(fw['fw_id'], 
struct.task_id) - print(msg) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id}) - fw_found = True - break - if not fw_found: - print(' --> no WF with enforced task-id', struct.task_id, '-> re-add workflow') - readd_wf = True - break - else: - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id}) - wf_found = True - break - - if wf_found: + s.to(fmt='json', filename='sgnum-{}.json'.format(s.snl_id)) + msg = 'SNL {}: {}'.format(s.snl_id, ex) + logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'error': str(ex)}) continue + if sgnum not in structures[formula]: + structures[formula][sgnum] = [] + structures[formula][sgnum].append(s) + + for sgnum, slist in structures[formula].items(): + for g in group_structures(slist): + if sgnum not in canonical_structures[formula]: + canonical_structures[formula][sgnum] = [] + canonical_structures[formula][sgnum].append(g[0]) + if len(g) > 1: + for s in g[1:]: + logger.warning('duplicate structure', extra={ + 'formula': formula, 'snl_id': s.snl_id, 'canonical_snl_id': g[0].snl_id + }) + + if not canonical_structures[formula]: + continue + canonical_structures_list = [x for sublist in canonical_structures[formula].values() for x in sublist] + + if formula not in canonical_workflow_structures: + canonical_workflow_structures[formula], grouped_workflow_structures[formula] = {}, {} + workflows = lpad.workflows.find({'metadata.formula_pretty': formula}, {'metadata.structure': 1, 'nodes': 1, 'parent_links': 1}) + if workflows.count() > 0: + workflow_structures = {} + for wf in workflows: + s = Structure.from_dict(wf['metadata']['structure']) + s.remove_oxidation_states() + sgnum = get_sg(s) + if sgnum in canonical_structures[formula]: + if sgnum not in workflow_structures: + workflow_structures[sgnum] = [] + s.fw_id = [n for n in wf['nodes'] if str(n) not in wf['parent_links']][0] # first node = SO firework + workflow_structures[sgnum].append(s) + if workflow_structures: + for sgnum, slist in workflow_structures.items(): + grouped_workflow_structures[formula][sgnum] = [g for g in group_structures(slist)] + canonical_workflow_structures[formula][sgnum] = [g[0] for g in grouped_workflow_structures[formula][sgnum]] + #print(sum([len(x) for x in canonical_workflow_structures[formula].values()]), 'canonical workflow structure(s) for', formula) + + for idx_canonical, (sgnum, slist) in enumerate(canonical_structures[formula].items()): + + for struc in slist: - # need to check tasks b/c not every task is guaranteed to have a workflow (e.g. 
VASP dir parsing) - if not readd_wf: try: - if formula not in canonical_task_structures: - canonical_task_structures[formula] = {} - task_query = {'formula_pretty': formula} - task_query.update(task_base_query) - tasks = tasks_coll.find(task_query, {'input.structure': 1, 'task_id': 1, 'orig_inputs': 1}) - if tasks.count() > 0: - task_structures = {} - for task in tasks: - task_label = task_type(task['orig_inputs'], include_calc_type=False) - if task_label == "Structure Optimization": - s = Structure.from_dict(task['input']['structure']) - sg = get_sg(s) - if sg in canonical_structures[formula]: - if sg not in task_structures: - task_structures[sg] = [] - s.task_id = task['task_id'] - task_structures[sg].append(s) - if task_structures: - for sg, slist in task_structures.items(): - canonical_task_structures[formula][sg] = [g[0] for g in group_structures(slist)] - #print(sum([len(x) for x in canonical_task_structures[formula].values()]), 'canonical task structure(s) for', formula) - - matched_task_ids = [] - if sgnum in canonical_task_structures[formula] and canonical_task_structures[formula][sgnum]: - for s in canonical_task_structures[formula][sgnum]: - if structures_match(struct, s): - print('Structure for SNL', struct.snl_id, 'already added in task', s.task_id) - matched_task_ids.append(s.task_id) - if struct.task_id is not None and matched_task_ids and struct.task_id not in matched_task_ids: - print(' --> ERROR: task', struct.task_id, 'not in', matched_task_ids) - raise ValueError - if matched_task_ids: - logger.warning('matched task ids', extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id(s)': matched_task_ids}) - continue - except ValueError as ex: - counter['unmatched_task_id'] += 1 - continue + struct = vp.get_predicted_structure(struc) + struct.snl_id, struct.task_id = struc.snl_id, struc.task_id + except Exception as ex: + print('Structure for SNL', struc.snl_id, '--> VP error: use original structure!') + print(ex) + struct = struc + + if not structures_match(struct, struc): + print('Structure for SNL', struc.snl_id, '--> VP mismatch: use original structure!') + struct = struc + + wf_found, readd_wf = False, False + if sgnum in canonical_workflow_structures[formula] and canonical_workflow_structures[formula][sgnum]: + for sidx, s in enumerate(canonical_workflow_structures[formula][sgnum]): + if structures_match(struct, s): + msg = 'Structure for SNL {} already added in WF {}'.format(struct.snl_id, s.fw_id) + print(msg) + if struct.task_id is not None: + task_query = {'task_id': struct.task_id} + task_query.update(task_base_query) + task = tasks_coll.find_one(task_query, ['input.structure']) + if task: + s_task = Structure.from_dict(task['input']['structure']) + s_task.remove_oxidation_states() + if not structures_match(struct, s_task): + msg = ' --> ERROR: Structure for SNL {} does not match {}'.format(struct.snl_id, struct.task_id) + print(msg) + logger.error(msg, extra={ + 'formula': formula, 'snl_id': struct.snl_id, 'error': 'SNL-TASK structure mismatch' + }) + counter['snl-task_mismatch'] += 1 + else: + msg = ' --> OK: workflow resulted in matching task {}'.format(struct.task_id) + print(msg) + logger.warning(msg, extra={ + 'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id + }) + else: + print(' --> did not find task', struct.task_id, 'for WF', s.fw_id) + fw_ids = [x.fw_id for x in grouped_workflow_structures[formula][sgnum][sidx]] + fws = lpad.fireworks.find({'fw_id': {'$in': fw_ids}}, ['fw_id', 'spec._tasks']) + fw_found = False + for fw in fws: 
+ if fw['spec']['_tasks'][5]['additional_fields'].get('task_id') == struct.task_id: + msg = ' --> OK: workflow {} will result in intended task-id {}'.format(fw['fw_id'], struct.task_id) + print(msg) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id}) + fw_found = True + break + if not fw_found: + print(' --> no WF with enforced task-id', struct.task_id, '-> re-add workflow') + readd_wf = True + break + else: + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id}) + wf_found = True + break - msg = 'Structure for SNL {} --> ADD WORKFLOW'.format(struct.snl_id) - if struct.task_id is not None: - msg += ' --> enforcing task-id {}'.format(struct.task_id) - print(msg) + if wf_found: + continue - no_potcars = set(NO_POTCARS) & set(struct.composition.elements) - if len(no_potcars) > 0: - msg = 'Structure for SNL {} --> NO POTCARS: {}'.format(struct.snl_id, no_potcars) - print(msg) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'no_potcars': no_potcars}) - continue + # need to check tasks b/c not every task is guaranteed to have a workflow (e.g. VASP dir parsing) + if not readd_wf: + try: + if formula not in canonical_task_structures: + canonical_task_structures[formula] = {} + task_query = {'formula_pretty': formula} + task_query.update(task_base_query) + tasks = tasks_coll.find(task_query, {'input.structure': 1, 'task_id': 1, 'orig_inputs': 1}) + if tasks.count() > 0: + task_structures = {} + for task in tasks: + task_label = task_type(task['orig_inputs'], include_calc_type=False) + if task_label == "Structure Optimization": + s = Structure.from_dict(task['input']['structure']) + sg = get_sg(s) + if sg in canonical_structures[formula]: + if sg not in task_structures: + task_structures[sg] = [] + s.task_id = task['task_id'] + task_structures[sg].append(s) + if task_structures: + for sg, slist in task_structures.items(): + canonical_task_structures[formula][sg] = [g[0] for g in group_structures(slist)] + #print(sum([len(x) for x in canonical_task_structures[formula].values()]), 'canonical task structure(s) for', formula) + + matched_task_ids = [] + if sgnum in canonical_task_structures[formula] and canonical_task_structures[formula][sgnum]: + for s in canonical_task_structures[formula][sgnum]: + if structures_match(struct, s): + print('Structure for SNL', struct.snl_id, 'already added in task', s.task_id) + matched_task_ids.append(s.task_id) + if struct.task_id is not None and matched_task_ids and struct.task_id not in matched_task_ids: + print(' --> ERROR: task', struct.task_id, 'not in', matched_task_ids) + raise ValueError + if matched_task_ids: + logger.warning('matched task ids', extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id(s)': matched_task_ids}) + continue + except ValueError as ex: + counter['unmatched_task_id'] += 1 + continue - try: - wf = wf_structure_optimization(struct, c={'ADD_MODIFY_INCAR': True}) - wf = add_trackers(wf) - wf = add_tags(wf, [tag]) + msg = 'Structure for SNL {} --> ADD WORKFLOW'.format(struct.snl_id) if struct.task_id is not None: - wf = add_additional_fields_to_taskdocs(wf, update_dict={'task_id': struct.task_id}) - #if struct.icsd_id is not None: - # wf = add_additional_fields_to_taskdocs(wf, update_dict={'icsd_id': struct.icsd_id}) - except: - msg = 'Structure for SNL {} --> SKIP: Could not make workflow'.format(struct.snl_id) + msg += ' --> enforcing task-id {}'.format(struct.task_id) print(msg) - logger.error(msg, extra={'formula': 
formula, 'snl_id': struct.snl_id, 'error': 'could not make workflow'}) - continue - if insert: - old_new = lpad.add_wf(wf) - logger.warning('workflow added', extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': list(old_new.values())[0]}) - counter['add(ed)'] += 1 + no_potcars = set(NO_POTCARS) & set(struct.composition.elements) + if len(no_potcars) > 0: + msg = 'Structure for SNL {} --> NO POTCARS: {}'.format(struct.snl_id, no_potcars) + print(msg) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'no_potcars': no_potcars}) + continue + + try: + wf = wf_structure_optimization(struct, c={'ADD_MODIFY_INCAR': True}) + wf = add_trackers(wf) + wf = add_tags(wf, [tag]) + if struct.task_id is not None: + wf = add_additional_fields_to_taskdocs(wf, update_dict={'task_id': struct.task_id}) + #if struct.icsd_id is not None: + # wf = add_additional_fields_to_taskdocs(wf, update_dict={'icsd_id': struct.icsd_id}) + except: + msg = 'Structure for SNL {} --> SKIP: Could not make workflow'.format(struct.snl_id) + print(msg) + logger.error(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'error': 'could not make workflow'}) + continue + + if insert: + old_new = lpad.add_wf(wf) + logger.warning('workflow added', extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': list(old_new.values())[0]}) + counter['add(ed)'] += 1 + + except CursorNotFound as ex: + print(ex) + sites_elements = [ + (len(set([e.symbol for e in x.composition.elements])), x.num_sites) + for x in canonical_structures_list + ] + print(len(canonical_structures_list), 'canonical structure(s) for', formula, sites_elements) print(counter) From 08e56f6117f9731fe0373ebd45b03478de733a78 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 15 Jun 2018 16:57:35 -0700 Subject: [PATCH 05/97] cli: multiple tasks collections, enforce/clean-up task_ids --- emmet/scripts/emmet.py | 181 ++++++++++++++++++++++++++++------------- 1 file changed, 123 insertions(+), 58 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 5920cc9adb..6f553d6c54 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -54,16 +54,27 @@ def get_subdir(dn): source_count = source.count(query) print('source / target:', source_count, '/', target.collection.count(query)) - # skip tasks with task_id existing in target (have to be a string [mp-*, mvc-*]) - source_task_ids = source.find(query).distinct('task_id') - source_mp_task_ids = [task_id for task_id in source_task_ids if isinstance(task_id, str)] - skip_task_ids = target.collection.find({'task_id': {'$in': source_mp_task_ids}}).distinct('task_id') + # skip tasks with task_id existing in target and with matching dir_name (have to be a string [mp-*, mvc-*]) + nr_source_mp_tasks, skip_task_ids = 0, [] + for doc in source.find(query, ['task_id', 'dir_name']): + if isinstance(doc['task_id'], str): + nr_source_mp_tasks += 1 + task_query = {'task_id': doc['task_id'], 'dir_name': doc['dir_name']} + if target.collection.count(task_query): + skip_task_ids.append(doc['task_id']) if len(skip_task_ids): - print('skip', len(skip_task_ids), 'existing MP task ids out of', len(source_mp_task_ids)) + print('skip', len(skip_task_ids), 'existing MP task ids out of', nr_source_mp_tasks) query.update({'task_id': {'$nin': skip_task_ids}}) already_inserted_subdirs = [get_subdir(dn) for dn in target.collection.find(query).distinct('dir_name')] - subdirs = [get_subdir(dn) for dn in source.find(query).distinct('dir_name') if get_subdir(dn) not in already_inserted_subdirs] + 
subdirs = [] + for doc in source.find(query, ['dir_name', 'task_id', 'retired_task_id']): + subdir = get_subdir(doc['dir_name']) + if subdir not in already_inserted_subdirs or 'retired_task_id' in doc: + entry = {'subdir': subdir} + if 'retired_task_id' in doc: + entry.update({'task_id': doc['task_id']}) + subdirs.append(entry) if len(subdirs) < 1: continue @@ -72,11 +83,20 @@ def get_subdir(dn): print('add --insert flag to actually add tasks to production') continue - for subdir in subdirs: - subdir_query = {'dir_name': {'$regex': '/{}$'.format(subdir)}} + for subdir_doc in subdirs: + subdir_query = {'dir_name': {'$regex': '/{}$'.format(subdir_doc['subdir'])}} doc = target.collection.find_one(subdir_query, {'task_id': 1}) if doc: - print(subdir, 'already inserted as', doc['task_id']) + print(subdir_doc['subdir'], 'already inserted as', doc['task_id']) + if 'task_id' in subdir_doc and subdir_doc['task_id'] != doc['task_id']: + target.collection.remove({'task_id': subdir_doc['task_id']}) + target.collection.update( + {'task_id': doc['task_id']}, { + '$set': {'task_id': subdir_doc['task_id'], 'retired_task_id': doc['task_id']}, + '$addToSet': {'tags': t} + } + ) + print('replaced task_id', doc['task_id'], 'with', subdir_doc['task_id']) continue source_task_id = source.find_one(subdir_query, {'task_id': 1})['task_id'] @@ -118,7 +138,7 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): snl_coll = None print('to be implemented') return - print('# SNLs:\t', snl_coll.count(exclude)) + print(snl_coll.count(exclude), 'SNLs in', snl_coll.full_name) lpad = LaunchPad.auto_load() @@ -132,12 +152,13 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): if clear_logs: mongo_handler.collection.drop() - if alt_tasks_db_file is not None: + tasks_collections = OrderedDict() + tasks_collections[lpad.db.tasks.full_name] = lpad.db.tasks + if alt_tasks_db_file is not None: # TODO multiple alt_task_db_files? 
target = VaspCalcDb.from_db_file(alt_tasks_db_file, admin=True) - tasks_coll = target.collection - else: - tasks_coll = lpad.db.tasks - print('# tasks:', tasks_coll.count()) + tasks_collections[target.collection.full_name] = target.collection + for full_name, tasks_coll in tasks_collections.items(): + print(tasks_coll.count(), 'tasks in', full_name) structure_keys = ['snl_id', 'lattice', 'sites', 'charge', 'about._materialsproject.task_id'] NO_POTCARS = ['Po', 'At', 'Rn', 'Fr', 'Ra', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr'] @@ -169,6 +190,41 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): grouped_workflow_structures = {} canonical_workflow_structures = {} + def load_canonical_task_structures(formula, full_name): + if full_name not in canonical_task_structures: + canonical_task_structures[full_name] = {} + if formula not in canonical_task_structures[full_name]: + canonical_task_structures[full_name][formula] = {} + task_query = {'formula_pretty': formula} + task_query.update(task_base_query) + tasks = tasks_collections[full_name].find(task_query, {'input.structure': 1, 'task_id': 1, 'orig_inputs': 1}) + if tasks.count() > 0: + task_structures = {} + for task in tasks: + task_label = task_type(task['orig_inputs'], include_calc_type=False) + if task_label == "Structure Optimization": + s = Structure.from_dict(task['input']['structure']) + sg = get_sg(s) + if sg in canonical_structures[formula]: + if sg not in task_structures: + task_structures[sg] = [] + s.task_id = task['task_id'] + task_structures[sg].append(s) + if task_structures: + for sg, slist in task_structures.items(): + canonical_task_structures[full_name][formula][sg] = [g[0] for g in group_structures(slist)] + #print(sum([len(x) for x in canonical_task_structures[full_name][formula].values()]), 'canonical task structure(s) for', formula) + + def find_matching_canonical_task_structures(formula, struct, full_name): + matched_task_ids = [] + if sgnum in canonical_task_structures[full_name][formula] and canonical_task_structures[full_name][formula][sgnum]: + for s in canonical_task_structures[full_name][formula][sgnum]: + if structures_match(struct, s): + print('Structure for SNL', struct.snl_id, 'already added in task', s.task_id, 'in', full_name) + matched_task_ids.append(s.task_id) + return matched_task_ids + + for tag, ndocs in tags: query = {'$and': [{'about.remarks': tag}, exclude]} query.update(base_query) @@ -278,7 +334,7 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): print('Structure for SNL', struc.snl_id, '--> VP mismatch: use original structure!') struct = struc - wf_found, readd_wf = False, False + wf_found = False if sgnum in canonical_workflow_structures[formula] and canonical_workflow_structures[formula][sgnum]: for sidx, s in enumerate(canonical_workflow_structures[formula][sgnum]): if structures_match(struct, s): @@ -287,7 +343,10 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): if struct.task_id is not None: task_query = {'task_id': struct.task_id} task_query.update(task_base_query) - task = tasks_coll.find_one(task_query, ['input.structure']) + for full_name in reversed(tasks_collections): + task = tasks_collections[full_name].find_one(task_query, ['input.structure']) + if task: + break if task: s_task = Structure.from_dict(task['input']['structure']) s_task.remove_oxidation_states() @@ -317,9 +376,38 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): fw_found = True 
break if not fw_found: - print(' --> no WF with enforced task-id', struct.task_id, '-> re-add workflow') - readd_wf = True - break + print(' --> no WF with enforced task-id', struct.task_id) + fw = lpad.fireworks.find_one({'fw_id': s.fw_id}, {'state': 1}) + print(' -->', s.fw_id, fw['state']) + if fw['state'] == 'COMPLETED': + # the task is in lpad.db.tasks with different integer task_id + # => find task => overwrite task_id => add_tasks will pick it up + full_name = list(tasks_collections.keys())[0] + load_canonical_task_structures(formula, full_name) + matched_task_ids = find_matching_canonical_task_structures(formula, struct, full_name) + if len(matched_task_ids) == 1: + tasks_collections[full_name].update( + {'task_id': matched_task_ids[0]}, { + '$set': {'task_id': struct.task_id, 'retired_task_id': matched_task_ids[0]}, + '$addToSet': {'tags': tag} + } + ) + print(' --> replaced task_id', matched_task_ids[0], 'with', struct.task_id, 'in', full_name) + elif matched_task_ids: + msg = ' --> ERROR: multiple tasks {} for completed WF {}'.format(matched_task_ids, s.fw_id) + print(msg) + logger.error(msg, extra={ + 'formula': formula, 'snl_id': struct.snl_id, 'error': 'Multiple tasks for Completed WF' + }) + else: + msg = ' --> ERROR: task for completed WF {} does not exist!?'.format(s.fw_id) + print(msg) + logger.error(msg, extra={ + 'formula': formula, 'snl_id': struct.snl_id, 'error': 'Task for Completed WF missing' + }) + else: + # update WF to include task_id as additional_field + sys.exit(0) else: logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id}) wf_found = True @@ -329,45 +417,22 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): continue # need to check tasks b/c not every task is guaranteed to have a workflow (e.g. 
VASP dir parsing) - if not readd_wf: - try: - if formula not in canonical_task_structures: - canonical_task_structures[formula] = {} - task_query = {'formula_pretty': formula} - task_query.update(task_base_query) - tasks = tasks_coll.find(task_query, {'input.structure': 1, 'task_id': 1, 'orig_inputs': 1}) - if tasks.count() > 0: - task_structures = {} - for task in tasks: - task_label = task_type(task['orig_inputs'], include_calc_type=False) - if task_label == "Structure Optimization": - s = Structure.from_dict(task['input']['structure']) - sg = get_sg(s) - if sg in canonical_structures[formula]: - if sg not in task_structures: - task_structures[sg] = [] - s.task_id = task['task_id'] - task_structures[sg].append(s) - if task_structures: - for sg, slist in task_structures.items(): - canonical_task_structures[formula][sg] = [g[0] for g in group_structures(slist)] - #print(sum([len(x) for x in canonical_task_structures[formula].values()]), 'canonical task structure(s) for', formula) - - matched_task_ids = [] - if sgnum in canonical_task_structures[formula] and canonical_task_structures[formula][sgnum]: - for s in canonical_task_structures[formula][sgnum]: - if structures_match(struct, s): - print('Structure for SNL', struct.snl_id, 'already added in task', s.task_id) - matched_task_ids.append(s.task_id) - if struct.task_id is not None and matched_task_ids and struct.task_id not in matched_task_ids: - print(' --> ERROR: task', struct.task_id, 'not in', matched_task_ids) - raise ValueError - if matched_task_ids: - logger.warning('matched task ids', extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id(s)': matched_task_ids}) - continue - except ValueError as ex: - counter['unmatched_task_id'] += 1 + try: + matched_task_ids = OrderedDict() + for full_name in reversed(tasks_collections): + load_canonical_task_structures(formula, full_name) + matched_task_ids[full_name] = find_matching_canonical_task_structures(formula, struct, full_name) + if struct.task_id is not None and matched_task_ids[full_name] and struct.task_id not in matched_task_ids[full_name]: + print(' --> ERROR: task', struct.task_id, 'not in', matched_task_ids[full_name]) + raise ValueError + if matched_task_ids[full_name]: + break + if any(matched_task_ids.values()): + logger.warning('matched task ids', extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id(s)': matched_task_ids}) continue + except ValueError as ex: + counter['unmatched_task_id'] += 1 + continue msg = 'Structure for SNL {} --> ADD WORKFLOW'.format(struct.snl_id) if struct.task_id is not None: From ea38779cbb493a5f812bc360f55553915e160f8a Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 20 Jun 2018 11:48:39 -0700 Subject: [PATCH 06/97] cli: save progress --- emmet/scripts/emmet.py | 106 +++++++++++++++++++++++++---------------- 1 file changed, 65 insertions(+), 41 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 6f553d6c54..062517a822 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,4 +1,5 @@ -import click, os, yaml, sys, logging, operator +import click, os, yaml, sys, logging, operator, json +from datetime import datetime from collections import Counter, OrderedDict from pymongo import MongoClient from pymongo.errors import CursorNotFound @@ -27,12 +28,15 @@ def add_tasks(target_db_file, tag, insert): exclude = {'tags': {'$ne': 'deprecated'}} + if not insert: + print('DRY RUN: add --insert flag to actually add tasks to production') + def get_subdir(dn): return dn.rsplit(os.sep, 1)[-1] lpad = 
LaunchPad.auto_load() - source = lpad.db.tasks - print('connected to source db with', source.count(), 'tasks') + source = VaspCalcDb(lpad.host, lpad.port, lpad.name, 'tasks', lpad.username, lpad.password) + print('connected to source db with', source.collection.count(), 'tasks') if not os.path.exists(target_db_file): print(target_db_file, 'not found!') @@ -40,26 +44,26 @@ def get_subdir(dn): target = VaspCalcDb.from_db_file(target_db_file, admin=True) # 'db_atomate.json' print('connected to target db with', target.collection.count(), 'tasks') - ensure_indexes(['task_id', 'tags', 'dir_name'], [source, target.collection]) + ensure_indexes(['task_id', 'tags', 'dir_name', 'retired_task_id'], [source.collection, target.collection]) tags = [tag] if tag is None: - tags = [t for t in source.find(exclude).distinct('tags') if t is not None] + tags = [t for t in source.collection.find(exclude).distinct('tags') if t is not None] print(len(tags), 'tags in source collection') for t in tags: print('### {} ###'.format(t)) query = {'$and': [{'tags': t}, exclude]} - source_count = source.count(query) + source_count = source.collection.count(query) print('source / target:', source_count, '/', target.collection.count(query)) # skip tasks with task_id existing in target and with matching dir_name (have to be a string [mp-*, mvc-*]) nr_source_mp_tasks, skip_task_ids = 0, [] - for doc in source.find(query, ['task_id', 'dir_name']): + for doc in source.collection.find(query, ['task_id', 'dir_name']): if isinstance(doc['task_id'], str): nr_source_mp_tasks += 1 - task_query = {'task_id': doc['task_id'], 'dir_name': doc['dir_name']} + task_query = {'task_id': doc['task_id'], '$or': [{'dir_name': doc['dir_name']}, {'_mpworks_meta': {'$exists': 0}}]} if target.collection.count(task_query): skip_task_ids.append(doc['task_id']) if len(skip_task_ids): @@ -68,7 +72,7 @@ def get_subdir(dn): query.update({'task_id': {'$nin': skip_task_ids}}) already_inserted_subdirs = [get_subdir(dn) for dn in target.collection.find(query).distinct('dir_name')] subdirs = [] - for doc in source.find(query, ['dir_name', 'task_id', 'retired_task_id']): + for doc in source.collection.find(query, ['dir_name', 'task_id', 'retired_task_id']): subdir = get_subdir(doc['dir_name']) if subdir not in already_inserted_subdirs or 'retired_task_id' in doc: entry = {'subdir': subdir} @@ -79,9 +83,6 @@ def get_subdir(dn): continue print(len(subdirs), 'candidate tasks to insert') - if not insert: - print('add --insert flag to actually add tasks to production') - continue for subdir_doc in subdirs: subdir_query = {'dir_name': {'$regex': '/{}$'.format(subdir_doc['subdir'])}} @@ -89,25 +90,46 @@ def get_subdir(dn): if doc: print(subdir_doc['subdir'], 'already inserted as', doc['task_id']) if 'task_id' in subdir_doc and subdir_doc['task_id'] != doc['task_id']: - target.collection.remove({'task_id': subdir_doc['task_id']}) - target.collection.update( - {'task_id': doc['task_id']}, { - '$set': {'task_id': subdir_doc['task_id'], 'retired_task_id': doc['task_id']}, - '$addToSet': {'tags': t} - } - ) + if insert: + target.collection.remove({'task_id': subdir_doc['task_id']}) + target.collection.update( + {'task_id': doc['task_id']}, { + '$set': {'task_id': subdir_doc['task_id'], 'retired_task_id': doc['task_id'], 'last_updated': datetime.utcnow()}, + '$addToSet': {'tags': t} + } + ) print('replaced task_id', doc['task_id'], 'with', subdir_doc['task_id']) continue - source_task_id = source.find_one(subdir_query, {'task_id': 1})['task_id'] - print('retrieve', 
source_task_id, 'for', subdir) + source_task_id = source.collection.find_one(subdir_query, {'task_id': 1})['task_id'] + print('retrieve', source_task_id, 'for', subdir_doc['subdir']) task_doc = source.retrieve_task(source_task_id) if isinstance(task_doc['task_id'], int): - c = target.db.counter.find_one_and_update({"_id": "taskid"}, {"$inc": {"c": 1}}, return_document=ReturnDocument.AFTER)["c"] - task_doc['task_id'] = 'mp-{}'.format(c) + if insert: + c = target.db.counter.find_one_and_update({"_id": "taskid"}, {"$inc": {"c": 1}}, return_document=ReturnDocument.AFTER)["c"] + task_doc['task_id'] = 'mp-{}'.format(c) + else: + task = target.collection.find_one({'task_id': task_doc['task_id']}, ['orig_inputs', 'output.structure']) + if task: + task_label = task_type(task['orig_inputs'], include_calc_type=False) + if task_label == "Structure Optimization": + s1 = Structure.from_dict(task['output']['structure']) + s2 = Structure.from_dict(task_doc['output']['structure']) + if structures_match(s1, s2): + if insert: + target.collection.remove({'task_id': task_doc['task_id']}) + print('INFO: removed old task!') + else: + print('ERROR: structures do not match!') + #json.dump({'old': s1.as_dict(), 'new': s2.as_dict()}, open('{}.json'.format(task_doc['task_id']), 'w')) + continue + else: + print('ERROR: not a SO task!') + continue - target.insert_task(task_doc, use_gridfs=True) + if insert: + target.insert_task(task_doc, use_gridfs=True) @cli.command() @@ -169,7 +191,7 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): ensure_indexes(['snl_id', 'reduced_cell_formula', 'about.remarks', 'sites.label', 'nsites', 'nelements'], [snl_coll]) - tags = [] + tags = OrderedDict() if tag is None: query = dict(exclude) query.update(base_query) @@ -177,14 +199,14 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): for t in remarks: query = {'$and': [{'about.remarks': t}, exclude]} query.update(base_query) - tags.append((t, snl_coll.count(query))) - tags = sorted(tags.items(), key=operator.itemgetter(1), reverse=True) + tags[t] = snl_coll.count(query) + tags = OrderedDict((el[0], el[1]) for el in sorted(tags.items(), key=operator.itemgetter(1), reverse=True)) print(len(tags), 'tags in source collection => TOP10:') - print('\n'.join(['{} ({})'.format(*t) for t in tags[:10]])) + print('\n'.join(['{} ({})'.format(k, v) for k, v in list(tags.items())[:10]])) else: query = {'$and': [{'about.remarks': tag}, exclude]} query.update(base_query) - tags = [(tag, snl_coll.count(query))] + tags = OrderedDict((tag, snl_coll.count(query))) canonical_task_structures = {} grouped_workflow_structures = {} @@ -225,7 +247,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): return matched_task_ids - for tag, ndocs in tags: + for tag, ndocs in tags.items(): query = {'$and': [{'about.remarks': tag}, exclude]} query.update(base_query) @@ -316,7 +338,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): for sgnum, slist in workflow_structures.items(): grouped_workflow_structures[formula][sgnum] = [g for g in group_structures(slist)] canonical_workflow_structures[formula][sgnum] = [g[0] for g in grouped_workflow_structures[formula][sgnum]] - #print(sum([len(x) for x in canonical_workflow_structures[formula].values()]), 'canonical workflow structure(s) for', formula) + print(sum([len(x) for x in canonical_workflow_structures[formula].values()]), 'canonical workflow structure(s) for', formula) for idx_canonical, (sgnum, slist) in 
enumerate(canonical_structures[formula].items()): @@ -388,7 +410,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if len(matched_task_ids) == 1: tasks_collections[full_name].update( {'task_id': matched_task_ids[0]}, { - '$set': {'task_id': struct.task_id, 'retired_task_id': matched_task_ids[0]}, + '$set': {'task_id': struct.task_id, 'retired_task_id': matched_task_ids[0], 'last_updated': datetime.utcnow()}, '$addToSet': {'tags': tag} } ) @@ -406,8 +428,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): 'formula': formula, 'snl_id': struct.snl_id, 'error': 'Task for Completed WF missing' }) else: - # update WF to include task_id as additional_field - sys.exit(0) + print(' --> TODO: update {} WF to include task_id as additional_field'.format(fw['state'], s.fw_id)) else: logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id}) wf_found = True @@ -428,17 +449,15 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if matched_task_ids[full_name]: break if any(matched_task_ids.values()): - logger.warning('matched task ids', extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id(s)': matched_task_ids}) + logger.warning('matched task ids', extra={ + 'formula': formula, 'snl_id': struct.snl_id, + 'task_id(s)': dict((k.replace('.', '#'), v) for k, v in matched_task_ids.items()) + }) continue except ValueError as ex: counter['unmatched_task_id'] += 1 continue - msg = 'Structure for SNL {} --> ADD WORKFLOW'.format(struct.snl_id) - if struct.task_id is not None: - msg += ' --> enforcing task-id {}'.format(struct.task_id) - print(msg) - no_potcars = set(NO_POTCARS) & set(struct.composition.elements) if len(no_potcars) > 0: msg = 'Structure for SNL {} --> NO POTCARS: {}'.format(struct.snl_id, no_potcars) @@ -460,9 +479,14 @@ def find_matching_canonical_task_structures(formula, struct, full_name): logger.error(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'error': 'could not make workflow'}) continue + msg = 'Structure for SNL {} --> ADD WORKFLOW'.format(struct.snl_id) + if struct.task_id is not None: + msg += ' --> enforcing task-id {}'.format(struct.task_id) + print(msg) + if insert: old_new = lpad.add_wf(wf) - logger.warning('workflow added', extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': list(old_new.values())[0]}) + #logger.warning('workflow added', extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': list(old_new.values())[0]}) counter['add(ed)'] += 1 except CursorNotFound as ex: From 609ea7e1ed2c9fb4d1f5089094b951a77cd93101 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 20 Jun 2018 15:25:02 -0700 Subject: [PATCH 07/97] cli: resolve/cleanup task_id errors --- emmet/scripts/emmet.py | 63 +++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 062517a822..4ff07642b9 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -6,7 +6,7 @@ from pymongo.collection import ReturnDocument from pymatgen.analysis.structure_prediction.volume_predictor import DLSVolumePredictor from pymatgen import Structure -from fireworks import LaunchPad +from fireworks import LaunchPad, Workflow from atomate.vasp.database import VaspCalcDb from atomate.vasp.workflows.presets.core import wf_structure_optimization from atomate.vasp.database import VaspCalcDb @@ -186,7 +186,7 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): 
NO_POTCARS = ['Po', 'At', 'Rn', 'Fr', 'Ra', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr'] no_electroneg = ['He', 'He0+', 'Ar', 'Ar0+', 'Ne', 'Ne0+'] base_query = {'is_ordered': True, 'is_valid': True, 'nsites': {'$lt': 200}, 'sites.label': {'$nin': no_electroneg}} - task_base_query = {'_mpworks_meta': {'$exists': 0}} + task_base_query = {'tags': {'$ne': 'deprecated'}, '_mpworks_meta': {'$exists': 0}} vp = DLSVolumePredictor() ensure_indexes(['snl_id', 'reduced_cell_formula', 'about.remarks', 'sites.label', 'nsites', 'nelements'], [snl_coll]) @@ -206,7 +206,7 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): else: query = {'$and': [{'about.remarks': tag}, exclude]} query.update(base_query) - tags = OrderedDict((tag, snl_coll.count(query))) + tags[tag] = snl_coll.count(query) canonical_task_structures = {} grouped_workflow_structures = {} @@ -299,6 +299,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): except Exception as ex: s.to(fmt='json', filename='sgnum-{}.json'.format(s.snl_id)) msg = 'SNL {}: {}'.format(s.snl_id, ex) + print(msg) logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'error': str(ex)}) continue if sgnum not in structures[formula]: @@ -338,7 +339,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): for sgnum, slist in workflow_structures.items(): grouped_workflow_structures[formula][sgnum] = [g for g in group_structures(slist)] canonical_workflow_structures[formula][sgnum] = [g[0] for g in grouped_workflow_structures[formula][sgnum]] - print(sum([len(x) for x in canonical_workflow_structures[formula].values()]), 'canonical workflow structure(s) for', formula) + #print(sum([len(x) for x in canonical_workflow_structures[formula].values()]), 'canonical workflow structure(s) for', formula) for idx_canonical, (sgnum, slist) in enumerate(canonical_structures[formula].items()): @@ -374,10 +375,10 @@ def find_matching_canonical_task_structures(formula, struct, full_name): s_task.remove_oxidation_states() if not structures_match(struct, s_task): msg = ' --> ERROR: Structure for SNL {} does not match {}'.format(struct.snl_id, struct.task_id) + msg += ' --> CLEANUP: remove task_id from SNL' print(msg) - logger.error(msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, 'error': 'SNL-TASK structure mismatch' - }) + snl_coll.update({'snl_id': struct.snl_id}, {'$unset': {'about._materialsproject.task_id': 1}}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id}) counter['snl-task_mismatch'] += 1 else: msg = ' --> OK: workflow resulted in matching task {}'.format(struct.task_id) @@ -422,13 +423,15 @@ def find_matching_canonical_task_structures(formula, struct, full_name): 'formula': formula, 'snl_id': struct.snl_id, 'error': 'Multiple tasks for Completed WF' }) else: - msg = ' --> ERROR: task for completed WF {} does not exist!?'.format(s.fw_id) + msg = ' --> ERROR: task for completed WF {} does not exist!'.format(s.fw_id) + msg += ' --> CLEANUP: delete {} WF and re-add/run to enforce task-id {}'.format(fw['state'], struct.task_id) print(msg) - logger.error(msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, 'error': 'Task for Completed WF missing' - }) + lpad.delete_wf(s.fw_id) + break else: - print(' --> TODO: update {} WF to include task_id as additional_field'.format(fw['state'], s.fw_id)) + print(' --> CLEANUP: delete {} WF and re-add to include task_id as additional_field'.format(fw['state'], s.fw_id)) + 
lpad.delete_wf(s.fw_id) + break else: logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id}) wf_found = True @@ -438,24 +441,20 @@ def find_matching_canonical_task_structures(formula, struct, full_name): continue # need to check tasks b/c not every task is guaranteed to have a workflow (e.g. VASP dir parsing) - try: - matched_task_ids = OrderedDict() - for full_name in reversed(tasks_collections): - load_canonical_task_structures(formula, full_name) - matched_task_ids[full_name] = find_matching_canonical_task_structures(formula, struct, full_name) - if struct.task_id is not None and matched_task_ids[full_name] and struct.task_id not in matched_task_ids[full_name]: - print(' --> ERROR: task', struct.task_id, 'not in', matched_task_ids[full_name]) - raise ValueError - if matched_task_ids[full_name]: - break - if any(matched_task_ids.values()): - logger.warning('matched task ids', extra={ - 'formula': formula, 'snl_id': struct.snl_id, - 'task_id(s)': dict((k.replace('.', '#'), v) for k, v in matched_task_ids.items()) - }) - continue - except ValueError as ex: - counter['unmatched_task_id'] += 1 + msg, matched_task_ids = '', OrderedDict() + for full_name in reversed(tasks_collections): + load_canonical_task_structures(formula, full_name) + matched_task_ids[full_name] = find_matching_canonical_task_structures(formula, struct, full_name) + if struct.task_id is not None and matched_task_ids[full_name] and struct.task_id not in matched_task_ids[full_name]: + msg = ' --> WARNING: task {} not in {}'.format(struct.task_id, matched_task_ids[full_name]) + print(msg) + if matched_task_ids[full_name]: + break + if any(matched_task_ids.values()): + logger.warning('matched task ids' + msg, extra={ + 'formula': formula, 'snl_id': struct.snl_id, + 'task_id(s)': dict((k.replace('.', '#'), v) for k, v in matched_task_ids.items()) + }) continue no_potcars = set(NO_POTCARS) & set(struct.composition.elements) @@ -486,7 +485,9 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if insert: old_new = lpad.add_wf(wf) - #logger.warning('workflow added', extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': list(old_new.values())[0]}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': list(old_new.values())[0]}) + else: + logger.error(msg + ' --> DRY RUN', extra={'formula': formula, 'snl_id': struct.snl_id}) counter['add(ed)'] += 1 except CursorNotFound as ex: From 1fa945b161a5a278d5305b2f1c7536390d05734a Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 21 Jun 2018 14:41:07 -0700 Subject: [PATCH 08/97] cli: add report subcommand --- emmet/scripts/emmet.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 4ff07642b9..7455af9acd 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -511,3 +511,43 @@ def ensure_indexes(indexes, colls): if index not in keys: coll.ensure_index(index) print('ensured index', index, 'on', coll.full_name) + + +@cli.command() +@click.option('--tag', default=None, help='only include structures with specific tag') +def report(tag): + """generate a report of calculations status""" + + lpad = LaunchPad.auto_load() + states = ['COMPLETED', 'FIZZLED', 'READY', 'RUNNING'] + + tags = [tag] + if tag is None: + tags = [t for t in lpad.workflows.distinct('metadata.tags') if t is not None] + print(len(tags), 'tags in workflows collection') + + from prettytable import PrettyTable + table = 
PrettyTable() + table.field_names = ['tag', 'workflows'] + states + ['% FIZZLED', 'progress'] + + for t in tags: + wflows = lpad.workflows.find({'metadata.tags': t}, {'state': 1}) + counter = Counter([wf['state'] for wf in wflows]) + total = sum(v for k, v in counter.items() if k in states) + tc, progress = t, '-' + if counter['COMPLETED'] + counter['FIZZLED'] != total: + tc = "\033[1;34m{}\033[0m".format(t) + progress = (counter['COMPLETED'] + counter['FIZZLED']) / total * 100. + progress = '{:.0f}%'.format(progress) + entry = [tc, total] + [counter[state] for state in states] + fizzled = counter['FIZZLED'] / total + percent_fizzled = "\033[1;31m{:.0f}%\033[0m".format(fizzled*100.) \ + if fizzled > 0.2 else '{:.0f}%'.format(fizzled*100.) + entry.append(percent_fizzled) + entry.append(progress) + table.add_row(entry) + + table.sortby = 'workflows' + table.reversesort = True + table.align['tag'] = 'r' + print(table) From 6f06894feb9ca4693937ebc1a35577fdd8dfa59a Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 22 Jun 2018 12:15:45 -0700 Subject: [PATCH 09/97] cli: progress on report --- emmet/scripts/emmet.py | 46 ++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 7455af9acd..eb1250d63e 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -13,7 +13,7 @@ from atomate.vasp.powerups import add_trackers, add_tags, add_additional_fields_to_taskdocs from emmet.vasp.materials import group_structures, get_sg from emmet.vasp.task_tagger import task_type -from log4mongo.handlers import MongoHandler +from log4mongo.handlers import MongoHandler, MongoFormatter @click.group() def cli(): @@ -167,12 +167,12 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): logger = logging.getLogger('add_wflows') mongo_handler = MongoHandler( host=lpad.host, port=lpad.port, database_name=lpad.name, collection='add_wflows_logs', - username=lpad.username, password=lpad.password, authentication_db=lpad.name + username=lpad.username, password=lpad.password, authentication_db=lpad.name, formatter=MyMongoFormatter() ) logger.addHandler(mongo_handler) - ensure_indexes(['level', 'snl_id', 'formula'], [mongo_handler.collection]) if clear_logs: mongo_handler.collection.drop() + ensure_indexes(['level', 'message', 'snl_id', 'formula', 'tag'], [mongo_handler.collection]) tasks_collections = OrderedDict() tasks_collections[lpad.db.tasks.full_name] = lpad.db.tasks @@ -300,7 +300,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): s.to(fmt='json', filename='sgnum-{}.json'.format(s.snl_id)) msg = 'SNL {}: {}'.format(s.snl_id, ex) print(msg) - logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'error': str(ex)}) + logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'tag': tag, 'error': str(ex)}) continue if sgnum not in structures[formula]: structures[formula][sgnum] = [] @@ -314,7 +314,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if len(g) > 1: for s in g[1:]: logger.warning('duplicate structure', extra={ - 'formula': formula, 'snl_id': s.snl_id, 'canonical_snl_id': g[0].snl_id + 'formula': formula, 'snl_id': s.snl_id, 'tag': tag, 'canonical_snl_id': g[0].snl_id }) if not canonical_structures[formula]: @@ -378,13 +378,13 @@ def find_matching_canonical_task_structures(formula, struct, full_name): msg += ' --> CLEANUP: remove task_id from SNL' print(msg) snl_coll.update({'snl_id': struct.snl_id}, 
{'$unset': {'about._materialsproject.task_id': 1}}) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id, 'tag': tag}) counter['snl-task_mismatch'] += 1 else: msg = ' --> OK: workflow resulted in matching task {}'.format(struct.task_id) print(msg) logger.warning(msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id + 'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id, 'tag': tag }) else: print(' --> did not find task', struct.task_id, 'for WF', s.fw_id) @@ -395,7 +395,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if fw['spec']['_tasks'][5]['additional_fields'].get('task_id') == struct.task_id: msg = ' --> OK: workflow {} will result in intended task-id {}'.format(fw['fw_id'], struct.task_id) print(msg) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id, 'tag': tag}) fw_found = True break if not fw_found: @@ -420,7 +420,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): msg = ' --> ERROR: multiple tasks {} for completed WF {}'.format(matched_task_ids, s.fw_id) print(msg) logger.error(msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, 'error': 'Multiple tasks for Completed WF' + 'formula': formula, 'snl_id': struct.snl_id, 'tag': tag, 'error': 'Multiple tasks for Completed WF' }) else: msg = ' --> ERROR: task for completed WF {} does not exist!'.format(s.fw_id) @@ -433,7 +433,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): lpad.delete_wf(s.fw_id) break else: - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id, 'tag': tag}) wf_found = True break @@ -452,7 +452,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): break if any(matched_task_ids.values()): logger.warning('matched task ids' + msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, + 'formula': formula, 'snl_id': struct.snl_id, 'tag': tag, 'task_id(s)': dict((k.replace('.', '#'), v) for k, v in matched_task_ids.items()) }) continue @@ -461,7 +461,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if len(no_potcars) > 0: msg = 'Structure for SNL {} --> NO POTCARS: {}'.format(struct.snl_id, no_potcars) print(msg) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'no_potcars': no_potcars}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tag': tag, 'error': no_potcars}) continue try: @@ -475,7 +475,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): except: msg = 'Structure for SNL {} --> SKIP: Could not make workflow'.format(struct.snl_id) print(msg) - logger.error(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'error': 'could not make workflow'}) + logger.error(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tag': tag, 'error': 'could not make workflow'}) continue msg = 'Structure for SNL {} --> ADD WORKFLOW'.format(struct.snl_id) @@ -485,9 +485,9 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if insert: old_new = lpad.add_wf(wf) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 
'fw_id': list(old_new.values())[0]}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tag': tag, 'fw_id': list(old_new.values())[0]}) else: - logger.error(msg + ' --> DRY RUN', extra={'formula': formula, 'snl_id': struct.snl_id}) + logger.error(msg + ' --> DRY RUN', extra={'formula': formula, 'snl_id': struct.snl_id, 'tag': tag}) counter['add(ed)'] += 1 except CursorNotFound as ex: @@ -512,6 +512,17 @@ def ensure_indexes(indexes, colls): coll.ensure_index(index) print('ensured index', index, 'on', coll.full_name) +class MyMongoFormatter(logging.Formatter): + KEEP_KEYS = ['timestamp', 'level', 'message', 'formula', 'snl_id', 'tag', 'error', 'canonical_snl_id', 'fw_id', 'task_id', 'task_id(s)'] + + def format(self, record): + mongoformatter = MongoFormatter() + document = mongoformatter.format(record) + for k in list(document.keys()): + if k not in self.KEEP_KEYS: + document.pop(k) + return document + @cli.command() @click.option('--tag', default=None, help='only include structures with specific tag') @@ -528,10 +539,11 @@ def report(tag): from prettytable import PrettyTable table = PrettyTable() - table.field_names = ['tag', 'workflows'] + states + ['% FIZZLED', 'progress'] + table.field_names = ['tag', 'SNLs', 'workflows'] + states + ['% FIZZLED', 'progress'] for t in tags: wflows = lpad.workflows.find({'metadata.tags': t}, {'state': 1}) + nr_snls = lpad.db.add_wflows_logs.count({'tag': t}) counter = Counter([wf['state'] for wf in wflows]) total = sum(v for k, v in counter.items() if k in states) tc, progress = t, '-' @@ -539,7 +551,7 @@ def report(tag): tc = "\033[1;34m{}\033[0m".format(t) progress = (counter['COMPLETED'] + counter['FIZZLED']) / total * 100. progress = '{:.0f}%'.format(progress) - entry = [tc, total] + [counter[state] for state in states] + entry = [tc, nr_snls, total] + [counter[state] for state in states] fizzled = counter['FIZZLED'] / total percent_fizzled = "\033[1;31m{:.0f}%\033[0m".format(fizzled*100.) \ if fizzled > 0.2 else '{:.0f}%'.format(fizzled*100.) 
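The report subcommand introduced in PATCH 09 above derives its per-tag progress and "% FIZZLED" columns from a Counter of workflow states pulled via lpad.workflows.find({'metadata.tags': t}, {'state': 1}). A minimal, self-contained sketch of that calculation follows; the state counts are made up for illustration and do not come from a live LaunchPad query:

    # Sketch only: sample counts stand in for the Counter the CLI builds
    # from the FireWorks workflows collection.
    from collections import Counter

    states = ['COMPLETED', 'FIZZLED', 'READY', 'RUNNING']
    counter = Counter({'COMPLETED': 70, 'FIZZLED': 20, 'READY': 5, 'RUNNING': 5})

    total = sum(v for k, v in counter.items() if k in states)
    progress = (counter['COMPLETED'] + counter['FIZZLED']) / total * 100. if total else 0.
    fizzled = counter['FIZZLED'] / total if total else 0.
    print('{:.0f}% done, {:.0f}% fizzled'.format(progress, fizzled * 100.))

In the patch above, a tag is highlighted in the table whenever completed plus fizzled workflows do not yet account for all workflows in the listed states, i.e. some are still READY or RUNNING; the fizzled percentage is additionally colored when it exceeds 20%.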
From 063caf8a7f9aa750b6db8f74876c1f6185a4cf6c Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 22 Jun 2018 17:17:31 -0700 Subject: [PATCH 10/97] cli: more progress on report etc --- emmet/scripts/emmet.py | 152 +++++++++++++++++++++++------------------ 1 file changed, 85 insertions(+), 67 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index eb1250d63e..f305db0798 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,4 +1,4 @@ -import click, os, yaml, sys, logging, operator, json +import click, os, yaml, sys, logging, json from datetime import datetime from collections import Counter, OrderedDict from pymongo import MongoClient @@ -15,6 +15,10 @@ from emmet.vasp.task_tagger import task_type from log4mongo.handlers import MongoHandler, MongoFormatter +if 'FW_CONFIG_FILE' not in os.environ: + print('Please set FW_CONFIG_FILE!') + sys.exit(0) + @click.group() def cli(): pass @@ -133,36 +137,34 @@ def get_subdir(dn): @cli.command() -@click.argument('list_of_structures', type=click.File('rb')) -@click.option('-a', '--alt_tasks_db_file', type=click.Path(exists=True), help='config file for alternative tasks collection') +@click.option('--add_snls_db', type=click.Path(exists=True), help='config file for additional SNLs collection') +@click.option('--add_tasks_db', type=click.Path(exists=True), help='config file for additional tasks collection') @click.option('--tag', default=None, help='only include structures with specific tag') @click.option('--insert/--no-insert', default=False, help='actually execute workflow addition') @click.option('--clear-logs/--no-clear-logs', default=False, help='clear MongoDB logs collection') -def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): - """add workflows for list of structures / SNLs (YAML config or JSON list of pymatgen structures""" +@click.option('--max-structures', default=1000, help='set max structures for tags to scan') +def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structures): + """add workflows based on tags in SNL collection""" exclude = {'about.remarks': {'$ne': 'DEPRECATED'}} if not insert: print('DRY RUN! 
Add --insert flag to actually add workflows') - try: - snl_db_config = yaml.load(list_of_structures) + lpad = LaunchPad.auto_load() + + # TODO use add_snls first, and then add_wflows based on SNL collection + snl_collections = [lpad.db.snls] + if add_snls_db is not None: + snl_db_config = yaml.load(open(add_snls_db, 'r')) snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) snl_db = snl_db_conn[snl_db_config['db']] snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) - snl_coll = snl_db[snl_db_config['collection']] - except Exception as ex: - print(ex) - # NOTE WIP might change it to use add_snls first, and then add_wflows based on SNL collection only - # TODO load pymatgen structures from JSON file into MongoDB collection - # TODO also fake-tag them, add SNL info - snl_coll = None - print('to be implemented') - return - print(snl_coll.count(exclude), 'SNLs in', snl_coll.full_name) + snl_collections.append(snl_db[snl_db_config['collection']]) - lpad = LaunchPad.auto_load() + ensure_indexes(['snl_id', 'reduced_cell_formula', 'about.remarks', 'sites.label', 'nsites', 'nelements'], snl_collections) + for snl_coll in snl_collections: + print(snl_coll.count(exclude), 'SNLs in', snl_coll.full_name) logger = logging.getLogger('add_wflows') mongo_handler = MongoHandler( @@ -172,12 +174,12 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): logger.addHandler(mongo_handler) if clear_logs: mongo_handler.collection.drop() - ensure_indexes(['level', 'message', 'snl_id', 'formula', 'tag'], [mongo_handler.collection]) + ensure_indexes(['level', 'message', 'snl_id', 'formula', 'tags'], [mongo_handler.collection]) tasks_collections = OrderedDict() tasks_collections[lpad.db.tasks.full_name] = lpad.db.tasks - if alt_tasks_db_file is not None: # TODO multiple alt_task_db_files? - target = VaspCalcDb.from_db_file(alt_tasks_db_file, admin=True) + if add_tasks_db is not None: # TODO multiple alt_task_db_files? 
+ target = VaspCalcDb.from_db_file(add_tasks_db, admin=True) tasks_collections[target.collection.full_name] = target.collection for full_name, tasks_coll in tasks_collections.items(): print(tasks_coll.count(), 'tasks in', full_name) @@ -189,24 +191,40 @@ def add_wflows(list_of_structures, alt_tasks_db_file, tag, insert, clear_logs): task_base_query = {'tags': {'$ne': 'deprecated'}, '_mpworks_meta': {'$exists': 0}} vp = DLSVolumePredictor() - ensure_indexes(['snl_id', 'reduced_cell_formula', 'about.remarks', 'sites.label', 'nsites', 'nelements'], [snl_coll]) - tags = OrderedDict() if tag is None: + all_tags = OrderedDict() query = dict(exclude) query.update(base_query) - remarks = filter(None, snl_coll.find(query).distinct('about.remarks')) - for t in remarks: - query = {'$and': [{'about.remarks': t}, exclude]} - query.update(base_query) - tags[t] = snl_coll.count(query) - tags = OrderedDict((el[0], el[1]) for el in sorted(tags.items(), key=operator.itemgetter(1), reverse=True)) - print(len(tags), 'tags in source collection => TOP10:') - print('\n'.join(['{} ({})'.format(k, v) for k, v in list(tags.items())[:10]])) + for snl_coll in snl_collections: + remarks = filter(None, snl_coll.find(query).distinct('about.remarks')) + for t in remarks: + query = {'$and': [{'about.remarks': t}, exclude]} + query.update(base_query) + if t not in all_tags: + all_tags[t] = [snl_coll.count(query), snl_coll] + else: + print('tag -', t, '- already in', all_tags[t][-1].full_name) + sorted_tags = sorted(all_tags.items(), key=lambda x: x[1][0]) + for item in sorted_tags: + to_scan = item[1][0] - lpad.db.add_wflows_logs.count({'tags': item[0]}) + if item[1][0] < max_structures and to_scan: + tags[item[0]] = [item[1][0], to_scan, item[1][-1]] else: query = {'$and': [{'about.remarks': tag}, exclude]} query.update(base_query) - tags[tag] = snl_coll.count(query) + for snl_coll in snl_collections: + cnt = snl_coll.count(query) + if cnt: + to_scan = cnt - lpad.db.add_wflows_logs.count({'tags': tag}) + tags[tag] = [cnt, to_scan, snl_coll] + break + + if not tags: + print('nothing to scan') + return + print(len(tags), 'tags to scan in source SNL collections:') + print('\n'.join(['{} {} ({}) --> {} TO SCAN'.format(v[2].full_name, k, v[0], v[1]) for k, v in tags.items()])) canonical_task_structures = {} grouped_workflow_structures = {} @@ -247,21 +265,17 @@ def find_matching_canonical_task_structures(formula, struct, full_name): return matched_task_ids - for tag, ndocs in tags.items(): + for tag, value in tags.items(): query = {'$and': [{'about.remarks': tag}, exclude]} query.update(base_query) - # TODO WIP will be removed - if tag == 'new_ordered_icsd_2017': + if tag == 'new_ordered_icsd_2017': # TODO WIP will be removed #TODO for new_ordered_icsd_2017: docs = db.icsd.find(query, {'snl': 1, 'formula_reduced_abc': 1, 'icsd_id': 1, 'elements': 1}) - print(tag, 'TODO implement db.icsd as snl_coll') - continue - elif tag == 'pre-atomate production': - # TODO scan last + print(tag, 'TODO implement db.icsd as snl_coll -> add_snls?') continue - print('aggregate', ndocs, 'structures for', tag, '...') - structure_groups = snl_coll.aggregate([ + print('aggregate', value[0], 'structures for', tag, '...') + structure_groups = value[-1].aggregate([ {'$match': query}, {'$sort': OrderedDict([('nelements', 1), ('nsites', 1)])}, {'$group': { '_id': '$reduced_cell_formula', @@ -286,7 +300,9 @@ def find_matching_canonical_task_structures(formula, struct, full_name): print(idx_group, '...') for dct in group['structures']: - if 
mongo_handler.collection.find_one({'level': 'WARNING', 'formula': formula, 'snl_id': dct['snl_id']}): + q = {'level': 'WARNING', 'formula': formula, 'snl_id': dct['snl_id']} + if mongo_handler.collection.find_one(q): + lpad.db.add_wflows_logs.update(q, {'$addToSet': {'tags': tag}}) continue # already checked mongo_handler.collection.remove({'level': 'ERROR', 'formula': formula, 'snl_id': dct['snl_id']}) # avoid dups counter['structures'] += 1 @@ -300,7 +316,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): s.to(fmt='json', filename='sgnum-{}.json'.format(s.snl_id)) msg = 'SNL {}: {}'.format(s.snl_id, ex) print(msg) - logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'tag': tag, 'error': str(ex)}) + logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'tags': [tag], 'error': str(ex)}) continue if sgnum not in structures[formula]: structures[formula][sgnum] = [] @@ -314,7 +330,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if len(g) > 1: for s in g[1:]: logger.warning('duplicate structure', extra={ - 'formula': formula, 'snl_id': s.snl_id, 'tag': tag, 'canonical_snl_id': g[0].snl_id + 'formula': formula, 'snl_id': s.snl_id, 'tags': [tag], 'canonical_snl_id': g[0].snl_id }) if not canonical_structures[formula]: @@ -377,14 +393,14 @@ def find_matching_canonical_task_structures(formula, struct, full_name): msg = ' --> ERROR: Structure for SNL {} does not match {}'.format(struct.snl_id, struct.task_id) msg += ' --> CLEANUP: remove task_id from SNL' print(msg) - snl_coll.update({'snl_id': struct.snl_id}, {'$unset': {'about._materialsproject.task_id': 1}}) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id, 'tag': tag}) + value[-1].update({'snl_id': struct.snl_id}, {'$unset': {'about._materialsproject.task_id': 1}}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id, 'tags': [tag]}) counter['snl-task_mismatch'] += 1 else: msg = ' --> OK: workflow resulted in matching task {}'.format(struct.task_id) print(msg) logger.warning(msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id, 'tag': tag + 'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id, 'tags': [tag] }) else: print(' --> did not find task', struct.task_id, 'for WF', s.fw_id) @@ -395,7 +411,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if fw['spec']['_tasks'][5]['additional_fields'].get('task_id') == struct.task_id: msg = ' --> OK: workflow {} will result in intended task-id {}'.format(fw['fw_id'], struct.task_id) print(msg) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id, 'tag': tag}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id, 'tags': [tag]}) fw_found = True break if not fw_found: @@ -420,7 +436,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): msg = ' --> ERROR: multiple tasks {} for completed WF {}'.format(matched_task_ids, s.fw_id) print(msg) logger.error(msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, 'tag': tag, 'error': 'Multiple tasks for Completed WF' + 'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'error': 'Multiple tasks for Completed WF' }) else: msg = ' --> ERROR: task for completed WF {} does not exist!'.format(s.fw_id) @@ -433,7 +449,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): 
lpad.delete_wf(s.fw_id) break else: - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id, 'tag': tag}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id, 'tags': [tag]}) wf_found = True break @@ -452,7 +468,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): break if any(matched_task_ids.values()): logger.warning('matched task ids' + msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, 'tag': tag, + 'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'task_id(s)': dict((k.replace('.', '#'), v) for k, v in matched_task_ids.items()) }) continue @@ -461,7 +477,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if len(no_potcars) > 0: msg = 'Structure for SNL {} --> NO POTCARS: {}'.format(struct.snl_id, no_potcars) print(msg) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tag': tag, 'error': no_potcars}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'error': no_potcars}) continue try: @@ -475,7 +491,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): except: msg = 'Structure for SNL {} --> SKIP: Could not make workflow'.format(struct.snl_id) print(msg) - logger.error(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tag': tag, 'error': 'could not make workflow'}) + logger.error(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'error': 'could not make workflow'}) continue msg = 'Structure for SNL {} --> ADD WORKFLOW'.format(struct.snl_id) @@ -485,17 +501,17 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if insert: old_new = lpad.add_wf(wf) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tag': tag, 'fw_id': list(old_new.values())[0]}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'fw_id': list(old_new.values())[0]}) else: - logger.error(msg + ' --> DRY RUN', extra={'formula': formula, 'snl_id': struct.snl_id, 'tag': tag}) + logger.error(msg + ' --> DRY RUN', extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag]}) counter['add(ed)'] += 1 except CursorNotFound as ex: print(ex) - sites_elements = [ + sites_elements = set([ (len(set([e.symbol for e in x.composition.elements])), x.num_sites) for x in canonical_structures_list - ] + ]) print(len(canonical_structures_list), 'canonical structure(s) for', formula, sites_elements) print(counter) @@ -513,7 +529,7 @@ def ensure_indexes(indexes, colls): print('ensured index', index, 'on', coll.full_name) class MyMongoFormatter(logging.Formatter): - KEEP_KEYS = ['timestamp', 'level', 'message', 'formula', 'snl_id', 'tag', 'error', 'canonical_snl_id', 'fw_id', 'task_id', 'task_id(s)'] + KEEP_KEYS = ['timestamp', 'level', 'message', 'formula', 'snl_id', 'tags', 'error', 'canonical_snl_id', 'fw_id', 'task_id', 'task_id(s)'] def format(self, record): mongoformatter = MongoFormatter() @@ -535,31 +551,33 @@ def report(tag): tags = [tag] if tag is None: tags = [t for t in lpad.workflows.distinct('metadata.tags') if t is not None] - print(len(tags), 'tags in workflows collection') + tags += [t for t in lpad.db.add_wflows_logs.distinct('tags') if t is not None and t not in tags] + print(len(tags), 'tags in WFs and logs collections') from prettytable import PrettyTable table = PrettyTable() - table.field_names = ['tag', 'SNLs', 'workflows'] + states + ['% FIZZLED', 'progress'] + 
table.field_names = ['Tag', 'SNLs', 'WFs2Add', 'WFs'] + states + ['% FIZZLED', 'Progress'] for t in tags: wflows = lpad.workflows.find({'metadata.tags': t}, {'state': 1}) - nr_snls = lpad.db.add_wflows_logs.count({'tag': t}) + nr_snls = lpad.db.add_wflows_logs.count({'tags': t}) + wflows_to_add = lpad.db.add_wflows_logs.count({'tags': t, 'level': 'ERROR', 'error': {'$exists': 0}}) counter = Counter([wf['state'] for wf in wflows]) total = sum(v for k, v in counter.items() if k in states) tc, progress = t, '-' - if counter['COMPLETED'] + counter['FIZZLED'] != total: + if wflows_to_add or counter['COMPLETED'] + counter['FIZZLED'] != total: tc = "\033[1;34m{}\033[0m".format(t) - progress = (counter['COMPLETED'] + counter['FIZZLED']) / total * 100. + progress = (counter['COMPLETED'] + counter['FIZZLED']) / total * 100. if total else 0. progress = '{:.0f}%'.format(progress) - entry = [tc, nr_snls, total] + [counter[state] for state in states] - fizzled = counter['FIZZLED'] / total + entry = [tc, nr_snls, wflows_to_add, total] + [counter[state] for state in states] + fizzled = counter['FIZZLED'] / total if total else 0. percent_fizzled = "\033[1;31m{:.0f}%\033[0m".format(fizzled*100.) \ if fizzled > 0.2 else '{:.0f}%'.format(fizzled*100.) entry.append(percent_fizzled) entry.append(progress) table.add_row(entry) - table.sortby = 'workflows' + table.sortby = 'SNLs' table.reversesort = True - table.align['tag'] = 'r' + table.align['Tag'] = 'r' print(table) From 5f9e9d6ca8bc7489d7d6e66b15b1c009ffb61bb3 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 27 Jun 2018 11:01:24 -0700 Subject: [PATCH 11/97] cli: skip-all-scanned, catch another sgnum --- emmet/scripts/emmet.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index f305db0798..41ac7600bd 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -143,7 +143,8 @@ def get_subdir(dn): @click.option('--insert/--no-insert', default=False, help='actually execute workflow addition') @click.option('--clear-logs/--no-clear-logs', default=False, help='clear MongoDB logs collection') @click.option('--max-structures', default=1000, help='set max structures for tags to scan') -def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structures): +@click.option('--skip-all-scanned/--no-skip-all-scanned', default=False, help='skip all already scanned structures incl. 
WFs2Add/Errors') +def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structures, skip_all_scanned): """add workflows based on tags in SNL collection""" exclude = {'about.remarks': {'$ne': 'DEPRECATED'}} @@ -244,7 +245,14 @@ def load_canonical_task_structures(formula, full_name): task_label = task_type(task['orig_inputs'], include_calc_type=False) if task_label == "Structure Optimization": s = Structure.from_dict(task['input']['structure']) - sg = get_sg(s) + try: + sgnum = get_sg(s) + except Exception as ex: + s.to(fmt='json', filename='sgnum-{}.json'.format(s.snl_id)) + msg = 'SNL {}: {}'.format(s.snl_id, ex) + print(msg) + logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'tags': [tag], 'error': str(ex)}) + continue if sg in canonical_structures[formula]: if sg not in task_structures: task_structures[sg] = [] @@ -304,7 +312,10 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if mongo_handler.collection.find_one(q): lpad.db.add_wflows_logs.update(q, {'$addToSet': {'tags': tag}}) continue # already checked - mongo_handler.collection.remove({'level': 'ERROR', 'formula': formula, 'snl_id': dct['snl_id']}) # avoid dups + q['level'] = 'ERROR' + if skip_all_scanned and mongo_handler.collection.find_one(q): + continue + mongo_handler.collection.remove(q) # avoid dups counter['structures'] += 1 s = Structure.from_dict(dct) s.snl_id = dct['snl_id'] From 75b145c437a8a2b26c1b8a5662bdc57d3dbbf26a Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 27 Jun 2018 16:56:57 -0700 Subject: [PATCH 12/97] cli: add ensure_meta --- emmet/scripts/emmet.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 41ac7600bd..db4428314d 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -23,6 +23,33 @@ def cli(): pass + +@cli.command() +@click.argument('snls_db', type=click.Path(exists=True)) +def ensure_meta(snls_db): + """ensure meta-data fields are set in SNL collection""" + + snl_db_config = yaml.load(open(snls_db, 'r')) + snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) + snl_db = snl_db_conn[snl_db_config['db']] + snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) + snl_coll = snl_db[snl_db_config['collection']] + print(snl_coll.count(), 'SNLs in', snl_coll.full_name) + + for idx, doc in enumerate(snl_coll.find({}, structure_keys)): + if idx and not idx%1000: + print(idx, '...') + struct = Structure.from_dict(doc) + d = {'formula_pretty': struct.composition.reduced_formula} + d['nelements'] = len(set(struct.composition.elements)) + d['nsites'] = len(struct) + d['is_ordered'] = struct.is_ordered + d['is_valid'] = struct.is_valid() + snl_coll.update({'snl_id': doc['snl_id']}, {'$set': d}) + + ensure_indexes(['snl_id', 'formula_pretty', 'nelements', 'nsites', 'is_ordered', 'is_valid'], [snl_coll]) + + @cli.command() @click.option('--target_db_file', default="target.json", help='target db file') @click.option('--tag', default=None, help='only insert tasks with specific tag') From ef17bdf749a6e7170d918d99ae5cd6c4549ef1f4 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 27 Jun 2018 16:58:16 -0700 Subject: [PATCH 13/97] cli: some global definitions --- emmet/scripts/emmet.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index db4428314d..d237d50cf2 100644 --- a/emmet/scripts/emmet.py +++ 
b/emmet/scripts/emmet.py @@ -19,6 +19,12 @@ print('Please set FW_CONFIG_FILE!') sys.exit(0) +exclude = {'about.remarks': {'$nin': ['DEPRECATED', 'deprecated']}} +no_electroneg = ['He', 'He0+', 'Ar', 'Ar0+', 'Ne', 'Ne0+'] +base_query = {'is_ordered': True, 'is_valid': True, 'nsites': {'$lt': 200}, 'sites.label': {'$nin': no_electroneg}} +task_base_query = {'tags': {'$nin': ['DEPRECATED', 'deprecated']}, '_mpworks_meta': {'$exists': 0}} +structure_keys = ['snl_id', 'lattice', 'sites', 'charge', 'about._materialsproject.task_id'] + @click.group() def cli(): pass @@ -57,8 +63,6 @@ def ensure_meta(snls_db): def add_tasks(target_db_file, tag, insert): """Retrieve tasks from source and add to target""" - exclude = {'tags': {'$ne': 'deprecated'}} - if not insert: print('DRY RUN: add --insert flag to actually add tasks to production') @@ -174,8 +178,6 @@ def get_subdir(dn): def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structures, skip_all_scanned): """add workflows based on tags in SNL collection""" - exclude = {'about.remarks': {'$ne': 'DEPRECATED'}} - if not insert: print('DRY RUN! Add --insert flag to actually add workflows') @@ -212,11 +214,7 @@ def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structure for full_name, tasks_coll in tasks_collections.items(): print(tasks_coll.count(), 'tasks in', full_name) - structure_keys = ['snl_id', 'lattice', 'sites', 'charge', 'about._materialsproject.task_id'] NO_POTCARS = ['Po', 'At', 'Rn', 'Fr', 'Ra', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr'] - no_electroneg = ['He', 'He0+', 'Ar', 'Ar0+', 'Ne', 'Ne0+'] - base_query = {'is_ordered': True, 'is_valid': True, 'nsites': {'$lt': 200}, 'sites.label': {'$nin': no_electroneg}} - task_base_query = {'tags': {'$ne': 'deprecated'}, '_mpworks_meta': {'$exists': 0}} vp = DLSVolumePredictor() tags = OrderedDict() From dd6afb76da434e3ae0c0b2aa3f92e02b41df4582 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 27 Jun 2018 16:59:02 -0700 Subject: [PATCH 14/97] cli: start add_snls --- emmet/scripts/emmet.py | 94 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 1 deletion(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index d237d50cf2..1eee57794e 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,4 +1,5 @@ -import click, os, yaml, sys, logging, json +import click, os, yaml, sys, logging, json, tarfile +from fnmatch import fnmatch from datetime import datetime from collections import Counter, OrderedDict from pymongo import MongoClient @@ -6,6 +7,7 @@ from pymongo.collection import ReturnDocument from pymatgen.analysis.structure_prediction.volume_predictor import DLSVolumePredictor from pymatgen import Structure +from pymatgen.util.provenance import StructureNL, Author from fireworks import LaunchPad, Workflow from atomate.vasp.database import VaspCalcDb from atomate.vasp.workflows.presets.core import wf_structure_optimization @@ -617,3 +619,93 @@ def report(tag): table.reversesort = True table.align['Tag'] = 'r' print(table) + + +@cli.command() +@click.argument('archive', type=click.Path(exists=True)) +@click.option('--add_snls_dbs', '-a', multiple=True, type=click.Path(exists=True), help='config files for additional SNLs collections') +def add_snls(archive, add_snls_dbs): + """add structures from archive of structure files (CIF, POSCAR, ...) to (local) SNLs collection""" + # TODO assign task_ids to structures? 
+ + lpad = LaunchPad.auto_load() + snl_collections = [lpad.db.snls] + if add_snls_dbs: + for add_snls_db in add_snls_dbs: + snl_db_config = yaml.load(open(add_snls_db, 'r')) + snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) + snl_db = snl_db_conn[snl_db_config['db']] + snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) + snl_collections.append(snl_db[snl_db_config['collection']]) + for snl_coll in snl_collections: + print(snl_coll.count(), 'SNLs in', snl_coll.full_name) + + fname, ext = os.path.splitext(os.path.basename(archive)) + tag, sec_ext = fname.rsplit('.', 1) if '.' in fname else fname, '' + if sec_ext: + ext = ''.join([sec_ext, ext]) + exts = ['tar.gz', '.tgz'] + if ext not in exts: + print(ext, 'not supported (yet)! Please use one of', exts) + return + + meta_path = '{}.yaml'.format(tag) + if not os.path.exists(meta_path): + print('Please include meta info in', meta_path) + return + with open(meta_path, 'r') as f: + meta = yaml.load(f) + meta['authors'] = [Author.parse_author(a) for a in meta['authors']] + + exclude = {'about.remarks': {'$ne': 'DEPRECATED'}} + + snls = [] + tar = tarfile.open(archive, 'r:gz') + for member in tar.getmembers(): + if os.path.basename(member.name).startswith('.'): + continue + f = tar.extractfile(member) + if f: + print(member.name) + contents = f.read().decode('utf-8') + fname = member.name.lower() + if fnmatch(fname, "*.cif*") or fnmatch(fname, "*.mcif*"): + fmt = 'cif' + elif fnmatch(fname, "*.json*") or fnmatch(fname, "*.mson*"): + fmt = 'json' + else: + print('reading', fname, 'not supported (yet)') + continue + + try: + struct = Structure.from_str(contents, fmt=fmt) + except Exception as ex: + print(ex) + break #continue + + formula = struct.composition.reduced_formula + query = {'$and': [{'formula_pretty': formula}, exclude]} + query.update(base_query) + + for snl_coll in snl_collections: + snl_groups = snl_coll.aggregate([ + {'$match': query}, {'$sort': OrderedDict([('nelements', 1), ('nsites', 1)])}, + {'$group': { + '_id': '$formula_pretty', + 'snls': {'$push': dict((k.split('.')[-1], '${}'.format(k)) for k in structure_keys)} + }} + ], allowDiskUse=True, batchSize=1) + return + + snls.append(StructureNL( + struct, authors, references=references.strip(), remarks=[tag] + )) + print(len(snls)) + +# snls.append(snl.as_dict()) +# if snls: +# print('add', len(snls), 'SNLs') +# result = target.db.snls.insert_many(snls) +# print('#SNLs inserted:', len(result.inserted_ids)) +# else: +# print('no SNLs to insert') From 931cff109b785e5cdda57f48e77558eba7f060df Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 27 Jun 2018 16:59:22 -0700 Subject: [PATCH 15/97] cli: fix sgnum error catch --- emmet/scripts/emmet.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 1eee57794e..cf530a1632 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -275,10 +275,10 @@ def load_canonical_task_structures(formula, full_name): try: sgnum = get_sg(s) except Exception as ex: - s.to(fmt='json', filename='sgnum-{}.json'.format(s.snl_id)) - msg = 'SNL {}: {}'.format(s.snl_id, ex) + s.to(fmt='json', filename='sgnum_{}.json'.format(task['task_id'])) + msg = 'SNL {}: {}'.format(task['task_id'], ex) print(msg) - logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'tags': [tag], 'error': str(ex)}) + logger.error(msg, extra={'formula': formula, 'task_id': task['task_id'], 'tags': [tag], 'error': 
str(ex)}) continue if sg in canonical_structures[formula]: if sg not in task_structures: From 50866a8d97a2fc166f05b800724e25afc7807c44 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 27 Jun 2018 16:59:52 -0700 Subject: [PATCH 16/97] cli.report: minor table update --- emmet/scripts/emmet.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index cf530a1632..cc54179c4c 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -613,7 +613,8 @@ def report(tag): if fizzled > 0.2 else '{:.0f}%'.format(fizzled*100.) entry.append(percent_fizzled) entry.append(progress) - table.add_row(entry) + if any(entry[2:-2]): + table.add_row(entry) table.sortby = 'SNLs' table.reversesort = True From 718d1878c81e0e1f26d6fcab45002bb1c9b4b17c Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 27 Jun 2018 17:06:35 -0700 Subject: [PATCH 17/97] cli: codacy fixes --- emmet/scripts/emmet.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index cc54179c4c..bd5b9ed55f 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,4 +1,4 @@ -import click, os, yaml, sys, logging, json, tarfile +import click, os, yaml, sys, logging, tarfile from fnmatch import fnmatch from datetime import datetime from collections import Counter, OrderedDict @@ -11,7 +11,6 @@ from fireworks import LaunchPad, Workflow from atomate.vasp.database import VaspCalcDb from atomate.vasp.workflows.presets.core import wf_structure_optimization -from atomate.vasp.database import VaspCalcDb from atomate.vasp.powerups import add_trackers, add_tags, add_additional_fields_to_taskdocs from emmet.vasp.materials import group_structures, get_sg from emmet.vasp.task_tagger import task_type @@ -37,7 +36,7 @@ def cli(): def ensure_meta(snls_db): """ensure meta-data fields are set in SNL collection""" - snl_db_config = yaml.load(open(snls_db, 'r')) + snl_db_config = yaml.safe_load(open(snls_db, 'r')) snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) snl_db = snl_db_conn[snl_db_config['db']] snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) @@ -188,7 +187,7 @@ def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structure # TODO use add_snls first, and then add_wflows based on SNL collection snl_collections = [lpad.db.snls] if add_snls_db is not None: - snl_db_config = yaml.load(open(add_snls_db, 'r')) + snl_db_config = yaml.safe_load(open(add_snls_db, 'r')) snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) snl_db = snl_db_conn[snl_db_config['db']] snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) @@ -273,7 +272,7 @@ def load_canonical_task_structures(formula, full_name): if task_label == "Structure Optimization": s = Structure.from_dict(task['input']['structure']) try: - sgnum = get_sg(s) + sg = get_sg(s) except Exception as ex: s.to(fmt='json', filename='sgnum_{}.json'.format(task['task_id'])) msg = 'SNL {}: {}'.format(task['task_id'], ex) @@ -483,7 +482,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): lpad.delete_wf(s.fw_id) break else: - print(' --> CLEANUP: delete {} WF and re-add to include task_id as additional_field'.format(fw['state'], s.fw_id)) + print(' --> CLEANUP: delete {} WF and re-add to include task_id as additional_field'.format(fw['state'])) lpad.delete_wf(s.fw_id) break else: @@ -633,7 
+632,7 @@ def add_snls(archive, add_snls_dbs): snl_collections = [lpad.db.snls] if add_snls_dbs: for add_snls_db in add_snls_dbs: - snl_db_config = yaml.load(open(add_snls_db, 'r')) + snl_db_config = yaml.safe_load(open(add_snls_db, 'r')) snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) snl_db = snl_db_conn[snl_db_config['db']] snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) @@ -655,7 +654,7 @@ def add_snls(archive, add_snls_dbs): print('Please include meta info in', meta_path) return with open(meta_path, 'r') as f: - meta = yaml.load(f) + meta = yaml.safe_load(f) meta['authors'] = [Author.parse_author(a) for a in meta['authors']] exclude = {'about.remarks': {'$ne': 'DEPRECATED'}} From 7776857eee78094147ec8c9aae9e67c48462c19b Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 28 Jun 2018 16:23:21 -0700 Subject: [PATCH 18/97] cli: add_snls working --- emmet/scripts/emmet.py | 140 +++++++++++++++++++++++++++-------------- 1 file changed, 92 insertions(+), 48 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index bd5b9ed55f..d36e266484 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -26,6 +26,25 @@ task_base_query = {'tags': {'$nin': ['DEPRECATED', 'deprecated']}, '_mpworks_meta': {'$exists': 0}} structure_keys = ['snl_id', 'lattice', 'sites', 'charge', 'about._materialsproject.task_id'] +def aggregate_by_formula(coll, q, key='reduced_cell_formula'): + query = {'$and': [q, exclude]} + query.update(base_query) + return coll.aggregate([ + {'$match': query}, {'$sort': OrderedDict([('nelements', 1), ('nsites', 1)])}, + {'$group': { + '_id': '${}'.format(key), + 'structures': {'$push': dict((k.split('.')[-1], '${}'.format(k)) for k in structure_keys)} + }} + ], allowDiskUse=True, batchSize=1) + +def get_meta_from_structure(struct): + d = {'formula_pretty': struct.composition.reduced_formula} + d['nelements'] = len(set(struct.composition.elements)) + d['nsites'] = len(struct) + d['is_ordered'] = struct.is_ordered + d['is_valid'] = struct.is_valid() + return d + @click.group() def cli(): pass @@ -47,18 +66,13 @@ def ensure_meta(snls_db): if idx and not idx%1000: print(idx, '...') struct = Structure.from_dict(doc) - d = {'formula_pretty': struct.composition.reduced_formula} - d['nelements'] = len(set(struct.composition.elements)) - d['nsites'] = len(struct) - d['is_ordered'] = struct.is_ordered - d['is_valid'] = struct.is_valid() - snl_coll.update({'snl_id': doc['snl_id']}, {'$set': d}) + snl_coll.update({'snl_id': doc['snl_id']}, {'$set': get_meta_from_structure(struct)}) ensure_indexes(['snl_id', 'formula_pretty', 'nelements', 'nsites', 'is_ordered', 'is_valid'], [snl_coll]) @cli.command() -@click.option('--target_db_file', default="target.json", help='target db file') +@click.argument('target_db_file', type=click.Path(exists=True)) @click.option('--tag', default=None, help='only insert tasks with specific tag') @click.option('--insert/--no-insert', default=False, help='actually execute task addition') def add_tasks(target_db_file, tag, insert): @@ -74,10 +88,7 @@ def get_subdir(dn): source = VaspCalcDb(lpad.host, lpad.port, lpad.name, 'tasks', lpad.username, lpad.password) print('connected to source db with', source.collection.count(), 'tasks') - if not os.path.exists(target_db_file): - print(target_db_file, 'not found!') - return - target = VaspCalcDb.from_db_file(target_db_file, admin=True) # 'db_atomate.json' + target = VaspCalcDb.from_db_file(target_db_file, admin=True) 
print('connected to target db with', target.collection.count(), 'tasks') ensure_indexes(['task_id', 'tags', 'dir_name', 'retired_task_id'], [source.collection, target.collection]) @@ -184,7 +195,6 @@ def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structure lpad = LaunchPad.auto_load() - # TODO use add_snls first, and then add_wflows based on SNL collection snl_collections = [lpad.db.snls] if add_snls_db is not None: snl_db_config = yaml.safe_load(open(add_snls_db, 'r')) @@ -300,22 +310,17 @@ def find_matching_canonical_task_structures(formula, struct, full_name): for tag, value in tags.items(): - query = {'$and': [{'about.remarks': tag}, exclude]} - query.update(base_query) if tag == 'new_ordered_icsd_2017': # TODO WIP will be removed #TODO for new_ordered_icsd_2017: docs = db.icsd.find(query, {'snl': 1, 'formula_reduced_abc': 1, 'icsd_id': 1, 'elements': 1}) print(tag, 'TODO implement db.icsd as snl_coll -> add_snls?') continue + if skip_all_scanned and not value[1]: + continue + print('aggregate', value[0], 'structures for', tag, '...') - structure_groups = value[-1].aggregate([ - {'$match': query}, {'$sort': OrderedDict([('nelements', 1), ('nsites', 1)])}, - {'$group': { - '_id': '$reduced_cell_formula', - 'structures': {'$push': dict((k.split('.')[-1], '${}'.format(k)) for k in structure_keys)} - }} - ], allowDiskUse=True, batchSize=1) + structure_groups = aggregate_by_formula(value[-1], {'about.remarks': tag}) print('loop formulas for', tag, '...') counter = Counter() @@ -350,7 +355,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): try: sgnum = get_sg(s) except Exception as ex: - s.to(fmt='json', filename='sgnum-{}.json'.format(s.snl_id)) + s.to(fmt='json', filename='sgnum_{}.json'.format(s.snl_id)) msg = 'SNL {}: {}'.format(s.snl_id, ex) print(msg) logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'tags': [tag], 'error': str(ex)}) @@ -624,10 +629,14 @@ def report(tag): @cli.command() @click.argument('archive', type=click.Path(exists=True)) @click.option('--add_snls_dbs', '-a', multiple=True, type=click.Path(exists=True), help='config files for additional SNLs collections') -def add_snls(archive, add_snls_dbs): +@click.option('--insert/--no-insert', default=False, help='actually execute SNL insertion') +def add_snls(archive, add_snls_dbs, insert): """add structures from archive of structure files (CIF, POSCAR, ...) to (local) SNLs collection""" # TODO assign task_ids to structures? + if not insert: + print('DRY RUN! Add --insert flag to actually add SNLs') + lpad = LaunchPad.auto_load() snl_collections = [lpad.db.snls] if add_snls_dbs: @@ -641,7 +650,7 @@ def add_snls(archive, add_snls_dbs): print(snl_coll.count(), 'SNLs in', snl_coll.full_name) fname, ext = os.path.splitext(os.path.basename(archive)) - tag, sec_ext = fname.rsplit('.', 1) if '.' in fname else fname, '' + tag, sec_ext = fname.rsplit('.', 1) if '.' 
in fname else [fname, ''] if sec_ext: ext = ''.join([sec_ext, ext]) exts = ['tar.gz', '.tgz'] @@ -666,7 +675,6 @@ def add_snls(archive, add_snls_dbs): continue f = tar.extractfile(member) if f: - print(member.name) contents = f.read().decode('utf-8') fname = member.name.lower() if fnmatch(fname, "*.cif*") or fnmatch(fname, "*.mcif*"): @@ -684,28 +692,64 @@ def add_snls(archive, add_snls_dbs): break #continue formula = struct.composition.reduced_formula - query = {'$and': [{'formula_pretty': formula}, exclude]} - query.update(base_query) + sg = get_sg(struct) + struct_added = False for snl_coll in snl_collections: - snl_groups = snl_coll.aggregate([ - {'$match': query}, {'$sort': OrderedDict([('nelements', 1), ('nsites', 1)])}, - {'$group': { - '_id': '$formula_pretty', - 'snls': {'$push': dict((k.split('.')[-1], '${}'.format(k)) for k in structure_keys)} - }} - ], allowDiskUse=True, batchSize=1) - return - - snls.append(StructureNL( - struct, authors, references=references.strip(), remarks=[tag] - )) - print(len(snls)) - -# snls.append(snl.as_dict()) -# if snls: -# print('add', len(snls), 'SNLs') -# result = target.db.snls.insert_many(snls) -# print('#SNLs inserted:', len(result.inserted_ids)) -# else: -# print('no SNLs to insert') + try: + group = aggregate_by_formula(snl_coll, {'formula_pretty': formula}, key='formula_pretty').next() # only one formula + except StopIteration: + continue + + structures = [] + for dct in group['structures']: + s = Structure.from_dict(dct) + s.snl_id = dct['snl_id'] + s.remove_oxidation_states() + try: + sgnum = get_sg(s) + except Exception as ex: + s.to(fmt='json', filename='sgnum_{}.json'.format(s.snl_id)) + print('SNL {}: {}'.format(s.snl_id, ex)) + continue + if sgnum == sg: + structures.append(s) + + if not structures: + continue + + canonical_structures = [] + for g in group_structures(structures): + canonical_structures.append(g[0]) + + if not canonical_structures: + continue + + for s in canonical_structures: + if structures_match(struct, s): + print('Structure from', member.name, 'already added as SNL', s.snl_id, 'in', snl_coll.full_name) + struct_added = True + break + + if struct_added: + break + + if struct_added: + continue + + print('append SNL for structure from', member.name) + snl_dct = StructureNL(struct, meta['authors'], references=meta.get('references', '').strip(), projects=[tag]).as_dict() + snl_dct.update(get_meta_from_structure(struct)) + prefix = snl_collections[0].database.name + index = max([int(snl_id[len(prefix)+1:]) for snl_id in snl_collections[0].distinct('snl_id')]) + len(snls) + 1 + snl_dct['snl_id'] = '{}-{}'.format(prefix, index) + snls.append(snl_dct) + + if snls: + print('add', len(snls), 'SNLs') + if insert: + result = snl_collections[0].insert_many(snls) + print('#SNLs inserted:', len(result.inserted_ids)) + else: + print('no SNLs to insert') + From 89c95831fc5526849d343314dba78ae29e856f1f Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 28 Jun 2018 16:28:40 -0700 Subject: [PATCH 19/97] cli: fix codacy issues --- emmet/scripts/emmet.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index d36e266484..43d5e14421 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -8,7 +8,7 @@ from pymatgen.analysis.structure_prediction.volume_predictor import DLSVolumePredictor from pymatgen import Structure from pymatgen.util.provenance import StructureNL, Author -from fireworks import LaunchPad, Workflow +from fireworks import LaunchPad from 
atomate.vasp.database import VaspCalcDb from atomate.vasp.workflows.presets.core import wf_structure_optimization from atomate.vasp.powerups import add_trackers, add_tags, add_additional_fields_to_taskdocs @@ -530,8 +530,8 @@ def find_matching_canonical_task_structures(formula, struct, full_name): wf = add_additional_fields_to_taskdocs(wf, update_dict={'task_id': struct.task_id}) #if struct.icsd_id is not None: # wf = add_additional_fields_to_taskdocs(wf, update_dict={'icsd_id': struct.icsd_id}) - except: - msg = 'Structure for SNL {} --> SKIP: Could not make workflow'.format(struct.snl_id) + except Exception as ex: + msg = 'Structure for SNL {} --> SKIP: Could not make workflow --> {}'.format(struct.snl_id, str(ex)) print(msg) logger.error(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'error': 'could not make workflow'}) continue From b7b80c7fbda9518da47361034aae2675486ad639 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 28 Jun 2018 17:55:57 -0700 Subject: [PATCH 20/97] cli: include projects in distinct tags --- emmet/scripts/emmet.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 43d5e14421..c0d24eb356 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -203,7 +203,7 @@ def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structure snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) snl_collections.append(snl_db[snl_db_config['collection']]) - ensure_indexes(['snl_id', 'reduced_cell_formula', 'about.remarks', 'sites.label', 'nsites', 'nelements'], snl_collections) + ensure_indexes(['snl_id', 'reduced_cell_formula', 'about.remarks', 'about.projects', 'sites.label', 'nsites', 'nelements'], snl_collections) for snl_coll in snl_collections: print(snl_coll.count(exclude), 'SNLs in', snl_coll.full_name) @@ -234,12 +234,12 @@ def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structure query = dict(exclude) query.update(base_query) for snl_coll in snl_collections: - remarks = filter(None, snl_coll.find(query).distinct('about.remarks')) - for t in remarks: - query = {'$and': [{'about.remarks': t}, exclude]} - query.update(base_query) + remarks_projects = snl_coll.distinct('about.projects', query) + snl_coll.distinct('about.remarks', query) + for t in set(remarks_projects): + q = {'$and': [{'$or': [{'about.remarks': t}, {'about.projects': t}]}, exclude]} + q.update(base_query) if t not in all_tags: - all_tags[t] = [snl_coll.count(query), snl_coll] + all_tags[t] = [snl_coll.count(q), snl_coll] else: print('tag -', t, '- already in', all_tags[t][-1].full_name) sorted_tags = sorted(all_tags.items(), key=lambda x: x[1][0]) @@ -248,7 +248,7 @@ def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structure if item[1][0] < max_structures and to_scan: tags[item[0]] = [item[1][0], to_scan, item[1][-1]] else: - query = {'$and': [{'about.remarks': tag}, exclude]} + query = {'$and': [{'$or': [{'about.remarks': tag}, {'about.projects': tag}]}, exclude]} query.update(base_query) for snl_coll in snl_collections: cnt = snl_coll.count(query) @@ -261,6 +261,8 @@ def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structure print('nothing to scan') return print(len(tags), 'tags to scan in source SNL collections:') + if tag is None: + print('[with < {} structures to scan]'.format(max_structures)) print('\n'.join(['{} {} ({}) --> {} TO SCAN'.format(v[2].full_name, k, v[0], v[1]) for 
k, v in tags.items()])) canonical_task_structures = {} From 7e9e6ac3b9d1c0133f22fc39a07e131da447491c Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 29 Jun 2018 16:31:46 -0700 Subject: [PATCH 21/97] more progress with cli --- emmet/scripts/emmet.py | 112 +++++++++++++++++++++++++---------------- 1 file changed, 70 insertions(+), 42 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index c0d24eb356..475bc28c05 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -25,10 +25,20 @@ base_query = {'is_ordered': True, 'is_valid': True, 'nsites': {'$lt': 200}, 'sites.label': {'$nin': no_electroneg}} task_base_query = {'tags': {'$nin': ['DEPRECATED', 'deprecated']}, '_mpworks_meta': {'$exists': 0}} structure_keys = ['snl_id', 'lattice', 'sites', 'charge', 'about._materialsproject.task_id'] +aggregation_keys = ['reduced_cell_formula', 'formula_pretty'] -def aggregate_by_formula(coll, q, key='reduced_cell_formula'): +def aggregate_by_formula(coll, q, key=None): query = {'$and': [q, exclude]} query.update(base_query) + if key is None: + for k in aggregation_keys: + q = {k: {'$exists': 1}} + q.update(base_query) + if coll.count(q): + key = k + break + if key is None: + raise ValueError('could not find aggregation keys', aggregation_keys, 'in', coll.full_name) return coll.aggregate([ {'$match': query}, {'$sort': OrderedDict([('nelements', 1), ('nsites', 1)])}, {'$group': { @@ -75,8 +85,8 @@ def ensure_meta(snls_db): @click.argument('target_db_file', type=click.Path(exists=True)) @click.option('--tag', default=None, help='only insert tasks with specific tag') @click.option('--insert/--no-insert', default=False, help='actually execute task addition') -def add_tasks(target_db_file, tag, insert): - """Retrieve tasks from source and add to target""" +def copy_tasks(target_db_file, tag, insert): + """Retrieve tasks from source and copy to target task collection""" if not insert: print('DRY RUN: add --insert flag to actually add tasks to production') @@ -180,14 +190,14 @@ def get_subdir(dn): @cli.command() -@click.option('--add_snls_db', type=click.Path(exists=True), help='config file for additional SNLs collection') -@click.option('--add_tasks_db', type=click.Path(exists=True), help='config file for additional tasks collection') +@click.option('--add_snls_dbs', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to scan') +@click.option('--add_tasks_db', type=click.Path(exists=True), help='config file for additional tasks collection to scan') @click.option('--tag', default=None, help='only include structures with specific tag') @click.option('--insert/--no-insert', default=False, help='actually execute workflow addition') -@click.option('--clear-logs/--no-clear-logs', default=False, help='clear MongoDB logs collection') -@click.option('--max-structures', default=1000, help='set max structures for tags to scan') +@click.option('--clear-logs/--no-clear-logs', default=False, help='clear MongoDB logs collection for specific tag') +@click.option('--max-structures', '-m', default=1000, help='set max structures for tags to scan') @click.option('--skip-all-scanned/--no-skip-all-scanned', default=False, help='skip all already scanned structures incl. 
WFs2Add/Errors') -def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structures, skip_all_scanned): +def add_wflows(add_snls_dbs, add_tasks_db, tag, insert, clear_logs, max_structures, skip_all_scanned): """add workflows based on tags in SNL collection""" if not insert: @@ -196,14 +206,14 @@ def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structure lpad = LaunchPad.auto_load() snl_collections = [lpad.db.snls] - if add_snls_db is not None: - snl_db_config = yaml.safe_load(open(add_snls_db, 'r')) - snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) - snl_db = snl_db_conn[snl_db_config['db']] - snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) - snl_collections.append(snl_db[snl_db_config['collection']]) - - ensure_indexes(['snl_id', 'reduced_cell_formula', 'about.remarks', 'about.projects', 'sites.label', 'nsites', 'nelements'], snl_collections) + if add_snls_dbs is not None: + for snl_db_config in yaml.load_all(open(add_snls_dbs, 'r')): + snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) + snl_db = snl_db_conn[snl_db_config['db']] + snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) + snl_collections.append(snl_db[snl_db_config['collection']]) + + ensure_indexes(['snl_id', 'reduced_cell_formula', 'formula_pretty', 'about.remarks', 'about.projects', 'sites.label', 'nsites', 'nelements'], snl_collections) for snl_coll in snl_collections: print(snl_coll.count(exclude), 'SNLs in', snl_coll.full_name) @@ -213,8 +223,8 @@ def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structure username=lpad.username, password=lpad.password, authentication_db=lpad.name, formatter=MyMongoFormatter() ) logger.addHandler(mongo_handler) - if clear_logs: - mongo_handler.collection.drop() + if clear_logs and tag is not None: + mongo_handler.collection.remove({'tags': tag}) ensure_indexes(['level', 'message', 'snl_id', 'formula', 'tags'], [mongo_handler.collection]) tasks_collections = OrderedDict() @@ -234,14 +244,22 @@ def add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structure query = dict(exclude) query.update(base_query) for snl_coll in snl_collections: - remarks_projects = snl_coll.distinct('about.projects', query) + snl_coll.distinct('about.remarks', query) - for t in set(remarks_projects): + print('collecting tags from', snl_coll.full_name, '...') + projects = snl_coll.distinct('about.projects', query) + remarks = snl_coll.distinct('about.remarks', query) + projects_remarks = projects + if len(remarks) < 100: + projects_remarks += remarks + else: + print('too many remarks in', snl_coll.full_name, '({})'.format(len(remarks))) + for t in set(projects_remarks): q = {'$and': [{'$or': [{'about.remarks': t}, {'about.projects': t}]}, exclude]} q.update(base_query) if t not in all_tags: all_tags[t] = [snl_coll.count(q), snl_coll] else: print('tag -', t, '- already in', all_tags[t][-1].full_name) + print('sort and analyze tags ...') sorted_tags = sorted(all_tags.items(), key=lambda x: x[1][0]) for item in sorted_tags: to_scan = item[1][0] - lpad.db.add_wflows_logs.count({'tags': item[0]}) @@ -313,16 +331,11 @@ def find_matching_canonical_task_structures(formula, struct, full_name): for tag, value in tags.items(): - if tag == 'new_ordered_icsd_2017': # TODO WIP will be removed - #TODO for new_ordered_icsd_2017: docs = db.icsd.find(query, {'snl': 1, 'formula_reduced_abc': 1, 'icsd_id': 1, 'elements': 
1}) - print(tag, 'TODO implement db.icsd as snl_coll -> add_snls?') - continue - if skip_all_scanned and not value[1]: continue print('aggregate', value[0], 'structures for', tag, '...') - structure_groups = aggregate_by_formula(value[-1], {'about.remarks': tag}) + structure_groups = aggregate_by_formula(value[-1], {'$or': [{'about.remarks': tag}, {'about.projects': tag}]}) print('loop formulas for', tag, '...') counter = Counter() @@ -530,8 +543,6 @@ def find_matching_canonical_task_structures(formula, struct, full_name): wf = add_tags(wf, [tag]) if struct.task_id is not None: wf = add_additional_fields_to_taskdocs(wf, update_dict={'task_id': struct.task_id}) - #if struct.icsd_id is not None: - # wf = add_additional_fields_to_taskdocs(wf, update_dict={'icsd_id': struct.icsd_id}) except Exception as ex: msg = 'Structure for SNL {} --> SKIP: Could not make workflow --> {}'.format(struct.snl_id, str(ex)) print(msg) @@ -557,6 +568,9 @@ def find_matching_canonical_task_structures(formula, struct, full_name): for x in canonical_structures_list ]) print(len(canonical_structures_list), 'canonical structure(s) for', formula, sites_elements) + if tag is not None: + print('trying again ...') + add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structures, True) print(counter) @@ -596,11 +610,16 @@ def report(tag): if tag is None: tags = [t for t in lpad.workflows.distinct('metadata.tags') if t is not None] tags += [t for t in lpad.db.add_wflows_logs.distinct('tags') if t is not None and t not in tags] + all_tags = [] + for t in tags: + all_tags.append((t, lpad.db.add_wflows_logs.count({'tags': t}))) + tags = [t[0] for t in sorted(all_tags, key=lambda x: x[1], reverse=True)] print(len(tags), 'tags in WFs and logs collections') from prettytable import PrettyTable table = PrettyTable() table.field_names = ['Tag', 'SNLs', 'WFs2Add', 'WFs'] + states + ['% FIZZLED', 'Progress'] + sums = ['total'] + [0] * (len(table.field_names)-1) for t in tags: wflows = lpad.workflows.find({'metadata.tags': t}, {'state': 1}) @@ -615,22 +634,27 @@ def report(tag): progress = '{:.0f}%'.format(progress) entry = [tc, nr_snls, wflows_to_add, total] + [counter[state] for state in states] fizzled = counter['FIZZLED'] / total if total else 0. + if progress != '-': + fizzled = counter['FIZZLED'] / counter['COMPLETED'] if counter['COMPLETED'] else 0. percent_fizzled = "\033[1;31m{:.0f}%\033[0m".format(fizzled*100.) \ if fizzled > 0.2 else '{:.0f}%'.format(fizzled*100.) entry.append(percent_fizzled) entry.append(progress) + for idx, e in enumerate(entry): + if isinstance(e, int): + sums[idx] += e if any(entry[2:-2]): table.add_row(entry) - table.sortby = 'SNLs' - table.reversesort = True + if tag is None: + table.add_row(['\033[1;32m{}\033[0m'.format(s if s else '-') for s in sums]) table.align['Tag'] = 'r' print(table) @cli.command() @click.argument('archive', type=click.Path(exists=True)) -@click.option('--add_snls_dbs', '-a', multiple=True, type=click.Path(exists=True), help='config files for additional SNLs collections') +@click.option('--add_snls_dbs', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to check against') @click.option('--insert/--no-insert', default=False, help='actually execute SNL insertion') def add_snls(archive, add_snls_dbs, insert): """add structures from archive of structure files (CIF, POSCAR, ...) 
to (local) SNLs collection""" @@ -641,9 +665,8 @@ def add_snls(archive, add_snls_dbs, insert): lpad = LaunchPad.auto_load() snl_collections = [lpad.db.snls] - if add_snls_dbs: - for add_snls_db in add_snls_dbs: - snl_db_config = yaml.safe_load(open(add_snls_db, 'r')) + if add_snls_dbs is not None: + for snl_db_config in yaml.load_all(open(add_snls_dbs, 'r')): snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) snl_db = snl_db_conn[snl_db_config['db']] snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) @@ -661,14 +684,14 @@ def add_snls(archive, add_snls_dbs, insert): return meta_path = '{}.yaml'.format(tag) + meta = None if not os.path.exists(meta_path): - print('Please include meta info in', meta_path) - return - with open(meta_path, 'r') as f: - meta = yaml.safe_load(f) - meta['authors'] = [Author.parse_author(a) for a in meta['authors']] - - exclude = {'about.remarks': {'$ne': 'DEPRECATED'}} + meta = {'authors': ['Materials Project ']} + print(meta_path, 'not found. Using', meta) + else: + with open(meta_path, 'r') as f: + meta = yaml.safe_load(f) + meta['authors'] = [Author.parse_author(a) for a in meta['authors']] snls = [] tar = tarfile.open(archive, 'r:gz') @@ -693,13 +716,18 @@ def add_snls(archive, add_snls_dbs, insert): print(ex) break #continue + if not (struct.is_ordered and struct.is_valid()): + print('Structure from', member.name, 'not ordered and valid!') + continue + formula = struct.composition.reduced_formula sg = get_sg(struct) struct_added = False for snl_coll in snl_collections: try: - group = aggregate_by_formula(snl_coll, {'formula_pretty': formula}, key='formula_pretty').next() # only one formula + q = {'$or': [{k: formula} for k in aggregation_keys]} + group = aggregate_by_formula(snl_coll, q).next() # only one formula except StopIteration: continue From d63021bb0167ec3b1318823cebdccf13fca9b77d Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 29 Jun 2018 17:20:07 -0700 Subject: [PATCH 22/97] cli.add_snls: support bson and TransformedStructure --- emmet/scripts/emmet.py | 78 +++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 31 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 475bc28c05..02c17e4f40 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,4 +1,4 @@ -import click, os, yaml, sys, logging, tarfile +import click, os, yaml, sys, logging, tarfile, bson, gzip from fnmatch import fnmatch from datetime import datetime from collections import Counter, OrderedDict @@ -7,6 +7,7 @@ from pymongo.collection import ReturnDocument from pymatgen.analysis.structure_prediction.volume_predictor import DLSVolumePredictor from pymatgen import Structure +from pymatgen.alchemy.materials import TransformedStructure from pymatgen.util.provenance import StructureNL, Author from fireworks import LaunchPad from atomate.vasp.database import VaspCalcDb @@ -678,7 +679,7 @@ def add_snls(archive, add_snls_dbs, insert): tag, sec_ext = fname.rsplit('.', 1) if '.' in fname else [fname, ''] if sec_ext: ext = ''.join([sec_ext, ext]) - exts = ['tar.gz', '.tgz'] + exts = ['tar.gz', '.tgz', 'bson.gz'] if ext not in exts: print(ext, 'not supported (yet)! 
Please use one of', exts) return @@ -693,37 +694,46 @@ def add_snls(archive, add_snls_dbs, insert): meta = yaml.safe_load(f) meta['authors'] = [Author.parse_author(a) for a in meta['authors']] - snls = [] - tar = tarfile.open(archive, 'r:gz') - for member in tar.getmembers(): - if os.path.basename(member.name).startswith('.'): - continue - f = tar.extractfile(member) - if f: - contents = f.read().decode('utf-8') - fname = member.name.lower() - if fnmatch(fname, "*.cif*") or fnmatch(fname, "*.mcif*"): - fmt = 'cif' - elif fnmatch(fname, "*.json*") or fnmatch(fname, "*.mson*"): - fmt = 'json' - else: - print('reading', fname, 'not supported (yet)') + input_structures = [] + if ext == 'bson.gz': + for idx, doc in enumerate(bson.decode_file_iter(gzip.open(archive))): + if idx and not idx%1000: + print(idx, '...') + input_structures.append(TransformedStructure.from_dict(doc['structure'])) + else: + tar = tarfile.open(archive, 'r:gz') + for member in tar.getmembers(): + if os.path.basename(member.name).startswith('.'): continue + f = tar.extractfile(member) + if f: + contents = f.read().decode('utf-8') + fname = member.name.lower() + if fnmatch(fname, "*.cif*") or fnmatch(fname, "*.mcif*"): + fmt = 'cif' + elif fnmatch(fname, "*.json*") or fnmatch(fname, "*.mson*"): + fmt = 'json' + else: + print('reading', fname, 'not supported (yet)') + continue + try: + input_structures.append(Structure.from_str(contents, fmt=fmt)) + except Exception as ex: + print(ex) + break #continue - try: - struct = Structure.from_str(contents, fmt=fmt) - except Exception as ex: - print(ex) - break #continue + print(len(input_structures), 'structure(s) loaded.') - if not (struct.is_ordered and struct.is_valid()): - print('Structure from', member.name, 'not ordered and valid!') - continue + snls = [] + for struct in input_structures: formula = struct.composition.reduced_formula sg = get_sg(struct) - struct_added = False + if not (struct.is_ordered and struct.is_valid()): + print('Structure for', formula, sg, 'not ordered and valid!') + continue + struct_added = False for snl_coll in snl_collections: try: q = {'$or': [{k: formula} for k in aggregation_keys]} @@ -757,7 +767,7 @@ def add_snls(archive, add_snls_dbs, insert): for s in canonical_structures: if structures_match(struct, s): - print('Structure from', member.name, 'already added as SNL', s.snl_id, 'in', snl_coll.full_name) + print('Structure for', formula, sg, 'already added as SNL', s.snl_id, 'in', snl_coll.full_name) struct_added = True break @@ -767,12 +777,18 @@ def add_snls(archive, add_snls_dbs, insert): if struct_added: continue - print('append SNL for structure from', member.name) - snl_dct = StructureNL(struct, meta['authors'], references=meta.get('references', '').strip(), projects=[tag]).as_dict() - snl_dct.update(get_meta_from_structure(struct)) prefix = snl_collections[0].database.name index = max([int(snl_id[len(prefix)+1:]) for snl_id in snl_collections[0].distinct('snl_id')]) + len(snls) + 1 - snl_dct['snl_id'] = '{}-{}'.format(prefix, index) + snl_id = '{}-{}'.format(prefix, index) + print('append SNL for structure with', formula, sg, 'as', snl_id) + references = meta.get('references', '').strip() + if isinstance(struct, TransformedStructure): + snl = struct.to_snl(meta['authors'], references=references, projects=[tag]) + else: + snl = StructureNL(struct, meta['authors'], references=references, projects=[tag]) + snl_dct = snl.as_dict() + snl_dct.update(get_meta_from_structure(struct)) + snl_dct['snl_id'] = snl_id snls.append(snl_dct) if snls: From 
511d1c53031e33eeadb83c1523b3e728020ff2fd Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 6 Jul 2018 10:43:54 -0700 Subject: [PATCH 23/97] cli: Trafos, bug fixes, insert_snls --- emmet/scripts/emmet.py | 159 ++++++++++++++++++++++++----------------- 1 file changed, 95 insertions(+), 64 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 02c17e4f40..43c75ad402 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -22,8 +22,8 @@ sys.exit(0) exclude = {'about.remarks': {'$nin': ['DEPRECATED', 'deprecated']}} -no_electroneg = ['He', 'He0+', 'Ar', 'Ar0+', 'Ne', 'Ne0+'] -base_query = {'is_ordered': True, 'is_valid': True, 'nsites': {'$lt': 200}, 'sites.label': {'$nin': no_electroneg}} +skip_labels = ['He', 'He0+', 'Ar', 'Ar0+', 'Ne', 'Ne0+', 'D', 'D+'] +base_query = {'is_ordered': True, 'is_valid': True, 'nsites': {'$lt': 200}, 'sites.label': {'$nin': skip_labels}} task_base_query = {'tags': {'$nin': ['DEPRECATED', 'deprecated']}, '_mpworks_meta': {'$exists': 0}} structure_keys = ['snl_id', 'lattice', 'sites', 'charge', 'about._materialsproject.task_id'] aggregation_keys = ['reduced_cell_formula', 'formula_pretty'] @@ -367,7 +367,13 @@ def find_matching_canonical_task_structures(formula, struct, full_name): s = Structure.from_dict(dct) s.snl_id = dct['snl_id'] s.task_id = dct.get('task_id') - s.remove_oxidation_states() + try: + s.remove_oxidation_states() + except Exception as ex: + msg = 'SNL {}: {}'.format(s.snl_id, ex) + print(msg) + logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'tags': [tag], 'error': str(ex)}) + continue try: sgnum = get_sg(s) except Exception as ex: @@ -571,7 +577,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): print(len(canonical_structures_list), 'canonical structure(s) for', formula, sites_elements) if tag is not None: print('trying again ...') - add_wflows(add_snls_db, add_tasks_db, tag, insert, clear_logs, max_structures, True) + add_wflows(add_snls_dbs, add_tasks_db, tag, insert, clear_logs, max_structures, True) print(counter) @@ -675,6 +681,17 @@ def add_snls(archive, add_snls_dbs, insert): for snl_coll in snl_collections: print(snl_coll.count(), 'SNLs in', snl_coll.full_name) + def insert_snls(snls_list): + if snls_list: + print('add', len(snls_list), 'SNLs') + if insert: + result = snl_collections[0].insert_many(snls_list) + print('#SNLs inserted:', len(result.inserted_ids)) + snls_list.clear() + else: + print('no SNLs to insert') + + fname, ext = os.path.splitext(os.path.basename(archive)) tag, sec_ext = fname.rsplit('.', 1) if '.' 
in fname else [fname, ''] if sec_ext: @@ -699,6 +716,9 @@ def add_snls(archive, add_snls_dbs, insert): for idx, doc in enumerate(bson.decode_file_iter(gzip.open(archive))): if idx and not idx%1000: print(idx, '...') + elements = set([specie['element'] for site in doc['structure']['sites'] for specie in site['species']]) + if any([bool(l in elements) for l in skip_labels]): + continue input_structures.append(TransformedStructure.from_dict(doc['structure'])) else: tar = tarfile.open(archive, 'r:gz') @@ -724,78 +744,89 @@ def add_snls(archive, add_snls_dbs, insert): print(len(input_structures), 'structure(s) loaded.') - snls = [] - for struct in input_structures: + snls, index = [], None + for idx, istruct in enumerate(input_structures): - formula = struct.composition.reduced_formula + struct = istruct.final_structure if isinstance(istruct, TransformedStructure) else istruct + formula = struct.composition.reduced_formula + try: sg = get_sg(struct) - if not (struct.is_ordered and struct.is_valid()): - print('Structure for', formula, sg, 'not ordered and valid!') + except Exception as ex: + struct.to(fmt='json', filename='sgnum_{}_{}.json'.format(tag, formula)) + print('Structure for {}: {}'.format(formula, ex)) + continue + if not (struct.is_ordered and struct.is_valid()): + print('Structure for', formula, sg, 'not ordered and valid!') + continue + try: + struct.remove_oxidation_states() + except Exception as ex: + print(struct.sites) + print(ex) + print('Structure for', formula, sg, 'error in remove_oxidation_states!') + sys.exit(0) #continue + + struct_added = False + for snl_coll in snl_collections: + try: + q = {'$or': [{k: formula} for k in aggregation_keys]} + group = aggregate_by_formula(snl_coll, q).next() # only one formula + except StopIteration: continue - struct_added = False - for snl_coll in snl_collections: + structures = [] + for dct in group['structures']: + s = Structure.from_dict(dct) + s.snl_id = dct['snl_id'] + s.remove_oxidation_states() try: - q = {'$or': [{k: formula} for k in aggregation_keys]} - group = aggregate_by_formula(snl_coll, q).next() # only one formula - except StopIteration: - continue - - structures = [] - for dct in group['structures']: - s = Structure.from_dict(dct) - s.snl_id = dct['snl_id'] - s.remove_oxidation_states() - try: - sgnum = get_sg(s) - except Exception as ex: - s.to(fmt='json', filename='sgnum_{}.json'.format(s.snl_id)) - print('SNL {}: {}'.format(s.snl_id, ex)) - continue - if sgnum == sg: - structures.append(s) - - if not structures: + sgnum = get_sg(s) + except Exception as ex: + s.to(fmt='json', filename='sgnum_{}.json'.format(s.snl_id)) + print('SNL {}: {}'.format(s.snl_id, ex)) continue + if sgnum == sg: + structures.append(s) - canonical_structures = [] - for g in group_structures(structures): - canonical_structures.append(g[0]) + if not structures: + continue - if not canonical_structures: - continue + canonical_structures = [] + for g in group_structures(structures): + canonical_structures.append(g[0]) - for s in canonical_structures: - if structures_match(struct, s): - print('Structure for', formula, sg, 'already added as SNL', s.snl_id, 'in', snl_coll.full_name) - struct_added = True - break + if not canonical_structures: + continue - if struct_added: + for s in canonical_structures: + if structures_match(struct, s): + print('Structure for', formula, sg, 'already added as SNL', s.snl_id, 'in', snl_coll.full_name) + struct_added = True break if struct_added: - continue + break - prefix = snl_collections[0].database.name - index = 
max([int(snl_id[len(prefix)+1:]) for snl_id in snl_collections[0].distinct('snl_id')]) + len(snls) + 1 - snl_id = '{}-{}'.format(prefix, index) - print('append SNL for structure with', formula, sg, 'as', snl_id) - references = meta.get('references', '').strip() - if isinstance(struct, TransformedStructure): - snl = struct.to_snl(meta['authors'], references=references, projects=[tag]) - else: - snl = StructureNL(struct, meta['authors'], references=references, projects=[tag]) - snl_dct = snl.as_dict() - snl_dct.update(get_meta_from_structure(struct)) - snl_dct['snl_id'] = snl_id - snls.append(snl_dct) - - if snls: - print('add', len(snls), 'SNLs') - if insert: - result = snl_collections[0].insert_many(snls) - print('#SNLs inserted:', len(result.inserted_ids)) - else: - print('no SNLs to insert') + if struct_added: + continue + + prefix = snl_collections[0].database.name + if index is None: + index = max([int(snl_id[len(prefix)+1:]) for snl_id in snl_collections[0].distinct('snl_id')]) + 1 + else: + index += 1 + snl_id = '{}-{}'.format(prefix, index) + print('append SNL for structure with', formula, sg, 'as', snl_id) + references = meta.get('references', '').strip() + if isinstance(istruct, TransformedStructure): + snl = istruct.to_snl(meta['authors'], references=references, projects=[tag]) + else: + snl = StructureNL(istruct, meta['authors'], references=references, projects=[tag]) + snl_dct = snl.as_dict() + snl_dct.update(get_meta_from_structure(struct)) + snl_dct['snl_id'] = snl_id + snls.append(snl_dct) + + if idx and not idx%100 or idx == len(input_structures)-1: + insert_snls(snls) From 0a97d882b84506a2fd32a1c9622a99375dd8ade2 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 6 Jul 2018 14:43:36 -0700 Subject: [PATCH 24/97] cli: include tags from all collections --- emmet/scripts/emmet.py | 482 ++++++++++++++++++++--------------------- 1 file changed, 241 insertions(+), 241 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 43c75ad402..000af25502 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -257,24 +257,22 @@ def add_wflows(add_snls_dbs, add_tasks_db, tag, insert, clear_logs, max_structur q = {'$and': [{'$or': [{'about.remarks': t}, {'about.projects': t}]}, exclude]} q.update(base_query) if t not in all_tags: - all_tags[t] = [snl_coll.count(q), snl_coll] - else: - print('tag -', t, '- already in', all_tags[t][-1].full_name) + all_tags[t] = [] + all_tags[t].append([snl_coll.count(q), snl_coll]) print('sort and analyze tags ...') - sorted_tags = sorted(all_tags.items(), key=lambda x: x[1][0]) + sorted_tags = sorted(all_tags.items(), key=lambda x: x[1][0][0]) for item in sorted_tags: - to_scan = item[1][0] - lpad.db.add_wflows_logs.count({'tags': item[0]}) - if item[1][0] < max_structures and to_scan: - tags[item[0]] = [item[1][0], to_scan, item[1][-1]] + total = sum([x[0] for x in item[1]]) + to_scan = total - lpad.db.add_wflows_logs.count({'tags': item[0]}) + if total < max_structures and to_scan: + tags[item[0]] = [total, to_scan, [x[-1] for x in item[1]]] else: query = {'$and': [{'$or': [{'about.remarks': tag}, {'about.projects': tag}]}, exclude]} query.update(base_query) - for snl_coll in snl_collections: - cnt = snl_coll.count(query) - if cnt: - to_scan = cnt - lpad.db.add_wflows_logs.count({'tags': tag}) - tags[tag] = [cnt, to_scan, snl_coll] - break + total = sum([snl_coll.count(query) for snl_coll in snl_collections]) + if total: + to_scan = total - lpad.db.add_wflows_logs.count({'tags': tag}) + tags[tag] = [total, to_scan, 
snl_collections] if not tags: print('nothing to scan') @@ -282,7 +280,7 @@ def add_wflows(add_snls_dbs, add_tasks_db, tag, insert, clear_logs, max_structur print(len(tags), 'tags to scan in source SNL collections:') if tag is None: print('[with < {} structures to scan]'.format(max_structures)) - print('\n'.join(['{} {} ({}) --> {} TO SCAN'.format(v[2].full_name, k, v[0], v[1]) for k, v in tags.items()])) + print('\n'.join(['{} ({}) --> {} TO SCAN'.format(k, v[0], v[1]) for k, v in tags.items()])) canonical_task_structures = {} grouped_workflow_structures = {} @@ -335,251 +333,253 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if skip_all_scanned and not value[1]: continue - print('aggregate', value[0], 'structures for', tag, '...') - structure_groups = aggregate_by_formula(value[-1], {'$or': [{'about.remarks': tag}, {'about.projects': tag}]}) - - print('loop formulas for', tag, '...') - counter = Counter() - structures, canonical_structures = {}, {} + print(value[0], 'structures for', tag, '...') + for coll in value[-1]: + print('aggregate structures in', coll.full_name, '...') + structure_groups = aggregate_by_formula(coll, {'$or': [{'about.remarks': tag}, {'about.projects': tag}]}) - try: - for idx_group, group in enumerate(structure_groups): - - counter['formulas'] += 1 - formula = group['_id'] - if formula not in structures: - structures[formula] = {} - if formula not in canonical_structures: - canonical_structures[formula] = {} - if idx_group and not idx_group%1000: - print(idx_group, '...') - - for dct in group['structures']: - q = {'level': 'WARNING', 'formula': formula, 'snl_id': dct['snl_id']} - if mongo_handler.collection.find_one(q): - lpad.db.add_wflows_logs.update(q, {'$addToSet': {'tags': tag}}) - continue # already checked - q['level'] = 'ERROR' - if skip_all_scanned and mongo_handler.collection.find_one(q): - continue - mongo_handler.collection.remove(q) # avoid dups - counter['structures'] += 1 - s = Structure.from_dict(dct) - s.snl_id = dct['snl_id'] - s.task_id = dct.get('task_id') - try: - s.remove_oxidation_states() - except Exception as ex: - msg = 'SNL {}: {}'.format(s.snl_id, ex) - print(msg) - logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'tags': [tag], 'error': str(ex)}) - continue - try: - sgnum = get_sg(s) - except Exception as ex: - s.to(fmt='json', filename='sgnum_{}.json'.format(s.snl_id)) - msg = 'SNL {}: {}'.format(s.snl_id, ex) - print(msg) - logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'tags': [tag], 'error': str(ex)}) - continue - if sgnum not in structures[formula]: - structures[formula][sgnum] = [] - structures[formula][sgnum].append(s) - - for sgnum, slist in structures[formula].items(): - for g in group_structures(slist): - if sgnum not in canonical_structures[formula]: - canonical_structures[formula][sgnum] = [] - canonical_structures[formula][sgnum].append(g[0]) - if len(g) > 1: - for s in g[1:]: - logger.warning('duplicate structure', extra={ - 'formula': formula, 'snl_id': s.snl_id, 'tags': [tag], 'canonical_snl_id': g[0].snl_id - }) + print('loop formulas for', tag, '...') + counter = Counter() + structures, canonical_structures = {}, {} - if not canonical_structures[formula]: - continue - canonical_structures_list = [x for sublist in canonical_structures[formula].values() for x in sublist] - - if formula not in canonical_workflow_structures: - canonical_workflow_structures[formula], grouped_workflow_structures[formula] = {}, {} - workflows = 
lpad.workflows.find({'metadata.formula_pretty': formula}, {'metadata.structure': 1, 'nodes': 1, 'parent_links': 1}) - if workflows.count() > 0: - workflow_structures = {} - for wf in workflows: - s = Structure.from_dict(wf['metadata']['structure']) + try: + for idx_group, group in enumerate(structure_groups): + + counter['formulas'] += 1 + formula = group['_id'] + if formula not in structures: + structures[formula] = {} + if formula not in canonical_structures: + canonical_structures[formula] = {} + if idx_group and not idx_group%1000: + print(idx_group, '...') + + for dct in group['structures']: + q = {'level': 'WARNING', 'formula': formula, 'snl_id': dct['snl_id']} + if mongo_handler.collection.find_one(q): + lpad.db.add_wflows_logs.update(q, {'$addToSet': {'tags': tag}}) + continue # already checked + q['level'] = 'ERROR' + if skip_all_scanned and mongo_handler.collection.find_one(q): + continue + mongo_handler.collection.remove(q) # avoid dups + counter['structures'] += 1 + s = Structure.from_dict(dct) + s.snl_id = dct['snl_id'] + s.task_id = dct.get('task_id') + try: s.remove_oxidation_states() - sgnum = get_sg(s) - if sgnum in canonical_structures[formula]: - if sgnum not in workflow_structures: - workflow_structures[sgnum] = [] - s.fw_id = [n for n in wf['nodes'] if str(n) not in wf['parent_links']][0] # first node = SO firework - workflow_structures[sgnum].append(s) - if workflow_structures: - for sgnum, slist in workflow_structures.items(): - grouped_workflow_structures[formula][sgnum] = [g for g in group_structures(slist)] - canonical_workflow_structures[formula][sgnum] = [g[0] for g in grouped_workflow_structures[formula][sgnum]] - #print(sum([len(x) for x in canonical_workflow_structures[formula].values()]), 'canonical workflow structure(s) for', formula) - - for idx_canonical, (sgnum, slist) in enumerate(canonical_structures[formula].items()): - - for struc in slist: - + except Exception as ex: + msg = 'SNL {}: {}'.format(s.snl_id, ex) + print(msg) + logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'tags': [tag], 'error': str(ex)}) + continue try: - struct = vp.get_predicted_structure(struc) - struct.snl_id, struct.task_id = struc.snl_id, struc.task_id + sgnum = get_sg(s) except Exception as ex: - print('Structure for SNL', struc.snl_id, '--> VP error: use original structure!') - print(ex) - struct = struc - - if not structures_match(struct, struc): - print('Structure for SNL', struc.snl_id, '--> VP mismatch: use original structure!') - struct = struc - - wf_found = False - if sgnum in canonical_workflow_structures[formula] and canonical_workflow_structures[formula][sgnum]: - for sidx, s in enumerate(canonical_workflow_structures[formula][sgnum]): - if structures_match(struct, s): - msg = 'Structure for SNL {} already added in WF {}'.format(struct.snl_id, s.fw_id) - print(msg) - if struct.task_id is not None: - task_query = {'task_id': struct.task_id} - task_query.update(task_base_query) - for full_name in reversed(tasks_collections): - task = tasks_collections[full_name].find_one(task_query, ['input.structure']) + s.to(fmt='json', filename='sgnum_{}.json'.format(s.snl_id)) + msg = 'SNL {}: {}'.format(s.snl_id, ex) + print(msg) + logger.error(msg, extra={'formula': formula, 'snl_id': s.snl_id, 'tags': [tag], 'error': str(ex)}) + continue + if sgnum not in structures[formula]: + structures[formula][sgnum] = [] + structures[formula][sgnum].append(s) + + for sgnum, slist in structures[formula].items(): + for g in group_structures(slist): + if sgnum not in 
canonical_structures[formula]: + canonical_structures[formula][sgnum] = [] + canonical_structures[formula][sgnum].append(g[0]) + if len(g) > 1: + for s in g[1:]: + logger.warning('duplicate structure', extra={ + 'formula': formula, 'snl_id': s.snl_id, 'tags': [tag], 'canonical_snl_id': g[0].snl_id + }) + + if not canonical_structures[formula]: + continue + canonical_structures_list = [x for sublist in canonical_structures[formula].values() for x in sublist] + + if formula not in canonical_workflow_structures: + canonical_workflow_structures[formula], grouped_workflow_structures[formula] = {}, {} + workflows = lpad.workflows.find({'metadata.formula_pretty': formula}, {'metadata.structure': 1, 'nodes': 1, 'parent_links': 1}) + if workflows.count() > 0: + workflow_structures = {} + for wf in workflows: + s = Structure.from_dict(wf['metadata']['structure']) + s.remove_oxidation_states() + sgnum = get_sg(s) + if sgnum in canonical_structures[formula]: + if sgnum not in workflow_structures: + workflow_structures[sgnum] = [] + s.fw_id = [n for n in wf['nodes'] if str(n) not in wf['parent_links']][0] # first node = SO firework + workflow_structures[sgnum].append(s) + if workflow_structures: + for sgnum, slist in workflow_structures.items(): + grouped_workflow_structures[formula][sgnum] = [g for g in group_structures(slist)] + canonical_workflow_structures[formula][sgnum] = [g[0] for g in grouped_workflow_structures[formula][sgnum]] + #print(sum([len(x) for x in canonical_workflow_structures[formula].values()]), 'canonical workflow structure(s) for', formula) + + for idx_canonical, (sgnum, slist) in enumerate(canonical_structures[formula].items()): + + for struc in slist: + + try: + struct = vp.get_predicted_structure(struc) + struct.snl_id, struct.task_id = struc.snl_id, struc.task_id + except Exception as ex: + print('Structure for SNL', struc.snl_id, '--> VP error: use original structure!') + print(ex) + struct = struc + + if not structures_match(struct, struc): + print('Structure for SNL', struc.snl_id, '--> VP mismatch: use original structure!') + struct = struc + + wf_found = False + if sgnum in canonical_workflow_structures[formula] and canonical_workflow_structures[formula][sgnum]: + for sidx, s in enumerate(canonical_workflow_structures[formula][sgnum]): + if structures_match(struct, s): + msg = 'Structure for SNL {} already added in WF {}'.format(struct.snl_id, s.fw_id) + print(msg) + if struct.task_id is not None: + task_query = {'task_id': struct.task_id} + task_query.update(task_base_query) + for full_name in reversed(tasks_collections): + task = tasks_collections[full_name].find_one(task_query, ['input.structure']) + if task: + break if task: - break - if task: - s_task = Structure.from_dict(task['input']['structure']) - s_task.remove_oxidation_states() - if not structures_match(struct, s_task): - msg = ' --> ERROR: Structure for SNL {} does not match {}'.format(struct.snl_id, struct.task_id) - msg += ' --> CLEANUP: remove task_id from SNL' - print(msg) - value[-1].update({'snl_id': struct.snl_id}, {'$unset': {'about._materialsproject.task_id': 1}}) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id, 'tags': [tag]}) - counter['snl-task_mismatch'] += 1 - else: - msg = ' --> OK: workflow resulted in matching task {}'.format(struct.task_id) - print(msg) - logger.warning(msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id, 'tags': [tag] - }) - else: - print(' --> did not find task', struct.task_id, 'for WF', 
s.fw_id) - fw_ids = [x.fw_id for x in grouped_workflow_structures[formula][sgnum][sidx]] - fws = lpad.fireworks.find({'fw_id': {'$in': fw_ids}}, ['fw_id', 'spec._tasks']) - fw_found = False - for fw in fws: - if fw['spec']['_tasks'][5]['additional_fields'].get('task_id') == struct.task_id: - msg = ' --> OK: workflow {} will result in intended task-id {}'.format(fw['fw_id'], struct.task_id) + s_task = Structure.from_dict(task['input']['structure']) + s_task.remove_oxidation_states() + if not structures_match(struct, s_task): + msg = ' --> ERROR: Structure for SNL {} does not match {}'.format(struct.snl_id, struct.task_id) + msg += ' --> CLEANUP: remove task_id from SNL' print(msg) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id, 'tags': [tag]}) - fw_found = True - break - if not fw_found: - print(' --> no WF with enforced task-id', struct.task_id) - fw = lpad.fireworks.find_one({'fw_id': s.fw_id}, {'state': 1}) - print(' -->', s.fw_id, fw['state']) - if fw['state'] == 'COMPLETED': - # the task is in lpad.db.tasks with different integer task_id - # => find task => overwrite task_id => add_tasks will pick it up - full_name = list(tasks_collections.keys())[0] - load_canonical_task_structures(formula, full_name) - matched_task_ids = find_matching_canonical_task_structures(formula, struct, full_name) - if len(matched_task_ids) == 1: - tasks_collections[full_name].update( - {'task_id': matched_task_ids[0]}, { - '$set': {'task_id': struct.task_id, 'retired_task_id': matched_task_ids[0], 'last_updated': datetime.utcnow()}, - '$addToSet': {'tags': tag} - } - ) - print(' --> replaced task_id', matched_task_ids[0], 'with', struct.task_id, 'in', full_name) - elif matched_task_ids: - msg = ' --> ERROR: multiple tasks {} for completed WF {}'.format(matched_task_ids, s.fw_id) + coll.update({'snl_id': struct.snl_id}, {'$unset': {'about._materialsproject.task_id': 1}}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id, 'tags': [tag]}) + counter['snl-task_mismatch'] += 1 + else: + msg = ' --> OK: workflow resulted in matching task {}'.format(struct.task_id) + print(msg) + logger.warning(msg, extra={ + 'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id, 'tags': [tag] + }) + else: + print(' --> did not find task', struct.task_id, 'for WF', s.fw_id) + fw_ids = [x.fw_id for x in grouped_workflow_structures[formula][sgnum][sidx]] + fws = lpad.fireworks.find({'fw_id': {'$in': fw_ids}}, ['fw_id', 'spec._tasks']) + fw_found = False + for fw in fws: + if fw['spec']['_tasks'][5]['additional_fields'].get('task_id') == struct.task_id: + msg = ' --> OK: workflow {} will result in intended task-id {}'.format(fw['fw_id'], struct.task_id) print(msg) - logger.error(msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'error': 'Multiple tasks for Completed WF' - }) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id, 'tags': [tag]}) + fw_found = True + break + if not fw_found: + print(' --> no WF with enforced task-id', struct.task_id) + fw = lpad.fireworks.find_one({'fw_id': s.fw_id}, {'state': 1}) + print(' -->', s.fw_id, fw['state']) + if fw['state'] == 'COMPLETED': + # the task is in lpad.db.tasks with different integer task_id + # => find task => overwrite task_id => add_tasks will pick it up + full_name = list(tasks_collections.keys())[0] + load_canonical_task_structures(formula, full_name) + matched_task_ids = 
find_matching_canonical_task_structures(formula, struct, full_name) + if len(matched_task_ids) == 1: + tasks_collections[full_name].update( + {'task_id': matched_task_ids[0]}, { + '$set': {'task_id': struct.task_id, 'retired_task_id': matched_task_ids[0], 'last_updated': datetime.utcnow()}, + '$addToSet': {'tags': tag} + } + ) + print(' --> replaced task_id', matched_task_ids[0], 'with', struct.task_id, 'in', full_name) + elif matched_task_ids: + msg = ' --> ERROR: multiple tasks {} for completed WF {}'.format(matched_task_ids, s.fw_id) + print(msg) + logger.error(msg, extra={ + 'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'error': 'Multiple tasks for Completed WF' + }) + else: + msg = ' --> ERROR: task for completed WF {} does not exist!'.format(s.fw_id) + msg += ' --> CLEANUP: delete {} WF and re-add/run to enforce task-id {}'.format(fw['state'], struct.task_id) + print(msg) + lpad.delete_wf(s.fw_id) + break else: - msg = ' --> ERROR: task for completed WF {} does not exist!'.format(s.fw_id) - msg += ' --> CLEANUP: delete {} WF and re-add/run to enforce task-id {}'.format(fw['state'], struct.task_id) - print(msg) + print(' --> CLEANUP: delete {} WF and re-add to include task_id as additional_field'.format(fw['state'])) lpad.delete_wf(s.fw_id) break - else: - print(' --> CLEANUP: delete {} WF and re-add to include task_id as additional_field'.format(fw['state'])) - lpad.delete_wf(s.fw_id) - break - else: - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id, 'tags': [tag]}) - wf_found = True + else: + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'fw_id': s.fw_id, 'tags': [tag]}) + wf_found = True + break + + if wf_found: + continue + + # need to check tasks b/c not every task is guaranteed to have a workflow (e.g. VASP dir parsing) + msg, matched_task_ids = '', OrderedDict() + for full_name in reversed(tasks_collections): + load_canonical_task_structures(formula, full_name) + matched_task_ids[full_name] = find_matching_canonical_task_structures(formula, struct, full_name) + if struct.task_id is not None and matched_task_ids[full_name] and struct.task_id not in matched_task_ids[full_name]: + msg = ' --> WARNING: task {} not in {}'.format(struct.task_id, matched_task_ids[full_name]) + print(msg) + if matched_task_ids[full_name]: break + if any(matched_task_ids.values()): + logger.warning('matched task ids' + msg, extra={ + 'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], + 'task_id(s)': dict((k.replace('.', '#'), v) for k, v in matched_task_ids.items()) + }) + continue - if wf_found: - continue - - # need to check tasks b/c not every task is guaranteed to have a workflow (e.g. 
VASP dir parsing) - msg, matched_task_ids = '', OrderedDict() - for full_name in reversed(tasks_collections): - load_canonical_task_structures(formula, full_name) - matched_task_ids[full_name] = find_matching_canonical_task_structures(formula, struct, full_name) - if struct.task_id is not None and matched_task_ids[full_name] and struct.task_id not in matched_task_ids[full_name]: - msg = ' --> WARNING: task {} not in {}'.format(struct.task_id, matched_task_ids[full_name]) + no_potcars = set(NO_POTCARS) & set(struct.composition.elements) + if len(no_potcars) > 0: + msg = 'Structure for SNL {} --> NO POTCARS: {}'.format(struct.snl_id, no_potcars) print(msg) - if matched_task_ids[full_name]: - break - if any(matched_task_ids.values()): - logger.warning('matched task ids' + msg, extra={ - 'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], - 'task_id(s)': dict((k.replace('.', '#'), v) for k, v in matched_task_ids.items()) - }) - continue - - no_potcars = set(NO_POTCARS) & set(struct.composition.elements) - if len(no_potcars) > 0: - msg = 'Structure for SNL {} --> NO POTCARS: {}'.format(struct.snl_id, no_potcars) - print(msg) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'error': no_potcars}) - continue + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'error': no_potcars}) + continue + + try: + wf = wf_structure_optimization(struct, c={'ADD_MODIFY_INCAR': True}) + wf = add_trackers(wf) + wf = add_tags(wf, [tag]) + if struct.task_id is not None: + wf = add_additional_fields_to_taskdocs(wf, update_dict={'task_id': struct.task_id}) + except Exception as ex: + msg = 'Structure for SNL {} --> SKIP: Could not make workflow --> {}'.format(struct.snl_id, str(ex)) + print(msg) + logger.error(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'error': 'could not make workflow'}) + continue - try: - wf = wf_structure_optimization(struct, c={'ADD_MODIFY_INCAR': True}) - wf = add_trackers(wf) - wf = add_tags(wf, [tag]) + msg = 'Structure for SNL {} --> ADD WORKFLOW'.format(struct.snl_id) if struct.task_id is not None: - wf = add_additional_fields_to_taskdocs(wf, update_dict={'task_id': struct.task_id}) - except Exception as ex: - msg = 'Structure for SNL {} --> SKIP: Could not make workflow --> {}'.format(struct.snl_id, str(ex)) + msg += ' --> enforcing task-id {}'.format(struct.task_id) print(msg) - logger.error(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'error': 'could not make workflow'}) - continue - - msg = 'Structure for SNL {} --> ADD WORKFLOW'.format(struct.snl_id) - if struct.task_id is not None: - msg += ' --> enforcing task-id {}'.format(struct.task_id) - print(msg) - - if insert: - old_new = lpad.add_wf(wf) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'fw_id': list(old_new.values())[0]}) - else: - logger.error(msg + ' --> DRY RUN', extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag]}) - counter['add(ed)'] += 1 - except CursorNotFound as ex: - print(ex) - sites_elements = set([ - (len(set([e.symbol for e in x.composition.elements])), x.num_sites) - for x in canonical_structures_list - ]) - print(len(canonical_structures_list), 'canonical structure(s) for', formula, sites_elements) - if tag is not None: - print('trying again ...') - add_wflows(add_snls_dbs, add_tasks_db, tag, insert, clear_logs, max_structures, True) - - print(counter) + if insert: + old_new = lpad.add_wf(wf) + logger.warning(msg, 
extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag], 'fw_id': list(old_new.values())[0]}) + else: + logger.error(msg + ' --> DRY RUN', extra={'formula': formula, 'snl_id': struct.snl_id, 'tags': [tag]}) + counter['add(ed)'] += 1 + + except CursorNotFound as ex: + print(ex) + sites_elements = set([ + (len(set([e.symbol for e in x.composition.elements])), x.num_sites) + for x in canonical_structures_list + ]) + print(len(canonical_structures_list), 'canonical structure(s) for', formula, sites_elements) + if tag is not None: + print('trying again ...') + add_wflows(add_snls_dbs, add_tasks_db, tag, insert, clear_logs, max_structures, True) + + print(counter) def structures_match(s1, s2): From 1d40b10838912b32e726bc62f9dc413297b81214 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 6 Jul 2018 16:41:32 -0700 Subject: [PATCH 25/97] cli: multiple collections for tag flag --- emmet/scripts/emmet.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 000af25502..b9e2e52dec 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -269,10 +269,11 @@ def add_wflows(add_snls_dbs, add_tasks_db, tag, insert, clear_logs, max_structur else: query = {'$and': [{'$or': [{'about.remarks': tag}, {'about.projects': tag}]}, exclude]} query.update(base_query) - total = sum([snl_coll.count(query) for snl_coll in snl_collections]) + cnts = [snl_coll.count(query) for snl_coll in snl_collections] + total = sum(cnts) if total: to_scan = total - lpad.db.add_wflows_logs.count({'tags': tag}) - tags[tag] = [total, to_scan, snl_collections] + tags[tag] = [total, to_scan, [snl_coll for idx, snl_coll in enumerate(snl_collections) if cnts[idx]]] if not tags: print('nothing to scan') From fee49e6983347f814f05769cd7a47eb77225ae2c Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 10 Jul 2018 10:27:23 -0700 Subject: [PATCH 26/97] cli: exclude query bugfix --- emmet/scripts/emmet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index b9e2e52dec..beb4bcd783 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -106,13 +106,13 @@ def get_subdir(dn): tags = [tag] if tag is None: - tags = [t for t in source.collection.find(exclude).distinct('tags') if t is not None] + tags = [t for t in source.collection.find(task_base_query).distinct('tags') if t is not None] print(len(tags), 'tags in source collection') for t in tags: print('### {} ###'.format(t)) - query = {'$and': [{'tags': t}, exclude]} + query = {'$and': [{'tags': t}, task_base_query]} source_count = source.collection.count(query) print('source / target:', source_count, '/', target.collection.count(query)) From 410034f2efc07bc746d26a73964a21b87e689cd1 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 17 Jul 2018 13:48:11 -0700 Subject: [PATCH 27/97] cli: skip_all_scanned fix, fizzled rate --- emmet/scripts/emmet.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index beb4bcd783..5be0bff2f0 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -263,7 +263,10 @@ def add_wflows(add_snls_dbs, add_tasks_db, tag, insert, clear_logs, max_structur sorted_tags = sorted(all_tags.items(), key=lambda x: x[1][0][0]) for item in sorted_tags: total = sum([x[0] for x in item[1]]) - to_scan = total - lpad.db.add_wflows_logs.count({'tags': item[0]}) + q = {'tags': item[0]} + if not skip_all_scanned: + 
q['level'] = 'WARNING' + to_scan = total - lpad.db.add_wflows_logs.count(q) if total < max_structures and to_scan: tags[item[0]] = [total, to_scan, [x[-1] for x in item[1]]] else: @@ -272,7 +275,10 @@ def add_wflows(add_snls_dbs, add_tasks_db, tag, insert, clear_logs, max_structur cnts = [snl_coll.count(query) for snl_coll in snl_collections] total = sum(cnts) if total: - to_scan = total - lpad.db.add_wflows_logs.count({'tags': tag}) + q = {'tags': tag} + if not skip_all_scanned: + q['level'] = 'WARNING' + to_scan = total - lpad.db.add_wflows_logs.count(q) tags[tag] = [total, to_scan, [snl_coll for idx, snl_coll in enumerate(snl_collections) if cnts[idx]]] if not tags: @@ -362,6 +368,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): continue # already checked q['level'] = 'ERROR' if skip_all_scanned and mongo_handler.collection.find_one(q): + lpad.db.add_wflows_logs.update(q, {'$addToSet': {'tags': tag}}) continue mongo_handler.collection.remove(q) # avoid dups counter['structures'] += 1 @@ -642,8 +649,8 @@ def report(tag): progress = '{:.0f}%'.format(progress) entry = [tc, nr_snls, wflows_to_add, total] + [counter[state] for state in states] fizzled = counter['FIZZLED'] / total if total else 0. - if progress != '-': - fizzled = counter['FIZZLED'] / counter['COMPLETED'] if counter['COMPLETED'] else 0. + if progress != '-' and bool(counter['COMPLETED'] + counter['FIZZLED']): + fizzled = counter['FIZZLED'] / (counter['COMPLETED'] + counter['FIZZLED']) percent_fizzled = "\033[1;31m{:.0f}%\033[0m".format(fizzled*100.) \ if fizzled > 0.2 else '{:.0f}%'.format(fizzled*100.) entry.append(percent_fizzled) From 4c035c3bbdd78bc8bbf4656825e5b2b44f1ea3f7 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 15 Aug 2018 10:43:55 -0700 Subject: [PATCH 28/97] cli: insert continue, in-progress flag --- emmet/scripts/emmet.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 5be0bff2f0..8ac85e7129 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -141,6 +141,8 @@ def get_subdir(dn): continue print(len(subdirs), 'candidate tasks to insert') + if not insert: + continue for subdir_doc in subdirs: subdir_query = {'dir_name': {'$regex': '/{}$'.format(subdir_doc['subdir'])}} @@ -615,7 +617,8 @@ def format(self, record): @cli.command() @click.option('--tag', default=None, help='only include structures with specific tag') -def report(tag): +@click.option('--in-progress/--no-in-progress', default=False, help='show in-progress only') +def report(tag, in_progress): """generate a report of calculations status""" lpad = LaunchPad.auto_load() @@ -647,6 +650,8 @@ def report(tag): tc = "\033[1;34m{}\033[0m".format(t) progress = (counter['COMPLETED'] + counter['FIZZLED']) / total * 100. if total else 0. progress = '{:.0f}%'.format(progress) + elif in_progress: + continue entry = [tc, nr_snls, wflows_to_add, total] + [counter[state] for state in states] fizzled = counter['FIZZLED'] / total if total else 0. 
if progress != '-' and bool(counter['COMPLETED'] + counter['FIZZLED']): From 9db0478ee555fc1640f0f5a41197a4ef9b9f8973 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 15 Aug 2018 10:46:04 -0700 Subject: [PATCH 29/97] add data maintenance scripts --- emmet/scripts/garden_to_hpss.sh | 12 +++++ emmet/scripts/hpss_to_mpdrive.sh | 77 ++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) create mode 100755 emmet/scripts/garden_to_hpss.sh create mode 100755 emmet/scripts/hpss_to_mpdrive.sh diff --git a/emmet/scripts/garden_to_hpss.sh b/emmet/scripts/garden_to_hpss.sh new file mode 100755 index 0000000000..c139184e15 --- /dev/null +++ b/emmet/scripts/garden_to_hpss.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +for block_dir in `find $1 -maxdepth 1 -type d -name "block_*"`; do + echo $block_dir + subdir=`basename $block_dir` + if [ ! -e ${subdir}.tar.gz ]; then + tar -czvf ${subdir}.tar.gz ${block_dir} + fi + hsi cput ${subdir}.tar.gz : garden/${subdir}.tar.gz + [[ $? -ne 0 ]] && echo "not removing ${block_dir}" && continue + rm -rv $block_dir && rm -v ${subdir}.tar.gz +done diff --git a/emmet/scripts/hpss_to_mpdrive.sh b/emmet/scripts/hpss_to_mpdrive.sh new file mode 100755 index 0000000000..7f6a6da177 --- /dev/null +++ b/emmet/scripts/hpss_to_mpdrive.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +# $(find $dir -name 'INCAR.orig*' -printf '%h ') +dirs=`awk -F/ '{print $1}' $1 | sort -u` +hpss_missing="blocks_missing_in_hpss.txt" + +stage_dir="rclone_to_mp_drive" +[[ ! -d $stage_dir ]] && mkdir $stage_dir +[[ ! -e $hpss_missing ]] && touch $hpss_missing + +for dir in $dirs; do + [[ ! -e ${dir}.tar.gz ]] && echo "skip ${dir}" && continue # TODO remove + + files=`grep "^$dir" $1` + extract="${dir}.extract" + grep -q "$dir" $hpss_missing + [[ $? -eq 0 ]] && continue + + [[ -d $stage_dir/$dir ]] && rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir + + missing_paths="${dir}.paths" + echo $files | tr ' ' '\n' | sort -u > ${dir}.files + rclone lsf -R --files-only mp-drive:calculations/garden/$dir | sed "s:^:$dir/:g" | sed 's:.tar.gz::g' | sort -u > ${dir}.rclone_lsf + for f in $(comm --check-order -23 ${dir}.files ${dir}.rclone_lsf); do # launch dirs missing in mp-drive + launch_dir_tar="${stage_dir}/${f}.tar.gz" + if [[ ! -f $launch_dir_tar || ! -s $launch_dir_tar ]]; then + echo $f >> $missing_paths + elif [ -d $f ]; then + rm -rv $f + fi + done + + for f in $(comm --check-order -12 ${dir}.files ${dir}.rclone_lsf | tr '\n' ' '); do # already cloned launch dirs -> cleanup + launch_dir_tar="${stage_dir}/${f}.tar.gz" + [[ -d $f ]] && rm -rv $f + [[ -e $launch_dir_tar ]] && rm -v $launch_dir_tar + done + rm -v ${dir}.files ${dir}.rclone_lsf + + [[ ! -e $missing_paths ]] && continue + + if [ ! -e ${dir}.tar.gz ] || [ ! -s ${dir}.tar.gz ]; then + hsi -q "get garden/${dir}.tar.gz" + [[ $? -ne 0 ]] && echo ${dir} >> $hpss_missing && continue + fi + ls -ltrh ${dir}.tar.gz + + if [ ! -e ${dir}.tar_list ] || [ ! -s ${dir}.tar_list ]; then + echo "make ${dir}.tar_list ..." + tar -tzvf ${dir}.tar.gz | grep ^d | grep -v -e '/relax1/' -e '/relax2/' | awk {'print $6'} 2>&1 | tee ${dir}.tar_list + fi + + paths=`cat $missing_paths` + for f in $paths; do + [[ ! -d $f ]] && grep $f ${dir}.tar_list >> $extract + done + + if [ -e $extract ] && [ -s $extract ]; then + echo "extract" `wc -l $extract` + tar -xvzf ${dir}.tar.gz --files-from $extract + fi + rm -v $extract + + for f in $paths; do + launch_dir_tar="${stage_dir}/${f}.tar.gz" + echo $launch_dir_tar ... 
+ mkdir -p `dirname $launch_dir_tar` + tar_code=$(tar -czf $launch_dir_tar -C `dirname $f` `basename $f`) + [[ $tar_code -ne 0 ]] && echo 'problem with launch dir tar!' && break + ls -ltrh $launch_dir_tar + [[ -d $f ]] && rm -r $f + done + rm -v $missing_paths + + rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir + +done From 455e94fb21de1a347228066dd4c50489ead29658 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 15 Aug 2018 13:48:22 -0700 Subject: [PATCH 30/97] cli parse subcommand --- emmet/scripts/emmet.py | 68 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 8ac85e7129..2af685db8c 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -10,7 +10,9 @@ from pymatgen.alchemy.materials import TransformedStructure from pymatgen.util.provenance import StructureNL, Author from fireworks import LaunchPad +from fireworks.fw_config import FW_BLOCK_FORMAT from atomate.vasp.database import VaspCalcDb +from atomate.vasp.drones import VaspDrone from atomate.vasp.workflows.presets.core import wf_structure_optimization from atomate.vasp.powerups import add_trackers, add_tags, add_additional_fields_to_taskdocs from emmet.vasp.materials import group_structures, get_sg @@ -82,6 +84,9 @@ def ensure_meta(snls_db): ensure_indexes(['snl_id', 'formula_pretty', 'nelements', 'nsites', 'is_ordered', 'is_valid'], [snl_coll]) +def get_subdir(dn): + return dn.rsplit(os.sep, 1)[-1] + @cli.command() @click.argument('target_db_file', type=click.Path(exists=True)) @click.option('--tag', default=None, help='only insert tasks with specific tag') @@ -92,9 +97,6 @@ def copy_tasks(target_db_file, tag, insert): if not insert: print('DRY RUN: add --insert flag to actually add tasks to production') - def get_subdir(dn): - return dn.rsplit(os.sep, 1)[-1] - lpad = LaunchPad.auto_load() source = VaspCalcDb(lpad.host, lpad.port, lpad.name, 'tasks', lpad.username, lpad.password) print('connected to source db with', source.collection.count(), 'tasks') @@ -843,3 +845,63 @@ def insert_snls(snls_list): if idx and not idx%100 or idx == len(input_structures)-1: insert_snls(snls) + +@cli.command() +@click.argument('base_path', type=click.Path(exists=True)) +@click.option('--insert/--no-insert', default=False, help='actually execute task insertion') +def parse(base_path, insert): + """parse VASP output directories in base_path into tasks and tag""" + if not insert: + print('DRY RUN: add --insert flag to actually insert tasks') + + lpad = LaunchPad.auto_load() + target = VaspCalcDb(lpad.host, lpad.port, lpad.name, 'tasks', lpad.username, lpad.password) + print('connected to target db with', target.collection.count(), 'tasks') + base_path_split = base_path.split(os.sep) + tag = base_path_split[-1] if base_path_split[-1] else base_path_split[-2] + drone = VaspDrone(parse_dos='auto', additional_fields={'tags': [tag]}) + already_inserted_subdirs = [get_subdir(dn) for dn in target.collection.find({'tags': tag}).distinct('dir_name')] + print(len(already_inserted_subdirs), 'VASP directories already inserted for', tag) + + def get_timestamp_dir(prefix='launcher'): + time_now = datetime.utcnow().strftime(FW_BLOCK_FORMAT) + return '_'.join([prefix, time_now]) + + def get_vasp_dirs(): + for root, dirs, files in os.walk(base_path): + # TODO ignore relax1/2 subdirs if INCAR.orig found + if any(f.startswith("INCAR") for f in files): + if insert: + root_split = os.path.realpath(root).split(os.sep) + idx = 
len(base_path_split) + if not root_split[idx-1].startswith('block_'): + rootdir = os.sep.join(root_split[:idx]) + block = get_timestamp_dir(prefix='block') + block_dir = os.sep.join(root_split[:idx-1] + [block]) + os.rename(rootdir, block_dir) + os.symlink(block_dir, rootdir) + print(rootdir, '->', block_dir) + subdir = os.sep.join(root_split) + if not root_split[-1].startswith('launcher_'): + launch = get_timestamp_dir() + launch_dir = os.sep.join(root_split[:-1] + [launch]) + os.rename(subdir, launch_dir) + os.symlink(launch_dir, subdir) + print(subdir, '->', launch_dir) + yield launch_dir + else: + yield subdir + else: + yield root + + for vaspdir in get_vasp_dirs(): + subdir = get_subdir(vaspdir) + if subdir not in already_inserted_subdirs: + print(vaspdir) + try: + task_doc = drone.assimilate(vaspdir) + except Exception as ex: + print(str(ex)) + continue + if insert: + target.insert_task(task_doc, use_gridfs=True) From 7c8cdb14e16444f42e51a30124e419533c30d3b6 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 10 Sep 2018 15:40:47 -0700 Subject: [PATCH 31/97] cli report: add --to-csv option --- emmet/scripts/emmet.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 2af685db8c..38f91163c3 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,4 +1,4 @@ -import click, os, yaml, sys, logging, tarfile, bson, gzip +import click, os, yaml, sys, logging, tarfile, bson, gzip, csv from fnmatch import fnmatch from datetime import datetime from collections import Counter, OrderedDict @@ -620,7 +620,8 @@ def format(self, record): @cli.command() @click.option('--tag', default=None, help='only include structures with specific tag') @click.option('--in-progress/--no-in-progress', default=False, help='show in-progress only') -def report(tag, in_progress): +@click.option('--to-csv/--no-to-csv', default=False, help='save report as CSV') +def report(tag, in_progress, to_csv): """generate a report of calculations status""" lpad = LaunchPad.auto_load() @@ -649,7 +650,7 @@ def report(tag, in_progress): total = sum(v for k, v in counter.items() if k in states) tc, progress = t, '-' if wflows_to_add or counter['COMPLETED'] + counter['FIZZLED'] != total: - tc = "\033[1;34m{}\033[0m".format(t) + tc = "\033[1;34m{}\033[0m".format(t) if not to_csv else t progress = (counter['COMPLETED'] + counter['FIZZLED']) / total * 100. if total else 0. progress = '{:.0f}%'.format(progress) elif in_progress: @@ -658,8 +659,8 @@ def report(tag, in_progress): fizzled = counter['FIZZLED'] / total if total else 0. if progress != '-' and bool(counter['COMPLETED'] + counter['FIZZLED']): fizzled = counter['FIZZLED'] / (counter['COMPLETED'] + counter['FIZZLED']) - percent_fizzled = "\033[1;31m{:.0f}%\033[0m".format(fizzled*100.) \ - if fizzled > 0.2 else '{:.0f}%'.format(fizzled*100.) + sfmt = "\033[1;31m{:.0f}%\033[0m" if (not to_csv and fizzled > 0.2) else '{:.0f}%' + percent_fizzled = sfmt.format(fizzled*100.) 
entry.append(percent_fizzled) entry.append(progress) for idx, e in enumerate(entry): @@ -669,10 +670,19 @@ def report(tag, in_progress): table.add_row(entry) if tag is None: - table.add_row(['\033[1;32m{}\033[0m'.format(s if s else '-') for s in sums]) + sfmt = '{}' if to_csv else '\033[1;32m{}\033[0m' + table.add_row([sfmt.format(s if s else '-') for s in sums]) table.align['Tag'] = 'r' print(table) + if to_csv: + with open('emmet_report.csv', 'w') as csv_file: + writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL) + writer.writerow(table._field_names) + options = table._get_options({}) + for row in table._get_rows(options): + writer.writerow(row) + @cli.command() @click.argument('archive', type=click.Path(exists=True)) From e4636d4ac8b5a5e9e553907a7ec0f26c342e38e7 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 17 Sep 2018 14:24:36 -0700 Subject: [PATCH 32/97] cli: add launcher_paths --- emmet/scripts/launcher_paths.py | 39 +++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 emmet/scripts/launcher_paths.py diff --git a/emmet/scripts/launcher_paths.py b/emmet/scripts/launcher_paths.py new file mode 100644 index 0000000000..fc7625bbf2 --- /dev/null +++ b/emmet/scripts/launcher_paths.py @@ -0,0 +1,39 @@ +import json +from atomate.vasp.database import VaspCalcDb + +target_db_file = '../dbfiles/db_atomate.json' +target = VaspCalcDb.from_db_file(target_db_file, admin=True) +print('connected to target db with', target.collection.count(), 'tasks') +print(target.db.materials.count(), 'materials') + +splits = ['block_', 'aflow_'] +mpids = json.load(open('KRao_Li_FullList.txt', 'r')) +print(len(mpids), 'mpids') +query = {'task_id': {'$in': mpids}} + +# {'mp-1002': [{'task_id': ..., 'task_type': ..., 'launcher_path': ...}, ...], ...} +out = {} + +for idx, doc in enumerate(target.db.materials.find(query, {'task_id': 1, 'blessed_tasks': 1})): + mp_id = doc['task_id'] + out[mp_id] = [] + print(idx, mp_id) + for task_type, task_id in doc['blessed_tasks'].items(): + dir_name = target.collection.find_one({'task_id': task_id}, {'dir_name': 1})['dir_name'] + if 'maarten_piezo' in dir_name: + continue + for s in splits: + ds = dir_name.split(s) + if len(ds) == 2: + launcher = s + ds[-1] + print(task_id, task_type, launcher) + out[mp_id].append({'task_id': task_id, 'task_type': task_type, 'launcher_path': launcher}) + break + +with open('launcher_paths.json', 'w') as f: + json.dump(out, f) + +with open('launcher_paths.txt', 'w') as f: + for mp_id, tasks in out.items(): + for task in tasks: + f.write(task['launcher_path']+'\n') From c2e763a7683321b03dbc127b0dd0c7cc872a5073 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 17 Sep 2018 17:43:43 -0700 Subject: [PATCH 33/97] cli.copy_tasks: also copy SNLs --- emmet/scripts/emmet.py | 50 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 38f91163c3..65b5cec444 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -111,6 +111,16 @@ def copy_tasks(target_db_file, tag, insert): tags = [t for t in source.collection.find(task_base_query).distinct('tags') if t is not None] print(len(tags), 'tags in source collection') + def insert_snls(snls_list): + if snls_list: + print('copy', len(snls_list), 'SNLs') + if insert: + result = target.db.snls.insert_many(snls_list) + print('#SNLs inserted:', len(result.inserted_ids)) + snls_list.clear() + else: + print('no SNLs to insert') + 
for t in tags: print('### {} ###'.format(t)) @@ -118,6 +128,44 @@ def copy_tasks(target_db_file, tag, insert): source_count = source.collection.count(query) print('source / target:', source_count, '/', target.collection.count(query)) + # get list of SNLs to copy over + # only need to check tagged SNLs in source and target; dup-check across SNL collections already done in add_snls + # also only need to check about.projects; add_snls adds tag to about.projects and not remarks + snls = lpad.db.snls.find({'about.projects': t}) + nr_snls = snls.count() + if nr_snls < target.db.snls.count({'about.projects': t}): + snls_to_copy, index, prefix = [], None, 'snl' + for idx, doc in enumerate(snls): + snl = StructureNL.from_dict(doc) + formula = snl.structure.composition.reduced_formula + snl_copied = False + try: + q = {'about.projects': t, '$or': [{k: formula} for k in aggregation_keys]} + group = aggregate_by_formula(target.db.snls, q).next() # only one formula + for dct in group['structures']: + existing_structure = Structure.from_dict(dct) + if structures_match(snl.structure, existing_structure): + snl_copied = True + print('SNL', doc['snl_id'], 'already added.') + break + except StopIteration: + pass + if snl_copied: + continue + snl_dct = snl.as_dict() + if index is None: + index = max([int(snl_id[len(prefix)+1:]) for snl_id in target.db.snls.distinct('snl_id')]) + 1 + else: + index += 1 + snl_id = '{}-{}'.format(prefix, index) + snl_dct['snl_id'] = snl_id + snl_dct.update(get_meta_from_structure(snl.structure)) + snls_to_copy.append(snl_dct) + if idx and not idx%100 or idx == nr_snls-1: + insert_snls(snls_to_copy) + else: + print('SNLs already copied.') + # skip tasks with task_id existing in target and with matching dir_name (have to be a string [mp-*, mvc-*]) nr_source_mp_tasks, skip_task_ids = 0, [] for doc in source.collection.find(query, ['task_id', 'dir_name']): @@ -712,7 +760,7 @@ def insert_snls(snls_list): if insert: result = snl_collections[0].insert_many(snls_list) print('#SNLs inserted:', len(result.inserted_ids)) - snls_list.clear() + snls_list.clear() else: print('no SNLs to insert') From 6384056a7159331d2baa251cf3fb5993db813e72 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 17 Sep 2018 17:44:01 -0700 Subject: [PATCH 34/97] cli.parse: only insert task if successful --- emmet/scripts/emmet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 65b5cec444..f636cc81c8 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -961,5 +961,5 @@ def get_vasp_dirs(): except Exception as ex: print(str(ex)) continue - if insert: + if insert and task_doc['state'] == 'successful': target.insert_task(task_doc, use_gridfs=True) From cc70059c8249fa42becadc907e35f3a96d25af55 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 18 Sep 2018 16:40:49 -0700 Subject: [PATCH 35/97] cli: separate add_snls for load/parse --- emmet/scripts/emmet.py | 92 ++++++++++++++++++++++++------------------ 1 file changed, 52 insertions(+), 40 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index f636cc81c8..efe5999860 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -243,14 +243,14 @@ def insert_snls(snls_list): @cli.command() -@click.option('--add_snls_dbs', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to scan') +@click.option('--add_snlcolls', '-a', type=click.Path(exists=True), help='YAML config file with 
multiple documents defining additional SNLs collections to scan') @click.option('--add_tasks_db', type=click.Path(exists=True), help='config file for additional tasks collection to scan') @click.option('--tag', default=None, help='only include structures with specific tag') @click.option('--insert/--no-insert', default=False, help='actually execute workflow addition') @click.option('--clear-logs/--no-clear-logs', default=False, help='clear MongoDB logs collection for specific tag') @click.option('--max-structures', '-m', default=1000, help='set max structures for tags to scan') @click.option('--skip-all-scanned/--no-skip-all-scanned', default=False, help='skip all already scanned structures incl. WFs2Add/Errors') -def add_wflows(add_snls_dbs, add_tasks_db, tag, insert, clear_logs, max_structures, skip_all_scanned): +def add_wflows(add_snlcolls, add_tasks_db, tag, insert, clear_logs, max_structures, skip_all_scanned): """add workflows based on tags in SNL collection""" if not insert: @@ -259,8 +259,8 @@ def add_wflows(add_snls_dbs, add_tasks_db, tag, insert, clear_logs, max_structur lpad = LaunchPad.auto_load() snl_collections = [lpad.db.snls] - if add_snls_dbs is not None: - for snl_db_config in yaml.load_all(open(add_snls_dbs, 'r')): + if add_snlcolls is not None: + for snl_db_config in yaml.load_all(open(add_snlcolls, 'r')): snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) snl_db = snl_db_conn[snl_db_config['db']] snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) @@ -637,7 +637,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): print(len(canonical_structures_list), 'canonical structure(s) for', formula, sites_elements) if tag is not None: print('trying again ...') - add_wflows(add_snls_dbs, add_tasks_db, tag, insert, clear_logs, max_structures, True) + add_wflows(add_snlcolls, add_tasks_db, tag, insert, clear_logs, max_structures, True) print(counter) @@ -734,37 +734,15 @@ def report(tag, in_progress, to_csv): @cli.command() @click.argument('archive', type=click.Path(exists=True)) -@click.option('--add_snls_dbs', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to check against') +@click.option('--add_snlcolls', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to check against') @click.option('--insert/--no-insert', default=False, help='actually execute SNL insertion') -def add_snls(archive, add_snls_dbs, insert): +def load(archive, add_snlcolls, insert): """add structures from archive of structure files (CIF, POSCAR, ...) to (local) SNLs collection""" # TODO assign task_ids to structures? if not insert: print('DRY RUN! 
Add --insert flag to actually add SNLs') - lpad = LaunchPad.auto_load() - snl_collections = [lpad.db.snls] - if add_snls_dbs is not None: - for snl_db_config in yaml.load_all(open(add_snls_dbs, 'r')): - snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) - snl_db = snl_db_conn[snl_db_config['db']] - snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) - snl_collections.append(snl_db[snl_db_config['collection']]) - for snl_coll in snl_collections: - print(snl_coll.count(), 'SNLs in', snl_coll.full_name) - - def insert_snls(snls_list): - if snls_list: - print('add', len(snls_list), 'SNLs') - if insert: - result = snl_collections[0].insert_many(snls_list) - print('#SNLs inserted:', len(result.inserted_ids)) - snls_list.clear() - else: - print('no SNLs to insert') - - fname, ext = os.path.splitext(os.path.basename(archive)) tag, sec_ext = fname.rsplit('.', 1) if '.' in fname else [fname, ''] if sec_ext: @@ -774,16 +752,6 @@ def insert_snls(snls_list): print(ext, 'not supported (yet)! Please use one of', exts) return - meta_path = '{}.yaml'.format(tag) - meta = None - if not os.path.exists(meta_path): - meta = {'authors': ['Materials Project ']} - print(meta_path, 'not found. Using', meta) - else: - with open(meta_path, 'r') as f: - meta = yaml.safe_load(f) - meta['authors'] = [Author.parse_author(a) for a in meta['authors']] - input_structures = [] if ext == 'bson.gz': for idx, doc in enumerate(bson.decode_file_iter(gzip.open(archive))): @@ -816,6 +784,42 @@ def insert_snls(snls_list): break #continue print(len(input_structures), 'structure(s) loaded.') + add_snls(tag, input_structures, add_snlcolls, insert) + + +def add_snls(tag, input_structures, add_snlcolls, insert): + """add structures to (local) SNLs collection""" + + meta_path = '{}.yaml'.format(tag) + meta = None + if not os.path.exists(meta_path): + meta = {'authors': ['Materials Project ']} + print(meta_path, 'not found. 
Using', meta) + else: + with open(meta_path, 'r') as f: + meta = yaml.safe_load(f) + meta['authors'] = [Author.parse_author(a) for a in meta['authors']] + + lpad = LaunchPad.auto_load() + snl_collections = [lpad.db.snls] + if add_snlcolls is not None: + for snl_db_config in yaml.load_all(open(add_snlcolls, 'r')): + snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) + snl_db = snl_db_conn[snl_db_config['db']] + snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) + snl_collections.append(snl_db[snl_db_config['collection']]) + for snl_coll in snl_collections: + print(snl_coll.count(), 'SNLs in', snl_coll.full_name) + + def insert_snls(snls_list): + if snls_list: + print('add', len(snls_list), 'SNLs') + if insert: + result = snl_collections[0].insert_many(snls_list) + print('#SNLs inserted:', len(result.inserted_ids)) + snls_list.clear() + else: + print('no SNLs to insert') snls, index = [], None for idx, istruct in enumerate(input_structures): @@ -906,8 +910,9 @@ def insert_snls(snls_list): @cli.command() @click.argument('base_path', type=click.Path(exists=True)) +@click.option('--add_snlcolls', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to scan') @click.option('--insert/--no-insert', default=False, help='actually execute task insertion') -def parse(base_path, insert): +def parse(base_path, add_snlcolls, insert): """parse VASP output directories in base_path into tasks and tag""" if not insert: print('DRY RUN: add --insert flag to actually insert tasks') @@ -952,6 +957,7 @@ def get_vasp_dirs(): else: yield root + input_structures = [] for vaspdir in get_vasp_dirs(): subdir = get_subdir(vaspdir) if subdir not in already_inserted_subdirs: @@ -963,3 +969,9 @@ def get_vasp_dirs(): continue if insert and task_doc['state'] == 'successful': target.insert_task(task_doc, use_gridfs=True) + s = Structure.from_dict(task_doc['input']['structure']) + input_structures.append(s) + + print('add SNLs for', len(input_structures), 'structures') + add_snls(tag, input_structures, add_snlcolls, insert) + From 3bb9b5721072cdc5c639880656d44791a6139381 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 18 Sep 2018 16:41:12 -0700 Subject: [PATCH 36/97] cli.copy: only if SNLs available --- emmet/scripts/emmet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index efe5999860..271e1d6cd5 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -133,7 +133,7 @@ def insert_snls(snls_list): # also only need to check about.projects; add_snls adds tag to about.projects and not remarks snls = lpad.db.snls.find({'about.projects': t}) nr_snls = snls.count() - if nr_snls < target.db.snls.count({'about.projects': t}): + if nr_snls and nr_snls < target.db.snls.count({'about.projects': t}): snls_to_copy, index, prefix = [], None, 'snl' for idx, doc in enumerate(snls): snl = StructureNL.from_dict(doc) @@ -164,7 +164,7 @@ def insert_snls(snls_list): if idx and not idx%100 or idx == nr_snls-1: insert_snls(snls_to_copy) else: - print('SNLs already copied.') + print('SNLs not available or already copied.') # skip tasks with task_id existing in target and with matching dir_name (have to be a string [mp-*, mvc-*]) nr_source_mp_tasks, skip_task_ids = 0, [] From f5f762260096c62c5971698e1ed060ebf83b11fe Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 18 Sep 2018 19:23:49 -0700 Subject: [PATCH 37/97] cli: 
minor subcommand renames --- emmet/scripts/emmet.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 271e1d6cd5..2fd514c4e9 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -91,8 +91,8 @@ def get_subdir(dn): @click.argument('target_db_file', type=click.Path(exists=True)) @click.option('--tag', default=None, help='only insert tasks with specific tag') @click.option('--insert/--no-insert', default=False, help='actually execute task addition') -def copy_tasks(target_db_file, tag, insert): - """Retrieve tasks from source and copy to target task collection""" +def copy(target_db_file, tag, insert): + """Retrieve tasks from source and copy to target task collection (incl. SNLs if available)""" if not insert: print('DRY RUN: add --insert flag to actually add tasks to production') @@ -250,7 +250,7 @@ def insert_snls(snls_list): @click.option('--clear-logs/--no-clear-logs', default=False, help='clear MongoDB logs collection for specific tag') @click.option('--max-structures', '-m', default=1000, help='set max structures for tags to scan') @click.option('--skip-all-scanned/--no-skip-all-scanned', default=False, help='skip all already scanned structures incl. WFs2Add/Errors') -def add_wflows(add_snlcolls, add_tasks_db, tag, insert, clear_logs, max_structures, skip_all_scanned): +def wflows(add_snlcolls, add_tasks_db, tag, insert, clear_logs, max_structures, skip_all_scanned): """add workflows based on tags in SNL collection""" if not insert: @@ -637,7 +637,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): print(len(canonical_structures_list), 'canonical structure(s) for', formula, sites_elements) if tag is not None: print('trying again ...') - add_wflows(add_snlcolls, add_tasks_db, tag, insert, clear_logs, max_structures, True) + wflows(add_snlcolls, add_tasks_db, tag, insert, clear_logs, max_structures, True) print(counter) From b2408b2ada5ad61707fef8f16104f6a84f1d7b98 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 19 Sep 2018 15:15:48 -0700 Subject: [PATCH 38/97] cli: copy/make_snls flags, untar launchers --- emmet/scripts/emmet.py | 178 +++++++++++++++++++++++------------------ 1 file changed, 102 insertions(+), 76 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 2fd514c4e9..23bfc55d52 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,4 +1,4 @@ -import click, os, yaml, sys, logging, tarfile, bson, gzip, csv +import click, os, yaml, sys, logging, tarfile, bson, gzip, csv, tarfile from fnmatch import fnmatch from datetime import datetime from collections import Counter, OrderedDict @@ -91,7 +91,8 @@ def get_subdir(dn): @click.argument('target_db_file', type=click.Path(exists=True)) @click.option('--tag', default=None, help='only insert tasks with specific tag') @click.option('--insert/--no-insert', default=False, help='actually execute task addition') -def copy(target_db_file, tag, insert): +@click.option('--copy-snls/--no-copy-snls', default=False, help='also copy SNLs') +def copy(target_db_file, tag, insert, copy_snls): """Retrieve tasks from source and copy to target task collection (incl. 
SNLs if available)""" if not insert: @@ -131,40 +132,42 @@ def insert_snls(snls_list): # get list of SNLs to copy over # only need to check tagged SNLs in source and target; dup-check across SNL collections already done in add_snls # also only need to check about.projects; add_snls adds tag to about.projects and not remarks - snls = lpad.db.snls.find({'about.projects': t}) - nr_snls = snls.count() - if nr_snls and nr_snls < target.db.snls.count({'about.projects': t}): - snls_to_copy, index, prefix = [], None, 'snl' - for idx, doc in enumerate(snls): - snl = StructureNL.from_dict(doc) - formula = snl.structure.composition.reduced_formula - snl_copied = False - try: - q = {'about.projects': t, '$or': [{k: formula} for k in aggregation_keys]} - group = aggregate_by_formula(target.db.snls, q).next() # only one formula - for dct in group['structures']: - existing_structure = Structure.from_dict(dct) - if structures_match(snl.structure, existing_structure): - snl_copied = True - print('SNL', doc['snl_id'], 'already added.') - break - except StopIteration: - pass - if snl_copied: - continue - snl_dct = snl.as_dict() - if index is None: - index = max([int(snl_id[len(prefix)+1:]) for snl_id in target.db.snls.distinct('snl_id')]) + 1 - else: - index += 1 - snl_id = '{}-{}'.format(prefix, index) - snl_dct['snl_id'] = snl_id - snl_dct.update(get_meta_from_structure(snl.structure)) - snls_to_copy.append(snl_dct) - if idx and not idx%100 or idx == nr_snls-1: - insert_snls(snls_to_copy) - else: - print('SNLs not available or already copied.') + # TODO only need to copy if author not Materials Project!? + if copy_snls: + snls = lpad.db.snls.find({'about.projects': t}) + nr_snls = snls.count() + if nr_snls: + snls_to_copy, index, prefix = [], None, 'snl' + for idx, doc in enumerate(snls): + snl = StructureNL.from_dict(doc) + formula = snl.structure.composition.reduced_formula + snl_copied = False + try: + q = {'about.projects': t, '$or': [{k: formula} for k in aggregation_keys]} + group = aggregate_by_formula(target.db.snls, q).next() # only one formula + for dct in group['structures']: + existing_structure = Structure.from_dict(dct) + if structures_match(snl.structure, existing_structure): + snl_copied = True + print('SNL', doc['snl_id'], 'already added.') + break + except StopIteration: + pass + if snl_copied: + continue + snl_dct = snl.as_dict() + if index is None: + index = max([int(snl_id[len(prefix)+1:]) for snl_id in target.db.snls.distinct('snl_id')]) + 1 + else: + index += 1 + snl_id = '{}-{}'.format(prefix, index) + snl_dct['snl_id'] = snl_id + snl_dct.update(get_meta_from_structure(snl.structure)) + snls_to_copy.append(snl_dct) + if idx and not idx%100 or idx == nr_snls-1: + insert_snls(snls_to_copy) + else: + print('No SNLs available for', t) # skip tasks with task_id existing in target and with matching dir_name (have to be a string [mp-*, mvc-*]) nr_source_mp_tasks, skip_task_ids = 0, [] @@ -912,7 +915,8 @@ def insert_snls(snls_list): @click.argument('base_path', type=click.Path(exists=True)) @click.option('--add_snlcolls', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to scan') @click.option('--insert/--no-insert', default=False, help='actually execute task insertion') -def parse(base_path, add_snlcolls, insert): +@click.option('--make-snls/--no-make-snls', default=False, help='also create SNLs for parsed tasks') +def parse(base_path, add_snlcolls, insert, make_snls): """parse VASP output directories in base_path into 
tasks and tag""" if not insert: print('DRY RUN: add --insert flag to actually insert tasks') @@ -930,48 +934,70 @@ def get_timestamp_dir(prefix='launcher'): time_now = datetime.utcnow().strftime(FW_BLOCK_FORMAT) return '_'.join([prefix, time_now]) - def get_vasp_dirs(): - for root, dirs, files in os.walk(base_path): + def get_symlinked_path(root): + root_split = os.path.realpath(root).split(os.sep) + idx = len(base_path_split) + if not root_split[idx-1].startswith('block_'): + rootdir = os.sep.join(root_split[:idx]) + block = get_timestamp_dir(prefix='block') + block_dir = os.sep.join(root_split[:idx-1] + [block]) + if insert: + os.rename(rootdir, block_dir) + os.symlink(block_dir, rootdir) + print(rootdir, '->', block_dir) + subdir = os.sep.join(root_split) + if not root_split[-1].startswith('launcher_'): + launch = get_timestamp_dir() + launch_dir = os.path.join(os.path.realpath(os.sep.join(root_split[:-1])), launch) + if insert: + os.rename(subdir, launch_dir) + os.symlink(launch_dir, subdir) + print(subdir, '->', launch_dir) + return launch_dir + else: + return os.path.realpath(subdir) + + def contains_vasp_dirs(list_of_files): + for f in list_of_files: + if f.startswith("INCAR"): + return True + + def get_vasp_dirs(scan_path): + # NOTE os.walk followlinks=False by default, as intended here + for root, dirs, files in os.walk(scan_path): # TODO ignore relax1/2 subdirs if INCAR.orig found - if any(f.startswith("INCAR") for f in files): - if insert: - root_split = os.path.realpath(root).split(os.sep) - idx = len(base_path_split) - if not root_split[idx-1].startswith('block_'): - rootdir = os.sep.join(root_split[:idx]) - block = get_timestamp_dir(prefix='block') - block_dir = os.sep.join(root_split[:idx-1] + [block]) - os.rename(rootdir, block_dir) - os.symlink(block_dir, rootdir) - print(rootdir, '->', block_dir) - subdir = os.sep.join(root_split) - if not root_split[-1].startswith('launcher_'): - launch = get_timestamp_dir() - launch_dir = os.sep.join(root_split[:-1] + [launch]) - os.rename(subdir, launch_dir) - os.symlink(launch_dir, subdir) - print(subdir, '->', launch_dir) - yield launch_dir - else: - yield subdir - else: - yield root + if contains_vasp_dirs(files): + yield get_symlinked_path(root) + else: + for f in files: + if f.endswith('.tar.gz'): + cwd = os.path.realpath(root) + path = os.path.join(cwd, f) + with tarfile.open(path, 'r:gz') as tf: + tf.extractall(cwd) + os.remove(path) + for vaspdir in get_vasp_dirs(path.replace('.tar.gz', '')): + yield vaspdir + input_structures = [] - for vaspdir in get_vasp_dirs(): + for vaspdir in get_vasp_dirs(base_path): subdir = get_subdir(vaspdir) if subdir not in already_inserted_subdirs: - print(vaspdir) - try: - task_doc = drone.assimilate(vaspdir) - except Exception as ex: - print(str(ex)) - continue - if insert and task_doc['state'] == 'successful': - target.insert_task(task_doc, use_gridfs=True) - s = Structure.from_dict(task_doc['input']['structure']) - input_structures.append(s) - - print('add SNLs for', len(input_structures), 'structures') - add_snls(tag, input_structures, add_snlcolls, insert) + print('vaspdir:', vaspdir) + if insert: + try: + task_doc = drone.assimilate(vaspdir) + except Exception as ex: + print(str(ex)) + continue + if task_doc['state'] == 'successful': + target.insert_task(task_doc, use_gridfs=True) + if make_snls: + s = Structure.from_dict(task_doc['input']['structure']) + input_structures.append(s) + + if insert and make_snls: + print('add SNLs for', len(input_structures), 'structures') + add_snls(tag, 
input_structures, add_snlcolls, insert) From 622db525293378d135c57c0fc5b61e8c6f5034b5 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 21 Sep 2018 12:42:19 -0700 Subject: [PATCH 39/97] cli: copy orig inputs if necessary --- emmet/scripts/emmet.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 23bfc55d52..d8101b718c 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,4 +1,6 @@ import click, os, yaml, sys, logging, tarfile, bson, gzip, csv, tarfile +from shutil import copyfile +from glob import glob from fnmatch import fnmatch from datetime import datetime from collections import Counter, OrderedDict @@ -980,6 +982,7 @@ def get_vasp_dirs(scan_path): yield vaspdir + inputs = ['INCAR', 'KPOINTS', 'POTCAR', 'POSCAR'] input_structures = [] for vaspdir in get_vasp_dirs(base_path): subdir = get_subdir(vaspdir) @@ -987,6 +990,12 @@ def get_vasp_dirs(scan_path): print('vaspdir:', vaspdir) if insert: try: + for inp in inputs: + input_path = os.path.join(vaspdir, inp) + orig_path = input_path + '.orig' + if not glob(orig_path+'*'): + copyfile(input_path, orig_path) + print('cp', input_path, '->', orig_path) task_doc = drone.assimilate(vaspdir) except Exception as ex: print(str(ex)) From e9a5307f9afad343240584ba9ed5be538cf6410e Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 24 Sep 2018 14:00:55 -0700 Subject: [PATCH 40/97] cli.parse: remove empty dirs --- emmet/scripts/emmet.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index d8101b718c..34225bac06 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,5 +1,5 @@ import click, os, yaml, sys, logging, tarfile, bson, gzip, csv, tarfile -from shutil import copyfile +from shutil import copyfile, rmtree from glob import glob from fnmatch import fnmatch from datetime import datetime @@ -998,7 +998,11 @@ def get_vasp_dirs(scan_path): print('cp', input_path, '->', orig_path) task_doc = drone.assimilate(vaspdir) except Exception as ex: - print(str(ex)) + err = str(ex) + print(err) + if err == 'No VASP files found!': + rmtree(vaspdir) + print('removed', vaspdir) continue if task_doc['state'] == 'successful': target.insert_task(task_doc, use_gridfs=True) From 94f18f704d00d2e0c7b52a44d938c084da775509 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 24 Sep 2018 14:01:12 -0700 Subject: [PATCH 41/97] cli.parse: deal with DocumentTooLarge --- emmet/scripts/emmet.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 34225bac06..cd27661908 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -7,6 +7,7 @@ from pymongo import MongoClient from pymongo.errors import CursorNotFound from pymongo.collection import ReturnDocument +from pymongo.errors import DocumentTooLarge from pymatgen.analysis.structure_prediction.volume_predictor import DLSVolumePredictor from pymatgen import Structure from pymatgen.alchemy.materials import TransformedStructure @@ -1005,7 +1006,19 @@ def get_vasp_dirs(scan_path): print('removed', vaspdir) continue if task_doc['state'] == 'successful': - target.insert_task(task_doc, use_gridfs=True) + try: + target.insert_task(task_doc, use_gridfs=True) + except DocumentTooLarge as ex: + print(str(ex)) + print('remove normalmode_eigenvecs and retry ...') + task_doc['calcs_reversed'][0]['output'].pop('normalmode_eigenvecs') + try: + target.insert_task(task_doc, 
use_gridfs=True) + except DocumentTooLarge as ex: + print(str(ex)) + print('also remove force_constants and retry ...') + task_doc['calcs_reversed'][0]['output'].pop('force_constants') + target.insert_task(task_doc, use_gridfs=True) if make_snls: s = Structure.from_dict(task_doc['input']['structure']) input_structures.append(s) From c907204c8e869c776ef95f94a299585d791ff1a5 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 24 Sep 2018 16:18:08 -0700 Subject: [PATCH 42/97] cli.parse: minor rearrange --- emmet/scripts/emmet.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index cd27661908..b511e22471 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -990,17 +990,16 @@ def get_vasp_dirs(scan_path): if subdir not in already_inserted_subdirs: print('vaspdir:', vaspdir) if insert: + for inp in inputs: + input_path = os.path.join(vaspdir, inp) + orig_path = input_path + '.orig' + if not glob(orig_path+'*'): + copyfile(input_path, orig_path) + print('cp', input_path, '->', orig_path) try: - for inp in inputs: - input_path = os.path.join(vaspdir, inp) - orig_path = input_path + '.orig' - if not glob(orig_path+'*'): - copyfile(input_path, orig_path) - print('cp', input_path, '->', orig_path) task_doc = drone.assimilate(vaspdir) except Exception as ex: err = str(ex) - print(err) if err == 'No VASP files found!': rmtree(vaspdir) print('removed', vaspdir) From f6dd746bd5df5aff41b31ea9a6321be18b8ea8ff Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 25 Sep 2018 14:30:32 -0700 Subject: [PATCH 43/97] cli.setup: add log4mongo, prettytable --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e5243cefa7..6a05ac67a7 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ zip_safe=False, install_requires=[ 'atomate', 'pymatgen>=2018.4.20','maggma','monty', - 'six', 'pydash', 'tqdm', 'matminer', + 'six', 'pydash', 'tqdm', 'matminer', 'log4mongo', 'prettytable', 'prettyplotlib', 'pybtex', 'Click', 'networkx', 'sumo', ], classifiers=["Programming Language :: Python :: 3", From f3bd2797f9441cb9dfcbaff3b53d971e2fb68d7d Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 25 Sep 2018 14:54:02 -0700 Subject: [PATCH 44/97] cli.parse: ensure trailing slash in base_path --- emmet/scripts/emmet.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index b511e22471..7eaf52e047 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -927,8 +927,10 @@ def parse(base_path, add_snlcolls, insert, make_snls): lpad = LaunchPad.auto_load() target = VaspCalcDb(lpad.host, lpad.port, lpad.name, 'tasks', lpad.username, lpad.password) print('connected to target db with', target.collection.count(), 'tasks') + base_path = os.path.join(base_path, '') base_path_split = base_path.split(os.sep) tag = base_path_split[-1] if base_path_split[-1] else base_path_split[-2] + idx = len(base_path_split) drone = VaspDrone(parse_dos='auto', additional_fields={'tags': [tag]}) already_inserted_subdirs = [get_subdir(dn) for dn in target.collection.find({'tags': tag}).distinct('dir_name')] print(len(already_inserted_subdirs), 'VASP directories already inserted for', tag) @@ -939,7 +941,6 @@ def get_timestamp_dir(prefix='launcher'): def get_symlinked_path(root): root_split = os.path.realpath(root).split(os.sep) - idx = len(base_path_split) if not root_split[idx-1].startswith('block_'): rootdir = 
os.sep.join(root_split[:idx]) block = get_timestamp_dir(prefix='block') From 7df703eb208adfd004cb6e54e1e729e46b18fc2a Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 25 Sep 2018 18:46:56 -0700 Subject: [PATCH 45/97] cli.parse: use multiprocessing pool --- emmet/scripts/emmet.py | 254 +++++++++++++++++++++++++---------------- 1 file changed, 156 insertions(+), 98 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 7eaf52e047..b1dead5fcb 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,9 +1,9 @@ -import click, os, yaml, sys, logging, tarfile, bson, gzip, csv, tarfile +import click, os, yaml, sys, logging, tarfile, bson, gzip, csv, tarfile, itertools, multiprocessing from shutil import copyfile, rmtree from glob import glob from fnmatch import fnmatch from datetime import datetime -from collections import Counter, OrderedDict +from collections import Counter, OrderedDict, deque from pymongo import MongoClient from pymongo.errors import CursorNotFound from pymongo.collection import ReturnDocument @@ -21,6 +21,7 @@ from emmet.vasp.materials import group_structures, get_sg from emmet.vasp.task_tagger import task_type from log4mongo.handlers import MongoHandler, MongoFormatter +from prettytable import PrettyTable if 'FW_CONFIG_FILE' not in os.environ: print('Please set FW_CONFIG_FILE!') @@ -61,6 +62,124 @@ def get_meta_from_structure(struct): d['is_valid'] = struct.is_valid() return d +# a utility function to get us a slice of an iterator, as an iterator +# when working with iterators maximum lazyness is preferred +def iterator_slice(iterator, length): + iterator = iter(iterator) + while True: + res = tuple(itertools.islice(iterator, length)) + if not res: + break + yield res + +def get_subdir(dn): + return dn.rsplit(os.sep, 1)[-1] + +def get_timestamp_dir(prefix='launcher'): + time_now = datetime.utcnow().strftime(FW_BLOCK_FORMAT) + return '_'.join([prefix, time_now]) + +def contains_vasp_dirs(list_of_files): + for f in list_of_files: + if f.startswith("INCAR"): + return True + +def get_symlinked_path(root, base_path_index): + root_split = os.path.realpath(root).split(os.sep) + if not root_split[base_path_index-1].startswith('block_'): + rootdir = os.sep.join(root_split[:base_path_index]) + block = get_timestamp_dir(prefix='block') + block_dir = os.sep.join(root_split[:base_path_index-1] + [block]) + if insert: + os.rename(rootdir, block_dir) + os.symlink(block_dir, rootdir) + print(rootdir, '->', block_dir) + subdir = os.sep.join(root_split) + if not root_split[-1].startswith('launcher_'): + launch = get_timestamp_dir() + launch_dir = os.path.join(os.path.realpath(os.sep.join(root_split[:-1])), launch) + if insert: + os.rename(subdir, launch_dir) + os.symlink(launch_dir, subdir) + print(subdir, '->', launch_dir) + return launch_dir + else: + return os.path.realpath(subdir) + +def get_vasp_dirs(scan_path, base_path, max_dirs): + base_path_split = base_path.split(os.sep) + base_path_index = len(base_path_split) + # NOTE os.walk followlinks=False by default, as intended here + counter = 0 + for root, dirs, files in os.walk(scan_path): + # TODO ignore relax1/2 subdirs if INCAR.orig found + if contains_vasp_dirs(files): + yield get_symlinked_path(root, base_path_index) + counter += 1 + if counter >= max_dirs: + break + else: + for f in files: + if f.endswith('.tar.gz'): + cwd = os.path.realpath(root) + path = os.path.join(cwd, f) + with tarfile.open(path, 'r:gz') as tf: + tf.extractall(cwd) + os.remove(path) + for vaspdir in 
get_vasp_dirs(path.replace('.tar.gz', ''), base_path, max_dirs): + yield vaspdir + counter += 1 + if counter >= max_dirs: + break + +def parse_vasp_dirs(vaspdirs, insert, drone, already_inserted_subdirs): + name = multiprocessing.current_process().name + print(name, 'starting') + lpad = LaunchPad.auto_load() + target = VaspCalcDb(lpad.host, lpad.port, lpad.name, 'tasks', lpad.username, lpad.password) + print(name, 'connected to target db with', target.collection.count(), 'tasks') + for vaspdir in vaspdirs: + if get_subdir(vaspdir) in already_inserted_subdirs: + print(name, vaspdir, 'already parsed') + continue + print(name, 'vaspdir:', vaspdir) + #poscar_path = os.path.join(vaspdir, 'POSCAR.relax2.gz') + #s = Structure.from_file(poscar_path) + #nelements = len(s.composition.elements) + #if nelements > 1: + # print(name, ' -> SKIP (#elements > 1)') + # continue + if insert: + for inp in ['INCAR', 'KPOINTS', 'POTCAR', 'POSCAR']: + input_path = os.path.join(vaspdir, inp) + orig_path = input_path + '.orig' + if not glob(orig_path+'*'): + copyfile(input_path, orig_path) + print(name, 'cp', input_path, '->', orig_path) + try: + task_doc = drone.assimilate(vaspdir) + except Exception as ex: + err = str(ex) + if err == 'No VASP files found!': + rmtree(vaspdir) + print(name, 'removed', vaspdir) + continue + if task_doc['state'] == 'successful': + try: + target.insert_task(task_doc, use_gridfs=True) + except DocumentTooLarge as ex: + print(name, 'remove normalmode_eigenvecs and retry ...') + task_doc['calcs_reversed'][0]['output'].pop('normalmode_eigenvecs') + try: + target.insert_task(task_doc, use_gridfs=True) + except DocumentTooLarge as ex: + print(name, 'also remove force_constants and retry ...') + task_doc['calcs_reversed'][0]['output'].pop('force_constants') + target.insert_task(task_doc, use_gridfs=True) + nr_vaspdirs = len(vaspdirs) + print(name, 'processed', nr_vaspdirs, 'VASP directories') + return nr_vaspdirs + @click.group() def cli(): pass @@ -87,9 +206,6 @@ def ensure_meta(snls_db): ensure_indexes(['snl_id', 'formula_pretty', 'nelements', 'nsites', 'is_ordered', 'is_valid'], [snl_coll]) -def get_subdir(dn): - return dn.rsplit(os.sep, 1)[-1] - @cli.command() @click.argument('target_db_file', type=click.Path(exists=True)) @click.option('--tag', default=None, help='only insert tasks with specific tag') @@ -691,7 +807,6 @@ def report(tag, in_progress, to_csv): tags = [t[0] for t in sorted(all_tags, key=lambda x: x[1], reverse=True)] print(len(tags), 'tags in WFs and logs collections') - from prettytable import PrettyTable table = PrettyTable() table.field_names = ['Tag', 'SNLs', 'WFs2Add', 'WFs'] + states + ['% FIZZLED', 'Progress'] sums = ['total'] + [0] * (len(table.field_names)-1) @@ -919,7 +1034,9 @@ def insert_snls(snls_list): @click.option('--add_snlcolls', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to scan') @click.option('--insert/--no-insert', default=False, help='actually execute task insertion') @click.option('--make-snls/--no-make-snls', default=False, help='also create SNLs for parsed tasks') -def parse(base_path, add_snlcolls, insert, make_snls): +@click.option('--nproc', '-n', type=int, default=1, help='number of processes for parallel parsing') +@click.option('--max-dirs', '-m', type=int, default=10, help='maximum number of directories to parse') +def parse(base_path, add_snlcolls, insert, make_snls, nproc, max_dirs): """parse VASP output directories in base_path into tasks and tag""" if not 
insert: print('DRY RUN: add --insert flag to actually insert tasks') @@ -930,100 +1047,41 @@ def parse(base_path, add_snlcolls, insert, make_snls): base_path = os.path.join(base_path, '') base_path_split = base_path.split(os.sep) tag = base_path_split[-1] if base_path_split[-1] else base_path_split[-2] - idx = len(base_path_split) drone = VaspDrone(parse_dos='auto', additional_fields={'tags': [tag]}) already_inserted_subdirs = [get_subdir(dn) for dn in target.collection.find({'tags': tag}).distinct('dir_name')] print(len(already_inserted_subdirs), 'VASP directories already inserted for', tag) - def get_timestamp_dir(prefix='launcher'): - time_now = datetime.utcnow().strftime(FW_BLOCK_FORMAT) - return '_'.join([prefix, time_now]) - - def get_symlinked_path(root): - root_split = os.path.realpath(root).split(os.sep) - if not root_split[idx-1].startswith('block_'): - rootdir = os.sep.join(root_split[:idx]) - block = get_timestamp_dir(prefix='block') - block_dir = os.sep.join(root_split[:idx-1] + [block]) - if insert: - os.rename(rootdir, block_dir) - os.symlink(block_dir, rootdir) - print(rootdir, '->', block_dir) - subdir = os.sep.join(root_split) - if not root_split[-1].startswith('launcher_'): - launch = get_timestamp_dir() - launch_dir = os.path.join(os.path.realpath(os.sep.join(root_split[:-1])), launch) - if insert: - os.rename(subdir, launch_dir) - os.symlink(launch_dir, subdir) - print(subdir, '->', launch_dir) - return launch_dir - else: - return os.path.realpath(subdir) - - def contains_vasp_dirs(list_of_files): - for f in list_of_files: - if f.startswith("INCAR"): - return True - - def get_vasp_dirs(scan_path): - # NOTE os.walk followlinks=False by default, as intended here - for root, dirs, files in os.walk(scan_path): - # TODO ignore relax1/2 subdirs if INCAR.orig found - if contains_vasp_dirs(files): - yield get_symlinked_path(root) + chunk_size = 100 + if nproc > 1 and max_dirs <= chunk_size: + nproc = 1 + print('max_dirs =', max_dirs, 'but chunk size =', chunk_size, '-> parsing sequentially') + pool = multiprocessing.Pool(processes=nproc) + iterator_vaspdirs = get_vasp_dirs(base_path, base_path, max_dirs) + iterator = iterator_slice(iterator_vaspdirs, chunk_size) # process in chunks + queue = deque() + total_nr_vaspdirs_parsed = 0 + while iterator or queue: + try: + args = [next(iterator), insert, drone, already_inserted_subdirs] + queue.append(pool.apply_async(parse_vasp_dirs, args)) + except (StopIteration, TypeError): + iterator = None + while queue and (len(queue) >= pool._processes or not iterator): + process = queue.pop() + process.wait(1) + if not process.ready(): + queue.append(process) else: - for f in files: - if f.endswith('.tar.gz'): - cwd = os.path.realpath(root) - path = os.path.join(cwd, f) - with tarfile.open(path, 'r:gz') as tf: - tf.extractall(cwd) - os.remove(path) - for vaspdir in get_vasp_dirs(path.replace('.tar.gz', '')): - yield vaspdir - - - inputs = ['INCAR', 'KPOINTS', 'POTCAR', 'POSCAR'] - input_structures = [] - for vaspdir in get_vasp_dirs(base_path): - subdir = get_subdir(vaspdir) - if subdir not in already_inserted_subdirs: - print('vaspdir:', vaspdir) - if insert: - for inp in inputs: - input_path = os.path.join(vaspdir, inp) - orig_path = input_path + '.orig' - if not glob(orig_path+'*'): - copyfile(input_path, orig_path) - print('cp', input_path, '->', orig_path) - try: - task_doc = drone.assimilate(vaspdir) - except Exception as ex: - err = str(ex) - if err == 'No VASP files found!': - rmtree(vaspdir) - print('removed', vaspdir) - continue - 
if task_doc['state'] == 'successful': - try: - target.insert_task(task_doc, use_gridfs=True) - except DocumentTooLarge as ex: - print(str(ex)) - print('remove normalmode_eigenvecs and retry ...') - task_doc['calcs_reversed'][0]['output'].pop('normalmode_eigenvecs') - try: - target.insert_task(task_doc, use_gridfs=True) - except DocumentTooLarge as ex: - print(str(ex)) - print('also remove force_constants and retry ...') - task_doc['calcs_reversed'][0]['output'].pop('force_constants') - target.insert_task(task_doc, use_gridfs=True) - if make_snls: - s = Structure.from_dict(task_doc['input']['structure']) - input_structures.append(s) - - if insert and make_snls: - print('add SNLs for', len(input_structures), 'structures') - add_snls(tag, input_structures, add_snlcolls, insert) + total_nr_vaspdirs_parsed += process.get() + pool.close() + print('DONE:', total_nr_vaspdirs_parsed, 'parsed') + + #input_structures = [] + # if make_snls: + # s = Structure.from_dict(task_doc['input']['structure']) + # input_structures.append(s) + + #if insert and make_snls: + # print('add SNLs for', len(input_structures), 'structures') + # add_snls(tag, input_structures, add_snlcolls, insert) From 8d6cff99028d81df571b7f89b026135abc3178f6 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 7 Dec 2018 16:33:12 +0100 Subject: [PATCH 46/97] Initial commit --- .gitignore | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++++ LICENSE | 21 +++++++++++ README.md | 2 ++ 3 files changed, 127 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..894a44cc06 --- /dev/null +++ b/.gitignore @@ -0,0 +1,104 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000..8c40a5f90a --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Materials Project + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000000..9045646106 --- /dev/null +++ b/README.md @@ -0,0 +1,2 @@ +# mp-nomad +Disseminate raw MP calculations through NoMaD From 601b520880941dbb20886dbd8018bb4e9325ce7a Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 7 Dec 2018 16:35:45 +0100 Subject: [PATCH 47/97] first steps with google drive api --- .gitignore | 4 ++ retrieve_mpraw_data.py | 113 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 117 insertions(+) create mode 100644 retrieve_mpraw_data.py diff --git a/.gitignore b/.gitignore index 894a44cc06..0fe7e91a59 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,7 @@ venv.bak/ # mypy .mypy_cache/ + +token.json +credentials.json +mpraw/* diff --git a/retrieve_mpraw_data.py b/retrieve_mpraw_data.py new file mode 100644 index 0000000000..b38f721026 --- /dev/null +++ b/retrieve_mpraw_data.py @@ -0,0 +1,113 @@ +from __future__ import print_function +import io, os +from googleapiclient.discovery import build +from httplib2 import Http +from oauth2client import file, client, tools +from googleapiclient.http import MediaIoBaseDownload +from pprint import pprint + +# If modifying these scopes, delete the file token.json. 
+# see https://developers.google.com/identity/protocols/googlescopes#drivev3 +SCOPES = 'https://www.googleapis.com/auth/drive' +OUTDIR = 'mpraw' +CHUNKSIZE = 5*1024*1024 # 5MB + +def download_file(service, file_id): + request = service.files().get_media(fileId=file_id) + fh = io.BytesIO() + downloader = MediaIoBaseDownload(fh, request, chunksize=CHUNKSIZE) + done = False + while done is False: + status, done = downloader.next_chunk() + print("Download {:d}%.".format(int(status.progress() * 100))) + return fh.getvalue() + +def main(): + """Shows basic usage of the Drive v3 API. + Prints the names and ids of the first 10 files the user has access to. + """ + # The file token.json stores the user's access and refresh tokens, and is + # created automatically when the authorization flow completes for the first + # time. + store = file.Storage('token.json') + creds = store.get() + if not creds or creds.invalid: + flow = client.flow_from_clientsecrets('credentials.json', SCOPES) + creds = tools.run_flow(flow, store) + service = build('drive', 'v3', http=creds.authorize(Http())) + + # Call the Drive v3 API + # https://developers.google.com/drive/api/v3/search-parameters#fn1 + # TODO older launcher directories don't have prefix + # TODO also cover non-b/l hierarchy + block_page_token = None + garden_id = os.environ.get('MPDRIVE_GARDEN_ID') + if garden_id: + block_query = "'{}' in parents and name contains 'block_'".format(garden_id) + else: + print('MPDRIVE_GARDEN_ID not set!') + return + + while True: + block_response = service.files().list( + q=block_query, spaces='drive', pageToken=block_page_token, + fields='nextPageToken, files(id, name)', pageSize=2 + ).execute() + + for block in block_response['files']: + print(block['name']) + block_dir = os.path.join(OUTDIR, block['name']) + if not os.path.exists(block_dir): + os.makedirs(block_dir) + + block_page_token = block_response.get('nextPageToken', None) + if block_page_token is None: + break # done with blocks + + # recurse into the block to retrieve launch_dir's + launcher_page_token = None + launcher_query = "'{}' in parents".format(block['id']) + + while True: + launcher_response = service.files().list( + q=launcher_query, spaces='drive', pageToken=launcher_page_token, + fields='nextPageToken, files(id, name, modifiedTime, size)', + pageSize=10 + ).execute() + + for launcher in launcher_response['files']: + # TODO 'size' doesn't exist if launcher is another dir + # due to non-reservation mode production + if int(launcher['size']) < 50: + service.files().delete(fileId=launcher['id']).execute() + print('removed', launcher['name']) + else: + # download (incl. block) + #pprint(launcher) + path = os.path.join(block_dir, launcher['name']) + print(path) + if not os.path.exists(path): + content = download_file(service, launcher['id']) + with open(path, 'wb') as f: + f.write(content) + print(path, 'downloaded.') + + launcher_page_token = launcher_response.get('nextPageToken', None) + if launcher_page_token is None: + break # done with launchers in current block + + # search for launchers in block again, and rm block if empty dir + launcher_response = service.files().list( + q=launcher_query, spaces='drive', pageSize=1 + ).execute() + if not launcher_response['files']: + service.files().delete(fileId=block['id']).execute() + print('removed', block['name']) + + break # blocks loop TODO remove + + # TODO in production, subscribe to watch garden directory? 
+ # https://developers.google.com/drive/api/v3/reference/files/watch + +if __name__ == '__main__': + main() From ec4f55adad78e6f5f6b013dd9287c89b072d76cf Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 7 Dec 2018 16:36:14 +0100 Subject: [PATCH 48/97] add reqs --- requirements.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000..7c69c3704e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,12 @@ +cachetools==3.0.0 +certifi==2018.10.15 +google-api-python-client==1.7.5 +google-auth==1.6.1 +google-auth-httplib2==0.0.3 +httplib2==0.12.0 +oauth2client==4.1.3 +pyasn1==0.4.4 +pyasn1-modules==0.2.2 +rsa==4.0 +six==1.11.0 +uritemplate==3.0.0 From 0f3101a39d192b0b9610691a874eae9a0bdc6bc2 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 7 Dec 2018 16:41:52 +0100 Subject: [PATCH 49/97] use tqdm for download progress --- retrieve_mpraw_data.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/retrieve_mpraw_data.py b/retrieve_mpraw_data.py index b38f721026..6a2a3423b5 100644 --- a/retrieve_mpraw_data.py +++ b/retrieve_mpraw_data.py @@ -5,21 +5,23 @@ from oauth2client import file, client, tools from googleapiclient.http import MediaIoBaseDownload from pprint import pprint +from tqdm import tqdm # If modifying these scopes, delete the file token.json. # see https://developers.google.com/identity/protocols/googlescopes#drivev3 SCOPES = 'https://www.googleapis.com/auth/drive' OUTDIR = 'mpraw' -CHUNKSIZE = 5*1024*1024 # 5MB +CHUNKSIZE = 1024*1024 # 5MB def download_file(service, file_id): request = service.files().get_media(fileId=file_id) fh = io.BytesIO() downloader = MediaIoBaseDownload(fh, request, chunksize=CHUNKSIZE) done = False - while done is False: - status, done = downloader.next_chunk() - print("Download {:d}%.".format(int(status.progress() * 100))) + with tqdm(total=100) as pbar: + while done is False: + status, done = downloader.next_chunk() + pbar.update(int(status.progress() * 100)) return fh.getvalue() def main(): From 0a10393d01ca4f42d862d0767325bdd3fac7b0d9 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Fri, 7 Dec 2018 16:46:45 +0100 Subject: [PATCH 50/97] update reqs --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 7c69c3704e..9e850ed138 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,5 @@ pyasn1==0.4.4 pyasn1-modules==0.2.2 rsa==4.0 six==1.11.0 +tqdm==4.28.1 uritemplate==3.0.0 From c560068b868e2e72bb2ed17772010ea54e607114 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 10 Dec 2018 07:56:13 -0800 Subject: [PATCH 51/97] cli: correct chunk_size --- emmet/scripts/emmet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index b1dead5fcb..240aa4b969 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,4 +1,4 @@ -import click, os, yaml, sys, logging, tarfile, bson, gzip, csv, tarfile, itertools, multiprocessing +import click, os, yaml, sys, logging, tarfile, bson, gzip, csv, tarfile, itertools, multiprocessing, math from shutil import copyfile, rmtree from glob import glob from fnmatch import fnmatch @@ -1051,7 +1051,7 @@ def parse(base_path, add_snlcolls, insert, make_snls, nproc, max_dirs): already_inserted_subdirs = [get_subdir(dn) for dn in target.collection.find({'tags': tag}).distinct('dir_name')] print(len(already_inserted_subdirs), 'VASP directories 
already inserted for', tag) - chunk_size = 100 + chunk_size = math.ceil(max_dirs/nproc) if nproc > 1 and max_dirs <= chunk_size: nproc = 1 print('max_dirs =', max_dirs, 'but chunk size =', chunk_size, '-> parsing sequentially') From 34a917d9cd002884d883d5d553ab98f2467ab364 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 10 Dec 2018 07:56:57 -0800 Subject: [PATCH 52/97] cli.copy: better task-id --- emmet/scripts/emmet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 240aa4b969..e6a08af74d 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -339,8 +339,8 @@ def insert_snls(snls_list): if isinstance(task_doc['task_id'], int): if insert: - c = target.db.counter.find_one_and_update({"_id": "taskid"}, {"$inc": {"c": 1}}, return_document=ReturnDocument.AFTER)["c"] - task_doc['task_id'] = 'mp-{}'.format(c) + next_tid = max([int(tid[len('mp')+1:]) for tid in target.collection.distinct('task_id')]) + 1 + task_doc['task_id'] = 'mp-{}'.format(next_tid) else: task = target.collection.find_one({'task_id': task_doc['task_id']}, ['orig_inputs', 'output.structure']) if task: From 982726bb7fbc0ca8ff7a27cccf4b3bb183f21d08 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 18 Dec 2018 04:40:13 -0800 Subject: [PATCH 53/97] cli: start gdrive subcommand --- .gitignore | 6 ++- emmet/scripts/emmet.py | 70 +++++++++++++++++++++++++++++++++ emmet/scripts/launcher_paths.py | 39 ------------------ 3 files changed, 75 insertions(+), 40 deletions(-) delete mode 100644 emmet/scripts/launcher_paths.py diff --git a/.gitignore b/.gitignore index ffc341b871..9826898353 100644 --- a/.gitignore +++ b/.gitignore @@ -105,4 +105,8 @@ ENV/ .DS_Store # PyCharm -.idea \ No newline at end of file +.idea + +# GDrive +emmet/scripts/credentials.json +emmet/scripts/token.json diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index e6a08af74d..b866f6da90 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -22,6 +22,9 @@ from emmet.vasp.task_tagger import task_type from log4mongo.handlers import MongoHandler, MongoFormatter from prettytable import PrettyTable +from googleapiclient.discovery import build +from httplib2 import Http +from oauth2client import file, client, tools if 'FW_CONFIG_FILE' not in os.environ: print('Please set FW_CONFIG_FILE!') @@ -33,6 +36,7 @@ task_base_query = {'tags': {'$nin': ['DEPRECATED', 'deprecated']}, '_mpworks_meta': {'$exists': 0}} structure_keys = ['snl_id', 'lattice', 'sites', 'charge', 'about._materialsproject.task_id'] aggregation_keys = ['reduced_cell_formula', 'formula_pretty'] +SCOPES = 'https://www.googleapis.com/auth/drive' def aggregate_by_formula(coll, q, key=None): query = {'$and': [q, exclude]} @@ -1085,3 +1089,69 @@ def parse(base_path, add_snlcolls, insert, make_snls, nproc, max_dirs): # print('add SNLs for', len(input_structures), 'structures') # add_snls(tag, input_structures, add_snlcolls, insert) +@cli.command() +@click.argument('target_db_file', type=click.Path(exists=True)) +def gdrive(target_db_file): + """sync launch directories for target task DB to Google Drive""" + target = VaspCalcDb.from_db_file(target_db_file, admin=True) + print('connected to target db with', target.collection.count(), 'tasks') + print(target.db.materials.count(), 'materials') + + store = file.Storage('token.json') + creds = store.get() + if not creds or creds.invalid: + flow = client.flow_from_clientsecrets('credentials.json', SCOPES) + creds = tools.run_flow(flow, store) + 
service = build('drive', 'v3', http=creds.authorize(Http())) + garden_id = os.environ.get('MPDRIVE_GARDEN_ID') + if not garden_id: + print('MPDRIVE_GARDEN_ID not set!') + return + + query = {} + materials = target.db.materials.find(query, {'task_id': 1, 'blessed_tasks': 1}) + blessed_tasks = dict((doc['task_id'], doc['blessed_tasks']) for doc in materials) + nr_blessed_tasks = sum([len(l) for l in blessed_tasks.values()]) + print(nr_blessed_tasks, 'tasks to sync') + + batch = service.new_batch_http_request() + splits = ['block_', 'aflow_'] + nr_tasks_processed = 0 + for mpid, tasks in blessed_tasks.items(): + for task_type, task_id in tasks.items(): + if task_type == 'GGA Structure Optimization': # TODO remove + if len(batch._order) == 100: + print('execute batch request ...') + batch.execute() + batch = service.new_batch_http_request() + dir_name = target.collection.find_one({'task_id': task_id}, {'dir_name': 1})['dir_name'] + if '_2011-' not in dir_name and '_2012-' not in dir_name: # TODO remove + continue + for s in splits: + ds = dir_name.split(s) + if len(ds) == 2: + block_launcher = s + ds[-1] + print(mpid, task_id, block_launcher) + block, launcher = block_launcher.rsplit(os.sep, 1) + query = "name = '{}.tar.gz'".format(launcher) + response = service.files().list( + q=query, spaces='drive', fields='files(id, name, size)', pageSize=1 + ).execute() + files = response['files'] + if files: + if int(files[0]['size']) < 50: + batch.add(service.files().delete(fileId=files[0]['id'])) + print('TODO: re-upload', files[0]['name']) + else: + print('to upload') + nr_tasks_processed += 1 + break + else: + print(mpid, task_id, ': could not split', dir_name) + return + + if len(batch._order) > 0: + print('execute final batch request ...') + batch.execute() + print(nr_tasks_processed) + diff --git a/emmet/scripts/launcher_paths.py b/emmet/scripts/launcher_paths.py deleted file mode 100644 index fc7625bbf2..0000000000 --- a/emmet/scripts/launcher_paths.py +++ /dev/null @@ -1,39 +0,0 @@ -import json -from atomate.vasp.database import VaspCalcDb - -target_db_file = '../dbfiles/db_atomate.json' -target = VaspCalcDb.from_db_file(target_db_file, admin=True) -print('connected to target db with', target.collection.count(), 'tasks') -print(target.db.materials.count(), 'materials') - -splits = ['block_', 'aflow_'] -mpids = json.load(open('KRao_Li_FullList.txt', 'r')) -print(len(mpids), 'mpids') -query = {'task_id': {'$in': mpids}} - -# {'mp-1002': [{'task_id': ..., 'task_type': ..., 'launcher_path': ...}, ...], ...} -out = {} - -for idx, doc in enumerate(target.db.materials.find(query, {'task_id': 1, 'blessed_tasks': 1})): - mp_id = doc['task_id'] - out[mp_id] = [] - print(idx, mp_id) - for task_type, task_id in doc['blessed_tasks'].items(): - dir_name = target.collection.find_one({'task_id': task_id}, {'dir_name': 1})['dir_name'] - if 'maarten_piezo' in dir_name: - continue - for s in splits: - ds = dir_name.split(s) - if len(ds) == 2: - launcher = s + ds[-1] - print(task_id, task_type, launcher) - out[mp_id].append({'task_id': task_id, 'task_type': task_type, 'launcher_path': launcher}) - break - -with open('launcher_paths.json', 'w') as f: - json.dump(out, f) - -with open('launcher_paths.txt', 'w') as f: - for mp_id, tasks in out.items(): - for task in tasks: - f.write(task['launcher_path']+'\n') From fa65fc8fe68c6d660d13d011940dc39f3ae7d08f Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 18 Dec 2018 04:40:42 -0800 Subject: [PATCH 54/97] cli: parse bugfix --- emmet/scripts/emmet.py | 13 
+++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index b866f6da90..90dcc3cd75 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -88,9 +88,10 @@ def contains_vasp_dirs(list_of_files): if f.startswith("INCAR"): return True -def get_symlinked_path(root, base_path_index): +def get_symlinked_path(root, base_path_index, insert): root_split = os.path.realpath(root).split(os.sep) - if not root_split[base_path_index-1].startswith('block_'): + if base_path_index != len(root_split) and \ + not root_split[base_path_index-1].startswith('block_'): rootdir = os.sep.join(root_split[:base_path_index]) block = get_timestamp_dir(prefix='block') block_dir = os.sep.join(root_split[:base_path_index-1] + [block]) @@ -110,7 +111,7 @@ def get_symlinked_path(root, base_path_index): else: return os.path.realpath(subdir) -def get_vasp_dirs(scan_path, base_path, max_dirs): +def get_vasp_dirs(scan_path, base_path, max_dirs, insert): base_path_split = base_path.split(os.sep) base_path_index = len(base_path_split) # NOTE os.walk followlinks=False by default, as intended here @@ -118,7 +119,7 @@ def get_vasp_dirs(scan_path, base_path, max_dirs): for root, dirs, files in os.walk(scan_path): # TODO ignore relax1/2 subdirs if INCAR.orig found if contains_vasp_dirs(files): - yield get_symlinked_path(root, base_path_index) + yield get_symlinked_path(root, base_path_index, insert) counter += 1 if counter >= max_dirs: break @@ -130,7 +131,7 @@ def get_vasp_dirs(scan_path, base_path, max_dirs): with tarfile.open(path, 'r:gz') as tf: tf.extractall(cwd) os.remove(path) - for vaspdir in get_vasp_dirs(path.replace('.tar.gz', ''), base_path, max_dirs): + for vaspdir in get_vasp_dirs(path.replace('.tar.gz', ''), base_path, max_dirs, insert): yield vaspdir counter += 1 if counter >= max_dirs: @@ -1060,7 +1061,7 @@ def parse(base_path, add_snlcolls, insert, make_snls, nproc, max_dirs): nproc = 1 print('max_dirs =', max_dirs, 'but chunk size =', chunk_size, '-> parsing sequentially') pool = multiprocessing.Pool(processes=nproc) - iterator_vaspdirs = get_vasp_dirs(base_path, base_path, max_dirs) + iterator_vaspdirs = get_vasp_dirs(base_path, base_path, max_dirs, insert) iterator = iterator_slice(iterator_vaspdirs, chunk_size) # process in chunks queue = deque() total_nr_vaspdirs_parsed = 0 From 3dead80747bc1791f4ca4caa694a76e6e3c766a8 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 19 Dec 2018 08:00:46 -0800 Subject: [PATCH 55/97] cli.parse: fix orig copy --- emmet/scripts/emmet.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 90dcc3cd75..8802d22f25 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -157,8 +157,9 @@ def parse_vasp_dirs(vaspdirs, insert, drone, already_inserted_subdirs): if insert: for inp in ['INCAR', 'KPOINTS', 'POTCAR', 'POSCAR']: input_path = os.path.join(vaspdir, inp) - orig_path = input_path + '.orig' - if not glob(orig_path+'*'): + if not glob(input_path+'.orig*'): + input_path = glob(input_path+'*')[0] + orig_path = input_path.replace(inp, inp+'.orig') copyfile(input_path, orig_path) print(name, 'cp', input_path, '->', orig_path) try: From c916d3c2c1e8d5325f3c46e839be8fada83e1b3a Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 19 Dec 2018 08:01:37 -0800 Subject: [PATCH 56/97] cli: progress on gdrive cmd --- emmet/scripts/emmet.py | 128 ++++++++++++++++++++++--------- emmet/scripts/hpss_to_mpdrive.sh | 18 +++-- 2 files 
changed, 102 insertions(+), 44 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 8802d22f25..b10eb3c2d2 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -25,6 +25,7 @@ from googleapiclient.discovery import build from httplib2 import Http from oauth2client import file, client, tools +from googleapiclient.http import MediaFileUpload if 'FW_CONFIG_FILE' not in os.environ: print('Please set FW_CONFIG_FILE!') @@ -1091,6 +1092,17 @@ def parse(base_path, add_snlcolls, insert, make_snls, nproc, max_dirs): # print('add SNLs for', len(input_structures), 'structures') # add_snls(tag, input_structures, add_snlcolls, insert) +def upload_archive(path, name, service, parent=None): + media = MediaFileUpload(path, mimetype='application/gzip', resumable=True) + body = {'name': name, 'parents': [parent]} + request = service.files().create(media_body=media, body=body) + response = None + while response is None: + status, response = request.next_chunk() + if status: + print("Uploaded %d%%." % int(status.progress() * 100)) + print("Upload Complete!") + @cli.command() @click.argument('target_db_file', type=click.Path(exists=True)) def gdrive(target_db_file): @@ -1111,49 +1123,89 @@ def gdrive(target_db_file): return query = {} - materials = target.db.materials.find(query, {'task_id': 1, 'blessed_tasks': 1}) - blessed_tasks = dict((doc['task_id'], doc['blessed_tasks']) for doc in materials) - nr_blessed_tasks = sum([len(l) for l in blessed_tasks.values()]) - print(nr_blessed_tasks, 'tasks to sync') + blessed_task_ids = [ + task_id for doc in target.db.materials.find(query, {'task_id': 1, 'blessed_tasks': 1}) + for task_type, task_id in doc['blessed_tasks'].items() + ] + print(len(blessed_task_ids), 'blessed tasks.') + + dir_names = [] + for task in target.collection.find({'task_id': {'$in': blessed_task_ids}}, {'dir_name': 1}): + dir_name = task['dir_name'] + if '2011-' in dir_name or '2012-' in dir_name: # TODO remove + dir_names.append(dir_name) + dir_names.sort() + print(len(dir_names), 'launcher directories to sync.') - batch = service.new_batch_http_request() splits = ['block_', 'aflow_'] nr_tasks_processed = 0 - for mpid, tasks in blessed_tasks.items(): - for task_type, task_id in tasks.items(): - if task_type == 'GGA Structure Optimization': # TODO remove - if len(batch._order) == 100: - print('execute batch request ...') - batch.execute() - batch = service.new_batch_http_request() - dir_name = target.collection.find_one({'task_id': task_id}, {'dir_name': 1})['dir_name'] - if '_2011-' not in dir_name and '_2012-' not in dir_name: # TODO remove - continue - for s in splits: - ds = dir_name.split(s) - if len(ds) == 2: - block_launcher = s + ds[-1] - print(mpid, task_id, block_launcher) - block, launcher = block_launcher.rsplit(os.sep, 1) - query = "name = '{}.tar.gz'".format(launcher) - response = service.files().list( - q=query, spaces='drive', fields='files(id, name, size)', pageSize=1 - ).execute() - files = response['files'] - if files: - if int(files[0]['size']) < 50: - batch.add(service.files().delete(fileId=files[0]['id'])) - print('TODO: re-upload', files[0]['name']) + prev = None + outfile = open('launcher_paths.txt', 'w') + stage_dir = '/project/projectdirs/matgen/garden/rclone_to_mp_drive' + + for dir_name in dir_names: + + for s in splits: + ds = dir_name.split(s) + if len(ds) == 2: + block_launcher = s + ds[-1] + block_launcher_split = block_launcher.split(os.sep) + #if prev is not None and block_launcher_split[0] != prev \ + # and 
block_launcher_split[0] != 'aflow_engines-mag_special': + # return # TODO remove + + print(block_launcher) + archive_name = '{}.tar.gz'.format(block_launcher_split[-1]) + query = "name = '{}'".format(archive_name) + response = service.files().list( + q=query, spaces='drive', fields='files(id, name, size, parents)', pageSize=1 + ).execute() + files = response['files'] + archive_path = os.path.join(stage_dir, block_launcher + '.tar.gz') + if files: + if int(files[0]['size']) < 50: + service.files().delete(fileId=files[0]['id']) + if os.path.exists(archive_path): + parent = files[0]['parents'][0] + #upload_archive(archive_path, archive_name, service, parent=parent) + #return # TODO remove else: - print('to upload') - nr_tasks_processed += 1 - break + print('TODO: get from HPSS') + outfile.write(block_launcher + '\n') + else: + print('OK:', files[0]) else: - print(mpid, task_id, ': could not split', dir_name) - return + if os.path.exists(archive_path): + # make directories + parents = [garden_id] + for folder in block_launcher_split[:-1]: + query = "name = '{}'".format(folder) + response = service.files().list( + q=query, spaces='drive', fields='files(id, name)', pageSize=1 + ).execute() + if not response['files']: + print('create dir ...', folder) + body = { + 'name': folder, + 'mimeType': "application/vnd.google-apps.folder", + 'parents': [parents[-1]] + } + gdrive_folder = service.files().create(body=body).execute() + parents.append(gdrive_folder['id']) + else: + parents.append(response['files'][0]['id']) + + #upload_archive(archive_path, archive_name, service, parent=parents[-1]) + else: + print('TODO: get from HPSS') + outfile.write(block_launcher + '\n') + nr_tasks_processed += 1 + prev = block_launcher_split[0] + break + else: + print('could not split', dir_name) + return - if len(batch._order) > 0: - print('execute final batch request ...') - batch.execute() print(nr_tasks_processed) + outfile.close() diff --git a/emmet/scripts/hpss_to_mpdrive.sh b/emmet/scripts/hpss_to_mpdrive.sh index 7f6a6da177..2f5e10b7ef 100755 --- a/emmet/scripts/hpss_to_mpdrive.sh +++ b/emmet/scripts/hpss_to_mpdrive.sh @@ -9,18 +9,20 @@ stage_dir="rclone_to_mp_drive" [[ ! -e $hpss_missing ]] && touch $hpss_missing for dir in $dirs; do - [[ ! -e ${dir}.tar.gz ]] && echo "skip ${dir}" && continue # TODO remove + #[[ ! -e ${dir}.tar.gz ]] && echo "skip ${dir}" && continue # TODO remove files=`grep "^$dir" $1` extract="${dir}.extract" grep -q "$dir" $hpss_missing [[ $? -eq 0 ]] && continue - [[ -d $stage_dir/$dir ]] && rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir + #[[ -d $stage_dir/$dir ]] && rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir - missing_paths="${dir}.paths" echo $files | tr ' ' '\n' | sort -u > ${dir}.files rclone lsf -R --files-only mp-drive:calculations/garden/$dir | sed "s:^:$dir/:g" | sed 's:.tar.gz::g' | sort -u > ${dir}.rclone_lsf + + missing_paths="${dir}.paths" + [[ -e $missing_paths ]] && rm -v $missing_paths for f in $(comm --check-order -23 ${dir}.files ${dir}.rclone_lsf); do # launch dirs missing in mp-drive launch_dir_tar="${stage_dir}/${f}.tar.gz" if [[ ! -f $launch_dir_tar || ! -s $launch_dir_tar ]]; then @@ -48,9 +50,11 @@ for dir in $dirs; do if [ ! -e ${dir}.tar_list ] || [ ! -s ${dir}.tar_list ]; then echo "make ${dir}.tar_list ..." tar -tzvf ${dir}.tar.gz | grep ^d | grep -v -e '/relax1/' -e '/relax2/' | awk {'print $6'} 2>&1 | tee ${dir}.tar_list + [[ $? 
-ne 0 ]] && exit fi paths=`cat $missing_paths` + [[ -e $extract ]] && rm -v $extract for f in $paths; do [[ ! -d $f ]] && grep $f ${dir}.tar_list >> $extract done @@ -58,6 +62,7 @@ for dir in $dirs; do if [ -e $extract ] && [ -s $extract ]; then echo "extract" `wc -l $extract` tar -xvzf ${dir}.tar.gz --files-from $extract + [[ $? -ne 0 ]] && rm -v $extract && exit fi rm -v $extract @@ -66,12 +71,13 @@ for dir in $dirs; do echo $launch_dir_tar ... mkdir -p `dirname $launch_dir_tar` tar_code=$(tar -czf $launch_dir_tar -C `dirname $f` `basename $f`) - [[ $tar_code -ne 0 ]] && echo 'problem with launch dir tar!' && break + [[ $tar_code -ne 0 ]] && echo 'problem with launch dir tar!' && exit ls -ltrh $launch_dir_tar - [[ -d $f ]] && rm -r $f + #[[ -d $f ]] && rm -rf $f done rm -v $missing_paths - rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir + #rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir + #rm -v ${dir}.tar.gz done From af61614481c529ee7403b85e171e70befbef3834 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 20 Dec 2018 05:30:26 -0800 Subject: [PATCH 57/97] cli: progress on gdrive sync --- emmet/scripts/emmet.py | 131 +++++++++++++++++++++-------------------- 1 file changed, 67 insertions(+), 64 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index b10eb3c2d2..4d3a6bc1d4 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1129,83 +1129,86 @@ def gdrive(target_db_file): ] print(len(blessed_task_ids), 'blessed tasks.') + splits = ['block_', 'aflow_engines-'] dir_names = [] for task in target.collection.find({'task_id': {'$in': blessed_task_ids}}, {'dir_name': 1}): dir_name = task['dir_name'] - if '2011-' in dir_name or '2012-' in dir_name: # TODO remove - dir_names.append(dir_name) + # aflow_engines-mag_special + if '2011-' in dir_name and 'block_2011-10-07-08-57-17-804213' in dir_name: # TODO remove + for s in splits: + ds = dir_name.split(s) + if len(ds) == 2: + block_launcher = s + ds[-1] + dir_names.append(block_launcher) + break + else: + print('could not split', dir_name) + return + dir_names.sort() print(len(dir_names), 'launcher directories to sync.') - splits = ['block_', 'aflow_'] nr_tasks_processed = 0 prev = None outfile = open('launcher_paths.txt', 'w') stage_dir = '/project/projectdirs/matgen/garden/rclone_to_mp_drive' - for dir_name in dir_names: - - for s in splits: - ds = dir_name.split(s) - if len(ds) == 2: - block_launcher = s + ds[-1] - block_launcher_split = block_launcher.split(os.sep) - #if prev is not None and block_launcher_split[0] != prev \ - # and block_launcher_split[0] != 'aflow_engines-mag_special': - # return # TODO remove - - print(block_launcher) - archive_name = '{}.tar.gz'.format(block_launcher_split[-1]) - query = "name = '{}'".format(archive_name) - response = service.files().list( - q=query, spaces='drive', fields='files(id, name, size, parents)', pageSize=1 - ).execute() - files = response['files'] - archive_path = os.path.join(stage_dir, block_launcher + '.tar.gz') - if files: - if int(files[0]['size']) < 50: - service.files().delete(fileId=files[0]['id']) - if os.path.exists(archive_path): - parent = files[0]['parents'][0] - #upload_archive(archive_path, archive_name, service, parent=parent) - #return # TODO remove - else: - print('TODO: get from HPSS') - outfile.write(block_launcher + '\n') - else: - print('OK:', files[0]) + for idx, dir_name in enumerate(dir_names): + block_launcher_split = dir_name.split(os.sep) + #if prev is not None and prev != 
block_launcher_split[0]: # TODO remove + # break + print(idx, dir_name) + archive_name = '{}.tar.gz'.format(block_launcher_split[-1]) + query = "name = '{}'".format(archive_name) + response = service.files().list( + q=query, spaces='drive', fields='files(id, name, size, parents)' + ).execute() + files = response['files'] + archive_path = os.path.join(stage_dir, dir_name + '.tar.gz') + if files: + if len(files) > 1: + # duplicate uploads - delete all and re-upload + for f in files: + print('removing', f['name'], '...') + service.files().delete(fileId=f['id']).execute() + print('TODO: rerun to upload!') + elif int(files[0]['size']) < 50: + service.files().delete(fileId=files[0]['id']).execute() + if os.path.exists(archive_path): + parent = files[0]['parents'][0] + upload_archive(archive_path, archive_name, service, parent=parent) else: - if os.path.exists(archive_path): - # make directories - parents = [garden_id] - for folder in block_launcher_split[:-1]: - query = "name = '{}'".format(folder) - response = service.files().list( - q=query, spaces='drive', fields='files(id, name)', pageSize=1 - ).execute() - if not response['files']: - print('create dir ...', folder) - body = { - 'name': folder, - 'mimeType': "application/vnd.google-apps.folder", - 'parents': [parents[-1]] - } - gdrive_folder = service.files().create(body=body).execute() - parents.append(gdrive_folder['id']) - else: - parents.append(response['files'][0]['id']) - - #upload_archive(archive_path, archive_name, service, parent=parents[-1]) - else: - print('TODO: get from HPSS') - outfile.write(block_launcher + '\n') - nr_tasks_processed += 1 - prev = block_launcher_split[0] - break + print('TODO: get from HPSS') + outfile.write(dir_name + '\n') + else: + print('OK:', files[0]) else: - print('could not split', dir_name) - return + if os.path.exists(archive_path): + # make directories + parents = [garden_id] + for folder in block_launcher_split[:-1]: + query = "name = '{}'".format(folder) + response = service.files().list( + q=query, spaces='drive', fields='files(id, name)', pageSize=1 + ).execute() + if not response['files']: + print('create dir ...', folder) + body = { + 'name': folder, + 'mimeType': "application/vnd.google-apps.folder", + 'parents': [parents[-1]] + } + gdrive_folder = service.files().create(body=body).execute() + parents.append(gdrive_folder['id']) + else: + parents.append(response['files'][0]['id']) + + upload_archive(archive_path, archive_name, service, parent=parents[-1]) + else: + print('TODO: get from HPSS') + outfile.write(dir_name + '\n') + nr_tasks_processed += 1 + prev = block_launcher_split[0] print(nr_tasks_processed) outfile.close() - From 757534f89ba653023ceec0aa84368af370326355 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 20 Dec 2018 05:34:34 -0800 Subject: [PATCH 58/97] cli: save hpss_to_mpdrive --- emmet/scripts/hpss_to_mpdrive.sh | 36 +++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/emmet/scripts/hpss_to_mpdrive.sh b/emmet/scripts/hpss_to_mpdrive.sh index 2f5e10b7ef..486957eadf 100755 --- a/emmet/scripts/hpss_to_mpdrive.sh +++ b/emmet/scripts/hpss_to_mpdrive.sh @@ -16,7 +16,7 @@ for dir in $dirs; do grep -q "$dir" $hpss_missing [[ $? 
-eq 0 ]] && continue - #[[ -d $stage_dir/$dir ]] && rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir + [[ -d $stage_dir/$dir ]] && rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir echo $files | tr ' ' '\n' | sort -u > ${dir}.files rclone lsf -R --files-only mp-drive:calculations/garden/$dir | sed "s:^:$dir/:g" | sed 's:.tar.gz::g' | sort -u > ${dir}.rclone_lsf @@ -26,16 +26,16 @@ for dir in $dirs; do for f in $(comm --check-order -23 ${dir}.files ${dir}.rclone_lsf); do # launch dirs missing in mp-drive launch_dir_tar="${stage_dir}/${f}.tar.gz" if [[ ! -f $launch_dir_tar || ! -s $launch_dir_tar ]]; then - echo $f >> $missing_paths - elif [ -d $f ]; then - rm -rv $f - fi + echo $f >> $missing_paths + elif [ -d $f ]; then + rm -rv $f + fi done for f in $(comm --check-order -12 ${dir}.files ${dir}.rclone_lsf | tr '\n' ' '); do # already cloned launch dirs -> cleanup launch_dir_tar="${stage_dir}/${f}.tar.gz" [[ -d $f ]] && rm -rv $f - [[ -e $launch_dir_tar ]] && rm -v $launch_dir_tar + [[ -e $launch_dir_tar ]] && rm -v $launch_dir_tar done rm -v ${dir}.files ${dir}.rclone_lsf @@ -61,8 +61,17 @@ for dir in $dirs; do if [ -e $extract ] && [ -s $extract ]; then echo "extract" `wc -l $extract` - tar -xvzf ${dir}.tar.gz --files-from $extract - [[ $? -ne 0 ]] && rm -v $extract && exit + if tar -xvzf ${dir}.tar.gz --files-from $extract; then + echo 'extract OK' + else + rm -v $extract + echo 'problem with extract!' + continue + fi + else + echo 'nothing to extract' + rm -v $extract + continue fi rm -v $extract @@ -70,14 +79,17 @@ for dir in $dirs; do launch_dir_tar="${stage_dir}/${f}.tar.gz" echo $launch_dir_tar ... mkdir -p `dirname $launch_dir_tar` - tar_code=$(tar -czf $launch_dir_tar -C `dirname $f` `basename $f`) - [[ $tar_code -ne 0 ]] && echo 'problem with launch dir tar!' && exit - ls -ltrh $launch_dir_tar + if tar -czf $launch_dir_tar -C `dirname $f` `basename $f`; then + ls -ltrh $launch_dir_tar + else + echo 'problem with launch dir tar!' + continue + fi #[[ -d $f ]] && rm -rf $f done rm -v $missing_paths - #rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir + rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir #rm -v ${dir}.tar.gz done From 9352efca725abd12ff27eb0dd3909d2da1ac797a Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 20 Dec 2018 16:58:31 +0100 Subject: [PATCH 59/97] retrieve from gdrive and compare to NoMaD works --- retrieve_mpraw_data.py | 116 ++++++++++++++++++++++------------------- 1 file changed, 61 insertions(+), 55 deletions(-) diff --git a/retrieve_mpraw_data.py b/retrieve_mpraw_data.py index 6a2a3423b5..7360afa514 100644 --- a/retrieve_mpraw_data.py +++ b/retrieve_mpraw_data.py @@ -1,22 +1,22 @@ from __future__ import print_function -import io, os +import io, os, sys from googleapiclient.discovery import build from httplib2 import Http from oauth2client import file, client, tools from googleapiclient.http import MediaIoBaseDownload -from pprint import pprint from tqdm import tqdm +import requests # If modifying these scopes, delete the file token.json. 
# see https://developers.google.com/identity/protocols/googlescopes#drivev3 SCOPES = 'https://www.googleapis.com/auth/drive' -OUTDIR = 'mpraw' -CHUNKSIZE = 1024*1024 # 5MB +OUTDIR = '/nomad/nomadlab/mpraw' +NOMAD_REPO = 'http://backend-repository-nomad.esc:8111/repo/search/calculations_oldformat?query={}' def download_file(service, file_id): request = service.files().get_media(fileId=file_id) fh = io.BytesIO() - downloader = MediaIoBaseDownload(fh, request, chunksize=CHUNKSIZE) + downloader = MediaIoBaseDownload(fh, request) done = False with tqdm(total=100) as pbar: while done is False: @@ -24,6 +24,53 @@ def download_file(service, file_id): pbar.update(int(status.progress() * 100)) return fh.getvalue() +full_launcher_path = [] + +def recurse(service, folder_id): + page_token = None + query = "'{}' in parents".format(folder_id) + while True: + response = service.files().list( + q=query, spaces='drive', pageToken=page_token, + fields='nextPageToken, files(id, name, modifiedTime, size)', + pageSize=50 + ).execute() + + for launcher in response['files']: + if '.tar.gz' in launcher['name']: + print(launcher) + launcher_name = launcher['name'].replace('.tar.gz', '') + full_launcher_path.append(launcher_name) + nomad_query='repository_main_file_uri="{}"'.format(launcher_name) + #nomad_query='alltarget repository_uri.split="{}"'.format(','.join(full_launcher_path)) # TODO + print(nomad_query) + resp = requests.get(NOMAD_REPO.format(nomad_query)).json() + if 'meta' in resp: + path = os.path.join(*full_launcher_path) + '.tar.gz' + if resp['meta']['total_hits'] < 1: # calculation not found in NoMaD repo + print('Retrieve', path, '...') + if not os.path.exists(path): + os.makedirs(path) + #content = download_file(service, launcher['id']) + #with open(path, 'wb') as f: + # f.write(content) + print('... DONE.') + else: + print(path, 'found in NoMaD repo:') + for d in resp['data']: + print('\t', d['attributes']['repository_uri']) + else: + raise Exception(resp['errors'][0]['detail']) + else: + full_launcher_path.append(launcher['name']) + recurse(service, launcher['id']) + + del full_launcher_path[-1:] + + page_token = response.get('nextPageToken', None) + if page_token is None: + break # done with launchers in current block + def main(): """Shows basic usage of the Drive v3 API. Prints the names and ids of the first 10 files the user has access to. 
@@ -45,7 +92,8 @@ def main(): block_page_token = None garden_id = os.environ.get('MPDRIVE_GARDEN_ID') if garden_id: - block_query = "'{}' in parents and name contains 'block_'".format(garden_id) + #block_query = "'{}' in parents and name contains 'block_'".format(garden_id) + block_query = "'{}' in parents and name contains 'block_2011-10-07-08-57-17-804213'".format(garden_id) else: print('MPDRIVE_GARDEN_ID not set!') return @@ -53,60 +101,18 @@ def main(): while True: block_response = service.files().list( q=block_query, spaces='drive', pageToken=block_page_token, - fields='nextPageToken, files(id, name)', pageSize=2 + fields='nextPageToken, files(id, name)', pageSize=10 ).execute() for block in block_response['files']: print(block['name']) - block_dir = os.path.join(OUTDIR, block['name']) - if not os.path.exists(block_dir): - os.makedirs(block_dir) - - block_page_token = block_response.get('nextPageToken', None) - if block_page_token is None: - break # done with blocks - - # recurse into the block to retrieve launch_dir's - launcher_page_token = None - launcher_query = "'{}' in parents".format(block['id']) - - while True: - launcher_response = service.files().list( - q=launcher_query, spaces='drive', pageToken=launcher_page_token, - fields='nextPageToken, files(id, name, modifiedTime, size)', - pageSize=10 - ).execute() - - for launcher in launcher_response['files']: - # TODO 'size' doesn't exist if launcher is another dir - # due to non-reservation mode production - if int(launcher['size']) < 50: - service.files().delete(fileId=launcher['id']).execute() - print('removed', launcher['name']) - else: - # download (incl. block) - #pprint(launcher) - path = os.path.join(block_dir, launcher['name']) - print(path) - if not os.path.exists(path): - content = download_file(service, launcher['id']) - with open(path, 'wb') as f: - f.write(content) - print(path, 'downloaded.') - - launcher_page_token = launcher_response.get('nextPageToken', None) - if launcher_page_token is None: - break # done with launchers in current block - - # search for launchers in block again, and rm block if empty dir - launcher_response = service.files().list( - q=launcher_query, spaces='drive', pageSize=1 - ).execute() - if not launcher_response['files']: - service.files().delete(fileId=block['id']).execute() - print('removed', block['name']) + full_launcher_path.clear() + full_launcher_path.append(block['name']) + recurse(service, block['id']) - break # blocks loop TODO remove + block_page_token = block_response.get('nextPageToken', None) + if block_page_token is None: + break # done with blocks # TODO in production, subscribe to watch garden directory? 
# https://developers.google.com/drive/api/v3/reference/files/watch From a252b78b2fa1f4dc8400a101dbe0a88cebc0d4be Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 8 Jan 2019 16:41:01 -0800 Subject: [PATCH 60/97] cli.copy: add overview table --- emmet/scripts/emmet.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 4d3a6bc1d4..59d0cbb365 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -248,12 +248,17 @@ def insert_snls(snls_list): else: print('no SNLs to insert') + table = PrettyTable() + table.field_names = ['Tag', 'Source', 'Target', 'Skipped', 'Insert'] + sums = ['total'] + [0] * (len(table.field_names)-1) + for t in tags: - print('### {} ###'.format(t)) + print('- {}'.format(t)) + row = [t] query = {'$and': [{'tags': t}, task_base_query]} source_count = source.collection.count(query) - print('source / target:', source_count, '/', target.collection.count(query)) + row += [source_count, target.collection.count(query)] # get list of SNLs to copy over # only need to check tagged SNLs in source and target; dup-check across SNL collections already done in add_snls @@ -303,8 +308,9 @@ def insert_snls(snls_list): task_query = {'task_id': doc['task_id'], '$or': [{'dir_name': doc['dir_name']}, {'_mpworks_meta': {'$exists': 0}}]} if target.collection.count(task_query): skip_task_ids.append(doc['task_id']) - if len(skip_task_ids): - print('skip', len(skip_task_ids), 'existing MP task ids out of', nr_source_mp_tasks) + #if len(skip_task_ids): + # print('skip', len(skip_task_ids), 'existing MP task ids out of', nr_source_mp_tasks) + row.append(len(skip_task_ids)) query.update({'task_id': {'$nin': skip_task_ids}}) already_inserted_subdirs = [get_subdir(dn) for dn in target.collection.find(query).distinct('dir_name')] @@ -319,7 +325,11 @@ def insert_snls(snls_list): if len(subdirs) < 1: continue - print(len(subdirs), 'candidate tasks to insert') + row.append(len(subdirs)) + table.add_row(row) + for idx, e in enumerate(row): + if isinstance(e, int): + sums[idx] += e if not insert: continue @@ -370,6 +380,12 @@ def insert_snls(snls_list): if insert: target.insert_task(task_doc, use_gridfs=True) + table.align['Tag'] = 'r' + if tag is None: + sfmt = '\033[1;32m{}\033[0m' + table.add_row([sfmt.format(s if s else '-') for s in sums]) + print(table) + @cli.command() @click.option('--add_snlcolls', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to scan') From 0d501260e54236331c42bc0869d28111cbc1b7ce Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 9 Jan 2019 14:58:31 -0800 Subject: [PATCH 61/97] cli.gdrive: better scanning --- emmet/scripts/emmet.py | 86 +++++++++++++++++++++++++++++++++++------- 1 file changed, 72 insertions(+), 14 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 59d0cbb365..44be35c615 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1138,6 +1138,58 @@ def gdrive(target_db_file): print('MPDRIVE_GARDEN_ID not set!') return + launcher_paths = [] + full_launcher_path = [] + + def recurse(service, folder_id): + page_token = None + query = "'{}' in parents".format(folder_id) + while True: + response = service.files().list( + q=query, spaces='drive', pageToken=page_token, + fields='nextPageToken, files(id, name, modifiedTime, size)', + ).execute() + + for launcher in response['files']: + if '.json' not in launcher['name']: + if '.tar.gz' in 
launcher['name']: + launcher_name = launcher['name'].replace('.tar.gz', '') + full_launcher_path.append(launcher_name) + launcher_paths.append(os.path.join(*full_launcher_path)) + else: + full_launcher_path.append(launcher['name']) + recurse(service, launcher['id']) + + del full_launcher_path[-1:] + + page_token = response.get('nextPageToken', None) + if page_token is None: + break # done with launchers in current block + + block_page_token = None + sample_block = 'block_2012-0' #'block_2011-10-07-08-57-17-804213' + block_query = "'{}' in parents".format(garden_id) if sample_block is None \ + else "'{}' in parents and name contains '{}'".format(garden_id, sample_block) + + while True: + block_response = service.files().list( + q=block_query, spaces='drive', pageToken=block_page_token, + fields='nextPageToken, files(id, name)' + ).execute() + + for block in block_response['files']: + print(block['name']) + full_launcher_path.clear() + full_launcher_path.append(block['name']) + recurse(service, block['id']) + + block_page_token = block_response.get('nextPageToken', None) + if block_page_token is None: + break # done with blocks + + launcher_paths.sort() + print(len(launcher_paths), 'launcher directories in GDrive') + query = {} blessed_task_ids = [ task_id for doc in target.db.materials.find(query, {'task_id': 1, 'blessed_tasks': 1}) @@ -1145,24 +1197,30 @@ def gdrive(target_db_file): ] print(len(blessed_task_ids), 'blessed tasks.') - splits = ['block_', 'aflow_engines-'] - dir_names = [] + nr_launchers_sync = 0 + outfile = open('launcher_paths.txt', 'w') + splits = ['block_', 'aflow_engines-', 'launcher_'] for task in target.collection.find({'task_id': {'$in': blessed_task_ids}}, {'dir_name': 1}): dir_name = task['dir_name'] # aflow_engines-mag_special - if '2011-' in dir_name and 'block_2011-10-07-08-57-17-804213' in dir_name: # TODO remove - for s in splits: - ds = dir_name.split(s) - if len(ds) == 2: - block_launcher = s + ds[-1] - dir_names.append(block_launcher) - break - else: - print('could not split', dir_name) - return + if sample_block is not None and sample_block not in dir_name: + continue - dir_names.sort() - print(len(dir_names), 'launcher directories to sync.') + for s in splits: + ds = dir_name.split(s) + if len(ds) == 2: + block_launcher = s + ds[-1] + if dir_name not in launcher_paths: + nr_launchers_sync += 1 + outfile.write(block_launcher + '\n') + break + else: + print('could not split', dir_name) + return + + outfile.close() + print(nr_launchers_sync, 'launchers to sync') + return nr_tasks_processed = 0 prev = None From de620cb94bdd5d87e019e335bc275712aa6a1e5a Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 10 Jan 2019 12:41:31 -0800 Subject: [PATCH 62/97] cli.report: fix status columns --- emmet/scripts/emmet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 44be35c615..6b42abe23c 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -818,7 +818,7 @@ def report(tag, in_progress, to_csv): """generate a report of calculations status""" lpad = LaunchPad.auto_load() - states = ['COMPLETED', 'FIZZLED', 'READY', 'RUNNING'] + states = ['READY', 'RESERVED', 'RUNNING', 'FIZZLED', 'COMPLETED'] tags = [tag] if tag is None: From ab07d90643fb4ac8b4723fe840061c31f52b3e4a Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 10 Jan 2019 12:42:01 -0800 Subject: [PATCH 63/97] cli.wflows: add current year to future wflows --- emmet/scripts/emmet.py | 3 ++- 1 file changed, 2 insertions(+), 1 
deletion(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 6b42abe23c..d91c7da4c9 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -38,6 +38,7 @@ structure_keys = ['snl_id', 'lattice', 'sites', 'charge', 'about._materialsproject.task_id'] aggregation_keys = ['reduced_cell_formula', 'formula_pretty'] SCOPES = 'https://www.googleapis.com/auth/drive' +current_year = int(datetime.today().year) def aggregate_by_formula(coll, q, key=None): query = {'$and': [q, exclude]} @@ -752,7 +753,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): try: wf = wf_structure_optimization(struct, c={'ADD_MODIFY_INCAR': True}) wf = add_trackers(wf) - wf = add_tags(wf, [tag]) + wf = add_tags(wf, [tag, 'mp_{}'.format(current_year)]) if struct.task_id is not None: wf = add_additional_fields_to_taskdocs(wf, update_dict={'task_id': struct.task_id}) except Exception as ex: From 32e54ccd7a5d68d314dc5a44d3dba4311792f005 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 10 Jan 2019 15:23:18 -0800 Subject: [PATCH 64/97] cli.copy: avoid accidentally copying tasks w/o year tag --- emmet/scripts/emmet.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index d91c7da4c9..93ce3fb5e1 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -39,6 +39,7 @@ aggregation_keys = ['reduced_cell_formula', 'formula_pretty'] SCOPES = 'https://www.googleapis.com/auth/drive' current_year = int(datetime.today().year) +year_tags = ['mp_{}'.format(y) for y in range(2018, current_year+1)] def aggregate_by_formula(coll, q, key=None): query = {'$and': [q, exclude]} @@ -69,7 +70,7 @@ def get_meta_from_structure(struct): return d # a utility function to get us a slice of an iterator, as an iterator -# when working with iterators maximum lazyness is preferred +# when working with iterators maximum lazyness is preferred def iterator_slice(iterator, length): iterator = iter(iterator) while True: @@ -89,7 +90,7 @@ def contains_vasp_dirs(list_of_files): for f in list_of_files: if f.startswith("INCAR"): return True - + def get_symlinked_path(root, base_path_index, insert): root_split = os.path.realpath(root).split(os.sep) if base_path_index != len(root_split) and \ @@ -234,9 +235,12 @@ def copy(target_db_file, tag, insert, copy_snls): ensure_indexes(['task_id', 'tags', 'dir_name', 'retired_task_id'], [source.collection, target.collection]) + # don't accidentally copy tasks without year tag + task_base_query['tags']['$in'] = year_tags + tags = [tag] if tag is None: - tags = [t for t in source.collection.find(task_base_query).distinct('tags') if t is not None] + tags = [t for t in source.collection.find(task_base_query).distinct('tags') if t is not None and t not in year_tags] print(len(tags), 'tags in source collection') def insert_snls(snls_list): @@ -753,7 +757,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): try: wf = wf_structure_optimization(struct, c={'ADD_MODIFY_INCAR': True}) wf = add_trackers(wf) - wf = add_tags(wf, [tag, 'mp_{}'.format(current_year)]) + wf = add_tags(wf, [tag, year_tags[-1]]) if struct.task_id is not None: wf = add_additional_fields_to_taskdocs(wf, update_dict={'task_id': struct.task_id}) except Exception as ex: @@ -823,7 +827,7 @@ def report(tag, in_progress, to_csv): tags = [tag] if tag is None: - tags = [t for t in lpad.workflows.distinct('metadata.tags') if t is not None] + tags = [t for t in lpad.workflows.distinct('metadata.tags') if 
t is not None and t not in year_tags] tags += [t for t in lpad.db.add_wflows_logs.distinct('tags') if t is not None and t not in tags] all_tags = [] for t in tags: @@ -1138,7 +1142,7 @@ def gdrive(target_db_file): if not garden_id: print('MPDRIVE_GARDEN_ID not set!') return - + launcher_paths = [] full_launcher_path = [] From fe00279b923f2c838c78b7351c83dc918b8f4603 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 14 Jan 2019 13:56:55 -0800 Subject: [PATCH 65/97] cli.gdrive: add block filter option --- emmet/scripts/emmet.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 93ce3fb5e1..2170673493 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1126,7 +1126,8 @@ def upload_archive(path, name, service, parent=None): @cli.command() @click.argument('target_db_file', type=click.Path(exists=True)) -def gdrive(target_db_file): +@click.option('--block-filter', '-f', help='block filter substring (e.g. block_2017-)') +def gdrive(target_db_file, block_filter): """sync launch directories for target task DB to Google Drive""" target = VaspCalcDb.from_db_file(target_db_file, admin=True) print('connected to target db with', target.collection.count(), 'tasks') @@ -1172,9 +1173,8 @@ def recurse(service, folder_id): break # done with launchers in current block block_page_token = None - sample_block = 'block_2012-0' #'block_2011-10-07-08-57-17-804213' - block_query = "'{}' in parents".format(garden_id) if sample_block is None \ - else "'{}' in parents and name contains '{}'".format(garden_id, sample_block) + block_query = "'{}' in parents".format(garden_id) if block_filter is None \ + else "'{}' in parents and name contains '{}'".format(garden_id, block_filter) while True: block_response = service.files().list( @@ -1203,19 +1203,19 @@ def recurse(service, folder_id): print(len(blessed_task_ids), 'blessed tasks.') nr_launchers_sync = 0 - outfile = open('launcher_paths.txt', 'w') + outfile = open('launcher_paths_{}.txt'.format(block_filter), 'w') splits = ['block_', 'aflow_engines-', 'launcher_'] for task in target.collection.find({'task_id': {'$in': blessed_task_ids}}, {'dir_name': 1}): dir_name = task['dir_name'] # aflow_engines-mag_special - if sample_block is not None and sample_block not in dir_name: + if block_filter is not None and block_filter not in dir_name: continue for s in splits: ds = dir_name.split(s) if len(ds) == 2: block_launcher = s + ds[-1] - if dir_name not in launcher_paths: + if block_launcher not in launcher_paths: nr_launchers_sync += 1 outfile.write(block_launcher + '\n') break From 490a1fd6c23872805598ac02771df68e81ed5d84 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 14 Jan 2019 21:16:35 -0800 Subject: [PATCH 66/97] cli.parse: better block/launcher organization --- emmet/scripts/emmet.py | 73 ++++++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 24 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 2170673493..12c6a66f72 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -91,34 +91,52 @@ def contains_vasp_dirs(list_of_files): if f.startswith("INCAR"): return True +def clean_path(path): + return os.path.join(os.path.abspath(os.path.realpath(path)), '') # trailing slash + +def make_block(base_path): + block = get_timestamp_dir(prefix='block') + block_dir = os.path.join(base_path, block) + os.mkdir(block_dir) + print('created', block_dir) + return block_dir + def get_symlinked_path(root, 
base_path_index, insert): - root_split = os.path.realpath(root).split(os.sep) - if base_path_index != len(root_split) and \ - not root_split[base_path_index-1].startswith('block_'): - rootdir = os.sep.join(root_split[:base_path_index]) - block = get_timestamp_dir(prefix='block') - block_dir = os.sep.join(root_split[:base_path_index-1] + [block]) - if insert: - os.rename(rootdir, block_dir) - os.symlink(block_dir, rootdir) - print(rootdir, '->', block_dir) - subdir = os.sep.join(root_split) + """organize directory in block_*/launcher_* via symbolic links""" + root_split = root.split(os.sep) + base_path = os.sep.join(root_split[:base_path_index]) + + if not root_split[base_path_index].startswith('block_'): + all_blocks = glob(os.path.join(base_path, 'block_*/')) + if all_blocks: + block_dir = max(all_blocks, key=os.path.getmtime) # last-modified block + nr_launchers = len(glob(os.path.join(block_dir, 'launcher_*/'))) + if nr_launchers > 300: # start new block + block_dir = make_block(base_path) + else: + block_dir = make_block(base_path) + else: + block_dir = os.sep.join(root_split[:base_path_index+1]) + if not root_split[-1].startswith('launcher_'): - launch = get_timestamp_dir() - launch_dir = os.path.join(os.path.realpath(os.sep.join(root_split[:-1])), launch) - if insert: - os.rename(subdir, launch_dir) - os.symlink(launch_dir, subdir) - print(subdir, '->', launch_dir) - return launch_dir + launch = get_timestamp_dir(prefix='launcher') + launch_dir = os.path.join(block_dir, launch) else: - return os.path.realpath(subdir) + launch_dir = os.sep.join(block_dir, root_split[-1]) + + if insert: + os.rename(root, launch_dir) + os.symlink(launch_dir, root) + print(root, '->', launch_dir) + return launch_dir def get_vasp_dirs(scan_path, base_path, max_dirs, insert): - base_path_split = base_path.split(os.sep) - base_path_index = len(base_path_split) - # NOTE os.walk followlinks=False by default, as intended here + scan_path = clean_path(scan_path) + base_path = clean_path(base_path) + base_path_index = len(base_path.split(os.sep))-1 # account for abspath counter = 0 + + # NOTE os.walk followlinks=False by default, as intended here for root, dirs, files in os.walk(scan_path): # TODO ignore relax1/2 subdirs if INCAR.orig found if contains_vasp_dirs(files): @@ -140,12 +158,14 @@ def get_vasp_dirs(scan_path, base_path, max_dirs, insert): if counter >= max_dirs: break + def parse_vasp_dirs(vaspdirs, insert, drone, already_inserted_subdirs): name = multiprocessing.current_process().name print(name, 'starting') lpad = LaunchPad.auto_load() target = VaspCalcDb(lpad.host, lpad.port, lpad.name, 'tasks', lpad.username, lpad.password) print(name, 'connected to target db with', target.collection.count(), 'tasks') + for vaspdir in vaspdirs: if get_subdir(vaspdir) in already_inserted_subdirs: print(name, vaspdir, 'already parsed') @@ -173,6 +193,7 @@ def parse_vasp_dirs(vaspdirs, insert, drone, already_inserted_subdirs): rmtree(vaspdir) print(name, 'removed', vaspdir) continue + if task_doc['state'] == 'successful': try: target.insert_task(task_doc, use_gridfs=True) @@ -185,6 +206,7 @@ def parse_vasp_dirs(vaspdirs, insert, drone, already_inserted_subdirs): print(name, 'also remove force_constants and retry ...') task_doc['calcs_reversed'][0]['output'].pop('force_constants') target.insert_task(task_doc, use_gridfs=True) + nr_vaspdirs = len(vaspdirs) print(name, 'processed', nr_vaspdirs, 'VASP directories') return nr_vaspdirs @@ -536,7 +558,6 @@ def find_matching_canonical_task_structures(formula, struct, 
full_name): matched_task_ids.append(s.task_id) return matched_task_ids - for tag, value in tags.items(): if skip_all_scanned and not value[1]: @@ -1075,7 +1096,7 @@ def parse(base_path, add_snlcolls, insert, make_snls, nproc, max_dirs): base_path = os.path.join(base_path, '') base_path_split = base_path.split(os.sep) tag = base_path_split[-1] if base_path_split[-1] else base_path_split[-2] - drone = VaspDrone(parse_dos='auto', additional_fields={'tags': [tag]}) + drone = VaspDrone(parse_dos='auto', additional_fields={'tags': [tag, year_tags[-1]]}) already_inserted_subdirs = [get_subdir(dn) for dn in target.collection.find({'tags': tag}).distinct('dir_name')] print(len(already_inserted_subdirs), 'VASP directories already inserted for', tag) @@ -1083,11 +1104,13 @@ def parse(base_path, add_snlcolls, insert, make_snls, nproc, max_dirs): if nproc > 1 and max_dirs <= chunk_size: nproc = 1 print('max_dirs =', max_dirs, 'but chunk size =', chunk_size, '-> parsing sequentially') + pool = multiprocessing.Pool(processes=nproc) iterator_vaspdirs = get_vasp_dirs(base_path, base_path, max_dirs, insert) iterator = iterator_slice(iterator_vaspdirs, chunk_size) # process in chunks queue = deque() total_nr_vaspdirs_parsed = 0 + while iterator or queue: try: args = [next(iterator), insert, drone, already_inserted_subdirs] @@ -1101,6 +1124,7 @@ def parse(base_path, add_snlcolls, insert, make_snls, nproc, max_dirs): queue.append(process) else: total_nr_vaspdirs_parsed += process.get() + pool.close() print('DONE:', total_nr_vaspdirs_parsed, 'parsed') @@ -1232,6 +1256,7 @@ def recurse(service, folder_id): outfile = open('launcher_paths.txt', 'w') stage_dir = '/project/projectdirs/matgen/garden/rclone_to_mp_drive' + for idx, dir_name in enumerate(dir_names): block_launcher_split = dir_name.split(os.sep) #if prev is not None and prev != block_launcher_split[0]: # TODO remove From ef82ffbb50ddb32044eb1bd7224c10319b49b744 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 14 Jan 2019 21:17:22 -0800 Subject: [PATCH 67/97] cli.parse: improve orig files logic --- emmet/scripts/emmet.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 12c6a66f72..de183b0b00 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -171,20 +171,20 @@ def parse_vasp_dirs(vaspdirs, insert, drone, already_inserted_subdirs): print(name, vaspdir, 'already parsed') continue print(name, 'vaspdir:', vaspdir) - #poscar_path = os.path.join(vaspdir, 'POSCAR.relax2.gz') - #s = Structure.from_file(poscar_path) - #nelements = len(s.composition.elements) - #if nelements > 1: - # print(name, ' -> SKIP (#elements > 1)') - # continue + if insert: - for inp in ['INCAR', 'KPOINTS', 'POTCAR', 'POSCAR']: - input_path = os.path.join(vaspdir, inp) - if not glob(input_path+'.orig*'): - input_path = glob(input_path+'*')[0] - orig_path = input_path.replace(inp, inp+'.orig') - copyfile(input_path, orig_path) - print(name, 'cp', input_path, '->', orig_path) + try: + for inp in ['INCAR', 'KPOINTS', 'POTCAR', 'POSCAR']: + input_path = os.path.join(vaspdir, inp) + if not glob(input_path+'.orig*'): + input_path = glob(input_path+'*')[0] + orig_path = input_path.replace(inp, inp+'.orig') + copyfile(input_path, orig_path) + print(name, 'cp', input_path, '->', orig_path) + except Exception as ex: + print(str(ex)) + continue + try: task_doc = drone.assimilate(vaspdir) except Exception as ex: From e77dbf9b1e736bb7d1b9541af469134557064ac7 Mon Sep 17 00:00:00 2001 
From: Patrick Huck Date: Tue, 15 Jan 2019 11:26:12 -0800 Subject: [PATCH 68/97] cli.parse: fix get_vasp_dirs --- emmet/scripts/emmet.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index de183b0b00..c5c161cb23 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -109,6 +109,7 @@ def get_symlinked_path(root, base_path_index, insert): if not root_split[base_path_index].startswith('block_'): all_blocks = glob(os.path.join(base_path, 'block_*/')) if all_blocks: + # TODO: getmtime doesn't get last created block_dir = max(all_blocks, key=os.path.getmtime) # last-modified block nr_launchers = len(glob(os.path.join(block_dir, 'launcher_*/'))) if nr_launchers > 300: # start new block @@ -121,13 +122,13 @@ def get_symlinked_path(root, base_path_index, insert): if not root_split[-1].startswith('launcher_'): launch = get_timestamp_dir(prefix='launcher') launch_dir = os.path.join(block_dir, launch) + if insert: + os.rename(root, launch_dir) + os.symlink(launch_dir, root) + print(root, '->', launch_dir) else: - launch_dir = os.sep.join(block_dir, root_split[-1]) + launch_dir = os.path.join(block_dir, root_split[-1]) - if insert: - os.rename(root, launch_dir) - os.symlink(launch_dir, root) - print(root, '->', launch_dir) return launch_dir def get_vasp_dirs(scan_path, base_path, max_dirs, insert): From 23f9b285b4b94a1d6ef894dd40139f9f15f045c5 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 15 Jan 2019 12:26:23 -0800 Subject: [PATCH 69/97] cli.parse: better way to make new block --- emmet/scripts/emmet.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index c5c161cb23..070eefdf58 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -109,10 +109,11 @@ def get_symlinked_path(root, base_path_index, insert): if not root_split[base_path_index].startswith('block_'): all_blocks = glob(os.path.join(base_path, 'block_*/')) if all_blocks: - # TODO: getmtime doesn't get last created - block_dir = max(all_blocks, key=os.path.getmtime) # last-modified block - nr_launchers = len(glob(os.path.join(block_dir, 'launcher_*/'))) - if nr_launchers > 300: # start new block + for block_dir in all_blocks: + nr_launchers = len(glob(os.path.join(block_dir, 'launcher_*/'))) + if nr_launchers < 300: + break # found an existing block with < 300 launchers + else: block_dir = make_block(base_path) else: block_dir = make_block(base_path) From ef653c1142bd181470870597e3d3afb6ca75fa36 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 15 Jan 2019 14:19:12 -0800 Subject: [PATCH 70/97] garden_to_hpss: pack in archives, access hpss from user account --- emmet/scripts/garden_to_hpss.sh | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/emmet/scripts/garden_to_hpss.sh b/emmet/scripts/garden_to_hpss.sh index c139184e15..2462b83933 100755 --- a/emmet/scripts/garden_to_hpss.sh +++ b/emmet/scripts/garden_to_hpss.sh @@ -1,12 +1,15 @@ #!/bin/bash +[[ ! -d $1/archives ]] && mkdir -v $1/archives + for block_dir in `find $1 -maxdepth 1 -type d -name "block_*"`; do echo $block_dir subdir=`basename $block_dir` - if [ ! -e ${subdir}.tar.gz ]; then - tar -czvf ${subdir}.tar.gz ${block_dir} + if [ ! -e $1/archives/${subdir}.tar.gz ]; then + tar -czvf $1/archives/${subdir}.tar.gz -C $1 $subdir fi - hsi cput ${subdir}.tar.gz : garden/${subdir}.tar.gz - [[ $? 
-ne 0 ]] && echo "not removing ${block_dir}" && continue - rm -rv $block_dir && rm -v ${subdir}.tar.gz + hsi -l matcomp cput $1/archives/${subdir}.tar.gz : garden/${subdir}.tar.gz + flag=$? + [[ $flag -ne 0 ]] && echo "not removing ${subdir}.tar.gz (flag=$flag)" && continue + rm -v $1/archives/${subdir}.tar.gz done From e47c6c658b023110290230ede39f97ade43a60d6 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 15 Jan 2019 15:19:45 -0800 Subject: [PATCH 71/97] garden_to_hpss: add fail safes --- emmet/scripts/garden_to_hpss.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/emmet/scripts/garden_to_hpss.sh b/emmet/scripts/garden_to_hpss.sh index 2462b83933..eac04c477f 100755 --- a/emmet/scripts/garden_to_hpss.sh +++ b/emmet/scripts/garden_to_hpss.sh @@ -7,9 +7,19 @@ for block_dir in `find $1 -maxdepth 1 -type d -name "block_*"`; do subdir=`basename $block_dir` if [ ! -e $1/archives/${subdir}.tar.gz ]; then tar -czvf $1/archives/${subdir}.tar.gz -C $1 $subdir + flag=$? + if [ $flag -ne 0 ]; then + echo "error with ${subdir}.tar.gz (flag=$flag)" + rm -v $1/archives/${subdir}.tar.gz + continue + fi fi hsi -l matcomp cput $1/archives/${subdir}.tar.gz : garden/${subdir}.tar.gz flag=$? - [[ $flag -ne 0 ]] && echo "not removing ${subdir}.tar.gz (flag=$flag)" && continue + if [ $flag -ne 0 ]; then + echo "error with hsi transfer for ${subdir}.tar.gz (flag=$flag)" + exit + fi rm -v $1/archives/${subdir}.tar.gz + rm -rv $block_dir done From bef604a0b09a236d2486b4c3281a9f1ff17eb484 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 15 Jan 2019 19:46:38 -0800 Subject: [PATCH 72/97] cli.gdrive: fix splits --- emmet/scripts/emmet.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 070eefdf58..4777688297 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1230,10 +1230,9 @@ def recurse(service, folder_id): nr_launchers_sync = 0 outfile = open('launcher_paths_{}.txt'.format(block_filter), 'w') - splits = ['block_', 'aflow_engines-', 'launcher_'] + splits = ['block_', 'res_1_aflow_engines-', 'aflow_engines-'] for task in target.collection.find({'task_id': {'$in': blessed_task_ids}}, {'dir_name': 1}): dir_name = task['dir_name'] - # aflow_engines-mag_special if block_filter is not None and block_filter not in dir_name: continue From 7bfe7e443fe0715a379d2a9e5ab0b0d742a3d23c Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 15 Jan 2019 19:47:07 -0800 Subject: [PATCH 73/97] garden_to_hpss: ok to force rm --- emmet/scripts/garden_to_hpss.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emmet/scripts/garden_to_hpss.sh b/emmet/scripts/garden_to_hpss.sh index eac04c477f..56abd5f0d9 100755 --- a/emmet/scripts/garden_to_hpss.sh +++ b/emmet/scripts/garden_to_hpss.sh @@ -21,5 +21,5 @@ for block_dir in `find $1 -maxdepth 1 -type d -name "block_*"`; do exit fi rm -v $1/archives/${subdir}.tar.gz - rm -rv $block_dir + rm -rfv $block_dir done From 22f98a85226b8eb513d7c7babfa147a4cc9867b5 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 16 Jan 2019 13:03:30 -0800 Subject: [PATCH 74/97] cli.copy: fix year tags before copying --- emmet/scripts/emmet.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 4777688297..796652f06b 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -259,14 +259,37 @@ def copy(target_db_file, tag, insert, copy_snls): 
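    # the copy logic below aborts early if any source task matching the requested
    # tags is missing an mp_<year> tag, so untagged tasks cannot slip into production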
ensure_indexes(['task_id', 'tags', 'dir_name', 'retired_task_id'], [source.collection, target.collection]) - # don't accidentally copy tasks without year tag - task_base_query['tags']['$in'] = year_tags - tags = [tag] if tag is None: tags = [t for t in source.collection.find(task_base_query).distinct('tags') if t is not None and t not in year_tags] print(len(tags), 'tags in source collection') + # fix year tags before copying tasks + counter = Counter() + source_tasks = source.collection.find( + {'$and': [{'tags': {'$in': tags}}, {'tags': {'$nin': year_tags}}]}, {'_id': 0, 'dir_name': 1} + ) + source_tasks_to_fix = source_tasks.count() + if source_tasks_to_fix > 0: + print(source_tasks_to_fix, 'source tasks are missing a year tag!') + print('ERROR: Aborting since this needs testing') + return + + for idx, doc in enumerate(source_tasks): + print(idx, doc['dir_name']) + # check whether I copied it over to production already -> add tag for previous year + # anything not copied is tagged with the current year + prod_task = target.collection.find_one({'dir_name': doc['dir_name']}, {'dir_name': 1, 'tags': 1}) + year_tag = year_tags[-1] + if prod_task: + for t in prod_task['tags']: + if t in year_tags: + year_tag = t + print(year_tag) + #r = source.collection.update({'dir_name': doc['dir_name']}, {'$addToSet': {'tags': year_tag}}) + #counter[year_tag] += r['nModified'] + #print(counter) + def insert_snls(snls_list): if snls_list: print('copy', len(snls_list), 'SNLs') From 525ecf73af0c8703485fc6c8b4675f988a732315 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Wed, 16 Jan 2019 16:17:17 -0800 Subject: [PATCH 75/97] cli: new subcommand find --- emmet/scripts/emmet.py | 60 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 796652f06b..86f6e715f9 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -439,6 +439,66 @@ def insert_snls(snls_list): print(table) +@cli.command() +@click.argument('email') +@click.option('--add_snlcolls', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to scan') +@click.option('--add_tasks_db', type=click.Path(exists=True), help='config file for additional tasks collection to scan') +def find(email, add_snlcolls, add_tasks_db): + """checks status of calculations by submitter or author email in SNLs""" + lpad = LaunchPad.auto_load() + + snl_collections = [lpad.db.snls] + if add_snlcolls is not None: + for snl_db_config in yaml.load_all(open(add_snlcolls, 'r')): + snl_db_conn = MongoClient(snl_db_config['host'], snl_db_config['port'], j=False, connect=False) + snl_db = snl_db_conn[snl_db_config['db']] + snl_db.authenticate(snl_db_config['username'], snl_db_config['password']) + snl_collections.append(snl_db[snl_db_config['collection']]) + for snl_coll in snl_collections: + print(snl_coll.count(exclude), 'SNLs in', snl_coll.full_name) + + tasks_collections = OrderedDict() + tasks_collections[lpad.db.tasks.full_name] = lpad.db.tasks + if add_tasks_db is not None: # TODO multiple alt_task_db_files? 
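        # sketch for the TODO above, assuming click's multiple=True option flag:
        # each extra db.json-style file could be loaded with VaspCalcDb.from_db_file()
        # and registered in tasks_collections under its collection.full_name,
        # mirroring the single-file handling on the next lines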
+ target = VaspCalcDb.from_db_file(add_tasks_db, admin=True) + tasks_collections[target.collection.full_name] = target.collection + for full_name, tasks_coll in tasks_collections.items(): + print(tasks_coll.count(), 'tasks in', full_name) + + #ensure_indexes(['snl_id', 'about.remarks', 'submitter_email', 'about.authors.email'], snl_collections) + ensure_indexes(['snl_id', 'fw_id'], [lpad.db.add_wflows_logs]) + ensure_indexes(['fw_id'], [lpad.fireworks]) + ensure_indexes(['launch_id'], [lpad.launches]) + ensure_indexes(['dir_name', 'task_id'], tasks_collections.values()) + + snl_ids = [] + query = {'$or': [{'submitter_email': email}, {'about.authors.email': email}]} + query.update(exclude) + for snl_coll in snl_collections: + snl_ids.extend(snl_coll.distinct('snl_id', query)) + print(len(snl_ids), 'SNLs') + + fw_ids = lpad.db.add_wflows_logs.distinct('fw_id', {'snl_id': {'$in': snl_ids}}) + print(len(fw_ids), 'FWs') + + launch_ids = lpad.fireworks.distinct('launches', {'fw_id': {'$in': fw_ids}}) + print(len(launch_ids), 'launches') + + launches = lpad.launches.find({'launch_id': {'$in': launch_ids}}, {'launch_dir': 1}) + subdirs = [get_subdir(launch['launch_dir']) for launch in launches] + print(len(subdirs), 'launch directories') + + for full_name, tasks_coll in tasks_collections.items(): + print(full_name) + for subdir in subdirs: + subdir_query = {'dir_name': {'$regex': '/{}$'.format(subdir)}} + task = tasks_coll.find_one(subdir_query, {'task_id': 1}) + if task: + print(task['task_id']) + else: + print(subdir, 'not found') + + @cli.command() @click.option('--add_snlcolls', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to scan') @click.option('--add_tasks_db', type=click.Path(exists=True), help='config file for additional tasks collection to scan') From a458e087f380b096f27c69e24a99f6a222204599 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 17 Jan 2019 00:39:04 -0800 Subject: [PATCH 76/97] hpss: use htar --- emmet/scripts/garden_to_hpss.sh | 25 ++++++++++--------------- emmet/scripts/targz_to_htar.sh | 30 ++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 15 deletions(-) create mode 100755 emmet/scripts/targz_to_htar.sh diff --git a/emmet/scripts/garden_to_hpss.sh b/emmet/scripts/garden_to_hpss.sh index 56abd5f0d9..b6f485166e 100755 --- a/emmet/scripts/garden_to_hpss.sh +++ b/emmet/scripts/garden_to_hpss.sh @@ -1,25 +1,20 @@ #!/bin/bash -[[ ! -d $1/archives ]] && mkdir -v $1/archives +cd $1 && pwd for block_dir in `find $1 -maxdepth 1 -type d -name "block_*"`; do echo $block_dir - subdir=`basename $block_dir` - if [ ! -e $1/archives/${subdir}.tar.gz ]; then - tar -czvf $1/archives/${subdir}.tar.gz -C $1 $subdir - flag=$? - if [ $flag -ne 0 ]; then - echo "error with ${subdir}.tar.gz (flag=$flag)" - rm -v $1/archives/${subdir}.tar.gz - continue - fi - fi - hsi -l matcomp cput $1/archives/${subdir}.tar.gz : garden/${subdir}.tar.gz + chmod -Rv ug+rw $block_dir + [[ $? -ne 0 ]] && echo 'error in chmod' && exit + find $block_dir -type f -not -name "*.gz" -exec pigz -9v {} \; + [[ $? -ne 0 ]] && echo "error in pigz" && exit + block=`basename $block_dir` + htar -cvf garden/${block}.tar $block flag=$? 
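  # htar creates garden/${block}.tar directly in HPSS, so no local ${block}.tar.gz
  # staging copy is kept; listing the archive (e.g. htar -tf garden/${block}.tar)
  # is one way to double-check it before the block directory is eventually removed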
if [ $flag -ne 0 ]; then - echo "error with hsi transfer for ${subdir}.tar.gz (flag=$flag)" + echo "error with htar (flag=$flag)" exit fi - rm -v $1/archives/${subdir}.tar.gz - rm -rfv $block_dir + #rm -rfv $block_dir + break done diff --git a/emmet/scripts/targz_to_htar.sh b/emmet/scripts/targz_to_htar.sh new file mode 100755 index 0000000000..c0a40ec502 --- /dev/null +++ b/emmet/scripts/targz_to_htar.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# NOTE make sure matcomp is first entry in ~/.netrc! +[[ ! -e garden.txt ]] && hsi -P -l matcomp ls -1 "garden/*.tar.gz" > garden.txt + +while read block_tar_gz; do + block=`basename ${block_tar_gz%%.tar.gz}` + echo $block + if [ ! -e ${block}.tar.gz ]; then + hsi -q -l matcomp get garden/${block}.tar.gz + [[ $? -ne 0 ]] && echo 'error in hsi get' && exit + fi + if [ ! -d ${block} ]; then + tar -xvzf ${block}.tar.gz + [[ $? -ne 0 ]] && echo 'error in tar -x' && exit + fi + chmod -Rv ug+rw ${block} + [[ $? -ne 0 ]] && echo 'error in chmod' && exit + find ${block} -type f -not -name "*.gz" -exec pigz -9v {} \; + [[ $? -ne 0 ]] && echo "error in pigz" && exit + htar -cvf garden/${block}.tar ${block} + [[ $? -ne 0 ]] && echo 'error in htar -c' && exit + hsi -q -l matcomp rm garden/${block}.tar.gz + [[ $? -ne 0 ]] && echo 'error in htar rm' && exit + rm -rv ${block} + rm -v ${block}.tar.gz + break # TODO remove +done < garden.txt + + From 9ea8a991d35392a29348f528db638a24ca402884 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 17 Jan 2019 13:38:43 -0800 Subject: [PATCH 77/97] htar ready: garden_to_hpss & targz_to_htar --- emmet/scripts/garden_to_hpss.sh | 13 ++++--------- emmet/scripts/targz_to_htar.sh | 20 ++++++++------------ 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/emmet/scripts/garden_to_hpss.sh b/emmet/scripts/garden_to_hpss.sh index b6f485166e..64da6cfcc1 100755 --- a/emmet/scripts/garden_to_hpss.sh +++ b/emmet/scripts/garden_to_hpss.sh @@ -4,17 +4,12 @@ cd $1 && pwd for block_dir in `find $1 -maxdepth 1 -type d -name "block_*"`; do echo $block_dir - chmod -Rv ug+rw $block_dir + find $block_dir -not -perm -660 -exec chmod -v g+rw {} \; [[ $? -ne 0 ]] && echo 'error in chmod' && exit find $block_dir -type f -not -name "*.gz" -exec pigz -9v {} \; [[ $? -ne 0 ]] && echo "error in pigz" && exit block=`basename $block_dir` - htar -cvf garden/${block}.tar $block - flag=$? - if [ $flag -ne 0 ]; then - echo "error with htar (flag=$flag)" - exit - fi - #rm -rfv $block_dir - break + htar -M 5000000 -cvf garden/${block}.tar $block + [[ $? -ne 0 ]] && echo "error with htar" && exit + rm -rfv $block_dir done diff --git a/emmet/scripts/targz_to_htar.sh b/emmet/scripts/targz_to_htar.sh index c0a40ec502..67c0eeeab4 100755 --- a/emmet/scripts/targz_to_htar.sh +++ b/emmet/scripts/targz_to_htar.sh @@ -1,30 +1,26 @@ #!/bin/bash # NOTE make sure matcomp is first entry in ~/.netrc! -[[ ! -e garden.txt ]] && hsi -P -l matcomp ls -1 "garden/*.tar.gz" > garden.txt +cd $1 && pwd +hsi -P -l matcomp ls -1 "garden/*.tar.gz" > garden.txt while read block_tar_gz; do block=`basename ${block_tar_gz%%.tar.gz}` echo $block - if [ ! -e ${block}.tar.gz ]; then - hsi -q -l matcomp get garden/${block}.tar.gz - [[ $? -ne 0 ]] && echo 'error in hsi get' && exit - fi - if [ ! -d ${block} ]; then - tar -xvzf ${block}.tar.gz - [[ $? -ne 0 ]] && echo 'error in tar -x' && exit - fi - chmod -Rv ug+rw ${block} + hsi -q -l matcomp cget garden/${block}.tar.gz + [[ $? -ne 0 ]] && echo 'error in hsi cget' && exit + tar --skip-old-files -xvzf ${block}.tar.gz + [[ $? 
-ne 0 ]] && echo 'error in tar -x' && exit + find $block -not -perm -660 -exec chmod -v g+rw {} \; [[ $? -ne 0 ]] && echo 'error in chmod' && exit find ${block} -type f -not -name "*.gz" -exec pigz -9v {} \; [[ $? -ne 0 ]] && echo "error in pigz" && exit - htar -cvf garden/${block}.tar ${block} + htar -M 5000000 -cvf garden/${block}.tar ${block} [[ $? -ne 0 ]] && echo 'error in htar -c' && exit hsi -q -l matcomp rm garden/${block}.tar.gz [[ $? -ne 0 ]] && echo 'error in htar rm' && exit rm -rv ${block} rm -v ${block}.tar.gz - break # TODO remove done < garden.txt From fed1f3c4c6cf044255115dd65d97553a8a22c402 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 17 Jan 2019 15:18:16 -0800 Subject: [PATCH 78/97] cli.gdrive: fix block_filter logic --- emmet/scripts/emmet.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 86f6e715f9..bf5304e867 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1316,14 +1316,14 @@ def recurse(service, folder_id): splits = ['block_', 'res_1_aflow_engines-', 'aflow_engines-'] for task in target.collection.find({'task_id': {'$in': blessed_task_ids}}, {'dir_name': 1}): dir_name = task['dir_name'] - if block_filter is not None and block_filter not in dir_name: - continue - for s in splits: ds = dir_name.split(s) if len(ds) == 2: block_launcher = s + ds[-1] - if block_launcher not in launcher_paths: + if block_launcher not in launcher_paths and ( + block_filter is None or \ + (block_filter is not None and block_launcher.startswith(block_filter)) + ): nr_launchers_sync += 1 outfile.write(block_launcher + '\n') break From 04baacef0f266b6a00ef67e1116d4b3d6e4c0bea Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 17 Jan 2019 16:13:02 -0800 Subject: [PATCH 79/97] hpss_to_mpdrive ready for htar --- emmet/scripts/hpss_to_mpdrive.sh | 76 +++++++++----------------------- 1 file changed, 21 insertions(+), 55 deletions(-) diff --git a/emmet/scripts/hpss_to_mpdrive.sh b/emmet/scripts/hpss_to_mpdrive.sh index 486957eadf..cb82a0574e 100755 --- a/emmet/scripts/hpss_to_mpdrive.sh +++ b/emmet/scripts/hpss_to_mpdrive.sh @@ -1,27 +1,23 @@ #!/bin/bash -# $(find $dir -name 'INCAR.orig*' -printf '%h ') -dirs=`awk -F/ '{print $1}' $1 | sort -u` -hpss_missing="blocks_missing_in_hpss.txt" +input=$PWD/launcher_paths.txt +[[ ! -e $input ]] && echo $input missing && exit +dirs=`awk -F/ '{print $1}' $input | sort -u` -stage_dir="rclone_to_mp_drive" -[[ ! -d $stage_dir ]] && mkdir $stage_dir -[[ ! -e $hpss_missing ]] && touch $hpss_missing +cd $1 && pwd +stage_dir=rclone_to_mp_drive +[[ ! -d $stage_dir ]] && mkdir -pv $stage_dir for dir in $dirs; do - #[[ ! -e ${dir}.tar.gz ]] && echo "skip ${dir}" && continue # TODO remove - - files=`grep "^$dir" $1` - extract="${dir}.extract" - grep -q "$dir" $hpss_missing - [[ $? 
-eq 0 ]] && continue - - [[ -d $stage_dir/$dir ]] && rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir + echo $dir + files=`grep "^$dir" $input` echo $files | tr ' ' '\n' | sort -u > ${dir}.files + wc -l ${dir}.files rclone lsf -R --files-only mp-drive:calculations/garden/$dir | sed "s:^:$dir/:g" | sed 's:.tar.gz::g' | sort -u > ${dir}.rclone_lsf + wc -l ${dir}.rclone_lsf - missing_paths="${dir}.paths" + missing_paths=${dir}.paths [[ -e $missing_paths ]] && rm -v $missing_paths for f in $(comm --check-order -23 ${dir}.files ${dir}.rclone_lsf); do # launch dirs missing in mp-drive launch_dir_tar="${stage_dir}/${f}.tar.gz" @@ -39,57 +35,27 @@ for dir in $dirs; do done rm -v ${dir}.files ${dir}.rclone_lsf - [[ ! -e $missing_paths ]] && continue - - if [ ! -e ${dir}.tar.gz ] || [ ! -s ${dir}.tar.gz ]; then - hsi -q "get garden/${dir}.tar.gz" - [[ $? -ne 0 ]] && echo ${dir} >> $hpss_missing && continue - fi - ls -ltrh ${dir}.tar.gz - - if [ ! -e ${dir}.tar_list ] || [ ! -s ${dir}.tar_list ]; then - echo "make ${dir}.tar_list ..." - tar -tzvf ${dir}.tar.gz | grep ^d | grep -v -e '/relax1/' -e '/relax2/' | awk {'print $6'} 2>&1 | tee ${dir}.tar_list - [[ $? -ne 0 ]] && exit - fi - - paths=`cat $missing_paths` - [[ -e $extract ]] && rm -v $extract - for f in $paths; do - [[ ! -d $f ]] && grep $f ${dir}.tar_list >> $extract - done + [[ ! -e $missing_paths ]] && echo nothing missing on GDrive!? && exit #continue + wc -l $missing_paths - if [ -e $extract ] && [ -s $extract ]; then - echo "extract" `wc -l $extract` - if tar -xvzf ${dir}.tar.gz --files-from $extract; then - echo 'extract OK' - else - rm -v $extract - echo 'problem with extract!' - continue - fi - else - echo 'nothing to extract' - rm -v $extract - continue - fi - rm -v $extract + #htar -xvf garden/${dir}.tar -L $missing_paths + #[[ $? -ne 0 ]] && echo missing paths not found in HPSS!? && exit #continue + ls -ltrhd ${dir} - for f in $paths; do + for f in `cat $missing_paths`; do launch_dir_tar="${stage_dir}/${f}.tar.gz" echo $launch_dir_tar ... mkdir -p `dirname $launch_dir_tar` - if tar -czf $launch_dir_tar -C `dirname $f` `basename $f`; then + if tar --use-compress-program="pigz -9rv" -cf $launch_dir_tar -C `dirname $f` `basename $f`; then ls -ltrh $launch_dir_tar else echo 'problem with launch dir tar!' - continue + rm -v $launch_dir_tar + exit fi - #[[ -d $f ]] && rm -rf $f + [[ -d $f ]] && rm -rv $f done rm -v $missing_paths rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir - #rm -v ${dir}.tar.gz - done From be147e4b57b6c41d1b1ac7776d89dfc8a959912a Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 17 Jan 2019 16:13:28 -0800 Subject: [PATCH 80/97] targz_to_htar: use pigz --- emmet/scripts/targz_to_htar.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emmet/scripts/targz_to_htar.sh b/emmet/scripts/targz_to_htar.sh index 67c0eeeab4..ed62b40516 100755 --- a/emmet/scripts/targz_to_htar.sh +++ b/emmet/scripts/targz_to_htar.sh @@ -9,7 +9,7 @@ while read block_tar_gz; do echo $block hsi -q -l matcomp cget garden/${block}.tar.gz [[ $? -ne 0 ]] && echo 'error in hsi cget' && exit - tar --skip-old-files -xvzf ${block}.tar.gz + tar -I pigz --skip-old-files -xvf ${block}.tar.gz [[ $? -ne 0 ]] && echo 'error in tar -x' && exit find $block -not -perm -660 -exec chmod -v g+rw {} \; [[ $? 
-ne 0 ]] && echo 'error in chmod' && exit From 4589ca96301dde7ce31641975abd228807884222 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Thu, 17 Jan 2019 17:23:42 -0800 Subject: [PATCH 81/97] targz_to_htar: speed improvement on chmod --- emmet/scripts/targz_to_htar.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emmet/scripts/targz_to_htar.sh b/emmet/scripts/targz_to_htar.sh index ed62b40516..b923350d32 100755 --- a/emmet/scripts/targz_to_htar.sh +++ b/emmet/scripts/targz_to_htar.sh @@ -11,7 +11,7 @@ while read block_tar_gz; do [[ $? -ne 0 ]] && echo 'error in hsi cget' && exit tar -I pigz --skip-old-files -xvf ${block}.tar.gz [[ $? -ne 0 ]] && echo 'error in tar -x' && exit - find $block -not -perm -660 -exec chmod -v g+rw {} \; + parallel -0m 'chmod -v g+rw {}' :::: <(find $block -not -perm -660 -print0) [[ $? -ne 0 ]] && echo 'error in chmod' && exit find ${block} -type f -not -name "*.gz" -exec pigz -9v {} \; [[ $? -ne 0 ]] && echo "error in pigz" && exit From 5495ba78a8fe3edc60f4b82582004d39cb215d46 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 28 Jan 2019 14:15:54 -0800 Subject: [PATCH 82/97] cli: add fw_id for intended mp-ids --- emmet/scripts/emmet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index bf5304e867..c33fed3f60 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -791,7 +791,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name): if fw['spec']['_tasks'][5]['additional_fields'].get('task_id') == struct.task_id: msg = ' --> OK: workflow {} will result in intended task-id {}'.format(fw['fw_id'], struct.task_id) print(msg) - logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id, 'tags': [tag]}) + logger.warning(msg, extra={'formula': formula, 'snl_id': struct.snl_id, 'task_id': struct.task_id, 'fw_id': fw['fw_id'], 'tags': [tag]}) fw_found = True break if not fw_found: From 54f103533178e85d8c5fe448efff56cc6878aa28 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 28 Jan 2019 14:16:10 -0800 Subject: [PATCH 83/97] cli: whitespace fix --- emmet/scripts/emmet.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index c33fed3f60..6df758a21c 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1321,9 +1321,9 @@ def recurse(service, folder_id): if len(ds) == 2: block_launcher = s + ds[-1] if block_launcher not in launcher_paths and ( - block_filter is None or \ - (block_filter is not None and block_launcher.startswith(block_filter)) - ): + block_filter is None or \ + (block_filter is not None and block_launcher.startswith(block_filter)) + ): nr_launchers_sync += 1 outfile.write(block_launcher + '\n') break From 23524f41cd7092b578063b0ff2275a0018a35c02 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 28 Jan 2019 14:16:51 -0800 Subject: [PATCH 84/97] cli: minor hpss/garden scripts update --- emmet/scripts/garden_to_hpss.sh | 2 +- emmet/scripts/targz_to_htar.sh | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/emmet/scripts/garden_to_hpss.sh b/emmet/scripts/garden_to_hpss.sh index 64da6cfcc1..0a9655c3d9 100755 --- a/emmet/scripts/garden_to_hpss.sh +++ b/emmet/scripts/garden_to_hpss.sh @@ -4,7 +4,7 @@ cd $1 && pwd for block_dir in `find $1 -maxdepth 1 -type d -name "block_*"`; do echo $block_dir - find $block_dir -not -perm -660 -exec chmod -v g+rw {} \; + parallel -0m 'chmod -v 
g+rw {}' :::: <(find $block_dir -not -perm -660 -print0) [[ $? -ne 0 ]] && echo 'error in chmod' && exit find $block_dir -type f -not -name "*.gz" -exec pigz -9v {} \; [[ $? -ne 0 ]] && echo "error in pigz" && exit diff --git a/emmet/scripts/targz_to_htar.sh b/emmet/scripts/targz_to_htar.sh index b923350d32..812085ee88 100755 --- a/emmet/scripts/targz_to_htar.sh +++ b/emmet/scripts/targz_to_htar.sh @@ -1,8 +1,11 @@ #!/bin/bash # NOTE make sure matcomp is first entry in ~/.netrc! -cd $1 && pwd -hsi -P -l matcomp ls -1 "garden/*.tar.gz" > garden.txt +indir=$1 +year=$2 +garden=garden_${year}.txt +cd $indir && pwd +hsi -P -l matcomp ls -1 "garden/block_${year}*.tar.gz" > $garden while read block_tar_gz; do block=`basename ${block_tar_gz%%.tar.gz}` @@ -11,6 +14,10 @@ while read block_tar_gz; do [[ $? -ne 0 ]] && echo 'error in hsi cget' && exit tar -I pigz --skip-old-files -xvf ${block}.tar.gz [[ $? -ne 0 ]] && echo 'error in tar -x' && exit + [[ -d garden_pauling_files/$block ]] && mv -vi garden_pauling_files/$block . + [[ -d garden_cori/$block ]] && mv -vi garden_cori/$block . + [[ -d garden_JulAug2018/$block ]] && mv -vi garden_JulAug2018/$block . + [[ -d garden_Jul2018/$block ]] && mv -vi garden_Jul2018/$block . parallel -0m 'chmod -v g+rw {}' :::: <(find $block -not -perm -660 -print0) [[ $? -ne 0 ]] && echo 'error in chmod' && exit find ${block} -type f -not -name "*.gz" -exec pigz -9v {} \; @@ -21,6 +28,6 @@ while read block_tar_gz; do [[ $? -ne 0 ]] && echo 'error in htar rm' && exit rm -rv ${block} rm -v ${block}.tar.gz -done < garden.txt +done < $garden From 30e6c18f1cedc2021dab50e00821233e28598d37 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 28 Jan 2019 16:18:23 -0800 Subject: [PATCH 85/97] setup: remove py_modules --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 6a05ac67a7..4f78e444b3 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,6 @@ 'Topic :: Scientific/Engineering'], test_suite='nose.collector', tests_require=['nose'], - py_modules=['emmet'], entry_points=''' [console_scripts] emmet=emmet.scripts.emmet:cli From c4a5553108faa7b45f67c3bc10c6fc5f04771f5d Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 29 Jan 2019 14:00:15 -0800 Subject: [PATCH 86/97] cli: add bandstructure subcommand --- emmet/scripts/emmet.py | 52 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 6df758a21c..67cf3bb14d 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -16,8 +16,8 @@ from fireworks.fw_config import FW_BLOCK_FORMAT from atomate.vasp.database import VaspCalcDb from atomate.vasp.drones import VaspDrone -from atomate.vasp.workflows.presets.core import wf_structure_optimization -from atomate.vasp.powerups import add_trackers, add_tags, add_additional_fields_to_taskdocs +from atomate.vasp.workflows.presets.core import wf_structure_optimization, wf_bandstructure +from atomate.vasp.powerups import add_trackers, add_tags, add_additional_fields_to_taskdocs, add_wf_metadata from emmet.vasp.materials import group_structures, get_sg from emmet.vasp.task_tagger import task_type from log4mongo.handlers import MongoHandler, MongoFormatter @@ -498,6 +498,54 @@ def find(email, add_snlcolls, add_tasks_db): else: print(subdir, 'not found') +@cli.command() +@click.argument('target_db_file', type=click.Path(exists=True)) +@click.option('--insert/--no-insert', default=False, help='actually execute workflow addition') +def 
bandstructure(target_db_file, insert): + """add workflows for bandstructure based on materials collection""" + lpad = LaunchPad.auto_load() + source = VaspCalcDb(lpad.host, lpad.port, lpad.name, 'tasks', lpad.username, lpad.password) + print('connected to source db with', source.collection.count(), 'tasks') + target = VaspCalcDb.from_db_file(target_db_file, admin=True) + print('connected to target db with', target.collection.count(), 'tasks') + materials = target.db["materials.core"] + ensure_indexes(['task_id'], [materials]) + ensure_indexes(['metadata.task_id'], [lpad.workflows]) + print(materials.count(), 'core materials') + + all_mat_ids = set(materials.distinct('task_id')) + existing_mat_ids = set(filter(None, lpad.workflows.distinct('metadata.task_id'))) + mat_ids = all_mat_ids.symmetric_difference(existing_mat_ids) + print(len(mat_ids), 'bandstructure workflows to add') + + wflows = [] + for mat_id in mat_ids: + structure = Structure.from_dict(materials.find_one({'task_id': mat_id}, {'structure': 1})['structure']) + dir_name = target.collection.find_one({'task_id': mat_id}, {'dir_name': 1})['dir_name'] + subdir = get_subdir(dir_name) + subdir_query = {'dir_name': {'$regex': '/{}$'.format(subdir)}} + source_task = source.collection.find_one(subdir_query, {'tags': 1}) + if not source_task: + print('source task not found -> TODO') + break + + # bandstructure task has this year's tag (remove other year tags from source_task) + tags = [t for t in source_task['tags'] if t not in year_tags] + tags.append(year_tags[-1]) + + wf = wf_bandstructure(structure, c={'ADD_MODIFY_INCAR': True}) # TODO non-SO bandstructure workflow -> Alex + wf = add_trackers(wf) + wf = add_tags(wf, tags) + wf = add_wf_metadata(wf, structure) + wf.metadata["task_id"] = mat_id + wflows.append(wf) + print(wf.as_dict()) + break + + if insert: + lpad.bulk_add_wfs(wflows) + + @cli.command() @click.option('--add_snlcolls', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to scan') From 23ddab4cd1b7931b7ee4e2acb613985457ebd6f7 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 29 Jan 2019 15:20:17 -0800 Subject: [PATCH 87/97] cli.report: add all states --- emmet/scripts/emmet.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index 67cf3bb14d..bb9f299926 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -12,7 +12,7 @@ from pymatgen import Structure from pymatgen.alchemy.materials import TransformedStructure from pymatgen.util.provenance import StructureNL, Author -from fireworks import LaunchPad +from fireworks import LaunchPad, Firework from fireworks.fw_config import FW_BLOCK_FORMAT from atomate.vasp.database import VaspCalcDb from atomate.vasp.drones import VaspDrone @@ -977,7 +977,8 @@ def report(tag, in_progress, to_csv): """generate a report of calculations status""" lpad = LaunchPad.auto_load() - states = ['READY', 'RESERVED', 'RUNNING', 'FIZZLED', 'COMPLETED'] + states = Firework.STATE_RANKS + states = sorted(states, key=states.get) tags = [tag] if tag is None: From b5650951664b9b46855942ac075f6954e83adb14 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 29 Jan 2019 15:22:28 -0800 Subject: [PATCH 88/97] cli: skip VolumePredictor --- emmet/scripts/emmet.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index bb9f299926..e77876130d 100644 --- 
a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -789,17 +789,18 @@ def find_matching_canonical_task_structures(formula, struct, full_name): for struc in slist: - try: - struct = vp.get_predicted_structure(struc) - struct.snl_id, struct.task_id = struc.snl_id, struc.task_id - except Exception as ex: - print('Structure for SNL', struc.snl_id, '--> VP error: use original structure!') - print(ex) - struct = struc - - if not structures_match(struct, struc): - print('Structure for SNL', struc.snl_id, '--> VP mismatch: use original structure!') - struct = struc + #try: + # struct = vp.get_predicted_structure(struc) + # struct.snl_id, struct.task_id = struc.snl_id, struc.task_id + #except Exception as ex: + # print('Structure for SNL', struc.snl_id, '--> VP error: use original structure!') + # print(ex) + # struct = struc + + #if not structures_match(struct, struc): + # print('Structure for SNL', struc.snl_id, '--> VP mismatch: use original structure!') + # struct = struc + struct = struc wf_found = False if sgnum in canonical_workflow_structures[formula] and canonical_workflow_structures[formula][sgnum]: From 0227a9eb4436cc92ba2e0820f013941462dcd94c Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 5 Feb 2019 11:53:50 -0800 Subject: [PATCH 89/97] cli.gdrive: fix store init --- emmet/scripts/emmet.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index e77876130d..a67bed480c 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1292,10 +1292,13 @@ def gdrive(target_db_file, block_filter): print('connected to target db with', target.collection.count(), 'tasks') print(target.db.materials.count(), 'materials') - store = file.Storage('token.json') - creds = store.get() + creds, store = None, None + if os.path.exists('token.json'): + store = file.Storage('token.json') + creds = store.get() if not creds or creds.invalid: flow = client.flow_from_clientsecrets('credentials.json', SCOPES) + store = file.Storage('token.json') creds = tools.run_flow(flow, store) service = build('drive', 'v3', http=creds.authorize(Http())) garden_id = os.environ.get('MPDRIVE_GARDEN_ID') From eb128bd60880a9df6d2d8f9a4e4aaad56a047301 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 5 Feb 2019 11:54:53 -0800 Subject: [PATCH 90/97] cli minor hpss scripts update --- emmet/scripts/hpss_to_mpdrive.sh | 10 ++++++---- emmet/scripts/targz_to_htar.sh | 2 ++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/emmet/scripts/hpss_to_mpdrive.sh b/emmet/scripts/hpss_to_mpdrive.sh index cb82a0574e..7bc7689793 100755 --- a/emmet/scripts/hpss_to_mpdrive.sh +++ b/emmet/scripts/hpss_to_mpdrive.sh @@ -1,6 +1,6 @@ #!/bin/bash -input=$PWD/launcher_paths.txt +input=$2 [[ ! -e $input ]] && echo $input missing && exit dirs=`awk -F/ '{print $1}' $input | sort -u` @@ -35,14 +35,15 @@ for dir in $dirs; do done rm -v ${dir}.files ${dir}.rclone_lsf - [[ ! -e $missing_paths ]] && echo nothing missing on GDrive!? && exit #continue + [[ ! -e $missing_paths ]] && echo nothing missing on GDrive!? && continue wc -l $missing_paths - #htar -xvf garden/${dir}.tar -L $missing_paths - #[[ $? -ne 0 ]] && echo missing paths not found in HPSS!? && exit #continue + htar -xvf garden/${dir}.tar `cat $missing_paths | tr '\n' ' '` ls -ltrhd ${dir} + [[ $? -ne 0 ]] && echo missing paths not found in HPSS!? && continue for f in `cat $missing_paths`; do + [[ ! -e $f ]] && echo $f not found in HPSS!? 
&& continue launch_dir_tar="${stage_dir}/${f}.tar.gz" echo $launch_dir_tar ... mkdir -p `dirname $launch_dir_tar` @@ -58,4 +59,5 @@ for dir in $dirs; do rm -v $missing_paths rclone -v copy $stage_dir/$dir mp-drive:calculations/garden/$dir + find $dir -type d -empty -print -delete done diff --git a/emmet/scripts/targz_to_htar.sh b/emmet/scripts/targz_to_htar.sh index 812085ee88..3c33754719 100755 --- a/emmet/scripts/targz_to_htar.sh +++ b/emmet/scripts/targz_to_htar.sh @@ -18,6 +18,8 @@ while read block_tar_gz; do [[ -d garden_cori/$block ]] && mv -vi garden_cori/$block . [[ -d garden_JulAug2018/$block ]] && mv -vi garden_JulAug2018/$block . [[ -d garden_Jul2018/$block ]] && mv -vi garden_Jul2018/$block . + [[ -d garden_Aug14-16_2018/$block ]] && mv -vi garden_Aug14-16_2018/$block . + [[ -d garden_Aug2018/$block ]] && mv -vi garden_Aug2018/$block . parallel -0m 'chmod -v g+rw {}' :::: <(find $block -not -perm -660 -print0) [[ $? -ne 0 ]] && echo 'error in chmod' && exit find ${block} -type f -not -name "*.gz" -exec pigz -9v {} \; From 63ab2f0411da93289ee555814c9fe9b1f70bf633 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 5 Feb 2019 11:55:14 -0800 Subject: [PATCH 91/97] cli: add update_hpss_archive.sh --- emmet/scripts/update_hpss_archive.sh | 44 ++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100755 emmet/scripts/update_hpss_archive.sh diff --git a/emmet/scripts/update_hpss_archive.sh b/emmet/scripts/update_hpss_archive.sh new file mode 100755 index 0000000000..31896c2c12 --- /dev/null +++ b/emmet/scripts/update_hpss_archive.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +indir=/project/projectdirs/matgen/garden/hpss_to_mpdrive/raw +cd $indir && pwd + +#for block in $(find . -maxdepth 1 -type d -name "block_2011*" -exec basename {} \;); do +#for block in $(cat hpss_update_2013.txt); do +for block_targz in $(ls block_201*.tar.gz); do + tar -I pigz --skip-old-files -xvf ${block_targz} + [[ $? -ne 0 ]] && echo "error in tar -x" && exit + block=${block_targz%%.tar.gz} + echo $block + [[ ! -d $block ]] && echo $block does not exist && exit + find $block -type d -empty -print -delete + [[ ! -d $block ]] && echo $block only contained empty directories && exit + + parallel -0m 'chmod -v g+rw {}' :::: <(find $block -not -perm -660 -print0) + [[ $? -ne 0 ]] && echo 'error in chmod' && exit + find $block -type f -not -name "*.gz" -exec pigz -9v {} \; + [[ $? -ne 0 ]] && echo "error in pigz" && exit + + htar -vtf garden/${block}.tar | awk '{ print $7 }' | sort -u > ${block}.tar.idx + [[ $? -ne 0 ]] && echo "error in htar -t" && exit + find $block -type f | sort -u > ${block}.idx + + comm -13 ${block}.tar.idx ${block}.idx > ${block}.missing + if [ -s ${block}.missing ]; then + nfiles=$(wc -l ${block}.missing | awk '{ print $1}') + echo need syncing of $nfiles files + htar -xvf garden/${block}.tar + [[ $? -ne 0 ]] && echo "error in htar -x" && exit + hsi -q -l matcomp mv garden/${block}.tar garden/${block}.tar.bkp + hsi -q -l matcomp mv garden/${block}.tar.idx garden/${block}.tar.idx.bkp + htar -M 5000000 -cvf garden/${block}.tar ${block} + [[ $? -ne 0 ]] && echo "error in htar -c" && exit + hsi -q -l matcomp rm garden/${block}.tar*.bkp + [[ $? 
-ne 0 ]] && echo 'error in htar rm' && exit + else + echo all files already in HTAR archive + fi + rm -rv ${block} + rm -v ${block}.tar.idx ${block}.idx ${block}.missing + rm -v ${block_targz} +done From f4c60869283d13fbb040d6587c5623ff6b0e6eb0 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 5 Feb 2019 11:55:58 -0800 Subject: [PATCH 92/97] cli: add sbatch scripts --- emmet/scripts/sbatch/submit_garden_to_hpss.txt | 14 ++++++++++++++ emmet/scripts/sbatch/submit_hpss_MatProj.script | 16 ++++++++++++++++ .../scripts/sbatch/submit_hpss_to_mpdrive.script | 13 +++++++++++++ emmet/scripts/sbatch/submit_restore_MatProj.txt | 16 ++++++++++++++++ emmet/scripts/sbatch/submit_rsync.script | 13 +++++++++++++ emmet/scripts/sbatch/submit_targz_to_htar.script | 15 +++++++++++++++ .../sbatch/submit_update_hpss_archive.script | 11 +++++++++++ 7 files changed, 98 insertions(+) create mode 100644 emmet/scripts/sbatch/submit_garden_to_hpss.txt create mode 100644 emmet/scripts/sbatch/submit_hpss_MatProj.script create mode 100644 emmet/scripts/sbatch/submit_hpss_to_mpdrive.script create mode 100644 emmet/scripts/sbatch/submit_restore_MatProj.txt create mode 100644 emmet/scripts/sbatch/submit_rsync.script create mode 100644 emmet/scripts/sbatch/submit_targz_to_htar.script create mode 100644 emmet/scripts/sbatch/submit_update_hpss_archive.script diff --git a/emmet/scripts/sbatch/submit_garden_to_hpss.txt b/emmet/scripts/sbatch/submit_garden_to_hpss.txt new file mode 100644 index 0000000000..4014113384 --- /dev/null +++ b/emmet/scripts/sbatch/submit_garden_to_hpss.txt @@ -0,0 +1,14 @@ +#!/bin/bash -l +#SBATCH --qos=xfer +#SBATCH --time=48:00:00 +#SBATCH --job-name=garden_to_hpss +#SBATCH --licenses=SCRATCH +#SBATCH --mail-user=phuck@lbl.gov +#SBATCH --mail-type=ALL +#SBATCH --output=garden_to_hpss-%j.out +#SBATCH --error=garden_to_hpss-%j.error +#SBATCH --mem=10GB + +script=$HOME/mp_prod/codes/emmet/emmet/scripts/garden_to_hpss.sh +indir=/project/projectdirs/matgen/garden/hpss_to_mpdrive/raw/ +$script $indir diff --git a/emmet/scripts/sbatch/submit_hpss_MatProj.script b/emmet/scripts/sbatch/submit_hpss_MatProj.script new file mode 100644 index 0000000000..dcb854abfe --- /dev/null +++ b/emmet/scripts/sbatch/submit_hpss_MatProj.script @@ -0,0 +1,16 @@ +#!/bin/bash -l +#SBATCH --qos=xfer +#SBATCH --time=48:00:00 +#SBATCH --job-name=hpss_MatProj +#SBATCH --licenses=SCRATCH +#SBATCH --mail-user=phuck@lbl.gov +#SBATCH --mail-type=ALL +#SBATCH --output=hpss_MatProj-%j.out +#SBATCH --error=hpss_MatProj-%j.error +#SBATCH --mem=10GB + +while read line; do + echo $line + hsi -q -l matcomp ls -1 ${line}.idx + [[ $? 
-ne 0 ]] && htar -Xvf $line +done < hpss_MatProj_2014.txt diff --git a/emmet/scripts/sbatch/submit_hpss_to_mpdrive.script b/emmet/scripts/sbatch/submit_hpss_to_mpdrive.script new file mode 100644 index 0000000000..b22041d5da --- /dev/null +++ b/emmet/scripts/sbatch/submit_hpss_to_mpdrive.script @@ -0,0 +1,13 @@ +#!/bin/bash -l +#SBATCH --qos=xfer +#SBATCH --time=19:00:00 +#SBATCH --job-name=hpss_to_mpdrive +#SBATCH --licenses=SCRATCH +#SBATCH --mail-user=phuck@lbl.gov +#SBATCH --mail-type=ALL +#SBATCH --output=hpss_to_mpdrive-%j.out +#SBATCH --error=hpss_to_mpdrive-%j.error + +indir=/project/projectdirs/matgen/garden/hpss_to_mpdrive +input=/global/homes/h/huck/mp_prod/workdir/emmet_gdrive/launcher_paths_block_2019.txt +~/mp_prod/codes/emmet/emmet/scripts/hpss_to_mpdrive.sh $indir $input diff --git a/emmet/scripts/sbatch/submit_restore_MatProj.txt b/emmet/scripts/sbatch/submit_restore_MatProj.txt new file mode 100644 index 0000000000..e616942e64 --- /dev/null +++ b/emmet/scripts/sbatch/submit_restore_MatProj.txt @@ -0,0 +1,16 @@ +#!/bin/bash -l +#SBATCH --qos=xfer +#SBATCH --time=48:00:00 +#SBATCH --job-name=restore_matproj +#SBATCH --licenses=SCRATCH +#SBATCH --mail-user=phuck@lbl.gov +#SBATCH --mail-type=ALL +#SBATCH --output=restore_matproj-%j.out +#SBATCH --error=restore_matproj-%j.error + +outdir=/project/projectdirs/matgen/garden/hpss_to_mpdrive/raw/ +archive=/home/projects/MatProj/GARDEN/2012-Jul-Aug.tar + +cd $outdir && pwd +htar -xvf $archive +echo DONE diff --git a/emmet/scripts/sbatch/submit_rsync.script b/emmet/scripts/sbatch/submit_rsync.script new file mode 100644 index 0000000000..e08eaef033 --- /dev/null +++ b/emmet/scripts/sbatch/submit_rsync.script @@ -0,0 +1,13 @@ +#!/bin/bash -l +#SBATCH --qos=xfer +#SBATCH --time=48:00:00 +#SBATCH --job-name=rsync +#SBATCH --licenses=SCRATCH +#SBATCH --mail-user=phuck@lbl.gov +#SBATCH --mail-type=ALL +#SBATCH --output=rsync-%j.out +#SBATCH --error=rsync-%j.error + +indir=/project/projectdirs/matgen/garden/control_blocks +outdir=/project/projectdirs/matgen/garden/hpss_to_mpdrive/raw/ +rsync --remove-source-files -av $indir/block_* $outdir diff --git a/emmet/scripts/sbatch/submit_targz_to_htar.script b/emmet/scripts/sbatch/submit_targz_to_htar.script new file mode 100644 index 0000000000..953da4efa1 --- /dev/null +++ b/emmet/scripts/sbatch/submit_targz_to_htar.script @@ -0,0 +1,15 @@ +#!/bin/bash -l +#SBATCH --qos=xfer +#SBATCH --time=48:00:00 +#SBATCH --job-name=targz_to_htar +#SBATCH --licenses=SCRATCH +#SBATCH --mail-user=phuck@lbl.gov +#SBATCH --mail-type=ALL +#SBATCH --output=targz_to_htar-%j.out +#SBATCH --error=targz_to_htar-%j.error +#SBATCH --mem=10GB + +targz_to_htar=$HOME/mp_prod/codes/emmet/emmet/scripts/targz_to_htar.sh +indir=/project/projectdirs/matgen/garden/hpss_to_mpdrive/raw/ +year=2019 +$targz_to_htar $indir $year diff --git a/emmet/scripts/sbatch/submit_update_hpss_archive.script b/emmet/scripts/sbatch/submit_update_hpss_archive.script new file mode 100644 index 0000000000..ad9b2da997 --- /dev/null +++ b/emmet/scripts/sbatch/submit_update_hpss_archive.script @@ -0,0 +1,11 @@ +#!/bin/bash -l +#SBATCH --qos=xfer +#SBATCH --time=06:30:00 +#SBATCH --job-name=update_hpss_archive +#SBATCH --licenses=SCRATCH +#SBATCH --mail-user=phuck@lbl.gov +#SBATCH --mail-type=ALL +#SBATCH --output=update_hpss_archive-%j.out +#SBATCH --error=update_hpss_archive-%j.error + +~/mp_prod/codes/emmet/emmet/scripts/update_hpss_archive.sh From 44fa0b5434daf4b17f893385816ffb82752a96b3 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 5 Feb 
2019 21:31:36 +0100 Subject: [PATCH 93/97] prepare for emmet merge --- .gitignore | 108 ------------------ LICENSE | 21 ---- README.md | 2 - .../emmet/scripts/retrieve_mpraw_data.py | 0 requirements.txt | 13 --- 5 files changed, 144 deletions(-) delete mode 100644 .gitignore delete mode 100644 LICENSE delete mode 100644 README.md rename retrieve_mpraw_data.py => emmet/emmet/scripts/retrieve_mpraw_data.py (100%) delete mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 0fe7e91a59..0000000000 --- a/.gitignore +++ /dev/null @@ -1,108 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# pyenv -.python-version - -# celery beat schedule file -celerybeat-schedule - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ - -token.json -credentials.json -mpraw/* diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 8c40a5f90a..0000000000 --- a/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2018 Materials Project - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
diff --git a/README.md b/README.md deleted file mode 100644 index 9045646106..0000000000 --- a/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# mp-nomad -Disseminate raw MP calculations through NoMaD diff --git a/retrieve_mpraw_data.py b/emmet/emmet/scripts/retrieve_mpraw_data.py similarity index 100% rename from retrieve_mpraw_data.py rename to emmet/emmet/scripts/retrieve_mpraw_data.py diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 9e850ed138..0000000000 --- a/requirements.txt +++ /dev/null @@ -1,13 +0,0 @@ -cachetools==3.0.0 -certifi==2018.10.15 -google-api-python-client==1.7.5 -google-auth==1.6.1 -google-auth-httplib2==0.0.3 -httplib2==0.12.0 -oauth2client==4.1.3 -pyasn1==0.4.4 -pyasn1-modules==0.2.2 -rsa==4.0 -six==1.11.0 -tqdm==4.28.1 -uritemplate==3.0.0 From 065f58e501652dce15c291aa05a9a4e37dc700bd Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 5 Feb 2019 12:35:19 -0800 Subject: [PATCH 94/97] cli: fix mpnomad location --- emmet/{emmet => }/scripts/retrieve_mpraw_data.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename emmet/{emmet => }/scripts/retrieve_mpraw_data.py (100%) diff --git a/emmet/emmet/scripts/retrieve_mpraw_data.py b/emmet/scripts/retrieve_mpraw_data.py similarity index 100% rename from emmet/emmet/scripts/retrieve_mpraw_data.py rename to emmet/scripts/retrieve_mpraw_data.py From 4bad927cef4b20aa095aaffa2e97d253a6b6a5c9 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 5 Feb 2019 12:46:23 -0800 Subject: [PATCH 95/97] cli: retire retrieve_mpraw_data --- emmet/scripts/emmet.py | 43 +++++++++- emmet/scripts/retrieve_mpraw_data.py | 121 --------------------------- 2 files changed, 41 insertions(+), 123 deletions(-) delete mode 100644 emmet/scripts/retrieve_mpraw_data.py diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index a67bed480c..ffe4a24216 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -1,4 +1,4 @@ -import click, os, yaml, sys, logging, tarfile, bson, gzip, csv, tarfile, itertools, multiprocessing, math +import click, os, yaml, sys, logging, tarfile, bson, gzip, csv, tarfile, itertools, multiprocessing, math, io, requests from shutil import copyfile, rmtree from glob import glob from fnmatch import fnmatch @@ -25,7 +25,8 @@ from googleapiclient.discovery import build from httplib2 import Http from oauth2client import file, client, tools -from googleapiclient.http import MediaFileUpload +from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload +from tqdm import tqdm if 'FW_CONFIG_FILE' not in os.environ: print('Please set FW_CONFIG_FILE!') @@ -40,6 +41,8 @@ SCOPES = 'https://www.googleapis.com/auth/drive' current_year = int(datetime.today().year) year_tags = ['mp_{}'.format(y) for y in range(2018, current_year+1)] +NOMAD_OUTDIR = '/nomad/nomadlab/mpraw' +NOMAD_REPO = 'http://backend-repository-nomad.esc:8111/repo/search/calculations_oldformat?query={}' def aggregate_by_formula(coll, q, key=None): query = {'$and': [q, exclude]} @@ -1283,6 +1286,17 @@ def upload_archive(path, name, service, parent=None): print("Uploaded %d%%." 
% int(status.progress() * 100)) print("Upload Complete!") +def download_file(service, file_id): + request = service.files().get_media(fileId=file_id) + fh = io.BytesIO() + downloader = MediaIoBaseDownload(fh, request) + done = False + with tqdm(total=100) as pbar: + while done is False: + status, done = downloader.next_chunk() + pbar.update(int(status.progress() * 100)) + return fh.getvalue() + @cli.command() @click.argument('target_db_file', type=click.Path(exists=True)) @click.option('--block-filter', '-f', help='block filter substring (e.g. block_2017-)') @@ -1324,6 +1338,28 @@ def recurse(service, folder_id): launcher_name = launcher['name'].replace('.tar.gz', '') full_launcher_path.append(launcher_name) launcher_paths.append(os.path.join(*full_launcher_path)) + + # TODO NoMaD integration + #nomad_query='repository_main_file_uri="{}"'.format(launcher_name) + ##nomad_query='alltarget repository_uri.split="{}"'.format(','.join(full_launcher_path)) # TODO + #print(nomad_query) + #resp = requests.get(NOMAD_REPO.format(nomad_query)).json() + #if 'meta' in resp: + # path = os.path.join(*full_launcher_path) + '.tar.gz' + # if resp['meta']['total_hits'] < 1: # calculation not found in NoMaD repo + # print('Retrieve', path, '...') + # if not os.path.exists(path): + # os.makedirs(path) + # #content = download_file(service, launcher['id']) + # #with open(path, 'wb') as f: + # # f.write(content) + # print('... DONE.') + # else: + # print(path, 'found in NoMaD repo:') + # for d in resp['data']: + # print('\t', d['attributes']['repository_uri']) + #else: + # raise Exception(resp['errors'][0]['detail']) else: full_launcher_path.append(launcher['name']) recurse(service, launcher['id']) @@ -1334,6 +1370,9 @@ def recurse(service, folder_id): if page_token is None: break # done with launchers in current block + + # TODO older launcher directories don't have prefix + # TODO also cover non-b/l hierarchy block_page_token = None block_query = "'{}' in parents".format(garden_id) if block_filter is None \ else "'{}' in parents and name contains '{}'".format(garden_id, block_filter) diff --git a/emmet/scripts/retrieve_mpraw_data.py b/emmet/scripts/retrieve_mpraw_data.py deleted file mode 100644 index 7360afa514..0000000000 --- a/emmet/scripts/retrieve_mpraw_data.py +++ /dev/null @@ -1,121 +0,0 @@ -from __future__ import print_function -import io, os, sys -from googleapiclient.discovery import build -from httplib2 import Http -from oauth2client import file, client, tools -from googleapiclient.http import MediaIoBaseDownload -from tqdm import tqdm -import requests - -# If modifying these scopes, delete the file token.json. 
-# see https://developers.google.com/identity/protocols/googlescopes#drivev3 -SCOPES = 'https://www.googleapis.com/auth/drive' -OUTDIR = '/nomad/nomadlab/mpraw' -NOMAD_REPO = 'http://backend-repository-nomad.esc:8111/repo/search/calculations_oldformat?query={}' - -def download_file(service, file_id): - request = service.files().get_media(fileId=file_id) - fh = io.BytesIO() - downloader = MediaIoBaseDownload(fh, request) - done = False - with tqdm(total=100) as pbar: - while done is False: - status, done = downloader.next_chunk() - pbar.update(int(status.progress() * 100)) - return fh.getvalue() - -full_launcher_path = [] - -def recurse(service, folder_id): - page_token = None - query = "'{}' in parents".format(folder_id) - while True: - response = service.files().list( - q=query, spaces='drive', pageToken=page_token, - fields='nextPageToken, files(id, name, modifiedTime, size)', - pageSize=50 - ).execute() - - for launcher in response['files']: - if '.tar.gz' in launcher['name']: - print(launcher) - launcher_name = launcher['name'].replace('.tar.gz', '') - full_launcher_path.append(launcher_name) - nomad_query='repository_main_file_uri="{}"'.format(launcher_name) - #nomad_query='alltarget repository_uri.split="{}"'.format(','.join(full_launcher_path)) # TODO - print(nomad_query) - resp = requests.get(NOMAD_REPO.format(nomad_query)).json() - if 'meta' in resp: - path = os.path.join(*full_launcher_path) + '.tar.gz' - if resp['meta']['total_hits'] < 1: # calculation not found in NoMaD repo - print('Retrieve', path, '...') - if not os.path.exists(path): - os.makedirs(path) - #content = download_file(service, launcher['id']) - #with open(path, 'wb') as f: - # f.write(content) - print('... DONE.') - else: - print(path, 'found in NoMaD repo:') - for d in resp['data']: - print('\t', d['attributes']['repository_uri']) - else: - raise Exception(resp['errors'][0]['detail']) - else: - full_launcher_path.append(launcher['name']) - recurse(service, launcher['id']) - - del full_launcher_path[-1:] - - page_token = response.get('nextPageToken', None) - if page_token is None: - break # done with launchers in current block - -def main(): - """Shows basic usage of the Drive v3 API. - Prints the names and ids of the first 10 files the user has access to. - """ - # The file token.json stores the user's access and refresh tokens, and is - # created automatically when the authorization flow completes for the first - # time. 
- store = file.Storage('token.json') - creds = store.get() - if not creds or creds.invalid: - flow = client.flow_from_clientsecrets('credentials.json', SCOPES) - creds = tools.run_flow(flow, store) - service = build('drive', 'v3', http=creds.authorize(Http())) - - # Call the Drive v3 API - # https://developers.google.com/drive/api/v3/search-parameters#fn1 - # TODO older launcher directories don't have prefix - # TODO also cover non-b/l hierarchy - block_page_token = None - garden_id = os.environ.get('MPDRIVE_GARDEN_ID') - if garden_id: - #block_query = "'{}' in parents and name contains 'block_'".format(garden_id) - block_query = "'{}' in parents and name contains 'block_2011-10-07-08-57-17-804213'".format(garden_id) - else: - print('MPDRIVE_GARDEN_ID not set!') - return - - while True: - block_response = service.files().list( - q=block_query, spaces='drive', pageToken=block_page_token, - fields='nextPageToken, files(id, name)', pageSize=10 - ).execute() - - for block in block_response['files']: - print(block['name']) - full_launcher_path.clear() - full_launcher_path.append(block['name']) - recurse(service, block['id']) - - block_page_token = block_response.get('nextPageToken', None) - if block_page_token is None: - break # done with blocks - - # TODO in production, subscribe to watch garden directory? - # https://developers.google.com/drive/api/v3/reference/files/watch - -if __name__ == '__main__': - main() From bcb24d486b1d01c0e75e8862b8f3ec12beb5e7b6 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 5 Feb 2019 22:49:55 +0100 Subject: [PATCH 96/97] cli: don't import DLSVolumePredictor --- emmet/scripts/emmet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index ffe4a24216..e3bbdd83e2 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -8,7 +8,7 @@ from pymongo.errors import CursorNotFound from pymongo.collection import ReturnDocument from pymongo.errors import DocumentTooLarge -from pymatgen.analysis.structure_prediction.volume_predictor import DLSVolumePredictor +#from pymatgen.analysis.structure_prediction.volume_predictor import DLSVolumePredictor from pymatgen import Structure from pymatgen.alchemy.materials import TransformedStructure from pymatgen.util.provenance import StructureNL, Author @@ -597,7 +597,7 @@ def wflows(add_snlcolls, add_tasks_db, tag, insert, clear_logs, max_structures, print(tasks_coll.count(), 'tasks in', full_name) NO_POTCARS = ['Po', 'At', 'Rn', 'Fr', 'Ra', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr'] - vp = DLSVolumePredictor() + #vp = DLSVolumePredictor() tags = OrderedDict() if tag is None: From 42f0e80492d0ecbdbc6b02c73ed7927f3039c708 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Tue, 5 Feb 2019 22:50:18 +0100 Subject: [PATCH 97/97] cli: better launchpad autoload --- emmet/scripts/emmet.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/emmet/scripts/emmet.py b/emmet/scripts/emmet.py index e3bbdd83e2..fb27fc5493 100644 --- a/emmet/scripts/emmet.py +++ b/emmet/scripts/emmet.py @@ -28,9 +28,11 @@ from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload from tqdm import tqdm -if 'FW_CONFIG_FILE' not in os.environ: - print('Please set FW_CONFIG_FILE!') - sys.exit(0) +def get_lpad(): + if 'FW_CONFIG_FILE' not in os.environ: + print('Please set FW_CONFIG_FILE!') + sys.exit(0) + return LaunchPad.auto_load() exclude = {'about.remarks': {'$nin': ['DEPRECATED', 'deprecated']}} skip_labels = ['He', 
'He0+', 'Ar', 'Ar0+', 'Ne', 'Ne0+', 'D', 'D+'] @@ -167,7 +169,7 @@ def get_vasp_dirs(scan_path, base_path, max_dirs, insert): def parse_vasp_dirs(vaspdirs, insert, drone, already_inserted_subdirs): name = multiprocessing.current_process().name print(name, 'starting') - lpad = LaunchPad.auto_load() + lpad = get_lpad() target = VaspCalcDb(lpad.host, lpad.port, lpad.name, 'tasks', lpad.username, lpad.password) print(name, 'connected to target db with', target.collection.count(), 'tasks') @@ -253,7 +255,7 @@ def copy(target_db_file, tag, insert, copy_snls): if not insert: print('DRY RUN: add --insert flag to actually add tasks to production') - lpad = LaunchPad.auto_load() + lpad = get_lpad() source = VaspCalcDb(lpad.host, lpad.port, lpad.name, 'tasks', lpad.username, lpad.password) print('connected to source db with', source.collection.count(), 'tasks') @@ -448,7 +450,7 @@ def insert_snls(snls_list): @click.option('--add_tasks_db', type=click.Path(exists=True), help='config file for additional tasks collection to scan') def find(email, add_snlcolls, add_tasks_db): """checks status of calculations by submitter or author email in SNLs""" - lpad = LaunchPad.auto_load() + lpad = get_lpad() snl_collections = [lpad.db.snls] if add_snlcolls is not None: @@ -506,7 +508,7 @@ def find(email, add_snlcolls, add_tasks_db): @click.option('--insert/--no-insert', default=False, help='actually execute workflow addition') def bandstructure(target_db_file, insert): """add workflows for bandstructure based on materials collection""" - lpad = LaunchPad.auto_load() + lpad = get_lpad() source = VaspCalcDb(lpad.host, lpad.port, lpad.name, 'tasks', lpad.username, lpad.password) print('connected to source db with', source.collection.count(), 'tasks') target = VaspCalcDb.from_db_file(target_db_file, admin=True) @@ -564,7 +566,7 @@ def wflows(add_snlcolls, add_tasks_db, tag, insert, clear_logs, max_structures, if not insert: print('DRY RUN! Add --insert flag to actually add workflows') - lpad = LaunchPad.auto_load() + lpad = get_lpad() snl_collections = [lpad.db.snls] if add_snlcolls is not None: @@ -980,7 +982,7 @@ def format(self, record): def report(tag, in_progress, to_csv): """generate a report of calculations status""" - lpad = LaunchPad.auto_load() + lpad = get_lpad() states = Firework.STATE_RANKS states = sorted(states, key=states.get) @@ -1108,7 +1110,7 @@ def add_snls(tag, input_structures, add_snlcolls, insert): meta = yaml.safe_load(f) meta['authors'] = [Author.parse_author(a) for a in meta['authors']] - lpad = LaunchPad.auto_load() + lpad = get_lpad() snl_collections = [lpad.db.snls] if add_snlcolls is not None: for snl_db_config in yaml.load_all(open(add_snlcolls, 'r')): @@ -1228,7 +1230,7 @@ def parse(base_path, add_snlcolls, insert, make_snls, nproc, max_dirs): if not insert: print('DRY RUN: add --insert flag to actually insert tasks') - lpad = LaunchPad.auto_load() + lpad = get_lpad() target = VaspCalcDb(lpad.host, lpad.port, lpad.name, 'tasks', lpad.username, lpad.password) print('connected to target db with', target.collection.count(), 'tasks') base_path = os.path.join(base_path, '')
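
Note on this last change: the practical effect of moving from a module-level FW_CONFIG_FILE check to get_lpad() is that importing emmet.scripts.emmet (and running `emmet --help`) no longer requires a FireWorks config; the LaunchPad is only auto-loaded once a command that actually touches the database calls get_lpad(). A minimal, self-contained sketch of that pattern follows; the config path under mp_prod is illustrative and not taken from these patches.

    # sketch of the deferred LaunchPad autoload pattern from PATCH 97 (paths illustrative)
    import os
    import sys

    from fireworks import LaunchPad

    def get_lpad():
        # refuse to guess a database: bail out unless FireWorks knows its config
        if 'FW_CONFIG_FILE' not in os.environ:
            print('Please set FW_CONFIG_FILE!')
            sys.exit(0)
        return LaunchPad.auto_load()

    if __name__ == '__main__':
        # hypothetical usage: point FireWorks at a config, then connect lazily
        os.environ.setdefault('FW_CONFIG_FILE',
                              os.path.expanduser('~/mp_prod/config/FW_config.yaml'))
        lpad = get_lpad()
        print('connected to', lpad.name)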