This notebook converts data from the compressed Alexandria database to the format used by our code. <br>
The notebook assumes that the `.json.bz2` files of the Alexandria database are located in the same folder as this notebook (the current working directory).

In [1]:
import json, bz2
from pymatgen.entries.computed_entries import ComputedStructureEntry
import os
import pickle

In [2]:
# Elements that are allowed in the simulations.
# Any compound containing an element that is not in this list is dropped.
# The list currently holds all main group elements up to and including xenon,
# written as one row per period of the periodic table.
keep_elems = [
    "H", "He",
    "Li", "Be", "B", "C", "N", "O", "F", "Ne",
    "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar",
    "K", "Ca", "Ga", "Ge", "As", "Se", "Br", "Kr",
    "Rb", "Sr", "In", "Sn", "Sb", "Te", "I", "Xe",
]

In [None]:
# Scan the database files and keep only the compounds that calculations should be run for.
# This runs for a very long time, around an hour on our computers!


def _passes_filters(prop, allowed_elements):
    """Return True if one compound's property record satisfies every filter.

    prop: mapping with the properties of one database entry; the keys
        "nsites", "e_above_hull", "band_gap_ind", "elements" and "spg" are read.
    allowed_elements: set of element symbols a compound may consist of.

    Raises KeyError if one of the keys listed above is missing from `prop`.
    """
    ### Specify the conditions on which you want to filter here ###
    return (
        10 < prop["nsites"] <= 12
        # NOTE(review): the stability threshold grows with the number of sites;
        # confirm whether e_above_hull is stored per cell or per atom.
        and prop["e_above_hull"] < (0.05 * prop["nsites"])
        and prop["band_gap_ind"] >= 0.05
        and set(prop["elements"]).issubset(allowed_elements)
        # presumably "spg" is the space-group number (75 would then be the first
        # tetragonal group) -- TODO confirm
        and 75 <= prop["spg"]
    )


# Build the lookup set once instead of once per entry.
allowed_elements = set(keep_elems)

all_files = os.listdir()
# os.listdir() returns names in arbitrary order, so sort them to make the order of
# filtered_data reproducible. endswith() (instead of a substring test) skips
# unrelated names such as partially downloaded "*.json.bz2.part" files.
database_files = sorted(name for name in all_files if name.endswith(".json.bz2"))
filtered_data = []

# idx counts the database files only (in sorted order).
for idx, file in enumerate(database_files):
    print(f"opening file No. {idx}", flush=True)
    with bz2.open(file, "rt", encoding="utf-8") as fh:
        data = json.load(fh)

    print("parsing output", flush=True)
    for raw_entry in data["entries"]:
        # Filter on the raw JSON record and convert only the entries that pass, so
        # no ComputedStructureEntry is built for compounds that are dropped anyway.
        # Assumes entry.data holds the same plain values as the record's "data"
        # dict for the keys read by the filter -- TODO confirm for your pymatgen version.
        prop = raw_entry.get("data") or {}
        if _passes_filters(prop, allowed_elements):
            filtered_data.append(ComputedStructureEntry.from_dict(raw_entry))

In [4]:
# Check how many compounds passed all filters. As the last expression of the
# cell, the length of filtered_data is shown as the cell output.
len(filtered_data)

1461

In [5]:
# Persist the list of filtered compounds as a pickle file in the working directory.
with open("alexandria_filtered_5.pckl", "wb") as pickle_file:
    pickle.dump(filtered_data, pickle_file)