In [1]:
from mp_api.client import MPRester
import json

# Replace with your actual API key
API_KEY = "vhdPJ1STyEi4znoIbrdg6s1j2Q03BQdH"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read ICSD entries
with open('full_icsd_entries.txt', 'r') as f:
    icsd_entries = [line.strip() for line in f]



In [32]:
print(f'Total number of experimental structures: {len(icsd_entries)}')

Total number of experimental structures: 135468


In [3]:
# Build ICSD to MP mapping
with MPRester(API_KEY) as mpr:
    mp_docs = mpr.materials.summary.search(        
        theoretical=False,
        fields=["database_IDs", "material_id"])

Retrieving SummaryDoc documents: 100%|██████████| 49283/49283 [00:10<00:00, 4809.23it/s]


In [4]:

# Build ICSD to MP mapping
with MPRester(API_KEY) as mpr:

    icsd_to_mpid = {}
    for mp_doc in mp_docs:
        mpid = str(mp_doc.material_id)
        for icsd_id in mp_doc.database_IDs.get("icsd", []):
            if icsd_id not in icsd_to_mpid:
                id = icsd_id.replace('icsd-','').zfill(6)
                icsd_to_mpid[id] = []
            icsd_to_mpid[id].append(mpid)

In [5]:
# Find matches
found = []
not_found = []

for icsd in icsd_entries:
    if icsd in icsd_to_mpid:
        found.append((icsd, icsd_to_mpid[icsd]))
    else:
        not_found.append(icsd)

# Save results
with open('mp_found.txt', 'w') as f:
    for icsd, mpids in found:
        f.write(f"{icsd}: {','.join(mpids)}\n")

with open('mp_full_icsd.txt', 'w') as f:
    for icsd, mpids in icsd_to_mpid.items():
        f.write(f"{icsd}: {','.join(mpids)}\n")


print(f"Found in MP: {len(found)}")
print(f"Not found: {len(not_found)}")


Found in MP: 67372
Not found: 68096


In [6]:

# Invert the dictionary - use ONLY found entries
mpid_to_icsd = {}
for icsd_id, mp_ids in found:
    for mp_id in mp_ids:
        if mp_id not in mpid_to_icsd:
            mpid_to_icsd[mp_id] = []
        mpid_to_icsd[mp_id].append(icsd_id)

# Filter for MP IDs with only one ICSD entry
filtered_mpid_to_icsd = {
    mp_id: icsd_ids 
    for mp_id, icsd_ids in mpid_to_icsd.items() 
    if len(icsd_ids) == 1
}

# Save results
with open('only_one_entry.txt', 'w') as f:
    for mpids, icsd in filtered_mpid_to_icsd.items():
        f.write(f"{mpids}: {icsd[0]}\n")

In [34]:
print(f'Filtered structures with only 1 ICSD entry: {len(filtered_mpid_to_icsd)}')

Filtered structures with only 1 ICSD entry: 28951


# Parsing the theoretical structures from Materials Project

In [7]:

with MPRester(API_KEY) as mpr:
    # Get structures for ICSD IDs
    docs = mpr.materials.summary.search(
        theoretical=False,
        fields=["material_id", "database_IDs", "structure", 'nsites', 
                'elements', 'nelements', 'composition', 'composition_reduced', 
                'formula_pretty', 'formula_anonymous', 'chemsys', 'volume', 'density', 
                'density_atomic', 'symmetry', 'origins', 'task_ids', 'formation_energy_per_atom', 
                'is_stable', 'band_gap', 'cbm', 'vbm', 'efermi', 'is_gap_direct', 'is_metal', 
                 'is_magnetic', 'ordering', 'total_magnetization', 
                 'theoretical'])

Retrieving SummaryDoc documents: 100%|██████████| 49283/49283 [00:48<00:00, 1016.71it/s]


In [8]:
selected_icsd = [i[0] for i in filtered_mpid_to_icsd.values()]

with open('icsd_structures.json', 'r') as f:
    experimental = json.load(f)

with MPRester(API_KEY) as mpr:

    selected_docs = {}
    for doc in docs:
        for icsd_id in doc.database_IDs.get("icsd", []):
            id = icsd_id.replace('icsd-','').zfill(6)

            if id in selected_icsd:
                selected_docs[id] = {'MP': {}, 'ICSD': {}}
                selected_docs[id]['MP'] = doc
                selected_docs[id]['ICSD'] = experimental[id]



In [9]:

# Collect structure origin task_ids
structure_task_ids = []
for icsd_id, data in selected_docs.items():
    for origin in data['MP'].origins:
        if origin.name == 'structure':
            structure_task_ids.append(origin.task_id)
            break

In [10]:
# Query in batches
task_to_calc = {}
batch_size = 500

with MPRester(API_KEY) as mpr:
    for i in range(0, len(structure_task_ids), batch_size):
        batch = structure_task_ids[i:i+batch_size]
        task_docs = mpr.tasks.search(
            task_ids=batch,
            fields=["task_id", "calc_type"]
        )
        task_to_calc.update({doc.task_id: doc.calc_type for doc in task_docs})
    

  task_docs = mpr.tasks.search(
Retrieving TaskDoc documents: 100%|██████████| 500/500 [00:00<00:00, 11650844.44it/s]
Retrieving TaskDoc documents: 100%|██████████| 500/500 [00:00<00:00, 13107200.00it/s]
Retrieving TaskDoc documents: 100%|██████████| 500/500 [00:00<00:00, 17772474.58it/s]
Retrieving TaskDoc documents: 100%|██████████| 500/500 [00:00<00:00, 19972876.19it/s]
Retrieving TaskDoc documents: 100%|██████████| 500/500 [00:00<00:00, 18558867.26it/s]
Retrieving TaskDoc documents: 100%|██████████| 500/500 [00:00<00:00, 19418074.07it/s]
Retrieving TaskDoc documents: 100%|██████████| 482/482 [00:00<00:00, 19253852.65it/s]
Retrieving TaskDoc documents: 100%|██████████| 499/499 [00:00<00:00, 20722353.43it/s]
Retrieving TaskDoc documents: 100%|██████████| 500/500 [00:00<00:00, 7913781.13it/s]
Retrieving TaskDoc documents: 100%|██████████| 500/500 [00:00<00:00, 20971520.00it/s]
Retrieving TaskDoc documents: 100%|██████████| 500/500 [00:00<00:00, 16777216.00it/s]
Retrieving TaskDoc docu

In [19]:
unique_calc_types = set(task_to_calc[doc] for doc in task_to_calc)
print(f"Unique calc_types found: {unique_calc_types}")
print(f"Total: {len(unique_calc_types)} different calculation types")

Unique calc_types found: {<CalcType.GGA_Structure_Optimization: 'GGA Structure Optimization'>, <CalcType.r2SCAN_Structure_Optimization: 'r2SCAN Structure Optimization'>, <CalcType.GGA_Static: 'GGA Static'>, <CalcType.GGA_U_Static: 'GGA+U Static'>, <CalcType.GGA_U_Structure_Optimization: 'GGA+U Structure Optimization'>, <CalcType.r2SCAN_Static: 'r2SCAN Static'>, <CalcType.SCAN_Structure_Optimization: 'SCAN Structure Optimization'>}
Total: 7 different calculation types


In [24]:

# Filter for GGA-optimized structures only
gga_materials = {}
for icsd_id, data in selected_docs.items():
    structure_task_ids = next(
        (origin.task_id for origin in data['MP'].origins if origin.name == 'structure'),
        None
    )
    
    if structure_task_ids and task_to_calc.get(structure_task_ids) in ['GGA Structure Optimization', 'GGA Static']:
        gga_materials[icsd_id] = data

print(f"Materials with GGA-optimized structures: {len(gga_materials)}/{len(selected_docs)}")

selected_docs = gga_materials

Materials with GGA-optimized structures: 13528/28951


In [27]:
import json

# Convert MP objects to dictionaries
selected_docs_serializable = {}
for icsd_id, data in selected_docs.items():
    selected_docs_serializable[icsd_id] = {
        'MP': data['MP'].dict() if hasattr(data['MP'], 'dict') else data['MP'],
        'ICSD': data['ICSD']
    }

# Save to JSON
with open('theoretical_experimental_structures.json', 'w') as f:
    json.dump(selected_docs_serializable, f, indent=2)

In [29]:
# Save MP ID to ICSD mapping for GGA materials
with open('final_selection.txt', 'w') as f:
    for icsd_id, data in gga_materials.items():
        mpid = data['MP'].material_id
        f.write(f"{mpid}: {icsd_id}\n")

print(f"Saved {len(gga_materials)} GGA materials to final_selection.txt")

Saved 13528 GGA materials to final_selection.txt
