In [2]:
from mp_api.client import MPRester
from pymatgen.analysis.magnetism import Ordering
import pandas as pd

# Replace with your actual API key
API_KEY = "vhdPJ1STyEi4znoIbrdg6s1j2Q03BQdH"

In [None]:
with MPRester(API_KEY) as mpr:
    
    # Step 1: Query materials with PBE calculations only
    # ===================================================
    
    excluded_elements = ["Pu", "U", "Th", "Ra", "Ac"]
    
    print("Querying materials with PBE calculations...")
    print("This will take a moment as we're filtering by calculation type...")
    print()
    
    # First get all non-magnetic materials
    initial_docs = mpr.materials.summary.search(
        magnetic_ordering=Ordering("NM"),
        theoretical=False,  # Only experimentally observed
        num_elements=(2, None),
        exclude_elements=excluded_elements,
        fields=["material_id"]
    )
    
    print(f"Found {len(initial_docs)} experimental non-magnetic materials")
    print("Now filtering for PBE calculations...")
    print()
    
    # Step 2: Filter for materials with GGA/PBE calculations
    # =======================================================
    
    mp_ids = [doc.material_id for doc in initial_docs]
    pbe_materials = []
    
    batch_size = 200
    for i in range(0, len(mp_ids), batch_size):
        batch_ids = mp_ids[i:i+batch_size]
        
        # Query calc_types to check for PBE/GGA
        calc_docs = mpr.materials.search(
            material_ids=batch_ids,
            fields=["material_id", "calc_types"]
        )
        
        for doc in calc_docs:
            if doc.calc_types:
                # Check if any calculation is GGA/PBE
                # calc_types values are CalcType enums, need to convert to string
                has_pbe = any("GGA" in str(calc_type) for calc_type in doc.calc_types.values())
                if has_pbe:
                    # Get the PBE task ID
                    pbe_task_id = next(
                        (tid for tid, ctype in doc.calc_types.items() if "GGA" in str(ctype)),
                        None
                    )
                    pbe_materials.append({
                        "material_id": doc.material_id,
                        "pbe_task_id": pbe_task_id
                    })
        
        print(f"  Checked {min(i+batch_size, len(mp_ids))}/{len(mp_ids)} materials... "
              f"Found {len(pbe_materials)} with PBE")
    
    print(f"\nFiltered to {len(pbe_materials)} materials with PBE calculations")
    

In [37]:
# Step 3: Get full data for PBE materials only
# =============================================

print("\nRetrieving full data for PBE materials...")

pbe_mp_ids = [mat["material_id"] for mat in pbe_materials]
pbe_task_dict = {mat["material_id"]: mat["pbe_task_id"] for mat in pbe_materials}

# Get summary data
full_data = []
batch_size = 200

for i in range(0, len(pbe_mp_ids), batch_size):
    batch_ids = pbe_mp_ids[i:i+batch_size]
    
    docs = mpr.materials.summary.search(
        material_ids=batch_ids,
        fields=["material_id", "formula_pretty", "structure", "volume", 
                "nsites", "database_IDs", "is_stable", "energy_above_hull",
                "band_gap", "density", "symmetry"]
    )
    
    for doc in docs:
        icsd_ids = doc.database_IDs.get("icsd", []) if doc.database_IDs else []
        
        full_data.append({
            "mp_id": str(doc.material_id),
            "formula": doc.formula_pretty,
            "volume_theo_pbe": doc.volume,  # This is from PBE calculation
            "nsites": doc.nsites,
            "density": doc.density,
            "band_gap": doc.band_gap,
            "space_group": doc.symmetry.symbol if doc.symmetry else None,
            "crystal_system": doc.symmetry.crystal_system if doc.symmetry else None,
            "is_stable": doc.is_stable,
            "energy_above_hull": doc.energy_above_hull,
            "icsd_ids": icsd_ids,
            "num_icsd_ids": len(icsd_ids),
            "pbe_task_id": pbe_task_dict.get(doc.material_id)
        })
    
    print(f"  Retrieved {min(i+batch_size, len(pbe_mp_ids))}/{len(pbe_mp_ids)} materials...")

df = pd.DataFrame(full_data)


Retrieving full data for PBE materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2452809.36it/s]


  Retrieved 200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2006844.02it/s]


  Retrieved 400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1667715.31it/s]


  Retrieved 600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1438869.30it/s]


  Retrieved 800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2964172.44it/s]


  Retrieved 1000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1453831.54it/s]


  Retrieved 1200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 3072750.18it/s]


  Retrieved 1400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1200086.98it/s]


  Retrieved 1600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2872810.96it/s]


  Retrieved 1800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2056031.37it/s]


  Retrieved 2000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2452809.36it/s]


  Retrieved 2200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1280703.51it/s]


  Retrieved 2400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2557502.44it/s]


  Retrieved 2600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2680066.45it/s]


  Retrieved 2800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2298248.77it/s]


  Retrieved 3000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1556328.01it/s]


  Retrieved 3200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2481836.69it/s]


  Retrieved 3400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1978445.28it/s]


  Retrieved 3600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2902632.53it/s]


  Retrieved 3800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2863006.14it/s]


  Retrieved 4000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2330168.89it/s]


  Retrieved 4200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1342177.28it/s]


  Retrieved 4400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2460002.35it/s]


  Retrieved 4600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1648056.58it/s]


  Retrieved 4800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1436405.48it/s]


  Retrieved 5000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2156454.50it/s]


  Retrieved 5200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1919589.93it/s]


  Retrieved 5400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1910844.65it/s]


  Retrieved 5600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2285724.25it/s]


  Retrieved 5800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1153866.30it/s]


  Retrieved 6000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1259550.75it/s]


  Retrieved 6200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2953735.21it/s]


  Retrieved 6400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2863006.14it/s]


  Retrieved 6600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2964172.44it/s]


  Retrieved 6800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2267191.35it/s]


  Retrieved 7000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1864135.11it/s]


  Retrieved 7200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2242943.32it/s]


  Retrieved 7400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2129088.32it/s]


  Retrieved 7600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 802737.61it/s]


  Retrieved 7800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1276804.87it/s]


  Retrieved 8000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1111073.91it/s]


  Retrieved 8200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2041023.84it/s]


  Retrieved 8400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1288572.66it/s]


  Retrieved 8600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2150925.13it/s]


  Retrieved 8800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2178859.22it/s]


  Retrieved 9000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2369663.28it/s]


  Retrieved 9200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 3095427.31it/s]


  Retrieved 9400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2843595.93it/s]


  Retrieved 9600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1792437.61it/s]


  Retrieved 9800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2933079.72it/s]


  Retrieved 10000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1136667.75it/s]


  Retrieved 10200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2985269.75it/s]


  Retrieved 10400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2663050.16it/s]


  Retrieved 10600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2646248.58it/s]


  Retrieved 10800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2383127.27it/s]


  Retrieved 11000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1950839.07it/s]


  Retrieved 11200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1419392.22it/s]


  Retrieved 11400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2597092.26it/s]


  Retrieved 11600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1718977.05it/s]


  Retrieved 11800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1448809.67it/s]


  Retrieved 12000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2824447.14it/s]


  Retrieved 12200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1342177.28it/s]


  Retrieved 12400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1751275.16it/s]


  Retrieved 12600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2213353.03it/s]


  Retrieved 12800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2129088.32it/s]


  Retrieved 13000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 892405.11it/s]


  Retrieved 13200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2605157.76it/s]


  Retrieved 13400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2403612.61it/s]


  Retrieved 13600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1585748.20it/s]


  Retrieved 13800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2317295.03it/s]


  Retrieved 14000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1889326.13it/s]


  Retrieved 14200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2036069.90it/s]


  Retrieved 14400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1987821.80it/s]


  Retrieved 14600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1150700.69it/s]


  Retrieved 14800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2654622.78it/s]


  Retrieved 15000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1915207.31it/s]


  Retrieved 15200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2213353.03it/s]


  Retrieved 15400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1915207.31it/s]


  Retrieved 15600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1150700.69it/s]


  Retrieved 15800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2317295.03it/s]


  Retrieved 16000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2145424.04it/s]


  Retrieved 16200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1860001.77it/s]


  Retrieved 16400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1992543.47it/s]


  Retrieved 16600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1872457.14it/s]


  Retrieved 16800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2438548.84it/s]


  Retrieved 17000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2225094.96it/s]


  Retrieved 17200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2750363.28it/s]


  Retrieved 17400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2902632.53it/s]


  Retrieved 17600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2943371.23it/s]


  Retrieved 17800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2431480.58it/s]


  Retrieved 18000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2581110.15it/s]


  Retrieved 18200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2663050.16it/s]


  Retrieved 18400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2654622.78it/s]


  Retrieved 18600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2091922.19it/s]


  Retrieved 18800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2213353.03it/s]


  Retrieved 19000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1198372.57it/s]


  Retrieved 19200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2565323.55it/s]


  Retrieved 19400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1138210.04it/s]


  Retrieved 19600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 3264049.81it/s]


  Retrieved 19800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2467237.65it/s]


  Retrieved 20000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2474515.63it/s]


  Retrieved 20200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1021754.93it/s]


  Retrieved 20400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1941807.41it/s]


  Retrieved 20600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2504062.09it/s]


  Retrieved 20800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1726051.03it/s]


  Retrieved 21000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 3214026.05it/s]


  Retrieved 21200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 3130077.61it/s]


  Retrieved 21400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1407484.56it/s]


  Retrieved 21600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 3072750.18it/s]


  Retrieved 21800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2534322.66it/s]


  Retrieved 22000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1368451.55it/s]


  Retrieved 22200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2195970.68it/s]


  Retrieved 22400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2213353.03it/s]


  Retrieved 22600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2231012.77it/s]


  Retrieved 22800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1955386.48it/s]


  Retrieved 23000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2549728.88it/s]


  Retrieved 23200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2489201.19it/s]


  Retrieved 23400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1061849.11it/s]


  Retrieved 23600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2481836.69it/s]


  Retrieved 23800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1042063.11it/s]


  Retrieved 24000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2081540.45it/s]


  Retrieved 24200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2255002.15it/s]


  Retrieved 24400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2167598.97it/s]


  Retrieved 24600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1357379.94it/s]


  Retrieved 24800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2356350.56it/s]


  Retrieved 25000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2814969.13it/s]


  Retrieved 25200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2460002.35it/s]


  Retrieved 25400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2184533.33it/s]


  Retrieved 25600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2310911.29it/s]


  Retrieved 25800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1359579.90it/s]


  Retrieved 26000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1506033.75it/s]


  Retrieved 26200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1355187.08it/s]


  Retrieved 26400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2356350.56it/s]


  Retrieved 26600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2922859.93it/s]


  Retrieved 26800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1102313.80it/s]


  Retrieved 27000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 3289650.20it/s]


  Retrieved 27200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2723574.03it/s]


  Retrieved 27400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2549728.88it/s]


  Retrieved 27600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 3201758.78it/s]


  Retrieved 27800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1103764.21it/s]


  Retrieved 28000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2438548.84it/s]


  Retrieved 28200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1138210.04it/s]


  Retrieved 28400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2863006.14it/s]


  Retrieved 28600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1353001.29it/s]


  Retrieved 28800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2833989.19it/s]


  Retrieved 29000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1242756.74it/s]


  Retrieved 29200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1516927.31it/s]


  Retrieved 29400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1910844.65it/s]


  Retrieved 29600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 3072750.18it/s]


  Retrieved 29800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1458888.35it/s]


  Retrieved 30000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1375181.64it/s]


  Retrieved 30200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1253902.54it/s]


  Retrieved 30400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2051004.40it/s]


  Retrieved 30600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1815716.02it/s]


  Retrieved 30800/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1514189.17it/s]


  Retrieved 31000/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2697301.61it/s]


  Retrieved 31200/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 2156454.50it/s]


  Retrieved 31400/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 200/200 [00:00<00:00, 1796275.80it/s]


  Retrieved 31600/31746 materials...


Retrieving SummaryDoc documents: 100%|██████████| 146/146 [00:00<00:00, 1308479.45it/s]

  Retrieved 31746/31746 materials...





In [39]:
df = df[(df["is_stable"] == True) & (df["num_icsd_ids"] == 1)].reset_index(drop=True)


In [41]:
# Step 4: Display statistics and samples
# =======================================

print("\n" + "="*80)
print("DATABASE SUMMARY")
print("="*80)
print(f"\nTotal materials with PBE calculations: {len(df)}")
print(f"Materials with ICSD IDs: {(df['num_icsd_ids'] > 0).sum()}")
print(f"Stable materials: {df['is_stable'].sum()}")

print("\n" + "-"*80)
print("Distribution by crystal system:")
print("-"*80)
print(df['crystal_system'].value_counts())

print("\n" + "-"*80)
print("Volume statistics:")
print("-"*80)
print(f"Mean volume: {df['volume_theo_pbe'].mean():.2f} Å³")
print(f"Median volume: {df['volume_theo_pbe'].median():.2f} Å³")
print(f"Min volume: {df['volume_theo_pbe'].min():.2f} Å³")
print(f"Max volume: {df['volume_theo_pbe'].max():.2f} Å³")


DATABASE SUMMARY

Total materials with PBE calculations: 11811
Materials with ICSD IDs: 11811
Stable materials: 11811

--------------------------------------------------------------------------------
Distribution by crystal system:
--------------------------------------------------------------------------------
crystal_system
Orthorhombic    3608
Monoclinic      2915
Tetragonal      1530
Hexagonal       1175
Cubic           1105
Trigonal         899
Triclinic        579
Name: count, dtype: int64

--------------------------------------------------------------------------------
Volume statistics:
--------------------------------------------------------------------------------
Mean volume: 616.90 Å³
Median volume: 421.73 Å³
Min volume: 12.43 Å³
Max volume: 6754.25 Å³


In [42]:
# Step 5: Show sample materials with ICSD IDs
# ============================================

print("\n" + "="*80)
print("SAMPLE MATERIALS WITH ICSD IDs (First 20)")
print("="*80 + "\n")

with_icsd = df[df['num_icsd_ids'] > 0].head(20)

for idx, row in with_icsd.iterrows():
    print(f"{row['mp_id']}: {row['formula']}")
    print(f"  Crystal System: {row['crystal_system']}, Space Group: {row['space_group']}")
    print(f"  PBE Volume: {row['volume_theo_pbe']:.2f} Å³ ({row['nsites']} atoms)")
    print(f"  Band Gap: {row['band_gap']:.2f} eV, Density: {row['density']:.3f} g/cm³")
    print(f"  Stable: {row['is_stable']}, E_hull: {row['energy_above_hull']:.3f} eV/atom")
    print(f"  ICSD IDs: {row['icsd_ids']}")
    print(f"  PBE Task ID: {row['pbe_task_id']}")
    print()



SAMPLE MATERIALS WITH ICSD IDs (First 20)

mp-10963: Ag2SnHgSe4
  Crystal System: Orthorhombic, Space Group: Pmn2_1
  PBE Volume: 468.52 Å³ (16 atoms)
  Band Gap: 0.00 eV, Density: 6.031 g/cm³
  Stable: True, E_hull: 0.000 eV/atom
  ICSD IDs: ['icsd-95094']
  PBE Task ID: mp-940283

mp-11641: Ag2SeO4
  Crystal System: Orthorhombic, Space Group: Fddd
  PBE Volume: 206.57 Å³ (14 atoms)
  Band Gap: 0.43 eV, Density: 5.767 g/cm³
  Stable: True, E_hull: 0.000 eV/atom
  ICSD IDs: ['icsd-413089']
  PBE Task ID: mp-11641

mp-1192057: Ag2S2O7
  Crystal System: Triclinic, Space Group: P-1
  PBE Volume: 283.01 Å³ (22 atoms)
  Band Gap: 2.30 eV, Density: 4.598 g/cm³
  Stable: True, E_hull: 0.000 eV/atom
  ICSD IDs: ['icsd-423166']
  PBE Task ID: mp-1192057

mp-1197599: Ag2Sn(S2O7)3
  Crystal System: Trigonal, Space Group: P-3
  PBE Volume: 2653.43 Å³ (180 atoms)
  Band Gap: 2.46 eV, Density: 3.240 g/cm³
  Stable: True, E_hull: 0.000 eV/atom
  ICSD IDs: ['icsd-423630']
  PBE Task ID: mp-1197599

m

In [43]:
# Step 6: Export databases
# =========================

print("="*80)
print("EXPORTING DATA")
print("="*80 + "\n")

# Export full database
df_sorted = df.sort_values('formula')
df_sorted.to_csv("mp_pbe_volume_database.csv", index=False)
print(f"✓ Saved full PBE database to: mp_pbe_volume_database.csv")
print(f"  Contains {len(df_sorted)} materials")

# # Export materials with ICSD IDs
# icsd_df = df[df['num_icsd_ids'] > 0].copy()
# icsd_df_sorted = icsd_df.sort_values('formula')
# icsd_df_sorted.to_csv("mp_pbe_with_icsd.csv", index=False)
# print(f"\n✓ Saved materials with ICSD to: mp_pbe_with_icsd.csv")
# print(f"  Contains {len(icsd_df_sorted)} materials")

# # Export expanded ICSD mapping (one row per ICSD ID)
# icsd_expanded = []
# for idx, row in icsd_df.iterrows():
#     for icsd_id in row['icsd_ids']:
#         icsd_expanded.append({
#             "icsd_id": icsd_id,
#             "mp_id": row['mp_id'],
#             "formula": row['formula'],
#             "volume_theo_pbe": row['volume_theo_pbe'],
#             "nsites": row['nsites'],
#             "density": row['density'],
#             "band_gap": row['band_gap'],
#             "space_group": row['space_group'],
#             "crystal_system": row['crystal_system'],
#             "is_stable": row['is_stable'],
#             "pbe_task_id": row['pbe_task_id']
#         })

# icsd_expanded_df = pd.DataFrame(icsd_expanded)
# icsd_expanded_df_sorted = icsd_expanded_df.sort_values('icsd_id')
# icsd_expanded_df_sorted.to_csv("icsd_to_mp_mapping.csv", index=False)
# print(f"\n✓ Saved ICSD mapping to: icsd_to_mp_mapping.csv")
# print(f"  Contains {len(icsd_expanded_df_sorted)} ICSD entries")

# Export summary statistics
summary_stats = {
    "total_materials": len(df),
    # "materials_with_icsd": len(icsd_df),
    # "total_icsd_entries": len(icsd_expanded_df),
    "stable_materials": df['is_stable'].sum(),
    "mean_volume": df['volume_theo_pbe'].mean(),
    "median_volume": df['volume_theo_pbe'].median(),
    "mean_band_gap": df['band_gap'].mean(),
    "median_band_gap": df['band_gap'].median(),
}

stats_df = pd.DataFrame([summary_stats])
stats_df.to_csv("database_statistics.csv", index=False)
print(f"\n✓ Saved statistics to: database_statistics.csv")

print("\n" + "="*80)
print("NEXT STEPS")
print("="*80)
print("\n1. Use 'icsd_to_mp_mapping.csv' to query ICSD for experimental volumes")
print("2. Match ICSD experimental volumes with MP PBE theoretical volumes")
print("3. All theoretical volumes in the database are from PBE calculations")
print("\nThe 'volume_theo_pbe' column contains volumes from PBE/GGA calculations.")
print("You can now compare these with experimental volumes from ICSD.")

EXPORTING DATA

✓ Saved full PBE database to: mp_pbe_volume_database.csv
  Contains 11811 materials

✓ Saved statistics to: database_statistics.csv

NEXT STEPS

1. Use 'icsd_to_mp_mapping.csv' to query ICSD for experimental volumes
2. Match ICSD experimental volumes with MP PBE theoretical volumes
3. All theoretical volumes in the database are from PBE calculations

The 'volume_theo_pbe' column contains volumes from PBE/GGA calculations.
You can now compare these with experimental volumes from ICSD.


In [27]:
ids = [i[0].replace('icsd-','') for i in icsd_df['icsd_ids']]

In [31]:
import requests
import pandas as pd
  # replace with your list

records = []

for icsd_id in ids[:5]:
    url = f"https://icsd.fiz-karlsruhe.de/api/v1/records/{icsd_id}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        cell = data.get("cell_parameters", {})
        records.append({
            "ICSD_ID": icsd_id,
            "Formula": data.get("chemical_formula"),
            "Space group": data.get("space_group"),
            "a (Å)": cell.get("a"),
            "b (Å)": cell.get("b"),
            "c (Å)": cell.get("c"),
            "α (°)": cell.get("alpha"),
            "β (°)": cell.get("beta"),
            "γ (°)": cell.get("gamma"),
            "Volume (Å³)": cell.get("volume")
        })
    else:
        print(f"Failed to fetch ICSD {icsd_id}: {response.status_code}")

df = pd.DataFrame(records)
print(df)
df.to_csv("icsd_structures.csv", index=False)


Failed to fetch ICSD 77341: 404
Failed to fetch ICSD 262635: 404
Failed to fetch ICSD 95094: 404
Failed to fetch ICSD 262637: 404
Failed to fetch ICSD 413089: 404
Empty DataFrame
Columns: []
Index: []


In [33]:
url = f"https://icsd.fiz-karlsruhe.de/api/v1/records/77341"
response = requests.get(url)

In [3]:
from mp_api.client import MPRester

with MPRester(API_KEY) as mpr:
    # Get a sample of materials with database IDs
    docs = mpr.materials.summary.search(
        theoretical=False,
        fields=["material_id", "database_IDs"],
        num_chunks=1,
        chunk_size=100
    )
    
    # Check what database IDs are available
    db_keys = set()
    for doc in docs:
        if doc.database_IDs:
            db_keys.update(doc.database_IDs.keys())
    
    print("Available database ID types:", db_keys)

Retrieving SummaryDoc documents: 100%|██████████| 100/100 [00:00<00:00, 1290555.08it/s]

Available database ID types: {'icsd', 'pf'}





In [32]:
ids

['77341',
 '262635',
 '95094',
 '262637',
 '413089',
 '15999',
 '9998',
 '195299',
 '420405',
 '423166',
 '240968',
 '67501',
 '426625',
 '420406',
 '423630',
 '75520',
 '420853',
 '416281',
 '1727',
 '24267',
 '78388',
 '100694',
 '28891',
 '65998',
 '31027',
 '65239',
 '4433',
 '174089',
 '59193',
 '51498',
 '417617',
 '8013',
 '8075',
 '93967',
 '416858',
 '24782',
 '414464',
 '154826',
 '415473',
 '35628',
 '52356',
 '43242',
 '261822',
 '262633',
 '74853',
 '89838',
 '30503',
 '20368',
 '418700',
 '420343',
 '32653',
 '174097',
 '428143',
 '422417',
 '417675',
 '180883',
 '656978',
 '202218',
 '52600',
 '52575',
 '70055',
 '64716',
 '159858',
 '949',
 '15732',
 '26521',
 '26563',
 '78968',
 '85135',
 '411041',
 '97760',
 '171959',
 '31028',
 '35544',
 '61806',
 '605709',
 '73122',
 '2426',
 '76969',
 '95120',
 '245791',
 '33714',
 '32652',
 '25523',
 '31078',
 '201004',
 '417618',
 '78803',
 '32721',
 '40127',
 '23513',
 '408802',
 '604514',
 '239251',
 '248970',
 '108857',
 '8263