In [4]:
"""
Workflow to filter COD structures via OPTIMADE and estimate MP matching candidates
Step 1: Query COD (originally planned via the OPTIMADE API; the cells below use COD's REST search endpoint instead)
Step 2: Pre-filter MP database
Step 3: Estimate matching workload
"""

import os
import time

import pandas as pd
from mp_api.client import MPRester
from pymatgen.analysis.magnetism import Ordering
from pymatgen.core import Composition
from pymatgen.ext.optimade import OptimadeRester

# Configuration
# The Materials Project API key is read from the environment so that no
# credential is stored in the notebook; export MP_API_KEY before starting the
# kernel. The value is None when the variable is not set.
# NOTE(review): a literal key used to be committed on this line. Treat it as
# leaked: revoke it and generate a new one.
# NOTE(review): mp-api's MPRester is documented to look up this same variable
# on its own when no key is passed - confirm for the installed version.
MP_API_KEY = os.environ.get("MP_API_KEY")
TARGET_COD_COUNT = 1000  # desired number of COD structures (printed in the Step 1 banner)

In [2]:
# Elements to exclude
# Rare-earth symbols: the series La..Lu first, then Sc and Y.
RARE_EARTHS = (
    "La Ce Pr Nd Pm Sm Eu Gd Tb Dy Ho Er Tm Yb Lu".split()
    + ["Sc", "Y"]
)
# Oxygen is appended so that oxides are excluded together with the rare earths.
EXCLUDED_ELEMENTS = [*RARE_EARTHS, "O"]


In [5]:
# ============================================================================
# STEP 1: Query COD Structures via OPTIMADE
# ============================================================================

# COD REST API endpoints (search results, and the base URL for per-entry CIF files)
COD_CIF_URL = "https://www.crystallography.net/cod/"
COD_SEARCH_URL = "https://www.crystallography.net/cod/result"

# Section banner: rule, title, rule, blank line.
# NOTE(review): the title still says OPTIMADE although this notebook talks to
# COD's REST endpoints defined above.
print("=" * 80, "STEP 1: QUERYING COD VIA OPTIMADE API", "=" * 80, "", sep="\n")

# Summary of the intended filters, followed by a blank line.
# NOTE(review): the "2-4 elements" line is informational only here; nothing in
# this cell applies an element-count filter.
print(
    "Filters:",
    "  - Number of elements: 2-4",
    f"  - Exclude rare earths: {', '.join(RARE_EARTHS)}",
    "  - Exclude oxygen (no oxides)",
    f"  - Target count: {TARGET_COD_COUNT}",
    "",
    sep="\n",
)

# Initialize COD client - try direct REST API instead of OPTIMADE
print("Connecting to COD via REST API...")

STEP 1: QUERYING COD VIA OPTIMADE API

Filters:
  - Number of elements: 2-4
  - Exclude rare earths: La, Ce, Pr, Nd, Pm, Sm, Eu, Gd, Tb, Dy, Ho, Er, Tm, Yb, Lu, Sc, Y
  - Exclude oxygen (no oxides)
  - Target count: 1000

Connecting to COD via REST API...


In [6]:
import requests
from io import StringIO

def query_cod_by_elements(included_elements=None, excluded_elements=None, max_results=2000):
    """
    Query COD using their REST API and return the matching entries.

    Args:
        included_elements: optional iterable of element symbols that every
            returned entry must contain (previously accepted but ignored).
        excluded_elements: optional iterable of element symbols that no
            returned entry may contain.
        max_results: maximum number of rows to return. None means no cap;
            zero or a negative number yields an empty list.

    Returns:
        A list of dicts, one per CSV row of the response, keyed by the CSV
        header names. An empty list is returned both when nothing matches and
        when the request or the parsing fails; a failure is additionally
        reported with a printed "  Error: ..." line.
    """
    import csv
    import re
    from itertools import islice

    # NOTE(review): COD's search endpoint is understood to take elements in
    # numbered slots whose VALUE is the symbol (el1=Fe, nel1=La, ...), not
    # "nel<Symbol>=0" as this function used to send. The slot counts below are
    # deliberately conservative - confirm both against the COD RESTful API docs.
    include_slots = 8
    exclude_slots = 4

    required = list(included_elements or [])
    banned = list(excluded_elements or [])

    # Build element filter (server side, as far as the slots allow)
    params = {}
    for slot, symbol in enumerate(required[:include_slots], start=1):
        params[f'el{slot}'] = symbol
    for slot, symbol in enumerate(banned[:exclude_slots], start=1):
        params[f'nel{slot}'] = symbol

    # Set format
    params['format'] = 'csv'

    required_set = set(required)
    banned_set = set(banned)

    def _row_passes(row):
        """Enforce the complete include/exclude lists on one CSV row."""
        # NOTE(review): assumes the CSV exposes the composition in a column
        # named "formula" (e.g. "- Fe2 S -") - verify against a real response.
        formula = row.get('formula')
        if not formula:
            # Nothing to check against: keep the row rather than drop data.
            return True
        symbols = set(re.findall(r'[A-Z][a-z]?', formula))
        return required_set <= symbols and not (banned_set & symbols)

    try:
        print(f"  Querying COD with excluded elements: {excluded_elements}")
        response = requests.get(COD_SEARCH_URL, params=params, timeout=30)
        response.raise_for_status()

        # Parse CSV response
        reader = csv.DictReader(StringIO(response.text))

        # Cap AFTER filtering so max_results counts usable rows only.
        limit = None if max_results is None else max(int(max_results), 0)
        return list(islice(filter(_row_passes, reader), limit))

    except Exception as e:
        # Deliberate best-effort contract: report the problem and hand back an
        # empty list so the calling cell keeps running.
        print(f"  Error: {e}")
        return []

def get_cod_structure(cod_id):
    """
    Download the CIF file of one COD entry and parse it into a structure.

    Args:
        cod_id: COD identifier; it is interpolated into the download URL as
            "<COD_CIF_URL><cod_id>.cif".

    Returns:
        Whatever pymatgen's Structure.from_str(text, fmt="cif") returns for
        the downloaded text, or None when the download or the parsing fails.
        A failure is reported with a printed "  Error: ..." line instead of
        being dropped silently.
    """
    try:
        cif_url = f"{COD_CIF_URL}{cod_id}.cif"
        response = requests.get(cif_url, timeout=10)
        response.raise_for_status()

        # Parse CIF. The import stays inside the try so that an unavailable
        # pymatgen is reported and yields None like any other failure.
        from pymatgen.core import Structure
        return Structure.from_str(response.text, fmt="cif")
    except Exception as e:
        # Best-effort by design: one bad entry must not abort a bulk download,
        # but the reason is surfaced so failures can be diagnosed.
        print(f"  Error: could not load COD entry {cod_id}: {e}")
        return None

In [7]:
# Query COD using REST API
# Announce the (potentially slow) search, then leave a blank line.
print("Querying COD database...", "(This may take several minutes)", "", sep="\n")

# Ask for up to 5000 rows that contain none of the excluded elements.
cod_results = query_cod_by_elements(max_results=5000, excluded_elements=EXCLUDED_ELEMENTS)

# Report how many rows came back, followed by a blank line.
print(f"Retrieved {len(cod_results)} entries from COD\n")

# Process COD results
# Only the progress banner is printed here; no download happens in this cell.
print("Downloading and processing structures...", "(Downloading CIF files from COD)", "", sep="\n")

Querying COD database...
(This may take several minutes)

  Querying COD with excluded elements: ['La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Sc', 'Y', 'O']
Retrieved 0 entries from COD

Downloading and processing structures...
(Downloading CIF files from COD)



In [8]:
# Preview only the first few rows: the query above may return up to 5000
# entries, and dumping them all would swamp the notebook output.
cod_results[:5]

[]