# Crystal Structure Prototype Analysis

Analysis of AI-generated superconductor candidates for structural novelty using the AFLOW Prototype Encyclopedia.

## Setup

In [1]:
import pandas as pd
import pickle
import numpy as np
from pathlib import Path
import warnings
import subprocess
import tempfile
import os
import shutil

from ase.io import write
from pymatgen.io.ase import AseAtomsAdaptor
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
from pymatgen.analysis.structure_matcher import StructureMatcher
from pymatgen.core import Structure
import itertools

In [2]:
# Verify that the AFLOW command-line tool is callable and report its version.
# Later cells shell out to `aflow`, so fail here with an actionable message
# rather than printing nothing when the tool is missing or broken.
try:
    version_result = subprocess.run(['aflow', '--version'], capture_output=True, text=True)
except FileNotFoundError as exc:
    raise RuntimeError("AFLOW executable 'aflow' was not found on PATH") from exc

if version_result.returncode != 0:
    raise RuntimeError(
        f"'aflow --version' exited with code {version_result.returncode}: "
        f"{version_result.stderr.strip()}"
    )

# Only the first line containing 'VERSION' is shown.
for line in version_result.stdout.split('\n'):
    if 'VERSION' in line:
        print(line.strip())
        break

AFLOW VERSION 4.0.5 Automatic-FLOW [(C) 2003-2025 aflow.org consortium]


## Data Loading and Filtering

In [3]:
# Read the pickled candidate table from disk.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
dataset_path = '../data/genai_allSC_final_may2025.pkl'
with warnings.catch_warnings():
    # DeprecationWarnings emitted while unpickling are suppressed inside this block only.
    warnings.simplefilter("ignore", DeprecationWarning)
    with open(dataset_path, 'rb') as handle:
        df = pickle.load(handle)

n_loaded = len(df)
column_names = list(df.columns)
print(f"Loaded {n_loaded} structures")
print(f"Columns: {column_names}")

Loaded 773 structures
Columns: ['formula', 'tcad_final', 'mp_eah', 'sgn', 'lamb_final', 'E_form', 'is_metal', 'structure']


In [4]:
# Select the entries whose mp_eah is at or below zero (treated as stable here)
stable_mask = df['mp_eah'] <= 0.0
df_stable = df[stable_mask].copy()
print(f"Stable structures (mp_eah ≤ 0): {len(df_stable)}/{len(df)}")

# Numbered listing of every retained entry
print("\nStable structures:")
for position, (_, entry) in enumerate(df_stable.iterrows(), start=1):
    print(f"  {position:2d}. {entry['formula']} (mp_eah = {entry['mp_eah']:.4f})")

Stable structures (mp_eah ≤ 0): 29/773

Stable structures:
   1. Mo2NbTc3 (mp_eah = -0.0085)
   2. Mo2NbTc3 (mp_eah = -0.0119)
   3. Mo6OsRu (mp_eah = -0.0294)
   4. HfScTc4 (mp_eah = -0.0327)
   5. HfScTc4 (mp_eah = -0.0327)
   6. Mo3Tc (mp_eah = -0.0116)
   7. Nb6Pt2 (mp_eah = -0.0111)
   8. AuNb6Pt (mp_eah = -0.0007)
   9. Mo5NbOs2 (mp_eah = -0.0376)
  10. IrMo6Tc (mp_eah = -0.0113)
  11. Mo3Tc (mp_eah = -0.0181)
  12. Mo4Nb2Os2 (mp_eah = -0.0099)
  13. H3Mo2Pd2 (mp_eah = -0.0054)
  14. Mo6ReRu (mp_eah = -0.0188)
  15. Mo3NbTc2 (mp_eah = -0.0067)
  16. AlIrNb6 (mp_eah = -0.0068)
  17. IrMo2Os (mp_eah = -0.0189)
  18. Be4Nb6 (mp_eah = -0.0055)
  19. Mo5ReSiTc (mp_eah = -0.0018)
  20. Ir2Mo3Os (mp_eah = -0.0275)
  21. IrNb6Pt (mp_eah = -0.0098)
  22. Mo2NbTi2 (mp_eah = -0.0011)
  23. Mo2TcW (mp_eah = -0.0077)
  24. MoNbTa2 (mp_eah = -0.0122)
  25. AsMo6Os (mp_eah = -0.0071)
  26. AsMo6Pt (mp_eah = -0.0219)
  27. MoNb2Ta (mp_eah = -0.0024)
  28. Mo4NbTc (mp_eah = -0.0175)
  29. IrNb6Tc

## Structure Standardization

In [5]:
# Convert every stable structure to its conventional standard cell
adaptor = AseAtomsAdaptor()


def _standardize_entry(original_index, entry):
    """Build the record for one row, with its structure moved to the conventional cell."""
    symmetry = SpacegroupAnalyzer(adaptor.get_structure(entry['structure']))
    conventional = symmetry.get_conventional_standard_structure()
    return {
        'original_index': original_index,
        'formula': entry['formula'],
        'mp_eah': entry['mp_eah'],
        'E_form': entry['E_form'],
        'standardized_atoms': adaptor.get_atoms(conventional),
        'space_group': symmetry.get_space_group_number(),
    }


standardized_data = [
    _standardize_entry(label, entry) for label, entry in df_stable.iterrows()
]
standardized_df = pd.DataFrame(standardized_data)

space_group_counts = standardized_df['space_group'].value_counts().to_dict()
print(f"Standardized {len(standardized_df)} structures")
print(f"Space group distribution: {space_group_counts}")

Standardized 29 structures
Space group distribution: {200: 9, 216: 2, 71: 2, 25: 2, 65: 1, 164: 1, 223: 1, 115: 1, 40: 1, 123: 1, 166: 1, 8: 1, 127: 1, 10: 1, 12: 1, 225: 1, 47: 1, 99: 1}


## Deduplication

In [6]:
# Report formulas that occur more than once among the standardized structures
formula_counts = standardized_df['formula'].value_counts()
duplicates = formula_counts[formula_counts > 1]

print(f"Unique formulas: {len(formula_counts)}/{len(standardized_df)}")
if not duplicates.empty:
    print(f"Duplicate formulas: {len(duplicates)}")
    for repeated_formula, n_copies in duplicates.items():
        print(f"  {repeated_formula}: {n_copies} structures")
        # List every entry sharing this formula together with its mp_eah
        same_formula = standardized_df[standardized_df['formula'] == repeated_formula]
        for _, entry in same_formula.iterrows():
            print(f"    Index {entry['original_index']}: mp_eah = {entry['mp_eah']:.4f}")

Unique formulas: 26/29
Duplicate formulas: 3
  Mo2NbTc3: 2 structures
    Index 20657: mp_eah = -0.0085
    Index 13944: mp_eah = -0.0119
  HfScTc4: 2 structures
    Index 13223: mp_eah = -0.0327
    Index 16950: mp_eah = -0.0327
  Mo3Tc: 2 structures
    Index 1449: mp_eah = -0.0116
    Index 59903: mp_eah = -0.0181


In [7]:
# For every formula retain the single entry with the lowest mp_eah
def _lowest_eah_entry(formula):
    """Return the row of standardized_df with minimal mp_eah among rows of `formula`."""
    candidates = standardized_df[standardized_df['formula'] == formula]
    return candidates.loc[candidates['mp_eah'].idxmin()]


# Formulas are visited in order of first appearance
deduplicated_data = [
    _lowest_eah_entry(name) for name in standardized_df['formula'].unique()
]
deduplicated_df = pd.DataFrame(deduplicated_data).reset_index(drop=True)
print(f"After deduplication: {len(deduplicated_df)} structures")

for position, entry in deduplicated_df.iterrows():
    print(f"  {position+1:2d}. {entry['formula']} (Index {entry['original_index']}, mp_eah = {entry['mp_eah']:.4f})")

After deduplication: 26 structures
   1. Mo2NbTc3 (Index 13944, mp_eah = -0.0119)
   2. Mo6OsRu (Index 88841, mp_eah = -0.0294)
   3. HfScTc4 (Index 16950, mp_eah = -0.0327)
   4. Mo3Tc (Index 59903, mp_eah = -0.0181)
   5. Nb6Pt2 (Index 1695, mp_eah = -0.0111)
   6. AuNb6Pt (Index 51254, mp_eah = -0.0007)
   7. Mo5NbOs2 (Index 64238, mp_eah = -0.0376)
   8. IrMo6Tc (Index 33668, mp_eah = -0.0113)
   9. Mo4Nb2Os2 (Index 56825, mp_eah = -0.0099)
  10. H3Mo2Pd2 (Index 96733, mp_eah = -0.0054)
  11. Mo6ReRu (Index 34530, mp_eah = -0.0188)
  12. Mo3NbTc2 (Index 175399, mp_eah = -0.0067)
  13. AlIrNb6 (Index 29530, mp_eah = -0.0068)
  14. IrMo2Os (Index 1428, mp_eah = -0.0189)
  15. Be4Nb6 (Index 394, mp_eah = -0.0055)
  16. Mo5ReSiTc (Index 57708, mp_eah = -0.0018)
  17. Ir2Mo3Os (Index 2112, mp_eah = -0.0275)
  18. IrNb6Pt (Index 1452, mp_eah = -0.0098)
  19. Mo2NbTi2 (Index 82458, mp_eah = -0.0011)
  20. Mo2TcW (Index 102684, mp_eah = -0.0077)
  21. MoNbTa2 (Index 1782, mp_eah = -0.0122)

## Encyclopedia Comparison

In [8]:
def parse_encyclopedia_output(output, match_threshold=0.10, family_threshold=0.20):
    """Parse the text printed by ``aflow --compare2prototypes``.

    Lines following a "List of duplicates" header are read as
    ``<prototype> ... <misfit>`` records until a line starting with ``=`` is
    reached. Lines whose last token is not a number are ignored.

    Args:
        output: Captured stdout of the AFLOW command.
        match_threshold: Largest misfit still classified as ``"known"``.
        family_threshold: Largest misfit still classified as ``"family"``.

    Returns:
        dict with keys
        ``status``      -- ``"known"``, ``"family"`` or ``"novel"``;
        ``matches``     -- list of ``{"prototype", "misfit"}`` dicts sorted by
                           ascending misfit, so ``matches[0]`` is the best match;
        ``best_misfit`` -- smallest misfit found, or ``None`` without matches.
    """
    if "No compatible prototypes found" in output:
        return {"status": "novel", "matches": [], "best_misfit": None}

    matches = []
    in_duplicates = False

    for line in output.split('\n'):
        if "List of duplicates" in line or "list of duplicates" in line:
            in_duplicates = True
            continue
        if "-----" in line:
            # Separator rows carry no data.
            continue
        if in_duplicates and line.strip() and not line.startswith('-'):
            parts = line.split()
            if len(parts) >= 2:
                try:
                    misfit = float(parts[-1])
                except ValueError:
                    # Not a "<prototype> ... <misfit>" record; ignore it.
                    pass
                else:
                    matches.append({"prototype": parts[0], "misfit": misfit})
        if in_duplicates and line.startswith('='):
            # A line starting with '=' ends the duplicates section.
            break

    if not matches:
        return {"status": "novel", "matches": [], "best_misfit": None}

    best_misfit = min(m["misfit"] for m in matches)
    # Stable sort: callers that read matches[0] now get the entry that
    # corresponds to best_misfit instead of whichever line came first.
    matches.sort(key=lambda m: m["misfit"])

    if best_misfit <= match_threshold:
        status = "known"
    elif best_misfit <= family_threshold:
        status = "family"
    else:
        status = "novel"
    return {"status": status, "matches": matches, "best_misfit": best_misfit}

def parse_isopointal_output(output):
    """Collect prototype codes from ``aflow --get_isopointal_prototypes`` text.

    Blank lines and lines beginning with ``0000`` or ``MESSAGE`` are skipped.
    A remaining line is split on commas when it contains one; otherwise the
    whole stripped line is taken as a single code.

    Returns:
        dict with ``isopointal_codes`` (list of str) and ``isopointal_count``.
    """
    if "no isopointal prototypes" in output.lower():
        return {"isopointal_codes": [], "isopointal_count": 0}

    codes = []
    for raw_line in output.split('\n'):
        entry = raw_line.strip()
        if not entry or entry.startswith(('0000', 'MESSAGE')):
            continue
        if ',' in entry:
            # Comma-separated list; drop empty fragments such as a trailing comma.
            codes.extend(token.strip() for token in entry.split(',') if token.strip())
        else:
            codes.append(entry)

    return {"isopointal_codes": codes, "isopointal_count": len(codes)}

In [9]:
# Compare every deduplicated structure against the AFLOW prototype encyclopedia
novelty_results = []
novel_structures = []

for i, row in deduplicated_df.iterrows():
    # Reserve a temporary .cif path; the file is filled in by ASE below.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.cif', delete=False) as f:
        temp_cif = f.name

    isopointal_data = {"isopointal_codes": [], "isopointal_count": 0}
    try:
        write(temp_cif, row['standardized_atoms'], format='cif')

        # Encyclopedia comparison (structure is passed on stdin)
        cmd = ['aflow', '--compare2prototypes', '--np=1']
        with open(temp_cif, 'r') as f:
            compare_result = subprocess.run(cmd, stdin=f, capture_output=True, text=True, timeout=180)

        if compare_result.returncode == 0:
            encyclopedia_data = parse_encyclopedia_output(compare_result.stdout)
        else:
            # A failed run has no trustworthy output; parsing it would
            # mislabel the structure as novel, so record the failure instead.
            encyclopedia_data = {"status": "failed", "matches": [], "best_misfit": None}

        # Isopointal analysis only for structures without a prototype match
        if encyclopedia_data['status'] == 'novel':
            cmd_iso = ['aflow', '--get_isopointal_prototypes']
            with open(temp_cif, 'r') as f:
                iso_result = subprocess.run(cmd_iso, stdin=f, capture_output=True, text=True, timeout=60)
            if iso_result.returncode == 0:
                isopointal_data = parse_isopointal_output(iso_result.stdout)
    finally:
        # Remove the temporary file even when writing or AFLOW raised/timed out.
        os.remove(temp_cif)

    # Classify novelty
    if encyclopedia_data['status'] == 'known':
        novelty_type = 'known_prototype'
    elif encyclopedia_data['status'] == 'family':
        novelty_type = 'same_family'
    elif encyclopedia_data['status'] == 'failed':
        novelty_type = 'comparison_failed'
    elif isopointal_data['isopointal_count'] > 0:
        novelty_type = 'novel_arrangement'
    else:
        novelty_type = 'novel'

    novelty_results.append({
        'original_index': row['original_index'],
        'formula': row['formula'],
        'mp_eah': row['mp_eah'],
        'space_group': row['space_group'],
        'novelty_type': novelty_type,
        'best_misfit': encyclopedia_data['best_misfit'],
        'isopointal_codes': isopointal_data['isopointal_codes']
    })

    # Print result
    print(f"{i+1:2d}. {row['formula']}", end=" -> ")
    if novelty_type == "novel":
        novel_structures.append(row)
        print("NOVEL")
    elif novelty_type == "novel_arrangement":
        iso_str = ', '.join(isopointal_data['isopointal_codes'])
        print(f"NOVEL ARRANGEMENT (isopointal: {iso_str})")
    elif novelty_type == "comparison_failed":
        print(f"FAILED (aflow exit code {compare_result.returncode})")
    else:
        # Report the prototype that actually has the lowest misfit, which is
        # not necessarily the first one listed in the AFLOW output.
        best_match = min(encyclopedia_data['matches'], key=lambda m: m['misfit'])['prototype']
        misfit = encyclopedia_data['best_misfit']
        label = "KNOWN" if novelty_type == "known_prototype" else "FAMILY"
        print(f"{label}: {best_match} (misfit: {misfit:.3f})")

novelty_df = pd.DataFrame(novelty_results)
print(f"\nNovelty classification: {novelty_df['novelty_type'].value_counts().to_dict()}")
print(f"Novel structures for clustering: {len(novel_structures)}")

 1. Mo2NbTc3 -> KNOWN: AB2C3_oC12_65_a_i_cj-001 (misfit: 0.018)
 2. Mo6OsRu -> NOVEL
 3. HfScTc4 -> KNOWN: AB4C_cF24_216_a_e_c-001 (misfit: 0.000)
 4. Mo3Tc -> KNOWN: A3B_oI8_71_af_b-001 (misfit: 0.009)
 5. Nb6Pt2 -> KNOWN: A3B_cP8_223_c_a-002 (misfit: 0.000)
 6. AuNb6Pt -> NOVEL
 7. Mo5NbOs2 -> NOVEL
 8. IrMo6Tc -> NOVEL
 9. Mo4Nb2Os2 -> NOVEL
10. H3Mo2Pd2 -> NOVEL ARRANGEMENT (isopointal: L_1Z5Z, L_CXMZ)
11. Mo6ReRu -> NOVEL
12. Mo3NbTc2 -> NOVEL ARRANGEMENT (isopointal: L_DNMB)
13. AlIrNb6 -> NOVEL
14. IrMo2Os -> NOVEL
15. Be4Nb6 -> KNOWN: A2B3_tP10_127_g_ah-001 (misfit: 0.028)
16. Mo5ReSiTc -> NOVEL
17. Ir2Mo3Os -> NOVEL
18. IrNb6Pt -> NOVEL
19. Mo2NbTi2 -> NOVEL ARRANGEMENT (isopointal: L_66R4, L_6WMF, L_7J12, L_B1C0, L_K5GV, L_ZD5H)
20. Mo2TcW -> NOVEL ARRANGEMENT (isopointal: L_Y65D, L_YVVW)
21. MoNbTa2 -> KNOWN: AB2C_cF16_225_a_c_b-001 (misfit: 0.000)
22. AsMo6Os -> NOVEL
23. AsMo6Pt -> NOVEL
24. MoNb2Ta -> NOVEL ARRANGEMENT (isopointal: L_8FMW)
25. Mo4NbTc -> NOVEL
26. IrNb6Tc

## Structure Clustering

In [10]:
# Dump each fully novel structure as a CIF into a fresh batch directory for AFLOW
batch_dir = Path("novel_structures_batch")
if batch_dir.exists():
    shutil.rmtree(batch_dir)  # discard files left over from a previous run
batch_dir.mkdir()

structure_mapping = {}
for entry in novel_structures:
    safe_formula = entry['formula'].replace('/', '_')
    structure_name = f"struct_{entry['original_index']}_{safe_formula}"
    cif_path = batch_dir / f"{structure_name}.cif"

    write(str(cif_path), entry['standardized_atoms'], format='cif')

    # Metadata is keyed by the file name without its .cif extension
    structure_mapping[structure_name] = {
        field: entry[field]
        for field in ('original_index', 'formula', 'mp_eah', 'space_group')
    }

print(f"Written {len(novel_structures)} CIF files")

Written 15 CIF files


In [11]:
# Compare all CIF files in the batch directory against each other with AFLOW
cmd = ['aflow', '--compare_structures', '-D', str(batch_dir), '--np=1']
result = subprocess.run(cmd, capture_output=True, text=True, timeout=1800)

# A failed run would leave stdout empty or partial, and the parsing cell would
# then silently report too few (or zero) families; stop here instead.
if result.returncode != 0:
    raise RuntimeError(
        f"AFLOW clustering failed with exit code {result.returncode}: "
        f"{result.stderr.strip()}"
    )

print("AFLOW clustering output:")
print("-" * 40)
print(result.stdout)
print("-" * 40)

AFLOW clustering output:
----------------------------------------
00000  MESSAGE XtalFinderCalculator::getOptions(): Misfit threshold for matched structures: 0.1 (default: 0.1) - [dir=novel_structures_batch] - [user=jhamlin] - [host=physicsphamlint450] - [PID=67965] - [date=Wed Oct  8 13:36:15 2025] - [/home/runner/_work/AFLOW4/AFLOW4/src/modules/COMPARE/aflow_compare_structure_function.cpp:283]
00000  MESSAGE XtalFinderCalculator::getOptions(): Misfit threshold for structures in the same family: 0.2 (default: 0.2) - [dir=novel_structures_batch] - [user=jhamlin] - [host=physicsphamlint450] - [PID=67965] - [date=Wed Oct  8 13:36:15 2025] - [/home/runner/_work/AFLOW4/AFLOW4/src/modules/COMPARE/aflow_compare_structure_function.cpp:285]
00000  MESSAGE compare::compareInputStructures(): Comparison directory: novel_structures_batch - [dir=novel_structures_batch] - [user=jhamlin] - [host=physicsphamlint450] - [PID=67965] - [date=Wed Oct  8 13:36:15 2025] - [/home/runner/_work/AFLOW4/AFLOW4/sr

In [12]:
def parse_aflow_directory_output(output):
    """Parse the text printed by ``aflow --compare_structures -D <dir>``.

    A group starts at a ``prototype=<value>`` line. Its duplicates are the
    ``.cif`` lines after a "List of duplicates" header whose last token parses
    as a float. A group is closed by a "No duplicates" line, by an
    "Identified unique prototype:" line once its duplicate list has started,
    by the next ``prototype=`` line, or by the end of the text.

    Args:
        output: Captured stdout of the AFLOW command.

    Returns:
        list of dicts ``{"representative": str, "duplicates": [...]}`` where each
        duplicate is ``{"structure_path": str, "misfit": float}``.
    """
    results = []
    current_prototype = None
    in_duplicates_list = False

    for line in output.split('\n'):
        line = line.strip()

        if line.startswith('prototype='):
            # Keep a group that was never explicitly closed instead of
            # overwriting (and thereby losing) it.
            if current_prototype is not None:
                results.append(current_prototype)
            current_prototype = {
                'representative': line.split('=', 1)[1].strip(),
                'duplicates': []
            }
            in_duplicates_list = False
            continue

        if 'List of duplicates' in line and current_prototype:
            in_duplicates_list = True
            continue

        if 'No duplicates' in line and current_prototype:
            results.append(current_prototype)
            current_prototype = None
            continue

        if in_duplicates_list and current_prototype and line:
            # Separator and column-header rows carry no data.
            if line.startswith('---') or 'misfit value' in line:
                continue
            if '.cif' in line and any(char.isdigit() for char in line):
                parts = line.split()
                if len(parts) >= 2:
                    try:
                        misfit = float(parts[-1])
                    except ValueError:
                        # Last column is not numeric; not a duplicate record.
                        pass
                    else:
                        current_prototype['duplicates'].append({
                            'structure_path': parts[0],
                            'misfit': misfit
                        })

        if line.startswith('Identified unique prototype:') and current_prototype and in_duplicates_list:
            results.append(current_prototype)
            current_prototype = None
            in_duplicates_list = False

    # The final group has no closing line after it.
    if current_prototype:
        results.append(current_prototype)

    return results

def convert_aflow_groups_to_clusters(parsed_groups, structure_mapping):
    """Turn parsed AFLOW groups into cluster summaries.

    Each group's representative and duplicates are reduced to their file name
    without ``.cif`` and looked up in ``structure_mapping``; names that are not
    in the mapping are dropped. Groups with no known member are omitted, so
    ``cluster_id`` (the group's position in ``parsed_groups``) may have gaps.

    Returns:
        list of dicts with ``cluster_id``, ``cluster_size``,
        ``structure_indices``, ``formulas``, ``space_groups`` and
        ``mp_eah_values`` (the last four ordered representative first).
    """
    def _structure_name(path):
        return os.path.basename(path).replace('.cif', '')

    clusters = []
    for group_id, group in enumerate(parsed_groups):
        candidate_names = [_structure_name(group['representative'])]
        candidate_names.extend(
            _structure_name(dup['structure_path']) for dup in group['duplicates']
        )
        members = [name for name in candidate_names if name in structure_mapping]
        if not members:
            continue

        records = [structure_mapping[name] for name in members]
        clusters.append({
            'cluster_id': group_id,
            'cluster_size': len(members),
            'structure_indices': [record['original_index'] for record in records],
            'formulas': [record['formula'] for record in records],
            'space_groups': [record.get('space_group') for record in records],
            'mp_eah_values': [record['mp_eah'] for record in records]
        })

    return clusters

# Turn the captured AFLOW text into structural families and summarise them
parsed_groups = parse_aflow_directory_output(result.stdout)
clusters = convert_aflow_groups_to_clusters(parsed_groups, structure_mapping)

cluster_sizes = [family['cluster_size'] for family in clusters]
print(f"Structural families: {len(clusters)}")
print(f"Family sizes: {sorted(cluster_sizes, reverse=True)}")

# For each family show its member with the lowest mp_eah
for family_number, family in enumerate(clusters, start=1):
    lowest = np.argmin(family['mp_eah_values'])
    most_stable = family['formulas'][lowest]
    stability = family['mp_eah_values'][lowest]
    print(f"Family {family_number}: {family['cluster_size']} structures, representative: {most_stable} (mp_eah = {stability:.4f})")

Structural families: 7
Family sizes: [9, 1, 1, 1, 1, 1, 1]
Family 1: 1 structures, representative: IrMo2Os (mp_eah = -0.0189)
Family 2: 9 structures, representative: Mo6OsRu (mp_eah = -0.0294)
Family 3: 1 structures, representative: Mo4NbTc (mp_eah = -0.0175)
Family 4: 1 structures, representative: Ir2Mo3Os (mp_eah = -0.0275)
Family 5: 1 structures, representative: Mo4Nb2Os2 (mp_eah = -0.0099)
Family 6: 1 structures, representative: Mo5ReSiTc (mp_eah = -0.0018)
Family 7: 1 structures, representative: Mo5NbOs2 (mp_eah = -0.0376)


## Representative Selection and A15 Analysis

In [13]:
# Pick the lowest-mp_eah member of each family as its representative
def _family_representative(family_id, cluster):
    """Describe the member of `cluster` with the smallest mp_eah value."""
    best = np.argmin(cluster['mp_eah_values'])
    return {
        'family_id': family_id,
        'original_index': cluster['structure_indices'][best],
        'formula': cluster['formulas'][best],
        'mp_eah': cluster['mp_eah_values'][best],
        'family_size': cluster['cluster_size'],
        'space_group': cluster['space_groups'][best]
    }


# Family ids are 1-based and follow the order of `clusters`
representatives = [
    _family_representative(number, cluster)
    for number, cluster in enumerate(clusters, start=1)
]

print(f"Selected {len(representatives)} representative structures:")
for rep in representatives:
    print(f"  {rep['family_id']}. {rep['formula']} (family size: {rep['family_size']}, mp_eah: {rep['mp_eah']:.4f})")

Selected 7 representative structures:
  1. IrMo2Os (family size: 1, mp_eah: -0.0189)
  2. Mo6OsRu (family size: 9, mp_eah: -0.0294)
  3. Mo4NbTc (family size: 1, mp_eah: -0.0175)
  4. Ir2Mo3Os (family size: 1, mp_eah: -0.0275)
  5. Mo4Nb2Os2 (family size: 1, mp_eah: -0.0099)
  6. Mo5ReSiTc (family size: 1, mp_eah: -0.0018)
  7. Mo5NbOs2 (family size: 1, mp_eah: -0.0376)


In [14]:
# Load the Cr3Si reference cell that serves as the A15 prototype
a15_cif_path = "../data/Cr3Si.cif"
a15_prototype = Structure.from_file(a15_cif_path)
print(f"A15 prototype: {a15_prototype.formula}")

def check_a15_like(pmg_structure, a15_prototype):
    """Test whether a structure matches the A15 prototype after merging species.

    Candidate species substitutions are built from ``pmg_structure.elements``
    (presumably the distinct elements of the structure -- confirm against the
    pymatgen version in use):

    * 2 elements  -- the structure is compared unchanged;
    * 3 elements  -- for each pair, the first element is replaced by the second;
    * 4+ elements -- for each pair kept as targets, every other element is
      assigned to one of the two targets (at most 8 assignments per pair);
    * otherwise   -- nothing is tried.

    Each candidate is compared with ``StructureMatcher.fit_anonymous``.

    Returns:
        ``(True, mapping)`` for the first matching candidate (``{}`` when the
        unmodified structure matched), else ``(False, None)``.
    """
    species = pmg_structure.elements
    n_species = len(species)

    if n_species == 2:
        candidate_mappings = [{}]
    elif n_species == 3:
        candidate_mappings = [
            {first: second} for first, second in itertools.combinations(species, 2)
        ]
    elif n_species >= 4:
        candidate_mappings = []
        for target_pair in itertools.combinations(species, 2):
            others = [sp for sp in species if sp not in target_pair]
            n_assignments = min(8, 2 ** len(others))
            for bits in range(n_assignments):
                # Bit `pos` of `bits` selects which target `others[pos]` becomes.
                assignment = {
                    sp: (target_pair[0] if (bits >> pos) & 1 else target_pair[1])
                    for pos, sp in enumerate(others)
                }
                if assignment:
                    candidate_mappings.append(assignment)
    else:
        candidate_mappings = []

    matcher = StructureMatcher(primitive_cell=True, attempt_supercell=True)
    for assignment in candidate_mappings:
        if assignment:
            trial = pmg_structure.replace_species(assignment, in_place=False)
        else:
            trial = pmg_structure

        if matcher.fit_anonymous(a15_prototype, trial):
            return True, assignment

    return False, None

# Test every family representative for an A15-like arrangement
a15_results = []
for rep in representatives:
    # The structure is taken from the original table, not the standardized one
    atoms = df.loc[rep['original_index'], 'structure']
    pmg_struct = adaptor.get_structure(atoms)
    is_a15_like, mapping = check_a15_like(pmg_struct, a15_prototype)

    a15_results.append({
        'family_id': rep['family_id'],
        'formula': rep['formula'],
        'is_a15_like': is_a15_like,
        'mapping': mapping
    })

    if is_a15_like:
        mapping_str = str(mapping) if mapping else "direct"
        verdict = f"A15-like ({mapping_str})"
    else:
        verdict = "Not A15-like"
    print(f"{rep['family_id']}. {rep['formula']} -> {verdict}")

a15_count = len([entry for entry in a15_results if entry['is_a15_like']])
print(f"\nA15-like representatives: {a15_count}/{len(representatives)}")

A15 prototype: Cr6 Si2
1. IrMo2Os -> Not A15-like
2. Mo6OsRu -> A15-like ({Element Os: Element Ru})
3. Mo4NbTc -> Not A15-like
4. Ir2Mo3Os -> Not A15-like
5. Mo4Nb2Os2 -> A15-like ({Element Nb: Element Mo})
6. Mo5ReSiTc -> A15-like ({Element Si: Element Re, Element Mo: Element Tc})
7. Mo5NbOs2 -> A15-like ({Element Nb: Element Mo})

A15-like representatives: 4/7


## Export Representative Structures

In [15]:
# Export each family representative as a CIF file plus a README summary
output_dir = Path("./representative_structures")
if output_dir.exists():
    shutil.rmtree(output_dir)
output_dir.mkdir()

# Build the index -> standardized structure lookup once instead of rescanning
# deduplicated_df for every representative. setdefault keeps the first row for
# an index, matching a first-match scan.
atoms_by_index = {}
for _, row in deduplicated_df.iterrows():
    atoms_by_index.setdefault(row['original_index'], row['standardized_atoms'])

# (representative, filename) pairs that were actually written; the README and
# the final count are derived from this list so they never mention a file
# that does not exist.
exported = []
for rep in representatives:
    atoms = atoms_by_index.get(rep['original_index'])
    if atoms is None:
        print(f"  {rep['family_id']}. skipped: no standardized structure for index {rep['original_index']}")
        continue

    safe_formula = rep['formula'].replace('/', '_')
    filename = f"{safe_formula}_sg{rep['space_group']}_idx{rep['original_index']}.cif"
    write(str(output_dir / filename), atoms, format='cif')
    exported.append((rep, filename))
    print(f"  {rep['family_id']}. {filename}")

# Create summary file
summary_lines = [
    "Representative Novel Crystal Structures",
    "=" * 40,
    f"Total representatives: {len(exported)}",
    "All structures are thermodynamically stable and completely novel",
    ""
]

for rep, filename in exported:
    summary_lines.extend([
        f"{rep['family_id']}. {rep['formula']}",
        f"   File: {filename}",
        f"   Family size: {rep['family_size']} structures",
        f"   Stability: {rep['mp_eah']:.4f} eV/atom",
        f"   Space group: {rep['space_group']}",
        ""
    ])

summary_file = output_dir / "README.txt"
with open(summary_file, 'w') as f:
    f.write('\n'.join(summary_lines))

print(f"\nExported {len(exported)} representative structures to {output_dir}")

  1. IrMo2Os_sg25_idx1428.cif
  2. Mo6OsRu_sg200_idx88841.cif
  3. Mo4NbTc_sg99_idx162167.cif
  4. Ir2Mo3Os_sg10_idx2112.cif
  5. Mo4Nb2Os2_sg40_idx56825.cif
  6. Mo5ReSiTc_sg25_idx57708.cif
  7. Mo5NbOs2_sg115_idx64238.cif

Exported 7 representative structures to representative_structures


In [16]:
# Remove the temporary CIF batch directory if it is still present
batch_dir_present = batch_dir.exists()
if batch_dir_present:
    shutil.rmtree(batch_dir)
    print(f"Cleaned up {batch_dir}")

Cleaned up novel_structures_batch
