# Reference atomic structures

This Notebook downloads reference crystal structures from external databases (the Materials Project and OQMD), runs space group analysis on them, and matches them to known crystal prototype structures.

**Library imports**

In [1]:
# Standard Python libraries
from __future__ import (print_function, division, absolute_import,
                        unicode_literals)
import os
import glob

# http://www.numpy.org/
import numpy as np

# https://pandas.pydata.org/
import pandas as pd

# https://github.com/usnistgov/atomman
import atomman as am
import atomman.lammps as lmp

# https://github.com/usnistgov/iprPy
import iprPy
# Show which iprPy version this notebook is being run with
print('iprPy version', iprPy.__version__)

iprPy version 0.8.3


## 1. Build unique sets of elements from included potentials

### Load database

In [2]:
# Load the database named 'master'; swap in the commented line to use 'demo' instead
#database = iprPy.load_database('demo')
database = iprPy.load_database('master')

### Option #1: Get all elements lists from potentials

In [3]:
# One space-delimited, sorted element string per LAMMPS potential record;
# the set removes duplicates shared by multiple potentials
elements_set = {
    ' '.join(sorted(lmp.Potential(record.content).elements()))
    for record in database.get_records(style='potential_LAMMPS')
}

### Option #2: Only get elements lists for certain systems

In [3]:
# Limit the element sets to the systems listed here
elements_set = {'Ag'}

## 2. Fetch Materials Project reference structures

### Define get_mp_structures() function

In [7]:
def get_mp_structures(elements, api_key=None, lib_directory=None):
    """
    Accesses the Materials Project and downloads all structures for a list of
    elements, and for every sub-combination of those elements, as poscar
    files.

    Each structure is converted to its conventional standard cell, normalized
    with atomman, and saved as <lib_directory>/<El1-El2-...>/<material_id>.poscar.
    Entries whose poscar file already exists are not downloaded again.

    Parameters
    ----------
    elements : list
        A list of element symbols.  The list itself is not modified.
    api_key : str, optional
        The user's Materials Project API key. If not given, will use "MAPI_KEY"
        environment variable
    lib_directory : str, optional
        Path to the lib_directory to save the poscar files to.  Default uses
        the library/ref directory located next to iprPy.rootdir.
    """
    # Function-specific imports (explicit submodule imports so the classes do
    # not depend on what "import pymatgen" happens to expose)
    from pymatgen.core.structure import Structure
    from pymatgen.ext.matproj import MPRester
    from pymatgen.symmetry.analyzer import SpacegroupAnalyzer

    # Define subset generator
    def subsets(fullset):
        """Yield every non-empty sub-list of fullset, keeping element order."""
        for i, item in enumerate(fullset):
            yield [item]
            for subset in subsets(fullset[i+1:]):
                yield [item] + subset

    # Handle lib_directory
    if lib_directory is None:
        lib_directory = os.path.join(os.path.dirname(iprPy.rootdir), 'library', 'ref')
    lib_directory = os.path.abspath(lib_directory)

    # Sort a copy so that the caller's list is left untouched
    elements = sorted(elements)

    # Open connection to Materials Project
    with MPRester(api_key) as m:

        # Loop over subsets of elements
        for subelements in subsets(elements):

            # Set comp_directory
            elements_string = '-'.join(subelements)
            comp_directory = os.path.join(lib_directory, elements_string)
            if not os.path.isdir(comp_directory):
                os.makedirs(comp_directory)

            # Build set of downloaded entries.  All poscar files are listed
            # (not only mp-*) so that saved mvc-* entries are also recognized
            # and not downloaded again on every call.
            have = set()
            for fname in glob.iglob(os.path.join(comp_directory, '*.poscar')):
                have.add(os.path.splitext(os.path.basename(fname))[0])

            # Query MP for all entries corresponding to the elements
            entries = m.query({"elements": subelements}, ["material_id"])

            # Collect ids that are not yet saved, keeping query order
            missing = []
            for entry in entries:
                material_id = entry['material_id']
                if material_id not in have and material_id not in missing:
                    missing.append(material_id)

            # Nothing new for this composition: skip the download query
            if len(missing) == 0:
                continue

            # Download missing entries
            try:
                entries = m.query({"material_id": {"$in": missing}}, ['material_id', 'cif'])
            except Exception as err:
                # Best effort: report the failure and move on to the next composition
                print('Download failed for', elements_string, '-', repr(err))
                continue

            # Convert cif to poscar and save
            for entry in entries:
                struct = Structure.from_str(entry['cif'], fmt='cif')
                struct = SpacegroupAnalyzer(struct).get_conventional_standard_structure()
                system = am.load('pymatgen_Structure', struct)
                system = system.normalize()
                structure_file = os.path.join(comp_directory, entry['material_id']+'.poscar')
                system.dump('poscar', f=structure_file)
                print('Added', entry['material_id'])

### Get reference structures

Specify the location of a text file containing your personal Materials Project API key

In [8]:
# Path to a text file holding the personal Materials Project API key
mp_api_key_location = 'C:\\users\\lmh1\\Documents\\Materials Project\\API key.txt'

# Strip surrounding whitespace so that a trailing newline in the file does not
# become part of the key
with open(mp_api_key_location) as f:
    mp_api_key = f.read().strip()

In [9]:
# Each member of elements_set is a space-delimited string of element symbols
for elements in elements_set:
    symbols = elements.split()
    get_mp_structures(symbols, api_key=mp_api_key)

Added mp-1007760
Added mp-1008733
Added mp-1061054
Added mp-1067619
Added mp-1079020
Added mp-1080106
Added mp-1091415
Added mp-12093
Added mp-128
Added mp-137
Added mp-148
Added mp-32
Added mp-78
Added mp-998883
Added mp-10
Added mp-1096826
Added mp-11
Added mp-158
Added mp-10048
Added mp-1059094
Added mp-15619
Added mp-2534
Added mp-603640
Added mp-8883
Added mp-10021
Added mp-1007857
Added mp-1067880
Added mp-140
Added mp-142
Added mp-567540
Added mp-569007
Added mp-569423
Added mp-1071163
Added mp-1079207
Added mp-1101022
Added mp-1101295
Added mp-1101463
Added mp-1102255
Added mp-1102591
Added mp-1105338
Added mvc-11115
Added mvc-11423
Added mvc-11500
Added mvc-11600
Added mvc-11912
Added mvc-12404
Added mvc-12466
Added mvc-12939
Added mvc-13391
Added mvc-1923
Added mvc-2169
Added mvc-379
Added mvc-4715
Added mvc-5171
Added mvc-6590
Added mvc-9726
Added mp-24093
Added mp-1007824
Added mp-2853
Added mp-804
Added mp-830
Added mp-1056004
Added mp-1066523
Added mp-1079710
Added mp-107

Added mp-1097290
Added mp-12086
Added mp-12608
Added mp-644311
Added mp-973983
Added mp-1100766
Added mp-12798
Added mp-945
Added mp-1057216
Added mp-1094034
Added mp-1094056
Added mp-1096549
Added mp-972751
Added mp-978534


## 3. Fetch OQMD reference structures

### Define get_oqmd_structures() function

In [10]:
def get_oqmd_structures(elements, lib_directory=None):
    """
    Accesses the Open Quantum Materials Database (OQMD) and downloads all
    structures listed on the composition page for a list of elements as
    poscar files.

    The entry links are scraped from the OQMD composition web page.  For each
    entry not already saved, the conventional-cell poscar export is tried
    first and the primitive-cell export second.  Files are saved as
    <lib_directory>/<El1-El2-...>/oqmd-<entry number>.poscar, where the
    directory name is built from the sorted element symbols in the poscar.

    Parameters
    ----------
    elements : list
        A list of element symbols.  The list itself is not modified.
    lib_directory : str, optional
        Path to the lib_directory to save the poscar files to.  Default uses
        the library/ref directory located next to iprPy.rootdir.
    """
    # Function-specific imports
    import re
    import requests

    # Define subset generator
    def subsets(fullset):
        """Yield every non-empty sub-list of fullset, keeping element order."""
        for i, item in enumerate(fullset):
            yield [item]
            for subset in subsets(fullset[i+1:]):
                yield [item] + subset

    # Get default lib_directory
    if lib_directory is None:
        lib_directory = os.path.join(os.path.dirname(iprPy.rootdir), 'library', 'ref')
    lib_directory = os.path.abspath(lib_directory)

    # Sort a copy so that the caller's list is left untouched
    elements = sorted(elements)

    # Build set of already downloaded entries across all sub-compositions
    have = set()
    for subelements in subsets(elements):
        comp_directory = os.path.join(lib_directory, '-'.join(subelements))
        if not os.path.isdir(comp_directory):
            os.makedirs(comp_directory)
        for fname in glob.iglob(os.path.join(comp_directory, 'oqmd-*.poscar')):
            have.add(os.path.splitext(os.path.basename(fname))[0])

    # Fetch the composition page and extract every linked entry number.
    # A non-greedy regex replaces the manual index loop, so pages listing more
    # than 100 entries no longer raise a spurious "infinite loop" error.
    elements_string = '-'.join(elements)
    composition_r = requests.get('http://oqmd.org/materials/composition/' + elements_string)
    entry_numbers = re.findall(r'href="/materials/entry/(.*?)">',
                               composition_r.text, flags=re.DOTALL)

    # Build list of missing OQMD entries, keeping page order
    missing = []
    for entry_number in entry_numbers:
        entry_id = 'oqmd-' + entry_number
        if entry_id not in have and entry_id not in missing:
            missing.append(entry_id)

    # Download missing entries
    for entry_id in missing:
        entry_number = entry_id.replace('oqmd-', '')
        entry_r = requests.get('http://oqmd.org/materials/entry/' + entry_number)

        # Find the structure number linked from the entry page
        structure_match = re.search(r'href="/materials/structure/(.*?)">',
                                    entry_r.text, flags=re.DOTALL)
        if structure_match is None:
            # Skip this entry rather than aborting the whole composition
            print('Skipped', entry_id, '(no structure link found)')
            continue
        structure_number = structure_match.group(1)

        # Try the conventional cell export first, then the primitive cell
        structure_r = None
        for cell in ('conventional', 'primitive'):
            structure_url = 'http://oqmd.org/materials/export/' + cell + '/poscar/' + structure_number
            try:
                response = requests.get(structure_url)
                response.raise_for_status()
            except requests.RequestException:
                continue
            structure_r = response
            break
        if structure_r is None:
            continue

        # Load the poscar to identify which elements it contains
        poscar = structure_r.text
        system = am.load('poscar', poscar)
        system = system.normalize()

        # Directory names use sorted symbols (same convention as above), and
        # the directory is created if the poscar order or content differs
        elements_string = '-'.join(sorted(set(system.symbols)))
        comp_directory = os.path.join(lib_directory, elements_string)
        if not os.path.isdir(comp_directory):
            os.makedirs(comp_directory)
        structure_file = os.path.join(comp_directory, entry_id + '.poscar')

        # Save poscar
        # NOTE(review): the downloaded text is saved as-is, not the normalized
        # system -- confirm this is intended.
        with open(structure_file, 'w') as f:
            f.write(poscar)
        print('Added', entry_id)

### Get reference structures

In [11]:
# Fetch OQMD structures for every element set, continuing past failures
for elements in elements_set:
    try:
        get_oqmd_structures(elements.split())
    except Exception as err:
        # Report the reason; KeyboardInterrupt is not caught so the loop can be stopped
        print('Failed:', elements, '-', repr(err))

Failed: Al H Ni
Added oqmd-347599
Added oqmd-323113
Added oqmd-326510
Added oqmd-1230982
Added oqmd-336968
Added oqmd-345030
Added oqmd-320544
Added oqmd-1104611
Added oqmd-1224010
Added oqmd-1221817
Added oqmd-1227496
Added oqmd-1234468
Added oqmd-428546
Added oqmd-1215454
Added oqmd-1215275
Added oqmd-1216080
Added oqmd-1214830
Added oqmd-1215721
Added oqmd-1214919
Added oqmd-1214652
Added oqmd-1215008
Added oqmd-1215186
Added oqmd-1215899
Added oqmd-1215097
Added oqmd-1214741
Added oqmd-1215632
Added oqmd-1215988
Added oqmd-1215543
Added oqmd-1215920
Added oqmd-1214851
Added oqmd-1214940
Added oqmd-1215742
Added oqmd-1214673
Added oqmd-1215296
Added oqmd-1215475
Added oqmd-1216101
Added oqmd-1215385
Added oqmd-1215118
Added oqmd-1215653
Added oqmd-1214762
Added oqmd-1216009
Added oqmd-1215564
Added oqmd-1215254
Added oqmd-1216059
Added oqmd-1215433
Added oqmd-1215789
Added oqmd-1215878
Added oqmd-1215700
Added oqmd-1214898
Added oqmd-1214987
Added oqmd-1214809
Added oqmd-1215076
Add

Added oqmd-1215398
Added oqmd-1215219
Added oqmd-1215041
Added oqmd-1215932
Added oqmd-1215308
Added oqmd-1214863
Added oqmd-1215487
Added oqmd-1215576
Added oqmd-1214774
Added oqmd-1214596
Added oqmd-1215843
Added oqmd-1216021
Added oqmd-1214507
Added oqmd-1215665
Added oqmd-1214952
Added oqmd-1215130
Added oqmd-675529
Added oqmd-676234
Added oqmd-1215715
Added oqmd-1215448
Added oqmd-1216074
Added oqmd-1215358
Added oqmd-1215269
Added oqmd-1214646
Added oqmd-1215893
Added oqmd-1214824
Added oqmd-1214913
Added oqmd-1215002
Added oqmd-1215091
Added oqmd-1214735
Added oqmd-1215626
Added oqmd-1215537
Added oqmd-1215982
Added oqmd-1215599
Added oqmd-1104666
Added oqmd-1234996
Added oqmd-324676
Added oqmd-349162
Added oqmd-1229742
Added oqmd-320642
Added oqmd-345128
Added oqmd-1220577
Added oqmd-1226256
Added oqmd-1233228
Added oqmd-427306
Added oqmd-1215688
Added oqmd-1215421
Added oqmd-1214797
Added oqmd-1215064
Added oqmd-1214975
Added oqmd-1214619
Added oqmd-1215242
Added oqmd-1215866


Added oqmd-1215851
Added oqmd-1215762
Added oqmd-1215940
Added oqmd-1215495
Failed: Ag Au Cu
Failed: Al Cu H
Added oqmd-1215750
Added oqmd-1217513
Added oqmd-1227979
Added oqmd-326940
Added oqmd-324802
Added oqmd-321396
Added oqmd-349288
Added oqmd-345882
Added oqmd-1231465
Added oqmd-1105041
Added oqmd-425542
Added oqmd-1224493
Added oqmd-1218299
Added oqmd-684912
Added oqmd-1214873
Added oqmd-1215853
Added oqmd-1215675
Added oqmd-1215408
Added oqmd-1216031
Added oqmd-1215229
Added oqmd-1214606
Added oqmd-1214784
Added oqmd-1215051
Added oqmd-1214962
Added oqmd-1215140
Added oqmd-1214695
Added oqmd-1215586
Added oqmd-1215942
Added oqmd-1215497
Added oqmd-1217542
Added oqmd-1228008
Added oqmd-318597
Added oqmd-325571
Added oqmd-344444
Added oqmd-319958
Added oqmd-1231494
Added oqmd-425572
Added oqmd-1103673
Added oqmd-1224522
Added oqmd-1218328
Added oqmd-1235829
Added oqmd-1217571
Added oqmd-323620
Added oqmd-347186
Added oqmd-1228037
Added oqmd-327581
Added oqmd-425601
Added oqmd-110

Added oqmd-1214844
Added oqmd-1214577
Added oqmd-1215557
Added oqmd-1215022
Added oqmd-1215315
Added oqmd-1214514
Added oqmd-1216028
Added oqmd-1215672
Added oqmd-1215226
Added oqmd-1215405
Added oqmd-1214603
Added oqmd-1214870
Added oqmd-1214959
Added oqmd-1214781
Added oqmd-1215137
Added oqmd-1215048
Added oqmd-1215583
Added oqmd-1214692
Added oqmd-1215850
Added oqmd-1215939
Added oqmd-1215494
Added oqmd-1215301
Added oqmd-1216106
Added oqmd-1215747
Added oqmd-1215480
Added oqmd-1215836
Added oqmd-1214678
Added oqmd-1215925
Added oqmd-1214945
Added oqmd-1214856
Added oqmd-1215034
Added oqmd-1215123
Added oqmd-1214767
Added oqmd-1215658
Added oqmd-1216014
Added oqmd-1215569
Added oqmd-319467
Added oqmd-343953
Added oqmd-1220347
Added oqmd-338225
Added oqmd-347556
Added oqmd-1229512
Added oqmd-327767
Added oqmd-323070
Added oqmd-427076
Added oqmd-1105868
Added oqmd-1226026
Added oqmd-1232998
Added oqmd-1222541
Added oqmd-348629
Added oqmd-324143
Added oqmd-325327
Added oqmd-349813
Adde

## 4. Run crystal_space_group calculations on the reference structures

In [12]:
# Load the 'crystal_space_group' calculation and the run directory named 'master_1'
calculation = iprPy.load_calculation('crystal_space_group')
run_directory = iprPy.load_run_directory('master_1')

In [13]:
# Text of the prepare input script; it is written to 'input_script.in' below.
# NOTE(review): keys left without a value presumably fall back to defaults -- confirm.
input_script = """

# Build load information based on prototype records
buildcombos                 crystalprototype load_file proto

# Build load information based on reference structures
buildcombos                 atomicreference load_file ref

# Specify reference buildcombos limiters (only build for element sets listed)
#ref_elements                Ag

# Units that input/output values are in
length_unit                 
pressure_unit               
energy_unit                 
force_unit                  

# Run parameters
symmetryprecision           
primitivecell               
idealcell                   
"""
with open('input_script.in', 'w') as f:
    f.write(input_script)

In [14]:
calculation = iprPy.load_calculation('crystal_space_group')

# Parse the saved input script into keyword arguments for prepare
with open('input_script.in') as script_file:
    input_dict = iprPy.input.parse(script_file, singularkeys=calculation.singularkeys)

# Prepare the calculations in the run directory, then report record status
database.prepare(run_directory, calculation, **input_dict)
database.check_records(calculation.record_style)

In database style local at C:\Users\lmh1\Documents\calculations\ipr\master :
- 4829 of style calculation_crystal_space_group
 - 0 are complete
 - 4829 still to run
 - 0 issued errors


In [15]:
# Start a runner on the run directory, then report record status
database.runner(run_directory)
database.check_records(calculation.record_style)

Runner started with pid 12292
No simulations left to run
In database style local at C:\Users\lmh1\Documents\calculations\ipr\master :
- 4829 of style calculation_crystal_space_group
 - 4829 are complete
 - 0 still to run
 - 0 issued errors


## 5. Match prototypes to references

### Get results and split into reference and prototype

In [13]:
# Retrieve the records of the calculation's record style from the database
results_df = database.get_records_df(style=calculation.record_style, full=True, flat=True)

In [14]:
# References are the records loaded from "<family>.poscar";
# prototypes are the records loaded from "<family>.json"
poscar_names = results_df.family + '.poscar'
json_names = results_df.family + '.json'
reference_df = results_df[results_df.load_file == poscar_names].reset_index()
prototype_df = results_df[results_df.load_file == json_names].reset_index()

### Match based on space group information

In [17]:
# Reuse the match table saved by a previous run, if there is one
try:
    match_df = pd.read_csv('reference_prototype_match.csv')
except (IOError, OSError, ValueError):
    # Missing, unreadable, empty or unparsable cache file: rebuild below
    match_df = pd.DataFrame()

# Rebuild when the saved table does not have one row per reference structure
if len(reference_df) != len(match_df):

    # Wykoff letter sets accepted as equivalent for each known prototype
    prototype_wykoffs = [
        ('A1--Cu--fcc', ['a', 'b']),
        ('A2--W--bcc', ['a']),
        ('A3--Mg--hcp', ['b', 'c', 'd']),
        ("A3'--alpha-La--double-hcp", ['a b', 'a c', 'a d']),
        ('A4--C--dc', ['a', 'b']),
        ('A5--beta-Sn', ['a', 'b']),
        ('A6--In--bct', ['a', 'b']),
        ('A7--alpha-As', ['c']),
        ('Ah--alpha-Po--sc', ['a', 'b']),
        ('A15--beta-W', ['a c', 'a d']),
    ]

    # Match based on Pearson symbol and space group number
    records = []
    for reference in reference_df.itertuples():
        # family has the form "<site>-<number>", e.g. "mp-124"
        site, number = reference.family.split('-')

        matches = prototype_df[((reference.pearson_symbol == prototype_df.pearson_symbol)
                               &(reference.spacegroup_number == prototype_df.spacegroup_number))]
        if len(matches) == 1:
            prototype = matches.iloc[0].family
        elif len(matches) == 0:
            prototype = np.nan
        else:
            prototype = 'multiple'

        # ref_wykoff is recorded for every row so the column always exists;
        # it is only consulted for rows whose prototype is a known name
        records.append({
            'reference': reference.family,
            'site': site,
            'number': int(number),
            'prototype': prototype,
            'ref_wykoff': reference.wykoff_letters,
        })

    # Explicit columns keep the frame usable even when there are no references
    match_df = pd.DataFrame(records, columns=['reference', 'site', 'number',
                                              'prototype', 'ref_wykoff'])

    # Check known equivalent Wykoff sites for prototypes: drop the match when
    # the reference's Wykoff letters are not among the accepted ones
    for prototype, wykoffs in prototype_wykoffs:
        mismatch = (match_df.prototype == prototype) & (~match_df.ref_wykoff.isin(wykoffs))
        match_df.loc[mismatch, 'prototype'] = np.nan

    # Sort and remove excess information
    match_df = match_df.sort_values(['site', 'number']).reset_index()[['reference', 'prototype']]

    # Identify reference compositions
    refdir = os.path.join(iprPy.rootdir, '..', 'library', 'ref')

    # Object dtype so that composition strings can be assigned into the column
    match_df['composition'] = pd.Series(np.nan, index=match_df.index, dtype=object)
    for fname in glob.iglob(os.path.join(refdir, '*', '*.poscar')):
        reference = os.path.splitext(os.path.basename(fname))[0]
        with open(fname) as f:
            lines = f.readlines()

        # Symbols are read from the sixth line of the file, counts from the seventh
        symbols = lines[5].split()
        counts = list(np.array(lines[6].split(), dtype=int))
        composition = iprPy.analysis.composition(symbols, counts)
        match_df.loc[match_df.reference==reference, 'composition'] = composition

    # Save
    match_df.to_csv('reference_prototype_match.csv', index=False)

In [18]:
# Display the reference / prototype / composition match table
match_df

Unnamed: 0,reference,prototype,composition
0,mp-124,A1--Cu--fcc,Ag
1,mp-8566,A3'--alpha-La--double-hcp,Ag
2,mp-10597,A3--Mg--hcp,Ag
3,mp-989737,,Ag
4,oqmd-10125,A3--Mg--hcp,Ag
5,oqmd-11876,A3'--alpha-La--double-hcp,Ag
6,oqmd-676271,A1--Cu--fcc,Ag
7,oqmd-1214591,,Ag
8,oqmd-1214680,,Ag
9,oqmd-1214769,,Ag
