# Reference atomic structures

This Notebook gets reference atomic crystal structures from various external sites and compares them to known crystal prototype structures.

**Library imports**

In [1]:
# Standard Python libraries
from __future__ import (print_function, division, absolute_import,
                        unicode_literals)
import os
import glob

# http://www.numpy.org/
import numpy as np

# https://pandas.pydata.org/
import pandas as pd

# https://github.com/usnistgov/atomman
import atomman as am
import atomman.lammps as lmp

# https://github.com/usnistgov/iprPy
import iprPy
print('iprPy version', iprPy.__version__)

iprPy version 0.8.0


## 1. Build unique sets of elements from included potentials

### Load database

In [2]:
database = iprPy.load_database('master')
run_directory = iprPy.load_database('master_1')

### Get all elements lists from potentials

In [3]:
elements_set = set()
for potential_record in database.get_records(style='potential_LAMMPS'):
    potential = lmp.Potential(potential_record.content)
    elements = potential.elements()
    elements.sort()
    elements_set.add(' '.join(elements))

## 2. Fetch Materials Project reference structures

### Define get_mp_structures() function

In [4]:
def get_mp_structures(elements, api_key=None, lib_directory=None):
    """
    Accesses the Materials Project and downloads all structures for a list of
    elements as poscar files.
    
    Parameters
    ----------
    elements : list
        A list of element symbols.
    api_key : str, optional
        The user's Materials Project API key. If not given, will use "MAPI_KEY"
        environment variable
    lib_directory : str
        Path to the lib_directory to save the poscar files to.  Default uses
        the iprPy library/dft_structures directory.
    """
    # Function-specific imports
    import pymatgen as pmg
    from pymatgen.ext.matproj import MPRester
    
    # Define subset generator
    def subsets(fullset):
        for i, item in enumerate(fullset):
            yield [item]
            if len(fullset) > 1:
                for subset in subsets(fullset[i+1:]):
                    yield [item] + subset
    
    # Handle lib_directory
    if lib_directory is None:
        lib_directory = os.path.join(os.path.dirname(iprPy.rootdir), 'library', 'ref')
    lib_directory = os.path.abspath(lib_directory)
    
    elements.sort()
    
    # Open connection to Materials Project
    with MPRester(api_key) as m:
        
        # Loop over subsets of elements
        for subelements in subsets(elements):
            
            # Set comp_directory
            elements_string = '-'.join(subelements)
            comp_directory = os.path.join(lib_directory, elements_string)
            if not os.path.isdir(comp_directory):
                os.makedirs(comp_directory)
            
            # Build list of downloaded entries
            have = []
            for fname in glob.iglob(os.path.join(comp_directory, 'mp-*.poscar')):
                have.append(os.path.splitext(os.path.basename(fname))[0])
            #print('Have', len(have), elements_string, 'records')
            
            # Query MP for all entries corresponding to the elements
            entries = m.query({"elements": subelements}, ["material_id"])
            
            # Add entries to the list if not there
            missing = []
            for entry in entries:
                if entry['material_id'] not in have and entry['material_id'] not in missing:
                    missing.append(entry['material_id'])
            #print('Missing', len(missing), elements_string, 'records')
            
            # Download missing entries
            entries = m.query({"material_id": {"$in": missing}}, ['material_id', 'cif'])
            
            # Convert cif to poscar and save
            for entry in entries:
                struct = pmg.Structure.from_str(entry['cif'], fmt='cif')
                struct = pmg.symmetry.analyzer.SpacegroupAnalyzer(struct).get_conventional_standard_structure()
                system = am.load('pymatgen_Structure', struct)
                system = system.normalize()
                structure_file = os.path.join(comp_directory, entry['material_id']+'.poscar')
                system.dump('poscar', f=structure_file)
                print('Added', entry['material_id'])

### Get reference structures

In [6]:
mp_api_key_location = 'C:\\users\\lmh1\\Documents\\Materials Project\\API key.txt'

with open(mp_api_key_location) as f:
    mp_api_key = f.read()

for elements in elements_set:
    get_mp_structures(elements.split(), api_key=mp_api_key)

KeyboardInterrupt: 

## 3. Fetch OQMD reference structures

### Define get_oqmd_structures() function

In [7]:
def get_oqmd_structures(elements, lib_directory=None):
    """
    Accesses the Materials Project and downloads all structures for a list of
    elements as poscar files.
    
    Parameters
    ----------
    elements : list
        A list of element symbols.
    lib_directory : str
        Path to the lib_directory to save the poscar files to.  Default uses
        the iprPy library/dft_structures directory.
    """
    # Function-specific imports
    import requests
    
    # Define subset generator
    def subsets(fullset):
        for i, item in enumerate(fullset):
            yield [item]
            if len(fullset) > 1:
                for subset in subsets(fullset[i+1:]):
                    yield [item] + subset
    
    # Get default lib_directory
    if lib_directory is None:
        lib_directory = os.path.join(os.path.dirname(iprPy.rootdir), 'library', 'ref')
    lib_directory = os.path.abspath(lib_directory)
    
    # Set comp_directory
    elements.sort()
    have = []
    for subelements in subsets(elements):
        elements_string = '-'.join(subelements)
        comp_directory = os.path.join(lib_directory, elements_string)
        if not os.path.isdir(comp_directory):
            os.makedirs(comp_directory)
        
        # Build list of downloaded entries
        for fname in glob.iglob(os.path.join(comp_directory, 'oqmd-*.poscar')):
            have.append(os.path.splitext(os.path.basename(fname))[0])
    #print('Have', len(have), 'records')
    
    # Build list of missing OQMD entries
    elements_string = '-'.join(elements)
    
    composition_r = requests.get('http://oqmd.org/materials/composition/' + elements_string)
    composition_html = composition_r.text
    
    missing = []
    count = 0
    while True:
        count += 1
        try:
            start = composition_html.index('href="/materials/entry/') + len('href="/materials/entry/')
        except:
            break
        else:
            end = start + composition_html[start:].index('">')
            entry_number = composition_html[start:end]
            composition_html = composition_html[end+2:]
            entry_id = 'oqmd-'+entry_number
            if entry_id not in have and entry_id not in missing:
                missing.append(entry_id)
        if count > 100:
            raise ValueError('Loop likely infinite')
    #print('Missing', len(missing), 'records')
    
    # Download missing entries
    for entry_id in missing:
        entry_number = entry_id.replace('oqmd-', '')
        entry_r = requests.get('http://oqmd.org/materials/entry/' + entry_number)
        entry_html = entry_r.text
        
        start = entry_html.index('href="/materials/structure/') + len('href="/materials/structure/')
        end = start + entry_html[start:].index('">')
        structure_number = entry_html[start:end]
        
        try:
            structure_url = 'http://oqmd.org/materials/export/conventional/poscar/' + structure_number
            structure_r = requests.get(structure_url)
            structure_r.raise_for_status()
        except:
            try:
                structure_url = 'http://oqmd.org/materials/export/primitive/poscar/' + structure_number
                structure_r = requests.get(structure_url)
                structure_r.raise_for_status()
            except:
                continue
        
        # Save poscar
        poscar = structure_r.text
        system = am.load('poscar', poscar)
        system = system.normalize()
        elements_string = '-'.join(system.symbols)
        structure_file = os.path.join(lib_directory, elements_string, entry_id + '.poscar')
        
        with open(structure_file, 'w') as f:
            f.write(poscar)
        print('Added', entry_id)

### Get reference structures

In [8]:
for elements in elements_set:
    try:
        get_oqmd_structures(elements.split())
    except:
        print('Failed:', elements)

Failed: C H O
Failed: Al Cu Fe Mg Si
Added oqmd-10260
Added oqmd-20106
Added oqmd-300994
Added oqmd-311536
Added oqmd-302817
Added oqmd-307103
Added oqmd-313359
Failed: Ni Ti V
Failed: Cd Hg S Se Te Zn
Added oqmd-3167
Added oqmd-20737
Added oqmd-3721
Added oqmd-358132
Added oqmd-306404
Added oqmd-352803
Added oqmd-300866
Added oqmd-301423
Added oqmd-311408
Added oqmd-311965
Failed: meta_TWIP
Added oqmd-647437
Added oqmd-26737
Added oqmd-15976
Added oqmd-15977
Added oqmd-304812
Added oqmd-298224
Added oqmd-22454
Added oqmd-308758
Added oqmd-304353
Added oqmd-314895
Added oqmd-5286
Added oqmd-2065
Added oqmd-5012
Added oqmd-2392
Added oqmd-3214
Added oqmd-24931
Added oqmd-3213
Added oqmd-11536
Added oqmd-11535
Added oqmd-6967
Added oqmd-24787
Added oqmd-3448
Added oqmd-24290
Added oqmd-3447
Added oqmd-6968
Added oqmd-5346
Added oqmd-3971
Added oqmd-676672
Failed: C Fe Ti
Added oqmd-299733
Added oqmd-303985
Added oqmd-310275
Added oqmd-305561
Added oqmd-314527
Added oqmd-30265
Added oqmd-

## 4. Run crystal_space_group calculations on the reference structures

In [9]:
calculation = iprPy.load_calculation('crystal_space_group')

input_dict = {}
input_dict['buildcombos'] = ['crystalprototype load_file', 
                             'atomicreference load_file']

database.prepare(run_directory, calculation, **input_dict)
database.check_records(calculation.record_style)

In database style local at C:\Users\lmh1\Documents\calculations\ipr\master :
- 1823 of style calculation_crystal_space_group
 - 0 are complete
 - 1823 still to run
 - 0 issued errors


In [10]:
database.runner(run_directory)

Runner started with pid 3012
No simulations left to run


In [11]:
database.check_records(calculation.record_style)

In database style local at C:\Users\lmh1\Documents\calculations\ipr\master :
- 1823 of style calculation_crystal_space_group
 - 1823 are complete
 - 0 still to run
 - 0 issued errors


## 5. Match prototypes to references

### Get results and split into reference and prototype

In [12]:
results_df = database.get_records_df(style=calculation.record_style, full=True, flat=True)


In [13]:
reference_df = results_df[results_df.family+'.poscar'==results_df.load_file].reset_index()
prototype_df = results_df[results_df.family+'.json'==results_df.load_file].reset_index()

### Match based on space group information

In [14]:
try:
    match_df = pd.read_csv('reference_prototype_match.csv')
except:
    match_df = pd.DataFrame()

if len(reference_df) != len(match_df):
    # Match based on Pearson symbol and space group number
    match_df = []
    for reference in reference_df.itertuples():
        match_dict = {}
        match_dict['reference'] = reference.family
        match_dict['site'], match_dict['number'] = reference.family.split('-')
        match_dict['number'] = int(match_dict['number'])
        matches = prototype_df[((reference.pearson_symbol == prototype_df.pearson_symbol)
                               &(reference.spacegroup_number == prototype_df.spacegroup_number))]
        if len(matches) == 1:
            match_dict['prototype'] = matches.iloc[0].family
            match_dict['ref_wykoff'] = reference.wykoff_letters
        elif len(matches) == 0:
            match_dict['prototype'] = np.nan
        else:
            match_dict['prototype'] = 'multiple'
        match_df.append(match_dict)
    match_df = pd.DataFrame(match_df)

    # Check known equivalent Wykoff sites for prototypes
    match_df.loc[(match_df.prototype=='A1--Cu--fcc') & (~match_df.ref_wykoff.isin(['a', 'b'])),
                 'prototype'] = np.nan
    match_df.loc[(match_df.prototype=='A2--W--bcc') & (~match_df.ref_wykoff.isin(['a'])),
                 'prototype'] = np.nan
    match_df.loc[(match_df.prototype=='A3--Mg--hcp') & (~match_df.ref_wykoff.isin(['b', 'c', 'd'])),
                 'prototype'] = np.nan
    match_df.loc[(match_df.prototype=="A3'--alpha-La--double-hcp") & (~match_df.ref_wykoff.isin(['a b', 'a c', 'a d'])),
                 'prototype'] = np.nan
    match_df.loc[(match_df.prototype=='A4--C--dc') & (~match_df.ref_wykoff.isin(['a', 'b'])),
                 'prototype'] = np.nan
    match_df.loc[(match_df.prototype=='A5--beta-Sn') & (~match_df.ref_wykoff.isin(['a', 'b'])),
                 'prototype'] = np.nan
    match_df.loc[(match_df.prototype=='A6--In--bct') & (~match_df.ref_wykoff.isin(['a', 'b'])),
                 'prototype'] = np.nan
    match_df.loc[(match_df.prototype=='A7--alpha-As') & (~match_df.ref_wykoff.isin(['c'])),
                 'prototype'] = np.nan
    match_df.loc[(match_df.prototype=='Ah--alpha-Po--sc') & (~match_df.ref_wykoff.isin(['a', 'b'])),
                 'prototype'] = np.nan
    match_df.loc[(match_df.prototype=='A15--beta-W') & (~match_df.ref_wykoff.isin(['a c', 'a d'])),
                 'prototype'] = np.nan

    match_df = match_df.sort_values(['site', 'number']).reset_index()[['reference', 'prototype']]
    match_df.to_csv('reference_prototype_match.csv', index=False)