# Generate webpage content for calculation_point_defect_formation records

This Notebook is designed for reading finished calculation_point_defect_formation records and generating the associated webpage content.

#### Library imports

In [1]:
# Standard Python libraries
from __future__ import print_function
import glob
import os
from collections import OrderedDict
from copy import deepcopy

from IPython.core.display import display, HTML

# pandas.pydata.org
import pandas as pd

# http://www.numpy.org/
import numpy as np

# https://github.com/usnistgov/DataModelDict
from DataModelDict import DataModelDict as DM

# https://github.com/usnistgov/atomman
import atomman as am
import atomman.unitconvert as uc

# https://github.com/usnistgov/iprPy
import iprPy

import analysis

#### Plotting library imports

In [2]:
# https://bokeh.pydata.org/
import bokeh
from bokeh.plotting import figure, output_file, show
from bokeh.embed import components
from bokeh.resources import Resources, CDN
from bokeh.io import output_notebook
from bokeh.models import Range1d
# Record the installed bokeh version in the cell output
print('bokeh version =', bokeh.__version__)
# Direct bokeh output to the notebook
output_notebook()

bokeh version = 0.12.7


## 1. Read Calculation Data

This section reads in the raw data associated with the calculation from a database.

### 1.1 Initialize database

- __dbasename__ is used here to predefine different dbase settings
- __dbase__ is the iprPy.Database object to use for accessing a database

In [3]:
# Name of the predefined database configuration to use (see the branches below)
dbasename = 'iprhub'

# NOTE(review): the hosts, user names and credential locations below are
# hard-coded for one workstation; consider moving them to a config file or
# environment variables.

# 'local' is a local directory
# (raw strings keep the Windows backslashes literal; '\U' is an invalid
#  escape sequence in a normal Python 3 string)
if   dbasename == 'local':
    dbase = iprPy.Database('local',   host=r'C:\Users\lmh1\Documents\calculations\ipr\library')

# 'test' is a local directory for testing
# (must be elif: as a separate if, choosing 'local' fell through to the
#  final else and raised ValueError)
elif dbasename == 'test':
    dbase = iprPy.Database('local',   host=r'C:\Users\lmh1\Documents\calculations\ipr\library_test')

# 'curator' is a local MDCS curator
elif dbasename == 'curator':
    dbase = iprPy.Database('curator', host='http://127.0.0.1:8000/', 
                                      user='admin', 
                                      pswd='admin')

# 'iprhub' is the remote MDCS curator at iprhub
# (pswd is given as a file path here -- presumably iprPy reads the password
#  from that file; TODO confirm)
elif dbasename == 'iprhub':
    dbase = iprPy.Database('curator', host='https://iprhub.nist.gov/', 
                                      user='lmh1',
                                      pswd='C:/users/lmh1/documents/iprhub/iprhub_password.txt',
                                      cert='C:/users/lmh1/documents/iprhub/iprhub-ca.pem')
else:
    raise ValueError('unknown dbasename ' + dbasename)

### 1.2 Access records

In [4]:
# Pull every crystal_prototype record into a DataFrame and report the total
proto_df = dbase.get_records_df(style='crystal_prototype')
print('{0} prototype records loaded'.format(len(proto_df)))

19 prototype records loaded


In [5]:
# Pull every potential_LAMMPS record into a DataFrame and report the total
pot_df = dbase.get_records_df(style='potential_LAMMPS')
print('{0} potential records loaded'.format(len(pot_df)))

156 potential records loaded


In [6]:
# Pull every calculation_point_defect_formation record into a DataFrame and report the total
raw_df = dbase.get_records_df(style='calculation_point_defect_formation')
print('{0} calculation records loaded'.format(len(raw_df)))

5650 calculation records loaded


### 1.3 Check errors

In [7]:
# Print each distinct error message once, separated by a blank line.
# np.unique de-duplicates (and sorts) the non-null entries of the error column.
if 'error' in raw_df:
    has_error = pd.notnull(raw_df.error)
    for message in np.unique(raw_df.error[has_error]):
        print(message, end='\n\n')

Traceback (most recent call last):
  File "calc_point_defect_static.py", line 407, in <module>
    main(*sys.argv[1:])
  File "calc_point_defect_static.py", line 54, in main
    dmax = input_dict['maxatommotion'])
  File "calc_point_defect_static.py", line 218, in pointdefect
    system_ptd = lmp.atom_dump.load(last_dump_file)
  File "c:\users\lmh1\documents\python-packages\atomman\atomman\lammps\atom_dump.py", line 205, in load
    raise ValueError('Failed to properly load dump file '+str(data)[:50])
ValueError: Failed to properly load dump file atom.10000

Traceback (most recent call last):
  File "calc_point_defect_static.py", line 407, in <module>
    main(*sys.argv[1:])
  File "calc_point_defect_static.py", line 54, in main
    dmax = input_dict['maxatommotion'])
  File "calc_point_defect_static.py", line 218, in pointdefect
    system_ptd = lmp.atom_dump.load(last_dump_file)
  File "c:\users\lmh1\documents\python-packages\atomman\atomman\lammps\atom_dump.py", line 205, in 

## 2. Process Data

This section processes and refines the data.

### 2.1 Identify composition

We need to identify the composition of each calculation so that we can collect duplicates and filter out artificial compounds.

- __counts__ is a dictionary counting the number of times each atype appears in a crystal prototype's unit cell (i.e. the number of symmetry equivalent sites)

In [8]:
# Map each prototype id to the number of times each distinct 'component'
# value occurs in that prototype's record
counts = {}
for _, proto in proto_df.iterrows():
    record = dbase.get_record(name=proto.id, style='crystal_prototype')
    components = DM(record.content).finds('component')
    # np.unique(..., return_counts=True) returns (values, counts); keep the counts
    _, occurrences = np.unique(components, return_counts=True)
    counts[proto.id] = occurrences

- __comp_refine()__ takes a list of symbols and a count of how many times each symbol appears in a structure and generates a composition string.

In [9]:
def comp_refine(symbols, counts):
    """
    Build a reduced composition string from symbols and their counts.

    The counts are divided by their greatest common divisor, then the symbols
    are concatenated in sorted order, each followed by its reduced count.
    A count of 1 is not written, and symbols whose count is not positive are
    left out of the string.

    Parameters
    ----------
    symbols : sequence of str
        The symbols to combine.
    counts : sequence of int
        counts[i] is the count associated with symbols[i]; must be at least
        as long as symbols (IndexError otherwise).

    Returns
    -------
    str
        The composition string, e.g. symbols=['Ni', 'Al'] with counts=[6, 2]
        gives 'AlNi3'. Empty input gives ''.
    """
    # Pair every symbol with its count. If a symbol is repeated, the count
    # of its last occurrence is the one that is kept.
    sym_dict = {}
    for i, symbol in enumerate(symbols):
        sym_dict[symbol] = counts[i]

    # Greatest common divisor of all counts (Euclid's algorithm, written out
    # so that no version-specific gcd import is needed). Starting from 0
    # makes the first count its own gcd and leaves 0 for empty input.
    divisor = 0
    for value in sym_dict.values():
        a, b = divisor, value
        while b:
            a, b = b, a % b
        divisor = a

    # Floor division keeps the counts integral on both Python 2 and 3
    if divisor > 1:
        for key in sym_dict:
            sym_dict[key] //= divisor

    composition = ''
    for key in sorted(sym_dict):
        if sym_dict[key] > 0:
            composition += key
            if sym_dict[key] != 1:
                composition += str(sym_dict[key])

    return composition

In [10]:
# Build a composition string for every calculation from its symbols and the
# counts of its prototype family, then attach the result as a new column
compositions = [comp_refine(calc.symbols, counts[calc.family])
                for _, calc in raw_df.iterrows()]
raw_df = raw_df.assign(composition=compositions)

In [11]:
# Inspect the column names available in raw_df
raw_df.keys()

Index([u'E_f', u'LAMMPS_version', u'calc_key', u'calc_script',
       u'centrosummation', u'db_vect_shift', u'energytolerance', u'error',
       u'family', u'forcetolerance', u'iprPy_version', u'load_file',
       u'load_options', u'load_style', u'maxatommotion', u'maxevaluations',
       u'maxiterations', u'natoms', u'pointdefect_id', u'pointdefect_key',
       u'position_shift', u'potential_LAMMPS_id', u'potential_LAMMPS_key',
       u'potential_id', u'potential_key', u'reconfigured', u'sizemults',
       u'status', u'symbols', u'composition'],
      dtype='object')

In [15]:
# Exploratory subset: vacancy calculations in the A1--Cu--fcc family whose
# composition string is 'Al'
in_fcc = raw_df.family == 'A1--Cu--fcc'
is_vacancy = raw_df.pointdefect_id == 'A1--Cu--fcc--vacancy'
is_aluminum = raw_df.composition == 'Al'
Al_df = raw_df[in_fcc & is_vacancy & is_aluminum]

In [21]:
# Exploratory check: energytolerance value stored on the first record
raw_df.iloc[0].energytolerance

0.0

In [18]:
# Exploratory check: compare E_f and reconfigured across implementations, ordered by implementation id
Al_df[[u'potential_LAMMPS_id', u'E_f', u'reconfigured']].sort_values(u'potential_LAMMPS_id')

Unnamed: 0,potential_LAMMPS_id,E_f,reconfigured
1513,1995--Angelo-J-E--Ni-Al-H--LAMMPS--ipr1,0.524655,False
4425,1995--Angelo-J-E--Ni-Al-H--LAMMPS--ipr1,0.524655,False
2367,1996--Farkas-D--Nb-Ti-Al--LAMMPS--ipr1,1.168996,False
3305,1996--Farkas-D--Nb-Ti-Al--LAMMPS--ipr1,1.168996,False
209,1997--Liu-X-Y--Al-Mg--LAMMPS--ipr1,2.6e-05,False
3650,1997--Liu-X-Y--Al-Mg--LAMMPS--ipr1,0.688162,False
1561,1997--Liu-X-Y--Al-Mg--LAMMPS--ipr1,0.688162,False
4402,1997--Liu-X-Y--Al-Mg--LAMMPS--ipr1,,
4128,1998--Liu-X-Y--Al-Mg--LAMMPS--ipr1,0.681849,False
4925,1998--Liu-X-Y--Al-Mg--LAMMPS--ipr1,0.681849,False


### 2.2 Identify current ipr potentials 

In [12]:
# Extract versionstyle and versionnumber from potential implementation ids.
# The version tag is the last '--'-delimited field of the id (e.g. 'ipr1'):
# its trailing digits are the version number, whatever precedes them is the
# style. All trailing digits are used, so multi-digit versions such as
# 'ipr10' parse as ('ipr', 10) rather than ('ipr1', 0).
versionstyle = []
versionnumber = []
for name in pot_df['id'].values:
    version = name.split('--')[-1]
    style = version.rstrip('0123456789')
    digits = version[len(style):]
    versionstyle.append(style)
    # Tags with no trailing number get NaN as their version number
    versionnumber.append(int(digits) if digits else np.nan)

pot_df['versionstyle'] = versionstyle
pot_df['versionnumber'] = versionnumber

# Loop through unique potential id's and keep, for each, the id of the
# highest-numbered 'ipr' implementation (if it has one)
includeid = []
for pot_id in np.unique(pot_df.pot_id.values):
    check_df = pot_df[pot_df.pot_id == pot_id]
    check_df = check_df[check_df.versionstyle == 'ipr']
    check_df = check_df[check_df.versionnumber == check_df.versionnumber.max()]
    if len(check_df) == 1:
        includeid.append(check_df['id'].values[0])
    elif len(check_df) > 1:
        # Two implementations sharing the same highest version is ambiguous
        raise ValueError('Bad currentIPR check for '+pot_id)

# Identify current IPR potentials
raw_df['currentIPR'] = raw_df.potential_LAMMPS_id.isin(includeid)

### 2.4 Remove unwanted calculations

Here is where we filter out unwanted entries (i.e. rows).

- __df__ is the dataframe during/after processing and refining

In [13]:
# Inspect the column names again now that 'currentIPR' has been added
raw_df.keys()

Index([u'E_f', u'LAMMPS_version', u'calc_key', u'calc_script',
       u'centrosummation', u'db_vect_shift', u'energytolerance', u'error',
       u'family', u'forcetolerance', u'iprPy_version', u'load_file',
       u'load_options', u'load_style', u'maxatommotion', u'maxevaluations',
       u'maxiterations', u'natoms', u'pointdefect_id', u'pointdefect_key',
       u'position_shift', u'potential_LAMMPS_id', u'potential_LAMMPS_key',
       u'potential_id', u'potential_key', u'reconfigured', u'sizemults',
       u'status', u'symbols', u'composition', u'currentIPR'],
      dtype='object')

In [14]:
# Work on a copy so raw_df stays untouched and this cell can be re-run
df = deepcopy(raw_df)

# Ignore unfinished or error calculations
df = df[df.status == 'finished']

# Ignore any implementations that are not current IPR implementations
df = df[df.currentIPR]

# Ignore false compounds (where # of unique symbols != # of symbols)
no_repeats = df.symbols.apply(lambda x: len(np.unique(x)) == len(x))
df = df[no_repeats.astype(bool)]

# Ignore duplicate compounds: for every record, any LATER record with the
# same potential, family, composition and point defect whose E_f agrees to
# within 1e-6 (absolute) is marked to be dropped, so the first one is kept.
ignore = set()
for i in range(len(df)):
    row = df.iloc[i]          # looked up once per iteration
    trunc = df.iloc[i+1:]
    matches = trunc.calc_key[  (trunc.potential_id == row.potential_id) 
                             & (trunc.family == row.family) 
                             & (trunc.composition == row.composition)
                             & (trunc.pointdefect_id == row.pointdefect_id)
                             & np.isclose(trunc.E_f, row.E_f, atol=1e-6, rtol=0.0)
                            ].tolist()
    ignore.update(matches)
df = df[~df.calc_key.isin(ignore)]

df = df.reset_index(drop=True)
print(str(len(df)) + ' records after filtering')

2903 records after filtering


### 2.5 Filter out extra data

Here, we limit the DataFrame to only the data that we care about (i.e. columns).

- __headers__ gives the list of data columns from raw_df to include in df and how they should be renamed.

In [15]:
#                        raw names       new names
headers = OrderedDict([ ('potential_id', 'potential'  ),
                        ('family',       'family'     ),
                        ('composition',  'composition'),
                        ('pointdefect_id',   'pointdefect'),
                        ('E_f',     'E_f'),
                        ('reconfigured', 'reconfigured'),
                      ])

df = pd.DataFrame(df, columns=headers.keys())
df.rename(columns=headers, inplace=True)
df

Unnamed: 0,potential,family,composition,pointdefect,E_f,reconfigured
0,1989--Adams-J-B--Au,A1--Cu--fcc,Au,A1--Cu--fcc--octahedral-interstitial,2.635993e+00,False
1,1997--Liu-X-Y--Al-Mg,A3--Mg--hcp,Mg,A3--Mg--hcp--basal-octahedral-interstitial,1.533866e+00,True
2,2007--Mendelev-M-I--Zr-2,A3--Mg--hcp,Zr,A3--Mg--hcp--basal-tetrahedral-interstitial,2.863770e+00,True
3,2006--Sun-D-Y--Mg,A3--Mg--hcp,Mg,A3--Mg--hcp--basal-tetrahedral-interstitial,1.935852e+00,True
4,2013--Marinica-M-C--W-3,A2--W--bcc,W,A2--W--bcc--111-dumbbell,1.015395e+01,False
5,2015--Purja-Pun-G-P--Al-Co,A1--Cu--fcc,Al,A1--Cu--fcc--110-dumbbell,2.906437e+00,False
6,2008--Fortini-A--Ru,A3--Mg--hcp,Ru,A3--Mg--hcp--basal-tetrahedral-interstitial,4.908266e+00,True
7,2016--Zhang-Y--Ni-Nb,A1--Cu--fcc,Ni,A1--Cu--fcc--111-dumbbell,4.603151e+00,False
8,2015--Purja-Pun-G-P--Ni-Al-Co,A1--Cu--fcc,Ni,A1--Cu--fcc--2nn-divacancy,5.359408e-300,False
9,2013--Hale-L-M--Pd-Ag-H-Morse,A1--Cu--fcc,Ag,A1--Cu--fcc--1nn-divacancy,2.090427e+00,False


## 3. HTML Tables

This section takes the processed data and generates per_potential html tables.

In [16]:
# Read the HTML fragment that is prepended to every generated page
# and render it in the notebook
html_info_file = 'html info.html'
with open(html_info_file) as f:
    html_info = f.read()
display(HTML(html_info))

In [20]:
# Path to the HTML file whose contents are loaded into table_style
table_style_file = '../calculation_system_relax/webtablestyle.html'

In [21]:
# <script> tag that pulls in analysis/showSelection.js; used when previewing tables in the notebook
showSelectionScript = '<script type="text/javascript" src="analysis/showSelection.js"></script>\n'

In [22]:
# Load the contents of table_style_file as a string
with open(table_style_file) as f:
    table_style = f.read() 

In [23]:
def gen_ptd_table(df, potential):
    """
    Generate the point-defect HTML content for one potential.

    One HTML table is built for every (composition, family) combination found
    among the rows of df belonging to the given potential. Each table lists
    the point defect name (with the leading '<family>--' text removed), E_f
    passed through uc.get_in_units(..., 'eV') and the reconfigured flag,
    ordered by increasing E_f.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have the columns potential, composition, family, pointdefect,
        E_f and reconfigured.
    potential : str
        Value of the potential column to select.

    Returns
    -------
    str
        'Select structure: ' followed by whatever
        analysis.showSelection('Point', tables) returns, where tables maps
        '<composition> <family>' to the table's HTML.
    """
    # Source column -> displayed column title (order sets the column order)
    column_titles = OrderedDict()
    column_titles['pointdefect'] = 'Point Defect'
    column_titles['E_f'] = '<i>E<sub>f</sub></i> (eV)'
    column_titles['reconfigured'] = 'Reconfigured'

    def float_fmt(value):
        """Render floats with four decimals in a width-8 field."""
        return '%8.4f' % value

    selected = df[df.potential == potential]
    tables = OrderedDict()

    for composition in np.unique(selected.composition):
        by_composition = selected[selected.composition == composition]

        for family in np.unique(by_composition.family):
            ordered = by_composition[by_composition.family == family].sort_values('E_f')
            table_df = pd.DataFrame(ordered, columns=column_titles.keys())

            table_df.E_f = uc.get_in_units(table_df.E_f, 'eV')

            # Drop the '<family>--' text so only the defect's own name is shown
            prefix = family + '--'
            table_df.pointdefect = [name.replace(prefix, '')
                                    for name in table_df.pointdefect]

            table_df = table_df.rename(columns=column_titles).reset_index(drop=True)

            # escape=False keeps the HTML markup in the column titles intact
            tables[composition + ' ' + family] = table_df.to_html(
                index=False,
                float_format=float_fmt,
                escape=False,
                classes='datatable',
            )

    return 'Select structure: ' + analysis.showSelection('Point', tables)

In [25]:
# Preview the generated tables for a single potential inside the notebook.
# The script tag and table style are prepended here so the preview renders
# and behaves on its own.
html = showSelectionScript+table_style
html += gen_ptd_table(df, '2009--Purja-Pun-G-P--Ni-Al')
display(HTML(html))

Point Defect,Ef (eV),Reconfigured
vacancy,0.6753,False
1nn-divacancy,1.3615,False
2nn-divacancy,1.378,False
100-dumbbell,2.5862,False
octahedral-interstitial,2.787,False
crowdion-interstitial,2.8998,False
110-dumbbell,2.9064,False
111-dumbbell,3.0013,False
tetrahedral-interstitial,3.0894,False

Point Defect,Ef (eV),Reconfigured
vacancy,1.5714,False
1nn-divacancy,2.9754,False
2nn-divacancy,3.1251,False
100-dumbbell,3.9509,False
111-dumbbell,4.2185,False
110-dumbbell,4.2833,False
crowdion-interstitial,4.2838,False
octahedral-interstitial,4.3094,False
tetrahedral-interstitial,4.4309,False


## 4. Generate for all potentials

#### Generation parameters

- __savedir__ is the directory where the files will be saved.

In [26]:
# Root directory for output; each potential's page is written to <savedir>/<potential>/Point.html
savedir = '../webcontent/perpotential'

In [27]:
# Loop over all potentials
for potential in np.unique(df.potential):
    print(potential)

    # Generate html content
    html = html_info
    html += gen_ptd_table(df, potential)
    
    with open(os.path.join(savedir, potential, 'Point.html'), 'w') as f:
        f.write(html)

1985--Foiles-S-M--Ni-Cu
1987--Ackland-G-J--Ag
1987--Ackland-G-J--Au
1987--Ackland-G-J--Cu
1987--Ackland-G-J--Mo
1987--Ackland-G-J--Ni
1989--Adams-J-B--Ag
1989--Adams-J-B--Au
1989--Adams-J-B--Cu
1989--Adams-J-B--Ni
1989--Adams-J-B--Pd
1989--Adams-J-B--Pt
1992--Ackland-G-J--Ti
1995--Angelo-J-E--Ni-Al-H
1996--Farkas-D--Nb-Ti-Al
1997--Ackland-G-J--Fe
1997--Liu-X-Y--Al-Mg
1998--Liu-X-Y--Al-Mg
1999--Liu-X-Y--Al-Cu
1999--Mishin-Y--Al
1999--Mishin-Y--Ni
2000--Landa-A--Al-Pb
2000--Sturgeon-J-B--Al
2001--Mishin-Y--Cu-1
2002--Mishin-Y--Ni-Al
2003--Han-S--Cs
2003--Han-S--K
2003--Han-S--Li
2003--Han-S--Na
2003--Han-S--Rb
2003--Han-S--V
2003--Han-S--W
2003--Hoyt-J-J--Cu-Pb
2003--Li-Y-H--Ta
2003--Mendelev-M-I--Fe-2
2003--Mendelev-M-I--Fe-5
2003--Zope-R-R--Al
2003--Zope-R-R--Ti-Al
2004--Ackland-G-J--Fe-P
2004--Liu-X-Y--Al
2004--Mishin-Y--Ni-Al
2004--Zhou-X-W--Ag
2004--Zhou-X-W--Al
2004--Zhou-X-W--Au
2004--Zhou-X-W--Co
2004--Zhou-X-W--Cu
2004--Zhou-X-W--Fe
2004--Zhou-X-W--Mg
2004--Zhou-X-W--Mo
2004--Zh