# GBML-Pandas demo.

Creates descriptor dataframe using the data fetched from materialsproject database and use gbml to make predictions.
This is a simplified translation of gbml/elasticity.py.

Require: pandas package

Author: Kiran Mathew

In [1]:
import os
import json
from collections import defaultdict

import numpy as np

import pandas as pd

from pymatgen.core.periodic_table import Element
from pymatgen.core.composition import Composition
from pymatgen.ext.matproj import MPRester

import gbml

Read the data file containing the "atom in a box" energy for the elements

In [2]:
DATAFILE_AIAB = os.path.join(os.path.dirname(gbml.__file__), "data", "element_aiab_energy.json")

aiab_energy_dict = {}
try:
    with open(DATAFILE_AIAB,'r') as json_file:
            aiab_energy_dict = json.load(json_file)
except:
    print "failed to open the data file"

In [3]:
def holder_mean(x, w, p):
    """
    generalized mean
    """
    return np.power(np.dot(w, np.power(x, p)), 1./p)

def get_aiab_energy(element):
    """
    return atom in a box energy for the given element
    """
    d = aiab_energy_dict.get(str(element), None)
    if d:
        return d[0]
    return None

def append_data(df, d, axis=1):
    """
    concatenate the given data to the DataFrame along the specified axis
    
    Args:
        df (DataFrame): input DataFrame
        d (dict): data to be appended as dict. The keys become the column names
        axis (int): concat direction
    """
    df2 = pd.DataFrame(d, index=df.index)
    return pd.concat([df, df2], axis=axis)

Get the raw data from materials project database and convert to pandas DataFrame.

In [4]:
mpids = ["mp-10003","mp-10010","mp-10015","mp-10021","mp-26","mp-10018","mp-19306"]
properties=["pretty_formula", "nsites", "volume", "energy_per_atom"]

data = defaultdict(list)

MAPI_KEY = os.environ.get("MAPI_KEY", "")
with MPRester(MAPI_KEY) as mpr:
    for entry in mpr.query(criteria={"task_id": {"$in": mpids}}, properties=properties):
        for p in properties:
            data[p].append(entry[p])
            
df = pd.DataFrame(data, index=mpids)
df.head(20)

Unnamed: 0,energy_per_atom,nsites,pretty_formula,volume
mp-10003,-9.174498,12,Nb4CoSi,194.512816
mp-10010,-6.300609,5,Al(CoSi)2,61.957195
mp-10015,-8.66026,2,SiOs,25.915606
mp-10021,-4.09311,1,Ac,45.38462
mp-26,-3.026048,2,Ga,38.007666
mp-10018,-6.70962,14,Fe3O4,155.341182
mp-19306,-4.925772,4,La,148.59786


Add processed data(average electroneagtivity, average rows etc) to the DataFrame

In [5]:
d =defaultdict(list)

for formula in df.pretty_formula:
    comp = Composition(formula)
    rows = []
    el_negs = []
    weights = []
    energies = []
    for el in comp:
        rows.append(el.row)
        el_negs.append(el.X)
        weights.append(comp.get_atomic_fraction(el))
        energies.append(get_aiab_energy(el))
    d["rows_avg"].append(holder_mean(rows, weights, 1.0))
    d["X_avg"].append(holder_mean(rows, weights, -4.0))
    d["reference_energy"].append(np.average(energies, weights=weights))
    
df = append_data(df, d)
df.head(20)

Unnamed: 0,energy_per_atom,nsites,pretty_formula,volume,X_avg,reference_energy,rows_avg
mp-10003,-9.174498,12,Nb4CoSi,194.512816,4.034238,-2.567305,4.5
mp-10010,-6.300609,5,Al(CoSi)2,61.957195,3.249399,-1.154403,3.4
mp-10015,-8.66026,2,SiOs,25.915606,3.513958,-1.86715,4.5
mp-10021,-4.09311,1,Ac,45.38462,9.0,-0.075384,9.0
mp-26,-3.026048,2,Ga,38.007666,4.0,-0.341093,4.0
mp-10018,-6.70962,14,Fe3O4,155.341182,2.274133,-2.428358,2.857143
mp-19306,-4.925772,4,La,148.59786,8.0,-0.691017,8.0


Pre-process volume and energy data

In [6]:
df["log_volume_per_atom"] = np.log10(df.volume/df.nsites)
df["energy_per_atom"] = df.energy_per_atom - df.reference_energy
df.head(20)

Unnamed: 0,energy_per_atom,nsites,pretty_formula,volume,X_avg,reference_energy,rows_avg,log_volume_per_atom
mp-10003,-6.607193,12,Nb4CoSi,194.512816,4.034238,-2.567305,4.5,1.209767
mp-10010,-5.146206,5,Al(CoSi)2,61.957195,3.249399,-1.154403,3.4,1.093122
mp-10015,-6.79311,2,SiOs,25.915606,3.513958,-1.86715,4.5,1.112531
mp-10021,-4.017725,1,Ac,45.38462,9.0,-0.075384,9.0,1.656909
mp-26,-2.684955,2,Ga,38.007666,4.0,-0.341093,4.0,1.278841
mp-10018,-4.281262,14,Fe3O4,155.341182,2.274133,-2.428358,2.857143,1.045159
mp-19306,-4.234755,4,La,148.59786,8.0,-0.691017,8.0,1.569953


Drop columns that are not needed

In [7]:
df.drop(["pretty_formula","volume", "nsites", "reference_energy"], axis=1, inplace=True)
df.head(20)

Unnamed: 0,energy_per_atom,X_avg,rows_avg,log_volume_per_atom
mp-10003,-6.607193,4.034238,4.5,1.209767
mp-10010,-5.146206,3.249399,3.4,1.093122
mp-10015,-6.79311,3.513958,4.5,1.112531
mp-10021,-4.017725,9.0,9.0,1.656909
mp-26,-2.684955,4.0,4.0,1.278841
mp-10018,-4.281262,2.274133,2.857143,1.045159
mp-19306,-4.234755,8.0,8.0,1.569953


In [8]:
cols = df.columns.tolist()
print(cols)

['energy_per_atom', 'X_avg', 'rows_avg', 'log_volume_per_atom']


Rearrange the columns

In [9]:
cols = [cols[3], cols[2], cols[0], cols[1] ]
df = df[cols]
df.head(20)

Unnamed: 0,log_volume_per_atom,rows_avg,energy_per_atom,X_avg
mp-10003,1.209767,4.5,-6.607193,4.034238
mp-10010,1.093122,3.4,-5.146206,3.249399
mp-10015,1.112531,4.5,-6.79311,3.513958
mp-10021,1.656909,9.0,-4.017725,9.0
mp-26,1.278841,4.0,-2.684955,4.0
mp-10018,1.045159,2.857143,-4.281262,2.274133
mp-19306,1.569953,8.0,-4.234755,8.0


Use the python interface to the gbml library to make predictions

In [10]:
import gbml.core

num_predictions = len(mpids)
k_descriptors = df.values

k_predictions = np.empty(num_predictions)

# Make predictions
k_filename = os.path.join(os.path.dirname(gbml.__file__), "data", "gbml-K-v1.00.data")
gbml.core.predict(k_filename, num_predictions, k_descriptors, k_predictions)

k_list = np.power(10.0, k_predictions).tolist()
print k_list

[158.55828155096452, 128.85790396876027, 256.8286476414063, 26.070192490354323, 34.19468644815877, 134.74407265598936, 37.774659850972974]
