# Move Thermo data from MP Thermo to an MPContribs project

---
## Header

#### Global variables

In [1]:
# Name of the project root folder; the working directory is derived from it below.
PROJECT = "Corrections"

#### Imports

In [2]:
import os
import re
from pathlib import Path
from pprint import pprint

import numpy as np
import xlrd
from monty.serialization import loadfn, dumpfn
from pymatgen.ext.matproj import MPRester
from tqdm import tqdm

#### Set Working Directory

In [3]:
# Cut the current working directory right after the first occurrence of PROJECT so
# the notebook can be started from any sub-folder of the project tree.
# re.escape keeps the match literal even if PROJECT contains regex metacharacters.
# NOTE(review): if PROJECT does not occur in the current path nothing is stripped and
# workdir silently stays the current directory — confirm that is acceptable.
workdir = Path(re.sub(r"(?<={})[\w\W]*".format(re.escape(PROJECT)), "", str(Path.cwd())))
os.chdir(workdir)

# Project sub-folders used by the rest of the notebook.
data_dir = workdir / '2_raw data'
pipeline_dir = workdir / '3_data analysis' / '2_pipeline'
output_dir = workdir / '3_data analysis' / '3_output'

---
## Main Code

## Set up the project

In [4]:
from mpcontribs.client import Client
name = 'experimental_thermo' # MPContribs project identifier (see the project URL); used for every project/contribution call below
client = Client() # uses MPCONTRIBS_API_KEY envvar

MaterialsProjectCompatibility will be updated with new correction classes as well as new values of corrections and uncertainties in 2020
  def get_pourbaix_entries(self, chemsys, solid_compat=MaterialsProjectCompatibility(), use_gibbs=False):


In [5]:
# Store a short description of every data key in the project's "other" field.
# The project is addressed through `name` so the identifier is defined in one place only.
client.projects.update_entry(
    pk=name,
    project={
        "other": {
            "ΔHᶠ": "Enthalpy of formation from the elements. Polynomial: H° − H°298.15= A*t + B*t^2/2 + C*t^3/3 + D*t^4/4 − E/t + F − H",
            "ΔGᶠ": "Gibbs free energy of formation from the elements.",
            "S": "Absolute entropy. Polynomial: S° = A*ln(t) + B*t + C*t^2/2 + D*t^3/3 − E/(2*t^2) + G",
            "Cₚ": "Specific heat capacity. Polynomial: Cp° = A + B*t + C*t^2 + D*t^3 + E/t^2",
            "polynomial": "Coefficients for polynomials used to calculate temperature-dependent values of ΔHᶠ, S, or Cₚ.",
            "ΔT": "Range of temperatures over which polynomial coefficients are valid.",
            "composition": "String representation of pymatgen Composition of the material.",
            "phase": "Material phase, e.g. 'gas', 'liquid', 'solid', 'monoclinic', etc.",
        }
    },
).result()

{'other': {'ΔHᶠ': 'Enthalpy of formation from the elements. Polynomial: H° − H°298.15= A*t + B*t^2/2 + C*t^3/3 + D*t^4/4 − E/t + F − H',
  'ΔGᶠ': 'Gibbs free energy of formation from the elements.',
  'S': 'Absolute entropy. Polynomial: S° = A*ln(t) + B*t + C*t^2/2 + D*t^3/3 − E/(2*t^2) + G',
  'Cₚ': 'Specific heat capacity. Polynomial: Cp° = A + B*t + C*t^2 + D*t^3 + E/t^2',
  'polynomial': 'Coefficients for polynomials used to calculate temperature-dependent values of ΔHᶠ, S, or Cₚ.',
  'ΔT': 'Range of temperatures over which polynomial coefficients are valid.',
  'composition': 'String representation of pymatgen Composition of the material.',
  'phase': "Material phase, e.g. 'gas', 'liquid', 'solid', 'monoclinic', etc."}}

In [6]:
# Set the author line of the project (addressed through `name` rather than a repeated literal).
client.projects.update_entry(
    pk=name,
    project={"authors": "Various authors (see references). Data compiled by the Materials Project team."},
).result()

{'authors': 'Various authors (see references). Data compiled by the Materials Project team.'}

In [7]:
# Set the short project title (addressed through `name` rather than a repeated literal).
client.projects.update_entry(
    pk=name,
    project={"title": "Thermochemistry Data"},
).result()

{'title': 'Thermochemistry Data'}

In [8]:
# Switch on the project's `unique_identifiers` flag (addressed through `name` rather than a repeated literal).
client.projects.update_entry(
    pk=name,
    project={"unique_identifiers": True},
).result()

{'unique_identifiers': True}

In [9]:
# Register the two literature/data sources of the project (addressed through `name`).
client.projects.update_entry(
    pk=name,
    project={
        "references": [
            {"label": "Kubaschewski", "url": "https://www.worldcat.org/title/materials-thermochemistry/oclc/26724109"},
            {"label": "NIST", "url": "https://janaf.nist.gov/"},
        ]
    },
).result()

{'references': [{'label': 'Kubaschewski',
   'url': 'https://www.worldcat.org/title/materials-thermochemistry/oclc/26724109'},
  {'label': 'NIST', 'url': 'https://janaf.nist.gov/'}]}

### Set the column order for display

In [53]:
# Set the order of the columns and their desired units.
# The list is generated instead of spelled out so that the four repeated groups
# (0K / 298K quantities, polynomial coefficients, ΔT bounds) cannot drift apart.
_state_quantities = [
    ("ΔHᶠ", "kJ/mol"),
    ("ΔGᶠ", "kJ/mol"),
    ("S", "J/degK/mol"),
    ("Cₚ", "J/degK/mol"),
]
_coefficients = "ABCDEFGH"

columns = [
    {"path": "data.phase"},
    {"path": "data.composition"},
    {"path": "data.compound"},
]
columns += [
    {"path": "data.{}.{}".format(temperature, quantity), "unit": unit}
    for temperature in ("0K", "298K")
    for quantity, unit in _state_quantities
]
columns += [{"path": "data.polynomial.{}".format(c)} for c in _coefficients]
columns += [
    {"path": "data.ΔT.{}.{}".format(c, bound), "unit": "degK"}
    for bound in ("min", "max")
    for c in _coefficients
]
# method and reference hold text, so they carry no unit (they were previously
# tagged "kJ/mol", which looks like a copy-paste slip).
# NOTE(review): confirm the API accepts these two columns without a unit, as it
# does for data.phase / data.composition / data.compound.
columns += [
    {"path": "data.method"},
    {"path": "data.reference"},
]

client.projects.update_entry(
    pk=name, project={"columns": columns}
).result()

{'columns': [{'path': 'data.phase', 'min': nan, 'max': nan, 'unit': 'NaN'},
  {'path': 'data.composition', 'min': nan, 'max': nan, 'unit': 'NaN'},
  {'path': 'data.compound', 'min': nan, 'max': nan, 'unit': 'NaN'},
  {'path': 'data.0K.ΔHᶠ', 'min': nan, 'max': nan, 'unit': 'kJ/mol'},
  {'path': 'data.0K.ΔGᶠ', 'min': nan, 'max': nan, 'unit': 'kJ/mol'},
  {'path': 'data.0K.S', 'min': nan, 'max': nan, 'unit': 'J/degK/mol'},
  {'path': 'data.0K.Cₚ', 'min': nan, 'max': nan, 'unit': 'J/degK/mol'},
  {'path': 'data.298K.ΔHᶠ', 'min': nan, 'max': nan, 'unit': 'kJ/mol'},
  {'path': 'data.298K.ΔGᶠ', 'min': nan, 'max': nan, 'unit': 'kJ/mol'},
  {'path': 'data.298K.S', 'min': nan, 'max': nan, 'unit': 'J/degK/mol'},
  {'path': 'data.298K.Cₚ', 'min': nan, 'max': nan, 'unit': 'J/degK/mol'},
  {'path': 'data.polynomial.A', 'min': nan, 'max': nan, 'unit': 'NaN'},
  {'path': 'data.polynomial.B', 'min': nan, 'max': nan, 'unit': 'NaN'},
  {'path': 'data.polynomial.C', 'min': nan, 'max': nan, 'unit': 'NaN'},

In [11]:
client.get_project(name)

{'name': 'experimental_thermo',
 'is_public': False,
 'title': 'Thermochemistry Data',
 'owner': 'RKingsbury@lbl.gov',
 'is_approved': True,
 'unique_identifiers': True,
 'long_title': 'Experimental Thermochemistry Database',
 'authors': 'Various authors (see references). Data compiled by the Materials Project team.',
 'description': 'This project contains experimental thermochemical data for solids',
 'references': [{'label': 'Kubaschewski',
   'url': 'https://www.worldcat.org/title/materials-thermochemistry/oclc/26724109'},
  {'label': 'NIST', 'url': 'https://janaf.nist.gov/'}],
 'other': {'Cₚ': 'Specific heat capacity. Polynomial: Cp° = A + B*t + C*t^2 + D*t^3 + E/t^2',
  'S': 'Absolute entropy. Polynomial: S° = A*ln(t) + B*t + C*t^2/2 + D*t^3/3 − E/(2*t^2) + G',
  'T': 'Temperature',
  'polynomial': 'Coefficients for polynomials used to calculate temperature-dependent values of ΔHᶠ, S, or Cₚ.',
  'ΔHᶠ': 'Enthalpy of formation from the elements. Polynomial: H° − H°298.15= A*t + B*t^

## Collect the MP Thermochemical Data

Use `MPRester` to retrieve all experimental thermochemical data currently hosted on materialsproject.org.

### Get a list of all unique formulas in MP

In [None]:
# Distinct formulas of all materials with at most two elements.
# NOTE(review): `knowhere_mats` is not defined anywhere in this notebook — presumably a
# MongoDB materials collection handle; it must be created before this cell can run.
binaries = knowhere_mats.distinct("pretty_formula", {"nelements": {"$lte": 2}})

In [None]:
# Distinct formulas of all materials with three or more elements.
# NOTE(review): relies on `knowhere_mats`, which is not defined in this notebook.
ternary_plus = knowhere_mats.distinct("pretty_formula", {"nelements": {"$gte": 3}})

### Pull `ThermoData` objects from MPRester

In [None]:
# Query the experimental thermo data formula by formula.
# Failures are tolerated (best effort), but they are recorded instead of being
# silently dropped, and only `Exception` is caught so that Ctrl-C still stops the loop.
all_thermo = []
failed_formulas = []  # (formula, error) pairs for queries that raised
with MPRester() as a:
    for f in tqdm(binaries):
        try:
            all_thermo.extend(a.get_exp_thermo_data(f))
        except Exception as exc:
            failed_formulas.append((f, repr(exc)))
            continue

print("{} of {} formula queries raised an error".format(len(failed_formulas), len(binaries)))

In [None]:
len(all_thermo)

In [None]:
# Spot-check the type of a single retrieved record.
# NOTE(review): the index 9549 is hard-coded; this raises IndexError if fewer records were loaded.
type(all_thermo[9549])

In [None]:
# NOTE(review): `ternaries` is not defined in this notebook (the list built above is
# `ternary_plus`), and this cell resets `all_thermo`, discarding the binaries collected
# earlier. It looks like a superseded draft of the next cell — confirm and remove.
all_thermo = []
with MPRester() as a:
    for f in tqdm(ternaries):
        try:
            all_thermo.extend(a.get_exp_thermo_data(f))
        except:
            continue

In [None]:
# Append the results for the ternary+ formulas to `all_thermo`; the reset below is
# commented out, so records already in the list are kept.
# NOTE(review): unlike the binaries cell there is no error handling here, so the first
# failing query aborts the loop.
#all_thermo = []
with MPRester() as a:
    for f in tqdm(ternary_plus):
        all_thermo.extend(a.get_exp_thermo_data(f))

In [None]:
dumpfn(all_thermo, output_dir / '2020-08-07 all MP Thermo data.json')

In [12]:
all_thermo = loadfn(output_dir / '2020-08-07 all MP Thermo data.json')

### Convert `ThermoData` into a pandas dataframe

In [13]:
all_thermo[0].as_dict()

{'@module': 'pymatgen.analysis.thermochemistry',
 '@class': 'ThermoData',
 'type': 'S',
 'formula': 'Zr',
 'compound_name': 'Zr',
 'phaseinfo': 'hcp',
 'value': 0.039,
 'temp_range': [298, 298],
 'method': '',
 'ref': 'O. Kubaschewski, C. Alcock, P. Spencer, Materials Thermochemistry, 6th ed., Oxford, Pergamom Press, 1993.',
 'uncertainty': 0.0}

In [14]:
import pandas as pd
# One row per ThermoData record; the columns are the keys of its as_dict() representation.
mpthermo_df = pd.DataFrame([t.as_dict() for t in all_thermo])

In [15]:
# Remove the serialization bookkeeping columns; they carry no thermochemical data.
mpthermo_df = mpthermo_df.drop(columns=['@module', '@class'])

In [16]:
mpthermo_df[mpthermo_df["formula"] == "Ag"]

Unnamed: 0,type,formula,compound_name,phaseinfo,value,temp_range,method,ref,uncertainty
355,fH,Ag,Ag,fcc,-0.0,"[298, 298]",,"O. Kubaschewski, C. Alcock, P. Spencer, Materi...",0.0
356,D,Ag,Ag,fcc,0.0,"[298, 298]",,"O. Kubaschewski, C. Alcock, P. Spencer, Materi...",
357,fH,Ag,Ag,gas,284.9,"[298, 298]",,"O. Kubaschewski, C. Alcock, P. Spencer, Materi...",0.0
358,C,Ag,Ag,gas,0.0,"[298, 298]",,"O. Kubaschewski, C. Alcock, P. Spencer, Materi...",
359,D,Ag,Ag,gas,0.0,"[298, 298]",,"O. Kubaschewski, C. Alcock, P. Spencer, Materi...",
360,S,Ag,Ag,fcc,0.0426,"[298, 298]",,"O. Kubaschewski, C. Alcock, P. Spencer, Materi...",0.0
361,A,Ag,Ag,fcc,0.0213,"[298, 298]",,"O. Kubaschewski, C. Alcock, P. Spencer, Materi...",
362,B,Ag,Ag,fcc,9e-06,"[298, 298]",,"O. Kubaschewski, C. Alcock, P. Spencer, Materi...",
363,C,Ag,Ag,fcc,1.51,"[298, 298]",,"O. Kubaschewski, C. Alcock, P. Spencer, Materi...",
364,S,Ag,Ag,gas,0.1729,"[298, 298]",,"O. Kubaschewski, C. Alcock, P. Spencer, Materi...",0.0


### Each unique type of data needs to be a column

In [17]:
# what unique types of data do we have?
mpthermo_df.type.unique()

array(['S', 'fH', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'cH'],
      dtype=object)

### Each unique phase needs to be nested under formula

In [18]:
# what unique phase labels do we have?
mpthermo_df.phaseinfo.unique()

array(['hcp', 'gas', 'liquid', 'orth', 'solid', 'hex', 'bcc', 'tetrag',
       'rhomb', 'cubic', 'graph', 'diam', 'fcc', 'beta', '', 'white',
       'red', 'liq', 'alpha', 'ps.hex', 'monocl', 'orth./1', 'cryst',
       'a -qtz', 'a -cris', 'rutile', 'anatas', 'pyrite', 'trigon',
       'hemat', 'amorph', 'ortho', 'cement', 'magnet', 'trig', 'gas.',
       'bctet', 'a', 'r.tet', 'tetr', '#-qtz', 'wollas', 'ps.wol',
       'tet/cu', 'sider', 'clino', 'scheel', 'calcit', 'tricl', 'spinel',
       'olivin', 'larnit', 'rhodon', 'forst', 'celest', 'tricli', 'fayal',
       'magnes', 'baryte', 'hexag', 'kyanit', 'andal', 'sillim', 'rankin',
       'nit.ba', 'dolom'], dtype=object)

### Create a pandas `Series` object with a multiindex and a dict of the data we need

In [19]:
from pymatgen import Composition

def create_dict(data):
    """Convert one group of MP Thermo rows into an MPContribs contribution dict.

    Args:
        data: DataFrame holding the rows of ``mpthermo_df`` for a single
            (formula, compound_name, phaseinfo, ref) group. The columns read are
            ``formula``, ``compound_name``, ``phaseinfo``, ``ref``, ``type``,
            ``value``, ``temp_range`` and ``method``.

    Returns:
        dict with the keys ``project``, ``is_public``, ``identifier`` (the reduced
        formula) and ``data``. ``data`` always contains ``compound``,
        ``composition``, ``phase`` and ``reference`` and, depending on the record
        types present, ``polynomial`` / ``ΔT`` (types A-H), ``298K`` (all other
        types) and ``method``. Values are stored as "<number> <unit>" strings.

    Notes:
        * Only the first record of each type is used.
        * The ``uncertainty`` column is not exported.
        * Non-polynomial records whose temperature range is not [298, 298] are
          reported and skipped. Previously they were written into whatever
          dictionary the preceding record had used (or raised NameError when
          there was none).
    """
    comp = Composition(data.formula.unique()[0])

    ret = {
        "project": name,
        "is_public": False,
        "identifier": comp.reduced_formula,
        "data": {
            "compound": data.compound_name.unique()[0],
            "composition": str(comp),
            "phase": data.phaseinfo.unique()[0],
            "reference": data.ref.unique()[0],
        },
    }
    contrib = ret["data"]

    for t in data.type.unique():
        rows = data[data["type"] == t]
        temp_range = rows["temp_range"].values[0]

        if t in ["A", "B", "C", "D", "E", "F", "G", "H"]:
            # polynomial coefficient: value goes under "polynomial",
            # its validity range under "ΔT"
            base_dict = contrib.setdefault("polynomial", {})
            col = t
            unit = "dimensionless"
            contrib.setdefault("ΔT", {})[col] = {
                "min": "{} K".format(temp_range[0]),
                "max": "{} K".format(temp_range[1]),
            }
        else:
            if list(temp_range) != [298, 298]:
                # no bucket exists for other temperatures: report and skip
                print("Type: {}, T: {}".format(t, temp_range))
                continue

            base_dict = contrib.setdefault("298K", {})

            if t == "S":
                unit = 'kJ/degK/mol'
                col = "S"
            elif t == "fH":
                col = "ΔHᶠ"
                unit = "kJ/mol"
            else:
                col = t
                unit = "dimensionless"

        base_dict[col] = "{:0.5g} {}".format(rows["value"].values[0], unit)

        # the measurement method is optional and stored per column
        method = rows["method"].values[0]
        if method != "":
            contrib.setdefault("method", {})[col] = method

    return ret


new_df = mpthermo_df.groupby(["formula","compound_name","phaseinfo","ref"]).apply(create_dict)
mpthermo_contribs = list(new_df)

In [20]:
mpthermo_contribs[0]

{'project': 'experimental_thermo',
 'is_public': False,
 'identifier': 'Ag',
 'data': {'compound': 'Ag',
  'composition': 'Ag1',
  'phase': 'fcc',
  'reference': 'O. Kubaschewski, C. Alcock, P. Spencer, Materials Thermochemistry, 6th ed., Oxford, Pergamom Press, 1993.',
  '298K': {'ΔHᶠ': '-0 kJ/mol', 'S': '0.0426 kJ/degK/mol'},
  'polynomial': {'D': '0 dimensionless',
   'A': '0.0213 dimensionless',
   'B': '8.54e-06 dimensionless',
   'C': '1.51 dimensionless'},
  'ΔT': {'D': {'min': '298 K', 'max': '298 K'},
   'A': {'min': '298 K', 'max': '298 K'},
   'B': {'min': '298 K', 'max': '298 K'},
   'C': {'min': '298 K', 'max': '298 K'}}}}

#### Reshape the dict so that data is nested under a key for each phase

In [21]:
# Merge the per-(formula, phase) dicts into one dict per identifier, with the data
# of each phase nested under the phase label.
#
# A lookup table is used instead of itertools.groupby: groupby only merges
# *consecutive* items, so an identifier that re-appears later in the list would
# produce a second entry. Output order is the order of first appearance.
reshaped = []
_by_identifier = {}

for d in mpthermo_contribs:
    formula = d["identifier"]
    new_dict = _by_identifier.get(formula)
    if new_dict is None:
        new_dict = {
            "project": name,
            "is_public": False,
            "identifier": formula,
            "data": {},
        }
        _by_identifier[formula] = new_dict
        reshaped.append(new_dict)

    # Work on a shallow copy so the input dicts keep their keys and the cell can
    # be re-run without a KeyError.
    phase_data = dict(d["data"])
    composition = phase_data.pop("composition")
    if not new_dict.get("composition"):
        new_dict["composition"] = composition

    phase = phase_data.get("phase", "n/a")
    if phase == "":
        phase = "n/a"
    if phase != "n/a":
        # the phase becomes the key, so it is not repeated inside the value
        del phase_data["phase"]

    # NOTE(review): a second record with the same identifier and phase (e.g. from
    # a different reference) overwrites the first one here — confirm this is intended.
    new_dict["data"][phase] = phase_data

In [22]:
# NOTE: this rebinds `pprint` from the function imported at the top of the notebook
# to the module, which is why the later cells call `pprint.pprint(...)`.
import pprint
pprint.pprint(reshaped[0])

{'composition': 'Ag1',
 'data': {'fcc': {'298K': {'S': '0.0426 kJ/degK/mol', 'ΔHᶠ': '-0 kJ/mol'},
                  'compound': 'Ag',
                  'polynomial': {'A': '0.0213 dimensionless',
                                 'B': '8.54e-06 dimensionless',
                                 'C': '1.51 dimensionless',
                                 'D': '0 dimensionless'},
                  'reference': 'O. Kubaschewski, C. Alcock, P. Spencer, '
                               'Materials Thermochemistry, 6th ed., Oxford, '
                               'Pergamom Press, 1993.',
                  'ΔT': {'A': {'max': '298 K', 'min': '298 K'},
                         'B': {'max': '298 K', 'min': '298 K'},
                         'C': {'max': '298 K', 'min': '298 K'},
                         'D': {'max': '298 K', 'min': '298 K'}}},
          'gas': {'298K': {'S': '0.173 kJ/degK/mol', 'ΔHᶠ': '284.9 kJ/mol'},
                  'compound': 'Silver',
                  'method': {'S': 'Revi

## NIST JANAF Data

#### Load the JANAF data from a CSV file

In [23]:
import pandas
# Read the pre-compiled JANAF table, one row per (formula, name, phase).
janaf_df= pandas.read_csv(data_dir / "2020-08-10 JANAF data from Ayush/mpcontribs_janaf_thermo.csv")

In [24]:
janaf_df.head(20)

Unnamed: 0.1,Unnamed: 0,Formula,Name,Phase,Link,Cp_0,Cp_298,S_0,S_298,DeltaH_0,DeltaH_298,DeltaG_0,DeltaG_298
0,0,Al,Aluminum,ref,https://janaf.nist.gov/tables/Al-001.txt,0.0,24.209,0.0,28.275,0.0,0.0,0.0,0.0
1,1,Al,Aluminum,cr,https://janaf.nist.gov/tables/Al-002.txt,0.0,24.209,0.0,28.275,0.0,0.0,0.0,0.0
2,2,Al,Aluminum,"cr,l",https://janaf.nist.gov/tables/Al-004.txt,0.0,24.209,0.0,28.275,0.0,0.0,0.0,0.0
3,3,AlBr3,Aluminum Bromide,cr,https://janaf.nist.gov/tables/Al-010.txt,0.0,100.578,0.0,180.216,-491896.0,-511285.0,-491896.0,-488515.0
4,4,AlBr3,Aluminum Bromide,"cr,l",https://janaf.nist.gov/tables/Al-012.txt,0.0,100.578,0.0,180.216,-491896.0,-511285.0,-491896.0,-488515.0
5,5,AlClO,Aluminum Chloride Oxide,cr,https://janaf.nist.gov/tables/Al-019.txt,,56.902,,54.392,,-793286.0,,-737235.0
6,6,AlCl3,Aluminum Chloride,cr,https://janaf.nist.gov/tables/Al-025.txt,0.0,91.128,0.0,109.286,-704300.0,-705632.0,-704300.0,-630018.0
7,7,AlCl3,Aluminum Chloride,"cr,l",https://janaf.nist.gov/tables/Al-027.txt,0.0,91.128,0.0,109.286,-704300.0,-705632.0,-704300.0,-630018.0
8,8,AlCl4K,Potassium Tetrachloroaluminate,cr,https://janaf.nist.gov/tables/Al-029.txt,,156.482,,196.648,,-1196624.0,,-1094521.0
9,9,AlCl4Na,Sodium Tetrachloroaluminate,cr,https://janaf.nist.gov/tables/Al-030.txt,,154.975,,188.28,,-1142232.0,,-996454.0


### Create a list of dicts for the contributions in the JANAF dataframe

In [25]:
def create_dict(data):
    """Convert one group of JANAF rows into an MPContribs contribution dict.

    NOTE: this definition replaces the ``create_dict`` defined earlier for the
    MP Thermo data; from this cell on the name refers to the JANAF version.

    Args:
        data: DataFrame holding the rows of ``janaf_df`` for a single
            (Formula, Name, Phase) group. Only the first row is used.

    Returns:
        dict with the keys ``project``, ``is_public``, ``data`` and
        ``identifier``. ``data`` contains ``composition``, ``compound``,
        ``phase``, ``reference`` and the two blocks ``0K`` and ``298K``, each
        with ΔHᶠ, ΔGᶠ, S and Cₚ as "<number> <unit>" strings. Missing numbers
        are formatted as "nan".
    """
    formula = data.Formula.unique()[0]

    ret = {"project": name, "is_public": False, "data": {}}

    try:
        comp = Composition(formula)
        ret["identifier"] = comp.reduced_formula
        ret["data"]["composition"] = str(comp)
    except Exception as exc:
        # Formulas pymatgen cannot parse are kept verbatim. Only Exception is
        # caught so that a keyboard interrupt is not swallowed.
        print('problem: could not parse formula {!r} ({})'.format(formula, exc))
        ret["identifier"] = formula
        ret["data"]["composition"] = formula

    ret["data"]["compound"] = data.Name.unique()[0]
    ret["data"]["phase"] = data.Phase.unique()[0]
    # link to the HTML rendering of the table instead of the plain-text file
    ret["data"]["reference"] = data.Link.unique()[0].replace('txt','html')

    def _quantities(suffix):
        """Format the four quantities of the column group ending in ``suffix``."""
        return {
            # enthalpy and Gibbs energy are divided by 1000 before being labelled kJ/mol
            "ΔHᶠ": "{:0.6g} {}".format(data["DeltaH_" + suffix].values[0]/1000, "kJ/mol"),
            "ΔGᶠ": "{:0.6g} {}".format(data["DeltaG_" + suffix].values[0]/1000, "kJ/mol"),
            "S": "{:0.6g} {}".format(data["S_" + suffix].values[0], "J/degK/mol"),
            "Cₚ": "{:0.6g} {}".format(data["Cp_" + suffix].values[0], "J/degK/mol"),
        }

    ret["data"]["0K"] = _quantities("0")
    ret["data"]["298K"] = _quantities("298")

    return ret


new_df = janaf_df.groupby(["Formula","Name","Phase"]).apply(create_dict)
janaf_contribs = list(new_df)



problem


In [26]:
pprint.pprint(janaf_contribs[10])

{'data': {'0K': {'Cₚ': 'nan J/degK/mol',
                 'S': 'nan J/degK/mol',
                 'ΔGᶠ': 'nan kJ/mol',
                 'ΔHᶠ': 'nan kJ/mol'},
          '298K': {'Cₚ': '81.385 J/degK/mol',
                   'S': '50.626 J/degK/mol',
                   'ΔGᶠ': '-1572.97 kJ/mol',
                   'ΔHᶠ': '-1666.49 kJ/mol'},
          'composition': 'Al2 O3',
          'compound': 'Aluminum Oxide, Delta',
          'phase': 'cr',
          'reference': 'https://janaf.nist.gov/tables/Al-097.html'},
 'identifier': 'Al2O3',
 'is_public': False,
 'project': 'experimental_thermo'}


#### Reshape the dict so that data is nested under a key for each phase

In [27]:
# Merge the per-(formula, phase) JANAF dicts into one dict per identifier, with the
# data of each phase nested under the phase label.
#
# A lookup table is used instead of itertools.groupby: groupby only merges
# *consecutive* items, so an identifier that re-appears later in the list would
# produce a second entry. Output order is the order of first appearance.
reshaped_janaf = []
_janaf_by_identifier = {}

for d in janaf_contribs:
    formula = d["identifier"]
    new_dict = _janaf_by_identifier.get(formula)
    if new_dict is None:
        new_dict = {
            "project": name,
            "is_public": False,
            "identifier": formula,
            "data": {},
        }
        _janaf_by_identifier[formula] = new_dict
        reshaped_janaf.append(new_dict)

    # Work on a shallow copy so the input dicts keep their keys and the cell can
    # be re-run without a KeyError.
    phase_data = dict(d["data"])
    composition = phase_data.pop("composition")
    if not new_dict.get("composition"):
        new_dict["composition"] = composition

    phase = phase_data.get("phase", "n/a")
    if phase == "":
        phase = "n/a"
    if phase != "n/a":
        # the phase becomes the key, so it is not repeated inside the value
        del phase_data["phase"]

    # NOTE(review): a second record with the same identifier and phase overwrites
    # the first one here — confirm this is intended.
    new_dict["data"][phase] = phase_data

In [28]:
import pprint
pprint.pprint(reshaped_janaf[0])

{'composition': 'Al1',
 'data': {'cr': {'0K': {'Cₚ': '0 J/degK/mol',
                        'S': '0 J/degK/mol',
                        'ΔGᶠ': '0 kJ/mol',
                        'ΔHᶠ': '0 kJ/mol'},
                 '298K': {'Cₚ': '24.209 J/degK/mol',
                          'S': '28.275 J/degK/mol',
                          'ΔGᶠ': '0 kJ/mol',
                          'ΔHᶠ': '0 kJ/mol'},
                 'compound': 'Aluminum',
                 'reference': 'https://janaf.nist.gov/tables/Al-002.html'},
          'cr,l': {'0K': {'Cₚ': '0 J/degK/mol',
                          'S': '0 J/degK/mol',
                          'ΔGᶠ': '0 kJ/mol',
                          'ΔHᶠ': '0 kJ/mol'},
                   '298K': {'Cₚ': '24.209 J/degK/mol',
                            'S': '28.275 J/degK/mol',
                            'ΔGᶠ': '0 kJ/mol',
                            'ΔHᶠ': '0 kJ/mol'},
                   'compound': 'Aluminum',
                   'reference': 'https://janaf.nist.g

In [29]:
import pprint
pprint.pprint(reshaped[0])

{'composition': 'Ag1',
 'data': {'fcc': {'298K': {'S': '0.0426 kJ/degK/mol', 'ΔHᶠ': '-0 kJ/mol'},
                  'compound': 'Ag',
                  'polynomial': {'A': '0.0213 dimensionless',
                                 'B': '8.54e-06 dimensionless',
                                 'C': '1.51 dimensionless',
                                 'D': '0 dimensionless'},
                  'reference': 'O. Kubaschewski, C. Alcock, P. Spencer, '
                               'Materials Thermochemistry, 6th ed., Oxford, '
                               'Pergamom Press, 1993.',
                  'ΔT': {'A': {'max': '298 K', 'min': '298 K'},
                         'B': {'max': '298 K', 'min': '298 K'},
                         'C': {'max': '298 K', 'min': '298 K'},
                         'D': {'max': '298 K', 'min': '298 K'}}},
          'gas': {'298K': {'S': '0.173 kJ/degK/mol', 'ΔHᶠ': '284.9 kJ/mol'},
                  'compound': 'Silver',
                  'method': {'S': 'Revi

### Merge the JANAF data with the MP Thermo data

In [30]:
# Start from the MP Thermo entries and fold the JANAF entries into them.
# The list is a shallow copy: entries are shared with `reshaped`, so phases added
# to an existing entry are visible through both lists.
all_contribs = reshaped[:]

# identifier -> MP Thermo entry, built once instead of rescanning `reshaped` for
# every JANAF entry. setdefault keeps the first entry per identifier.
_mp_entry_by_id = {}
for e in reshaped:
    _mp_entry_by_id.setdefault(e["identifier"], e)

count = 0
for d in reshaped_janaf:
    target_entry = _mp_entry_by_id.get(d["identifier"])
    if target_entry is None:
        # identifier unknown to MP Thermo: take the JANAF entry as is
        all_contribs.append(d)
        continue

    # identifier already present: add only the phases MP Thermo does not have
    for k, v in d["data"].items():
        if target_entry["data"].get(k):
            print("Warning: phase {} already exists for id {} in MP Thermo data! Skipping.".format(k, d["identifier"]))
            count += 1
            continue
        target_entry["data"][k] = v

print("Skipped {} duplicate entries".format(count))

Skipped 0 duplicate entries


In [31]:
pprint.pprint(all_contribs[0])

{'composition': 'Ag1',
 'data': {'fcc': {'298K': {'S': '0.0426 kJ/degK/mol', 'ΔHᶠ': '-0 kJ/mol'},
                  'compound': 'Ag',
                  'polynomial': {'A': '0.0213 dimensionless',
                                 'B': '8.54e-06 dimensionless',
                                 'C': '1.51 dimensionless',
                                 'D': '0 dimensionless'},
                  'reference': 'O. Kubaschewski, C. Alcock, P. Spencer, '
                               'Materials Thermochemistry, 6th ed., Oxford, '
                               'Pergamom Press, 1993.',
                  'ΔT': {'A': {'max': '298 K', 'min': '298 K'},
                         'B': {'max': '298 K', 'min': '298 K'},
                         'C': {'max': '298 K', 'min': '298 K'},
                         'D': {'max': '298 K', 'min': '298 K'}}},
          'gas': {'298K': {'S': '0.173 kJ/degK/mol', 'ΔHᶠ': '284.9 kJ/mol'},
                  'compound': 'Silver',
                  'method': {'S': 'Revi

In [32]:
## Fix the position of the composition key
# Move "composition" from the top level of each entry into its "data" dict.
# The membership test makes the cell safe to re-run (entries that were already
# converted no longer have the top-level key).
for e in all_contribs:
    if "composition" in e:
        e["data"]["composition"] = e.pop("composition")

#### Remap phase keys that contain punctuation

In [33]:
# Raw phase label -> key used in the contribution data. Most targets simply drop
# the punctuation; "ortho" and "orth./1" are both folded into "orth".
replace = dict([
    ("#-qtz", "βqtz"),
    ("a", "α"),
    ("a -cris", "αcrys"),
    ("a -qtz", "αqtz"),
    ("nit.ba", "nitba"),
    ("orth./1", "orth"),
    ("ortho", "orth"),
    ("r.tet", "rtet"),
    ("tet/cu", "tetcu"),
    ("n/a", "none"),
    ("cr,l", "crl"),
])

In [34]:
# Rename the phase keys of every entry according to `replace`.
# NOTE(review): if the target key already exists in an entry (e.g. both "ortho"
# and "orth" are present) its data is overwritten — confirm this cannot happen.
for entry in all_contribs:
    phases = entry["data"]
    for old_key, new_key in replace.items():
        if phases.get(old_key):
            phases[new_key] = phases.pop(old_key)

In [35]:
pprint.pprint(all_contribs[0])

{'data': {'composition': 'Ag1',
          'fcc': {'298K': {'S': '0.0426 kJ/degK/mol', 'ΔHᶠ': '-0 kJ/mol'},
                  'compound': 'Ag',
                  'polynomial': {'A': '0.0213 dimensionless',
                                 'B': '8.54e-06 dimensionless',
                                 'C': '1.51 dimensionless',
                                 'D': '0 dimensionless'},
                  'reference': 'O. Kubaschewski, C. Alcock, P. Spencer, '
                               'Materials Thermochemistry, 6th ed., Oxford, '
                               'Pergamom Press, 1993.',
                  'ΔT': {'A': {'max': '298 K', 'min': '298 K'},
                         'B': {'max': '298 K', 'min': '298 K'},
                         'C': {'max': '298 K', 'min': '298 K'},
                         'D': {'max': '298 K', 'min': '298 K'}}},
          'gas': {'298K': {'S': '0.173 kJ/degK/mol', 'ΔHᶠ': '284.9 kJ/mol'},
                  'compound': 'Silver',
                  'method': {'

### Reshape data again so that each formula+phase is a unique contribution with a unique identifier

In [36]:
# Split every entry into one contribution per phase. The identifier becomes
# "<formula>-<phase>", and the phase label and composition are copied into the
# data of each contribution.
new_contribs = []
for contrib in all_contribs:
    composition = contrib["data"]["composition"]
    for phase, phase_data in contrib["data"].items():
        # "composition" lives next to the phase keys but is not a phase
        if phase == 'composition':
            continue
        phase_data["phase"] = phase
        phase_data["composition"] = composition
        new_contribs.append({
            "identifier": str(contrib["identifier"] + "-" + phase),
            "formula": contrib["identifier"],
            "is_public": True,
            "project": contrib["project"],
            "data": phase_data,
        })

    

In [37]:
pprint.pprint(new_contribs[0])

{'data': {'298K': {'S': '0.0426 kJ/degK/mol', 'ΔHᶠ': '-0 kJ/mol'},
          'composition': 'Ag1',
          'compound': 'Ag',
          'phase': 'fcc',
          'polynomial': {'A': '0.0213 dimensionless',
                         'B': '8.54e-06 dimensionless',
                         'C': '1.51 dimensionless',
                         'D': '0 dimensionless'},
          'reference': 'O. Kubaschewski, C. Alcock, P. Spencer, Materials '
                       'Thermochemistry, 6th ed., Oxford, Pergamom Press, '
                       '1993.',
          'ΔT': {'A': {'max': '298 K', 'min': '298 K'},
                 'B': {'max': '298 K', 'min': '298 K'},
                 'C': {'max': '298 K', 'min': '298 K'},
                 'D': {'max': '298 K', 'min': '298 K'}}},
 'formula': 'Ag',
 'identifier': 'Ag-fcc',
 'is_public': True,
 'project': 'experimental_thermo'}


In [38]:
pprint.pprint(new_contribs[2])

{'data': {'298K': {'S': '0.04255 kJ/degK/mol'},
          'composition': 'Ag1',
          'compound': 'Silver',
          'method': {'S': 'Review'},
          'phase': 'solid',
          'reference': 'Cox, Wagman, et al., 1984Cox, J.D.; Wagman, D.D.; '
                       'Medvedev, V.A.,CODATA Key Values for Thermodynamics, '
                       'Hemisphere Publishing Corp., New York, 1984, 1.  '
                       'CODATA Review value'},
 'formula': 'Ag',
 'identifier': 'Ag-solid',
 'is_public': True,
 'project': 'experimental_thermo'}


In [39]:
dumpfn(new_contribs, pipeline_dir / "2020-08-31_new_thermo_contribs.json")

In [46]:
new_contribs = loadfn(pipeline_dir / "2020-08-31_new_thermo_contribs.json")

### Clean `nan` out of the contribs

In [47]:
# Remove the 0 K block of a contribution when every value in it is "nan".
for d in new_contribs:
    zero_kelvin = d["data"].get("0K")
    if zero_kelvin and all("nan" in v for v in zero_kelvin.values()):
        del d["data"]["0K"]
        print("deleted {}".format(d["identifier"]))

deleted Al2O3-cr
deleted AlClO-cr
deleted AlI3-cr
deleted AlI3-crl
deleted BHO2-cr
deleted BaBr2-cr
deleted BaBr2-crl
deleted Be2C-cr
deleted Be2C-crl
deleted BeBr2-cr
deleted BeI2-cr
deleted BeI2-crl
deleted BeO-cr
deleted BeS-cr
deleted BeSO4-cr
deleted Ca-cr
deleted CaBr2-cr
deleted CaBr2-crl
deleted CoF3-cr
deleted Cr2N-cr
deleted CsF-cr
deleted CsF-crl
deleted CsHO-cr
deleted CsHO-crl
deleted CuCl-cr
deleted CuCl-crl
deleted Fe-cr
deleted Fe2(SO4)3-cr
deleted FeBr2-cr
deleted FeBr2-crl
deleted H4IN-cr
deleted Hf-cr
deleted HgBr-cr
deleted HgBr2-cr
deleted HgBr2-crl
deleted HgCl2-cr
deleted HgCl2-crl
deleted HgF-cr
deleted HgF2-cr
deleted HgF2-crl
deleted HgI-cr
deleted HgI-crl
deleted HgI2-cr
deleted HgI2-crl
deleted K2B4O7-cr
deleted K2B4O7-crl
deleted K2O-cr
deleted K2O2-cr
deleted K2S-cr
deleted K2S-crl
deleted K2SO4-cr
deleted K3AlF6-cr
deleted KBF4-cr
deleted KBF4-crl
deleted KH-cr
deleted Li2B4O7-cr
deleted Li2B4O7-crl
deleted Li2BeF4-cr
deleted Li2BeF4-crl
deleted Li2O2-cr


In [48]:
# Remove the 298 K block of a contribution when every value in it is "nan".
for d in new_contribs:
    room_temperature = d["data"].get("298K")
    if room_temperature and all("nan" in v for v in room_temperature.values()):
        del d["data"]["298K"]
        print("deleted {}".format(d["identifier"]))

In [49]:
# Remove the 298 K block when every value in it is missing or exactly zero.
# The numeric token of each "<number> <unit>" string is compared exactly: the
# earlier substring test `"0 " in v` also matched non-zero values that merely
# end in a zero digit, such as "-480 kJ/mol" or "100 J/degK/mol".
_empty_numbers_298K = ("nan", "0", "-0")
for d in new_contribs:
    room_temperature = d["data"].get("298K")
    if room_temperature and all(
        v.partition(" ")[0] in _empty_numbers_298K for v in room_temperature.values()
    ):
        del d["data"]["298K"]
        print("deleted {}".format(d["identifier"]))

deleted CCl3F-gas


In [50]:
# Remove the 0 K block when every value in it is missing or exactly zero.
# The numeric token of each "<number> <unit>" string is compared exactly: the
# earlier substring test `"0 " in v` also matched non-zero values that merely
# end in a zero digit, such as "-480 kJ/mol" or "100 J/degK/mol".
_empty_numbers_0K = ("nan", "0", "-0")
for d in new_contribs:
    zero_kelvin = d["data"].get("0K")
    if zero_kelvin and all(
        v.partition(" ")[0] in _empty_numbers_0K for v in zero_kelvin.values()
    ):
        del d["data"]["0K"]
        print("deleted {}".format(d["identifier"]))

deleted Al-cr
deleted Al-ref
deleted Al-crl
deleted B-ref
deleted B-cr
deleted B-crl
deleted Ba-cr
deleted Ba-ref
deleted Ba-crl
deleted Be-cr
deleted Be-ref
deleted Be-crl
deleted Br-ref
deleted Br-crl
deleted C-ref
deleted Ca-ref
deleted Ca-crl
deleted Cl2-ref
deleted Co-cr
deleted Co-ref
deleted Co-crl
deleted Cr-cr
deleted Cr-ref
deleted Cr-crl
deleted Cs-cr
deleted Cs-ref
deleted Cs-crl
deleted Cu-cr
deleted Cu-ref
deleted Cu-crl
deleted F2-ref
deleted Fe-ref
deleted Fe-crl
deleted Ga-cr
deleted Ga-ref
deleted Ga-crl
deleted H2-ref
deleted Hf-ref
deleted Hf-crl
deleted Hg-ref
deleted Hg-crl
deleted I-cr
deleted I-ref
deleted I-crl
deleted K-cr
deleted K-ref
deleted K-crl
deleted Li-cr
deleted Li-ref
deleted Li-crl
deleted Mg-cr
deleted Mg-ref
deleted Mg-crl
deleted Mn-cr
deleted Mn-ref
deleted Mn-crl
deleted Mo-cr
deleted Mo-ref
deleted Mo-crl
deleted MoCl4-crl
deleted N2-ref
deleted Na-cr
deleted Na-ref
deleted Na-crl
deleted Nb-cr
deleted Nb-ref
deleted Nb-crl
deleted Ni-cr
dele

### Fix `nan` values for the NIST electron gas

In [51]:
# Remove three 0 K quantities from the electron-gas reference entry.
# pop(..., None) and the .get() fallback keep the cell re-runnable: it no longer
# raises KeyError when the keys (or the whole 0 K block) are already gone.
for d in new_contribs:
    if d["identifier"] == "e--ref":
        zero_kelvin = d["data"].get("0K", {})
        for key in ("ΔGᶠ", "ΔHᶠ", "S"):
            zero_kelvin.pop(key, None)

### Submit both datasets to MPContribs

In [85]:
# need to delete contributions first due to unique_identifiers=False
# NOTE(review): the project was configured above with unique_identifiers=True — confirm
# which setting this comment refers to.
# CAUTION: presumably removes every existing contribution of project `name`; run only
# when a full re-submission follows.
client.delete_contributions(name)
#client.submit_contributions(new_contribs, per_page=10)#, skip_dupe_check=True)

HBox(children=(FloatProgress(value=0.0, max=2100.0), HTML(value='')))




In [78]:
len(new_contribs)

2160

In [86]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    The last slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

# Submit the contributions in batches and stop at the first batch that fails.
# `total` belongs to tqdm, not to chunks(): chunks() is a generator without a
# length, so the bar needs the number of batches spelled out.
batch_size = 10
n_batches = -(-len(new_contribs) // batch_size)  # ceiling division
for chunk in tqdm(chunks(new_contribs, batch_size), total=n_batches):
    try:
        client.contributions.create_entries(contributions=chunk).result()
    except Exception as exc:
        # show why the batch failed and which contributions it contained
        print("Submission failed: {!r}".format(exc))
        print(chunk)
        break

216it [19:33,  5.43s/it]


### Appendix
----

In [None]:
# NOTE(review): `break` outside a loop is a SyntaxError; presumably placed here on
# purpose to stop "Run All" before the appendix — confirm.
break