# Move Thermo data from MP Thermo to an MPContribs project

## Header

#### Global variables

In [None]:
# Name of the project root folder. The working-directory cell below cuts the
# current path off right after the first occurrence of this string.
PROJECT = 'Corrections'

#### Imports

In [None]:
import os
from pprint import pprint
from pathlib import Path
import re
from tqdm import tqdm
import numpy as np
import xlrd
from monty.serialization import loadfn, dumpfn

#### Set Working Directory

In [None]:
# Cut the current path off right after the first occurrence of PROJECT and
# make that folder the working directory. If PROJECT does not occur in the
# path, the pattern matches nothing and the current directory is kept.
_root_pattern = r"(?<={})[\w\W]*".format(PROJECT)
workdir = Path(re.sub(_root_pattern, "", str(Path.cwd())))
os.chdir(workdir)

# Sub-folders used by the rest of the notebook.
_analysis_dir = workdir / '3_data analysis'
data_dir = workdir / '2_raw data'
pipeline_dir = _analysis_dir / '2_pipeline'
output_dir = _analysis_dir / '3_output'

---
## Main Code

## Set up the project

In [None]:
from mpcontribs.client import Client
# Name of the target MPContribs project, as it appears in the project URL.
name = 'experimental_thermo' # this should be your project, see from the project URL
# API client used by every server call in this notebook.
client = Client() # uses MPCONTRIBS_API_KEY envvar

In [None]:
# Store a description of every data field in the project's "other" metadata.
# `name` (defined in the client set-up cell) is used instead of a repeated
# string literal so all project updates target the same project.
client.projects.update_entry(
    pk=name,
    project={
        "other": {
            "ΔHᶠ": "Enthalpy of formation from the elements. Polynomial: H° − H°298.15= A*t + B*t^2/2 + C*t^3/3 + D*t^4/4 − E/t + F − H",
            "ΔGᶠ": "Gibbs free energy of formation from the elements.",
            "S": "Absolute entropy. Polynomial: S° = A*ln(t) + B*t + C*t^2/2 + D*t^3/3 − E/(2*t^2) + G",
            "Cₚ": "Specific heat capacity. Polynomial: Cp° = A + B*t + C*t^2 + D*t^3 + E/t^2",
            "polynomial": "Coefficients for polynomials used to calculate temperature-dependent values of ΔHᶠ, S, or Cₚ.",
            "ΔT": "Range of temperatures over which polynomial coefficients are valid.",
            "composition": "String representation of pymatgen Composition of the material.",
            "phase": "Material phase, e.g. 'gas', 'liquid', 'solid', 'monoclinic', etc.",
        }
    },
).result()

In [None]:
# Set the project's author attribution. Uses `name` (client set-up cell)
# rather than a repeated string literal for the project key.
client.projects.update_entry(
    pk=name,
    project={"authors": "Various authors (see references). Data compiled by the Materials Project team."},
).result()

In [None]:
# Set the project's display title. Uses `name` (client set-up cell) rather
# than a repeated string literal for the project key.
client.projects.update_entry(
    pk=name,
    project={"title": "Thermochemistry Data"},
).result()

In [None]:
# Turn on the project's `unique_identifiers` flag. Uses `name` (client set-up
# cell) rather than a repeated string literal for the project key.
client.projects.update_entry(
    pk=name,
    project={"unique_identifiers": True},
).result()

In [None]:
# Register the literature/data sources shown on the project page. Uses `name`
# (client set-up cell) rather than a repeated string literal for the key.
client.projects.update_entry(
    pk=name,
    project={
        "references": [
            {"label": "Kubaschewski", "url": "https://www.worldcat.org/title/materials-thermochemistry/oclc/26724109"},
            {"label": "NIST", "url": "https://janaf.nist.gov/"},
        ]
    },
).result()

### Set the column order for display

In [None]:
# Set the order of columns and their desired units.
#
# Layout: identifying text fields, the four scalar quantities at 0 K and at
# 298 K, the polynomial coefficients A-H, the lower then upper bound of each
# coefficient's temperature range, and finally method and reference.
# Method and reference are text fields, so (like phase/composition/compound)
# they carry no unit; they were previously tagged "kJ/mol".
columns = (
    [
        {"path": "data.{}".format(field)}
        for field in ("phase", "composition", "compound")
    ]
    + [
        {"path": "data.{}.{}".format(temperature, quantity), "unit": unit}
        for temperature in ("0K", "298K")
        for quantity, unit in (
            ("ΔHᶠ", "kJ/mol"),
            ("ΔGᶠ", "kJ/mol"),
            ("S", "J/degK/mol"),
            ("Cₚ", "J/degK/mol"),
        )
    ]
    + [{"path": "data.polynomial.{}".format(term)} for term in "ABCDEFGH"]
    + [
        {"path": "data.ΔT.{}.{}".format(term, bound), "unit": "degK"}
        for bound in ("min", "max")
        for term in "ABCDEFGH"
    ]
    + [{"path": "data.method"}, {"path": "data.reference"}]
)
client.projects.update_entry(
    pk=name, project={"columns": columns}
).result()

In [None]:
# Fetch and display the project record to check the updates made above.
client.get_project(name)

## Collect the MP Thermochemical Data

Use `MPRester()` to retrieve all experimental thermochemical data currently hosted on materialsproject.org.

### Get a list of all unique formulas in MP

In [None]:
# Distinct pretty formulas of materials with at most two elements (so this
# includes single-element formulas as well as binaries).
# NOTE(review): `knowhere_mats` is not defined anywhere in this notebook;
# presumably a MongoDB materials collection created in another session — confirm.
binaries = knowhere_mats.distinct("pretty_formula", {"nelements": {"$lte": 2}})

In [None]:
# Distinct pretty formulas of materials with three or more elements.
# NOTE(review): `knowhere_mats` is not defined anywhere in this notebook;
# presumably a MongoDB materials collection created in another session — confirm.
ternary_plus = knowhere_mats.distinct("pretty_formula", {"nelements": {"$gte": 3}})

### Pull `ThermoData` objects from MPRester

In [None]:
# No other cell in this notebook brings MPRester into scope.
from pymatgen.ext.matproj import MPRester

# Collect the experimental thermo records for every formula in `binaries`.
all_thermo = []
with MPRester() as a:
    for f in tqdm(binaries):
        try:
            all_thermo.extend(a.get_exp_thermo_data(f))
        except Exception:
            # Best effort: skip formulas whose query fails. `Exception`
            # (rather than a bare except) keeps Ctrl-C able to stop the loop.
            continue

In [None]:
# Number of thermo records collected so far.
len(all_thermo)

In [None]:
# Spot-check the type of one record. The index is hard-coded, so this raises
# IndexError if fewer than 9550 records were collected.
type(all_thermo[9549])

In [None]:
# Append the experimental thermo records for every formula with three or more
# elements. Two fixes relative to the earlier version of this cell:
#   * it iterated over `ternaries`, a name that is never defined (the list
#     built above is called `ternary_plus`);
#   * it reset `all_thermo` first, which threw away the binary data.
with MPRester() as a:
    for f in tqdm(ternary_plus):
        try:
            all_thermo.extend(a.get_exp_thermo_data(f))
        except Exception:
            # Best effort: skip formulas whose query fails. `Exception`
            # (rather than a bare except) keeps Ctrl-C able to stop the loop.
            continue

In [None]:
# Append thermo records for formulas with three or more elements to the
# existing `all_thermo` list (the reset below is deliberately commented out).
# NOTE(review): if ternary data has already been appended in this session,
# running this cell adds the same records a second time.
#all_thermo = []
with MPRester() as a:
    for f in tqdm(ternary_plus):
        # No error handling here: an exception from the query aborts the loop.
        all_thermo.extend(a.get_exp_thermo_data(f))

In [None]:
# Write the collected records to a JSON file in the output folder.
dumpfn(all_thermo, output_dir / '2020-08-07 all MP Thermo data.json')

In [None]:
# Reload the records from the JSON file in the output folder, replacing
# whatever `all_thermo` currently holds.
all_thermo = loadfn(output_dir / '2020-08-07 all MP Thermo data.json')

### Convert `ThermoData` into a pandas dataframe

In [None]:
# Inspect the dict form of the first record to see which fields exist.
all_thermo[0].as_dict()

In [None]:
import pandas as pd
# One row per record; the columns are the keys of each record's as_dict().
mpthermo_df = pd.DataFrame([t.as_dict() for t in all_thermo])

In [None]:
# Remove the two serialization bookkeeping columns in a single call.
mpthermo_df = mpthermo_df.drop(columns=['@module', '@class'])

In [None]:
# Example: show every row whose formula is exactly "Ag".
mpthermo_df[mpthermo_df["formula"] == "Ag"]

### Each unique type of data needs to be a column

In [None]:
# Which distinct values occur in the `type` column?
mpthermo_df.type.unique()

### Each unique phase needs to be nested under formula

In [None]:
# Which distinct values occur in the `phaseinfo` column?
mpthermo_df.phaseinfo.unique()

### Create a pandas `Series` object with a multiindex and a dict of the data we need

In [None]:
from pymatgen import Composition

def create_dict(data):
    """Build one contribution dict from a group of MP Thermo rows.

    `data` is the sub-DataFrame of `mpthermo_df` that `groupby(...).apply`
    passes in: all rows sharing one formula / compound name / phase /
    reference. For every distinct `type` in the group only the first matching
    row is used.

    Returns a dict with the keys "project", "is_public", "identifier" (the
    reduced formula) and "data". "data" holds compound, composition, phase and
    reference, plus, depending on the rows present:
      * "polynomial" and "ΔT" for the coefficient types A-H,
      * "298K" for scalar values whose temperature range is [298, 298],
      * "method" for rows that name a method.

    Scalar values at any other temperature are reported with a print and
    skipped. Previously they were written into whichever dict the preceding
    loop iteration had selected (or raised NameError on the first iteration).
    """
    ret = {}
    comp = Composition(data.formula.unique()[0])

    ret["project"] = name
    ret["is_public"] = False
    ret["identifier"] = comp.reduced_formula
    ret["data"] = {}
    ret["data"]["compound"] = data.compound_name.unique()[0]
    ret["data"]["composition"] = str(comp)
    ret["data"]["phase"] = data.phaseinfo.unique()[0]
    ret["data"]["reference"] = data.ref.unique()[0]

    contents = ret["data"]

    for t in data.type.unique():
        # Filter once per type; every lookup below reads the first such row.
        rows = data[data["type"] == t]
        temp_range = rows["temp_range"].values[0]

        if t in ["A", "B", "C", "D", "E", "F", "G", "H"]:
            # Polynomial coefficient: value goes under "polynomial", its
            # validity range under "ΔT".
            base_dict = contents.setdefault("polynomial", {})
            col = t
            unit = "dimensionless"
            contents.setdefault("ΔT", {})[col] = {
                "min": "{} K".format(temp_range[0]),
                "max": "{} K".format(temp_range[1]),
            }
        else:
            if temp_range == [298, 298]:
                base_dict = contents.setdefault("298K", {})
            else:
                # No destination exists for this temperature; report and skip
                # instead of writing into a stale dict.
                print("Type: {}, T: {}".format(t, temp_range))
                continue

            if t == "S":
                unit = 'kJ/degK/mol'
                col = "S"
            elif t == "fH":
                col = "ΔHᶠ"
                unit = "kJ/mol"
            else:
                col = t
                unit = "dimensionless"

        # Values are stored as "<number> <unit>" strings.
        base_dict[col] = "{:0.5g} {}".format(rows["value"].values[0], unit)

        method = rows["method"].values[0]
        if method != "":
            contents.setdefault("method", {})[col] = method

    return ret


new_df = mpthermo_df.groupby(["formula","compound_name","phaseinfo","ref"]).apply(create_dict)
mpthermo_contribs = list(new_df)

In [None]:
# Inspect the first contribution dict.
mpthermo_contribs[0]

#### Reshape the dict so that data is nested under a key for each phase

In [None]:
from itertools import groupby

reshaped = []

# itertools.groupby only merges *adjacent* items with equal keys, so the
# contributions must be sorted by identifier first; otherwise one identifier
# can come out as several separate entries.
_by_identifier = sorted(mpthermo_contribs, key=lambda d: d["identifier"])

for formula, group in groupby(_by_identifier, key=lambda d: d["identifier"]):
    new_dict ={}
    new_dict["project"] = name
    new_dict["is_public"] = False
    new_dict["identifier"] = formula
    new_dict["data"] = {}

    for d in group:
        # Work on a shallow copy so the source contributions stay intact and
        # this cell can be re-run.
        phase_data = dict(d["data"])

        # The composition is kept once per identifier, at the top level.
        composition = phase_data.pop("composition")
        if not new_dict.get("composition"):
            new_dict["composition"] = composition

        # Missing or empty phase labels are filed under "n/a".
        phase = phase_data.get("phase", "n/a")
        if phase == "":
            phase = "n/a"

        # NOTE: a later record with the same phase label replaces an earlier one.
        new_dict["data"][phase] = phase_data
        if phase != "n/a":
            # The phase is now the key, so drop the redundant field.
            del phase_data["phase"]

    reshaped.append(new_dict)

In [None]:
# NOTE: this rebinds `pprint` from the function imported at the top of the
# notebook to the module, hence the `pprint.pprint(...)` spelling from here on.
import pprint
pprint.pprint(reshaped[0])

## NIST JANAF Data

#### Load the JANAF data from a CSV file

In [None]:
import pandas
# Read the JANAF table from the CSV file in the raw-data folder.
janaf_df= pandas.read_csv(data_dir / "2020-08-10 JANAF data from Ayush/mpcontribs_janaf_thermo.csv")

In [None]:
# Preview the first 20 rows.
janaf_df.head(20)

### Create a list of dicts for the contributions in the JANAF dataframe

In [None]:
def create_dict(data):
    """Build one contribution dict from a group of JANAF rows.

    NOTE: this redefines (shadows) the `create_dict` used for the MP Thermo
    data earlier in the notebook.

    `data` is the sub-DataFrame of `janaf_df` that `groupby(...).apply`
    passes in: all rows sharing one Formula / Name / Phase. Only the first row
    of each column is used.

    Returns a dict with the keys "project", "is_public", "data" and
    "identifier". The identifier is the reduced formula when the formula can
    be parsed as a Composition, otherwise the raw formula string. "data" holds
    composition, compound, phase, reference and the "0K" / "298K" values as
    "<number> <unit>" strings.
    """

    def _values_at(suffix):
        # Enthalpy and Gibbs energy columns are divided by 1000 before being
        # labelled kJ/mol (presumably stored in J/mol — confirm with the CSV).
        return {
            "ΔHᶠ": "{:0.6g} {}".format(data["DeltaH_" + suffix].values[0]/1000, "kJ/mol"),
            "ΔGᶠ": "{:0.6g} {}".format(data["DeltaG_" + suffix].values[0]/1000, "kJ/mol"),
            "S": "{:0.6g} {}".format(data["S_" + suffix].values[0], "J/degK/mol"),
            "Cₚ": "{:0.6g} {}".format(data["Cp_" + suffix].values[0], "J/degK/mol"),
        }

    ret = {}
    ret["project"] = name
    ret["is_public"] = False
    ret["data"] = {}

    formula = data.Formula.unique()[0]
    try:
        comp = Composition(formula)
        ret["identifier"] = comp.reduced_formula
        ret["data"]["composition"] = str(comp)
    except Exception:
        # Formula could not be parsed: fall back to the raw string.
        # `Exception` (not a bare except) so Ctrl-C is not swallowed.
        print('problem parsing formula {!r}; using it verbatim'.format(formula))
        ret["identifier"] = formula
        ret["data"]["composition"] = formula

    ret["data"]["compound"] = data.Name.unique()[0]
    ret["data"]["phase"] = data.Phase.unique()[0]
    # Replaces every occurrence of 'txt' in the link, not only a file extension.
    ret["data"]["reference"] = data.Link.unique()[0].replace('txt','html')

    ret["data"]["0K"] = _values_at("0")
    ret["data"]["298K"] = _values_at("298")

    return ret


new_df = janaf_df.groupby(["Formula","Name","Phase"]).apply(create_dict)
janaf_contribs = list(new_df)

In [None]:
# Inspect one JANAF contribution (index 10 is an arbitrary sample).
pprint.pprint(janaf_contribs[10])

#### Reshape the dict so that data is nested under a key for each phase

In [None]:
from itertools import groupby

reshaped_janaf = []

# itertools.groupby only merges *adjacent* items with equal keys, so the
# contributions must be sorted by identifier first; otherwise one identifier
# can come out as several separate entries.
_janaf_by_identifier = sorted(janaf_contribs, key=lambda d: d["identifier"])

for formula, group in groupby(_janaf_by_identifier, key=lambda d: d["identifier"]):
    new_dict ={}
    new_dict["project"] = name
    new_dict["is_public"] = False
    new_dict["identifier"] = formula
    new_dict["data"] = {}

    for d in group:
        # Work on a shallow copy so the source contributions stay intact and
        # this cell can be re-run.
        phase_data = dict(d["data"])

        # The composition is kept once per identifier, at the top level.
        composition = phase_data.pop("composition")
        if not new_dict.get("composition"):
            new_dict["composition"] = composition

        # Missing or empty phase labels are filed under "n/a".
        phase = phase_data.get("phase", "n/a")
        if phase == "":
            phase = "n/a"

        # NOTE: a later record with the same phase label replaces an earlier one.
        new_dict["data"][phase] = phase_data
        if phase != "n/a":
            # The phase is now the key, so drop the redundant field.
            del phase_data["phase"]

    reshaped_janaf.append(new_dict)

In [None]:
# `pprint` is bound to the module here, hence `pprint.pprint(...)`.
import pprint
pprint.pprint(reshaped_janaf[0])

In [None]:
# Show the first MP Thermo entry again for side-by-side comparison.
import pprint
pprint.pprint(reshaped[0])

### Merge the JANAF data with the MP Thermo data

In [None]:
# Shallow copy: the entry dicts are shared with `reshaped`, so phases added to
# an existing entry below show up in `all_contribs` as well.
all_contribs = reshaped[:]

# Index the MP Thermo entries by identifier once, instead of rebuilding and
# scanning the identifier list for every JANAF entry. If an identifier occurs
# more than once, the first entry wins (same as the previous list scan).
_mp_by_identifier = {}
for e in reshaped:
    _mp_by_identifier.setdefault(e["identifier"], e)

count=0
for d in reshaped_janaf:
    target_entry = _mp_by_identifier.get(d["identifier"])

    if target_entry is None:
        # Identifier unknown to MP Thermo: take the JANAF entry as a whole.
        all_contribs.append(d)
        continue

    # Identifier already present: add only the phases MP Thermo lacks.
    for k,v in d["data"].items():
        if target_entry["data"].get(k):
            print("Warning: phase {} already exists for id {} in MP Thermo data! Skipping.".format(k, d["identifier"]))
            count+=1
            continue
        target_entry["data"][k] = v

print("Skipped {} duplicate entries".format(count))

In [None]:
# Inspect the first merged entry.
pprint.pprint(all_contribs[0])

In [None]:
## Move the composition from the top level of each entry into its "data" dict
for entry in all_contribs:
    entry["data"]["composition"] = entry.pop("composition")

#### Remap phase keys that contain punctuation

In [None]:
# Maps phase labels that contain punctuation or spaces (plus the bare "a" and
# "ortho" labels) to the plain label used as the key instead.
# Two source labels ("orth./1" and "ortho") map to the same target, "orth".
replace = {"#-qtz":"βqtz",
           "a": "α",
           "a -cris":"αcrys",
           "a -qtz":"αqtz",
           "nit.ba": "nitba",
           "orth./1":"orth",
           "ortho":"orth",
           "r.tet":"rtet",
           "tet/cu":"tetcu",
           "n/a":"none",
           "cr,l":"crl"
          }

In [None]:
# Rename the phase keys of every entry according to the `replace` mapping.
# A renamed key overwrites any data already stored under the target label.
for entry in all_contribs:
    phases = entry["data"]
    for old_label, new_label in replace.items():
        if phases.get(old_label):
            phases[new_label] = phases.pop(old_label)

In [None]:
# Inspect the first entry after the phase keys were renamed.
pprint.pprint(all_contribs[0])

### Reshape data again so that each formula+phase is a unique contribution with a unique identifier

In [None]:
# Split every entry into one contribution per phase, identified by
# "<identifier>-<phase>".
new_contribs = []
for entry in all_contribs:
    phases = entry["data"]
    for phase, phase_data in phases.items():
        # "composition" is stored next to the phase keys but is not a phase.
        if phase == 'composition':
            continue

        record = {
            "identifier": str(entry["identifier"] + "-" + phase),
            "formula": entry["identifier"],
            "is_public": True,
            "project": entry["project"],
            "data": phase_data,
        }
        # The phase dict is reused (not copied) and annotated in place.
        record["data"]["phase"] = phase
        record["data"]["composition"] = phases["composition"]
        new_contribs.append(record)

    

In [None]:
# Inspect the first per-phase contribution.
pprint.pprint(new_contribs[0])

In [None]:
# Inspect another per-phase contribution (index 2 is an arbitrary sample).
pprint.pprint(new_contribs[2])

In [None]:
# Write the per-phase contributions to a JSON file in the pipeline folder.
dumpfn(new_contribs, pipeline_dir / "2020-08-31_new_thermo_contribs.json")

In [None]:
# Reload the per-phase contributions from the JSON file in the pipeline
# folder, replacing whatever `new_contribs` currently holds.
new_contribs = loadfn(pipeline_dir / "2020-08-31_new_thermo_contribs.json")

### Clean `nan` out of the contribs

In [None]:
# Drop the "0K" section of a contribution when every value in it is nan.
for contrib in new_contribs:
    zero_k = contrib["data"].get("0K")
    if zero_k and all("nan" in value for value in zero_k.values()):
        del contrib["data"]["0K"]
        print("deleted {}".format(contrib["identifier"]))

In [None]:
# Drop the "298K" section of a contribution when every value in it is nan.
for contrib in new_contribs:
    room_temp = contrib["data"].get("298K")
    if room_temp and all("nan" in value for value in room_temp.values()):
        del contrib["data"]["298K"]
        print("deleted {}".format(contrib["identifier"]))

In [None]:
def _is_nan_or_zero(value):
    """Return True if a "<number> <unit>" string holds nan or exactly zero.

    The number is parsed rather than searched for the substring "0 ", which
    would also match any value ending in the digit 0 (e.g. "250 kJ/mol").
    """
    if "nan" in value:
        return True
    try:
        return float(value.split()[0]) == 0
    except (ValueError, IndexError):
        # Not in "<number> <unit>" form: treat it as real data.
        return False


# Drop the "298K" section when every value in it is either nan or zero.
for d in new_contribs:
    if d["data"].get("298K"):
        if all(_is_nan_or_zero(v) for v in d["data"]["298K"].values()):
            del d["data"]["298K"]
            print("deleted {}".format(d["identifier"]))

In [None]:
def _is_nan_or_zero(value):
    """Return True if a "<number> <unit>" string holds nan or exactly zero.

    The number is parsed rather than searched for the substring "0 ", which
    would also match any value ending in the digit 0 (e.g. "250 kJ/mol").
    """
    if "nan" in value:
        return True
    try:
        return float(value.split()[0]) == 0
    except (ValueError, IndexError):
        # Not in "<number> <unit>" form: treat it as real data.
        return False


# Drop the "0K" section when every value in it is either nan or zero.
for d in new_contribs:
    if d["data"].get("0K"):
        if all(_is_nan_or_zero(v) for v in d["data"]["0K"].values()):
            del d["data"]["0K"]
            print("deleted {}".format(d["identifier"]))

### Fix `nan` values for the NIST electron gas

In [None]:
# For the electron-gas reference entry, remove three of its 0 K values
# (Gibbs energy, enthalpy and entropy); any other 0 K value is kept.
for contrib in new_contribs:
    if contrib["identifier"] != "e--ref":
        continue
    zero_k = contrib["data"]["0K"]
    for key in ("ΔGᶠ", "ΔHᶠ", "S"):
        del zero_k[key]

### Submit both datasets to MPContribs

In [None]:
# Delete the project's existing contributions before submitting the new set.
# NOTE(review): the original note gave unique_identifiers=False as the reason,
# but an earlier cell in this notebook sets "unique_identifiers": True —
# confirm which setting is in effect.
client.delete_contributions(name)
# Alternative single-call submission, currently disabled:
#client.submit_contributions(new_contribs, per_page=10)#, skip_dupe_check=True)

In [None]:
# Number of contributions about to be submitted.
len(new_contribs)

In [None]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of length ``n``.

    The last slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

# Submit the contributions in batches and stop at the first batch that fails.
# `total` belongs to tqdm (the generator has no length), not to chunks(); it
# is the batch count rounded up so a final partial batch is counted.
_chunk_size = 10
_n_chunks = (len(new_contribs) + _chunk_size - 1) // _chunk_size

for chunk in tqdm(chunks(new_contribs, _chunk_size), total=_n_chunks):
    try:
        client.contributions.create_entries(contributions=chunk).result()
    except Exception as exc:
        # Show the offending batch and the reason, then stop submitting.
        print(chunk)
        print("Submission failed: {!r}".format(exc))
        break