From 4d4f850c28936a5c697cfb1de6b7b95a07b9b6e9 Mon Sep 17 00:00:00 2001
From: Ryan Kingsbury <RKingsbury@lbl.gov>
Date: Fri, 1 Apr 2022 12:03:59 -0700
Subject: [PATCH] add notebook for uploading MP and NIST thermo data

---
 .../experimental_thermo.ipynb                 | 1068 +++++++++++++++++
 1 file changed, 1068 insertions(+)
 create mode 100644 mpcontribs-portal/notebooks/contribs.materialsproject.org/experimental_thermo.ipynb

diff --git a/mpcontribs-portal/notebooks/contribs.materialsproject.org/experimental_thermo.ipynb b/mpcontribs-portal/notebooks/contribs.materialsproject.org/experimental_thermo.ipynb
new file mode 100644
index 000000000..a02e07d84
--- /dev/null
+++ b/mpcontribs-portal/notebooks/contribs.materialsproject.org/experimental_thermo.ipynb
@@ -0,0 +1,1068 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Move Thermo data from MP Thermo to an MPContribs project"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Header"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Global variables"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "PROJECT = 'Corrections'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from pprint import pprint\n",
+    "from pathlib import Path\n",
+    "import re\n",
+    "from tqdm import tqdm\n",
+    "import numpy as np\n",
+    "import xlrd\n",
+    "from monty.serialization import loadfn, dumpfn"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Set Working Directory"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "workdir = Path(re.sub(r\"(?<={})[\\w\\W]*\".format(PROJECT), \"\", str(Path.cwd())))\n",
+    "os.chdir(workdir)\n",
+    "\n",
+    "data_dir = workdir / '2_raw data'\n",
+    "pipeline_dir = workdir / '3_data analysis' / '2_pipeline'\n",
+    "output_dir = workdir / '3_data analysis' / '3_output'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Main Code"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Set up the project"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from mpcontribs.client import Client\n",
+    "name = 'experimental_thermo' # this should be your project, see from the project URL\n",
+    "client = Client() # uses MPCONTRIBS_API_KEY envvar"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client.projects.update_entry(\n",
+    "    pk=\"experimental_thermo\", project={\"other\": \n",
+    "                                         {\"ΔHᶠ\": \"Enthalpy of formation from the elements. Polynomial: H° − H°298.15= A*t + B*t^2/2 + C*t^3/3 + D*t^4/4 − E/t + F − H\",\n",
+    "                                          \"ΔGᶠ\": \"Gibbs free energy of formation from the elements.\",\n",
+    "                                         \"S\": \"Absolute entropy. Polynomial: S° = A*ln(t) + B*t + C*t^2/2 + D*t^3/3 − E/(2*t^2) + G\",\n",
+    "                                          \"Cₚ\": \"Specific heat capacity. Polynomial: Cp° = A + B*t + C*t^2 + D*t^3 + E/t^2\",\n",
+    "                                         \"polynomial\": \"Coefficients for polynomials used to calculate temperature-dependent values of ΔHᶠ, S, or Cₚ.\",\n",
+    "                                          \"ΔT\": \"Range of temperatures over which polynomial coefficients are valid.\",\n",
+    "                                          \"composition\": \"String representation of pymatgen Composition of the material.\",\n",
+    "                                          \"phase\": \"Material phase, e.g. 'gas', 'liquid', 'solid', 'monoclinic', etc.\"\n",
+    "                                         }\n",
+    "                                        }\n",
+    ").result()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client.projects.update_entry(\n",
+    "    pk=\"experimental_thermo\", project={\"authors\": \"Various authors (see references). Data compiled by the Materials Project team.\"\n",
+    "                                        }\n",
+    ").result()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client.projects.update_entry(\n",
+    "    pk=\"experimental_thermo\", project={\"title\": \"Thermochemistry Data\"\n",
+    "                                        }\n",
+    ").result()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client.projects.update_entry(\n",
+    "    pk=\"experimental_thermo\", project={\"unique_identifiers\": True\n",
+    "                                        }\n",
+    ").result()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client.projects.update_entry(\n",
+    "    pk=\"experimental_thermo\", project={\"references\": [\n",
+    "    {\"label\":\"Kubaschewski\", \"url\":\"https://www.worldcat.org/title/materials-thermochemistry/oclc/26724109\"},\n",
+    "    {\"label\":\"NIST\", \"url\":\"https://janaf.nist.gov/\"},]}\n",
+    ").result()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Set the column order for display"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# set order of columns and their desired units (text columns such as phase, method or reference carry no unit)\n",
+    "columns = [\n",
+    "    {\"path\": \"data.phase\"},\n",
+    "    {\"path\": \"data.composition\"},\n",
+    "    {\"path\": \"data.compound\"},\n",
+    "    {\"path\": \"data.0K.ΔHᶠ\", \"unit\": \"kJ/mol\"},\n",
+    "    {\"path\": \"data.0K.ΔGᶠ\", \"unit\": \"kJ/mol\"},\n",
+    "    {\"path\": \"data.0K.S\", \"unit\": \"J/degK/mol\"},\n",
+    "    {\"path\": \"data.0K.Cₚ\", \"unit\": \"J/degK/mol\"},\n",
+    "    {\"path\": \"data.298K.ΔHᶠ\", \"unit\": \"kJ/mol\"},\n",
+    "    {\"path\": \"data.298K.ΔGᶠ\", \"unit\": \"kJ/mol\"},\n",
+    "    {\"path\": \"data.298K.S\", \"unit\": \"J/degK/mol\"},\n",
+    "    {\"path\": \"data.298K.Cₚ\", \"unit\": \"J/degK/mol\"},\n",
+    "    {\"path\": \"data.polynomial.A\"},\n",
+    "    {\"path\": \"data.polynomial.B\"},\n",
+    "    {\"path\": \"data.polynomial.C\"},\n",
+    "    {\"path\": \"data.polynomial.D\"},\n",
+    "    {\"path\": \"data.polynomial.E\"},\n",
+    "    {\"path\": \"data.polynomial.F\"},\n",
+    "    {\"path\": \"data.polynomial.G\"},\n",
+    "    {\"path\": \"data.polynomial.H\"},\n",
+    "    {\"path\": \"data.ΔT.A.min\", \"unit\": \"degK\"},\n",
+    "    {\"path\": \"data.ΔT.B.min\", \"unit\": \"degK\"},\n",
+    "    {\"path\": \"data.ΔT.C.min\", \"unit\": \"degK\"},\n",
+    "    {\"path\": \"data.ΔT.D.min\", \"unit\": \"degK\"},\n",
+    "    {\"path\": \"data.ΔT.E.min\", \"unit\": \"degK\"},\n",
+    "    {\"path\": \"data.ΔT.F.min\", \"unit\": \"degK\"},\n",
+    "    {\"path\": \"data.ΔT.G.min\", \"unit\": \"degK\"},\n",
+    "    {\"path\": \"data.ΔT.H.min\", \"unit\": \"degK\"},\n",
+    "    {\"path\": \"data.ΔT.A.max\", \"unit\": \"degK\"},\n",
+    "    {\"path\": \"data.ΔT.B.max\", \"unit\": \"degK\"},\n",
+    "    {\"path\": \"data.ΔT.C.max\", \"unit\": \"degK\"},\n",
+    "    {\"path\": \"data.ΔT.D.max\", \"unit\": \"degK\"},\n",
+    "    {\"path\": \"data.ΔT.E.max\", \"unit\": \"degK\"},\n",
+    "    {\"path\": \"data.ΔT.F.max\", \"unit\": \"degK\"},\n",
+    "    {\"path\": \"data.ΔT.G.max\", \"unit\": \"degK\"},\n",
+    "    {\"path\": \"data.ΔT.H.max\", \"unit\": \"degK\"},\n",
+    "    {\"path\": \"data.method\"},\n",
+    "    {\"path\": \"data.reference\"},\n",
+    "]\n",
+    "client.projects.update_entry(\n",
+    "    pk=name, project={\"columns\": columns}\n",
+    ").result()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client.get_project(name)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Collect the MP Thermochemical Data\n",
+    "\n",
+    "Use `MPRester` to retrieve all experimental thermochemical data currently hosted on materialsproject.org."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Get a list of all unique formulas in MP"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "binaries = knowhere_mats.distinct(\"pretty_formula\", {\"nelements\": {\"$lte\": 2}})  # NOTE(review): knowhere_mats is never defined in this notebook; presumably a materials database collection -- confirm and add its setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ternary_plus = knowhere_mats.distinct(\"pretty_formula\", {\"nelements\": {\"$gte\": 3}})  # NOTE(review): knowhere_mats is never defined in this notebook; presumably a materials database collection -- confirm and add its setup"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Pull `ThermoData` objects from MPRester"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pymatgen.ext.matproj import MPRester  # MPRester is used below but imported nowhere else in this notebook\n",
+    "all_thermo = []\n",
+    "with MPRester() as a:\n",
+    "    for f in tqdm(binaries):\n",
+    "        try:\n",
+    "            all_thermo.extend(a.get_exp_thermo_data(f))\n",
+    "        except Exception: continue  # best effort: skip formulas whose lookup fails, but let Ctrl-C through"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(all_thermo)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "type(all_thermo[9549])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_thermo = []  # NOTE(review): this discards the results collected for `binaries` in the cell above\n",
+    "with MPRester() as a:\n",
+    "    for f in tqdm(ternaries):  # NOTE(review): `ternaries` is not defined in this notebook (only `ternary_plus` is); this cell looks superseded by the next one\n",
+    "        try:\n",
+    "            all_thermo.extend(a.get_exp_thermo_data(f))\n",
+    "        except:\n",
+    "            continue"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#all_thermo = []\n",
+    "with MPRester() as a:\n",
+    "    for f in tqdm(ternary_plus):\n",
+    "        all_thermo.extend(a.get_exp_thermo_data(f))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dumpfn(all_thermo, output_dir / '2020-08-07 all MP Thermo data.json')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_thermo = loadfn(output_dir / '2020-08-07 all MP Thermo data.json')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Convert `ThermoData` into a pandas dataframe"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_thermo[0].as_dict()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "mpthermo_df = pd.DataFrame([t.as_dict() for t in all_thermo])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# drop the unneeded columns\n",
+    "mpthermo_df = mpthermo_df.drop('@module', axis=1)\n",
+    "mpthermo_df = mpthermo_df.drop('@class', axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mpthermo_df[mpthermo_df[\"formula\"] == \"Ag\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Each unique type of data needs to be a column"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# what unique types of data do we have?\n",
+    "mpthermo_df.type.unique()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Each unique phase needs to be nested under formula"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# what unique phases do we have?\n",
+    "mpthermo_df.phaseinfo.unique()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Create a pandas `Series` object with a multiindex and a dict of the data we need"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pymatgen.core.composition import Composition  # root-level `from pymatgen import Composition` was removed in pymatgen 2022.0\n",
+    "\n",
+    "def create_dict(data):\n",
+    "    \"\"\"Build one contribution dict from the MP thermo rows of a single group.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    data : pandas.DataFrame\n",
+    "        Rows sharing one (formula, compound_name, phaseinfo, ref), as passed by\n",
+    "        the groupby at the bottom of this cell.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    dict\n",
+    "        Contribution with project, is_public, identifier and a nested data dict.\n",
+    "\n",
+    "    Notes\n",
+    "    -----\n",
+    "    Every distinct `type` in the group yields one entry, taken from the first\n",
+    "    row of that type. Types A-H are stored under data.polynomial, with their\n",
+    "    temperature range under data.ΔT. Any other type reported at [298, 298] is\n",
+    "    stored under data.298K; other temperatures are printed and skipped.\n",
+    "    \"\"\"\n",
+    "    comp = Composition(data.formula.unique()[0])\n",
+    "\n",
+    "    ret = {\"project\": name, \"is_public\": False, \"identifier\": comp.reduced_formula}\n",
+    "    ret[\"data\"] = {\n",
+    "        \"compound\": data.compound_name.unique()[0],\n",
+    "        \"composition\": str(comp),\n",
+    "        \"phase\": data.phaseinfo.unique()[0],\n",
+    "        \"reference\": data.ref.unique()[0],\n",
+    "    }\n",
+    "\n",
+    "    for t in data.type.unique():\n",
+    "        rows = data[data[\"type\"] == t]  # only the first matching row is used below\n",
+    "        temp_range = rows[\"temp_range\"].values[0]\n",
+    "        value = rows[\"value\"].values[0]\n",
+    "        method = rows[\"method\"].values[0]\n",
+    "\n",
+    "        if t in (\"A\", \"B\", \"C\", \"D\", \"E\", \"F\", \"G\", \"H\"):\n",
+    "            # polynomial coefficient: keep the value and the range it is valid for\n",
+    "            base_dict = ret[\"data\"].setdefault(\"polynomial\", {})\n",
+    "            delta_t = ret[\"data\"].setdefault(\"ΔT\", {})\n",
+    "            col, unit = t, \"dimensionless\"\n",
+    "            delta_t[col] = {\"min\": \"{} K\".format(temp_range[0]),\n",
+    "                            \"max\": \"{} K\".format(temp_range[1])}\n",
+    "        elif temp_range == [298, 298]:\n",
+    "            base_dict = ret[\"data\"].setdefault(\"298K\", {})\n",
+    "            if t == \"S\":\n",
+    "                # NOTE(review): the project columns declare S in J/degK/mol; confirm kJ/degK/mol is right here\n",
+    "                col, unit = \"S\", \"kJ/degK/mol\"\n",
+    "            elif t == \"fH\":\n",
+    "                col, unit = \"ΔHᶠ\", \"kJ/mol\"\n",
+    "            else:\n",
+    "                col, unit = t, \"dimensionless\"\n",
+    "        else:\n",
+    "            # There is no target dict for this temperature. Falling through would write\n",
+    "            # into whichever dict the previous type used (or raise NameError on the\n",
+    "            # first type), so report the datum and skip it instead.\n",
+    "            print(\"Type: {}, T: {}\".format(t, temp_range))\n",
+    "            continue\n",
+    "\n",
+    "        base_dict[col] = \"{:0.5g} {}\".format(value, unit)\n",
+    "\n",
+    "        # record the method only when one is given\n",
+    "        if method != \"\":\n",
+    "            ret[\"data\"].setdefault(\"method\", {})[col] = method\n",
+    "\n",
+    "    return ret\n",
+    "\n",
+    "\n",
+    "# one contribution per (formula, compound name, phase, reference)\n",
+    "new_df = mpthermo_df.groupby([\"formula\",\"compound_name\",\"phaseinfo\",\"ref\"]).apply(create_dict)\n",
+    "mpthermo_contribs = list(new_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mpthermo_contribs[0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Reshape the dict so that data is nested under a key for each phase"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from itertools import groupby\n",
+    "from operator import itemgetter\n",
+    "\n",
+    "by_identifier = itemgetter(\"identifier\")\n",
+    "reshaped = []\n",
+    "\n",
+    "# itertools.groupby only merges adjacent items, so sort by identifier first (sorted() is stable,\n",
+    "# so entries of one formula keep their order); otherwise a formula could end up in several groups\n",
+    "for formula, group in groupby(sorted(mpthermo_contribs, key=by_identifier), key=by_identifier):\n",
+    "    new_dict = {\"project\": name, \"is_public\": False, \"identifier\": formula, \"data\": {}}\n",
+    "\n",
+    "    for d in group:\n",
+    "        # work on a shallow copy so that mpthermo_contribs is left intact and this cell can be re-run\n",
+    "        entry = dict(d[\"data\"])\n",
+    "        composition = entry.pop(\"composition\")\n",
+    "        if not new_dict.get(\"composition\"):\n",
+    "            new_dict[\"composition\"] = composition  # kept once per formula, outside the per-phase data\n",
+    "\n",
+    "        phase = entry.get(\"phase\", \"n/a\")\n",
+    "        if phase == \"\":\n",
+    "            phase = \"n/a\"\n",
+    "        if phase != \"n/a\":\n",
+    "            del entry[\"phase\"]\n",
+    "        new_dict[\"data\"][phase] = entry  # a later entry with the same phase replaces an earlier one\n",
+    "\n",
+    "    reshaped.append(new_dict)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pprint\n",
+    "pprint.pprint(reshaped[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## NIST JANAF Data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Load the JANAF data from a CSV file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas\n",
+    "janaf_df= pandas.read_csv(data_dir / \"2020-08-10 JANAF data from Ayush/mpcontribs_janaf_thermo.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "janaf_df.head(20)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Create a list of dicts for the contributions in the JANAF dataframe"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_dict(data):\n",
+    "    \"\"\"Build one contribution dict from the JANAF rows of one (Formula, Name, Phase) group.\n",
+    "\n",
+    "    Numeric values are read from the first row of `data`. DeltaH/DeltaG columns are divided by\n",
+    "    1000 and tagged kJ/mol; S and Cp columns are tagged J/degK/mol. If the formula cannot be\n",
+    "    parsed by Composition, the raw formula string is used as identifier and composition.\n",
+    "    NOTE: this reuses the name of the create_dict defined for the MP thermo data.\n",
+    "    \"\"\"\n",
+    "    formula = data.Formula.unique()[0]\n",
+    "    ret = {\"project\": name, \"is_public\": False, \"data\": {}}\n",
+    "\n",
+    "    try:\n",
+    "        comp = Composition(formula)\n",
+    "        ret[\"identifier\"] = comp.reduced_formula\n",
+    "        ret[\"data\"][\"composition\"] = str(comp)\n",
+    "    except Exception as exc:  # was a bare except, which also swallowed KeyboardInterrupt\n",
+    "        print('problem', formula, repr(exc))\n",
+    "        ret[\"identifier\"] = formula\n",
+    "        ret[\"data\"][\"composition\"] = formula\n",
+    "\n",
+    "    ret[\"data\"][\"compound\"] = data.Name.unique()[0]\n",
+    "    ret[\"data\"][\"phase\"] = data.Phase.unique()[0]\n",
+    "    ret[\"data\"][\"reference\"] = data.Link.unique()[0].replace('txt','html')\n",
+    "\n",
+    "    # the 0 K and 298 K blocks share one layout and differ only in the column suffix\n",
+    "    for label, suffix in ((\"0K\", \"0\"), (\"298K\", \"298\")):\n",
+    "        ret[\"data\"][label] = {\n",
+    "            \"ΔHᶠ\": \"{:0.6g} {}\".format(data[\"DeltaH_\" + suffix].values[0]/1000, \"kJ/mol\"),\n",
+    "            \"ΔGᶠ\": \"{:0.6g} {}\".format(data[\"DeltaG_\" + suffix].values[0]/1000, \"kJ/mol\"),\n",
+    "            \"S\": \"{:0.6g} {}\".format(data[\"S_\" + suffix].values[0], \"J/degK/mol\"),\n",
+    "            \"Cₚ\": \"{:0.6g} {}\".format(data[\"Cp_\" + suffix].values[0], \"J/degK/mol\"),\n",
+    "        }\n",
+    "\n",
+    "    return ret\n",
+    "\n",
+    "new_df = janaf_df.groupby([\"Formula\",\"Name\",\"Phase\"]).apply(create_dict)\n",
+    "janaf_contribs = list(new_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pprint.pprint(janaf_contribs[10])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Reshape the dict so that data is nested under a key for each phase"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from itertools import groupby\n",
+    "from operator import itemgetter\n",
+    "\n",
+    "by_identifier = itemgetter(\"identifier\")\n",
+    "reshaped_janaf = []\n",
+    "\n",
+    "# itertools.groupby only merges adjacent items, so sort by identifier first (sorted() is stable,\n",
+    "# so entries of one formula keep their order); otherwise a formula could end up in several groups\n",
+    "for formula, group in groupby(sorted(janaf_contribs, key=by_identifier), key=by_identifier):\n",
+    "    new_dict = {\"project\": name, \"is_public\": False, \"identifier\": formula, \"data\": {}}\n",
+    "\n",
+    "    for d in group:\n",
+    "        # work on a shallow copy so that janaf_contribs is left intact and this cell can be re-run\n",
+    "        entry = dict(d[\"data\"])\n",
+    "        composition = entry.pop(\"composition\")\n",
+    "        if not new_dict.get(\"composition\"):\n",
+    "            new_dict[\"composition\"] = composition  # kept once per formula, outside the per-phase data\n",
+    "\n",
+    "        phase = entry.get(\"phase\", \"n/a\")\n",
+    "        if phase == \"\":\n",
+    "            phase = \"n/a\"\n",
+    "        if phase != \"n/a\":\n",
+    "            del entry[\"phase\"]\n",
+    "\n",
+    "        new_dict[\"data\"][phase] = entry  # a later entry with the same phase replaces an earlier one\n",
+    "\n",
+    "    reshaped_janaf.append(new_dict)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pprint\n",
+    "pprint.pprint(reshaped_janaf[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pprint\n",
+    "pprint.pprint(reshaped[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Merge the JANAF data with the MP Thermo data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_contribs = reshaped[:]\n",
+    "\n",
+    "# index MP thermo entries once instead of rescanning per JANAF entry; reversed() lets the first entry win, as before\n",
+    "mp_by_id = {e[\"identifier\"]: e for e in reversed(reshaped)}\n",
+    "count = 0\n",
+    "for d in reshaped_janaf:\n",
+    "    target_entry = mp_by_id.get(d[\"identifier\"])\n",
+    "    if target_entry is None:\n",
+    "        all_contribs.append(d)  # identifier unknown to MP thermo: take the JANAF entry as is\n",
+    "        continue\n",
+    "    for k, v in d[\"data\"].items():  # add only the NIST phases that MP thermo lacks\n",
+    "        if target_entry[\"data\"].get(k):\n",
+    "            print(\"Warning: phase {} already exists for id {} in MP Thermo data! Skipping.\".format(k, d[\"identifier\"]))\n",
+    "            count += 1\n",
+    "        else:\n",
+    "            target_entry[\"data\"][k] = v\n",
+    "\n",
+    "print(\"Skipped {} duplicate entries\".format(count))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pprint.pprint(all_contribs[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Fix the position of the composition key\n",
+    "for e in all_contribs:\n",
+    "    e[\"data\"][\"composition\"] = e[\"composition\"]\n",
+    "    del e[\"composition\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Remap phase keys that contain punctuation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "replace = {\"#-qtz\":\"βqtz\",\n",
+    "           \"a\": \"α\",\n",
+    "           \"a -cris\":\"αcrys\",\n",
+    "           \"a -qtz\":\"αqtz\",\n",
+    "           \"nit.ba\": \"nitba\",\n",
+    "           \"orth./1\":\"orth\",\n",
+    "           \"ortho\":\"orth\",\n",
+    "           \"r.tet\":\"rtet\",\n",
+    "           \"tet/cu\":\"tetcu\",\n",
+    "           \"n/a\":\"none\",\n",
+    "           \"cr,l\":\"crl\"\n",
+    "          }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for e in all_contribs:\n",
+    "    for k in replace.keys():\n",
+    "        if e[\"data\"].get(k):\n",
+    "            e[\"data\"][replace[k]] = e[\"data\"].pop(k)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pprint.pprint(all_contribs[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Reshape data again so that each formula+phase is a unique contribution with a unique identifier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "new_contribs = []\n",
+    "for d in all_contribs:\n",
+    "    # split every formula-level entry into one contribution per phase, identified as formula-phase\n",
+    "    for phase, phase_data in d[\"data\"].items():\n",
+    "        if phase == 'composition':\n",
+    "            continue\n",
+    "        phase_data[\"phase\"] = phase\n",
+    "        phase_data[\"composition\"] = d[\"data\"][\"composition\"]\n",
+    "        new_contribs.append({\n",
+    "            \"identifier\": str(d[\"identifier\"]+\"-\"+phase),\n",
+    "            \"formula\": d[\"identifier\"],\n",
+    "            \"is_public\": True,\n",
+    "            \"project\": d[\"project\"],\n",
+    "            \"data\": phase_data,\n",
+    "        })\n",
+    "\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pprint.pprint(new_contribs[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pprint.pprint(new_contribs[2])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dumpfn(new_contribs, pipeline_dir / \"2020-08-31_new_thermo_contribs.json\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "new_contribs = loadfn(pipeline_dir / \"2020-08-31_new_thermo_contribs.json\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Clean `nan` out of the contribs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for d in new_contribs:\n",
+    "    zero_k = d[\"data\"].get(\"0K\")\n",
+    "    if zero_k and all(\"nan\" in text for text in zero_k.values()):  # drop the block when every value is nan\n",
+    "        del d[\"data\"][\"0K\"]\n",
+    "        print(\"deleted {}\".format(d[\"identifier\"]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for d in new_contribs:\n",
+    "    room_temp = d[\"data\"].get(\"298K\")\n",
+    "    if room_temp and all(\"nan\" in text for text in room_temp.values()):  # drop the block when every value is nan\n",
+    "        del d[\"data\"][\"298K\"]\n",
+    "        print(\"deleted {}\".format(d[\"identifier\"]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for d in new_contribs:\n",
+    "    vals = d[\"data\"].get(\"298K\")  # `\"0 \" in v` also matched e.g. \"250 kJ/mol\", so compare the number token itself\n",
+    "    if vals and all(v.split(\" \", 1)[0] in (\"nan\", \"0\", \"-0\") for v in vals.values()):\n",
+    "        del d[\"data\"][\"298K\"]\n",
+    "        print(\"deleted {}\".format(d[\"identifier\"]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for d in new_contribs:\n",
+    "    vals = d[\"data\"].get(\"0K\")  # `\"0 \" in v` also matched e.g. \"250 kJ/mol\", so compare the number token itself\n",
+    "    if vals and all(v.split(\" \", 1)[0] in (\"nan\", \"0\", \"-0\") for v in vals.values()):\n",
+    "        del d[\"data\"][\"0K\"]\n",
+    "        print(\"deleted {}\".format(d[\"identifier\"]))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Fix `nan` values for the NIST electron gas"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for d in new_contribs:\n",
+    "    if d[\"identifier\"] == \"e--ref\":\n",
+    "        zero_k = d[\"data\"].get(\"0K\", {})  # the 0K block may already have been removed by the nan clean-up\n",
+    "        for key in (\"ΔGᶠ\", \"ΔHᶠ\", \"S\"):\n",
+    "            zero_k.pop(key, None)  # pop instead of del so that re-running this cell does not raise KeyError"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Submit both datasets to MPContribs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# need to delete contributions first due to unique_identifiers=False (NOTE(review): the project setup cell sets unique_identifiers to True -- confirm which applies)\n",
+    "client.delete_contributions(name)\n",
+    "#client.submit_contributions(new_contribs, per_page=10)#, skip_dupe_check=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(new_contribs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def chunks(lst, n):\n",
+    "    \"\"\"Yield successive n-sized chunks from lst (the last chunk may be shorter).\"\"\"\n",
+    "    for i in range(0, len(lst), n):\n",
+    "        yield lst[i:i + n]\n",
+    "# `total` is a tqdm argument, not a chunks() one; ceiling division also counts a final partial chunk\n",
+    "for chunk in tqdm(chunks(new_contribs, 10), total=-(-len(new_contribs) // 10)):\n",
+    "    try:\n",
+    "        client.contributions.create_entries(contributions=chunk).result()\n",
+    "    except Exception as exc:\n",
+    "        print(repr(exc), chunk)  # show why the upload failed as well as the offending chunk\n",
+    "        break"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.1"
+  },
+  "widgets": {
+   "application/vnd.jupyter.widget-state+json": {
+    "state": {},
+    "version_major": 2,
+    "version_minor": 0
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}