From 4d4f850c28936a5c697cfb1de6b7b95a07b9b6e9 Mon Sep 17 00:00:00 2001 From: Ryan Kingsbury Date: Fri, 1 Apr 2022 12:03:59 -0700 Subject: [PATCH] add notebook for uploading MP and NIST thermo data --- .../experimental_thermo.ipynb | 1068 +++++++++++++++++ 1 file changed, 1068 insertions(+) create mode 100644 mpcontribs-portal/notebooks/contribs.materialsproject.org/experimental_thermo.ipynb diff --git a/mpcontribs-portal/notebooks/contribs.materialsproject.org/experimental_thermo.ipynb b/mpcontribs-portal/notebooks/contribs.materialsproject.org/experimental_thermo.ipynb new file mode 100644 index 000000000..a02e07d84 --- /dev/null +++ b/mpcontribs-portal/notebooks/contribs.materialsproject.org/experimental_thermo.ipynb @@ -0,0 +1,1068 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Move Thermo data from MP Thermo to an MPContribs project" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Header" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Global variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "PROJECT = 'Corrections'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from pprint import pprint\n", + "from pathlib import Path\n", + "import re\n", + "from tqdm import tqdm\n", + "import numpy as np\n", + "import xlrd\n", + "from monty.serialization import loadfn, dumpfn" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Set Working Directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "workdir = Path(re.sub(r\"(?<={})[\\w\\W]*\".format(PROJECT), \"\", str(Path.cwd())))\n", + "os.chdir(workdir)\n", + "\n", + "data_dir = workdir / '2_raw data'\n", + "pipeline_dir = workdir / '3_data analysis' / '2_pipeline'\n", + "output_dir = workdir / '3_data analysis' / '3_output'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Main Code" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up the project" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mpcontribs.client import Client\n", + "name = 'experimental_thermo' # this should be your project, see from the project URL\n", + "client = Client() # uses MPCONTRIBS_API_KEY envvar" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.projects.update_entry(\n", + " pk=\"experimental_thermo\", project={\"other\": \n", + " {\"ΔHᶠ\": \"Enthalpy of formation from the elements. Polynomial: H° − H°298.15= A*t + B*t^2/2 + C*t^3/3 + D*t^4/4 − E/t + F − H\",\n", + " \"ΔGᶠ\": \"Gibbs free energy of formation from the elements.\",\n", + " \"S\": \"Absolute entropy. Polynomial: S° = A*ln(t) + B*t + C*t^2/2 + D*t^3/3 − E/(2*t^2) + G\",\n", + " \"Cₚ\": \"Specific heat capacity. Polynomial: Cp° = A + B*t + C*t^2 + D*t^3 + E/t^2\",\n", + " \"polynomial\": \"Coefficients for polynomials used to calculate temperature-dependent values of ΔHᶠ, S, or Cₚ.\",\n", + " \"ΔT\": \"Range of temperatures over which polynomial coefficients are valid.\",\n", + " \"composition\": \"String representation of pymatgen Composition of the material.\",\n", + " \"phase\": \"Material phase, e.g. 'gas', 'liquid', 'solid', 'monoclinic', etc.\"\n", + " }\n", + " }\n", + ").result()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.projects.update_entry(\n", + " pk=\"experimental_thermo\", project={\"authors\": \"Various authors (see references). Data compiled by the Materials Project team.\"\n", + " }\n", + ").result()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.projects.update_entry(\n", + " pk=\"experimental_thermo\", project={\"title\": \"Thermochemistry Data\"\n", + " }\n", + ").result()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.projects.update_entry(\n", + " pk=\"experimental_thermo\", project={\"unique_identifiers\": True\n", + " }\n", + ").result()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.projects.update_entry(\n", + " pk=\"experimental_thermo\", project={\"references\": [\n", + " {\"label\":\"Kubaschewski\", \"url\":\"https://www.worldcat.org/title/materials-thermochemistry/oclc/26724109\"},\n", + " {\"label\":\"NIST\", \"url\":\"https://janaf.nist.gov/\"},]}\n", + ").result()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set the column order for display" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# set order of columns and their desired units\n", + "columns = [\n", + " {\"path\": \"data.phase\"},\n", + " {\"path\": \"data.composition\"},\n", + " {\"path\": \"data.compound\"},\n", + " {\"path\": \"data.0K.ΔHᶠ\", \"unit\": \"kJ/mol\"},\n", + " {\"path\": \"data.0K.ΔGᶠ\", \"unit\": \"kJ/mol\"},\n", + " {\"path\": \"data.0K.S\", \"unit\": \"J/degK/mol\"},\n", + " {\"path\": \"data.0K.Cₚ\", \"unit\": \"J/degK/mol\"},\n", + " {\"path\": \"data.298K.ΔHᶠ\", \"unit\": \"kJ/mol\"},\n", + " {\"path\": \"data.298K.ΔGᶠ\", \"unit\": \"kJ/mol\"},\n", + " {\"path\": \"data.298K.S\", \"unit\": \"J/degK/mol\"},\n", + " {\"path\": \"data.298K.Cₚ\", \"unit\": \"J/degK/mol\"},\n", + " {\"path\": \"data.polynomial.A\"},\n", + " {\"path\": \"data.polynomial.B\"},\n", + " {\"path\": \"data.polynomial.C\"},\n", + " {\"path\": \"data.polynomial.D\"},\n", + " {\"path\": \"data.polynomial.E\"},\n", + " {\"path\": \"data.polynomial.F\"},\n", + " {\"path\": \"data.polynomial.G\"},\n", + " {\"path\": \"data.polynomial.H\"},\n", + " {\"path\": \"data.ΔT.A.min\", \"unit\": \"degK\"},\n", + " {\"path\": \"data.ΔT.B.min\", \"unit\": \"degK\"},\n", + " {\"path\": \"data.ΔT.C.min\", \"unit\": \"degK\"},\n", + " {\"path\": \"data.ΔT.D.min\", \"unit\": \"degK\"},\n", + " {\"path\": \"data.ΔT.E.min\", \"unit\": \"degK\"},\n", + " {\"path\": \"data.ΔT.F.min\", \"unit\": \"degK\"},\n", + " {\"path\": \"data.ΔT.G.min\", \"unit\": \"degK\"},\n", + " {\"path\": \"data.ΔT.H.min\", \"unit\": \"degK\"},\n", + " {\"path\": \"data.ΔT.A.max\", \"unit\": \"degK\"},\n", + " {\"path\": \"data.ΔT.B.max\", \"unit\": \"degK\"},\n", + " {\"path\": \"data.ΔT.C.max\", \"unit\": \"degK\"},\n", + " {\"path\": \"data.ΔT.D.max\", \"unit\": \"degK\"},\n", + " {\"path\": \"data.ΔT.E.max\", \"unit\": \"degK\"},\n", + " {\"path\": \"data.ΔT.F.max\", \"unit\": \"degK\"},\n", + " {\"path\": \"data.ΔT.G.max\", \"unit\": \"degK\"},\n", + " {\"path\": \"data.ΔT.H.max\", \"unit\": \"degK\"},\n", + " {\"path\": \"data.method\", \"unit\": \"kJ/mol\"},\n", + " {\"path\": \"data.reference\", \"unit\": \"kJ/mol\"}, \n", + "]\n", + "client.projects.update_entry(\n", + " pk=name, project={\"columns\": columns}\n", + ").result()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.get_project(name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Collect the MP Thermochemical Data\n", + "\n", + "Use the `MPRester()` to retrieve all thermochemical data currently hosted on materialsproject.org" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get a list of all unique formulas in MP" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "binaries = knowhere_mats.distinct(\"pretty_formula\", {\"nelements\": {\"$lte\": 2}})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ternary_plus = knowhere_mats.distinct(\"pretty_formula\", {\"nelements\": {\"$gte\": 3}})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Pull `ThermoData` objects from MPRester" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_thermo = []\n", + "with MPRester() as a:\n", + " for f in tqdm(binaries):\n", + " try:\n", + " all_thermo.extend(a.get_exp_thermo_data(f))\n", + " except:\n", + " continue" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(all_thermo)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "type(all_thermo[9549])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_thermo = []\n", + "with MPRester() as a:\n", + " for f in tqdm(ternaries):\n", + " try:\n", + " all_thermo.extend(a.get_exp_thermo_data(f))\n", + " except:\n", + " continue" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#all_thermo = []\n", + "with MPRester() as a:\n", + " for f in tqdm(ternary_plus):\n", + " all_thermo.extend(a.get_exp_thermo_data(f))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dumpfn(all_thermo, output_dir / '2020-08-07 all MP Thermo data.json')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_thermo = loadfn(output_dir / '2020-08-07 all MP Thermo data.json')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Convert `ThermoData` into a pandas dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_thermo[0].as_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "mpthermo_df = pd.DataFrame([t.as_dict() for t in all_thermo])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# drop the unneeded columns\n", + "mpthermo_df = mpthermo_df.drop('@module', axis=1)\n", + "mpthermo_df = mpthermo_df.drop('@class', axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mpthermo_df[mpthermo_df[\"formula\"] == \"Ag\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Each unique type of data needs to be a column" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# what unique types of data do we have?\n", + "mpthermo_df.type.unique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Each unique phase needs to be nested under formula" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# what unique types of data do we have?\n", + "mpthermo_df.phaseinfo.unique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a pandas `Series` object with a multiindex and a dict of the data we need" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pymatgen import Composition\n", + "\n", + "def create_dict(data):\n", + " ret = {}\n", + " comp = Composition(data.formula.unique()[0])\n", + " \n", + " ret[\"project\"] = name\n", + " ret[\"is_public\"] = False\n", + " ret[\"identifier\"] = comp.reduced_formula\n", + " ret[\"data\"] = {}\n", + " ret[\"data\"][\"compound\"] = data.compound_name.unique()[0]\n", + " ret[\"data\"][\"composition\"] = str(comp)\n", + " ret[\"data\"][\"phase\"] = data.phaseinfo.unique()[0]\n", + " ret[\"data\"][\"reference\"] = data.ref.unique()[0]\n", + " \n", + " for t in data.type.unique():\n", + " \n", + " # set the base dictionary key\n", + " if t in [\"A\", \"B\", \"C\", \"D\", \"E\", \"F\", \"G\", \"H\"]:\n", + " if not ret[\"data\"].get(\"polynomial\"):\n", + " ret[\"data\"][\"polynomial\"] = {}\n", + " \n", + " if not ret[\"data\"].get(\"ΔT\"):\n", + " ret[\"data\"][\"ΔT\"] = {}\n", + "\n", + " base_dict = ret[\"data\"][\"polynomial\"]\n", + " col = t\n", + " unit = \"dimensionless\"\n", + " base_dict[col] = {}\n", + " ret[\"data\"][\"ΔT\"][col] = {\"min\": \"{} K\".format(data[data[\"type\"]==t][\"temp_range\"].values[0][0]),\n", + " \"max\": \"{} K\".format(data[data[\"type\"]==t][\"temp_range\"].values[0][1])}\n", + " \n", + " else:\n", + " if data[data[\"type\"]==t][\"temp_range\"].values[0] == [298, 298]:\n", + " if not ret[\"data\"].get(\"298K\"):\n", + " ret[\"data\"][\"298K\"]= {}\n", + " base_dict = ret[\"data\"][\"298K\"]\n", + " else:\n", + " print(\"Type: {}, T: {}\".format(t, data[data[\"type\"]==t][\"temp_range\"].values[0]))\n", + " \n", + " if t == \"S\":\n", + " unit = 'kJ/degK/mol'\n", + " col = \"S\"\n", + " elif t ==\"fH\":\n", + " col = \"ΔHᶠ\"\n", + " unit = \"kJ/mol\"\n", + " else:\n", + " col = t\n", + " unit = \"dimensionless\"\n", + " \n", + " base_dict[col] = {}\n", + "\n", + " # find value, uncertainty, method, unit\n", + " base_dict[col]= \"{:0.5g} {}\".format(data[data[\"type\"]==t][\"value\"].values[0], unit)\n", + " \n", + " if data[data[\"type\"]==t][\"method\"].values[0] != \"\":\n", + " if not ret[\"data\"].get(\"method\"):\n", + " ret[\"data\"][\"method\"] = {}\n", + " ret[\"data\"][\"method\"][col] = data[data[\"type\"]==t][\"method\"].values[0]\n", + " \n", + "# if not np.isnan(data[data[\"type\"]==t][\"uncertainty\"].values[0]):\n", + "# base_dict[col][\"uncertainty\"] = data[data[\"type\"]==t][\"uncertainty\"].values[0]\n", + " \n", + " \n", + " \n", + "# if t in [\"S\", \"fH\"]:\n", + "# base_dict[col][\"units\"] = unit\n", + "\n", + " \n", + " return ret\n", + " \n", + "\n", + "new_df = mpthermo_df.groupby([\"formula\",\"compound_name\",\"phaseinfo\",\"ref\"]).apply(create_dict)\n", + "mpthermo_contribs = list(new_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mpthermo_contribs[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reshape the dict so that data is nested under a key for each phase" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "reshaped = []\n", + "\n", + "from itertools import groupby\n", + "\n", + "for formula, group in groupby(mpthermo_contribs, key=lambda d: d[\"identifier\"]):\n", + " new_dict ={}\n", + " new_dict[\"project\"] = name\n", + " new_dict[\"is_public\"] = False\n", + " new_dict[\"identifier\"] = formula\n", + " new_dict[\"data\"] = {}\n", + " \n", + " for d in group:\n", + " if not new_dict.get(\"composition\"):\n", + " new_dict[\"composition\"] = d[\"data\"][\"composition\"]\n", + " \n", + " del d[\"data\"][\"composition\"]\n", + "\n", + " phase = d[\"data\"].get(\"phase\", \"n/a\")\n", + " if phase == \"\":\n", + " phase = \"n/a\"\n", + "\n", + " new_dict[\"data\"][phase] = d[\"data\"]\n", + " if phase != \"n/a\":\n", + " del new_dict[\"data\"][phase][\"phase\"]\n", + "\n", + " reshaped.append(new_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pprint\n", + "pprint.pprint(reshaped[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## NIST JANAF Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Load the JANAF data from a CSV file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas\n", + "janaf_df= pandas.read_csv(data_dir / \"2020-08-10 JANAF data from Ayush/mpcontribs_janaf_thermo.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "janaf_df.head(20)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a list of dicts for the contributions in the JANAF dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def create_dict(data):\n", + " \n", + " ret = {}\n", + " ret[\"project\"] = name\n", + " ret[\"is_public\"] = False \n", + " ret[\"data\"] = {}\n", + " \n", + " try:\n", + " comp = Composition(data.Formula.unique()[0])\n", + " ret[\"identifier\"] = comp.reduced_formula\n", + " ret[\"data\"][\"composition\"] = str(comp)\n", + " except:\n", + " print('problem')\n", + " ret[\"identifier\"] = data.Formula.unique()[0]\n", + " ret[\"data\"][\"composition\"] = data.Formula.unique()[0]\n", + " \n", + " ret[\"data\"][\"compound\"] = data.Name.unique()[0]\n", + " ret[\"data\"][\"phase\"] = data.Phase.unique()[0]\n", + " ret[\"data\"][\"reference\"] = data.Link.unique()[0].replace('txt','html')\n", + " \n", + " ret[\"data\"][\"0K\"] = {\"ΔHᶠ\": \"{:0.6g} {}\".format(data[\"DeltaH_0\"].values[0]/1000, \"kJ/mol\"),\n", + " \"ΔGᶠ\": \"{:0.6g} {}\".format(data[\"DeltaG_0\"].values[0]/1000, \"kJ/mol\"),\n", + " \"S\": \"{:0.6g} {}\".format(data[\"S_0\"].values[0], \"J/degK/mol\"),\n", + " \"Cₚ\": \"{:0.6g} {}\".format(data[\"Cp_0\"].values[0], \"J/degK/mol\"),\n", + " }\n", + " \n", + " ret[\"data\"][\"298K\"] = {\"ΔHᶠ\": \"{:0.6g} {}\".format(data[\"DeltaH_298\"].values[0]/1000, \"kJ/mol\"),\n", + " \"ΔGᶠ\": \"{:0.6g} {}\".format(data[\"DeltaG_298\"].values[0]/1000, \"kJ/mol\"),\n", + " \"S\": \"{:0.6g} {}\".format(data[\"S_298\"].values[0], \"J/degK/mol\"),\n", + " \"Cₚ\": \"{:0.6g} {}\".format(data[\"Cp_298\"].values[0], \"J/degK/mol\"),\n", + " }\n", + "\n", + " return ret\n", + " \n", + "\n", + "new_df = janaf_df.groupby([\"Formula\",\"Name\",\"Phase\"]).apply(create_dict)\n", + "janaf_contribs = list(new_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pprint.pprint(janaf_contribs[10])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reshape the dict so that data is nested under a key for each phase" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "reshaped_janaf = []\n", + "\n", + "from itertools import groupby\n", + "\n", + "for formula, group in groupby(janaf_contribs, key=lambda d: d[\"identifier\"]):\n", + " new_dict ={}\n", + " new_dict[\"project\"] = name\n", + " new_dict[\"is_public\"] = False\n", + " new_dict[\"identifier\"] = formula\n", + " new_dict[\"data\"] = {}\n", + " \n", + " for d in group:\n", + " if not new_dict.get(\"composition\"):\n", + " new_dict[\"composition\"] = d[\"data\"][\"composition\"]\n", + " \n", + " \n", + " del d[\"data\"][\"composition\"]\n", + " \n", + " phase = d[\"data\"].get(\"phase\", \"n/a\")\n", + " if phase == \"\":\n", + " phase = \"n/a\"\n", + "\n", + " new_dict[\"data\"][phase] = d[\"data\"]\n", + " if phase != \"n/a\":\n", + " del new_dict[\"data\"][phase][\"phase\"]\n", + " \n", + " reshaped_janaf.append(new_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pprint\n", + "pprint.pprint(reshaped_janaf[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pprint\n", + "pprint.pprint(reshaped[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Merge the JANAF data with the MP Thermo data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_contribs = reshaped[:]\n", + "\n", + "count=0\n", + "for d in reshaped_janaf:\n", + " # is this identifier already in mp thermo?\n", + " if d[\"identifier\"] in [e[\"identifier\"] for e in reshaped]:\n", + " # add the new NIST phases\n", + " target_entry = [e for e in reshaped if e[\"identifier\"] == d[\"identifier\"]][0]\n", + " for k,v in d[\"data\"].items():\n", + " if target_entry[\"data\"].get(k):\n", + " print(\"Warning: phase {} already exists for id {} in MP Thermo data! Skipping.\".format(k, d[\"identifier\"]))\n", + " count+=1\n", + " continue\n", + " target_entry[\"data\"][k] = v\n", + " else:\n", + " all_contribs.append(d)\n", + "\n", + "print(\"Skipped {} duplicate entries\".format(count))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pprint.pprint(all_contribs[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## Fix the position of the composition key\n", + "for e in all_contribs:\n", + " e[\"data\"][\"composition\"] = e[\"composition\"]\n", + " del e[\"composition\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Remap phase keys that contain punctuation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "replace = {\"#-qtz\":\"βqtz\",\n", + " \"a\": \"α\",\n", + " \"a -cris\":\"αcrys\",\n", + " \"a -qtz\":\"αqtz\",\n", + " \"nit.ba\": \"nitba\",\n", + " \"orth./1\":\"orth\",\n", + " \"ortho\":\"orth\",\n", + " \"r.tet\":\"rtet\",\n", + " \"tet/cu\":\"tetcu\",\n", + " \"n/a\":\"none\",\n", + " \"cr,l\":\"crl\"\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for e in all_contribs:\n", + " for k in replace.keys():\n", + " if e[\"data\"].get(k):\n", + " e[\"data\"][replace[k]] = e[\"data\"].pop(k)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pprint.pprint(all_contribs[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reshape data again so that each formula+phase is a unique contribution with a unique identifier" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "new_contribs = []\n", + "for d in all_contribs:\n", + " # unpack each identifier into unique identifiers with formula+phase\n", + " for k,v in d[\"data\"].items():\n", + " new_d={}\n", + " if k == 'composition':\n", + " continue\n", + " new_d[\"identifier\"] = str(d[\"identifier\"]+\"-\"+k)\n", + " new_d[\"formula\"] = d[\"identifier\"]\n", + " new_d[\"is_public\"] = True\n", + " new_d[\"project\"] = d[\"project\"]\n", + " new_d[\"data\"] = v\n", + " new_d[\"data\"][\"phase\"] = k\n", + " new_d[\"data\"][\"composition\"] = d[\"data\"][\"composition\"]\n", + " new_contribs.append(new_d)\n", + "\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pprint.pprint(new_contribs[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pprint.pprint(new_contribs[2])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dumpfn(new_contribs, pipeline_dir / \"2020-08-31_new_thermo_contribs.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "new_contribs = loadfn(pipeline_dir / \"2020-08-31_new_thermo_contribs.json\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Clean `nan` out of the contribs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for d in new_contribs:\n", + " if d[\"data\"].get(\"0K\"):\n", + " if all([\"nan\" in v for k,v in d[\"data\"][\"0K\"].items()]):\n", + " del d[\"data\"][\"0K\"]\n", + " print(\"deleted {}\".format(d[\"identifier\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for d in new_contribs:\n", + " if d[\"data\"].get(\"298K\"):\n", + " if all([\"nan\" in v for k,v in d[\"data\"][\"298K\"].items()]):\n", + " del d[\"data\"][\"298K\"]\n", + " print(\"deleted {}\".format(d[\"identifier\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for d in new_contribs:\n", + " if d[\"data\"].get(\"298K\"):\n", + " if all([\"nan\" in v or \"0 \" in v for k,v in d[\"data\"][\"298K\"].items()]):\n", + " del d[\"data\"][\"298K\"]\n", + " print(\"deleted {}\".format(d[\"identifier\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for d in new_contribs:\n", + " if d[\"data\"].get(\"0K\"):\n", + " if all([\"nan\" in v or \"0 \" in v for k,v in d[\"data\"][\"0K\"].items()]):\n", + " del d[\"data\"][\"0K\"]\n", + " print(\"deleted {}\".format(d[\"identifier\"]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Fix `nan` values for the NIST electron gas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for d in new_contribs:\n", + " if d[\"identifier\"] == \"e--ref\":\n", + " del d[\"data\"][\"0K\"][\"ΔGᶠ\"]\n", + " del d[\"data\"][\"0K\"][\"ΔHᶠ\"]\n", + " del d[\"data\"][\"0K\"][\"S\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Submit both datasets to MPContribs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# need to delete contributions first due to unique_identifiers=False\n", + "client.delete_contributions(name)\n", + "#client.submit_contributions(new_contribs, per_page=10)#, skip_dupe_check=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(new_contribs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def chunks(lst, n):\n", + " \"\"\"Yield successive n-sized chunks from lst.\"\"\"\n", + " for i in range(0, len(lst), n):\n", + " yield lst[i:i + n]\n", + "\n", + "for chunk in tqdm(chunks(new_contribs, 10, total=len(new_contribs)/10)):\n", + " try:\n", + " client.contributions.create_entries(contributions=chunk).result()\n", + " except:\n", + " print(chunk)\n", + " break" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}