In [1]:
import json
import re
from pymatgen.core import Composition, Element
import pprint as pp
import copy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle as pkl
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.utils import shuffle
from sklearn.model_selection import KFold
import sys, os
sys.path.append('/home/jupyter/CJK/TempTime')
from data_extraction_rxn_classification_all_targets import *
from sklearn.dummy import DummyRegressor

In [2]:
# Element symbols for atomic numbers 1 through 118, in order of atomic number.
all_elements = list(map(lambda z: Element.from_Z(z).symbol, range(1, 119)))

In [3]:
# Read the solid-state synthesis dataset and parse it as JSON.
with open('/home/jupyter/CJK/TempTime/data/solid-state_dataset_20200713.json') as f:
    ss_data = json.loads(f.read())

# The reaction records are stored under the top-level "reactions" key.
ss_reactions = ss_data["reactions"]

# extract_solidstate comes from the star import of
# data_extraction_rxn_classification_all_targets; it returns a pair that is
# unpacked here into the extracted records and the precursor nomenclature.
# NOTE(review): max_pre=5 presumably caps the number of precursors per
# reaction -- confirm against the helper's definition.
ss_extracted, ss_precursor_nomenclature = extract_solidstate(
    ss_reactions,
    max_pre=5,
)

Returning extracted data of 51574/31782 reactions.


In [4]:
# Load the pre-extracted reaction records from a pickle file.
# Unpickling executes arbitrary code, so this file must come from a trusted source.
with open('/home/jupyter/CJK/TempTime/data/ss_extracted_NO_IMPUTATION_precs_all_targets.pkl', 'rb') as f:
    papers = pkl.loads(f.read())

In [5]:
# Drop every record whose temp_time_vector is NaN at index 5.
# NOTE(review): index 5 looks like a time value (the sample record shows 24.
# there, matching its last operation time) -- confirm the vector layout.
papers = list(filter(lambda rec: not np.isnan(rec['temp_time_vector'][5]), papers))

In [6]:
# Preview only the first two records; displaying the full list writes every
# record into the notebook output and buries the rest of the analysis.
papers[:2]

[{'DOI': '10.1149/1.1828243',
  'target': 'NdCoO3',
  'precursors': ['Co3O4', 'Nd2O3'],
  'operation_types': ['StartingSynthesis',
   'HeatingOperation',
   'MixingOperation',
   'ShapingOperation',
   'HeatingOperation',
   'MixingOperation',
   'MixingOperation',
   'DryingOperation',
   'HeatingOperation',
   'MixingOperation',
   'ShapingOperation',
   'HeatingOperation'],
  'operation_tokens': ['synthesized',
   'fired',
   'reground',
   'pressed',
   'sintered',
   'regrinding',
   'dissolved',
   'dried',
   'fired',
   'ground',
   'pressed',
   'sintered'],
  'operation_times': [None,
   12,
   None,
   None,
   24,
   None,
   None,
   None,
   5,
   None,
   None,
   24],
  'operation_temps': [None,
   900.0,
   None,
   None,
   1200.0,
   None,
   None,
   None,
   500.0,
   None,
   None,
   1200.0],
  'temp_time_vector': array([  nan, 1200.,   nan,   nan,   nan,   24.,   nan,   nan])},
 {'DOI': '10.1149/1.1828243',
  'target': 'Sr0.5Nd0.5CoO3',
  'precursors': ['Co3O4',

In [7]:
# Split the filtered records into four parallel lists (same order as `papers`).
targets, precursors, temps, dois = [], [], [], []
for paper in papers:
    targets.append(paper['target'])
    precursors.append(paper['precursors'])
    # Entry 5 of temp_time_vector is the value modelled further down.
    temps.append(paper['temp_time_vector'][5])
    dois.append(paper['DOI'])

In [8]:
# Group records that share the same target and the same *set* of precursors
# (precursor order is ignored) so each unique reaction can later be reduced
# to a single mean value.
#
# Each entry of only_ss_rxns is a dict with:
#   "target"          - target formula of the reaction
#   "precursors"      - precursor list as it appeared in the first matching record
#   "ss_sinter_temps" - every collected value for this reaction
#   "DOIs"            - DOI of each contributing record (parallel to the values)
#
# The loop variable is deliberately NOT called `temps`: reusing that name would
# rebind the input list to a scalar and make this cell fail when re-run.
only_ss_rxns = []
_rxn_lookup = {}  # (target, frozenset(precursors)) -> entry in only_ss_rxns
for target, precs, sinter_value, doi in zip(targets, precursors, temps, dois):
    # Records without any precursor cannot be matched to a reaction; skip them.
    if len(precs) == 0:
        continue
    rxn_key = (target, frozenset(precs))
    result = _rxn_lookup.get(rxn_key)
    if result is None:
        # First occurrence of this reaction: create its entry, preserving
        # first-seen order in only_ss_rxns.
        result = {
            "target": target,
            "precursors": precs,
            "ss_sinter_temps": [],
            "DOIs": [],
        }
        _rxn_lookup[rxn_key] = result
        only_ss_rxns.append(result)
    result["ss_sinter_temps"].append(sinter_value)
    result["DOIs"].append(doi)

In [10]:
# Regression target: the mean of the collected values for each unique reaction.
# NOTE(review): the key is called 'ss_sinter_temps' but the result is stored as
# `times` -- confirm which quantity temp_time_vector[5] actually holds.
times = np.array([np.mean(rxn['ss_sinter_temps']) for rxn in only_ss_rxns])

# Placeholder feature array (all zeros, one entry per reaction); the baseline
# model below never looks at feature values.
X = np.zeros(len(times))

In [11]:
# Shuffle features and targets together (same permutation for both) with a
# fixed seed. This rebinds X and times in place, so re-running this cell
# without re-running the cell that builds them shuffles already-shuffled data.
X, times = shuffle(X, times, random_state=42)

In [12]:
# 10-fold cross-validation of a baseline that always predicts the mean of the
# training targets. With shuffle=False the folds are contiguous slices of
# X / times in their current order.
kf = KFold(n_splits=10, shuffle=False)

# Per-fold scores, one entry per fold.
maes = []   # mean absolute error
rmses = []  # root mean squared error
r2s = []    # coefficient of determination
mres = []   # mean relative error, in percent

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = times[train_index], times[test_index]

    # strategy="mean": the prediction is the training-set mean for every sample.
    dummy_regr = DummyRegressor(strategy="mean")
    dummy_regr.fit(X_train, y_train)
    preds = dummy_regr.predict(X_test)

    mae = mean_absolute_error(y_test, preds)
    # RMSE as sqrt(MSE). The `squared=False` keyword of mean_squared_error was
    # deprecated in scikit-learn 1.4 and removed in 1.6; taking the square root
    # explicitly yields the same value on every version.
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)
    # Relative error divides by the true value, so a zero in y_test would make
    # this fold's score inf/nan.
    mre = np.mean((np.abs(y_test - preds) / y_test) * 100)

    maes.append(mae)
    rmses.append(rmse)
    r2s.append(r2)
    mres.append(mre)
    

In [13]:
# Report mean and standard deviation across folds, one line per metric,
# in the order: MAE, RMSE, R^2, mean relative error (%).
for fold_scores in (maes, rmses, r2s, mres):
    print(np.mean(fold_scores), np.std(fold_scores))

8.336394074429613 0.27721324773874767
11.111730335104058 0.4470473828223249
-0.0011752857461819755 0.0015930586258989994
173.92459043166807 4.619971213979469
