### Data Cleaning

data cleaning and normalisation for the extracted thermoelectric database

In [1]:
import pandas as pd
import numpy as np
import json
import os
import re
from pprint import pprint
import numpy.polynomial.polynomial as poly

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# pd.set_option('display.max_colwidth',500)

In [316]:
out_path = ""

In [317]:
out_prefix = ""

In [54]:
# take in the database from it's raw csv form, following thermoelectric parsing (te_parse)
database_path = "v8_full_raw_database_with_metadata.xlsx"  # "example_database.csv"
df = pd.read_excel(database_path)
df.head()

Unnamed: 0,compound_name,model,raw_value,raw_units,value,units,temp_value,temp_units,room_temperature,editing,...,pressure_value,pressure_units,direction_of_measurement,labels,parser,specifier,title,publisher,yop,oa
0,['Ca0.96Dy0.02Yb0.02MnO3'],ThermCond,1.47,Wm-1K-1,[1.47],Kelvin^(-1.0) Meter^(-1.0) Watt^(1.0),[973.0],Kelvin^(1.0),-,-,...,-,-,-,-,comma-level,thermal conductivity,Influence of rare-earth elements doping on the...,Elsevier,2015,no
1,['Ca0.98Dy0.02MnO3'],ThermCond,2.20,Wm-1K-1,[2.2],Kelvin^(-1.0) Meter^(-1.0) Watt^(1.0),[973.0],Kelvin^(1.0),-,-,...,-,-,-,-,comma-level,thermal conductivity,Influence of rare-earth elements doping on the...,Elsevier,2015,no
2,['Bi2Te3'],ThermCond,∼ 1.2,Wm−1K−1,[1.2],Kelvin^(-1.0) Meter^(-1.0) Watt^(1.0),-,-,room temperature,-,...,-,-,-,-,comma-level,κlat,Layered materials with 2D connectivity for the...,RSC,2020,no
3,['Cu-doped Bi2Te3'],ThermCond,∼ 0.7,Wm−1K−1,[0.7],Kelvin^(-1.0) Meter^(-1.0) Watt^(1.0),[300.0],Kelvin^(1.0),-,-,...,-,-,parallel directions,-,comma-level,κlat,Layered materials with 2D connectivity for the...,RSC,2020,no
4,['Sb2Te3'],ThermCond,∼ 1.15,Wm−1K−1,[1.15],Kelvin^(-1.0) Meter^(-1.0) Watt^(1.0),-,-,room temperature,-,...,-,-,in-plane direction,-,comma-level,κlat,Layered materials with 2D connectivity for the...,RSC,2020,no


In [55]:
df.drop("publisher oa yop".split(), axis=1, inplace=True)

### Metadata (September)
#### (adding Publihser, Open Access, and Date tags)
used custom functions instead of the metadata from chemdataextractor which rely on the filename
filename structure = "article-" + DOI with hyphens instead of backslashes + file extension (.txt or .xml or .html)

#### Publisher

In [56]:
extension_to_publisher = {'xml':'Elsevier', 'html':'RSC', 'txt':'Springer'}

In [57]:
df['publisher'] = df.doi.apply(lambda x: extension_to_publisher[x.rsplit('.',1)[1]]).copy()

#### Open Access

In [58]:
with open('open_access_filenames_list.json','r') as f:
    oa_dois = json.load(f)

In [59]:
df.doi

0               10.1016-j.jssc.2014.12.010.xml
1               10.1016-j.jssc.2014.12.010.xml
2                      10.1039-d0ta00240b.html
3                      10.1039-d0ta00240b.html
4                      10.1039-d0ta00240b.html
                         ...                  
23265           10.1016-j.jmat.2015.04.005.xml
23266           10.1007-s13391-018-00107-6.txt
23267       10.1016-j.ceramint.2015.07.031.xml
23268                  10.1039-c7cs00315c.html
23269    10.1016-j.measurement.2018.10.059.xml
Name: doi, Length: 23270, dtype: object

In [60]:
# stupid way to get access type from doi using information already found in db_extended
df['oa'] = df.doi.apply(lambda x: "yes" if "article-" + x in oa_dois else "no")

In [61]:
df.oa.value_counts()

no     21387
yes     1883
Name: oa, dtype: int64

In [62]:
df.groupby(['publisher']).oa.value_counts(dropna=False)

publisher  oa 
Elsevier   no     13156
           yes      585
RSC        no      5395
           yes      662
Springer   no      2836
           yes      636
Name: oa, dtype: int64

#### Date

In [63]:
with open('dates_dictionary.json','r') as jj:
    dates_dict = json.load(jj)

In [64]:
df['yop'] = df.doi.apply(lambda x: dates_dict[x] if x in dates_dict.keys() else np.nan).copy()

In [65]:
df.yop.replace('unkown', np.nan, inplace = True)  # YES I don't kown how to spell

In [66]:
df.yop.value_counts(dropna=False)

2020    2990
2019    2583
2018    2399
2017    2169
2016    1980
2015    1848
2014    1184
2013    1098
2012     708
2020     693
2011     540
2010     526
2019     463
2009     430
2018     359
2017     353
2016     317
2014     292
2015     273
2005     236
2008     222
2013     197
2007     192
2006     176
2011     140
2012     131
2003     117
2001      99
2004      95
2002      81
2010      58
2009      47
2000      43
2007      35
2008      31
1999      28
1998      24
2006      23
1997      16
2021      14
2001      11
2000      11
2003       9
2004       9
1997       8
2002       5
1998       3
2005       2
1999       2
Name: yop, dtype: int64

In [69]:
# Springer titles weren't collected during extraction, so add separately

with open('springer_titles_dictionary.json', 'r') as fp:
    springer_titles = json.load(fp)
print(len(springer_titles))

15371


In [70]:
def add_springer_titles(df):
    if df.title == "title_fail":
        return springer_titles[df.doi]
    else:
        return df.title

In [71]:
df.title = df.apply(add_springer_titles, axis=1).copy()

In [72]:
df['doi title publisher oa yop'.split()].head()

Unnamed: 0,doi,title,publisher,oa,yop
0,10.1016-j.jssc.2014.12.010.xml,Influence of rare-earth elements doping on the...,Elsevier,no,2015
1,10.1016-j.jssc.2014.12.010.xml,Influence of rare-earth elements doping on the...,Elsevier,no,2015
2,10.1039-d0ta00240b.html,Layered materials with 2D connectivity for the...,RSC,no,2020
3,10.1039-d0ta00240b.html,Layered materials with 2D connectivity for the...,RSC,no,2020
4,10.1039-d0ta00240b.html,Layered materials with 2D connectivity for the...,RSC,no,2020


In [73]:
# last check for nans
df['publisher yop oa'.split()].isna().sum()

publisher    0
yop          0
oa           0
dtype: int64

In [74]:
ordered_columns = 'compound_name model raw_value raw_units value units temp_value temp_units room_temperature editing\
 exrept doi error process pressure_value pressure_units direction_of_measurement labels parser specifier title\
 publisher yop oa'.split()

In [75]:
def check_columns(df_columns, ordered_columns):
    print("missing columns:", [c for c in df_columns if c not in ordered_columns], "\n")
    print("extra columns:", [c for c in ordered_columns if c not in df_columns])
check_columns(df.columns, ordered_columns)

missing columns: [] 

extra columns: []


In [76]:
# ordering and skimming
df = df[ordered_columns].copy()

In [77]:
# STATE BACKUP
df_metadata_bckp = df.copy()

In [78]:
# for recovery:
# df = df_metadata_bckp.copy()

In [79]:
df.model.value_counts()

ZT               11880
ThermCond         3756
Seebeck           2547
PF                2275
Conductivity2     1856
Resistivity        728
Conductivity       228
Name: model, dtype: int64

In [80]:
df.shape

(23270, 24)

### Cleaning 
##### (remove duplicates, entries without letters, and huge entires > 90 characters long)

In [81]:
# drop duplicates. check happens on exrept as well. So identical records, if coming from different sections 
# of a document are still kept
df = df.drop_duplicates()

In [82]:
try:
    df.drop(columns=['Unnamed: 0'], inplace=True)
except KeyError as e:
    print("no Unnamed: 0 column")

no Unnamed: 0 column


In [83]:
df.shape

(23168, 24)

In [84]:
df.head()

Unnamed: 0,compound_name,model,raw_value,raw_units,value,units,temp_value,temp_units,room_temperature,editing,...,pressure_value,pressure_units,direction_of_measurement,labels,parser,specifier,title,publisher,yop,oa
0,['Ca0.96Dy0.02Yb0.02MnO3'],ThermCond,1.47,Wm-1K-1,[1.47],Kelvin^(-1.0) Meter^(-1.0) Watt^(1.0),[973.0],Kelvin^(1.0),-,-,...,-,-,-,-,comma-level,thermal conductivity,Influence of rare-earth elements doping on the...,Elsevier,2015,no
1,['Ca0.98Dy0.02MnO3'],ThermCond,2.20,Wm-1K-1,[2.2],Kelvin^(-1.0) Meter^(-1.0) Watt^(1.0),[973.0],Kelvin^(1.0),-,-,...,-,-,-,-,comma-level,thermal conductivity,Influence of rare-earth elements doping on the...,Elsevier,2015,no
2,['Bi2Te3'],ThermCond,∼ 1.2,Wm−1K−1,[1.2],Kelvin^(-1.0) Meter^(-1.0) Watt^(1.0),-,-,room temperature,-,...,-,-,-,-,comma-level,κlat,Layered materials with 2D connectivity for the...,RSC,2020,no
3,['Cu-doped Bi2Te3'],ThermCond,∼ 0.7,Wm−1K−1,[0.7],Kelvin^(-1.0) Meter^(-1.0) Watt^(1.0),[300.0],Kelvin^(1.0),-,-,...,-,-,parallel directions,-,comma-level,κlat,Layered materials with 2D connectivity for the...,RSC,2020,no
4,['Sb2Te3'],ThermCond,∼ 1.15,Wm−1K−1,[1.15],Kelvin^(-1.0) Meter^(-1.0) Watt^(1.0),-,-,room temperature,-,...,-,-,in-plane direction,-,comma-level,κlat,Layered materials with 2D connectivity for the...,RSC,2020,no


In [86]:
df['clean_name'] = df.compound_name.apply(lambda x: x[2:-2].split("', '")) 

In [87]:
df.clean_name.apply(len).value_counts()

1    23168
Name: clean_name, dtype: int64

In [88]:
df['first_name'] = df.clean_name.apply(lambda x: x[0])

In [89]:
df.first_name.head(11)

0                      Ca0.96Dy0.02Yb0.02MnO3
1                            Ca0.98Dy0.02MnO3
2                                      Bi2Te3
3                             Cu-doped Bi2Te3
4                                      Sb2Te3
5                               Bi0.5Sb1.5Te3
6                                      Bi2Te3
7                                     BiSbSe3
8                                        BiSe
9                                      Bi2Se3
10    polycrystalline Ag0.01Sn0.99Se0.85S0.15
Name: first_name, dtype: object

In [90]:
# trying to remove some false positives

In [91]:
# Identify any entries without alphabetical letters

from string import ascii_letters as letters
def has_letters(s):
    for c in s:
        if c in letters:
            return True
    return False

df[~ df.compound_name.apply(has_letters)].shape

(15, 26)

In [92]:
# Remove entries without alphabetical letters
df = df[df.first_name.apply(has_letters)].copy()

In [93]:
# remove huge entries
df = df[df.first_name.apply(lambda x: False if len(x) > 79 else True)].copy()

In [94]:
df.shape

(23149, 26)

In [95]:
df[df.first_name.apply(lambda str: True if str in "SSP SPS PF S K".split() else False)]

Unnamed: 0,compound_name,model,raw_value,raw_units,value,units,temp_value,temp_units,room_temperature,editing,...,direction_of_measurement,labels,parser,specifier,title,publisher,yop,oa,clean_name,first_name
6335,['SSP'],ZT,0.5,-,[0.5],-,[400.0],Kelvin^(1.0),-,-,...,-,-,comma-level,zT,Improving the zT value of thermoelectrics by n...,RSC,2017,yes,[SSP],SSP


In [96]:
# remove some problematic entries, which are due to referencing (e.g. the sample containing 6 % CaTe.
# At some point we may be smart about that and use some coreference resolution by looking at the other
# extracted compounds from the same article and chekcing if the dopings match)

In [97]:
import re
# Number something(e.g. at wt etc.) % CEM(not spaces)
def check_problematic_form(str):
    if re.match(r"^\d+\s.+\s%\s[^\s]+$", str):
        return True
    return False
df[df.first_name.apply(check_problematic_form)]["first_name exrept".split()].head()

Unnamed: 0,first_name,exrept
218,20 wt % CNT,"(With 20 wt% CNT content, these composites exh..."
378,4 at % SmSe2,({ κtot at room - temperature }: 1.29 W m−1 K−...
931,60 wt % Bi0.5Sb1.5Te3,({ at 43-300 °C) }: The electrical conductivi...
996,1 at % Na2S,(Note that the calculated lowest κlat in the 1...
1373,15 mol % Se,(An extremely low thermal conductivity ∼0.27 W...


In [98]:
# Number % CEM
def check_problematic_form2(str):
    if re.match(r"^\d+\s%\s[^\s]+$", str):
        return True
    return False
problematic2 = df[df.first_name.apply(check_problematic_form2)]["first_name exrept doi".split()]
#problematic2.to_excel(os.path.join(out_path, out_prefix + "problematic2.xlsx"))

In [99]:
# print for checking
problematic2.head()

Unnamed: 0,first_name,exrept,doi
478,6 % CaTe,( a room temperature κtotal of ∼2.98 W m−1 K−1...,10.1039-c1ee02297k.html
479,3 % BaTe,({ at room temperature }: A room temperature ...,10.1039-c1ee02297k.html
481,5 % CaTe,(The 5% CaTe sample exhibits the lowest room t...,10.1039-c1ee02297k.html
482,6 % CaTe,(The 6% CaTe sample also shows a similar κlat ...,10.1039-c1ee02297k.html
1276,8 % MgTe,({ lattice thermal conductivity at 923 K }: ∼0...,10.1039-c3ee42187b.html


In [100]:
# 1
df = df[~ df.first_name.apply(check_problematic_form)].copy()

In [101]:
df.shape

(23031, 26)

In [102]:
# 2
df = df[~ df.first_name.apply(check_problematic_form2)].copy()

In [103]:
df.shape

(22961, 26)

In [386]:
# STATE BACKUP
df_clean_bckp = df.copy()

In [387]:
# recovery:
# df = df_clean_bckp.copy()

#### Noting normalised_model, without conducting any units or value normalisation

In [104]:
#NORMALISE electrical models
# normalised_model essentially just reffers to the correct property
electrical_models = 'Conductivity Conductivity2 Resistivity'
df['normalised_model'] = df.model.apply(lambda x: 'Conductivity' if x in electrical_models else x)

In [106]:
ordered_columns = 'compound_name first_name normalised_model model raw_value raw_units value units temp_value room_temperature temp_units editing\
 exrept doi error process pressure_value pressure_units direction_of_measurement labels parser specifier title\
 publisher yop oa'.split()

In [107]:
check_columns(df.columns, ordered_columns)

missing columns: ['clean_name'] 

extra columns: []


In [394]:
# ordering and skimming
df = df[ordered_columns].copy()

## Adding pressure

In [112]:
# pressure was added post-extraction by associating results with excerpts
presh = pd.read_csv("pressures_data.csv")

In [116]:
presh.results.isna()

0       True
1       True
2      False
3      False
4      False
       ...  
294     True
295     True
296     True
297     True
298     True
Name: results, Length: 299, dtype: bool

In [113]:
presh[~ presh.results.isna()].iloc[0].results[0]['Pressure']

TypeError: string indices must be integers

In [117]:
pp = presh.copy()

In [118]:
def simplify_pressure(s):
    if s:
        p = s[0]['Pressure']
        return [p['value'][0], p['raw_units'].replace("(","").replace(")","")]
    else:
        return s

In [119]:
pp['simple'] = pp.results.apply(simplify_pressure)

TypeError: 'float' object is not subscriptable

In [406]:
pp.head()

Unnamed: 0,exrept,results,simple
219,Chen et al. systematically studied the electri...,,
315,The 3D CNT network maintains an extremely low ...,,
540,"In another case of doping with Cu, Sn1-xCuxSe ...","[{'Pressure': {'raw_value': '6.0', 'raw_units'...","[6.0, GPa]"
689,The most popular and wide spread used metal di...,"[{'Pressure': {'raw_value': '33', 'raw_units':...","[33.0, GPa]"
690,the minimum total thermal conductivity of syn...,"[{'Pressure': {'raw_value': '2', 'raw_units': ...","[2.0, GPa]"


In [407]:
def recover_exrept(s):
    if "FROM" in s:
        return s.split("FROM: ")[0][1:-1]
    else:
        return s

In [408]:
def extract_pressure(s):
    r = recover_exrept(s)
    if r in pp.exrept.unique():
        return pp[pp.exrept == r].simple.values[0]
    else:
        return None

In [409]:
pp.shape

(299, 3)

In [410]:
# df.drop(["pressure"], axis=1, inplace=True)

In [180]:
df["pressure"]=""

In [411]:
# doing it via index was giving me weird stuff so went like this
df["pressure"] = df.exrept.apply(extract_pressure)

In [412]:
df[~ df.pressure.isna()]["compound_name exrept pressure".split()].head()

Unnamed: 0,compound_name,exrept,pressure
540,['Sn1-xCuxSe'],"(In another case of doping with Cu, Sn1-xCuxSe...","[6.0, GPa]"
689,['metal diboride'],(The most popular and wide spread used metal d...,"[33.0, GPa]"
690,['Bi2Te3'],( the minimum total thermal conductivity of sy...,"[2.0, GPa]"
1083,['Fe9 wt %'],(Values of thermal conductivity for an Fe9 wt%...,"[60.0, GPa]"
1366,['BiSbTe3 bulk'],( the minimum thermal conductivity can be obse...,"[2.5, GPa]"


In [413]:
pp.head()  # in agreement

Unnamed: 0,exrept,results,simple
219,Chen et al. systematically studied the electri...,,
315,The 3D CNT network maintains an extremely low ...,,
540,"In another case of doping with Cu, Sn1-xCuxSe ...","[{'Pressure': {'raw_value': '6.0', 'raw_units'...","[6.0, GPa]"
689,The most popular and wide spread used metal di...,"[{'Pressure': {'raw_value': '33', 'raw_units':...","[33.0, GPa]"
690,the minimum total thermal conductivity of syn...,"[{'Pressure': {'raw_value': '2', 'raw_units': ...","[2.0, GPa]"


In [414]:
# STATE BACKUP
df_with_pressure = df.copy()

In [415]:
# recover
df = df_with_pressure.copy()

## ~ Normalising: ~

### Getting real values

In [122]:
def make_number_list2(x):
    try:
        x_list = x[1:-1].split(',')

        return [float(n) for n in x_list]
    except:
        return np.nan

In [123]:
def make_temp_number_list(x):
    if x == '-':
        # this is not sufficient, since it doesn't account for extractions where there are both room temp
        # and value extractions. It's just a stepping stone
        return [295] 
    else:
        x_list = x[1:-1].split(',')
        return [float(n) for n in x_list]

In [124]:
def get_average_from_list(x):
    return (sum(x) / len(x))

In [125]:
df['temp_numbers'] = df.temp_value.apply(make_temp_number_list)  # just a stepping stone

In [126]:
count_dashed = (df.room_temperature != "-").sum()
if count_dashed == 0:
    print("WAIT! there seem to be no dashes in room temperature, please check that normalising will work.")
else:
    print(count_dashed)

5016


In [127]:
df['temp_value temp_numbers'.split()].head()
# temp_value is just a string, while temp numbers is a list of numbers

Unnamed: 0,temp_value,temp_numbers
0,[973.0],[973.0]
1,[973.0],[973.0]
2,-,[295]
3,[300.0],[300.0]
4,-,[295]


In [128]:
print(df.temp_value.apply(type).value_counts())
print(df.temp_numbers.apply(type).value_counts())

<class 'str'>    22961
Name: temp_value, dtype: int64
<class 'list'>    22961
Name: temp_numbers, dtype: int64


In [129]:
df.temp_units.value_counts()

Kelvin^(1.0)        16494
-                    4481
Celsius^(1.0)        1980
Fahrenheit^(1.0)        6
Name: temp_units, dtype: int64

### Normalising temperature

In [130]:
#takes in full df, so apply axis=1
def make_temp_normalised_list(df):
    # prioritise room temperature in the case where there is both room temp mention and numerical value!
    if df.room_temperature != "-":  # make sure we haven't replaced '-' with something else
        return [295]
    if df.temp_units == 'Celsius^(1.0)':
        return [t + 273 for t in df.temp_numbers]
    elif df.temp_units == 'Fahrenheit^(1.0)':
        return [(t - 32) * 5/9 + 273 for t in df.temp_numbers]

    else:  # if Kelvin
        return df.temp_numbers


In [131]:
df['normalised_temp_values'] = df.apply(make_temp_normalised_list, axis=1)

In [132]:
df.normalised_temp_values.head()

0    [973.0]
1    [973.0]
2      [295]
3    [300.0]
4      [295]
Name: normalised_temp_values, dtype: object

In [133]:
df.normalised_temp_values.apply(len).value_counts()

1    22292
2      669
Name: normalised_temp_values, dtype: int64

In [134]:
df["normalised_temp_avg"] = df.normalised_temp_values.apply(get_average_from_list)

In [135]:
df["temp_value room_temperature normalised_temp_values normalised_temp_avg".split()].head()
# keep normalised_temp_values and normalised_temp_avg.
# now 'temp_value', 'room_temperature', and 'temp_numbers' are redundant. 'temp_units' will soon be too

Unnamed: 0,temp_value,room_temperature,normalised_temp_values,normalised_temp_avg
0,[973.0],-,[973.0],973.0
1,[973.0],-,[973.0],973.0
2,-,room temperature,[295],295.0
3,[300.0],-,[300.0],300.0
4,-,room temperature,[295],295.0


In [136]:
# check
print(df.normalised_temp_values.apply(type).value_counts())

<class 'list'>    22961
Name: normalised_temp_values, dtype: int64


In [137]:
df["normalised_temp_units"] = "Kelvin^(1.0)"

In [138]:
df.columns

Index(['compound_name', 'model', 'raw_value', 'raw_units', 'value', 'units',
       'temp_value', 'temp_units', 'room_temperature', 'editing', 'exrept',
       'doi', 'error', 'process', 'pressure_value', 'pressure_units',
       'direction_of_measurement', 'labels', 'parser', 'specifier', 'title',
       'publisher', 'yop', 'oa', 'clean_name', 'first_name',
       'normalised_model', 'temp_numbers', 'normalised_temp_values',
       'normalised_temp_avg', 'normalised_temp_units'],
      dtype='object')

In [139]:
# STATE BACKUP
df_normalised_temp = df.copy()

In [140]:
# recover
df = df_normalised_temp.copy()

### Normalising models

In [141]:
#check for problems
(df[df.value.apply(make_number_list2).isna()])['value']

Series([], Name: value, dtype: object)

In [142]:
df['value_numbers'] = df.value.apply(make_number_list2)

In [143]:
df.value_numbers.apply(len).value_counts()

1    22217
2      742
3        2
Name: value_numbers, dtype: int64

In [144]:
df.model.unique()

array(['ThermCond', 'ZT', 'Resistivity', 'Seebeck', 'PF', 'Conductivity',
       'Conductivity2'], dtype=object)

In [145]:
for mod in df.model.unique():
    print(mod + ':')
    units = df[df.model == mod].units.unique()
    print(units)
    #print([normalise_units_prefix(u) for u in units])
    print()
    
# where there is temperature, everything is in Kelvin and Celsius, so we don't need any fancy conversions

ThermCond:
['Kelvin^(-1.0)  Meter^(-1.0)  Watt^(1.0)'
 '(10^-1.0) * Kelvin^(-1.0)  Meter^(-1.0)  Watt^(1.0)'
 '(10^-4.0) * Kelvin^(-1.0)  Meter^(-1.0)  Watt^(1.0)'
 'Celsius^(-1.0)  Meter^(-1.0)  Watt^(1.0)'
 '(10^2.0) * Kelvin^(-1.0)  Meter^(-1.0)  Watt^(1.0)'
 'WattsOverMeterKelvin^(1.0)'
 '(10^-6.0) * Kelvin^(-1.0)  Meter^(-1.0)  Watt^(1.0)'
 '(10^2.0) * Joule^(1.0)  Kelvin^(-1.0)  Meter^(-1.0)  Second^(-1.0)'
 '(10^-3.0) * Kelvin^(-1.0)  Meter^(-1.0)  Watt^(1.0)']

ZT:
['-']

Resistivity:
['(10^-2.0) * Meter^(1.0)  Ohm^(1.0)' '(10^-6.0) * Meter^(1.0)  Ohm^(1.0)'
 '(10^-8.0) * Meter^(1.0)  Ohm^(1.0)' '(10^1.0) * Meter^(1.0)  Ohm^(1.0)'
 'Meter^(1.0)  Ohm^(1.0)' '(10^4.0) * Meter^(1.0)  Ohm^(1.0)'
 '(10^-2.0) * MeterOhm^(1.0)' '(10^3.0) * Meter^(1.0)  Ohm^(1.0)'
 'MeterOhm^(1.0)' '(10^7.0) * Meter^(1.0)  Ohm^(1.0)']

Seebeck:
['(10^-6.0) * Kelvin^(-1.0)  Volt^(1.0)'
 '(10^-9.0) * Kelvin^(-1.0)  Volt^(1.0)'
 '(10^-3.0) * Kelvin^(-1.0)  Volt^(1.0)' 'Kelvin^(-1.0)  Volt^(1.0)'
 '(10^-6.

In [146]:
# Seebeck V/C is the same as V/K (change per kelvin = chenge per celsius)
df[df.units == '(10^-6.0) * Celsius^(-1.0)  Volt^(1.0)']['compound_name value_numbers units'.split()]

Unnamed: 0,compound_name,value_numbers,units
16591,['Bi90Sb10'],[-45.0],(10^-6.0) * Celsius^(-1.0) Volt^(1.0)
16795,['copper–nickel alloy'],[-35.0],(10^-6.0) * Celsius^(-1.0) Volt^(1.0)
16951,['MoTe2'],[-780.0],(10^-6.0) * Celsius^(-1.0) Volt^(1.0)
16952,['MoSe2'],[-900.0],(10^-6.0) * Celsius^(-1.0) Volt^(1.0)
16956,['In2O3'],[224.0],(10^-6.0) * Celsius^(-1.0) Volt^(1.0)
16957,['In2O3'],[162.0],(10^-6.0) * Celsius^(-1.0) Volt^(1.0)
17019,['bulk TlSbTe2'],[70.0],(10^-6.0) * Celsius^(-1.0) Volt^(1.0)
17497,['TlInSe2'],[107.0],(10^-6.0) * Celsius^(-1.0) Volt^(1.0)
17750,['Bi2Te3'],[35.5],(10^-6.0) * Celsius^(-1.0) Volt^(1.0)
17751,['Fe2O3'],[-54.5],(10^-6.0) * Celsius^(-1.0) Volt^(1.0)


In [147]:
# check that our exponent extraction works

for mod in df.model.unique():
    print(mod + ':')
    units_list = df[df.model == mod].units.unique()
    for units in units_list:
        exponent_list = re.findall('\(10\^(\-?\d\d?).0\)', units)
        print(units, exponent_list)
    #print([normalise_units_prefix(u) for u in units])
    print()

ThermCond:
Kelvin^(-1.0)  Meter^(-1.0)  Watt^(1.0) []
(10^-1.0) * Kelvin^(-1.0)  Meter^(-1.0)  Watt^(1.0) ['-1']
(10^-4.0) * Kelvin^(-1.0)  Meter^(-1.0)  Watt^(1.0) ['-4']
Celsius^(-1.0)  Meter^(-1.0)  Watt^(1.0) []
(10^2.0) * Kelvin^(-1.0)  Meter^(-1.0)  Watt^(1.0) ['2']
WattsOverMeterKelvin^(1.0) []
(10^-6.0) * Kelvin^(-1.0)  Meter^(-1.0)  Watt^(1.0) ['-6']
(10^2.0) * Joule^(1.0)  Kelvin^(-1.0)  Meter^(-1.0)  Second^(-1.0) ['2']
(10^-3.0) * Kelvin^(-1.0)  Meter^(-1.0)  Watt^(1.0) ['-3']

ZT:
- []

Resistivity:
(10^-2.0) * Meter^(1.0)  Ohm^(1.0) ['-2']
(10^-6.0) * Meter^(1.0)  Ohm^(1.0) ['-6']
(10^-8.0) * Meter^(1.0)  Ohm^(1.0) ['-8']
(10^1.0) * Meter^(1.0)  Ohm^(1.0) ['1']
Meter^(1.0)  Ohm^(1.0) []
(10^4.0) * Meter^(1.0)  Ohm^(1.0) ['4']
(10^-2.0) * MeterOhm^(1.0) ['-2']
(10^3.0) * Meter^(1.0)  Ohm^(1.0) ['3']
MeterOhm^(1.0) []
(10^7.0) * Meter^(1.0)  Ohm^(1.0) ['7']

Seebeck:
(10^-6.0) * Kelvin^(-1.0)  Volt^(1.0) ['-6']
(10^-9.0) * Kelvin^(-1.0)  Volt^(1.0) ['-9']
(10^-3.0) * Kelvin

In [148]:
def normalise_units_prefix(df):
    exponent_list = re.findall('\(10\^(\-?\d\d?).0\)', df.units) #find all the powers of 10 and return the exponent
    if exponent_list:
        return [v * 10**int(exponent_list[0]) for v in df.value_numbers]
    else:
        return df.value_numbers

In [149]:
df['normalised_values'] = df.apply(normalise_units_prefix, axis=1)

In [150]:
# fix the values for resistivity. What's the correct approach for ranges?
# Inverse and then average? or the other way round? 

def normalise_resistivity_values(df):
    try:
        if df.model == 'Resistivity':
            return [1.0 / v for v in df.normalised_values]
        else:
            return df.normalised_values
    except:
        return np.nan

In [151]:
#drop some wrong zero values to avoid zero division
print(len(df))
# use apply to compare to list entry to list
df = df[df.normalised_values.apply(lambda x: x != [0.0])]
print(len(df))

22961
22956


In [152]:
#check how many changes, must match resistivity total. The small difference is from values = 1. Checked
print((df.apply(normalise_resistivity_values, axis=1) == df.normalised_values).value_counts())
print((df.model == 'Resistivity').sum())
print(((df.model == 'Resistivity') & (df.normalised_values.apply(lambda x: x == [1.0]))).sum())

True     22245
False      711
dtype: int64
718
7


In [153]:
df.normalised_values = df.apply(normalise_resistivity_values, axis=1)

In [154]:
df.normalised_values.head()

0    [1.47]
1     [2.2]
2     [1.2]
3     [0.7]
4    [1.15]
Name: normalised_values, dtype: object

In [155]:
df['normalised_avg'] = df.normalised_values.apply(get_average_from_list)
# average of inverse, for resistivity extractions

In [156]:
df["value value_numbers units normalised_values normalised_avg".split()]

# keep normalised_values and normalised_avg.
# now 'value', and 'value_numbers' are redundant. 'units' will soon be too

Unnamed: 0,value,value_numbers,units,normalised_values,normalised_avg
0,[1.47],[1.47],Kelvin^(-1.0) Meter^(-1.0) Watt^(1.0),[1.47],1.470
1,[2.2],[2.2],Kelvin^(-1.0) Meter^(-1.0) Watt^(1.0),[2.2],2.200
2,[1.2],[1.2],Kelvin^(-1.0) Meter^(-1.0) Watt^(1.0),[1.2],1.200
3,[0.7],[0.7],Kelvin^(-1.0) Meter^(-1.0) Watt^(1.0),[0.7],0.700
4,[1.15],[1.15],Kelvin^(-1.0) Meter^(-1.0) Watt^(1.0),[1.15],1.150
...,...,...,...,...,...
23265,[2010.0],[2010.0],(10^2.0) * Meter^(-1.0) Siemens^(1.0),[201000.0],201000.000
23266,[6.8],[6.8],(10^2.0) * Meter^(-1.0) Siemens^(1.0),[680.0],680.000
23267,[0.15],[0.15],(10^-1.0) * Meter^(-1.0) Siemens^(1.0),[0.015],0.015
23268,[40.0],[40.0],(10^2.0) * Meter^(-1.0) Siemens^(1.0),[4000.0],4000.000


#### Normalise units (new in v8)

In [157]:
norm_mod_dict = {'ThermCond': 'Kelvin^(-1.0)  Meter^(-1.0)  Watt^(1.0)',
 'ZT': '-',
 'Conductivity': 'Meter^(-1.0)  Siemens^(1.0)',
 'Seebeck': 'Kelvin^(-1.0)  Volt^(1.0)',
 'PF': 'Kelvin^(-2.0)  Meter^(-1.0)  Watt^(1.0)'}

norm_mod_dict    

{'ThermCond': 'Kelvin^(-1.0)  Meter^(-1.0)  Watt^(1.0)',
 'ZT': '-',
 'Conductivity': 'Meter^(-1.0)  Siemens^(1.0)',
 'Seebeck': 'Kelvin^(-1.0)  Volt^(1.0)',
 'PF': 'Kelvin^(-2.0)  Meter^(-1.0)  Watt^(1.0)'}

In [158]:
df.normalised_model.value_counts()

ZT              11710
ThermCond        3715
Conductivity     2768
Seebeck          2518
PF               2245
Name: normalised_model, dtype: int64

In [159]:
df.normalised_model.apply(lambda x: norm_mod_dict[x]).value_counts()  # must agree

-                                          11710
Kelvin^(-1.0)  Meter^(-1.0)  Watt^(1.0)     3715
Meter^(-1.0)  Siemens^(1.0)                 2768
Kelvin^(-1.0)  Volt^(1.0)                   2518
Kelvin^(-2.0)  Meter^(-1.0)  Watt^(1.0)     2245
Name: normalised_model, dtype: int64

In [160]:
df['normalised_units'] = df.normalised_model.apply(lambda x: norm_mod_dict[x])

In [161]:
df['compound_name model value units normalised_avg normalised_units'.split()].sample(22)

Unnamed: 0,compound_name,model,value,units,normalised_avg,normalised_units
13277,['PbCl2 doped Ag10Pb100Bi10Se120'],ZT,[0.86],-,0.86,-
11652,['BiCu0.975SeO'],ZT,[0.8],-,0.8,-
1248,['Ag0.8Pb22.5SbTe12S8'],ThermCond,[0.83],Kelvin^(-1.0) Meter^(-1.0) Watt^(1.0),0.83,Kelvin^(-1.0) Meter^(-1.0) Watt^(1.0)
3131,['monolayer MoS2'],ThermCond,[29.2],Kelvin^(-1.0) Meter^(-1.0) Watt^(1.0),29.2,Kelvin^(-1.0) Meter^(-1.0) Watt^(1.0)
12408,['V2O5'],ZT,[0.45],-,0.45,-
2420,['CoSb3'],ThermCond,[10.0],Kelvin^(-1.0) Meter^(-1.0) Watt^(1.0),10.0,Kelvin^(-1.0) Meter^(-1.0) Watt^(1.0)
14094,['Cu2Se'],ZT,[1.6],-,1.6,-
4622,['n-type Bi2Te3 nanowire'],ZT,[0.25],-,0.25,-
12590,['doped ZnO'],ZT,[0.65],-,0.65,-
14808,['pristine Ag2Te bulk'],ZT,[0.9],-,0.9,-


In [162]:
df.model.value_counts()

ZT               11710
ThermCond         3715
Seebeck           2518
PF                2245
Conductivity2     1822
Resistivity        718
Conductivity       228
Name: model, dtype: int64

In [163]:
df.normalised_model.value_counts()

ZT              11710
ThermCond        3715
Conductivity     2768
Seebeck          2518
PF               2245
Name: normalised_model, dtype: int64

In [164]:
df = df.replace('-', np.nan)

In [165]:
df.first_name.head()

0    Ca0.96Dy0.02Yb0.02MnO3
1          Ca0.98Dy0.02MnO3
2                    Bi2Te3
3           Cu-doped Bi2Te3
4                    Sb2Te3
Name: first_name, dtype: object

### Thermal and ionic tagging

In [166]:
df[df.model == 'ThermCond'].specifier.unique()

array(['thermal conductivity', 'κlat', 'lattice thermal conductivity',
       'minimum κlat', 'thermal conductivities', 'κL', 'κ', 'κl',
       'κtotal', 'κtot', 'phonon thermal conductivity', 'κPL', 'κlatt',
       'minimum lattice thermal conductivity', 'highest κe',
       'lattice thermal conductivities', 'highest κtotal', 'κT',
       'minimum thermal conductivity', 'κPOA', 'highest κel', 'κe',
       'highest thermal conductivity', 'κL+κB', 'Thermal conductivity',
       'electronic thermal conductivity', 'κp', 'κph', 'κt',
       'minimum lattice thermal conductivity κmin', 'minimum κ',
       'averaged lattice thermal conductivity', 'κlattice', 'κele(x)',
       'Thermal conductivities', 'highest κ', 'average κtot', 'λLC',
       'peak κL', 'κcar', 'κTot', 'bulk thermal conductivity', 'κphonon',
       'minimum thermal conductivity λmin', 'κlh+κbh', 'λL', 'bulk κl',
       'average lattice thermal conductivity',
       'phonon thermal conductivities', 'λT',
       'maximum ther

In [167]:
def thermal_tagging(df):
    if df.normalised_model == "ThermCond":
        tag = "total"
        x = df.specifier
        
        if ('el' in x) or ('κe' in x) or ('κ_e' in x) or ('λe' in x) or ('λ_e' in x) :
            tag = 'electronic'
        if ('p' in x) or ('L' in x) or ('la' in x) or ('κl' in x) or ('κ_l' in x):
            tag = 'lattice'
        return tag
    else:
        return np.nan

In [168]:
df.columns[:5]

Index(['compound_name', 'model', 'raw_value', 'raw_units', 'value'], dtype='object')

In [169]:
df.insert(4, "type", df.apply(thermal_tagging, axis=1), True) #True is for inplace, but the paramter name ain't inplace

In [170]:
df.type.value_counts()

total         2299
lattice       1338
electronic      78
Name: type, dtype: int64

In [171]:
# Ionic conductivity tagging

# df.loc[condition, column_label] = new_value
# why does df.loc[condition].column_label = new_value not work?
df.loc[df.specifier.str.contains("[Ii]on"), "type"] = "ionic"

In [172]:
df.type.value_counts()

total         2299
lattice       1338
ionic          112
electronic      78
Name: type, dtype: int64

In [173]:
# check we have only tagged electronic conductivities
df[df.type == "ionic"].normalised_model.unique()

array(['Conductivity'], dtype=object)

In [174]:
# STATE BACKUP (before columns ditching)
df_polished_bckp = df.copy()

In [175]:
# recover
df = df_polished_bckp.copy()

In [182]:
ordered_columns = 'compound_name first_name\
 normalised_model model type\
 normalised_temp_values normalised_temp_avg normalised_temp_units temp_value\
 normalised_values normalised_avg normalised_units editing pressure\
 exrept doi error process direction_of_measurement labels parser specifier title\
 publisher yop oa'.split()  # ditch the original value extractions (keep only original model)

# >>> KEPT TEMP VALUE because it is used in further cleaning 

In [183]:
# check if anything important is left out. Keep the old model in order to distinguish if necessary
check_columns(df.columns, ordered_columns)

missing columns: ['raw_value', 'raw_units', 'value', 'units', 'temp_units', 'room_temperature', 'pressure_value', 'pressure_units', 'clean_name', 'temp_numbers', 'value_numbers'] 

extra columns: []


In [184]:
# this was just to get the dois for Cooley rerun
# with open("/Users/ody/Desktop/SC/THESIS/py_tests/dois_with_records.json", "w") as f:
#    json.dump(db_extended.doi.unique().tolist(), f)

In [185]:
# order
df = df[ordered_columns].copy()

In [482]:
df.shape

(22956, 27)

### Further Cleaning

In [188]:
db = df.copy()

In [189]:
def temperature_gradient_problem(df):
    if re.search("(temperature (difference|gradient))|Δ", str(df.exrept)) and (df.normalised_temp_avg < 290):
        return True
    return False

In [190]:
def matching_temp_and_process(df):
    if isinstance(df.temp_value, str) and isinstance(df.process, str):
        temp_val = df.temp_value[1:-1].split(",")[0].split(".")[0]
        if temp_val in df.process:
            return True
    return False

In [191]:
db.head(2)

Unnamed: 0,compound_name,first_name,normalised_model,model,type,normalised_temp_values,normalised_temp_avg,normalised_temp_units,temp_value,normalised_values,...,error,process,direction_of_measurement,labels,parser,specifier,title,publisher,yop,oa
0,['Ca0.96Dy0.02Yb0.02MnO3'],Ca0.96Dy0.02Yb0.02MnO3,ThermCond,ThermCond,total,[973.0],973.0,Kelvin^(1.0),[973.0],[1.47],...,,,,,comma-level,thermal conductivity,Influence of rare-earth elements doping on the...,Elsevier,2015,no
1,['Ca0.98Dy0.02MnO3'],Ca0.98Dy0.02MnO3,ThermCond,ThermCond,total,[973.0],973.0,Kelvin^(1.0),[973.0],[2.2],...,,,,,comma-level,thermal conductivity,Influence of rare-earth elements doping on the...,Elsevier,2015,no


In [192]:
db.shape

(22956, 26)

In [193]:
original_shape = db.shape
print("original length: ", original_shape[0])
db = db[~ db.apply(temperature_gradient_problem, axis=1)].copy()
print("length after gradient problem removal: ", db.shape[0])
db = db[~ db.compound_name.str.contains("temp", na=False)].copy()
print("length after temp name removal: ", db.shape[0])
db = db[~ db.apply(matching_temp_and_process, axis=1)].copy()
print("length after matching temp and process problem removal: ", db.shape[0])

original length:  22956
length after gradient problem removal:  22925
length after temp name removal:  22925
length after matching temp and process problem removal:  22820


In [194]:
db = db[~((db.normalised_temp_avg < 0) | (db.normalised_temp_avg > 2500))]

In [195]:
db.shape

(22812, 26)

In [196]:
db[(db.normalised_model != "Seebeck") & (db.normalised_avg < 0)]["first_name normalised_model normalised_avg".split()]

Unnamed: 0,first_name,normalised_model,normalised_avg
15754,Zinc oxide,Conductivity,-12.450593
15975,Ru2Ge3+x,Conductivity,-33.333333
16142,undoped CaMnO3 nanoparticle,Conductivity,-9.433962
16159,Zinc oxide,Conductivity,-12.450593
20982,Co3O4,PF,-4.0
21011,ScNiSb,PF,-1.1
21272,orthorhombic AlMgB14,Conductivity,-11.0
21861,"CY,,exp",Conductivity,-2.0
22529,Phosphoric acid doped polyaniline,Conductivity,-1000.0
22819,vanadium tellurite,Conductivity,-350.0


In [197]:
mod_names = "ZT ThermCond Conductivity PF Seebeck".split()

In [198]:
for mn in mod_names:
    print(mn, ":\t", db[db.normalised_model == mn].normalised_avg.min())
# is it okay to have neg values?

ZT :	 2.9999999999999995e-21
ThermCond :	 0.0001
Conductivity :	 -1000.0
PF :	 -4.0
Seebeck :	 -30000000.0


In [199]:
# remove PF neg values
db[~((db.normalised_model == "PF") & (db.normalised_avg < 0))]

Unnamed: 0,compound_name,first_name,normalised_model,model,type,normalised_temp_values,normalised_temp_avg,normalised_temp_units,temp_value,normalised_values,...,error,process,direction_of_measurement,labels,parser,specifier,title,publisher,yop,oa
0,['Ca0.96Dy0.02Yb0.02MnO3'],Ca0.96Dy0.02Yb0.02MnO3,ThermCond,ThermCond,total,[973.0],973.0,Kelvin^(1.0),[973.0],[1.47],...,,,,,comma-level,thermal conductivity,Influence of rare-earth elements doping on the...,Elsevier,2015,no
1,['Ca0.98Dy0.02MnO3'],Ca0.98Dy0.02MnO3,ThermCond,ThermCond,total,[973.0],973.0,Kelvin^(1.0),[973.0],[2.2],...,,,,,comma-level,thermal conductivity,Influence of rare-earth elements doping on the...,Elsevier,2015,no
2,['Bi2Te3'],Bi2Te3,ThermCond,ThermCond,lattice,[295],295.0,Kelvin^(1.0),,[1.2],...,,,,,comma-level,κlat,Layered materials with 2D connectivity for the...,RSC,2020,no
3,['Cu-doped Bi2Te3'],Cu-doped Bi2Te3,ThermCond,ThermCond,lattice,[300.0],300.0,Kelvin^(1.0),[300.0],[0.7],...,,,parallel directions,,comma-level,κlat,Layered materials with 2D connectivity for the...,RSC,2020,no
4,['Sb2Te3'],Sb2Te3,ThermCond,ThermCond,lattice,[295],295.0,Kelvin^(1.0),,[1.15],...,,,in-plane direction,,comma-level,κlat,Layered materials with 2D connectivity for the...,RSC,2020,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23265,['FeSb1.8'],FeSb1.8,Conductivity,Conductivity2,,[100.0],100.0,Kelvin^(1.0),[100.0],[201000.0],...,,,,,comma-level,σ,Enhancement in thermoelectric properties of Fe...,Elsevier,2015,yes
23266,['PPy / g-C3N4'],PPy / g-C3N4,Conductivity,Conductivity2,,[298.0],298.0,Kelvin^(1.0),[25.0],[680.0],...,,,,,comma-level,electrical conductivity,High Performance Supercapacitor Applications a...,Springer,2019,no
23267,['CuAlO2'],CuAlO2,Conductivity,Conductivity2,,[295],295.0,Kelvin^(1.0),,[0.015],...,,sintered at 900 °C,,,comma-level,conductivity,Synthesis of CuAlO2 from chemically precipitat...,Elsevier,2015,no
23268,['SiO2 / Si'],SiO2 / Si,Conductivity,Conductivity2,,[295],295.0,Kelvin^(1.0),,[4000.0],...,,process,,,comma-level,conductivity,Surface-supported metal–organic framework thin...,RSC,2017,no


In [200]:
db.shape

(22812, 26)

In [201]:
db = db[~((db.normalised_model == "PF") & (db.normalised_avg < 0))].copy()

In [202]:
db.shape

(22810, 26)

In [203]:
db = db[~((db.normalised_model == "ZT") & (db.normalised_avg < 10**(-18)))]
# drop too small ZT values

In [204]:
db.shape

(22805, 26)

In [206]:
letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
def check(s):
    cc = 0
    for c in s:
        if c in letters:
            cc += 1
    if cc > 1:
        return False
    return True

In [207]:
db[((db.normalised_model == "PF") & (db.normalised_avg < 0))]

Unnamed: 0,compound_name,first_name,normalised_model,model,type,normalised_temp_values,normalised_temp_avg,normalised_temp_units,temp_value,normalised_values,...,error,process,direction_of_measurement,labels,parser,specifier,title,publisher,yop,oa


In [208]:
db.to_excel("normalised_database.xlsx")