## Data Cleaning, adding metadata, normalising etc.

data cleaning and normalisation for the extracted thermoelectric database

In [2]:
import pandas as pd
import numpy as np
import json
import os
import re
from pprint import pprint
from string import ascii_letters


In [1579]:
name_prefix = "example_" # prefix to be prepended to saved database name(s)

In [1580]:
# take in the database from it's raw csv form, following thermoelectric parsing (te_parse)
database_path = "example_database.csv"
df = pd.read_csv(database_path)


In [1581]:
# when working witha a database that has all five thermoelectric models,
# and wanting to aggregate the data after cleaning (for inference or comparison),
# then set this to True. Saves an intermediary format on which the "data_aggregation.ipynb" notebook runs.
save_intermediary = False

### Metadata
#### (adding Publihser, Open Access, and Date tags)
Using custom functions instead of the metadata from chemdataextractor. These rely on the filename.
filename structure = "article-" + DOI with hyphens instead of backslashes + file extension (.txt or .xml or .html)

#### Publisher

In [1582]:
extension_to_publisher = {'xml':'Elsevier', 'html':'RSC', 'txt':'Springer'}

In [1583]:
df['publisher'] = df.filename.apply(lambda x: extension_to_publisher[x.rsplit('.',1)[1]]).copy()

#### Open Access

In [8]:
with open(os.path.join(os.getcwd(), 'resources', 'open_access_filenames_list.json'),'r') as f:
    oa_filenames = json.load(f)

In [1585]:
# add open acces (oa) tags
df['oa'] = df.filename.apply(lambda x: "yes" if x in oa_filenames else "no")

#### Publication Year

In [10]:
with open(os.path.join(os.getcwd(), 'resources', 'dates_dictionary.json'),'r') as jj:
    dates_dict = json.load(jj)

In [1587]:
# add year of publication (yop)
df['yop'] = df.filename.apply(lambda x: dates_dict[x] if x in dates_dict.keys() else np.nan).copy()

#### Titles

In [11]:
# Springer titles weren't collected during extraction, so add separately
with open(os.path.join(os.getcwd(), 'resources', 'springer_titles_dictionary.json'), 'r') as fp:
    springer_titles = json.load(fp)

In [1589]:
def add_springer_titles(df):
    if df.title == "title_fail":
        return springer_titles[df.filename]
    else:
        return df.title

In [1590]:
df.title = df.apply(add_springer_titles, axis=1).copy()

#### Columns

In [1591]:
ordered_columns = 'compound_name model raw_value raw_units value units temp_value temp_units room_temperature editing\
 excerpt filename error process pressure_value pressure_units direction_of_measurement labels parser specifier title\
 publisher yop oa'.split()

In [1592]:
# ordering and skimming
df = df[ordered_columns].copy()

### Cleaning 
##### (removing duplicates, entries without letters, and huge entires > 90 characters long)

In [1593]:
# drop duplicates
df = df.drop_duplicates()

In [1594]:
try:
    df.drop(columns=['Unnamed: 0'], inplace=True)
except KeyError as e:
    print("no Unnamed: 0 column")

no Unnamed: 0 column


In [1595]:
df['clean_name'] = df.compound_name.apply(lambda x: x[2:-2].split("', '")) 

In [1596]:
df['first_name'] = df.clean_name.apply(lambda x: x[0])

#### Trying to remove some false positives

In [1597]:
# Identify any entries without alphabetical letters

def has_letters(s):
    for c in s:
        if c in ascii_letters:
            return True
    return False



In [1598]:
# remove entries without alphabetical letters
df = df[df.first_name.apply(has_letters)].copy()

In [1599]:
# remove huge entries
df = df[df.first_name.apply(lambda x: False if len(x) > 79 else True)].copy()

In [1600]:
# remove some problematic entries, which are due to referencing (e.g. the sample containing 6 % CaTe.
# At some point we may be smart about that and use some coreference resolution by looking at the other
# extracted compounds from the same article and chekcing if the dopings match)

In [1601]:
# Number something(e.g. at wt etc.) % CEM(not spaces)
def check_problematic_form(str):
    if re.match(r"^\d+\s.+\s%\s[^\s]+$", str):
        return True
    return False
df[df.first_name.apply(check_problematic_form)]["first_name excerpt".split()].head()

Unnamed: 0,first_name,excerpt


In [1602]:
# Number % CEM
def check_problematic_form2(str):
    if re.match(r"^\d+\s%\s[^\s]+$", str):
        return True
    return False
problematic2 = df[df.first_name.apply(check_problematic_form2)]["first_name excerpt filename".split()]

#### Just noting the normalised_model, without conducting any units or value normalisation

In [1603]:
# NORMALISE electrical models
# normalised_model essentially just reffers to the property
electrical_models = 'Conductivity Conductivity2 Resistivity'
df['normalised_model'] = df.model.apply(lambda x: 'Conductivity' if x in electrical_models else x)

In [1604]:
ordered_columns = 'compound_name first_name normalised_model model raw_value raw_units value units temp_value room_temperature temp_units editing\
 excerpt filename error process pressure_value pressure_units direction_of_measurement labels parser specifier title\
 publisher yop oa'.split()

In [1605]:
# ordering and skimming
df = df[ordered_columns].copy()

#### Adding pressure

In [1606]:
# pressure was added post-extraction,
# using the Pressure class in chemdataextractor.model.units.thermoelectric_models.py,
# and was associated to results with excerpts
press = pd.read_pickle("resources/pressures_data.pkl")

In [1607]:
def simplify_pressure(s):
    if s:
        p = s[0]['Pressure']
        return [p['value'][0], p['raw_units'].replace("(","").replace(")","")]
    else:
        return s

In [1608]:
press['simple'] = press.results.apply(simplify_pressure)

In [1609]:
def recover_excerpt(s):
    if "FROM" in s:
        return s.split("FROM: ")[0][1:-1]
    else:
        return s

In [1610]:
def extract_pressure(s):
    r = recover_excerpt(s)
    if r in press.excerpt.unique():
        return press[press.excerpt == r].simple.values[0]
    else:
        return None

In [1611]:
df["pressure"] = ""

In [1612]:
df["pressure"] = df.excerpt.apply(extract_pressure)

## Normalising

#### Getting real values

In [1613]:
def make_number_list2(x):
    try:
        x_list = x[1:-1].split(',')

        return [float(n) for n in x_list]
    except:
        return np.nan

In [1614]:
def make_temp_number_list(x):
    if x == '-':
        # this is not sufficient, since it doesn't account for extractions where there are both room temp
        # and value extractions. It's just a stepping stone
        return [295] 
    else:
        x_list = x[1:-1].split(',')
        return [float(n) for n in x_list]

In [1615]:
def get_average_from_list(x):
    return (sum(x) / len(x))

In [1616]:
df['temp_numbers'] = df.temp_value.apply(make_temp_number_list)  # just a stepping stone

In [1617]:
count_dashed = (df.room_temperature != "-").sum()
if count_dashed == 0:
    print("WAIT! there seem to be no dashes in room temperature, please check that normalising will work.")
else:
    print(count_dashed)

5


In [1618]:
# temp_value is just a string, while temp numbers is a list of numbers
df['temp_value temp_numbers'.split()].head()

Unnamed: 0,temp_value,temp_numbers
0,-,[295]
1,-,[295]
2,[793.0],[793.0]
3,[900.0],[900.0]
4,[300.0],[300.0]


#### Normalising temperature

In [1619]:
def make_temp_normalised_list(df):
    # prioritise room temperature in the case where there is both room temp mention and numerical value!
    if df.room_temperature != "-":  # make sure we haven't replaced '-' with something else
        return [295]
    if df.temp_units == 'Celsius^(1.0)':
        return [t + 273 for t in df.temp_numbers]
    elif df.temp_units == 'Fahrenheit^(1.0)':
        return [(t - 32) * 5/9 + 273 for t in df.temp_numbers]

    else:  # if Kelvin
        return df.temp_numbers


In [1620]:
df['normalised_temp_values'] = df.apply(make_temp_normalised_list, axis=1)

In [1621]:
df["normalised_temp_avg"] = df.normalised_temp_values.apply(get_average_from_list)

In [1622]:
df["normalised_temp_units"] = "Kelvin^(1.0)"

#### Normalising models

In [1623]:
df['value_numbers'] = df.value.apply(make_number_list2)

In [1624]:
# Seebeck V/C is the same as V/K (change per kelvin = chenge per celsius)
df[df.units == '(10^-6.0) * Celsius^(-1.0)  Volt^(1.0)']['compound_name value_numbers units'.split()]

Unnamed: 0,compound_name,value_numbers,units


In [1625]:
def normalise_units_prefix(df):
    exponent_list = re.findall('\(10\^(\-?\d\d?).0\)', df.units) #find all the powers of 10 and return the exponent
    if exponent_list:
        return [v * 10**int(exponent_list[0]) for v in df.value_numbers]
    else:
        return df.value_numbers

In [1626]:
df['normalised_values'] = df.apply(normalise_units_prefix, axis=1)

In [1627]:
# fix the values for resistivity.

def normalise_resistivity_values(df):
    try:
        if df.model == 'Resistivity':
            return [1.0 / v for v in df.normalised_values]
        else:
            return df.normalised_values
    except:
        return np.nan

In [1628]:
# drop some wrong zero values to avoid zero division
# use apply to compare list entry to list
df = df[df.normalised_values.apply(lambda x: x != [0.0])]

In [1629]:
df.normalised_values = df.apply(normalise_resistivity_values, axis=1)

In [1630]:
df['normalised_avg'] = df.normalised_values.apply(get_average_from_list)
# average of inverse, for resistivity extractions

#### Normalising units

In [1631]:
norm_mod_dict = {'ThermCond': 'Kelvin^(-1.0)  Meter^(-1.0)  Watt^(1.0)',
 'ZT': '-',
 'Conductivity': 'Meter^(-1.0)  Siemens^(1.0)',
 'Seebeck': 'Kelvin^(-1.0)  Volt^(1.0)',
 'PF': 'Kelvin^(-2.0)  Meter^(-1.0)  Watt^(1.0)'}


In [1632]:
df['normalised_units'] = df.normalised_model.apply(lambda x: norm_mod_dict[x])

In [1633]:
df = df.replace('-', np.nan)

### Thermal conductivity (total, lattice, electronic) and electrical conductivity (ionic and normal) tagging

In [1634]:
def thermal_tagging(df):
    if df.normalised_model == "ThermCond":
        tag = "total"
        x = df.specifier
        
        if ('el' in x) or ('κe' in x) or ('κ_e' in x) or ('λe' in x) or ('λ_e' in x) :
            tag = 'electronic'
        if ('p' in x) or ('L' in x) or ('la' in x) or ('κl' in x) or ('κ_l' in x):
            tag = 'lattice'
        return tag
    else:
        return np.nan

In [1635]:
df.insert(4, "type", df.apply(thermal_tagging, axis=1), True) #True is for inplace, but the paramter name ain't inplace

In [1636]:
# Ionic conductivity tagging
df.loc[df.specifier.str.contains("[Ii]on"), "type"] = "ionic"

In [1637]:
ordered_columns = 'compound_name first_name\
 normalised_model model type\
 normalised_temp_values normalised_temp_avg normalised_temp_units temp_value\
 normalised_values normalised_avg normalised_units editing pressure\
 excerpt filename error process direction_of_measurement labels parser specifier title\
 publisher yop oa'.split()

In [1638]:
# order
df = df[ordered_columns].copy()

### Further Cleaning

In [1639]:
db = df.copy()

In [1640]:
def temperature_gradient_problem(df):
    if re.search("(temperature (difference|gradient))|Δ", str(df.excerpt)) and (df.normalised_temp_avg < 290):
        return True
    return False

In [1641]:
def matching_temp_and_process(df):
    if isinstance(df.temp_value, str) and isinstance(df.process, str):
        temp_val = df.temp_value[1:-1].split(",")[0].split(".")[0]
        if temp_val in df.process:
            return True
    return False

In [1642]:
db = db[~ db.apply(temperature_gradient_problem, axis=1)].copy()
db = db[~ db.compound_name.str.contains("temp", na=False)].copy()
db = db[~ db.apply(matching_temp_and_process, axis=1)].copy()

In [1643]:
db = db[~((db.normalised_temp_avg < 0) | (db.normalised_temp_avg > 2500))]

In [1644]:
mod_names = "ZT ThermCond Conductivity PF Seebeck".split()

In [1645]:
# remove negative PF values
db = db[~((db.normalised_model == "PF") & (db.normalised_avg < 0))].copy()

In [1646]:
# drop too small ZT values
db = db[~((db.normalised_model == "ZT") & (db.normalised_avg < 10**(-18)))]

In [1647]:
try:
    db.drop(["Unnamed: 0"], axis=1, inplace=True)
except KeyError:
    print("no 'Unnamed: 0' column.")

no 'Unnamed: 0' column.


In [1648]:
# if True, saves intermediary version, used for aggregation
if save_intermediary:
    database_name = "intermediary_database.csv"
    save_name = name_prefix + database_name

    if not os.path.exists(save_name):
        db.to_csv(save_name)
        print(f"Saved {save_name}.")
    else:
        save = input(f"'{save_name}' already exists, would you like to overwrite? [Y/n]")
        if save == "Y":
            db.to_csv(save_name)
            print(f"Overwrote {save_name}")
        else:
            print("Skipping save.")

#### Converting to main format

In [1649]:
ordered_columns = "first_name labels editing normalised_model type specifier normalised_values \
normalised_units normalised_temp_values normalised_temp_units normalised_avg normalised_temp_avg pressure process \
direction_of_measurement filename title oa publisher yop".split()

In [1650]:
db = db[ordered_columns].copy()

In [1651]:
renaming_list = "Name Label Editing Model Model_Type Specifier Value Units Temperature_Value Temperature_Units \
Value_Average Temperature_Average Pressure Process Direction_of_Measurement DOI Title Access_Type Publisher \
Publication_Year".split()

In [1652]:
if len(ordered_columns) == len(renaming_list):
    renaming_dict = {ordered_columns[i] : renaming_list[i] for i in range(len(ordered_columns))}
else:
    raise KeyError

In [1653]:
# rename the columns according to the final (main) format
db.rename(renaming_dict, axis=1, inplace=True)

In [1654]:
def change_access_type_entries(x):
    return "open" if x == "yes" else "payment"

db.Access_Type = db.Access_Type.apply(change_access_type_entries)

#### Adding authors and journal from chemataextractor's metadata

In [12]:
meta_path = os.path.join(os.getcwd(), 'resources', 'metadata_dict.json')

In [13]:
with open(meta_path, "rb") as handle:
    meta_dict = json.load(handle)

In [1657]:
def meta_to_df(df):
    doi = df.DOI
    try:
        authors = meta_dict[doi]['authors']
    except KeyError:
        authors = np.nan
    try:
        journal = meta_dict[doi]["journal"]
    except KeyError:
        journal = np.nan

        
    df.authors = authors
    df.journal = journal
    
    return df

In [1658]:
for c in "authors journal".split():
    db[c] = ""

In [1659]:
db = db.apply(meta_to_df, axis=1)

In [1660]:
# restore the DOI from the filename format
def return_doi_to_original_form(d):
    d = list(d.rsplit('.',1)[0]) # split and index to skip file extension, transform to list
    d[7] = "/"  # replace affected hyphen with original slash
    return "".join(d) # join and return

In [1661]:
db.DOI = db.DOI.apply(return_doi_to_original_form)

In [1662]:
db.rename({s: s.capitalize() for s in "authors journal".split()}, axis=1, inplace = True)

### Saving formatted main database


In [1663]:
database_name = "main_database.csv"
save_name = name_prefix + database_name

if not os.path.exists(save_name):
    db.to_csv(save_name, index=False)
    print(f"Saved {save_name}")
else:
    save = input(f"'{save_name}' already exists, would you like to overwrite? [Y/n]")
    if save == "Y":
        db.to_csv(save_name, index=False)
        print(f"Overwrote {save_name}")
    else:
        print("Skipping save.")

Saved example_main_database.csv
