In [1]:
import numpy as np
import pandas as pd
import json

In [2]:
# Load the tab-separated raw export; undecodable bytes are replaced rather than raising.
df = pd.read_csv(
    '../raw_data/Superconductivity/data/220808_MDR_OAndM.txt',
    sep='\t',
    encoding='utf-8',
    encoding_errors='replace',
    low_memory=False,
)

In [3]:
def merge_temperatures(cols, labels):
    """Pick, for every row, the first non-null value across several columns.

    Parameters
    ----------
    cols : sequence of sequences (e.g. pandas Series or lists)
        Candidate columns in priority order; for each row the first column
        holding a non-null value wins. The row count is taken from ``cols[0]``.
    labels : sequence
        One label per entry of ``cols``; identifies which column a value
        was taken from.

    Returns
    -------
    out : list
        The chosen values, only for rows that had at least one non-null
        candidate (rows without any value are omitted, so ``out`` has as many
        entries as ``mask`` has ones).
    categories : list
        For each entry of ``out``, the label of the column it came from.
    mask : numpy.ndarray
        One integer per input row: 1 if a value was found, 0 otherwise.
    """
    # Materialise each column as a plain list so that ``column[i]`` is always
    # positional. Indexing a pandas Series with ``series[i]`` is a *label*
    # lookup and fails (or reads the wrong row) when the index is not 0..n-1.
    columns = [list(col) for col in cols]

    out = []
    categories = []
    mask = []
    for i in range(len(columns[0])):
        for j, column in enumerate(columns):
            value = column[i]
            if not pd.isnull(value):
                out.append(value)
                categories.append(labels[j])
                mask.append(1)
                break
        else:
            # No column had a value for this row.
            mask.append(0)
    # Explicit dtype keeps the mask integral even when there are no rows.
    return out, categories, np.array(mask, dtype=int)

In [4]:
# Columns are listed in priority order: for each row merge_temperatures keeps
# the first of these that is non-null and records the matching label.
temperature_columns = (
    df['transition temperature (mid point)'],
    df['transition temperature (R = 100%)'],
    df['transition temperature (R = 0)'],
)
temperature_labels = ('mid', '100', '0')
out, categories, mask = merge_temperatures(temperature_columns, temperature_labels)

In [5]:
# Keep only the rows for which a transition temperature was found (mask == 1).
rows_with_temperature = np.nonzero(mask)[0]
temperature_dataset = df.iloc[rows_with_temperature]

In [6]:
# --- 1. Distinct Phys. Rev. B citation strings --------------------------------
journal_names = temperature_dataset['journal']
# The isinstance guard skips non-string cells (e.g. NaN for an empty field),
# on which the `in` test would raise TypeError. Sorting the set makes the
# processing order reproducible: plain set order for strings differs between
# interpreter runs.
physrevb_names_dirty = sorted(
    {n for n in journal_names if isinstance(n, str) and 'Phys.Rev.B' in n}
)
# Part after the first comma, e.g. 'Phys.Rev.B, 30(1984)2986' -> '30(1984)2986';
# an empty string when the citation has no comma.
physrevb_names = [
    n.split(',')[1].strip() if ',' in n else ''
    for n in physrevb_names_dirty
]

# --- 2. Citation -> '<volume>.<page>' key --------------------------------------
dois = []             # keys, one per citation that could be parsed
journal_doi_map = []  # journal_doi_map[k] = index into physrevb_names_dirty for dois[k]
journal_to_doi = {}   # raw citation string -> key
for i, (n, n_d) in enumerate(zip(physrevb_names, physrevb_names_dirty)):
    a = n.find("(")
    b = n.find(")")
    if a == -1 or b == -1:
        # No parenthesised year: the citation cannot be parsed, skip it.
        continue
    # Drop the '(year)' part, remove all whitespace and lower-case:
    # '50 (1994) 10238' -> '50.10238', '53(1996)R11976' -> '53.r11976'.
    doi = "".join((n[:a] + "." + n[b+1:]).split()).lower()
    # Keep only what precedes the first '-' (e.g. the start of a page range).
    doi = doi.split("-")[0]
    dois.append(doi)
    journal_doi_map.append(i)
    journal_to_doi[n_d] = doi

# --- 3. Known DOIs, keyed by their last two dot-separated components ----------
with open('../acquire_pdfs/physrevb_dois.txt', 'r') as f:
    valid_dois = f.readlines()
valid_dois_stripped = [".".join(v.split(".")[-2:]).strip() for v in valid_dois]
valid_dois_map = {key: index for index, key in enumerate(valid_dois_stripped)}

# --- 4. Citation -> full DOI line, for citations whose key is a known DOI ------
# Values are the raw lines from the file (including any trailing newline).
# Each key is paired with its own citation index. Looking the index up again
# with dois.index(key) would always return the first citation that produced
# that key, so other spellings of the same paper would never be mapped.
journal_to_doi_valid = {}
for name_index, item in zip(journal_doi_map, dois):
    line_index = valid_dois_map.get(item)
    if line_index is None:
        continue
    journal_to_doi_valid[physrevb_names_dirty[name_index]] = valid_dois[line_index]

In [7]:
journal_to_doi

{'Phys.Rev.B, 30(1984)2986': '30.2986',
 'Phys.Rev.B, 82(2010)014534': '82.014534',
 'Phys.Rev.B, 30(1984)1253': '30.1253',
 'Phys.Rev.B, 71(2005)024533': '71.024533',
 'Phys.Rev.B, 80(2009)094501': '80.094501',
 'Phys.Rev.B, 68(2003)10050': '68.10050',
 'Phys.Rev.B,53(1996)R11976': '53.r11976',
 'Phys.Rev.B, 93(2016)115119': '93.115119',
 'Phys.Rev.B,36(1987)4014': '36.4014',
 'Phys.Rev.B,58(1998)15238': '58.15238',
 'Phys.Rev.B, 93(2016)220504': '93.220504',
 'Phys.Rev.B, 88(2013)224514': '88.224514',
 'Phys.Rev.B,50(1994)10346': '50.10346',
 'Phys.Rev.B, 86(2012)224514': '86.224514',
 'Phys.Rev.B, 81(2010)104525': '81.104525',
 'Phys.Rev.B,50 (1994) 10238': '50.10238',
 'Phys.Rev.B, 71(2005)020501': '71.020501',
 'Phys.Rev.B, 92(2015)020505': '92.020505',
 'Phys.Rev.B, 91(2015)020507': '91.020507',
 'Phys.Rev.B, 94(2016)180509': '94.180509',
 'Phys.Rev.B, 68(2003)64512': '68.64512',
 'Phys.Rev.B, 87(2013)134512': '87.134512',
 'Phys.Rev.B, 50(1994)4168': '50.4168',
 'Phys.Rev.B, 78(

In [8]:
journal_to_doi_valid

{'Phys.Rev.B, 30(1984)2986': '10.1103/physrevb.30.2986\n',
 'Phys.Rev.B, 82(2010)014534': '10.1103/physrevb.82.014534\n',
 'Phys.Rev.B, 30(1984)1253': '10.1103/physrevb.30.1253\n',
 'Phys.Rev.B, 71(2005)024533': '10.1103/physrevb.71.024533\n',
 'Phys.Rev.B, 80(2009)094501': '10.1103/physrevb.80.094501\n',
 'Phys.Rev.B,53(1996)R11976': '10.1103/physrevb.53.r11976\n',
 'Phys.Rev.B, 93(2016)115119': '10.1103/physrevb.93.115119\n',
 'Phys.Rev.B,36(1987)4014': '10.1103/physrevb.36.4014\n',
 'Phys.Rev.B,58(1998)15238': '10.1103/physrevb.58.15238\n',
 'Phys.Rev.B, 93(2016)220504': '10.1103/physrevb.93.220504\n',
 'Phys.Rev.B, 88(2013)224514': '10.1103/physrevb.88.224514\n',
 'Phys.Rev.B,50(1994)10346': '10.1103/physrevb.50.10346\n',
 'Phys.Rev.B, 86(2012)224514': '10.1103/physrevb.86.224514\n',
 'Phys.Rev.B, 81(2010)104525': '10.1103/physrevb.81.104525\n',
 'Phys.Rev.B,50 (1994) 10238': '10.1103/physrevb.50.10238\n',
 'Phys.Rev.B, 71(2005)020501': '10.1103/physrevb.71.020501\n',
 'Phys.Rev.B,

In [9]:
# Build {doi: ["MATERIAL: f1,f2,...", "CRITICAL TEMPERATURE: t1,t2,..."]}.
# `out` holds one temperature per row of temperature_dataset, in the same
# order, so positional row indices can be used to index both.
journal_values = temperature_dataset['journal']
materials_by_doi = {}
temperatures_by_doi = {}
for name in sorted(physrevb_names_dirty):
    doi = journal_to_doi_valid.get(name)
    if doi is None:
        continue
    # Convert to a NumPy array explicitly so np.nonzero returns positions.
    name_idxs = np.nonzero((journal_values == name).to_numpy())[0]
    if len(name_idxs) == 0:
        continue
    # '10.1103/physrevb.30.2986\n' -> 'physrevb.30.2986'. strip() instead of
    # [:-1] so a line without a trailing newline does not lose a character.
    doi = doi.split('/')[1].strip()
    rows = temperature_dataset.iloc[name_idxs]
    # Accumulate per DOI: differently formatted citations of the same paper
    # must extend the entry, not overwrite it. str() is a no-op for string
    # cells and keeps ",".join from raising on non-string ones.
    materials_by_doi.setdefault(doi, []).extend(str(formula) for formula in rows['chemical formula'])
    temperatures_by_doi.setdefault(doi, []).extend(str(out[i]) for i in name_idxs)

correct_output = {
    key: [
        "MATERIAL: " + ",".join(materials),
        "CRITICAL TEMPERATURE: " + ",".join(temperatures_by_doi[key]),
    ]
    for key, materials in materials_by_doi.items()
}

In [10]:
correct_output

{'physrevb.65.064504': ['MATERIAL: S1', 'CRITICAL TEMPERATURE: 17.4'],
 'physrevb.93.224512': ['MATERIAL: Li1.84H1Fe1.14Se1O1',
  'CRITICAL TEMPERATURE: 40.5'],
 'physrevb.96.094523': ['MATERIAL: Ba1Fe1.864Rh0.136As2',
  'CRITICAL TEMPERATURE: 23.3'],
 'physrevb.96.134503': ['MATERIAL: Li0.32M1Fe2Se0.8Te1.2,Fe1Se0.4Te0.6',
  'CRITICAL TEMPERATURE: 20.84,8'],
 'physrevb.96.134504': ['MATERIAL: Ga41Mo8,Ga41Mo7V1',
  'CRITICAL TEMPERATURE: 9.8,9.2'],
 'physrevb.96.144502': ['MATERIAL: Th1As1Fe1N1', 'CRITICAL TEMPERATURE: 30'],
 'physrevb.96.184503': ['MATERIAL: Ca0.8La0.2Fe0.98Co0.02As2,Ca0.8La0.2Fe0.98Co0.02As2,Ca0.8La0.2Fe0.98Co0.02As2,Ca0.8La0.2Fe0.98Co0.02As2',
  'CRITICAL TEMPERATURE: 37.5,8.95,16.08,23.89'],
 'physrevb.96.214505': ['MATERIAL: La0.8Ti0.2Bi1S2O1,La0.85Th0.15Bi1S2O1',
  'CRITICAL TEMPERATURE: 2.48,2.64'],
 'physrevb.96.214506': ['MATERIAL: La1F0.5Bi1S2O0.5,La1F0.5Bi1Se2O0.5',
  'CRITICAL TEMPERATURE: 2.2,3.95'],
 'physrevb.96.220510': ['MATERIAL: Ca1K1Fe4As4,Ca1K1Fe3.9

In [11]:
# Serialise first, then write: non-ASCII characters are kept as-is and the
# file is pretty-printed with a 4-space indent.
database_text = json.dumps(correct_output, ensure_ascii=False, indent=4)
with open('database.json', 'w', encoding='utf-8') as database_file:
    database_file.write(database_text)

In [219]:
# NOTE(review): leftover debugging cell. It prints the kernel's working
# directory, against which the relative paths used in this notebook resolve.
# Its execution count is out of sequence with the cells above; consider
# removing it before sharing.
!pwd

/home/louis/research/pdf_processor/extraction
