In [29]:
import pandas as pd
import random 
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz
from collections import defaultdict
from rdflib import Graph, Literal, Namespace, RDF, URIRef, BNode
import openai
import ast
import pickle
from urllib.parse import quote
from tqdm import tqdm

# Load the file ENTPTNERDOI.csv.tar.gz from https://zenodo.org/records/10022727

In [2]:
ENTDOI = pd.read_csv('ENTPTNERDOI.csv')

In [3]:
ENTDOI.sample(5)

Unnamed: 0,Entity,Part_of_text,NER_Tag,DOI,Preferred_Entity,Year
7966059,calcium,Abs,CHM,10.1111/j.1551-2916.2006.01203.x,Calcium,2006.0
83090376,sulfur,Cap,CHM,10.1016/j.apsusc.2017.10.076,Sulfur,2018.0
23313622,anodes,Abs,APL,10.1016/j.jpowsour.2014.07.081,Anodes,2014.0
39090930,p+,Abs,PRO,10.1016/s0169-4332(01)00526-8,P+,2001.0
22326316,catalysts,Abs,APL,10.1186/s11671-018-2824-7,Catalyst,2018.0


### Analysing ENTDOI.csv

In [4]:
print("Number of rows : {}".format(ENTDOI.shape[0]))

Number of rows : 84659831


### Extracting all the raw entities for cleaning

Let's extract all raw entities

In [5]:
all_entities = list(set(ENTDOI['Entity'].tolist()))

In [6]:
print("Number of entities : {}".format(len(all_entities)))

Number of entities : 197623


Now, the entities are clubbed according to their Levenshtein distance

In [None]:
clusters = defaultdict(set)
threshold = 90
for entity in tqdm(entity_list, total = len(entity_list)):
    found = False
    for key in clusters.keys():
        if fuzz.ratio(key, entity) >= threshold:
            clusters[key].add(entity)
            found = True
            break
    if not found:
        clusters[entity].add(entity)

These clustered entities are cleaned using OpenAI API

### Grouping and cleaning clusters using OpenAI

In [None]:
openai.api_key = '<enter api key>'

In [None]:
prompt = """
Given a list of scientific terms as input, return a list of normalized terms as output. The normalization should take into\
account minor spelling variations, plural forms, special characters, and word order. Additionally, the normalization should\
preserve distinctions in terms that have specific additional descriptors. Each input term should have a corresponding\
output term in title case.

Examples:
Input: ['nanobeam', 'nanobeams', 'gap activity', 'gpx activity', 'alp activity', 'vibration amplitude', 'vibration amplitudes',\
 'photo images', 'Photo images', 'photo image', 'pebbles', 'pebble', 'pressure drops', 'pressure - drop',\
 'dark-field image.']
Output: ['Nanobeam', 'Nanobeam', 'Gap Activity', 'GPX Activity', 'ALP Activity', 'Vibration Amplitude', 'Vibration Amplitude',\
 'Photo Images', 'Photo Images', 'Photo Images', 'Pebbles', 'Pebbles', 'Pressure Drop', 'Pressure Drop', 'Dark Field Image']
Please provide the output as a list for the given input terms
Input Terms:
{}
Output Terms:
"""

def return_prompt(input_):
    return prompt.format(input_)

In [None]:
mapping_dict = {}
failed_lists = []
final_list = [list(value) for key, value in clusters.items()]

In [None]:
for input_list in tqdm(final_lists, total = len(final_lists)):
    Run = True
    for x in input_list:
        if x in mapping_dict:
            Run = False
    try:
        if Run:
            response = openai.Completion.create(
                                model="text-davinci-003",
                                prompt = return_prompt(input_list),
                                max_tokens=1024,
                                temperature=0.1,
                            )
            result=response.choices[0].text
            result_list = ast.literal_eval(result)

            if len(result_list) == len(input_list):
                for A, B in zip(input_list, result_list):
                    mapping_dict[A] = B
            else:
                failed_lists.append([input_list, result_list])

            if random.random()<0.01:
                time.sleep(100)
                pickle.dump(mapping_dict, open('final_rdf_mapping_dict.pickle', 'wb'))
                pickle.dump(failed_lists, open('final_rdf_error_lists.pickle', 'wb'))
        else:
            continue
    except:
        failed_lists.append([input_list, result_list])

The mapping dict contains the (raw_entity, preferred_entity) pairings. \
Note that many clusters have only value. These can be filtered to trim the number of API calls

In [11]:
all_pref = list(set(ENTDOI['Preferred_Entity'].tolist()))

In [12]:
count =0
for pref in all_pref:
    if pref not in mapping_dict:
        print(pref)
        count+=1

Molecular Symmetries
Dynamic Melting
Sillen
(MnFe2O4)x
Dry Eye
Axis Sputtering
Corneal Endothelium
G″) Modulus
ZnR2
Spiro-2CBP
Terpy-C6-SAc
Demo Blanket
HSAB
Soft Metallic
Zn Isotope Composition
ICH3
PdTPP
GMS
Initiated Oxidation
Malondialdehyde Content
Methionines
Total Electronic Energy
AISI 4140T Steel
EMI Shielding Materials
Tectono-Metamorphic
Microlitic
Pd(001)
HgCl2–CaCO3
Hydrogenolytic Activity
K Min
Total Carbon Concentration
Electrolysis Voltage
Si2p XPS
Finite Size
Elements Mappings
CZA
FMOC Deprotection
Power Load
Anticonvulsant Properties
Pd(CH3COO)2
BSZMN
Li2RuO3
iFC
Transpassive
Rump Simulation
Leakage Risk
MnGe
HAADF-STEM-EDS Mappings
As Plated
Plant Community
PTG
Piezoelectric Semiconductor
C-Heteroatom
Imaginary ε''
Label-Free Biosensor
Hydrodynamic Amperometry
Booklets
Resiniferatoxin
Ammonia Adsorption Capacity
Aza-Michael Addition
Diffusion Media
Two-Diode Model
Phosphatidylethanol
Cu0.5Co0.5Fe2O4
Ni-Mn-Sn
Precolumn Derivatization
Low-Buckled
TE - Terrestrial Ecoto

In [13]:
count

81518

In [14]:
# Adding this data to the DataFrame
ENTDOI['Preferred_Entity'] = ENTDOI['Preferred_Entity'].apply(lambda x: mapping_dict[x] if x in mapping_dict else x)

In [16]:
ENTDOI.to_csv('ENTDOI.csv', index = None)

In [18]:
def sanitize_entity(entity):
    return quote(entity)

In [25]:
ENTDOI.head()

Unnamed: 0,Entity,Part_of_text,NER_Tag,DOI,Preferred_Entity,Year
0,electrospray ionization mass spectrometry,Abs,APL,10.1016/s0925-4005(02)00161-2,Electrospray Ionization Mass Spectrometry,2002.0
1,wsoc,Abs,APL,10.1016/j.atmosenv.2007.06.034,WSOC,2007.0
2,pyridinium,Abs,APL,10.1021/jo0712401,Pyridinium,2007.0
3,navy blue,Abs,APL,10.1016/j.ecoleng.2013.07.005,Navy Blue,2013.0
4,gantt chart,Abs,APL,10.1021/ie970286p,Gantt Chart,1997.0


In [31]:
NER, pref_entity = ENTDOI[['NER_Tag', 'Preferred_Entity']].values.T

In [32]:
len(NER), len(pref_entity)

(84659831, 84659831)

In [33]:
# Dictionary to keep track of entity URI mappings
entity_uri_mapping = {}

for ner, pref_ent in tqdm(zip(NER, pref_entity), total = len(NER)):
    if pref_ent:

        # Add Preferred_Entity, creating a URI based on NER tag
        preferred_entity_uri = URIRef("http://example.com/{}/{}".format(ner, sanitize_entity(pref_ent)))

        entity_uri_mapping[pref_ent] = preferred_entity_uri

100%|██████████| 84659831/84659831 [02:59<00:00, 471315.03it/s]


### Creating RDF Dataset

In [None]:
# Create a RDF graph
# Create an empty RDF graph
g = Graph()

# Define namespaces
n = Namespace("http://example.com/")
doi = Namespace("https://doi.org/")
entity_uri_dict= {}

# Iterate through each row in the DataFrame to create RDF triples
for index, row in tqdm(ENTDOI.iterrows(), total = ENTDOI.shape[0]):
    
    if any(pd.isnull(row)):
        continue
    
    
    try:
        node_uri = URIRef(f"http://example.com/row{index}")

        # Add Entity
        g.add((node_uri, n.hasEntity, Literal(row['Entity'])))
        
        # Add Part_of_text
        g.add((node_uri, n.partOfText, Literal(row['Part_of_text'])))

        # Add NER_Tag
        g.add((node_uri, n.hasNER_Tag, Literal(row['NER_Tag'])))
        


        # Add DOI, encoding it first
        encoded_doi = quote(row['DOI'])
        doi_uri     =  URIRef(f"https://doi.org/{encoded_doi}")
        g.add((node_uri, n.hasDOI, doi_uri))
        # Add Year to DOI       
        g.add((doi_uri, n.hasYear, Literal(row['Year'])))
        
        # Add Preferred_Entity, creating a URI based on NER tag
        preferred_entity_uri =entity_uri_mapping[row['Preferred_Entity']]
        g.add((node_uri, n.hasPreferredEntity, preferred_entity_uri))
        entity_uri_dict[row['Preferred_Entity']] = preferred_entity_uri
        
        if (index+1)%10000000==0:
            g.serialize(destination='ENTPTNERDOI.nt', format='nt')
    except Exception as e:
        print(e)
        pass

g.serialize(destination='ENTPTNERDOI.nt', format='nt')    

 37%|███▋      | 31587540/84659831 [2:21:34<2:46:09, 5323.66it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 47%|████▋     | 39430092/84659831 [3:10:26<3437:59:36,  3.65it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [1]:
print("Done")

Done
