In [57]:
import requests
from SPARQLWrapper import SPARQLWrapper, JSON
from pprint import pprint
import io
import re

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

In [65]:
# for entity recognition of compounds without cas numbers
def link_compound2chebi(compound):
    """
    Annotate a compound name with ChEBI classes using the NCBO Annotator
    from BioPortal.

    Written for substrates and products of reactions from Expasy enzyme.

    Parameters
    ----------
    compound : str
        Free text (typically a compound name) sent as the ``text`` parameter.

    Returns
    -------
    The decoded JSON body of the annotator response.

    Raises
    ------
    requests.HTTPError
        If the service answers with a 4xx/5xx status.
    requests.Timeout
        If the service does not answer within the timeout.

    Notes
    -----
    Reads the module-level name ``api_key``, which must be defined before
    this function is called (it is not defined in this function).
    """
    url = 'http://data.bioontology.org/annotator'
    # NOTE(review): 'mappins' does not look like a documented annotator
    # parameter; kept as-is until the intended option is confirmed.
    params = dict(apikey=api_key, text=compound, ontologies='CHEBI', longest_only='true',
                  include='properties', exclude_numbers='false', exclude_synonyms='false',
                  mappins='all')
    # a timeout keeps a stalled connection from blocking the notebook forever
    tm_results = requests.get(url=url, params=params, timeout=30)
    # surface HTTP errors (e.g. a rejected API key) instead of returning an error payload
    tm_results.raise_for_status()
    return tm_results.json()

In [66]:
def execute_query(query):
    """
    Send a SPARQL query to the Wikidata query service.

    The query is submitted with JSON as the requested return format and the
    converted result of the response is returned.
    """
    wikidata = SPARQLWrapper('https://query.wikidata.org/sparql')
    wikidata.setReturnFormat(JSON)
    wikidata.setQuery(query)
    response = wikidata.query()
    return response.convert()

# for mapping the cas number to chebi id using wikidata's sparql endpoint
def map_cas_to_chebi_wd(cas_number):
    """
    Look up the Wikidata item(s) carrying a given CAS number and their ChEBI ids.

    The query matches ``cas_number`` against ``wdt:P231`` and binds the
    ``wdt:P683`` value to ``?chebi``; English labels are requested through the
    label service.

    Parameters
    ----------
    cas_number : str
        CAS number to search for. Any value is converted with ``str()``.

    Returns
    -------
    Whatever ``execute_query`` returns for the query; the matches are
    selected as ``?compoundLabel``, ``?compound`` and ``?chebi``.
    """
    # The value comes from text scraped out of a PDF, so escape the characters
    # that would otherwise terminate or corrupt the single-quoted SPARQL literal.
    literal = (str(cas_number)
               .replace('\\', '\\\\')
               .replace("'", "\\'")
               .replace('\n', '\\n')
               .replace('\r', '\\r'))
    query = '''
    select ?compoundLabel ?compound ?chebi where {
      ?compound wdt:P231 '%s';
                wdt:P683 ?chebi.
      SERVICE wikibase:label {
            bd:serviceParam wikibase:language "en" .
      }
    }
    ''' % (literal,)
    return execute_query(query)

In [67]:
# uses pdfminer2 to extract text from a pdf
def convert_pdf_to_txt(path):
    """
    Extract the text of every page of a PDF file into one string.

    Parameters
    ----------
    path : str
        Location of the PDF file; it is opened in binary mode.

    Returns
    -------
    str
        The text written by the pdfminer ``TextConverter`` for all pages.

    The file handle, the converter device and the in-memory buffer are
    released even when opening the file or processing a page raises.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()

    with io.StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)
            # read the buffer before the device and the buffer are closed
            return retstr.getvalue()
        finally:
            device.close()

In [4]:
# extract the text of the PDF at 'data/toc_headings.pdf' into a single string
document = convert_pdf_to_txt('data/toc_headings.pdf')

In [24]:
# Split the extracted text into lines and keep only the ones of interest.
# The leading form-feed character ('\x0c') is stripped *before* the emptiness
# check, so a line consisting only of form feeds is dropped instead of
# surviving as an empty string. Lines containing an underscore, the text
# 'Updated Tables' or a '(cid:' marker are discarded.
lines = [
    stripped
    for stripped in (raw.lstrip('\x0c') for raw in document.split('\n'))
    if stripped
    and not any(marker in stripped for marker in ('_', 'Updated Tables', '(cid:'))
]

In [64]:
# For every line mentioning 'CAS', look the CAS number up on Wikidata and
# print the compound label with its ChEBI id, or a notice when the lookup
# does not yield exactly one match.
for line in lines:
    if 'CAS' not in line:
        continue
    # the CAS number is taken to be the last whitespace-separated token
    cas = line.split()[-1]
    results = map_cas_to_chebi_wd(cas)['results']['bindings']
    if len(results) == 1:
        print(results[0]['compoundLabel']['value'], results[0]['chebi']['value'])
    else:
        # reached for zero matches as well as for more than one match
        print(cas, 'no wikidata mapping')
# TODO: handle the remaining lines (those that do not start with 'CAS' and do
# not begin with a digit), e.g. via link_compound2chebi -- not implemented yet.

acrylamide 28619
cotinine 68641
cotinine 68641
4-(N-methyl-N-nitrosamino)-1-(3-pyridyl)butan-1-ol 82569
bromodichloromethane 34591
Chlorodibromomethane 34627
bromoform 38682
chloroform 35255
oxybenzone 34283
bisphenol A 33216
4-tert-octylphenol 34445
triclocarban 48347
3380-34-5 no wikidata mapping
butylparaben 88542
ethylparaben 31575
methylparaben 31835
propylparaben 32063
2,4-dichlorophenol 16738
2,5-dichlorophenol 27929
2-phenylphenol 17043
2-mercaptoimidazoline 34750
87-86-5 no wikidata mapping
2122-19-2 no wikidata mapping
atrazine 15930
138722-96-0 no wikidata mapping
desethyl atrazine 28212
desisopropylatrazine 27399
desisopropylatrazine 27399
2,4-D 28854
2,4,5-T 27903
bensulfuron-methyl 3017
64902-72-3 no wikidata mapping
97780-06-8 no wikidata mapping
foramsulfuron 83502
135397-30-7 no wikidata mapping
208465-21-8 no wikidata mapping
metsulfuron-methyl 39678
nicosulfuron 7554
144651-06-9 no wikidata mapping
86209-51-0 no wikidata mapping
prosulfuron 8523
rimsulfuron 8866
sulf

thallium 30440
7440-31-5 no wikidata mapping
tungsten 27998
uranium 32996
84145-82-4 no wikidata mapping
perchlorate 49706
thiocyanate 18022
2-hydroxy fluorene 34289
fluoren-3-ol 89645
9-hydroxyfluorene 16904
1-phenanthrol 27528
605-55-0 no wikidata mapping
3-phenanthrol 20184
7657-86-7 no wikidata mapping
1-hydroxypyrene 34093
1-naphthol 10319
betanaphthol 10432
trichloroethane 36015
tetrachloroethane 36026
1,2,3-trichloropropane 34036
ethylene dibromide 28534
O-dichlorobenzene 35290
ethylene dichloride 27789
O-dichlorobenzene 35290
paradichlorobenzene 28618
2,5-dimethylfuran 89052
benzene 16716
chlorobenzene 28097
methylene chloride 15767
ethylbenzene 16101
furan 35559
cumene 34656
tert-butyl methyl ether 27642
nitrobenzene 27798
tetrachloroethylene 17300
carbon tetrachloride 27385
toluene 17578
trichloroethylene 16602
108-38-3/106-42-3 no wikidata mapping
95-47-6 no wikidata mapping
51868-61-2 no wikidata mapping
23127-40-4 no wikidata mapping
81690-92-8 no wikidata mapping
74514-75