### Load data

In [144]:
# coding: utf-8

import spacy
from spacy.lang.en import English 
from spacy.tokens import Doc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import os
import tqdm as notebook_tqdm

# If you want to suppress all warnings:
import warnings
warnings.filterwarnings("ignore")

In [145]:
metabolite_file = '../../Data/hmdb_mDivided.csv'
met_lib = pd.read_csv(metabolite_file)
print(met_lib.shape)

import json
syn_lib = json.load(open('../../Data/SYNONYMSv1.1.json'))
print(len(syn_lib))

(217920, 4)
217920


In [146]:
# Check at 391, 394 - nan synonyms
print(met_lib.iloc[0])
syn_lib[0]

accession                                           HMDB0000001
iupac_name    b'(2S)-2-amino-3-(1-methyl-1H-imidazol-4-yl)pr...
name                                       b'1-Methylhistidine'
synonym       (2S)-2-Amino-3-(1-methyl-1H-imidazol-4-yl)prop...
Name: 0, dtype: object


['(2S)-2-Amino-3-(1-methyl-1H-imidazol-4-yl)propanoic acid',
 'Pi-methylhistidine',
 '(2S)-2-Amino-3-(1-methyl-1H-imidazol-4-yl)propanoate',
 '1 Methylhistidine',
 '1-Methyl histidine',
 '1-Methyl-histidine',
 '1-Methyl-L-histidine',
 '1-MHis',
 '1-N-Methyl-L-histidine',
 'L-1-Methylhistidine',
 'N1-Methyl-L-histidine',
 '1-Methylhistidine dihydrochloride',
 '1-Methylhistidine']

In [14]:
met_lib[:10]

Unnamed: 0,accession,iupac_name,name,synonym
0,HMDB0000001,b'(2S)-2-amino-3-(1-methyl-1H-imidazol-4-yl)pr...,b'1-Methylhistidine',(2S)-2-Amino-3-(1-methyl-1H-imidazol-4-yl)prop...
1,HMDB0000002,"b'propane-1,3-diamine'","b'1,3-Diaminopropane'","1,3-Propanediamine:1,3-Propylenediamine:Propan..."
2,HMDB0000005,b'2-oxobutanoic acid',b'2-Ketobutyric acid',2-Ketobutanoic acid:2-Oxobutyric acid:3-Methyl...
3,HMDB0000008,b'(2S)-2-hydroxybutanoic acid',b'2-Hydroxybutyric acid',(S)-2-Hydroxybutanoic acid:2-Hydroxybutyrate:2...
4,HMDB0000010,"b'(1S,10R,11S,15S)-5-hydroxy-4-methoxy-15-meth...",b'2-Methoxyestrone',"2-(8S,9S,13S,14S)-3-Hydroxy-2-methoxy-13-methy..."
5,HMDB0000011,b'(3R)-3-hydroxybutanoic acid',b'3-Hydroxybutyric acid',(R)-(-)-beta-Hydroxybutyric acid:(R)-3-Hydroxy...
6,HMDB0000012,"b'1-[(2R,4S,5R)-4-hydroxy-5-(hydroxymethyl)oxo...",b'Deoxyuridine',2-Deoxyuridine:dU:2'-Deoxyuridine:1-(2-Deoxy-b...
7,HMDB0000014,"b'4-amino-1-[(2R,4S,5R)-4-hydroxy-5-(hydroxyme...",b'Deoxycytidine',"4-Amino-1-(2R,4S,5R)-4-hydroxy-5-(hydroxymethy..."
8,HMDB0000015,"b'(1S,2R,10R,11S,14R,15S)-14-hydroxy-14-(2-hyd...",b'Cortexolone',11-Desoxy-17-hydroxycorticosterone:Cortodoxone...
9,HMDB0000016,"b'(1S,2R,10S,11S,14S,15S)-14-(2-hydroxyacetyl)...",b'Deoxycorticosterone',"21-Hydroxy-4-pregnene-3,20-dione:21-Hydroxypro..."


In [5]:
syn_lib[:10]

[['(2S)-2-Amino-3-(1-methyl-1H-imidazol-4-yl)propanoic acid',
  'Pi-methylhistidine',
  '(2S)-2-Amino-3-(1-methyl-1H-imidazol-4-yl)propanoate',
  '1 Methylhistidine',
  '1-Methyl histidine',
  '1-Methyl-histidine',
  '1-Methyl-L-histidine',
  '1-MHis',
  '1-N-Methyl-L-histidine',
  'L-1-Methylhistidine',
  'N1-Methyl-L-histidine',
  '1-Methylhistidine dihydrochloride',
  '1-Methylhistidine'],
 ['1,3-Propanediamine',
  '1,3-Propylenediamine',
  'Propane-1,3-diamine',
  'tn',
  'Trimethylenediamine',
  '1,3-Diaminepropane',
  'Trimethylenediamine dihydrochloride',
  'Trimethylenediamine hydrochloride',
  '1,3-diamino-N-Propane',
  '1,3-Trimethylenediamine',
  '3-Aminopropylamine',
  'a,W-Propanediamine',
  '1,3-Diaminopropane'],
 ['2-Ketobutanoic acid',
  '2-Oxobutyric acid',
  '3-Methyl pyruvic acid',
  'alpha-Ketobutyrate',
  'alpha-Ketobutyric acid',
  'alpha-oxo-N-Butyric acid',
  '2-Oxobutyrate',
  '2-Oxobutanoic acid',
  '2-Ketobutanoate',
  '3-Methyl pyruvate',
  'a-Ketobutyrate',

### Preprocess terms and add to patterns file

#### Creating a pseudo-A list

In [149]:
names_processed = []
total_entries = 10000  # Change values and try which works (tried till 50k, can try more than that) # For full use len(met_lib)
start_time = time.time()

for i in range(total_entries):
## help
    # Clean text, remove 'b' from the beginning of string
    full_name = met_lib.iloc[i]['name'].strip("b\'\"")
    full_name_decoded = full_name.encode().decode() # \u2019 - ' urindine
    names_processed.append(full_name_decoded)
    
print('Metabolite names processed', len(names_processed))

A_list = set() # Set to keep unique values

# Select only one-word names, and these one-word names must have 5 to 10 characters
for i in range(len(names_processed)):
    if(len(names_processed[i]) >= 5 and len(names_processed[i]) <= 11 and len(names_processed[i].split()) == 1): 
        ''' Can 10 Oleic acid and 11 Adipic acid be considered even though they have two words? - No '''
        A_list.add(names_processed[i])

print('Number of names in A_list ', len(A_list))
print('Time taken to prepare A list ', round(time.time() - start_time, 4))

Metabolite names processed 10000
Number of names in A_list  1404
Time taken to prepare A list  0.8117


In [150]:
A_list = list(A_list)
# A_list_df = pd.DataFrame(columns = ['A_list_names'])
# A_list_df['A_list_names'] = A_list
# A_list_df.to_csv('A_list_df_02_19.csv', index=False)

In [17]:
# print('Adenosine monophosphate' in A_list)
# print('monophosphate' in A_list)
# print('Adenosine' in A_list)


#### Create a new_metabolite_lib that contains only those terms that has at least one occurence of the names in A_list

In [151]:
start_time = time.time()

def detect_metabolites_patterns(total_len):
    new_met_lib_index = []

    # Need to remove that b character (represents bytes but it is string here in df) from iupac_name and name
    metabolites_patterns = []
    redundant_items = {}
    parsed_items = {}
    name_and_id = {}

    verbose_frequency=400
    nn = 0
    for ii in range(total_len):

        flag_i1=False
        # Check if word is acceptable - to be acceptable it should have some occurence of any word from A_list
        id_name = names_processed[ii]
        

        ## Check if Main names in A_list:    
        if any(x.lower() in names_processed[ii].lower() for x in A_list):

            # Have the original name as a pattern for exact match of main name, so process the word also
            full_name_spaced = ''.join((' {} '.format(el.lower()) if not el.isalnum() and not el.isspace() else el for el in names_processed[ii]))

            # Check if not present in redundant_items dictionary and also not in parsed_items dictionary
            if((not redundant_items.get(full_name_spaced)) and (not parsed_items.get(full_name_spaced))):
                name_and_id[full_name_spaced] = id_name # key as itself and value as itself
                parsed_items[full_name_spaced] = 1    
                

                ## process synonyms column - Process these only if their main word is also in A_list. Only then we can assign pattern ID name (main word name) to it
                syn_spaced2=''
                x2=''
                # There are many metabolites with nan synonyms, so ignore them
                if(syn_lib[ii] != ['nan'] and syn_lib[ii] != [''] and syn_lib[ii] != [' '] ):

                    # No Need to split synonym
                    syn_list = syn_lib[ii]

                    # Fixing issue of synonyms being just numbers. 
                    # Solution is to filter and check if a name contains only numbers (isnumeric), if so then do not include in patterns file.
                    # syn_list = [item for item in syn_list if not item.isnumeric() and len(item) > 1]
                    updated_syn_list = []
                    for item in syn_list:
                        if(not item.isnumeric() and len(item) > 1): # This greater than 1 condition to avoid single alphabets
                            updated_syn_list.append(item)


                    for syn in updated_syn_list:
                        # Doing encode.decode even on normal strings does not affect anything, but helps with strings having unicode characters
                        syn_decoded = syn.encode().decode()
                        syn_spaced = ''.join((' {} '.format(el) if not el.isalnum() and not el.isspace() else el for el in syn_decoded))
                        syn_split = syn_spaced.split()            

                        # Check if not present in redundant_items dictionary and also not in parsed_items dictionary
                        if((syn_spaced == full_name_spaced) or ((not redundant_items.get(syn_spaced)) and (not parsed_items.get(syn_spaced)))):
                            name_and_id[syn_spaced] = id_name # key as synonym and value as main name
                            parsed_items[syn_spaced] = 1

                        elif((not redundant_items.get(syn_spaced)) and (parsed_items.get(syn_spaced))):
                            # Remove from map
                            del name_and_id[syn_spaced]
                            redundant_items[syn_spaced] = 1
                            

            elif((not redundant_items.get(full_name_spaced)) and (parsed_items.get(full_name_spaced))):
                # Remove from map
                del name_and_id[full_name_spaced]
                redundant_items[full_name_spaced] = 1

                    
        ## For those main names which are not in A_list, cehck their syn.
        else: # Without this one else, it was so hard to find the issue
            ##check whether A_list in current syn
            syn_spaced2=''
            x2=''
            # There are many metabolites with nan synonyms, so ignore them
            if(syn_lib[ii] != ['nan'] and syn_lib[ii] != [''] and syn_lib[ii] != [' '] ):

                # No Need to split synonym
                syn_list = syn_lib[ii]

                # Fixing issue of synonyms being just numbers. 
                # Solution is to filter and check if a name contains only numbers (isnumeric), if so then do not include in patterns file.
                # syn_list = [item for item in syn_list if not item.isnumeric() and len(item) > 1]
                updated_syn_list = []
                for item in syn_list:
                    if(not item.isnumeric() and len(item) > 1): # This greater than 1 condition to avoid single alphabets
                        updated_syn_list.append(item)


                for syn in updated_syn_list:

                    # Doing encode.decode even on normal strings does not affect anything, but helps with strings having unicode characters
                    syn_decoded = syn.encode().decode()
                    syn_spaced = ''.join((' {} '.format(el) if not el.isalnum() and not el.isspace() else el for el in syn_decoded))
                    syn_split = syn_spaced.split()
                    
                    ##issue2
                    if any(x.lower() in syn_spaced.lower() for x in A_list):                    
                        # Check if not present in redundant_items dictionary and also not in parsed_items dictionary
                        if(((not redundant_items.get(syn_spaced)) and (not parsed_items.get(syn_spaced)))):
                            name_and_id[syn_spaced] = id_name # key as synonym and value as main name
                            parsed_items[syn_spaced] = 1

                        elif((not redundant_items.get(syn_spaced)) and (parsed_items.get(syn_spaced))):
                            # Remove from map
                            del name_and_id[syn_spaced]
                            redundant_items[syn_spaced] = 1



    # Once every word is processed, remove all items from map which are also in redundant_items
    final_name_and_id = {k: v for k, v in name_and_id.items() if k not in redundant_items} 

    # Now map to patterns
    for full_name_spaced in final_name_and_id:
        id_name = final_name_and_id[full_name_spaced]
        full_name_list = full_name_spaced.split()
        add_split_pattern = []

        ##issue2
        # If two letter word then do case sensitive exact match. This is to handle AS, AT words, not to be confused with as, at.
        if(len(full_name_spaced) == 2):
            metabolites_patterns.append({"label": "Metabolites", "pattern": full_name_spaced, "id": id_name })
        else:  
            # Having double spaces also does not affect as .split() removes any word between any number of spaces
            add_split_pattern = []
            for split_word in full_name_list:
                add_split_pattern.append({"LOWER": str(split_word.lower())}) # The first lower did not work
            metabolites_patterns.append({"label": "Metabolites", "pattern": add_split_pattern, "id": id_name })

        nn+=1
        if verbose_frequency and nn%verbose_frequency==0:
    #         print("\n name col:",nn,ii,"::",names_processed[ii],"::",x)
            print(metabolites_patterns[-1], '\n')
        # No need of break - it completely stops program

    print('Total metabolite patterns detected: ', len(metabolites_patterns))
    return metabolites_patterns, redundant_items
    

In [152]:
metabolites_patterns, red_items = detect_metabolites_patterns(1000) # Pass the number of rows from metabolite file to use for creating patterns


{'label': 'Metabolites', 'pattern': [{'LOWER': '('}, {'LOWER': '+'}, {'LOWER': ')'}, {'LOWER': '-'}, {'LOWER': 'sodium'}, {'LOWER': 'l'}, {'LOWER': '-'}, {'LOWER': 'ascorbate'}], 'id': 'Ascorbic acid'} 

{'label': 'Metabolites', 'pattern': [{'LOWER': 'hydrocortisone'}, {'LOWER': 'butyrate'}], 'id': 'Cortisol'} 

{'label': 'Metabolites', 'pattern': [{'LOWER': 'dihydro'}, {'LOWER': '-'}, {'LOWER': '2'}, {'LOWER': ','}, {'LOWER': '4'}, {'LOWER': '('}, {'LOWER': '1h'}, {'LOWER': ','}, {'LOWER': '3h'}, {'LOWER': ')'}, {'LOWER': '-'}, {'LOWER': 'pyrimidinedione'}], 'id': 'Dihydrouracil'} 

{'label': 'Metabolites', 'pattern': [{'LOWER': 'potassium'}, {'LOWER': 'glycolate'}], 'id': 'Glycolic acid'} 

{'label': 'Metabolites', 'pattern': [{'LOWER': 'formic'}, {'LOWER': 'acid'}, {'LOWER': ','}, {'LOWER': 'potassium'}, {'LOWER': 'salt'}], 'id': 'Formic acid'} 

{'label': 'Metabolites', 'pattern': [{'LOWER': 'alanine'}, {'LOWER': 'doms'}, {'LOWER': '-'}, {'LOWER': 'adrian'}, {'LOWER': 'brand'}], 'i

In [47]:
syn_lib[199]

['Uridine diphosphate',
 'Uridine diphosphoric acid',
 "Uridine 5'-diphosphoric acid",
 "5'-UDP",
 'UDP',
 "Uridine 5'-pyrophosphate",
 "Uridine 5'-pyrophosphorate",
 "Uridine 5'-pyrophosphoric acid",
 'Uridine pyrophosphate',
 'Pyrophosphate, uridine',
 'Diphosphate, uridine',
 '5’-UDP',
 "Uridine 5'-(trihydrogen pyrophosphate)",
 "Uridine 5'-diphosphate",
 'Uridine 5’-(trihydrogen pyrophosphate)',
 'Uridine 5’-diphosphate',
 'Uridine 5’-diphosphoric acid',
 'Uridine 5’-pyrophosphate',
 'Uridine 5’-pyrophosphoric acid']

In [12]:
syn_lib[42]

["3'-Phosphoadenylate",
 "Adenosine 3',5'-bisphosphate",
 'PAP',
 'Phosphoadenosine phosphate',
 "3'-Phosphoadenylic acid",
 "Adenosine 3',5'-bisphosphoric acid",
 'Phosphoadenosine phosphoric acid',
 "Adenosine 3',5'-diphosphoric acid",
 "3'-Phosphoryl-AMP",
 '3,5-ADP',
 '3,5-Diphosphoadenosine',
 '3-Phosphoadenosine 5-phosphate',
 '5-(Dihydrogen phosphate) 3-adenylate',
 "5-(Dihydrogen phosphate)3'-adenylic acid",
 'Adenosine 3,5-bis',
 'Adenosine 3,5-bisphosphate',
 "Adenosine 3'-phosphate-5'-phosphate, disodium salt",
 "3'-Phosphoadenosine 5'-phosphate",
 "Adenosine 3'-phosphate-5'-phosphate, monosodium salt",
 "3',5'-ADP",
 "Adenosine 3'-phosphate-5'-phosphate"]

In [13]:
metabolites_patterns[:3]

[{'label': 'Metabolites',
  'pattern': [{'LOWER': 'alpha'},
   {'LOWER': '-'},
   {'LOWER': 'ketobutyric'},
   {'LOWER': 'acid'},
   {'LOWER': ','},
   {'LOWER': 'sodium'},
   {'LOWER': 'salt'}],
  'id': '2-Ketobutyric acid'},
 {'label': 'Metabolites',
  'pattern': [{'LOWER': '2'}, {'LOWER': '-'}, {'LOWER': 'methoxyestrone'}],
  'id': '2-Methoxyestrone'},
 {'label': 'Metabolites',
  'pattern': [{'LOWER': '2'},
   {'LOWER': '-'},
   {'LOWER': '('},
   {'LOWER': '8s'},
   {'LOWER': ','},
   {'LOWER': '9s'},
   {'LOWER': ','},
   {'LOWER': '13s'},
   {'LOWER': ','},
   {'LOWER': '14s'},
   {'LOWER': ')'},
   {'LOWER': '-'},
   {'LOWER': '3'},
   {'LOWER': '-'},
   {'LOWER': 'hydroxy'},
   {'LOWER': '-'},
   {'LOWER': '2'},
   {'LOWER': '-'},
   {'LOWER': 'methoxy'},
   {'LOWER': '-'},
   {'LOWER': '13'},
   {'LOWER': '-'},
   {'LOWER': 'methyl'},
   {'LOWER': '-'},
   {'LOWER': '7'},
   {'LOWER': ','},
   {'LOWER': '8'},
   {'LOWER': ','},
   {'LOWER': '9'},
   {'LOWER': ','},
   {'LOWER'

In [153]:
len(metabolites_patterns)

10963

### Now for abstracts: For every keyword in A_list, extract 1000 abstracts per keyword. There should be presence of that word in that abstract anywhere - not just title.
### And then do pattern-matching using our patterns file

### Below is my scraping method - not very efficient

In [15]:
# import csv
# import re
# import urllib
# from time import sleep

# query = "Cortexolone"

# # common settings between esearch and efetch
# base_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
# db = 'db=pubmed'

# # esearch settings
# search_eutil = 'esearch.fcgi?'
# search_term = '&term=' + query
# search_usehistory = '&usehistory=y'
# search_rettype = '&rettype=json'

# # call the esearch command for the query and read the web result
# search_url = base_url+search_eutil+db+search_term+search_usehistory+search_rettype
# print("this is the esearch command:\n" + search_url + "\n")
# f = urllib.request.urlopen (search_url)
# search_data = f.read().decode('utf-8')

# # extract the total abstract count
# total_abstract_count = int(re.findall("<Count>(\d+?)</Count>",search_data)[0])

# if(total_abstract_count < 10):
#     max_count = total_abstract_count
# else:
#     max_count = 10

# # efetch settings
# fetch_eutil = 'efetch.fcgi?'
# retmax = 20
# retstart = 0
# fetch_retmode = "&retmode=text"
# fetch_rettype = "&rettype=abstract"

# # obtain webenv and querykey settings from the esearch results
# fetch_webenv = "&WebEnv=" + re.findall ("<WebEnv>(\S+)<\/WebEnv>", search_data)[0]
# fetch_querykey = "&query_key=" + re.findall("<QueryKey>(\d+?)</QueryKey>",search_data)[0]

# # call efetch commands using a loop until all abstracts are obtained
# run = True
# all_abstracts = list()
# loop_counter = 1

# while len(all_abstracts) < max_count:
#     print("this is efetch run number " + str(loop_counter))
#     loop_counter += 1
#     fetch_retstart = "&retstart=" + str(retstart)
#     fetch_retmax = "&retmax=" + str(retmax)
#     # create the efetch url
#     fetch_url = base_url+fetch_eutil+db+fetch_querykey+fetch_webenv+fetch_retstart+fetch_retmax+fetch_retmode+fetch_rettype
#     print(fetch_url)
#     # open the efetch url
#     f = urllib.request.urlopen (fetch_url)
#     fetch_data = f.read().decode('utf-8')
#     # split the data into individual abstracts
#     abstracts = fetch_data.split("\n\n\n")
#     # append to the list all_abstracts
#     all_abstracts = all_abstracts+abstracts
#     print("a total of " + str(len(all_abstracts)) + " abstracts have been downloaded.\n")
#     # wait 2 seconds so we don't get blocked
#     sleep(2)
#     # update retstart to download the next chunk of abstracts
#     retstart = retstart + retmax
#     if retstart > total_abstract_count:
#         run = False

### Below is code found online for scraping - works fast but needs fixing

In [12]:
import gc

start_time = time.time()
abstract_writer = pd.DataFrame(columns = ['keyword', 'abstract'])
start_index = 0
end_index = len(A_list)

for i in range(start_index, end_index):
    
    if(i%100 == 0 and i!= 0): 
        time.sleep(100)
        
    time.sleep(2)
    
    gc.collect()
    # to get abstracts
    query_name = A_list[i]
    fc = 0
    with open('keywords.txt', 'w') as f:
        f.write(query_name)
        fc = 1
    f.close()
    
    
    '''
    !python3 async_pubmed_scraper_2010_2015.py --pages=10 --start=2020 --end=2022 --output='article_data.csv' # Separating out years because it can not do 2000 to 2022 at a time (gives error) 2017 - 2022

    '''

    try:
        # Round 1
        !python3 async_pubmed_scraper.py # Separating out years because it can not do 2000 to 2022 at a time (gives error) 2017 - 2022

        # Once output file is created, parse it and get necessary values
        if(fc == 1):
            file = 'article_data.csv'
            abs_pbs = pd.read_csv(file)

            abstracts = abs_pbs['abstract']
            print('Number of abstracts for', query_name, ' is ', len(abstracts))

            for j in range(len(abstracts)):
                text = str(abstracts.iloc[j])
                if(text and text != 'NO_ABSTRACT' and str(text) != 'nan'):
                    abstract_writer = abstract_writer.append({'keyword': query_name, 'abstract': text}, ignore_index=True)

    except Exception as e: 
        print('error at ', i, ' with exception ')
        
print('Time taken to get abstracts ', round(time.time() - start_time, 4))



Finding PubMed article URLs for 1 keywords found in keywords.txt

Scraping initiated for 100 article URLs found from 2017 to 2021

It took 12.25301456451416 seconds to find 100 articles; 100 unique articles were saved to article_data.csv
Number of abstracts for Acetoin  is  100

Finding PubMed article URLs for 1 keywords found in keywords.txt

Scraping initiated for 100 article URLs found from 2017 to 2021

It took 11.352988958358765 seconds to find 100 articles; 100 unique articles were saved to article_data.csv
Number of abstracts for Baclofen  is  100

Finding PubMed article URLs for 1 keywords found in keywords.txt

Scraping initiated for 71 article URLs found from 2017 to 2021

It took 8.848851203918457 seconds to find 71 articles; 69 unique articles were saved to article_data.csv
Number of abstracts for Picrocrocin  is  69

Finding PubMed article URLs for 1 keywords found in keywords.txt

Scraping initiated for 36 article URLs found from 2017 to 2021

It took 4.2440125942230225 

In [None]:
# Now all we have to do is do NER pattern matching to get entities and then for each of those entites, check if they
# are present in the A list, if they are there then record it.

### Very important note: Save A_list also, and use that A_list to accept or reject entity names from NER. For some reason, A_list differs everytime I run - check why.

In [16]:
A_list_df = pd.DataFrame(data = A_list, columns = ['A_list_names'])
A_list_df.to_csv('A_list_31_01.csv', index=False)

In [17]:
## Uncomment if extracted abstracts again

# abstract_writer.to_csv('abstract_writer_31_01.csv', index=False)
# print(abstract_writer.shape)

In [19]:
abstract_writer['keyword'].value_counts()

Acetoin        100
CE(22:0)       100
Sulindac       100
Carnosol       100
Ticarcillin    100
              ... 
12-KETE          1
Hawkinsin        1
Diflorasone      1
L-Canaline       1
Spirapril        1
Name: keyword, Length: 1430, dtype: int64

In [20]:
abstract_writer.head()

Unnamed: 0,keyword,abstract
0,Acetoin,Serratia marcescens is reported to possess the...
1,Acetoin,Acetoin (3-hydroxy-2-butanone) is an important...
2,Acetoin,A set of 917 wines of Czech origin were analys...
3,Acetoin,Microbial production of acetoin is eco-friendl...
4,Acetoin,"In this study, physicochemical composition, ni..."


### Add ruler to pipeline

In [155]:
nlp = None
nlp = English()
ruler = nlp.add_pipe("entity_ruler") #can only work for non-token pattern
with nlp.select_pipes(enable="tagger"):
    ruler.add_patterns(metabolites_patterns)

"""
Use this once we have proper A list to avoid re-computing patterns and ruler componenets

# ruler.to_disk("patterns_spaced.jsonl")
# ruler.from_disk("patterns_spaced.jsonl")

"""
# nlp.to_disk("patterns_02_17")

print('Adding to ruler done')


Adding to ruler done


### Now run the entity ruler matching algorithm on abstracts

In [23]:
"""
Here 'abstract_writer_31_01.csv' assumes that the abstracts are already stored. And A_list assumes A_list is already computed.

# If A_list is not loaded, then import A list directly

A_list_df = pd.read_csv('A_list_31_01.csv')
A_list = A_list_df['A_list_names']
print('A_list ', len(A_list) )

"""

In [158]:
# abstract = "Uridine 5'-diphosphate ( UDP ) and uridINe 5' - dipHOSphaTe - glucose dehydrogenase ( UGD ) produces UDP - glucuronic acid from UDP - glucose as a precursor of plant cell wall polysaccharides .UDP - glucuronic acid is also a sugar donor for the glycosylation of various plant specialized metabolites .Nevertheless , the roles of UGDs in plant specialized metabolism remain poorly understood .Glycyrrhiza species ( licorice ) , which are medicinal legumes , biosynthesize triterpenoid saponins , soyasaponins and glycyrrhizin , commonly glucuronosylated at the C - 3 position of the triterpenoid scaffold ."
# abstract_spaced = "".join((' {} '.format(el.encode().decode()) if (not el.isalnum() and not el.isspace()) else el for el in abstract))
# abstract_preprocessed = " ".join(abstract_spaced.split())
# doc = None
# doc = nlp(abstract_preprocessed)
# entities_dict = {}

# #print NER result
# for ent in doc.ents:
#     # Add to a dict
#     if(not entities_dict.get(str(ent.ent_id_))):
#         entities_dict[str(ent.ent_id_)] = [(str(ent.text), (ent.start_char, ent.end_char))] # Value and its offset  
#     else:
#         entities_dict[str(ent.ent_id_)].append((str(ent.text), (ent.start_char, ent.end_char)))
        
# entities_dict

{"Uridine 5'-diphosphate": [("Uridine 5 ' - diphosphate", (0, 25)),
  ('UDP', (28, 31)),
  ("uridINe 5 ' - dipHOSphaTe", (38, 63))],
 'D-Glucose': [('glucose', (66, 73))],
 'Uridine diphosphate glucuronic acid': [('UDP - glucuronic acid', (105, 126)),
  ('UDP - glucuronic acid', (198, 219))],
 'Uridine diphosphate glucose': [('UDP - glucose', (132, 145))],
 'Sucrose': [('sugar', (230, 235))]}

In [159]:
# sample_abstracts = ["Uridine 5 ' - diphosphate ( UDP ) and uridINe 5' - dipHOSphaTe - glucose dehydrogenase ( UGD ) produces UDP - glucuronic acid from UDP - glucose as a precursor of plant cell wall polysaccharides .UDP - glucuronic acid is also a sugar donor for the glycosylation of various plant specialized metabolites .Nevertheless , the roles of UGDs in plant specialized metabolism remain poorly understood .Glycyrrhiza species ( licorice ) , which are medicinal legumes , biosynthesize triterpenoid saponins , soyasaponins and glycyrrhizin , commonly glucuronosylated at the C - 3 position of the triterpenoid scaffold .",
# "Uridine diphosphate glycosyltransferases (UGTs) are the key enzymes in glycosylation processes for decorating plant natural products with sugars. Crystallography, one of the powerful techniques for determining protein structures, was used as the main experimental technique and combined with biochemical methods to study the structure-function relationship and molecular mechanisms of UGTs. Crystal structures of plant UGTs have revealed their exquisite architectures and provided the structural basis for understanding their catalytic mechanism and substrate specificity. In this chapter, some protocols and experimental details of all key stages of protein structure determination are provided, and the structural insights on plant UGTs are also highlighted in combination of method description.",
# "Lung cancer (LC) is the second most common cause of death in men after prostate cancer, and the third most recurrent type of tumor in women after breast and colon cancers. Unfortunately, when LC symptoms begin to appear, the disease is already in an advanced stage and the survival rate only reaches 2%. Thus, there is an urgent need for early diagnosis of LC using specific biomarkers, as well as effective therapies and strategies against LC. On the other hand, the influence of metals on more than 50% of proteins is responsible for their catalytic properties or structure, and their presence in molecules is determined in many cases by the genome. Research has shown that redox metal dysregulation could be the basis for the onset and progression of LC disease. Moreover, metals can interact between them through antagonistic, synergistic and competitive mechanisms, and for this reason metals ratios and correlations in LC should be explored. One of the most studied antagonists against the toxic action of metals is selenium, which plays key roles in medicine, especially related to selenoproteins. The study of potential biomarkers able to diagnose the disease in early stage is conditioned by the development of new analytical methodologies. In this sense, omic methodologies like metallomics, proteomics and metabolomics can greatly assist in the discovery of biomarkers for LC early diagnosis.",
# "Omega-3 polyunsaturated fatty acid (ω-3 PUFA) supplements for chemoprevention of different types of cancer including lung cancer has been investigated in recent years. ω-3 PUFAs are considered immunonutrients, commonly used in the nutritional therapy of cancer patients. ω-3 PUFAs play essential roles in cell signaling and in cell structure and fluidity of membranes. They participate in the resolution of inflammation and have anti-inflammatory effects. Lung cancer patients suffer complications, such as anorexia-cachexia syndrome, pain and depression. The European Society for Clinical Nutrition and Metabolism (ESPEN) 2017 guidelines for cancer patients only discuss the use of ω-3 PUFAs for cancer-cachexia treatment, leaving aside other cancer-related complications that could potentially be managed by ω-3 PUFAs. This review aims to elucidate whether the effects of ω-3 PUFAs in lung cancer is supplementary, pharmacological or both. In addition, clinical studies, evidence in cell lines and animal models suggest how ω-3 PUFAs induce anticancer effects. ω-3 PUFAs and their metabolites are suggested to modulate pivotal pathways underlying the progression or complications of lung cancer, indicating that this is a promising field to be explored. Further investigation is still required to analyze the benefits of ω-3 PUFAs as supplementation or pharmacological treatment in lung cancer.",
# "A causal association has been established between alcohol consumption and cancers of the oral cavity, pharynx, larynx, oesophagus, liver, colon, rectum, and, in women, breast; an association is suspected for cancers of the pancreas and lung. Evidence suggests that the effect of alcohol is modulated by polymorphisms in genes encoding enzymes for ethanol metabolism (eg, alcohol dehydrogenases, aldehyde dehydrogenases, and cytochrome P450 2E1), folate metabolism, and DNA repair. The mechanisms by which alcohol consumption exerts its carcinogenic effect have not been defined fully, although plausible events include: a genotoxic effect of acetaldehyde, the main metabolite of ethanol; increased oestrogen concentration, which is important for breast carcinogenesis; a role as solvent for tobacco carcinogens; production of reactive oxygen species and nitrogen species; and changes in folate metabolism. Alcohol consumption is increasing in many countries and is an important cause of cancer worldwide.",
# "Ethanol is neither genotoxic nor mutagenic. Its first metabolite acetaldehyde, however, is a powerful local carcinogen. Point mutation in ALDH2 gene proves the causal relationship between acetaldehyde and upper digestive tract cancer in humans. Salivary acetaldehyde concentration and exposure time are the two major and quantifiable factors regulating the degree of local acetaldehyde exposure in the ideal target organ, oropharynx. Instant microbial acetaldehyde formation from alcohol represents >70% of total ethanol associated acetaldehyde exposure in the mouth. In the oropharynx and achlorhydric stomach acetaldehyde is not metabolized to safe products, instead in the presence of alcohol it accumulates in saliva and gastric juice in mutagenic concentrations. A common denominator in alcohol, tobacco and food associated upper digestive tract carcinogenesis is acetaldehyde. Epidemiological studies on upper GI tract cancer are biased, since they miss information on acetaldehyde exposure derived from alcohol and acetaldehyde present in 'non-alcoholic' beverages and food.",
# "Infertility is a severe medical problem and is considered a serious global public health issue affecting a large proportion of humanity. Oxidative stress is one of the most crucial factors involved in infertility. Recent studies indicate that the overproduction of reactive oxygen species (ROS) or reactive nitrogen species (RNS) may cause damage to the male and female reproductive systems leading to infertility. Low amounts of ROS and RNS are essential for the normal functioning of the male and female reproductive systems, such as sperm motility, acrosome reaction, interactions with oocytes, ovulation, and the maturation of follicles. Environmental factors such as heavy metals can cause reproductive dysfunction in men and women through the overproduction of ROS and RNS. It is suggested that oxidative stress caused by arsenic is associated with male and female reproductive disorders such as through the alteration in sperm counts and motility, decreased sex hormones, dysfunction of the testis and ovary, as well as damage to the processes of spermatogenesis and oogenesis. This review paper highlights the relationship between arsenic-induced oxidative stress and the prevalence of infertility, with detailed explanations of potential underlying mechanisms.",
# "The microbiota-gut-brain axis is a bidirectional signaling mechanism between the gastrointestinal tract and the central nervous system. The complexity of the intestinal ecosystem is extraordinary; it comprises more than 100 trillion microbial cells that inhabit the small and large intestine, and this interaction between microbiota and intestinal epithelium can cause physiological changes in the brain and influence mood and behavior. Currently, there has been an emphasis on how such interactions affect mental health. Evidence indicates that intestinal microbiota are involved in neurological and psychiatric disorders. This review covers evidence for the influence of gut microbiota on the brain and behavior in Alzheimer disease, dementia, anxiety, autism spectrum disorder, bipolar disorder, major depressive disorder, Parkinson's disease, and schizophrenia. The primary focus is on the pathways involved in intestinal metabolites of microbial origin, including short-chain fatty acids, tryptophan metabolites, and bacterial components that can activate the host's immune system. We also list clinical evidence regarding prebiotics, probiotics, and fecal microbiota transplantation as adjuvant therapies for neuropsychiatric disorders.",
# "O - linked N - acetylglucosamine ( O - GlcNAc ) is a dynamic post - translational modification occurring on myriad proteins in the cell nucleus , cytoplasm , and mitochondria .The donor sugar for O - GlcNAcylation , uridine - diphosphate N - acetylglucosamine ( UDP - GlcNAc ) , is synthesized from glucose through the hexosamine biosynthetic pathway ( HBP ) .",
# "Cholesterol is a multifaceted metabolite that is known to modulate processes in cancer, atherosclerosis, and autoimmunity. A common denominator between these diseases appears to be the immune system, in which many cholesterol-associated metabolites impact both adaptive and innate immunity. Many cancers display altered cholesterol metabolism, and recent studies demonstrate that manipulating systemic cholesterol metabolism may be useful in improving immunotherapy responses. However, cholesterol can have both proinflammatory and anti-inflammatory roles in mammals, acting via multiple immune cell types, and depending on context. Gaining mechanistic insights into various cholesterol-related metabolites can improve our understanding of their functions and extensive effects on the immune system, and ideally will inform the design of future therapeutic strategies against cancer and/or other pathologies.",
# "The recycling of O - GlcNAc on proteins is mediated by two enzymes in cells - O - GlcNAc transferase ( OGT ) and O - GlcNAcase ( OGA ) , which catalyze the addition and removal of O - GlcNAc , respectively .O - GlcNAcylation is involved in a number of important cell processes including transcription , translation , metabolism , signal transduction , and apoptosis .Deregulation of O - GlcNAcylation has been reported to be associated with various human diseases such as cancer , diabetes , neurodegenerative diseases , and cardiovascular diseases .(0 - not any proper association)"]

# Import our file
sample_abstracts = pd.read_csv('abstract_writer_31_01.csv')
print('Abstract file :', sample_abstracts.shape)

# Check acceptance of an entity and save if it is included (save information of which keyword it belongs to and the iloc of abstracts csv)
accepted_entities = pd.DataFrame(columns=['Entity name', 'Entity label', 'Entity ID', 'Index of abstract in csv', 'Start position in document', 'End position in document'])
count = 0

# Check with Xin if this is correct
A_list = list(map(str.lower, A_list)) # Making it lower because we use lower in phrase matcher and our detected entity might not be same case as A_list text, so might miss out on those strings


for abs_index in range(500): # Change this to 10 or some low number and check results
    start_time = time.time()
    abstract = sample_abstracts.iloc[abs_index]['abstract']
    abstract_spaced = "".join((' {} '.format(el.encode().decode()) if (not el.isalnum() and not el.isspace()) else el for el in abstract))
    abstract_preprocessed = " ".join(abstract_spaced.split())

    ''' For list of documents (DO THIS OUTSIDE LOOP)
    docs = list(nlp.pipe(new_abstract))
    c_doc = Doc.from_docs(docs)
    doc = nlp(c_doc)
    '''

    ''' For single document at a time
    doc = nlp(abstract)
    '''
    doc = None
    doc = nlp(abstract_preprocessed)

    #print NER result
    for ent in doc.ents:
        # print(ent.text)
        """ 
        In new Feb 01 Update, we remove this check condition and include long terms also.

        if(str(ent.text).lower() in A_list): # If accepted as exact match in our needed A_list, then save it
        
        """
        accepted_entities = accepted_entities.append( {'Entity name':str(ent.text), 'Entity label':str(ent.label_), 'Entity ID':str(ent.ent_id_), 'Index of abstract in csv':abs_index, 'Start position in document':ent.start_char,\
        'End position in document': ent.end_char}, ignore_index=True )

    if(abs_index % 100 == 0):
        print('\nProcessing abstract number ', abs_index)
        print('Number of Accepted entities ', accepted_entities.shape[0])
    # print([(ent.text, ent.label_, ent.ent_id_, ent.start_char, ent.end_char) for ent in doc.ents])

# accepted_entities.to_csv('accepted_entities_31_01.csv', index=False)
print('Program done')


Abstract file : (98901, 2)

Processing abstract number  0
Number of Accepted entities  0

Processing abstract number  100
Number of Accepted entities  210

Processing abstract number  200
Number of Accepted entities  360

Processing abstract number  300
Number of Accepted entities  462

Processing abstract number  400
Number of Accepted entities  1092
Program done


### Output entities or final accepted entities are stored in accepted_entities_31_01.csv file