# Abstract Fetching, Tokenization, and Classification

In [1]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-2.5.1.tar.gz (169kB)
[K    100% |████████████████████████████████| 174kB 3.7MB/s 
[?25hCollecting et_xmlfile (from openpyxl)
  Downloading et_xmlfile-1.0.1.tar.gz
Collecting jdcal (from openpyxl)
  Downloading jdcal-1.3.tar.gz
Building wheels for collected packages: openpyxl, et-xmlfile, jdcal
  Running setup.py bdist_wheel for openpyxl ... [?25l- \ done
[?25h  Stored in directory: /content/.cache/pip/wheels/98/5e/20/70cde417026f1e168acdac7babf47b204a7b752b1a8e6bb795
  Running setup.py bdist_wheel for et-xmlfile ... [?25l- done
[?25h  Stored in directory: /content/.cache/pip/wheels/99/f6/53/5e18f3ff4ce36c990fa90ebdf2b80cd9b44dc461f750a1a77c
  Running setup.py bdist_wheel for jdcal ... [?25l- done
[?25h  Stored in directory: /content/.cache/pip/wheels/0f/63/92/19ac65ed64189de4d662f269d39dd08a887258842ad2f29549
Successfully built openpyxl et-xmlfile jdcal
Installing collected packages: et-xmlfile, jdcal, openpyxl
Successfully ins

In [2]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.70-cp27-cp27mu-manylinux1_x86_64.whl (2.2MB)
[K    100% |████████████████████████████████| 2.2MB 502kB/s 
Installing collected packages: biopython
Successfully installed biopython-1.70


**Pubmed Search:**

In [0]:
#https://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.T._entrez_unique_identifiers_ui
from Bio import Entrez

email = "mmagzoub@stanford.edu"

def search(query, num_papers = 10):
  Entrez.email = email
  handle = Entrez.esearch(db='pubmed', 
                          sort='relevance', 
                          retmax= num_papers,
                          retmode='xml',
                          term=query)
  results = Entrez.read(handle)
  return results
  
def fetch_abstracts(id_list):
  Entrez.email = email
  abstracts = []
  for id in id_list:
    fetch_abstract = Entrez.efetch(db='pubmed',
                                   rettype = 'abstract',
                                   retmode='text',
                                   id=id)
    abstract = fetch_abstract.readlines()
    abstract = '\n'.join(abstract)
    abstracts.append(abstract)
    
  return abstracts

In [0]:
import unicodedata
import re
import pandas as pd

def get_instance_table(pubmed_ids, pubmed_abstracts):

    output_df = pd.DataFrame(columns=['individual', 'molecules', 'cells', 
                                      'circuits', 'physiology', 'behavior', 
                                      'self_report', 'paradigms'])

    def unicode_to_ascii(s):
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn')

    def normalize_string(s):
        s = unicode_to_ascii(s.lower().decode('utf-8').strip())
        s = ' '.join(s.split())
        s = s.replace("'","")
        s = re.sub(r"([.!?])", r" \1", s)
        s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
        return s

    molecules = "BDNF CCK cortisol/corticosterone/steroid CRF/CRF-family dopamine/dopaminergic cannabinoids FGF2 GABA \
    glutamate neuropeptide neurosteroid NMDA NPY orexin oxytocin serotonin/serotinergic vasopressin acth \
    HPA-axis-hormones/hypothalamic-pituitary-adrenal"
    
    cells = "GABAergic glia microglia neurons pyramidal pituitary"
    
    circuits = "autonomic-nervous-system/ANS basamyg/basal-amygdala central-nucleus d-hippocampus dmpfc dorsal-acc/dacc dpag \
    hypothalamus icms insular-cortex latamyg/lateral-amygdala latPFC/insula LC medial-amygdala/medamyg OFC pag pons \
    rostral/ventral-acc rpvm v-hippocampus vmPFC vPAG bed-nucleus-of-stria-terminalis/stria-terminalis"
    
    physiology = "BP/blood-pressure context-startle \
    emg eye-tracking facial-emg fear-potentiated-startle heart-rate \
    pupillometry respiration response-accuracy skin-conductance acth average-cortisol-level potentiated-startle"
    
    behavior = "analgesia approach/early-developement avoidance facial-expressions freezing open-field \
    response-inhibition response-time risk-assessment social-approach"
    
    self_report = "fear-survey-schedule suds anxiety-sensitivity-index bis/Barratt-Impulsivity-Scale \
    fear-of-negative-evaluation-scale intolerance-of-uncertainty-scale leds"
    
    paradigms = "behavioral-approach-test co2-challenge-test cold-pressor-test fear-conditioning \
    stranger-tests trier-social-stress-test npu-threat-task"
    
    list_of_indices = []
    list_of_uoas = []
    list_of_uoa_types = []
    index2word = {}
    total_size = 0
    UOAs = [molecules, cells, circuits, physiology, 
            behavior, self_report, paradigms]
    UOAnames = ['molecules', 'cells', 'circuits', 'physiology', 
                'behavior', 'self_report', 'paradigms']
    
    for doc_indx, [pubmed_id, abstract_str] in enumerate(zip(pubmed_ids, pubmed_abstracts)):
        output_df.loc[doc_indx, 'individual'] = 'paper' + pubmed_id
        abstract_str = normalize_string(abstract_str)
        bit_vector = [0]*len((' '.join(UOAs).split()))
        i = 0
        u = 0
        for uoa in UOAs:
            # print("size of ", UOAnames[u]," : ", len(' '.join(uoa.split()).split(' '))) 
            for unit in ' '.join(uoa.split()).split(' '):
                unit = unit.lower()
                # print(unit)
                if any(word in abstract_str for word in unit.replace("-", " ").split("/")):
    #                 list_of_indices.append(i)
    #                 list_of_uoas.append(unit)
    #                 list_of_uoa_types.append(UOAnames[u])
#                     print("u = {}".format(u))
#                     print("UOAnames[u] = {}".format(UOAnames[u]))
#                     print(unit.replace("-", "_").split("/")[0])
                    output_df.loc[doc_indx, UOAnames[u]] = unit.replace("-", "_").split("/")[0]
                    bit_vector[i] = 1
                else:
                    bit_vector[i] = 0
    #             index2word[i] = unit
                i+=1
            u += 1

    return output_df

In [0]:
def run_pubmed_query(q):
    pubmed_query = search(q)
    pubmed_abstracts = fetch_abstracts(pubmed_query['IdList'])
    pubmed_ids = [pubmed_abstracts[i].split('PMID: ')[1].split(' ')[0] for i in range(len(pubmed_abstracts))]
    output_df = get_instance_table(pubmed_ids, pubmed_abstracts)
    return(output_df)

In [0]:
def classify(tmp_df):
    anxiety_molecules = ['cortisol', 'crf']

    fear_molecules = ['bdnf', 'cck', 'cortisol', 'crf', 'dopamine', 
                     'cannabinoids', 'fgf2', 'gaba', 'glutamate', 
                     'neuropeptide', 'neurosteroid', 'nmda', 'npy', 'orexin',
                     'oxytocin', 'serotonin', 'vasopressin', 'acth', 
                     'hpa_axis_hormones']

    anxiety_cells = ['pituitary']

    fear_cells = ['pituitary', 'gabaergic', 'glia', 'microglia', 'neurons']

    anxiety_circuits = ['bed_nucleus_of_stria_terminalis']

    fear_circuits = ['autonomic_nervous_system', 'basamyg', 'central_nucleus', 
                     'd_hippocampus', 'dmpfc', 'dorsal_acc', 'dpag',
                     'hypothalamus', 'icms', 'insular_cortex', 'latamyg', 
                     'latpfc', 'lc', 'medial_amygdala', 'ofc', 'pag', 
                     'pons', 'rostral', 'rpvm', 'v_hippocampus', 'vmpfc',
                     'vpag', 'bed_nucleus_of_stria_terminalis']

    anxiety_physiologies = ['acth', 'average_cortisol_level', 'potentiated_startle']

    fear_physiologies = ['bp', 'context_startle', 'emg', 'eye_tracking', 'facial_emg',
                        'fear_potentiated_startle', 'heart_rate', 'pupillometry',
                        'respiration', 'response_accuracy', 'skin_conductance']

    fear_behaviors = ['analgesia', 'approach', 'avoidance', 
                      'facial_expressions', 'freezing', 'open_field',
                      'response_inhibition', 'response_time', 'risk_assessment',
                      'social_approach']

    anxiety_behaviors = []

    anxiety_self_reports = ['anxiety_sensitivity_index', 'bis', 
                                'fear_of_negative_evaluation_scale', 
                                'intolerance_of_uncertainty_scale', 'leds']

    fear_self_reports = ['fear_survey_schedule', 'suds']

    anxiety_paradigms = ['npu_threat_task']

    fear_paradigms = ['behavioral_approach_test', 'co2_challenge_test', 
                      'cold_pressor_test', 'fear_conditioning', 
                      'stranger_tests', 'trier_social_stress_test']

    anxiety_uoas =  anxiety_molecules + anxiety_cells + anxiety_circuits + \
                    anxiety_physiologies + anxiety_behaviors + anxiety_paradigms

    fear_uoas =  fear_molecules + fear_cells + fear_circuits + \
                 fear_physiologies + fear_behaviors + fear_paradigms



    tmp_df = output_df.copy()
    tmp_df.insert(1, 'paper_classification', value=pd.Series([None]*len(output_df.index)))
    for i in range(len(tmp_df.index)):
        class_list = set()
        for j in range(len(tmp_df.columns)):
            if tmp_df.iloc[i, j] in fear_uoas:
                class_list.add('fear')
            if tmp_df.iloc[i, j] in anxiety_uoas:
                class_list.add('anxiety')
        tmp_df.loc[i, 'paper_classification'] = ', '.join(class_list)

    return tmp_df

In [0]:
def download_df(tmp_df):
    writer = pd.ExcelWriter('output.xlsx')
    output_df.to_excel(writer, 'Sheet1')
    writer.save()
    files.download('output.xlsx')

# Interactive Abstract Classifier

In [13]:
from google.colab import files

q = raw_input("Enter PubMed Search Query: ")
output_df = run_pubmed_query(q)
download_df(output_df)
classified = classify(output_df)
classified

Enter PubMed Search Query: hamburger


Unnamed: 0,individual,paper_classification,molecules,cells,circuits,physiology,behavior,self_report,paradigms
0,paper26930162,fear,,,lc,,,,
1,paper29391624,fear,,,pons,,,,
2,paper26826254,fear,,,,,approach,,
3,paper27793059,,,,,,,,
4,paper27964791,,,,,,,,
5,paper23497662,fear,,,autonomic_nervous_system,bp,,,
6,paper26286001,,,,,,,,
7,paper23196671,fear,,,pons,,,,
8,paper26212992,,,,,,,,
9,paper22611668,,,,,,,,


# Label Abstract

In [0]:
import pandas as pd

def get_bit_vector_and_key(pubmed_ids, pubmed_abstracts):
    
    '''
    input - a list of strings that are the abstract or title or article
    output:
        bit_vector - 85 binary bits, one for each unit of analysis uoa
        index2word - dictionary mapping the bit index starting at 0 to the uoa string 
        list_of_uoas - list of the positive unit of analyses in string format
        list_of_indices - list of the positive indices in the bit_vector
        
    The function will also print out for you the size of each class of UOA
    the first cell is in index 19 for example for GABAergic
    '''
    
    def unicode_to_ascii(s):
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn')

    def normalize_string(s):
        s = unicode_to_ascii(s.lower().strip())
        s = ' '.join(s.split())
        s = s.replace("'","")
        s = re.sub(r"([.!?])", r" \1", s)
        s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
        return s

    molecules = "BDNF CCK cortisol/corticosterone/steroid CRF-family dopamine/dopaminergic cannabinoids FGF2 GABA \
    glutamate neuropeptide neurosteroid NMDA NPY orexin oxytocin serotonin/serotinergic vasopressin acth \
    HPA-axis-hormones/hypothalamic–pituitary–adrenal"

    cells = "GABAergic glia neurons pyramidal pituitary "

    circuits = "autonomic-nervous-system/ANS basamyg/basal-amygdala central-nucleus d-hippocampus dmpfc dorsal-acc dpag \
    hypothalamus icms insular-cortex latamyg/lateral-amygdala latPFC/insula LC medial-amygdala/medamyg OFC pag pons \
    rostral/ventral-acc rpvm v-hippocampus vmPFC vPAG bed-nucleus-of-stria-terminalis/stria-terminalis"

    physiology = "BP/blood-pressure context-startle \
    emg eye-tracking facial-emg fear-potentiated-startle heart-rate \
    pupillometry respiration response-accuracy skin-conductance acth average-cortisol-level potentiated-startle"

    behavior = "analgesia approach/early-developement avoidance facial-expressions freezing open-field \
    response-inhibition response-time risk-assessment social-approach"

    self_report = "fear-survey-schedule suds anxiety-sensitivity-index bis/Barratt-Impulsivity-Scale \
    fear-of-negative-evaluation-scale intolerance-of-uncertainty-scale leds"

    paradigms = "behavioral-approach-test co2-challenge-test cold-pressor-test fear-conditioning \
    stranger-tests trier-social-stress-test npu-threat-task"
    
    for index, abstract_string in enumerate(abstracts):
        list_of_indices = []
        list_of_uoas = []
        index2word = {}
        total_size = 0
        UOAs = [molecules, cells, circuits, physiology, 
                behavior, self_report, paradigms]
        UOAnames = ['molecules', 'cells', 'circuits', 'physiology', 
                    'behavior', 'self_report', 'paradigms']

        bit_vector = [0]*len((' '.join(UOAs).split()))
        i = 0
        u = 0
        for uoa in UOAs:
            print("size of ", UOAnames[u]," : ", len(' '.join(uoa.split()).split(' '))) 
            u += 1
            for unit in ' '.join(uoa.split()).split(' '):
                for uoa_token in unit.split("/"):
                    if uoa_token in abstract_string:
                        output_df.loc[i, UOAnames[u]]
                if any(word in abstract_string for word in unit.split("/")):
                    list_of_indices.append(i)
                    list_of_uoas.append(unit)
                    bit_vector[i] = 1
                else:
                    bit_vector[i] = 0
                index2word[i] = unit
                i+=1

        return bit_vector, index2word, list_of_uoas, list_of_indices

In [30]:
bit_vector, index2word, list_of_uoas, list_of_indices = get_bit_vector_and_key(abstract_string)
list_of_indices

('size of ', 'molecules', ' : ', 19)
('size of ', 'cells', ' : ', 5)
('size of ', 'circuits', ' : ', 23)
('size of ', 'physiology', ' : ', 14)
('size of ', 'behavior', ' : ', 10)
('size of ', 'self_report', ' : ', 7)
('size of ', 'paradigms', ' : ', 7)


[]

In [26]:
for abstract_string in pubmed_abstracts:
  bit_vector, index2word, list_of_uoas, list_of_indices = get_bit_vector_and_key(abstract_string)
  print(list_of_indices)

('size of ', 'molecules', ' : ', 19)
('size of ', 'cells', ' : ', 5)
('size of ', 'circuits', ' : ', 23)
('size of ', 'physiology', ' : ', 14)
('size of ', 'behavior', ' : ', 10)
('size of ', 'self_report', ' : ', 7)
('size of ', 'paradigms', ' : ', 7)
[]
('size of ', 'molecules', ' : ', 19)
('size of ', 'cells', ' : ', 5)
('size of ', 'circuits', ' : ', 23)
('size of ', 'physiology', ' : ', 14)
('size of ', 'behavior', ' : ', 10)
('size of ', 'self_report', ' : ', 7)
('size of ', 'paradigms', ' : ', 7)
[]
('size of ', 'molecules', ' : ', 19)
('size of ', 'cells', ' : ', 5)
('size of ', 'circuits', ' : ', 23)
('size of ', 'physiology', ' : ', 14)
('size of ', 'behavior', ' : ', 10)
('size of ', 'self_report', ' : ', 7)
('size of ', 'paradigms', ' : ', 7)
[]
('size of ', 'molecules', ' : ', 19)
('size of ', 'cells', ' : ', 5)
('size of ', 'circuits', ' : ', 23)
('size of ', 'physiology', ' : ', 14)
('size of ', 'behavior', ' : ', 10)
('size of ', 'self_report', ' : ', 7)
('size of ', 'p

# OWL ontology