### Fragment Extraction

In this notebook, we will discuss how to search documents based on the Disease described in the CVD tree.

In [1]:
import pandas as pd
import json
from neo4j import GraphDatabase
import csv

#### Authentication to access covidgraph.org graph

In [2]:
covid_browser = "https://db.covidgraph.org/browser/"
covid_url = "bolt://db.covidgraph.org:7687"
user = "public"
password = "corona"

#driver = GraphDatabase.driver(uri, auth=(user, password))
driver = GraphDatabase.driver(uri = covid_url,\
                              auth = (user,password))

##### Example of a paper node in the covid graph

In [3]:
paper_query = "MATCH (n:Paper) RETURN n LIMIT 1"
Data = []
with driver.session() as session:
    info = session.run(paper_query)
    for item in info:
        print(item)

<Record n=<Node id=2385529 labels=frozenset({'Paper'}) properties={'cord_uid': 'ocp6yodg', 'cord19-fulltext_hash': 'b8957d48b6bcf17b7b51e004d19314ce77f653a1', 'journal': 'BMC Infect Dis', 'publish_time': '2011-12-28', 'source': 'PMC', 'title': 'Timeliness of contact tracing among flight passengers for influenza A/H1N1 2009', '_hash_id': '84b069ab23fb0ecebe6925af9c2b18ae', 'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3265549/'}>>


In [39]:
#query = "MATCH (g:GeneSymbol)<-[:MAPS]-(gn:Gene)<-[:ASSOCIATES]-(d:Disease) RETURN d LIMIT 10"
query = "MATCH (d:Disease)-->(gn:Gene) RETURN d LIMIT 1"

with driver.session() as session:
    info = session.run(query)
    for item in info:
        print(item)

<Record d=<Node id=3655483 labels=frozenset({'Disease'}) properties={'license': 'CC0 1.0', 'name': 'idiopathic pulmonary fibrosis', 'link': 'http://www.disease-ontology.org/?id=DOID:0050156', 'doid': 'DOID:0050156', 'definition': '"A pulmonary fibrosis that is characterized by scarring of the lung." [url:https\\://www.pulmonaryfibrosis.org/life-with-pf/about-ipf]', 'source': 'http://www.disease-ontology.org', '_id': 'e44b0dd76d1d22a1e9fe5c6c6dd395ab'}>>


In [43]:
#query = "MATCH (g:GeneSymbol)<-[:MAPS]-(gn:Gene)<-[:ASSOCIATES]-(d:Disease) RETURN d LIMIT 10"
query = "MATCH (d:Disease)-[rel:ASSOCIATES_DaG]->(gn:Gene) RETURN gn LIMIT 1"

with driver.session() as session:
    info = session.run(query)
    for item in info:
        print(item)

<Record gn=<Node id=95261 labels=frozenset({'Gene'}) properties={'Synonyms': 'MG1|MUC-5B|MUC5|MUC9', 'Symbol_from_nomenclature_authority': 'MUC5B', 'Symbol': 'MUC5B', 'chromosome': '11', 'description': 'mucin 5B, oligomeric mucus/gel-forming', 'Other_designations': 'mucin-5B|cervical mucin MUC5B|high molecular weight salivary mucin MG1|mucin 5, subtype B, tracheobronchial|sublingual gland mucin', 'source': 'ncbigene', 'type_of_gene': 'protein-coding', 'dbXrefs': 'MIM:600770|HGNC:HGNC:7516|Ensembl:ENSG00000117983', 'tax_id': '9606', 'sid': '727897', 'Nomenclature_status': 'O', 'map_location': '11p15.5', 'GeneID': '727897', 'Modification_date': '20200614', 'Feature_type': '-', 'LocusTag': '-', 'Full_name_from_nomenclature_authority': 'mucin 5B, oligomeric mucus/gel-forming'}>>


In [50]:
#query = "MATCH (g:GeneSymbol)<-[:MAPS]-(gn:Gene)<-[:ASSOCIATES]-(d:Disease) RETURN d LIMIT 10"
query = "MATCH (d:Disease)-[rel:ASSOCIATES_DaG]->(gn:Gene) \
         RETURN d.name,type(rel),gn.Symbol_from_nomenclature_authority LIMIT 5"

with driver.session() as session:
    info = session.run(query)
    for item in info:
        print(item)

<Record d.name='idiopathic pulmonary fibrosis' type(rel)='ASSOCIATES_DaG' gn.Symbol_from_nomenclature_authority='MUC5B'>
<Record d.name='idiopathic pulmonary fibrosis' type(rel)='ASSOCIATES_DaG' gn.Symbol_from_nomenclature_authority='TOLLIP'>
<Record d.name='idiopathic pulmonary fibrosis' type(rel)='ASSOCIATES_DaG' gn.Symbol_from_nomenclature_authority='MUC5AC'>
<Record d.name='idiopathic pulmonary fibrosis' type(rel)='ASSOCIATES_DaG' gn.Symbol_from_nomenclature_authority='TERT'>
<Record d.name='idiopathic pulmonary fibrosis' type(rel)='ASSOCIATES_DaG' gn.Symbol_from_nomenclature_authority='BRSK2'>


In [56]:
#query = "MATCH (g:GeneSymbol)<-[:MAPS]-(gn:Gene)<-[:ASSOCIATES]-(d:Disease) RETURN d LIMIT 10"
query = "MATCH (d:Disease)-[rel:ASSOCIATES_DaG]->(gn:Gene)-[mem:MEMBER]->(pw:Pathway) \
         RETURN d.name,gn.Symbol_from_nomenclature_authority, pw.name LIMIT 5"

with driver.session() as session:
    info = session.run(query)
    for item in info:
        print(item)

<Record d.name='idiopathic pulmonary fibrosis' gn.Symbol_from_nomenclature_authority='MUC5B' pw.name='Defective GALNT12 causes colorectal cancer 1 (CRCS1)'>
<Record d.name='idiopathic pulmonary fibrosis' gn.Symbol_from_nomenclature_authority='MUC5B' pw.name='O-linked glycosylation'>
<Record d.name='idiopathic pulmonary fibrosis' gn.Symbol_from_nomenclature_authority='MUC5B' pw.name='Post-translational protein modification'>
<Record d.name='idiopathic pulmonary fibrosis' gn.Symbol_from_nomenclature_authority='MUC5B' pw.name='Diseases associated with O-glycosylation of proteins'>
<Record d.name='idiopathic pulmonary fibrosis' gn.Symbol_from_nomenclature_authority='MUC5B' pw.name='Defective C1GALT1C1 causes Tn polyagglutination syndrome (TNPS)'>


In [64]:
#query = "MATCH (g:GeneSymbol)<-[:MAPS]-(gn:Gene)<-[:ASSOCIATES]-(d:Disease) RETURN d LIMIT 10"
query = "MATCH (pw:Pathway)<--(gn:Gene)-->(t:Transcript)-->(p:Protein) \
         RETURN pw.name,gn.synonyms,t.sid,p.sid LIMIT 5"

with driver.session() as session:
    info = session.run(query)
    for item in info:
        print(item)

<Record pw.name='Hemostasis' gn.synonyms=None t.sid='NM_001001579' p.sid='Q86WN5'>
<Record pw.name='Hemostasis' gn.synonyms=None t.sid='NM_001001579' p.sid='Q86SI6'>
<Record pw.name='Hemostasis' gn.synonyms=None t.sid='NM_001001579' p.sid='O76083'>
<Record pw.name='Hemostasis' gn.synonyms=None t.sid='NM_001001579' p.sid='O95225'>
<Record pw.name='Hemostasis' gn.synonyms=None t.sid='NM_001001579' p.sid='Q86SF7'>


In [65]:
#query = "MATCH (g:GeneSymbol)<-[:MAPS]-(gn:Gene)<-[:ASSOCIATES]-(d:Disease) RETURN d LIMIT 10"
query = "MATCH (d:Disease)-->(gn:Gene)-->(t:Transcript)-->(p:Protein) \
         RETURN d.name,gn.Symbol_from_nomenclature_authority,t.sid, p.sid LIMIT 5"

with driver.session() as session:
    info = session.run(query)
    for item in info:
        print(item)

<Record d.name='idiopathic pulmonary fibrosis' gn.Symbol_from_nomenclature_authority='MUC5B' t.sid='NM_002458' p.sid='O95451'>
<Record d.name='idiopathic pulmonary fibrosis' gn.Symbol_from_nomenclature_authority='MUC5B' t.sid='NM_002458' p.sid='Q9UE28'>
<Record d.name='idiopathic pulmonary fibrosis' gn.Symbol_from_nomenclature_authority='MUC5B' t.sid='NM_002458' p.sid='O00573'>
<Record d.name='idiopathic pulmonary fibrosis' gn.Symbol_from_nomenclature_authority='MUC5B' t.sid='NM_002458' p.sid='O00447'>
<Record d.name='idiopathic pulmonary fibrosis' gn.Symbol_from_nomenclature_authority='MUC5B' t.sid='NM_002458' p.sid='Q99552'>


### Get Fragment Text

In [None]:
TEXT = []
query = "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)\
        -[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(:BodyText)-[:HAS_FRAGMENT]\
        -(f:Fragment)-[:MENTIONS]->(g:GeneSymbol)\
        RETURN f.text limit 100"

with driver.session() as session:
    info = session.run(query)
    for item in info:
        TEXT.append(item.values())

#### Text Fragments for Diseases

In [152]:
TEXT = []
query = "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)\
        -[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(:BodyText)-[:HAS_FRAGMENT]\
        -(f:Fragment)-[:MENTIONS]->(g:GeneSymbol)<--(gn:Gene)<--(d:Disease)\
        RETURN f.text,gn.Symbol_from_nomenclature_authority,d.name limit 100"

with driver.session() as session:
    info = session.run(query)
    for item in info:
        TEXT.append({"fragment":item.values()[0],\
                     "gene":item.values()[1],\
                     "disease":item.values()[2]})
df = pd.DataFrame(TEXT)
df.head(10)

Unnamed: 0,disease,fragment,gene
0,schizophrenia,Interval III of the 5 CT with PLC's available ...,RANGAP1
1,schizophrenia,Overall delay in CT with PLC's also was shorte...,RANGAP1
2,schizophrenia,"Of the 21 requests, the total delay between re...",RANGAP1
3,lung cancer,This random-effects regression model construct...,APCDD1
4,lung cancer,"For influenza B, the corresponding statistics ...",APCDD1
5,lung cancer,"In the HTS with an HCP-collected swab, 132 of ...",APCDD1
6,lung cancer,The second measurement (crosses ) was collecte...,APCDD1
7,lung cancer,"E-H, Difference between observed and expected ...",APCDD1
8,lung cancer,"A-D, Molecular viral loads on first and third ...",APCDD1
9,lung cancer,Trends in influenza B viral loads are shown in...,APCDD1


In [113]:
TEXT = []
query = "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)\
-[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(:BodyText)-[:HAS_FRAGMENT]-(f:Fragment)\
-[:MENTIONS]->(g:GeneSymbol)<--(gn:Gene)<--(d:Disease)\
WHERE d.name='lung cancer'\
RETURN f.text,gn.Symbol_from_nomenclature_authority,d.name limit 100"

with driver.session() as session:
    info = session.run(query)
    for item in info:
        TEXT.append({"fragment":item.values()[0],\
                     "gene":item.values()[1],\
                     "disease":item.values()[2]})
df = pd.DataFrame(TEXT)
df.head()

Unnamed: 0,disease,fragment,gene
0,lung cancer,Several genes involved in innate and adaptive ...,TNFAIP6
1,lung cancer,TNFAIP6 protein levels did not predict overall...,TNFAIP6
2,lung cancer,More than half of all variants were located on...,TNFAIP6
3,lung cancer,"In colon cancer, better overall survival was p...",TNFAIP6
4,lung cancer,"These genes are CCL28, CXL12, EDNRRB, GFRA1, G...",TNFAIP6


#### Text Fragments for Pathways

In [103]:
TEXT = []
query = "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)\
        -[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(:BodyText)-[:HAS_FRAGMENT]\
        -(f:Fragment)-[:MENTIONS]->(g:GeneSymbol)<--(gn:Gene)-->(pw:Pathway)\
        RETURN f.text,gn.Symbol_from_nomenclature_authority,pw.name limit 100"

with driver.session() as session:
    info = session.run(query)
    for item in info:
        TEXT.append({"fragment":item.values()[0],\
                     "gene":item.values()[1],\
                     "pathway":item.values()[2]})
df = pd.DataFrame(TEXT)
df.head()

Unnamed: 0,fragment,gene,pathway
0,SARS and viral hemorrhagic fevers),SARS1,Cytosolic tRNA aminoacylation
1,SARS and viral hemorrhagic fevers),SARS1,Metabolism
2,SARS and viral hemorrhagic fevers),SARS1,Metabolism of amino acids and derivatives
3,SARS and viral hemorrhagic fevers),SARS1,Metabolism of proteins
4,SARS and viral hemorrhagic fevers),SARS1,Selenoamino acid metabolism


In [125]:
TEXT = []
query = "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)\
        -[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(:BodyText)-[:HAS_FRAGMENT]\
        -(f:Fragment)-[:MENTIONS]->(g:GeneSymbol)\
        WITH g,f\
        MATCH (g:GeneSymbol)<--(gn:Gene)-->(pw:Pathway)\
        RETURN f.text,gn.Symbol_from_nomenclature_authority,pw.name limit 100"

with driver.session() as session:
    info = session.run(query)
    for item in info:
        TEXT.append({"fragment":item.values()[0],\
                     "gene":item.values()[1],\
                     "pathway":item.values()[2]})
df = pd.DataFrame(TEXT)
df.head()

Unnamed: 0,fragment,gene,pathway
0,SARS and viral hemorrhagic fevers),SARS1,Cytosolic tRNA aminoacylation
1,SARS and viral hemorrhagic fevers),SARS1,Metabolism
2,SARS and viral hemorrhagic fevers),SARS1,Metabolism of amino acids and derivatives
3,SARS and viral hemorrhagic fevers),SARS1,Metabolism of proteins
4,SARS and viral hemorrhagic fevers),SARS1,Selenoamino acid metabolism


#### Disease - Fragment -Pathways connection

In [135]:
TEXT = []
query = "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)\
        -[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(:BodyText)-[:HAS_FRAGMENT]\
        -(f:Fragment)-[:MENTIONS]->(g:GeneSymbol)\
        WITH f,g\
        MATCH ((g:GeneSymbol)<--(gn:Gene)-->(pw:Pathway))\
        WITH f,g,gn,pw\
        MATCH ((g:GeneSymbol)<--(gn:Gene)<--(d:Disease))\
        WHERE d.name='lung cancer'\
        RETURN f.text,gn.Symbol_from_nomenclature_authority,pw.name, d.name limit 100"

with driver.session() as session:
    info = session.run(query)
    for item in info:
        TEXT.append({"fragment":item.values()[0],\
                     "gene":item.values()[1],\
                     "pathway":item.values()[2],\
                    "disease":item.values()[3]})
df = pd.DataFrame(TEXT)
df.head()


Unnamed: 0,disease,fragment,gene,pathway
0,lung cancer,Several genes involved in innate and adaptive ...,TNFAIP6,Immune System
1,lung cancer,Several genes involved in innate and adaptive ...,TNFAIP6,Neutrophil degranulation
2,lung cancer,Several genes involved in innate and adaptive ...,TNFAIP6,Innate Immune System
3,lung cancer,TNFAIP6 protein levels did not predict overall...,TNFAIP6,Immune System
4,lung cancer,TNFAIP6 protein levels did not predict overall...,TNFAIP6,Neutrophil degranulation


In [136]:
TEXT = []
query = "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)\
        -[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(:BodyText)-[:HAS_FRAGMENT]\
        -(f:Fragment)-[:MENTIONS]->(g:GeneSymbol)\
        WITH f,g\
        MATCH ((g:GeneSymbol)<--(gn:Gene)-->(pw:Pathway))\
        WITH f,g,gn,pw\
        MATCH ((g:GeneSymbol)<--(gn:Gene)<--(d:Disease))\
        WHERE d.name='lung cancer'\
        WITH f,g,gn,pw,d\
        MATCH (d:Disease)-->(gn:Gene)-->(t:Transcript)-->(p:Protein)\
        RETURN f.text,gn.Symbol_from_nomenclature_authority,pw.name, d.name,t.sid,p.sid limit 100"

with driver.session() as session:
    info = session.run(query)
    for item in info:
        TEXT.append({"fragment":item.values()[0],\
                     "gene":item.values()[1],\
                     "pathway":item.values()[2],\
                    "disease":item.values()[3],\
                    "transcript": item.values()[4],\
                    "protein":item.values()[5]})
df = pd.DataFrame(TEXT)
df.head()


Unnamed: 0,disease,fragment,gene,pathway,protein,transcript
0,lung cancer,Several genes involved in innate and adaptive ...,TNFAIP6,Immune System,P98066,NM_007115
1,lung cancer,Several genes involved in innate and adaptive ...,TNFAIP6,Neutrophil degranulation,P98066,NM_007115
2,lung cancer,Several genes involved in innate and adaptive ...,TNFAIP6,Innate Immune System,P98066,NM_007115
3,lung cancer,TNFAIP6 protein levels did not predict overall...,TNFAIP6,Immune System,P98066,NM_007115
4,lung cancer,TNFAIP6 protein levels did not predict overall...,TNFAIP6,Neutrophil degranulation,P98066,NM_007115


In [145]:
def disease_query(entity):
    TEXT = []
    query = "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)\
        -[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(:BodyText)-[:HAS_FRAGMENT]\
        -(f:Fragment)-[:MENTIONS]->(g:GeneSymbol)\
        WITH f,g\
        MATCH ((g:GeneSymbol)<--(gn:Gene)-->(pw:Pathway))\
        WITH f,g,gn,pw\
        MATCH ((g:GeneSymbol)<--(gn:Gene)<--(d:Disease))\
        WHERE d.name='"+entity+\
        "'WITH f,g,gn,pw,d\
        MATCH (d:Disease)-->(gn:Gene)-->(t:Transcript)-->(p:Protein)\
        RETURN f.text,gn.Symbol_from_nomenclature_authority,pw.name, d.name,t.sid,p.sid limit 100"

    with driver.session() as session:
        info = session.run(query)
        for item in info:
            TEXT.append({"fragment":item.values()[0],\
                     "gene":item.values()[1],\
                     "pathway":item.values()[2],\
                    "disease":item.values()[3],\
                    "transcript": item.values()[4],\
                    "protein":item.values()[5]})
    df = pd.DataFrame(TEXT)
    return df

In [165]:
df = disease_query('hypertension')
df.head()

Unnamed: 0,disease,fragment,gene,pathway,protein,transcript
0,hypertension,Humans encode nine members of this protease fa...,FURIN,Amyloid fiber formation,NP_001369548,NM_001382619
1,hypertension,Humans encode nine members of this protease fa...,FURIN,Extracellular matrix organization,NP_001369548,NM_001382619
2,hypertension,Humans encode nine members of this protease fa...,FURIN,Pre-NOTCH Processing in Golgi,NP_001369548,NM_001382619
3,hypertension,Humans encode nine members of this protease fa...,FURIN,Signaling by Receptor Tyrosine Kinases,NP_001369548,NM_001382619
4,hypertension,Humans encode nine members of this protease fa...,FURIN,Plasma lipoprotein remodeling,NP_001369548,NM_001382619


### Path Collection

In [81]:
TEXT = []
query = "MATCH Path = (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)\
        -[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(:BodyText)-[:HAS_FRAGMENT]\
        -(f:Fragment)-[:MENTIONS]->(g:GeneSymbol)<--(gn:Gene)<--(d:Disease)\
        RETURN Path limit 1"

with driver.session() as session:
    info = session.run(query)
    for item in info:
        print(item)

<Record Path=<Path start=<Node id=2385529 labels=frozenset({'Paper'}) properties={'cord_uid': 'ocp6yodg', 'cord19-fulltext_hash': 'b8957d48b6bcf17b7b51e004d19314ce77f653a1', 'journal': 'BMC Infect Dis', 'publish_time': '2011-12-28', 'source': 'PMC', 'title': 'Timeliness of contact tracing among flight passengers for influenza A/H1N1 2009', '_hash_id': '84b069ab23fb0ecebe6925af9c2b18ae', 'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3265549/'}> end=<Node id=3657303 labels=frozenset({'Disease'}) properties={'license': 'CC0 1.0', 'link': 'http://www.disease-ontology.org/?id=DOID:5419', 'name': 'schizophrenia', 'doid': 'DOID:5419', 'definition': '"A psychotic disorder that is characterized by a disintegration of thought processes and of emotional responsiveness." [url:http\\://en.wikipedia.org/wiki/Schizophrenia]', 'source': 'http://www.disease-ontology.org', '_id': '6cafaa31aba7dc88b1e34c75151b951d'}> size=6>>


In [97]:
query = "MATCH Path = (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)\
-[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(:BodyText)-[:HAS_FRAGMENT]-(f:Fragment)-[:MENTIONS]\
->(g:GeneSymbol)<--(gn:Gene)<--(d:Disease) RETURN Path LIMIT 1"

with driver.session() as session:
    info = session.run(query)
    for item in info:
        print(item)

<Record Path=<Path start=<Node id=2385529 labels=frozenset({'Paper'}) properties={'cord_uid': 'ocp6yodg', 'cord19-fulltext_hash': 'b8957d48b6bcf17b7b51e004d19314ce77f653a1', 'journal': 'BMC Infect Dis', 'publish_time': '2011-12-28', 'source': 'PMC', 'title': 'Timeliness of contact tracing among flight passengers for influenza A/H1N1 2009', '_hash_id': '84b069ab23fb0ecebe6925af9c2b18ae', 'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3265549/'}> end=<Node id=3657303 labels=frozenset({'Disease'}) properties={'license': 'CC0 1.0', 'link': 'http://www.disease-ontology.org/?id=DOID:5419', 'name': 'schizophrenia', 'doid': 'DOID:5419', 'definition': '"A psychotic disorder that is characterized by a disintegration of thought processes and of emotional responsiveness." [url:http\\://en.wikipedia.org/wiki/Schizophrenia]', 'source': 'http://www.disease-ontology.org', '_id': '6cafaa31aba7dc88b1e34c75151b951d'}> size=6>>
