### COVID-19 Graph CVD Protein and Gene Analysis

This notebook searches the publications in the COVID-19 graph for mentions of proteins (and their associated genes) that are linked to cardiovascular diseases. This specific notebook focuses on a small subset of the larger list of 8000+ proteins.

#### Authentication to access covidgraph.org graph

In [1]:
import pandas as pd
import json
from neo4j import GraphDatabase
# from neo4j import APOC

In [2]:
# Endpoints and login used to reach the COVID graph database.
# covid_browser is not referenced by any other cell visible in this notebook section.
covid_browser = "https://covid.petesis.com:7473"
covid_url = "bolt://covid.petesis.com:7687"
user = "public"
password = "corona"

# Bolt driver shared by every query cell below.
driver = GraphDatabase.driver(uri=covid_url, auth=(user, password))

### Each group of queries below focuses on one protein, identified by the protein ID and associated gene(s) in the heading above it
- For each protein, a list of all its associated names is created
- In a loop, each name is searched for in the paper body text, and every matching paper is stored as a dictionary with 5 main publication attributes (journal, publish time, source, title, and url)
- The papers found for a name are wrapped in a single-entry dictionary that maps the name to its papers, and this dictionary is appended to a per-protein list
- This list is then written to a ```json``` file named with a short gene/protein label (e.g. ```Gene-TNNT2.json```)

#### Protein ID: P45379, Gene: TNNT2

In [3]:
# Sanity check: fetch one paper whose body text mentions 'troponin t'
# and print the raw record to see which properties a Paper node carries.
query = "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)\
                                -[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(a:BodyText) \
                                WHERE (LOWER(a.text) CONTAINS 'troponin t') \
                                    return p LIMIT 1"
with driver.session() as session:
    for record in session.run(query):
        print(record)

<Record p=<Node id=52396 labels={'Paper'} properties={'cord_uid': 'txk5z8ub', 'cord19-fulltext_hash': 'dc91750f913c083da70bb7dd1816c0418eaf346d', 'journal': 'Int J Emerg Med', 'publish_time': '2009-04-08', 'source': 'PMC', 'title': 'Analysis of trends in emergency department attendances, hospital admissions and medical staffing in a Hong Kong university hospital: 5-year study', '_hash_id': '4131343e79b0683fbad4ab4e1a77e0d0', 'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2760706/'}>>


In [4]:
# Lower-case search terms for TNNT2; each is matched as a substring of lower-cased body text.
# NOTE(review): 'cardiac muscle' is a generic phrase and will match far more than this protein — confirm intended.
entities_tnnt2 = [
    "troponin t",
    "cardiac muscle",
    "tntc",
    "ctnt",
]

In [5]:
# Collect, for every TNNT2 search term, the distinct papers whose body text contains it.
# The term is bound as the $entity query parameter instead of being spliced into the
# Cypher text, so a term containing a quote cannot break or alter the query.
query = (
    "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)"
    "-[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(a:BodyText) "
    "WHERE LOWER(a.text) CONTAINS $entity "
    "RETURN DISTINCT p.journal, p.publish_time, p.source, p.title, p.url"
)
# Output keys, in the same order as the RETURN columns above.
paper_fields = ("journal", "publish_time", "source", "title", "url")

result_tnnt2 = []
for entity in entities_tnnt2:
    with driver.session() as session:
        entity_result = [
            dict(zip(paper_fields, record.values()))
            for record in session.run(query, {"entity": entity})
        ]
    # One single-key dict per search term: {term: [paper, ...]}.
    result_tnnt2.append({entity: entity_result})

In [7]:
# Persist the per-term paper lists gathered in result_tnnt2 as one JSON document.
with open("output/Data_Protein_Gene/Papers/Gene-TNNT2.json", mode='w') as json_out:
    json.dump(result_tnnt2, fp=json_out)

#### Protein ID: P19429, Gene: TNNI3 and TNNC1

In [8]:
# Lower-case search terms for TNNI3/TNNC1; each is matched as a substring of lower-cased body text.
# NOTE(review): 'cardiac muscle' is a generic phrase and will match far more than this protein — confirm intended.
entities_tnn = [
    "troponin i",
    "cardiac muscle",
]

In [9]:
# Collect, for every TNNI3/TNNC1 search term, the distinct papers whose body text contains it.
# The term is bound as the $entity query parameter instead of being spliced into the
# Cypher text, so a term containing a quote cannot break or alter the query.
query = (
    "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)"
    "-[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(a:BodyText) "
    "WHERE LOWER(a.text) CONTAINS $entity "
    "RETURN DISTINCT p.journal, p.publish_time, p.source, p.title, p.url"
)
# Output keys, in the same order as the RETURN columns above.
paper_fields = ("journal", "publish_time", "source", "title", "url")

result_tnn = []
for entity in entities_tnn:
    with driver.session() as session:
        entity_result = [
            dict(zip(paper_fields, record.values()))
            for record in session.run(query, {"entity": entity})
        ]
    # One single-key dict per search term: {term: [paper, ...]}.
    result_tnn.append({entity: entity_result})

In [10]:
# Persist the per-term paper lists gathered in result_tnn as one JSON document.
with open("output/Data_Protein_Gene/Papers/Gene-TNN.json", mode='w') as json_out:
    json.dump(result_tnn, fp=json_out)

#### Protein ID: P07550, Gene: ADRB2 and ADRB2R and B2AR

In [11]:
# Lower-case search terms for ADRB2; each is matched as a substring of lower-cased body text.
entities_adrb2 = [
    "beta-2 adrenergic receptor",
    "beta-2 adrenoreceptor",
    "beta-2 adrenoceptor",
]

In [14]:
# Collect, for every ADRB2 search term, the distinct papers whose body text contains it.
# The term is bound as the $entity query parameter instead of being spliced into the
# Cypher text, so a term containing a quote cannot break or alter the query.
query = (
    "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)"
    "-[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(a:BodyText) "
    "WHERE LOWER(a.text) CONTAINS $entity "
    "RETURN DISTINCT p.journal, p.publish_time, p.source, p.title, p.url"
)
# Output keys, in the same order as the RETURN columns above.
paper_fields = ("journal", "publish_time", "source", "title", "url")

result_adrb2 = []
for entity in entities_adrb2:
    with driver.session() as session:
        entity_result = [
            dict(zip(paper_fields, record.values()))
            for record in session.run(query, {"entity": entity})
        ]
    # One single-key dict per search term: {term: [paper, ...]}.
    result_adrb2.append({entity: entity_result})

In [15]:
# Persist the per-term paper lists gathered in result_adrb2 as one JSON document.
with open("output/Data_Protein_Gene/Papers/Gene-ADRB2.json", mode='w') as json_out:
    json.dump(result_adrb2, fp=json_out)

#### Protein ID: P08588, Gene: ADRB1 and ADRB1R and B1AR

In [16]:
# Lower-case search terms for ADRB1; each is matched as a substring of lower-cased body text.
entities_adrb1 = [
    "beta-1 adrenergic receptor",
    "beta-1 adrenoreceptor",
    "beta-1 adrenoceptor",
]

In [17]:
# Collect, for every ADRB1 search term, the distinct papers whose body text contains it.
# The term is bound as the $entity query parameter instead of being spliced into the
# Cypher text, so a term containing a quote cannot break or alter the query.
query = (
    "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)"
    "-[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(a:BodyText) "
    "WHERE LOWER(a.text) CONTAINS $entity "
    "RETURN DISTINCT p.journal, p.publish_time, p.source, p.title, p.url"
)
# Output keys, in the same order as the RETURN columns above.
paper_fields = ("journal", "publish_time", "source", "title", "url")

result_adrb1 = []
for entity in entities_adrb1:
    with driver.session() as session:
        entity_result = [
            dict(zip(paper_fields, record.values()))
            for record in session.run(query, {"entity": entity})
        ]
    # One single-key dict per search term: {term: [paper, ...]}.
    result_adrb1.append({entity: entity_result})

In [18]:
# Persist the per-term paper lists gathered in result_adrb1 as one JSON document.
with open("output/Data_Protein_Gene/Papers/Gene-ADRB1.json", mode='w') as json_out:
    json.dump(result_adrb1, fp=json_out)

#### Protein ID: P01130, Gene: LDLR

In [19]:
# Lower-case search terms for LDLR; each is matched as a substring of lower-cased body text.
entities_ldlr = [
    "low-density lipoprotein receptor",
    "ldl receptor",
]

In [20]:
# Collect, for every LDLR search term, the distinct papers whose body text contains it.
# The term is bound as the $entity query parameter instead of being spliced into the
# Cypher text, so a term containing a quote cannot break or alter the query.
query = (
    "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)"
    "-[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(a:BodyText) "
    "WHERE LOWER(a.text) CONTAINS $entity "
    "RETURN DISTINCT p.journal, p.publish_time, p.source, p.title, p.url"
)
# Output keys, in the same order as the RETURN columns above.
paper_fields = ("journal", "publish_time", "source", "title", "url")

result_ldlr = []
for entity in entities_ldlr:
    with driver.session() as session:
        entity_result = [
            dict(zip(paper_fields, record.values()))
            for record in session.run(query, {"entity": entity})
        ]
    # One single-key dict per search term: {term: [paper, ...]}.
    result_ldlr.append({entity: entity_result})

In [21]:
# Persist the per-term paper lists gathered in result_ldlr as one JSON document.
with open("output/Data_Protein_Gene/Papers/Gene-LDLR.json", mode='w') as json_out:
    json.dump(result_ldlr, fp=json_out)

#### Protein ID: Q8WZ42, Gene: TTN

In [22]:
# Lower-case search terms for TTN; each is matched as a substring of lower-cased body text.
entities_ttn = [
    "titin",
    "connectin",
    "ec 2.7.11.1",
    "rhabdomyosarcoma antigen mu-rms-40.14",
]

In [23]:
# Collect, for every TTN search term, the distinct papers whose body text contains it.
# The term is bound as the $entity query parameter instead of being spliced into the
# Cypher text, so a term containing a quote cannot break or alter the query.
query = (
    "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)"
    "-[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(a:BodyText) "
    "WHERE LOWER(a.text) CONTAINS $entity "
    "RETURN DISTINCT p.journal, p.publish_time, p.source, p.title, p.url"
)
# Output keys, in the same order as the RETURN columns above.
paper_fields = ("journal", "publish_time", "source", "title", "url")

result_ttn = []
for entity in entities_ttn:
    with driver.session() as session:
        entity_result = [
            dict(zip(paper_fields, record.values()))
            for record in session.run(query, {"entity": entity})
        ]
    # One single-key dict per search term: {term: [paper, ...]}.
    result_ttn.append({entity: entity_result})

In [24]:
# Persist the per-term paper lists gathered in result_ttn as one JSON document.
with open("output/Data_Protein_Gene/Papers/Gene-TTN.json", mode='w') as json_out:
    json.dump(result_ttn, fp=json_out)

#### Protein ID: Q9BYF1, Gene: ACE2 and UNQ868/PRO1885

In [25]:
# Lower-case search terms for ACE2; each is matched as a substring of lower-cased body text.
# One term per line with trailing commas: a missing comma previously fused
# 'ace-related carboxypeptidase' and 'angiotensin-converting enzyme homolog' into one bogus term.
entities_ace2 = [
    'angiotensin-converting enzyme 2',
    'ec 3.4.17.23',
    'ace-related carboxypeptidase',
    'angiotensin-converting enzyme homolog',
    'aceh',
    'metalloprotease mprot15',
    'processed angiotensin-converting enzyme 2',
]

In [26]:
# Collect, for every ACE2 search term, the distinct papers whose body text contains it.
# The term is bound as the $entity query parameter instead of being spliced into the
# Cypher text, so a term containing a quote cannot break or alter the query.
query = (
    "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)"
    "-[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(a:BodyText) "
    "WHERE LOWER(a.text) CONTAINS $entity "
    "RETURN DISTINCT p.journal, p.publish_time, p.source, p.title, p.url"
)
# Output keys, in the same order as the RETURN columns above.
paper_fields = ("journal", "publish_time", "source", "title", "url")

result_ace2 = []
for entity in entities_ace2:
    with driver.session() as session:
        entity_result = [
            dict(zip(paper_fields, record.values()))
            for record in session.run(query, {"entity": entity})
        ]
    # One single-key dict per search term: {term: [paper, ...]}.
    result_ace2.append({entity: entity_result})

In [27]:
# Persist the per-term paper lists gathered in result_ace2 as one JSON document.
with open("output/Data_Protein_Gene/Papers/Gene-ACE2.json", mode='w') as json_out:
    json.dump(result_ace2, fp=json_out)

#### Protein ID: P01019, Gene: AGT and SERPINA8

In [28]:
# Lower-case search terms for AGT/SERPINA8; each is matched as a substring of lower-cased body text.
# NOTE(review): 'angiotensin' is a substring of 'angiotensinogen', so its matches include
# every match of that term — confirm the overlap is intended.
entities_angt = [
    "angiotensinogen",
    "serpin a8",
    "angiotensin",
]

In [29]:
# Collect, for every AGT/SERPINA8 search term, the distinct papers whose body text contains it.
# The term is bound as the $entity query parameter instead of being spliced into the
# Cypher text, so a term containing a quote cannot break or alter the query.
query = (
    "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)"
    "-[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(a:BodyText) "
    "WHERE LOWER(a.text) CONTAINS $entity "
    "RETURN DISTINCT p.journal, p.publish_time, p.source, p.title, p.url"
)
# Output keys, in the same order as the RETURN columns above.
paper_fields = ("journal", "publish_time", "source", "title", "url")

result_angt = []
for entity in entities_angt:
    with driver.session() as session:
        entity_result = [
            dict(zip(paper_fields, record.values()))
            for record in session.run(query, {"entity": entity})
        ]
    # One single-key dict per search term: {term: [paper, ...]}.
    result_angt.append({entity: entity_result})

In [30]:
# Persist the per-term paper lists gathered in result_angt as one JSON document.
with open("output/Data_Protein_Gene/Papers/Gene-ANGT.json", mode='w') as json_out:
    json.dump(result_angt, fp=json_out)

#### Protein ID: P30556, Gene: AGTR1 and AGTR1A and AGTR1B and AT2R1 and AT2R1B

In [33]:
# Lower-case search terms for AGTR1; each is matched as a substring of lower-cased body text.
# NOTE(review): 'at1' is a substring of 'at1ar' and 'at1br', so its matches include
# every match of those terms — confirm the overlap is intended.
entities_agt = [
    "type-1 angiotensin ii receptor",
    "at1ar",
    "at1br",
    "at1",
    "angiotensin ii type-1 receptor",
]

In [34]:
# Collect, for every AGTR1 search term, the distinct papers whose body text contains it.
# The term is bound as the $entity query parameter instead of being spliced into the
# Cypher text, so a term containing a quote cannot break or alter the query.
query = (
    "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)"
    "-[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(a:BodyText) "
    "WHERE LOWER(a.text) CONTAINS $entity "
    "RETURN DISTINCT p.journal, p.publish_time, p.source, p.title, p.url"
)
# Output keys, in the same order as the RETURN columns above.
paper_fields = ("journal", "publish_time", "source", "title", "url")

result_agt = []
for entity in entities_agt:
    with driver.session() as session:
        entity_result = [
            dict(zip(paper_fields, record.values()))
            for record in session.run(query, {"entity": entity})
        ]
    # One single-key dict per search term: {term: [paper, ...]}.
    result_agt.append({entity: entity_result})

In [35]:
# Persist the per-term paper lists gathered in result_agt as one JSON document.
with open("output/Data_Protein_Gene/Papers/Gene-AGT.json", mode='w') as json_out:
    json.dump(result_agt, fp=json_out)

#### Protein ID: P01375, Gene: TNF and TNFA and TNFSF2

In [36]:
# Lower-case search terms for TNF; each is matched as a substring of lower-cased body text.
# NOTE(review): 'ntf', 'intracellular domain' and 'c-domain' are short/generic and will
# match text unrelated to this protein — confirm intended.
entities_tnf = [
    "tumor necrosis factor",
    "cachectin",
    "tnf-alpha",
    "ntf",
    "intracellular domain",
    "c-domain",
]

In [37]:
# Collect, for every TNF search term, the distinct papers whose body text contains it.
# The term is bound as the $entity query parameter instead of being spliced into the
# Cypher text, so a term containing a quote cannot break or alter the query.
query = (
    "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)"
    "-[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(a:BodyText) "
    "WHERE LOWER(a.text) CONTAINS $entity "
    "RETURN DISTINCT p.journal, p.publish_time, p.source, p.title, p.url"
)
# Output keys, in the same order as the RETURN columns above.
paper_fields = ("journal", "publish_time", "source", "title", "url")

result_tnf = []
for entity in entities_tnf:
    with driver.session() as session:
        entity_result = [
            dict(zip(paper_fields, record.values()))
            for record in session.run(query, {"entity": entity})
        ]
    # One single-key dict per search term: {term: [paper, ...]}.
    result_tnf.append({entity: entity_result})

In [38]:
# Persist the per-term paper lists gathered in result_tnf as one JSON document.
with open("output/Data_Protein_Gene/Papers/Gene-TNF.json", mode='w') as json_out:
    json.dump(result_tnf, fp=json_out)

#### Protein ID: P05231, Gene: IL6 and IFNB2

In [39]:
# Lower-case search terms for IL6; each is matched as a substring of lower-cased body text.
# One term per line with trailing commas: a missing comma previously fused
# 'cdf' and 'hybridoma growth factor' into the bogus term 'cdfhybridoma growth factor'.
entities_il = [
    'interleukin-6',
    'il-6',
    'b-cell stimulatory factor 2',
    'bsf-2',
    'ctl differentiation factor',
    'cdf',
    'hybridoma growth factor',
    'interferon beta-2',
    'ifn-beta-2',
]

In [40]:
# Collect, for every IL6 search term, the distinct papers whose body text contains it.
# The term is bound as the $entity query parameter instead of being spliced into the
# Cypher text, so a term containing a quote cannot break or alter the query.
query = (
    "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)"
    "-[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(a:BodyText) "
    "WHERE LOWER(a.text) CONTAINS $entity "
    "RETURN DISTINCT p.journal, p.publish_time, p.source, p.title, p.url"
)
# Output keys, in the same order as the RETURN columns above.
paper_fields = ("journal", "publish_time", "source", "title", "url")

result_il = []
for entity in entities_il:
    with driver.session() as session:
        entity_result = [
            dict(zip(paper_fields, record.values()))
            for record in session.run(query, {"entity": entity})
        ]
    # One single-key dict per search term: {term: [paper, ...]}.
    result_il.append({entity: entity_result})

In [41]:
# Persist the per-term paper lists gathered in result_il as one JSON document.
with open("output/Data_Protein_Gene/Papers/Gene-IL.json", mode='w') as json_out:
    json.dump(result_il, fp=json_out)

#### Protein ID: P12821, Gene: ACE and DCP and DCP1

In [42]:
# Lower-case search terms for ACE; each is matched as a substring of lower-cased body text.
# NOTE(review): 'ace' matches any text containing those three letters, including inside
# longer words — confirm such a broad term is intended.
entities_ace = [
    "kininase ii",
    "ec 3.2.1.-",
    "ec 3.4.15.1",
    "angiotensin-converting enzyme",
    "ace",
    "dipeptidyl carboxypeptidase i",
    "cd antigen cd143",
]

In [43]:
# Collect, for every ACE search term, the distinct papers whose body text contains it.
# The term is bound as the $entity query parameter instead of being spliced into the
# Cypher text, so a term containing a quote cannot break or alter the query.
query = (
    "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)"
    "-[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(a:BodyText) "
    "WHERE LOWER(a.text) CONTAINS $entity "
    "RETURN DISTINCT p.journal, p.publish_time, p.source, p.title, p.url"
)
# Output keys, in the same order as the RETURN columns above.
paper_fields = ("journal", "publish_time", "source", "title", "url")

result_ace = []
for entity in entities_ace:
    with driver.session() as session:
        entity_result = [
            dict(zip(paper_fields, record.values()))
            for record in session.run(query, {"entity": entity})
        ]
    # One single-key dict per search term: {term: [paper, ...]}.
    result_ace.append({entity: entity_result})

In [44]:
# Persist the per-term paper lists gathered in result_ace as one JSON document.
with open("output/Data_Protein_Gene/Papers/Gene-ACE.json", mode='w') as json_out:
    json.dump(result_ace, fp=json_out)

#### Protein ID: P00797, Gene: REN

In [45]:
# Lower-case search terms for REN; each is matched as a substring of lower-cased body text.
entities_ren = [
    "renin",
    "ec 3.4.23.15",
    "angiotensinogenase",
]

In [46]:
# Collect, for every REN search term, the distinct papers whose body text contains it.
# The term is bound as the $entity query parameter instead of being spliced into the
# Cypher text, so a term containing a quote cannot break or alter the query.
query = (
    "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)"
    "-[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(a:BodyText) "
    "WHERE LOWER(a.text) CONTAINS $entity "
    "RETURN DISTINCT p.journal, p.publish_time, p.source, p.title, p.url"
)
# Output keys, in the same order as the RETURN columns above.
paper_fields = ("journal", "publish_time", "source", "title", "url")

result_ren = []
for entity in entities_ren:
    with driver.session() as session:
        entity_result = [
            dict(zip(paper_fields, record.values()))
            for record in session.run(query, {"entity": entity})
        ]
    # One single-key dict per search term: {term: [paper, ...]}.
    result_ren.append({entity: entity_result})

In [47]:
# Persist the per-term paper lists gathered in result_ren as one JSON document.
with open("output/Data_Protein_Gene/Papers/Gene-REN.json", mode='w') as json_out:
    json.dump(result_ren, fp=json_out)

#### Protein ID: Q06481, Gene: APLP2 and APPL2

In [48]:
# Lower-case search terms for APLP2; each is matched as a substring of lower-cased body text.
entities_apl = [
    "amyloid-like protein 2",
    "aplp-2",
    "apph",
    "amyloid protein homolog",
    "cdei box-binding protein",
    "cdebp",
]

In [49]:
# Collect, for every APLP2 search term, the distinct papers whose body text contains it.
# The term is bound as the $entity query parameter instead of being spliced into the
# Cypher text, so a term containing a quote cannot break or alter the query.
query = (
    "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)"
    "-[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(a:BodyText) "
    "WHERE LOWER(a.text) CONTAINS $entity "
    "RETURN DISTINCT p.journal, p.publish_time, p.source, p.title, p.url"
)
# Output keys, in the same order as the RETURN columns above.
paper_fields = ("journal", "publish_time", "source", "title", "url")

result_apl = []
for entity in entities_apl:
    with driver.session() as session:
        entity_result = [
            dict(zip(paper_fields, record.values()))
            for record in session.run(query, {"entity": entity})
        ]
    # One single-key dict per search term: {term: [paper, ...]}.
    result_apl.append({entity: entity_result})

In [50]:
# Persist the per-term paper lists gathered in result_apl as one JSON document.
with open("output/Data_Protein_Gene/Papers/Gene-APL.json", mode='w') as json_out:
    json.dump(result_apl, fp=json_out)

#### Protein ID: P12277, Gene: CKB and CKBB

In [51]:
# Lower-case search terms for CKB; each is matched as a substring of lower-cased body text.
entities_kcrb = [
    "creatine kinase b-type",
    "b-ck",
    "creatine kinase b chain",
    "ec 2.7.3.2",
]

In [52]:
# Collect, for every CKB search term, the distinct papers whose body text contains it.
# The term is bound as the $entity query parameter instead of being spliced into the
# Cypher text, so a term containing a quote cannot break or alter the query.
query = (
    "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)"
    "-[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(a:BodyText) "
    "WHERE LOWER(a.text) CONTAINS $entity "
    "RETURN DISTINCT p.journal, p.publish_time, p.source, p.title, p.url"
)
# Output keys, in the same order as the RETURN columns above.
paper_fields = ("journal", "publish_time", "source", "title", "url")

result_kcrb = []
for entity in entities_kcrb:
    with driver.session() as session:
        entity_result = [
            dict(zip(paper_fields, record.values()))
            for record in session.run(query, {"entity": entity})
        ]
    # One single-key dict per search term: {term: [paper, ...]}.
    result_kcrb.append({entity: entity_result})

In [53]:
# Persist the per-term paper lists gathered in result_kcrb as one JSON document.
with open("output/Data_Protein_Gene/Papers/Gene-KCRB.json", mode='w') as json_out:
    json.dump(result_kcrb, fp=json_out)

#### Protein ID: P17540, Gene: CKMT2

In [54]:
# Lower-case search terms for CKMT2; each is matched as a substring of lower-cased body text.
entities_kcrs = [
    "creatine kinase s-type",
    "mib-ck",
    "basic-type mitochondrial creatine kinase",
    "ec 2.7.3.2",
    "sarcomeric mitochondrial creatine kinase",
    "s-mtck",
]

In [55]:
# Collect, for every CKMT2 search term, the distinct papers whose body text contains it.
# The term is bound as the $entity query parameter instead of being spliced into the
# Cypher text, so a term containing a quote cannot break or alter the query.
query = (
    "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)"
    "-[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(a:BodyText) "
    "WHERE LOWER(a.text) CONTAINS $entity "
    "RETURN DISTINCT p.journal, p.publish_time, p.source, p.title, p.url"
)
# Output keys, in the same order as the RETURN columns above.
paper_fields = ("journal", "publish_time", "source", "title", "url")

result_kcrs = []
for entity in entities_kcrs:
    with driver.session() as session:
        entity_result = [
            dict(zip(paper_fields, record.values()))
            for record in session.run(query, {"entity": entity})
        ]
    # One single-key dict per search term: {term: [paper, ...]}.
    result_kcrs.append({entity: entity_result})

In [56]:
# Persist the per-term paper lists gathered in result_kcrs as one JSON document.
with open("output/Data_Protein_Gene/Papers/Gene-KCRS.json", mode='w') as json_out:
    json.dump(result_kcrs, fp=json_out)