### MeSH Based Query Search

In this notebook, we will discuss how to search for documents based on the diseases described in the CVD tree.

In [1]:
import csv
import json
import os

import pandas as pd
from neo4j import GraphDatabase

#### Authentication to access covidgraph.org graph

In [2]:
# Connection settings for the covid graph database.
# Each value can be overridden through an environment variable so that other
# endpoints/credentials never have to be written into the notebook; the
# defaults are the values this notebook has always used.
covid_browser = "https://covid.petesis.com:7473"  # browser URL, kept for reference only
covid_url = os.environ.get("COVIDGRAPH_BOLT_URL", "bolt://covid.petesis.com:7687")
user = os.environ.get("COVIDGRAPH_USER", "public")
password = os.environ.get("COVIDGRAPH_PASSWORD", "corona")

# A single driver object is shared by every query cell below.
driver = GraphDatabase.driver(uri=covid_url, auth=(user, password))

#### MeSH descriptor to its entity list
- Ex. ```C14.260.500: [syphilis, cardiovascular]```
- A pandas DataFrame is very convenient for handling a CSV file, especially for data transformations with ```lambda``` mapping functions.

In [3]:
# Read the descriptor table and use its 'ID' column as the row index.
MeSH = pd.read_csv("../Input/mesh/cardiac_arrhythmia.csv").set_index('ID')
MeSH.head()

Unnamed: 0_level_0,name
ID,Unnamed: 1_level_1
C14.280.067,"Arrhythmias, Cardiac"
C14.280.067.093,"Arrhythmia, Sinus"
C14.280.067.093.249,Sick Sinus Syndrome
C14.280.067.093.500,"Sinus Arrest, Cardiac"
C14.280.067.198,Atrial Fibrillation


- Implementing ```lambda``` function to map one column to another column

In [4]:
# New 'phrases' column: the descriptor name in lower case with surrounding
# whitespace removed.
MeSH['phrases'] = MeSH['name'].map(lambda raw_name: raw_name.lower().strip())

In [5]:
# Show the first five rows, now including the 'phrases' column.
MeSH.head(5)

Unnamed: 0_level_0,name,phrases
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
C14.280.067,"Arrhythmias, Cardiac","arrhythmias, cardiac"
C14.280.067.093,"Arrhythmia, Sinus","arrhythmia, sinus"
C14.280.067.093.249,Sick Sinus Syndrome,sick sinus syndrome
C14.280.067.093.500,"Sinus Arrest, Cardiac","sinus arrest, cardiac"
C14.280.067.198,Atrial Fibrillation,atrial fibrillation


In [6]:
# Turn every descriptor name into the list of its comma-separated entities.
# - Built from 'name' rather than from 'phrases' so the cell can be re-run
#   safely (calling .split on an already-split list would raise).
# - Each piece is stripped: a plain split(',') turns "arrhythmias, cardiac"
#   into ['arrhythmias', ' cardiac'], and that leading space would otherwise
#   be carried into the CONTAINS match of the search query further down.
MeSH['phrases'] = MeSH['name'].apply(
    lambda x: [part.strip() for part in x.lower().split(',')]
)

In [7]:
# Show the first five rows; 'phrases' now holds a list per descriptor.
MeSH.head(5)

Unnamed: 0_level_0,name,phrases
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
C14.280.067,"Arrhythmias, Cardiac","[arrhythmias, cardiac]"
C14.280.067.093,"Arrhythmia, Sinus","[arrhythmia, sinus]"
C14.280.067.093.249,Sick Sinus Syndrome,[sick sinus syndrome]
C14.280.067.093.500,"Sinus Arrest, Cardiac","[sinus arrest, cardiac]"
C14.280.067.198,Atrial Fibrillation,[atrial fibrillation]


In [8]:
# Peek at the first index label; the index was set to the 'ID' column above.
MeSH.index[0]

'C14.280.067'

In [9]:
# Lookup table from descriptor ID (the DataFrame index) to descriptor name.
id2name = dict(zip(MeSH.index, MeSH['name']))

# Persist the lookup table as JSON.
with open("../Data/mesh/statistics/cardiacarrhythmia_id2name.json", 'w') as wf:
    json.dump(id2name, wf)

#### MeSH to Doc Mapping
- Create a dictionary where the key is a MeSH descriptor, and the value is a list of papers (publications) that contain mentions of the MeSH terms in their body text
- Each paper is represented as a dictionary linking each attribute name in the paper (cord_uid, journal, title, etc.) with its actual information

##### Example of a paper node in the covid graph

In [10]:
# Fetch one Paper node and print the raw record so the available properties
# of a paper are visible.
paper_query = "MATCH (n:Paper) RETURN n LIMIT 1"
Data = []  # not used within this cell
with driver.session() as session:
    for record in session.run(paper_query):
        print(record)

<Record n=<Node id=2385529 labels={'Paper'} properties={'cord_uid': 'ocp6yodg', 'cord19-fulltext_hash': 'b8957d48b6bcf17b7b51e004d19314ce77f653a1', 'journal': 'BMC Infect Dis', 'publish_time': '2011-12-28', 'source': 'PMC', 'title': 'Timeliness of contact tracing among flight passengers for influenza A/H1N1 2009', '_hash_id': '84b069ab23fb0ecebe6925af9c2b18ae', 'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3265549/'}>>


#### Run the search for every MeSH descriptor and write the results to file

In [11]:
def _build_paper_query(entities):
    """Build the Cypher query that finds papers mentioning every entity.

    A paper matches when a single BodyText node contains all of the given
    entities (compared against the lower-cased text).

    Parameters
    ----------
    entities : list of str
        Search terms of one MeSH descriptor.

    Returns
    -------
    (str, dict)
        The query text and the parameter mapping to pass to ``session.run``.

    Raises
    ------
    ValueError
        If ``entities`` is empty, since no WHERE clause can be built.
    """
    if len(entities) == 0:
        raise ValueError("at least one entity is required to build the query")
    # The terms are sent as query parameters instead of being pasted into the
    # query text, so a quote character inside a term cannot break the query.
    params = {"term{}".format(i): entity for i, entity in enumerate(entities)}
    conditions = " AND ".join("LOWER(a.text) CONTAINS $" + key for key in params)
    query = ("MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)-"
             "[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(a:BodyText) "
             "WHERE (" + conditions + ") RETURN DISTINCT p")
    return query, params


# MeSH_to_result: descriptor name -> list of cord_uid values of matching papers
# MeSH_number:    descriptor ID   -> number of matching papers
MeSH_to_result = {}
MeSH_number = {}

# One session is reused for all descriptors instead of opening one per query.
with driver.session() as session:
    for desc, entities in zip(MeSH.index, MeSH['phrases']):
        query, params = _build_paper_query(entities)

        MeSH_result = []
        for record in session.run(query, params):
            # The query returns the paper node as 'p'; only its cord_uid is kept.
            cord_uid = record["p"].get("cord_uid")
            if cord_uid is not None:
                MeSH_result.append(cord_uid)

        # Descriptors without a known name are skipped, as before.
        if desc not in id2name:
            continue
        name = id2name[desc]
        MeSH_to_result[name] = MeSH_result
        MeSH_number[desc] = len(MeSH_result)

In [12]:
# Save the descriptor-name -> cord_uid-list mapping as JSON.
# The context manager closes the file even if json.dump raises.
with open("../Data/mesh/cardiac_arrhythmia_paper_ids.json", 'w') as ff:
    json.dump(MeSH_to_result, ff)

In [13]:
# Show the number of matching papers recorded for each descriptor ID.
MeSH_number.items()

dict_items([('C14.280.067', 51), ('C14.280.067.093', 15), ('C14.280.067.093.249', 5), ('C14.280.067.093.500', 1), ('C14.280.067.198', 55), ('C14.280.067.248', 7), ('C14.280.067.319', 53), ('C14.280.067.322', 3), ('C14.280.067.325', 0), ('C14.280.067.325.250', 0), ('C14.280.067.325.500', 2), ('C14.280.067.441', 1), ('C14.280.067.558', 4), ('C14.280.067.558.137', 0), ('C14.280.067.558.230', 8), ('C14.280.067.558.323', 1), ('C14.280.067.558.430', 0), ('C14.280.067.558.536', 5), ('C14.280.067.558.750', 0), ('C14.280.067.565', 2), ('C14.280.067.565.070', 0), ('C14.280.067.565.440', 0), ('C14.280.067.565.720', 0), ('C14.280.067.672', 0), ('C14.280.067.780', 0), ('C14.280.067.780.560', 0), ('C14.280.067.780.770', 0), ('C14.280.067.780.977', 1), ('C14.280.067.845', 118), ('C14.280.067.845.695', 6), ('C14.280.067.845.787', 0), ('C14.280.067.845.787.249', 0), ('C14.280.067.845.787.500', 0), ('C14.280.067.845.880', 10), ('C14.280.067.845.880.315', 0), ('C14.280.067.845.880.320', 0), ('C14.280.067