### MeSH Based Query Search

In this notebook, we will discuss how to search documents based on the Disease described in the CVD tree.

In [1]:
import pandas as pd
import json
from neo4j import GraphDatabase
import csv

#### Authentication to access covidgraph.org graph

In [2]:
covid_browser = "https://covid.petesis.com:7473"
covid_url = "bolt://covid.petesis.com:7687"
user = "public"
password = "corona"

#driver = GraphDatabase.driver(uri, auth=(user, password))
driver = GraphDatabase.driver(uri = covid_url,\
                              auth = (user,password))

#### MeSH descriptor to its entity list
- Ex. ```C14.260.500: [syphilis, cardiovascular]```
- Pandas Dataframe is very convenient for handeling a CSV file specifically for data transformation with ```lambda``` mapping functon.

In [3]:
MeSH = pd.read_csv("../Input/mesh/cvd.csv")
MeSH = MeSH.set_index('ID')
MeSH.head()

Unnamed: 0_level_0,name
ID,Unnamed: 1_level_1
C14,Cardiovascular Diseases
C14.260,Cardiovascular Infections
C14.260.500,"Syphilis, Cardiovascular"
C14.260.750,"Tuberculosis, Cardiovascular"
C14.280,Heart Diseases


- Implementing ```lambda``` function to map one column to another column

In [4]:
MeSH['phrases'] = MeSH['name'].apply(lambda x: x.lower().strip())

In [5]:
MeSH.head()

Unnamed: 0_level_0,name,phrases
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
C14,Cardiovascular Diseases,cardiovascular diseases
C14.260,Cardiovascular Infections,cardiovascular infections
C14.260.500,"Syphilis, Cardiovascular","syphilis, cardiovascular"
C14.260.750,"Tuberculosis, Cardiovascular","tuberculosis, cardiovascular"
C14.280,Heart Diseases,heart diseases


In [6]:
MeSH['phrases'] = MeSH['phrases'].apply(lambda x:x.split(','))

In [7]:
for x in MeSH['phrases']:
    for val in x:
        val = val.strip()

In [8]:
MeSH.head()

Unnamed: 0_level_0,name,phrases
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
C14,Cardiovascular Diseases,[cardiovascular diseases]
C14.260,Cardiovascular Infections,[cardiovascular infections]
C14.260.500,"Syphilis, Cardiovascular","[syphilis, cardiovascular]"
C14.260.750,"Tuberculosis, Cardiovascular","[tuberculosis, cardiovascular]"
C14.280,Heart Diseases,[heart diseases]


In [9]:
MeSH.index[0]

'C14'

In [10]:
id2name = {}
for ID,name in zip(MeSH.index, MeSH['name']):
    id2name.update({ID:name})
    
with open("../Data/mesh/statistics/cvd_id2name.json", 'w') as wf:
    json.dump(id2name,wf)

#### MeSH to Doc Mapping
- Create a dictionary where the key is a MeSH descriptor, and the value is a list of papers (publications) that contains mention of the MeSH terms in its body text
- Each paper is represented as dictionary linking each attribute name in the paper (cord_uid, journal, title, etc.) with its actual information

##### Example of a paper node in the covid graph

In [11]:
paper_query = "MATCH (n:Paper) RETURN n LIMIT 1"
Data = []
with driver.session() as session:
    info = session.run(paper_query)
    for item in info:
        print(item)

<Record n=<Node id=2385529 labels={'Paper'} properties={'cord_uid': 'ocp6yodg', 'cord19-fulltext_hash': 'b8957d48b6bcf17b7b51e004d19314ce77f653a1', 'journal': 'BMC Infect Dis', 'publish_time': '2011-12-28', 'source': 'PMC', 'title': 'Timeliness of contact tracing among flight passengers for influenza A/H1N1 2009', '_hash_id': '84b069ab23fb0ecebe6925af9c2b18ae', 'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3265549/'}>>


#### Writes data to file

In [13]:
MeSH_to_result = {}
MeSH_number = {}
for desc, entities in zip(MeSH.index, MeSH['phrases']):
    #Builds each part of the query based on the MeSH descriptor entity list
    query = "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)-" \
                                        "[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(a:BodyText) WHERE ("
    for i in range(len(entities)):
        if i == len(entities)-1:
            query += "LOWER(a.text) CONTAINS '" + entities[i] + "') RETURN DISTINCT p"
        else:
            query += "LOWER(a.text) CONTAINS '" + entities[i] + "' AND "

    MeSH_result = []
        
    with driver.session() as session:
        info = session.run(query)
        for item in info:
            try:
                node_keys = list((item.values(0)[0]).keys())
                node_values = list((item.values(0)[0]).values())
                paper = {}
                for i in range(len(node_keys)):
                    if (node_keys[i] == 'cord_uid'):
                        MeSH_result.append(node_values[i])
            except:
                continue
    
    try:
        name = id2name[desc]
        MeSH_to_result[name] = MeSH_result
        MeSH_number[desc] = len(MeSH_result)
    except:
        continue

In [14]:
ff = open("../Data/mesh/cvd_paper_ids.json", 'w')

json.dump(MeSH_to_result, ff)

ff.close()

In [15]:
MeSH_number.items()

dict_items([('C14', 133), ('C14.260', 0), ('C14.260.500', 1), ('C14.260.750', 12), ('C14.280', 35), ('C14.280.104', 0), ('C14.280.123', 0), ('C14.280.123.750', 0), ('C14.280.123.750.560', 0), ('C14.280.123.750.770', 0), ('C14.280.155', 12), ('C14.280.238', 14), ('C14.280.238.070', 39), ('C14.280.238.160', 7), ('C14.280.238.190', 0), ('C14.280.238.281', 1), ('C14.280.238.406', 2), ('C14.280.238.625', 220), ('C14.280.282', 71), ('C14.280.282.407', 17), ('C14.280.282.407.407', 4), ('C14.280.282.703', 1), ('C14.280.383', 0), ('C14.280.383.610', 14), ('C14.280.434', 277), ('C14.280.434.156', 1), ('C14.280.434.611', 22), ('C14.280.434.676', 34), ('C14.280.459', 0), ('C14.280.470', 0), ('C14.280.470.475', 0), ('C14.280.470.475.900', 1), ('C14.280.484', 0), ('C14.280.484.095', 2), ('C14.280.484.150.070.160', 25), ('C14.280.484.400', 0), ('C14.280.484.400.100', 0), ('C14.280.484.400.500', 4), ('C14.280.484.400.875', 0), ('C14.280.484.461', 2), ('C14.280.484.517', 2), ('C14.280.484.660', 1), ('C