### MeSH Based Query Search

In this notebook, we will discuss how to search documents based on the comorbidities of coronavirus.

In [1]:
import pandas as pd
import json
from neo4j import GraphDatabase
import csv

#### Authentication to access covidgraph.org graph

In [2]:
# Connection settings for the graph database: browser URL, bolt endpoint and login.
covid_browser = "https://covid.petesis.com:7473"
covid_url = "bolt://covid.petesis.com:7687"
user = "public"
password = "corona"

# A single driver is created here; later cells open their sessions from it.
driver = GraphDatabase.driver(uri=covid_url, auth=(user, password))

#### MeSH descriptor to its entity list
- Ex. ```C01.925.782.600.550.200.360: [feline infectious peritonitis]```
- Pandas DataFrame is very convenient for handling a CSV file, specifically for data transformation with a ```lambda``` mapping function.

In [3]:
# Read the comorbidity table and use its 'ID' column as the row index.
MeSH = pd.read_csv("../Input/mesh/comorbidities.csv").set_index('ID')
MeSH.head()

Unnamed: 0_level_0,name
ID,Unnamed: 1_level_1
B04.820.650.589.650.350,HIV
C01.778.640.400.040,Acquired Immunodeficiency Syndrome
C01.925.782.815.616.400.040,Acquired Immunodeficiency Syndrome
C01.925.813.400.040,Acquired Immunodeficiency Syndrome
C01.925.839.040,Acquired Immunodeficiency Syndrome


- Implementing ```lambda``` function to map one column to another column

In [4]:
def _normalize_name(raw_name):
    """Return the name in lower case with surrounding whitespace removed."""
    return raw_name.lower().strip()


# Derive the search phrase of every row from its 'name' column.
MeSH['phrases'] = MeSH['name'].apply(_normalize_name)

In [5]:
# Restore the upper-case spelling for the HIV row: the query cell further down
# searches for this acronym case-sensitively (CONTAINS 'HIV'), so the
# lower-cased phrase produced above would not trigger that branch.
MeSH.at['B04.820.650.589.650.350', 'phrases'] = 'HIV'

In [6]:
# Preview the first rows, now including the 'phrases' column.
MeSH.head()

Unnamed: 0_level_0,name,phrases
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
B04.820.650.589.650.350,HIV,HIV
C01.778.640.400.040,Acquired Immunodeficiency Syndrome,acquired immunodeficiency syndrome
C01.925.782.815.616.400.040,Acquired Immunodeficiency Syndrome,acquired immunodeficiency syndrome
C01.925.813.400.040,Acquired Immunodeficiency Syndrome,acquired immunodeficiency syndrome
C01.925.839.040,Acquired Immunodeficiency Syndrome,acquired immunodeficiency syndrome


In [7]:
def _split_phrases(phrases):
    """Turn a comma-separated phrase string into a list of phrases.

    Each piece is stripped so that "a, b" yields ['a', 'b'] rather than
    ['a', ' b']; a leading space would otherwise become part of the text that
    is searched for. A value that is already a list is returned unchanged,
    which lets this cell be re-run without raising.
    """
    if isinstance(phrases, list):
        return phrases
    return [piece.strip() for piece in phrases.split(',')]


MeSH['phrases'] = MeSH['phrases'].apply(_split_phrases)

In [8]:
# Preview the first rows; each 'phrases' entry is now a list.
MeSH.head()

Unnamed: 0_level_0,name,phrases
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
B04.820.650.589.650.350,HIV,[HIV]
C01.778.640.400.040,Acquired Immunodeficiency Syndrome,[acquired immunodeficiency syndrome]
C01.925.782.815.616.400.040,Acquired Immunodeficiency Syndrome,[acquired immunodeficiency syndrome]
C01.925.813.400.040,Acquired Immunodeficiency Syndrome,[acquired immunodeficiency syndrome]
C01.925.839.040,Acquired Immunodeficiency Syndrome,[acquired immunodeficiency syndrome]


In [9]:
# Show the first index label, i.e. the 'ID' value of the first row.
MeSH.index[0]

'B04.820.650.589.650.350'

In [10]:
# Descriptor ID -> descriptor name; when an ID occurs more than once the last row wins.
id2name = dict(zip(MeSH.index, MeSH['name']))

# Save the lookup table as JSON.
with open("../Data/mesh/statistics/corona_comorbidities_id2name.json", 'w') as wf:
    json.dump(id2name, wf)

#### MeSH to Doc Mapping
- Create a dictionary where the key is the name of a MeSH descriptor, and the value is a list of papers (publications) that contain a mention of the MeSH terms in their body text
- Each paper is a node in the graph that links each attribute name (cord_uid, journal, title, etc.) with its actual information; in the dictionary, a paper is represented by its cord_uid

##### Example of a paper node in the covid graph

In [11]:
# Fetch a single Paper node to show which properties a paper carries.
paper_query = "MATCH (n:Paper) RETURN n LIMIT 1"
# NOTE(review): Data is never read in this cell — confirm whether it is still needed.
Data = []
with driver.session() as session:
    info = session.run(paper_query)
    # Print every returned record (at most one because of LIMIT 1).
    for item in info:
        print(item)

<Record n=<Node id=2385529 labels={'Paper'} properties={'cord_uid': 'ocp6yodg', 'cord19-fulltext_hash': 'b8957d48b6bcf17b7b51e004d19314ce77f653a1', 'journal': 'BMC Infect Dis', 'publish_time': '2011-12-28', 'source': 'PMC', 'title': 'Timeliness of contact tracing among flight passengers for influenza A/H1N1 2009', '_hash_id': '84b069ab23fb0ecebe6925af9c2b18ae', 'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3265549/'}>>


#### Queries the graph for each MeSH descriptor and writes the results to file

In [18]:
# Pattern shared by every search: walk from a paper to its body-text chunks.
BODY_TEXT_MATCH = (
    "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)-"
    "[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(a:BodyText) WHERE ("
)


def build_body_text_query(entities):
    """Build the Cypher query and its parameters for one list of phrases.

    The query returns the distinct papers that have a body-text chunk
    containing every phrase in ``entities``. The phrases are passed as query
    parameters instead of being pasted into the query text, so a phrase that
    contains a quote character can neither break nor alter the query.

    Args:
        entities: non-empty list of phrases to search for.

    Returns:
        A ``(query, parameters)`` tuple suitable for ``session.run``.
    """
    # 'HIV' is matched case-sensitively; every other phrase is lower-case and
    # is compared against the lower-cased text.
    text_expr = "a.text" if entities == ['HIV'] else "LOWER(a.text)"
    conditions = []
    parameters = {}
    for position, phrase in enumerate(entities):
        key = "phrase{}".format(position)
        conditions.append("{} CONTAINS ${}".format(text_expr, key))
        parameters[key] = phrase
    query = BODY_TEXT_MATCH + " AND ".join(conditions) + ") RETURN DISTINCT p"
    return query, parameters


def fetch_cord_uids(entities):
    """Return the cord_uid of every paper whose body text mentions all phrases.

    An empty phrase list yields an empty result without contacting the
    database. Errors raised by the driver are not swallowed: a failed query
    should be visible rather than silently recorded as "no papers".
    """
    if not entities:
        return []
    query, parameters = build_body_text_query(entities)
    cord_uids = []
    with driver.session() as session:
        for record in session.run(query, parameters):
            cord_uid = record["p"].get('cord_uid')
            # Papers without a cord_uid property are skipped.
            if cord_uid is not None:
                cord_uids.append(cord_uid)
    return cord_uids


MeSH_to_result = {}   # descriptor name -> list of cord_uids
MeSH_number = {}      # descriptor ID   -> number of cord_uids found
# Several descriptor IDs share the same phrases; query each phrase list once.
results_by_phrases = {}
for desc, entities in zip(MeSH.index, MeSH['phrases']):
    if desc not in id2name:
        continue
    cache_key = tuple(entities)
    if cache_key not in results_by_phrases:
        results_by_phrases[cache_key] = fetch_cord_uids(entities)
    MeSH_result = results_by_phrases[cache_key]

    name = id2name[desc]
    # Store a copy so that entries sharing a cached result do not alias each other.
    MeSH_to_result[name] = list(MeSH_result)
    MeSH_number[desc] = len(MeSH_result)

In [19]:
# Save the descriptor-name -> paper-id mapping as JSON. The context manager
# closes the file even when json.dump raises.
with open("../Data/mesh/corona_comorbidities_paper_ids.json", 'w') as ff:
    json.dump(MeSH_to_result, ff)

#### Displays the number of matching publications for each MeSH descriptor

In [20]:
# Show, for each MeSH descriptor ID, how many cord_uids were collected for it.
MeSH_number.items()

dict_items([('B04.820.650.589.650.350', 1592), ('C01.778.640.400.040', 56), ('C01.925.782.815.616.400.040', 56), ('C01.925.813.400.040', 56), ('C01.925.839.040', 56), ('C04', 92), ('C06.552', 68), ('C08.127.108', 527), ('C08.381', 133), ('C08.381.495.108', 527), ('C08.381.495.389', 256), ('C08.674.095', 527), ('C10', 12), ('C10.228.140.300', 2), ('C12.777.419', 22), ('C13.351.968.419', 22), ('C14.280.238', 14), ('C14.280.434', 277), ('C14.280.647.250.260', 83), ('C14.907.137.126.339', 83), ('C14.907.253', 2), ('C14.907.489', 412), ('C14.907.585.250.260', 83), ('C15.378.071.141.150.150', 19), ('C15.378.071.141.150.875', 36), ('C15.378.420.155', 19), ('C15.378.420.826', 36), ('C16.320.070.150', 19), ('C16.320.070.875', 36), ('C16.320.365.155', 19), ('C16.320.365.826', 36), ('C18.452', 46), ('C18.452.394.750.124', 28), ('C18.452.394.750.149', 42), ('C18.654.726.500', 259), ('C19.246.267', 28), ('C19.246.300', 42), ('C20.111.327', 28), ('C20.543.480.680.095', 527), ('C20.673.480.040', 56),