### MeSH Based Query Search

In this notebook, we will discuss how to search documents based on the comorbidities of coronavirus.

In [1]:
import pandas as pd
import json
from neo4j import GraphDatabase
import csv

#### Authentication to access covidgraph.org graph

In [2]:
# Connection details for the covidgraph.org graph.
covid_browser = "https://covid.petesis.com:7473"  # not used by the Bolt connection below
covid_url = "bolt://covid.petesis.com:7687"       # Bolt endpoint handed to the driver
user = "public"
password = "corona"

# Single driver object; the query cells open their sessions from it.
driver = GraphDatabase.driver(covid_url, auth=(user, password))

#### MeSH descriptor to its entity list
- Ex. ```C01.925.782.600.550.200.360: [feline infectious peritonitis]```
- A pandas DataFrame is very convenient for handling a CSV file, specifically for data transformation with a ```lambda``` mapping function.

In [3]:
# Load the comorbidity MeSH table and use the descriptor `ID` column as the index.
MeSH = pd.read_csv("input/mesh/comorbidities.csv").set_index('ID')
MeSH.head()

Unnamed: 0_level_0,name
ID,Unnamed: 1_level_1
B04.820.650.589.650.350,HIV
C01.778.640.400.040,Acquired Immunodeficiency Syndrome
C01.925.782.815.616.400.040,Acquired Immunodeficiency Syndrome
C01.925.813.400.040,Acquired Immunodeficiency Syndrome
C01.925.839.040,Acquired Immunodeficiency Syndrome


- Implementing ```lambda``` function to map one column to another column

In [4]:
# Normalise every descriptor name: lower-case it and trim surrounding whitespace.
MeSH['phrases'] = [label.lower().strip() for label in MeSH['name']]

In [5]:
# Preview the first rows to check the new lower-cased `phrases` column.
MeSH.head()

Unnamed: 0_level_0,name,phrases
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
B04.820.650.589.650.350,HIV,hiv
C01.778.640.400.040,Acquired Immunodeficiency Syndrome,acquired immunodeficiency syndrome
C01.925.782.815.616.400.040,Acquired Immunodeficiency Syndrome,acquired immunodeficiency syndrome
C01.925.813.400.040,Acquired Immunodeficiency Syndrome,acquired immunodeficiency syndrome
C01.925.839.040,Acquired Immunodeficiency Syndrome,acquired immunodeficiency syndrome


In [6]:
# Turn each descriptor name into the list of comma-separated phrases that the
# search cell requires to all be present in a paper's body text.
# The list is derived from the untouched `name` column rather than from
# `phrases` itself, so re-running this cell is safe (splitting `phrases` in
# place fails on a second run because its values are already lists).
# NOTE: pieces are not stripped individually, so any whitespace around a comma is kept.
MeSH['phrases'] = MeSH['name'].apply(lambda name: name.lower().strip().split(','))

In [7]:
# Preview again: each `phrases` entry is now a list of strings.
MeSH.head()

Unnamed: 0_level_0,name,phrases
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
B04.820.650.589.650.350,HIV,[hiv]
C01.778.640.400.040,Acquired Immunodeficiency Syndrome,[acquired immunodeficiency syndrome]
C01.925.782.815.616.400.040,Acquired Immunodeficiency Syndrome,[acquired immunodeficiency syndrome]
C01.925.813.400.040,Acquired Immunodeficiency Syndrome,[acquired immunodeficiency syndrome]
C01.925.839.040,Acquired Immunodeficiency Syndrome,[acquired immunodeficiency syndrome]


In [8]:
# The index holds the MeSH descriptor IDs (set via `set_index('ID')`); show the first one.
MeSH.index[0]

'B04.820.650.589.650.350'

In [9]:
# Map every descriptor ID to its name (a repeated ID keeps its last name),
# then store the mapping as JSON.
id2name = dict(zip(MeSH.index, MeSH['name']))

with open("data/data-statistics/corona_comorbidities_id2name.json", 'w') as wf:
    json.dump(id2name, wf)

#### MeSH to Doc Mapping
- Create a dictionary where the key is a MeSH descriptor, and the value is a list of papers (publications) that contain a mention of the MeSH term in their body text
- Each paper node in the graph carries attributes (cord_uid, journal, title, etc.); in the mapping, a paper is identified by its `cord_uid`

##### Example of a paper node in the covid graph

In [10]:
# Fetch a single Paper node and print the raw record to show which
# properties a paper carries.
paper_query = "MATCH (n:Paper) RETURN n LIMIT 1"
Data = []
with driver.session() as session:
    for record in session.run(paper_query):
        print(record)

<Record n=<Node id=3198 labels={'Paper'} properties={'cord_uid': 'zrmkq3mz', 'cord19-fulltext_hash': '41c7a01f11ed47591d99f45774e43e45aeba0619', 'journal': 'BMC Microbiol', 'publish_time': '2009-08-12', 'source': 'PMC', 'title': 'CAPIH: A Web interface for comparative analyses and visualization of host-HIV protein-protein interactions', '_hash_id': '3c4b2ee1430dc9ac53aca87c0fc0f7eb', 'url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2782265/'}>>


#### Run the query for every MeSH descriptor, then write the results to file

In [11]:
# Cypher template: papers that have a body-text node containing every phrase
# in $phrases (case-insensitive via LOWER, as before).
# The phrases are passed as a query parameter instead of being concatenated
# into the statement, so a name containing a quote character can neither break
# the query nor inject Cypher.
MESH_PAPER_QUERY = (
    "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)-"
    "[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(a:BodyText) "
    "WHERE ALL(phrase IN $phrases WHERE LOWER(a.text) CONTAINS phrase) "
    "RETURN DISTINCT p"
)

MeSH_to_result = {}   # descriptor ID -> list of cord_uids of matching papers
MeSH_number = {}      # descriptor ID -> number of matching papers

# Several descriptor IDs share the same name and therefore the same phrase
# list; remember each phrase list's result so the query runs only once per
# distinct phrase list.
_results_by_phrases = {}

# One session serves all descriptors; errors from the database are allowed to
# propagate instead of being silently swallowed.
with driver.session() as session:
    for desc, entities in zip(MeSH.index, MeSH['phrases']):
        phrase_key = tuple(entities)

        if phrase_key not in _results_by_phrases:
            if phrase_key:
                records = session.run(MESH_PAPER_QUERY, {"phrases": list(phrase_key)})
                cord_uids = (record["p"].get("cord_uid") for record in records)
                # Papers without a cord_uid property are left out, as before.
                _results_by_phrases[phrase_key] = [uid for uid in cord_uids if uid is not None]
            else:
                # No phrases to search for: record an empty result rather than
                # sending a query whose condition would match every paper.
                _results_by_phrases[phrase_key] = []

        # Copy so descriptors that share a phrase list do not share one list object.
        MeSH_result = list(_results_by_phrases[phrase_key])
        MeSH_to_result[desc] = MeSH_result
        MeSH_number[desc] = len(MeSH_result)

In [12]:
# Store the descriptor -> cord_uid list mapping as JSON.
# The context manager closes the file even if serialisation raises, which the
# manual open()/close() pair did not guarantee.
with open("data/mesh_search/mesh_corona_comorbidities_paper_ids.json", 'w') as ff:
    json.dump(MeSH_to_result, ff)

#### Shows the number of matching publications for each MeSH descriptor, from the dictionary created above

In [13]:
# Number of matching papers (cord_uids collected) per MeSH descriptor.
MeSH_number.items()

dict_items([('B04.820.650.589.650.350', 10745), ('C01.778.640.400.040', 548), ('C01.925.782.815.616.400.040', 548), ('C01.925.813.400.040', 548), ('C01.925.839.040', 548), ('C04', 556), ('C06.552', 309), ('C08.127.108', 3002), ('C08.381', 526), ('C08.381.495.108', 3002), ('C08.381.495.389', 1201), ('C08.674.095', 3002), ('C10', 80), ('C10.228.140.300', 10), ('C12.777.419', 92), ('C13.351.968.419', 92), ('C14.280.238', 71), ('C14.280.434', 1286), ('C14.280.647.250.260', 300), ('C14.907.137.126.339', 300), ('C14.907.253', 10), ('C14.907.489', 2017), ('C14.907.585.250.260', 300), ('C15.378.071.141.150.150', 103), ('C15.378.071.141.150.875', 110), ('C15.378.420.155', 103), ('C15.378.420.826', 110), ('C16.320.070.150', 103), ('C16.320.070.875', 110), ('C16.320.365.155', 103), ('C16.320.365.826', 110), ('C18.452', 201), ('C18.452.394.750.124', 87), ('C18.452.394.750.149', 145), ('C18.654.726.500', 1274), ('C19.246.267', 87), ('C19.246.300', 145), ('C20.111.327', 87), ('C20.543.480.680.095', 