## Data Analysis

This notebook sumarizes the preliminary data found in Corona virus, Covid-19 document count, time published, cohort studied etc.

In [1]:
import json as json
import pandas as pd
from neo4j import GraphDatabase
import csv

In [2]:
covid_browser = "https://covid.petesis.com:7473"
covid_url = "bolt://covid.petesis.com:7687"
user = "public"
password = "corona"

driver = GraphDatabase.driver(uri = covid_url,\
                              auth = (user,password))

In [3]:
with open("../../Data/mesh/statistics/corona_id2name.json",'r')as rf:
    id2name = json.load(rf)

In [4]:
DATA = []
allIDs = []
with open("../../Data/mesh/corona_paper_ids.json", 'r') as fcount:
        data = json.load(fcount)
        for key,value in data.items():
            for xid in value:
                allIDs.append(xid)
            DATA.append({"MeSH":key,"Count":len(value)})

### Document Count

In [5]:
DATA

[{'Count': 679, 'MeSH': 'C01.925.782.600.550.200'},
 {'Count': 5, 'MeSH': 'C01.925.782.600.550.200.325'},
 {'Count': 1045, 'MeSH': 'C01.925.782.600.550.200.360'},
 {'Count': 153, 'MeSH': 'C01.925.782.600.550.200.400'},
 {'Count': 12176, 'MeSH': 'C01.925.782.600.550.200.750'}]

In [6]:
DF = pd.DataFrame(DATA)

In [7]:
DF = DF.sort_values('Count', ascending =False)

In [None]:
DF

Unnamed: 0,Count,MeSH
4,12176,C01.925.782.600.550.200.750
2,1045,C01.925.782.600.550.200.360
0,679,C01.925.782.600.550.200
3,153,C01.925.782.600.550.200.400
1,5,C01.925.782.600.550.200.325


In [9]:
DF['Name'] = DF['MeSH'].apply(lambda x: id2name[x])

In [10]:
DF = DF.set_index('Name')

In [11]:
DF

Unnamed: 0_level_0,Count,MeSH
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Severe Acute Respiratory Syndrome,12176,C01.925.782.600.550.200.750
Feline Infectious Peritonitis,1045,C01.925.782.600.550.200.360
Coronavirus Infections,679,C01.925.782.600.550.200
"Gastroenteritis, Transmissible, of Swine",153,C01.925.782.600.550.200.400
"Enteritis, Transmissible, of Turkeys",5,C01.925.782.600.550.200.325


In [12]:
DF.to_csv('corona_mesh_count.csv')

### Cohort and Region

In [13]:
uniqueIDs = list(set(allIDs))
len(uniqueIDs)

13357

In [15]:
PaperID_to_Info = []
limit = 0
for _id in uniqueIDs:
    paper_info = {}
    
    #Builds each part of the query based on the MeSH descriptor entity list
    query_info = "MATCH (p:Paper) WHERE p.cord_uid = '" + str(_id) + "' RETURN DISTINCT p"
    
    query_bodytext = "MATCH (p:Paper)-[:PAPER_HAS_BODYTEXTCOLLECTION]-(:BodyTextCollection)-" \
                        "[:BODYTEXTCOLLECTION_HAS_BODYTEXT]-(b:BodyText) WHERE p.cord_uid = '" \
                        + str(_id) + "' RETURN DISTINCT b"
            
    query_abstract = "MATCH (p:Paper)-[:PAPER_HAS_ABSTRACTCOLLECTION]-(:AbstractCollection)-" \
                        "[:ABSTRACTCOLLECTION_HAS_ABSTRACT]-(a:Abstract) WHERE p.cord_uid = '" \
                        + str(_id) + "' RETURN DISTINCT a"
    
    with driver.session() as session:
        info = session.run(query_info)
        for item in info:
            node_keys = list((item.values(0)[0]).keys())
            node_values = list((item.values(0)[0]).values())
            for i in range(len(node_keys)):
                paper_info[node_keys[i]] = node_values[i]
        
        bodytext = session.run(query_bodytext)
        bodytext_list = []
        for item in bodytext:
            try:
                node_keys = list((item.values(0)[0]).keys())
                node_values = list((item.values(0)[0]).values())
                for i in range(len(node_keys)):
                    if (node_keys[i] == '_hash_id'):
                        bodytext_list.append(node_values[i])
            except:
                continue
        paper_info['bodytexts'] = bodytext_list
        
        abstract = session.run(query_abstract)
        abstract_list = []
        for item in abstract:
            try:
                node_keys = list((item.values(0)[0]).keys())
                node_values = list((item.values(0)[0]).values())
                for i in range(len(node_keys)):
                    if (node_keys[i] == '_hash_id'):
                        abstract_list.append(node_values[i])  
            except:
                continue
        paper_info['abstracts'] = abstract_list
    
    try:
        PaperID_to_Info.append({"MeSH":_id, "Info":paper_info})
        if limit == 0:
            print(PaperID_to_Info)
    except:
        continue
    limit += 1

[{'MeSH': '0x9mjwpc', 'Info': {'cord_uid': '0x9mjwpc', 'cord19-fulltext_hash': '2419ba014f673040b6a3879c3473e9267eec28c9', 'journal': 'Clinical Microbiology and Infection', 'publish_time': '2014-03-31', 'source': 'Elsevier', 'title': 'Emerging respiratory viruses: is it ‘much ado about nothing’? (Shakespeare)', '_hash_id': '3023a4f64973996f328707734265bc3f', 'url': 'https://doi.org/10.1111/1469-0691.12488', 'bodytexts': ['2bbe6b5c0f010ab26760750d1500e872', '410a44abbda63b7ec6da59632e274287', 'e1020b049806d84befe1a951957b6e30', '1b53f789769037e5d480047a0063a94e', '701b55d2ebb05f4a7d3a309d453e1f2f', '1b66b3ec8a01ccf605f90088686729d9'], 'abstracts': []}}]


In [16]:
PID = pd.DataFrame(PaperID_to_Info)

In [17]:
PID

Unnamed: 0,Info,MeSH
0,"{'cord_uid': '0x9mjwpc', 'cord19-fulltext_hash...",0x9mjwpc
1,"{'cord_uid': 'tyro2vuo', 'cord19-fulltext_hash...",tyro2vuo
2,"{'cord_uid': 'gegboxgs', 'cord19-fulltext_hash...",gegboxgs
3,"{'cord_uid': 'btdbv2tk', 'cord19-fulltext_hash...",btdbv2tk
4,"{'cord_uid': 'eptit46j', 'cord19-fulltext_hash...",eptit46j
5,"{'cord_uid': 'ejfp8gqz', 'cord19-fulltext_hash...",ejfp8gqz
6,"{'cord_uid': 'waidnotv', 'cord19-fulltext_hash...",waidnotv
7,"{'cord_uid': '7bmwfubu', 'journal': 'Bioscienc...",7bmwfubu
8,"{'cord_uid': 'aim1fiaj', 'cord19-fulltext_hash...",aim1fiaj
9,"{'cord_uid': 'f3zdttfo', 'cord19-fulltext_hash...",f3zdttfo


In [18]:
PID = PID.set_index('MeSH')

In [19]:
PID

Unnamed: 0_level_0,Info
MeSH,Unnamed: 1_level_1
0x9mjwpc,"{'cord_uid': '0x9mjwpc', 'cord19-fulltext_hash..."
tyro2vuo,"{'cord_uid': 'tyro2vuo', 'cord19-fulltext_hash..."
gegboxgs,"{'cord_uid': 'gegboxgs', 'cord19-fulltext_hash..."
btdbv2tk,"{'cord_uid': 'btdbv2tk', 'cord19-fulltext_hash..."
eptit46j,"{'cord_uid': 'eptit46j', 'cord19-fulltext_hash..."
ejfp8gqz,"{'cord_uid': 'ejfp8gqz', 'cord19-fulltext_hash..."
waidnotv,"{'cord_uid': 'waidnotv', 'cord19-fulltext_hash..."
7bmwfubu,"{'cord_uid': '7bmwfubu', 'journal': 'Bioscienc..."
aim1fiaj,"{'cord_uid': 'aim1fiaj', 'cord19-fulltext_hash..."
f3zdttfo,"{'cord_uid': 'f3zdttfo', 'cord19-fulltext_hash..."


In [20]:
DF.to_csv('corona_mesh_paper_info.csv')