# Workflow for relationship extraction

# Literature extraction from PubMed

In [1]:
# library imports 
import csv
import pandas as pd
import time
from csv import DictReader

from metapub import PubMedFetcher # for Entrez query
from Bio.Entrez import efetch # for Entrez query
from Bio import Entrez # for Entrez query



In [20]:
# Name of the model used for the NLP steps; the same name ("en_tox") is what
# spacy.load() is called with further down in this workflow.
model = "en_tox" # model selection
# Contact details attached to Entrez requests (Bio.Entrez module attributes).
# The values below are placeholders and must be replaced before running.
Entrez.email = 'A.N.Other@example.com' # replace with own email
Entrez.api_key = "xxx" # replace with own key
# select input data (chemical list with chemical names):
# the CSV must contain a "Compound Name" column, which is what the PMID
# retrieval step iterates over.
datafile = "ChemicalList.csv"

## Retrieve PMIDs

In [None]:
# Search PubMed once per chemical (chemical name + fixed toxicity filter) and
# store the resulting PMID lists in a CSV, one row per chemical.
query = " AND toxic* AND (human OR Animals, Laboratory OR Disease Models, Animal)"
chemical_list = pd.read_csv(datafile)
retmax = 100 # Maximum number of articles to retrieve per chemical
fetch = PubMedFetcher()

# One record per chemical. Building a list of records (instead of assigning to
# a single dict key inside the loop) keeps the results of every chemical, not
# just the last one, and is the shape csv.DictWriter.writerows() expects.
# Rows without a compound name are skipped because they cannot form a query.
aspis = [
    {
        "Compound Name": chemical,
        # presumably a list of PMID strings; it is written as its Python
        # literal text and parsed back with ast.literal_eval downstream
        "pmids": fetch.pmids_for_query(str(chemical) + query, retmax=retmax),
    }
    for chemical in chemical_list["Compound Name"].dropna()
]

# newline="" is required by the csv module to avoid blank lines on Windows.
with open("data/aspis_abstracts_updated.csv", "w", newline="", encoding="utf-8") as file:
    csvwriter = csv.DictWriter(file, fieldnames=["Compound Name", "pmids"])
    csvwriter.writeheader()
    csvwriter.writerows(aspis)

In [17]:
from ast import literal_eval

# Reload the PMID table: the 'pmids' column holds the text form of a Python
# list, so parse it back into a list and then expand to one row per PMID.
df_aspis = (
    pd.read_csv("data/aspis_abstracts_updated.csv")
    .assign(pmids=lambda frame: frame['pmids'].apply(literal_eval))
    .explode('pmids', ignore_index=True)
)

# The same article can be returned for several chemicals; keep it only once.
df_aspis_nodup = df_aspis.drop_duplicates(subset=['pmids'])
df_aspis_nodup

# One dict per remaining row, ready to be extended with the abstract text.
aspis_nodup = df_aspis_nodup.to_dict(orient='records')

## Retrieve abstracts

In [None]:
# Fetch the abstract text for every de-duplicated PMID and store the records
# (original columns + "abstract") in a CSV file.

# Both names are needed by this cell and were not in scope before it:
# `utils` is the project helper module, and HTTPError is the exception caught
# below. NOTE(review): assumes utils.fetch_abstract surfaces urllib's
# HTTPError (as Bio.Entrez does) -- confirm against utils.
from urllib.error import HTTPError

import utils

t0 = time.time()

for d in aspis_nodup:
    if pd.isna(d["pmids"]):
        # Rows produced from an empty PMID list carry NaN: nothing to fetch.
        d["abstract"] = ""
    else:
        try:
            d["abstract"] = utils.fetch_abstract(d["pmids"])
        except HTTPError:
            # Best effort: mark the record and continue with the next PMID.
            d["abstract"] = "HTTPError"
            print("HTTPerror")

t1 = time.time()

print("Time elapsed:" + str(t1-t0))

# write abstracts to .csv file
aspis_nodup = list(aspis_nodup)
# Fall back to the two columns read downstream so that an empty result still
# produces a valid, header-only file instead of an IndexError.
keys = aspis_nodup[0].keys() if aspis_nodup else ["pmids", "abstract"]

# newline="" is required by the csv module to avoid blank lines on Windows.
with open("data/aspis_abstracts_text_updated.csv", "w", newline="", encoding="utf-8") as file:
    csvwriter = csv.DictWriter(file, keys)
    csvwriter.writeheader()
    csvwriter.writerows(aspis_nodup)

In [58]:
# Reload the fetched abstracts and cast the two columns used below:
# PMIDs as nullable integers, abstract text as strings.
aspis = pd.read_csv("data/aspis_abstracts_text_updated.csv")
aspis = aspis.astype({'pmids': 'Int64', 'abstract': 'str'})

# Plain Python lists of PMIDs and abstract texts for the CREW workflow
pmids = aspis['pmids'].tolist()
abst = aspis['abstract'].tolist()

# PMID -> abstract text lookup
abstracts = {pmid: text for pmid, text in zip(pmids, abst)}

# NLP on abstracts

In [None]:
## load spaCy models
import spacy
from spacy.matcher import DependencyMatcher
from scispacy.abbreviation import AbbreviationDetector
from spacy.pipeline import EntityRuler
from scispacy.linking import EntityLinker
import utils as utils
# Load the spaCy pipeline selected by `model` in the configuration cell
# (currently "en_tox"), so that changing the selection there takes effect here
# instead of being silently ignored by a hard-coded name.
nlp = spacy.load(model)

# add abbreviation detector pipe to spaCy model
# (the "abbreviation_detector" component comes from scispacy.abbreviation,
# imported above)
nlp.add_pipe("abbreviation_detector")

## Pre-processing

In [None]:
# Split the text into sentences
# `abstracts` is the PMID -> abstract text dict built from the abstracts CSV;
# the result is kept in `df` and passed to utils.get_df_relations below.
# NOTE(review): presumably one row per (PMID, sentence) -- confirm in utils.
df = utils.get_df_pmid_sents(nlp, abstracts)
# Optional checkpoint (disabled): persist the sentence table to disk.
#df.to_csv("data/aspis_sentences_updated.csv")

In [64]:
# Keep the sentences that mention a pair of entities of interest:
# two phenotypes, or a compound together with a phenotype.
# (Disabled alternative: reload the sentence table from its checkpoint.)
#df = pd.read_csv("data/aspis_sentences_updated.csv")

# phenotype / phenotype pairs
pheno_labels = ("PHENOTYPE", "PHENOTYPE")
df_pheno = utils.get_df_relations(nlp, *pheno_labels, df)
df_pheno.to_csv("data/aspis_relations_pheno_updated.csv")

# compound / phenotype pairs
chem_labels = ("COMPOUND", "PHENOTYPE")
df_chem = utils.get_df_relations(nlp, *chem_labels, df)
df_chem.to_csv("data/aspis_relations_chem_updated.csv")

In [65]:
# Verbs treated as causal when passed to the dependency matcher helpers.
# Each verb is listed once ('lead' used to appear twice); order is otherwise
# unchanged.
causal_verbs = [
    'increase', 'produce', 'cause',
    'induce', 'generate', 'effect',
    'provoke', 'arouse', 'elicit', 'lead', 'trigger',
    'derive', 'associate', 'relate', 'link',
    'stem', 'originate', 'bring',
    'result', 'inhibit', 'elevate', 'diminish',
]
                

## Relationships between phenotypes

In [66]:
# Dependency matching between two PHENOTYPE entities
matcher_pheno = utils.dependency_matcher(nlp, "PHENOTYPE", "PHENOTYPE")

# Candidate sentences are reloaded from the CSV written by the filtering step
pheno_sentences = pd.read_csv("data/aspis_relations_pheno_updated.csv")
pheno_matches = utils.get_df_dependencyMatcher(
    nlp, pheno_sentences, matcher_pheno, causal_verbs
)

# The three match columns hold lists; expand them together, one row per match
match_columns = ['Verb Match', 'Cause Match', 'Effect Match']
df_pheno = pheno_matches.explode(match_columns)

# Persist the expanded table
df_pheno.to_csv("data/aspis_en_tox_pheno_pheno_updated.csv")

## Relationships between compounds and phenotypes

In [None]:
# Dependency matching between a COMPOUND entity and a PHENOTYPE entity
matcher_chem = utils.dependency_matcher(nlp, "COMPOUND", "PHENOTYPE")
# Candidate sentences are reloaded from the CSV written by the filtering step
df_chem = pd.read_csv("data/aspis_relations_chem_updated.csv")
df_chem = utils.get_df_dependencyMatcher(nlp, df_chem, matcher_chem, causal_verbs)
# transform list elements to rows
df_chem = df_chem.explode(['Verb Match','Cause Match', 'Effect Match'])
# save df to csv
# The path previously read "adata/spis_..." (transposed characters), which
# pointed at a non-existent directory; it now follows the "data/aspis_..."
# naming used by the phenotype-phenotype output.
df_chem.to_csv("data/aspis_en_tox_compound_pheno_updated.csv")

In [73]:
# Keep only the rows where the dependency matcher found a relation, and drop
# relations whose cause and effect are the same entity.
# Inputs are read from data/, where this workflow stores its intermediate
# files; reading them from the working directory fails on a top-to-bottom run.
# NOTE(review): the two output files are still written to the working
# directory, as before -- confirm where the Neo4j loading step expects them.

# Phenotype -> phenotype relations
df_pheno = pd.read_csv("data/aspis_en_tox_pheno_pheno_updated.csv")
df_pheno = df_pheno.loc[df_pheno['Has Match'].eq(True)]
# Remove rows where cause and effect are the same phenotype
df_pheno = df_pheno.loc[df_pheno['Cause Match'] != df_pheno['Effect Match']]
df_pheno.to_csv("aspis_en_tox_pheno_match_updated.csv")

# Compound -> phenotype relations
# NOTE(review): confirm the compound matcher output is written to this path.
df_chem = pd.read_csv("data/aspis_en_tox_compound_pheno_updated.csv")
df_chem = df_chem.loc[df_chem['Has Match'].eq(True)]
# Remove rows where the cause and effect texts are identical
df_chem = df_chem.loc[df_chem['Cause Match'] != df_chem['Effect Match']]
df_chem.to_csv("aspis_en_tox_compound_match_updated.csv")

# Load data to neo4j

In [None]:
# import to personal neo4j instance, without factors
from neo4j import GraphDatabase

# Database Credentials
# All three values are placeholders for a local instance and must be replaced
# before running; avoid committing a real password to version control.
uri = "bolt://localhost:7687" # replace with own local db
userName = "neo4j" # replace with own username
password = "xx" # replace with own password

# Connect to the neo4j database server
# The driver object is kept in a module-level name for the loading steps.
graphDB_Driver  = GraphDatabase.driver(uri, auth=(userName, password))


In [None]:
def create_rel_neo4j_cp(row, cause, effect):
    """Build the Cypher statement that stores one matched relation.

    The statement MERGEs (matches, or creates when absent) a node for the
    cause, a node for the effect, and a relationship between them whose type
    is the matched verb and whose ``PMID`` property is the article id.

    Args:
        row: Mapping with the keys "Cause Match", "Effect Match",
            "Verb Match" and "Pmid".
        cause: Node label used for the cause node.
        effect: Node label used for the effect node.

    Returns:
        The Cypher statement as a string.
    """
    def _escape(value):
        # Names end up inside a double-quoted Cypher literal, so a raw quote
        # or backslash would terminate or corrupt it. Backslashes go first so
        # the ones added for quotes are not doubled again.
        return str(value).replace("\\", "\\\\").replace('"', '\\"')

    # NOTE(review): labels, the verb (relationship type) and the PMID are
    # interpolated as-is; a verb that is not a plain identifier or a missing
    # PMID would yield invalid Cypher. The relationship pattern has no arrow,
    # i.e. it is merged without an explicit direction -- confirm this is
    # intended for cause/effect data.
    cqlCreate = (
        'MERGE (n1:%s { name: "%s"})\n'
        '    MERGE (n2:%s { name: "%s"})\n'
        '    MERGE (n1)-[:%s {PMID:%s}]-(n2)'
    ) % (
        cause,
        _escape(row["Cause Match"]),
        effect,
        _escape(row["Effect Match"]),
        row["Verb Match"],
        row["Pmid"],
    )
    return cqlCreate

def create_rel_neo4j_pp(row, cause, effect):
    """Build the Cypher statement that stores one matched relation.

    The statement MERGEs (matches, or creates when absent) a node for the
    cause, a node for the effect, and a relationship between them whose type
    is the matched verb and whose ``PMID`` property is the article id.

    Args:
        row: Mapping with the keys "Cause Match", "Effect Match",
            "Verb Match" and "Pmid".
        cause: Node label used for the cause node.
        effect: Node label used for the effect node.

    Returns:
        The Cypher statement as a string.
    """
    def _escape(value):
        # Names end up inside a double-quoted Cypher literal, so a raw quote
        # or backslash would terminate or corrupt it. Backslashes go first so
        # the ones added for quotes are not doubled again.
        return str(value).replace("\\", "\\\\").replace('"', '\\"')

    # NOTE(review): labels, the verb (relationship type) and the PMID are
    # interpolated as-is; a verb that is not a plain identifier or a missing
    # PMID would yield invalid Cypher. The relationship pattern has no arrow,
    # i.e. it is merged without an explicit direction -- confirm this is
    # intended for cause/effect data.
    cqlCreate = (
        'MERGE (n1:%s {name: "%s"})\n'
        '    MERGE (n2:%s { name: "%s"})\n'
        '    MERGE (n1)-[:%s {PMID:%s}]-(n2)'
    ) % (
        cause,
        _escape(row["Cause Match"]),
        effect,
        _escape(row["Effect Match"]),
        row["Verb Match"],
        row["Pmid"],
    )
    return cqlCreate
