In [2]:
!pip install stanza spacy scispacy nltk transformers --quiet
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_bc5cdr_md-0.2.4.tar.gz --quiet
    
#NOTE: after installation you may need to restart the kernel/runtime before running the cells below, so that the scispacy model loads correctly

In [4]:
%%capture
import spacy
import scispacy
import nltk
nltk.download('popular')
nltk.download('brown')
from textblob import TextBlob
import numpy as np
import pandas as pd
from itertools import permutations 
nlp = spacy.load('en_ner_bc5cdr_md')
from transformers import pipeline
nlp_qa = pipeline('question-answering')

In [5]:
#this function takes in a sentence, and returns a dictionary (if possible to extract)
ce_count = 0    # number of sentences processed so far (drives the progress message)
sent_count = 0  # number of sentences for which a cause-effect pair was accepted
def get_ce_dict(sent, min_score=0.8):
  """Extract the best-scoring cause-effect pair from a single sentence.

  Noun phrases are collected from TextBlob and from the spaCy pipeline `nlp`.
  The question-answering pipeline `nlp_qa` is then queried, with `sent` as
  context, using two families of questions:
    * "Is A the cause of B?" for every ordered pair (A, B) of noun phrases;
      the candidate pair is cause=A, effect=B.
    * "What is the cause of B?" for every noun phrase B; the candidate pair is
      cause=<the pipeline's answer>, effect=B.
  The candidate with the highest score that reaches `min_score` wins (on a
  tie the later candidate wins).

  Args:
    sent: sentence text, used for noun-phrase extraction and as QA context.
    min_score: minimum QA score a candidate needs to be accepted. Defaults to
      0.8, the value that was previously hard-coded.

  Returns:
    A dict holding a copy of the winning QA result plus the keys 'cause',
    'effect', 'question' and 'noun_phrases'; np.nan when the sentence has
    fewer than two distinct noun phrases or no candidate reaches `min_score`.

  Side effects:
    Increments the module-level counters `ce_count` (always) and `sent_count`
    (when a pair is returned) and prints a progress line every 10 sentences.
  """
  global ce_count,sent_count
  nps_tb = TextBlob(sent).noun_phrases   #can be substituted with an alternative way of extracting noun phrases
  nps_spacy = [chunk.text for chunk in nlp(sent).noun_chunks]
  # sorted(set(...)) de-duplicates and orders the phrases like np.unique did, but
  # keeps plain Python str instead of np.str_ (whose repr differs between numpy
  # versions and would otherwise leak into the exported noun_phrases column).
  nps = sorted(set(list(nps_tb)+nps_spacy))
  best_score = min_score
  best_dict = np.nan
  if len(nps)>1:
    perms = list(permutations(nps,2))
    questions1 = ["Is {} the cause of {}?".format(a,b) for (a,b) in perms]
    questions2 = ["What is the cause of {}?".format(a) for a in nps]
    questions = questions1 + questions2
    qa_input = [{'context':sent,'question':question} for question in questions]
    results = nlp_qa(qa_input)
    # For the pair questions the cause/effect are the noun phrases themselves;
    # for the open questions the cause is whatever span the QA model answered.
    causes = [a for (a,b) in perms] + [res['answer'] for res in results[len(questions1):]]
    effects = [b for (a,b) in perms] + nps
    for question,result,cause,effect in zip(questions,results,causes,effects):
      if result['score']>=best_score:
        best_score = result['score']
        best_dict = dict(result)   # copy so the pipeline's own result is not mutated
        best_dict.update(cause=cause,effect=effect,question=question,noun_phrases=nps)
  ce_count += 1
  # Count on "a pair was returned" rather than on `score > 0.8`: the old check
  # missed pairs accepted with a score exactly equal to the threshold.
  if isinstance(best_dict,dict):
    sent_count += 1
  if ce_count%10==0:
    print('sentences read == {}, sentences annotated = {}'.format(ce_count,sent_count))
  return best_dict

In [None]:
n = 100  #set the number of sentences you want to analyze
# Each line of the labelled file is expected to look like "<tag> == <sentence>".
with open('227k_labeled_sentences.txt') as labeled_file:   # handle is closed once the lines are read
  sents = labeled_file.readlines()
# partition() splits on the FIRST ' == ' only, so a sentence that itself contains
# ' == ' is kept whole, and a line without the separator yields an empty sentence
# (dropped by the tag filter below) instead of raising IndexError. The trailing
# line break left by readlines() is stripped so it does not end up in the QA
# context or in the exported CSV.
split_lines = [line.rstrip('\r\n').partition(' == ') for line in sents]
cr_df = pd.DataFrame({
  "raw_sentence": [sentence for (_, _, sentence) in split_lines],
  "tag": [tag for (tag, _, _) in split_lines],
})
# Keep the first n sentences tagged '1'; .copy() makes the slice an independent
# frame so adding a column below cannot trigger SettingWithCopyWarning.
cr_df = cr_df.loc[cr_df["tag"]=='1',:].iloc[0:n,:].copy()
sents_raw = cr_df["raw_sentence"].values
# One QA round-trip per sentence; NaN marks sentences without an accepted pair.
cr_df["HF_result"] = cr_df["raw_sentence"].apply(get_ce_dict)
# Displayed as the cell output: number of missing values per column.
cr_df.isnull().sum()

Pair found for sentence : the subjects were exposed to uv irradiation, causing a local tissue inflammation.

 cause:uv irradiation,,effect:a local tissue inflammation,score:0.9572199767599869
Pair found for sentence : the subjects were exposed to uv irradiation, causing a local tissue inflammation.

 cause:uv irradiation,,effect:local tissue inflammation,score:0.9704209240399138
Pair found for sentence : az12048189 did not, however, have any significant effects as assessed using mechanical stimulation or laser doppler.

 cause:mechanical stimulation,effect:laser doppler,score:0.975353403635566
Pair found for sentence : this study also proved that trpv1 antagonists can inhibit a more complex, non-capsaicin dependent thermally induced pain signal.

 cause:trpv1 antagonists,effect:non-capsaicin dependent thermally induced pain signal,score:0.864427191783534
Pair found for sentence : discomfort, severity and frequency of symptoms, visual symptoms, conjunctival injection, eyelid-meibomian g

In [0]:
# Drop every row that contains a missing value (get_ce_dict returns NaN when no
# pair is found), then unpack each result dict into flat columns and export them.
cr_df = cr_df.dropna()
hf_results = cr_df["HF_result"]
# Plain copies of the dict entries, added in the same column order as before.
for field in ("cause", "effect", "question"):
  cr_df[field] = hf_results.apply(lambda res, key=field: res[key])
# The score is the only field that is transformed: rounded to 3 decimals.
cr_df["score"] = hf_results.apply(lambda res: round(res["score"], 3))
cr_df["noun_phrases"] = hf_results.apply(lambda res: res["noun_phrases"])
export_columns = ["raw_sentence", "cause", "effect", "question", "score", "noun_phrases"]
cr_df[export_columns].to_csv("cr_df15.csv", index=False)