In [None]:
# Install the StanfordNLP package imported by the cells below (no version is pinned here).
!pip install stanfordnlp



In [None]:
# This notebook extracts causes, cures and preventions of cancer from tweets and Reddit posts

import os,ast,sys,warnings
import pandas as pd
import stanfordnlp
# Fetch the English models. The recorded output of this cell shows the call
# prompting interactively (confirm download, choose a directory).
stanfordnlp.download('en')
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

#disable warnings
# Only install the blanket "ignore" filter when no -W warning options were
# supplied on the interpreter command line.
if not sys.warnoptions:
    warnings.simplefilter("ignore")

Using the default treebank "en_ewt" for language "en".
Would you like to download the models for: en_ewt now? (Y/n)
Y

Default download directory: /root/stanfordnlp_resources
Hit enter to continue or type an alternate directory.


Downloading models for: en_ewt
Download location: /root/stanfordnlp_resources/en_ewt_models.zip


100%|██████████| 235M/235M [00:23<00:00, 10.1MB/s]



Download complete.  Models saved to: /root/stanfordnlp_resources/en_ewt_models.zip
Extracting models file for: en_ewt
Cleaning up...Done.


In [None]:
# Load the data (splitting tweets and reddit posts into sentences)

def save_dataset(tweet_file, reddit_file, out_file='dataset.csv'):
    """Build the sentence-level dataset from the tweet and reddit CSVs and save it.

    One datapoint is produced per post: the post's remaining columns plus a
    'Sentences' column holding the list of its sentences that mention
    "cancer" (case-insensitive). All reddit posts are treated as verified.

    Args:
        tweet_file: path to the tweets CSV; must contain 'Unnamed: 0' and 'Text'.
        reddit_file: path to the reddit CSV; must contain 'Unnamed: 0', 'ID',
            'url', 'subreddit', 'Title' and 'Selftext'.
        out_file: where the merged dataset is written (defaults to the
            'dataset.csv' the rest of the notebook loads).

    Returns:
        The merged DataFrame, with 'Text' replaced by 'Sentences'. Note that in
        the returned frame 'Sentences' holds real lists, whereas a frame read
        back from ``out_file`` holds their string representation.
    """
    tweets = pd.read_csv(tweet_file).drop(['Unnamed: 0'], axis=1)
    reddits = pd.read_csv(reddit_file).drop(['Unnamed: 0','ID','url','subreddit'], axis=1)
    # Blank out missing titles/bodies *before* building the text. The previous
    # approach stringified NaN and later stripped every ' nan' substring, which
    # also mangled real words such as "my nan" or " nanoparticles".
    reddits['Text'] = reddits['Title'].fillna('').astype(str) + " " + reddits['Selftext'].fillna('').astype(str)
    reddits = reddits.drop(['Title','Selftext'], axis=1)
    reddits.insert(1, "verified", 1)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    # ignore_index gives the same 0..n-1 index a later pd.read_csv would yield.
    data = pd.concat([tweets, reddits], ignore_index=True)

    # Split every post into sentences and keep only those mentioning cancer.
    tokenizer = PunktSentenceTokenizer()
    segmented = []
    for item in data['Text']:
        # str() guards against a missing tweet text (NaN is a float).
        item = str(item).replace('-',' ').replace('_',' ')
        # The tokenizer is (re)trained on each post before splitting it.
        tokenizer.train(item)
        sentences = tokenizer.tokenize(item)
        segmented.append([s for s in sentences if 'cancer' in s.lower()])
    # Summarise instead of dumping every sentence of the corpus to the output.
    print("Segmented %d posts into sentences" % len(segmented))
    data = data.drop(['Text'],axis=1)
    data['Sentences'] = segmented
    print("Exporting to a CSV file")
    data.to_csv(out_file, sep=',', index=False)
    return data

# Build the dataset on the first run, then ALWAYS load it back from the CSV.
# Later cells parse the 'Sentences' column with ast.literal_eval, which only
# works on the string form stored in the CSV; using the frame returned by
# save_dataset directly (real lists) made the first run fail downstream.
datafile = 'dataset.csv'
if not os.path.isfile(datafile):
    save_dataset("cancer-5.8k-tweets.csv","cancer-621-reddit.csv")
data = pd.read_csv(datafile)

print(data.head())

              Topic  ...                                          Sentences
0  Causes of cancer  ...  ['Pollution causes cancer.', 'In states where ...
1  Causes of cancer  ...  ['Vanity Fair: Trump, "who has hugely downplay...
2  Causes of cancer  ...  ["Between this and my nieces emphatically tell...
3  Causes of cancer  ...  ['The Wash U communications Dept misinterprete...
4  Causes of cancer  ...  ['1/n: What is the flux energy threshold that ...

[5 rows x 4 columns]


In [None]:
# create a text pipeline with all the processors
# Called with no arguments; the log recorded below shows it loading the
# tokenize, pos, lemma and depparse processors for en_ewt on CPU.
nlp = stanfordnlp.Pipeline()

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand

In [None]:
'''
Approach - Rule based matching
Will try to match sentences with predefined rules. 
Can capture single word causes of cancer
'''
def match_rule(sent):
    """Match a parsed sentence against the five "X ... cancer" cause patterns.

    The words of ``sent`` are scanned left to right while remembering at most
    the five preceding words. Whenever a word whose lemma is 'cancer' is
    reached, the remembered words are compared with these patterns, in order:

        1. X cause cancer
        2. X will lead to cancer
        3. X result in cancer
        4. X lead to cancer
        5. X be a cause/reason of cancer

    A pattern only counts when X is not tagged PRON, ADV or CCONJ; otherwise
    the next pattern is tried.

    Args:
        sent: object exposing ``words``, each word having ``lemma`` and ``upos``.

    Returns:
        ``(rule_number, lemma_of_X)`` for the first match, or ``(0, None)``.
    """
    excluded_upos = ('PRON', 'ADV', 'CCONJ')
    # (rule number, allowed lemmas for each word between X and "cancer")
    patterns = (
        (1, (('cause',),)),
        (2, (('will',), ('lead',), ('to',))),
        (3, (('result',), ('in',))),
        (4, (('lead',), ('to',))),
        (5, (('be',), ('a',), ('cause', 'reason'), ('of',))),
    )

    history = []  # the (at most) five words seen before the current one
    for token in sent.words:
        if token.lemma == 'cancer':
            for rule_id, trigger in patterns:
                span = len(trigger) + 1  # trigger words plus the candidate X
                if len(history) < span:
                    continue
                candidate = history[-span]
                following = history[-span + 1:]
                lemmas_ok = all(tok.lemma in allowed
                                for tok, allowed in zip(following, trigger))
                if lemmas_ok and candidate.upos not in excluded_upos:
                    return rule_id, candidate.lemma
        # Unmatched words (including an unmatched "cancer") enter the window.
        history.append(token)
        del history[:-5]

    return 0, None


# Extraction - simplest way
# Runs every "Causes of cancer" sentence through the pipeline, applies
# match_rule to each parsed sentence and tallies rules hit and causes found.
limit = 500
matched = [0,0,0,0,0,0] # count array for 5 rules, index 0 for no match
target_dict = {}

causedata = data[data['Topic']=='Causes of cancer']
# Preview only; printing the whole frame floods the cell output.
print(causedata.head())
printlist = []

print("Number of datapoints for causes: ",len(causedata))
for idx, row in causedata.iterrows():
    # 'Sentences' is a stringified list when loaded from the CSV but a real
    # list when the frame comes straight from save_dataset; accept both
    # (ast.literal_eval raises on a list).
    raw_sentences = row["Sentences"]
    sentences = raw_sentences if isinstance(raw_sentences, list) else ast.literal_eval(raw_sentences)
    for sentence in sentences:
        doc = nlp(sentence)
        # The pipeline may split one input string into several sentences.
        for s in doc.sentences:
            # Space-terminated tokens, kept for the exported debugging CSV.
            lemmatized = "".join(w.lemma + " " for w in s.words)
            dep_rels = "".join(w.dependency_relation + " " for w in s.words)
            printlist.append([sentence,lemmatized,dep_rels])
            #MATCHING BEGINS!
            i, tgt = match_rule(s)
            matched[i] += 1
            if(i!=0):
                # Fold case so "Bacon" and "bacon" are counted as one cause.
                cause = tgt.lower()
                target_dict[cause] = target_dict.get(cause, 0) + 1
    #uncomment break out of loop early
    #if idx == limit-1:
     #   break

print(matched,"\n",target_dict)
#exporting reqd CSV
# Passing the column names to the constructor also works when nothing was parsed
# (assigning df.columns on an empty, column-less frame raises).
df = pd.DataFrame(printlist, columns=['sentence','lemmatized','dep_parse_out'])
df.to_csv('dep_parse_out.csv', sep=',', index=False)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
	 ++ Matched for rule 1

Sentence:  The sun causes cancer.
	 ++ Matched for rule 1

Sentence:  Woke up to the internet telling me that bacon causes cancer.
	 ++ Matched for rule 1

Sentence:  Bacon causes cancer.
	 ++ Matched for rule 1

Sentence:  Everything causes cancer.

Sentence:  Bacon causes cancer.
	 ++ Matched for rule 1

Sentence:  Who says that processed meat causes cancer?
	 ++ Matched for rule 1

Sentence:  WHO says bacon causes cancer.
	 ++ Matched for rule 1

Sentence:  Watching the global news media's internal battle to justify prioritising 'bacon causes cancer' over an earthquake is kind of depressing.
	 ++ Matched for rule 1

Sentence:  @Genevieve Between this and "red meat causes cancer," the hypochondriac part of me is on full alert today.
	 ++ Matched for rule 1

Sentence:  Everything causes cancer, except the things listed by Pocahontas in "Colors of the Wind."

Sentence:  WHO said that meat causes c

In [None]:
#Result analysis, stats
for i in range(1,6):
    print("\tSentences matched with rule ",i,": ",matched[i])
# matched[0] counts sentences that hit no rule. Guard the division so an empty
# run (no sentence parsed at all) reports 0.0 instead of raising ZeroDivisionError.
# Note: the printed value is a fraction in [0, 1], despite the "Percentage" label.
total_parsed = sum(matched)
matched_fraction = 1-(matched[0]/total_parsed) if total_parsed else 0.0
print("\nPercentage of sentences matched with rules:", matched_fraction)
print("\nCauses identified and their counts: ")
# Only report causes seen at least `threshold` times; single-character keys are skipped.
threshold = 15
for key,val in target_dict.items():
    if(len(key)>1 and val >= threshold):
        print("Cause: %-10s"%key,"\t-- count: ",val)

	Sentences matched with rule  1 :  2060
	Sentences matched with rule  2 :  6
	Sentences matched with rule  3 :  0
	Sentences matched with rule  4 :  39
	Sentences matched with rule  5 :  11

Percentage of sentences matched with rules: 0.5397959183673469

Causes identified and their counts: 
Cause: smoking    	-- count:  208
Cause: cigarett   	-- count:  16
Cause: tobacco    	-- count:  25
Cause: windmill   	-- count:  95
Cause: wifi       	-- count:  25
Cause: meat       	-- count:  107
Cause: turbine    	-- count:  31
Cause: noise      	-- count:  151
Cause: bacon      	-- count:  64
Cause: smoke      	-- count:  69
Cause: coffee     	-- count:  15
Cause: powder     	-- count:  25
Cause: radiation  	-- count:  17
Cause: hpv        	-- count:  22
Cause: Bacon      	-- count:  28
Cause: obesity    	-- count:  26
Cause: not        	-- count:  17
Cause: wind       	-- count:  33


In [None]:
# `warnings` is already imported in the first code cell; this re-import is redundant but harmless.
import warnings
# Ignore all warnings from here on.
warnings.filterwarnings("ignore")

In [None]:
# Approach 2 - dependency-parse based extraction of (multi-word) causes.

# A sentence is only parsed when it contains one of these exact substrings.
CAUSE_PHRASES = ("causes cancer", "cause cancer", "lead to cancer", "results in cancer",
                 "leads to cancer", "reason of cancer", "cause of cancer")
# Lemmas the governor of "cancer" must have for the sentence to be considered.
CAUSE_GOVERNOR_LEMMAS = {'cause', 'lead', 'reason', 'result'}
# Extracted phrases that carry no information and are not counted.
UNINFORMATIVE_CAUSES = {'it', 'what', 'which', 'that'}


def _extract_cause_phrase(words):
    """Return the lower-cased cause phrase found in one parsed sentence, or None.

    For every word whose text is exactly 'cancer' and whose governor has a
    lemma in CAUSE_GOVERNOR_LEMMAS, each other word sharing that governor with
    a dependency relation containing 'sub' or 'obj' is selected, together with
    every word that (transitively) depends on it. The selected words are
    joined in sentence order.
    """
    selected = set()  # 1-based indices of the words forming the phrase
    for word in words:
        if word.text != 'cancer' or word.governor <= 0:
            continue
        if words[word.governor-1].lemma not in CAUSE_GOVERNOR_LEMMAS:
            continue
        for sibling in words:
            if sibling != word and word.governor == sibling.governor and \
               ('sub' in sibling.dependency_relation or 'obj' in sibling.dependency_relation):
                selected.add(int(sibling.index))
                # Grow the selection until no further dependent can be added.
                grew = True
                while grew:
                    grew = False
                    for other in words:
                        if int(other.index) not in selected and other.governor in selected:
                            selected.add(int(other.index))
                            grew = True
    if not selected:
        return None
    return ' '.join(w.text.lower() for w in words if int(w.index) in selected)


matched_sentences = 0
causelist = {}
total_sentences = 0
for idx, row in data.iterrows():
    # 'Sentences' is a stringified list when loaded from the CSV but a real
    # list when the frame comes straight from save_dataset; accept both
    # (ast.literal_eval raises on a list).
    raw_sentences = row['Sentences']
    sentences = raw_sentences if isinstance(raw_sentences, list) else ast.literal_eval(raw_sentences)
    for sentence in sentences:
        total_sentences += 1
        if not any(phrase in sentence for phrase in CAUSE_PHRASES):
            continue
        print('\n++++++++++++ Sentence from datapoint %d +++++++++++++++'%(idx+1))
        print("Sentence: ",sentence)
        doc = nlp(sentence)
        print("Cause of cancer: ", end='')
        # Only the first sentence the pipeline produces for this string is inspected.
        cause = _extract_cause_phrase(doc.sentences[0].words)
        if cause is None:
            print('No causes of cancer found in this sentense')
            continue
        if cause in UNINFORMATIVE_CAUSES:
            # Terminate the "Cause of cancer: " line before skipping, so the
            # next sentence's header does not run into it.
            print()
            continue
        matched_sentences += 1
        print(cause)
        causelist[cause] = causelist.get(cause, 0) + 1

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

++++++++++++ Sentence from datapoint 1886 +++++++++++++++
Sentence:  This is the same group that said glyphosate causes cancer.
Cause of cancer: glyphosate

++++++++++++ Sentence from datapoint 1887 +++++++++++++++
Sentence:  Pence didn't believe smoking causes cancer, opposed sterile needles.
Cause of cancer: smoking

++++++++++++ Sentence from datapoint 1888 +++++++++++++++
Sentence:  we know it can cause cancer.
Cause of cancer: 
++++++++++++ Sentence from datapoint 1888 +++++++++++++++
Sentence:  but literally everything causes cancer.
Cause of cancer: everything

++++++++++++ Sentence from datapoint 1888 +++++++++++++++
Sentence:  everyday things like using deodorant or being outside w/o sunscreen can cause cancer.
Cause of cancer: everyday things like using deodorant or being outside w/o sunscreen

++++++++++++ Sentence from datapoint 1889 +++++++++++++++
Sentence:  Pence didn't believe smoking causes cancer, oppos

In [None]:
# Result analysis / statistics for the dependency-based approach.
# NOTE(review): 3632 is a hard-coded denominator - presumably the number of
# candidate sentences; confirm where it comes from. The printed value is a
# fraction, not a percentage.
print("\nPercentage of sentences matched with rules:", matched_sentences/3632)
print("\nCauses identified and their counts: ")
# Report only causes that occur at least `threshold` times.
threshold = 15
for cause, count in causelist.items():
    if len(cause) <= 1 or count < threshold:
        continue
    print("Cause: %-25s"%cause,"\t-- count: ",count)


Percentage of sentences matched with rules: 0.48265418502202645

Causes identified and their counts: 
Cause: everything                	-- count:  72
Cause: smoking                   	-- count:  168
Cause: tobacco                   	-- count:  21
Cause: wifi                      	-- count:  19
Cause: red meat                  	-- count:  22
Cause: the noise from windmills  	-- count:  23
Cause: meat                      	-- count:  31
Cause: windmill noise            	-- count:  71
Cause: coffee                    	-- count:  19
Cause: the noise                 	-- count:  27
Cause: bacon                     	-- count:  60
Cause: obesity                   	-- count:  20
Cause: wind                      	-- count:  24


In [None]:
# Headline numbers: sentences with an extracted cause, and distinct cause phrases.
print('No of sentences matched: ', matched_sentences, sep='\n')
print('No of causes extracted: ', len(causelist), sep='\n')

No of sentences matched: 
1753
No of causes extracted: 
829
