In [None]:
import csv
import dns
import dnspython

import json
import logging
import multiprocessing
import nltk
import numpy as np
import os
import pandas as pd
import pickle
import pymongo
import random
import re
import requests
import requests.exceptions
import scispacy
import spacy
# import spacy_transformers

from datetime import date
from nltk import tokenize
from pathlib import Path
from requests_futures.sessions import *

from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker
from sklearn.metrics import matthews_corrcoef, f1_score, confusion_matrix, precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

from tqdm import tqdm, trange
from typing import Dict
from urllib.parse import urlsplit
from urllib.parse import urlparse

from nltk.tokenize import word_tokenize, sent_tokenize
from string import punctuation

#nltk.download('punkt')
%matplotlib inline

In [None]:
# Notebook-wide logger; messages at INFO level and above pass the logger's own level check.
logger = logging.getLogger('claim_filtering')
logger.setLevel(logging.INFO)

# Use fully-qualified option names: the short pattern "max_rows" matches more than one
# option in recent pandas releases (e.g. "styler.render.max_rows") and raises an OptionError.
pd.set_option("display.max_colwidth", 100)
pd.set_option("display.max_rows", 50)


In [None]:
# MongoDB connection: one client, the "pubhealth" database and its two collections

db_client = pymongo.MongoClient("...") # Link to DB
db = db_client["pubhealth"]
train_col = db["trainset"]
test_col = db["testset"]
test_col = db.testset


In [None]:
# Load the initial training and test set into MongoDB.
# Guarded by a flag because both collections are emptied before being re-filled.
insert_total_set = False

if insert_total_set:
    train_data = pd.read_csv('datasets/train.tsv', sep='\t')
    test_data = pd.read_csv('datasets/test.tsv', sep='\t')
    # DataFrame.drop returns a new frame; the result has to be assigned back, otherwise
    # the "Unnamed: 0" column is still present when the records are inserted.
    # errors="ignore" covers the case where the column does not exist.
    test_data = test_data.drop(columns=["Unnamed: 0"], errors="ignore")

    print(f'Train dataset contrains {len(train_data)} total entries.\n')
    print(f'Test dataset contrains {len(test_data)} total entries.\n')

    print(train_data.columns)

    # replace the full content of the train collection
    train_col.delete_many({})
    train_dict = train_data.to_dict("records")
    train_col.insert_many(train_dict)

    # replace the full content of the test collection
    test_col.delete_many({})
    test_dict = test_data.to_dict("records")
    test_col.insert_many(test_dict)


In [None]:
# Switches deciding which of the two sets is pulled from MongoDB
read_train_set, read_test_set = True, True

if read_train_set:
    # only documents flagged as relevant; train_col.find() would return the full set
    cursor = train_col.find({'relevant_claim': True})
    train_data = pd.DataFrame([document for document in cursor])
    print(f"Length of training set: {len(train_data)}")

if read_test_set:
    cursor = test_col.find({'relevant_claim': True})
    test_data = pd.DataFrame([document for document in cursor])
    print(f"Length of test set: {len(test_data)}")

train_data.head(1)


#### Load medical corpus/entities 

In [None]:
# Read the saved medical term corpus: every field of every CSV row is one term.
med_corpus = []
# newline='' is what the csv module asks for on files passed to csv.reader; without it
# newlines embedded in quoted fields are not interpreted correctly.
with open(r'./medical_corpus/medical_corpus.csv', 'r', newline='') as file:
    csv_reader = csv.reader(file, delimiter=',', quoting=csv.QUOTE_ALL)
    for row in csv_reader:
        med_corpus.extend(row)

len(med_corpus)

In [None]:
# Read the saved ConceptNet entities of the medical corpus: every CSV field is one entity.
medical_entities = []
# newline='' is what the csv module asks for on files passed to csv.reader; without it
# newlines embedded in quoted fields are not interpreted correctly.
with open(r'./medical_corpus/medical_cn_entities.csv', 'r', newline='') as file:
    csv_reader = csv.reader(file, delimiter=',', quoting=csv.QUOTE_ALL)
    for row in csv_reader:
        medical_entities.extend(row)

len(medical_entities)

#### 1. Create and save medical corpus using corpus from Wikipedia, Harvard, UMich, Schulich 
#### 2. For each term in corpus => find entity in ConceptNet


In [None]:
# Iterate over the term files and collect all terms in one de-duplicated list.
create_corpus = False

if create_corpus:
    # Files written by this notebook into the same folder. They are outputs, not
    # glossary inputs, and presumably lack the "Column 1" header read below, so they
    # are skipped when the cell is run again.
    generated_files = {'medical_corpus.csv', 'medical_cn_entities.csv'}

    med_corpus = []
    for file in os.listdir('./medical_corpus'):
        if file in generated_files:
            continue
        temp = pd.read_csv(os.path.join('./medical_corpus', file))
        # keep the lower-cased text in front of the first ":" of every entry
        med_corpus.extend([entry.strip().lower().split(":")[0] for entry in temp["Column 1"]])

    med_corpus = list(set(med_corpus))
    print(len(med_corpus))

    # The writer calls belong inside the `with` block; un-indented they are a syntax
    # error. newline='' is the csv module's requirement for files given to csv.writer.
    with open(r'./medical_corpus/medical_corpus.csv', 'w', newline='') as file:
        wr = csv.writer(file, quoting=csv.QUOTE_ALL)
        wr.writerow(med_corpus)


In [None]:
# ConceptNet (CN) entities for the medical corpus:
# every corpus term is resolved to its ConceptNet node(s), and each node is extended by
# the nodes returned by get_sub_topics.
# NOTE: get_conceptnet_entity and get_sub_topics are defined in later cells of this
# notebook; those cells have to be executed before this flag is switched on.

entities_for_corpus = False 

if entities_for_corpus:
    medical_entities = []
    for term in med_corpus:
        for cn_entity in get_conceptnet_entity(term): 
            medical_entities.append(cn_entity)
            medical_entities.extend(get_sub_topics(cn_entity))

    print(len(medical_entities))
    # The writer calls belong inside the `with` block; un-indented they are a syntax
    # error. newline='' is the csv module's requirement for files given to csv.writer.
    with open(r'./medical_corpus/medical_cn_entities.csv', 'w', newline='') as file:
        wr = csv.writer(file, quoting=csv.QUOTE_ALL)
        wr.writerow(medical_entities)


## Filtering PubHealth Claims 


### (1) NER with SciSpacy 

In [None]:
# spaCy pipeline based on the "en_core_sci_lg" model
nlp = spacy.load("en_core_sci_lg")

# Abbreviation detection is added as an extra pipe.
nlp.add_pipe("abbreviation_detector")

# Entity linker against UMLS, configured to resolve abbreviations.
nlp.add_pipe(
    "scispacy_linker",
    config=dict(
        resolve_abbreviations=True,
        linker_name="umls",
        threshold=0.3,
        no_definition_threshold=0.6,
        filter_for_definitions=True,
        max_entities_per_mention=15,
    ),
)


In [None]:
def get_entities(claim: str) -> list: 
    """
    Returns for a claim a list of recognized entities using SciSpacy.

    Parameters:
    claim (str): claim text; any non-string value (e.g. a missing value) yields an empty list

    Returns:
    list: text of every entity in `doc.ents` of the notebook-level `nlp` pipeline
    """
    # isinstance also accepts subclasses of str, which the exact type comparison rejected
    if not isinstance(claim, str):
        return []
    return [entity.text for entity in nlp(claim).ents]

### (2) Extracting public health related claims (add. sources used: medical term corpus, ConceptNet)


In [None]:
def get_sub_topics(topic: str) -> list:
    """
    Queries ConceptNet for all "/r/IsA" edges that end in the given topic.

    Parameters:
    topic (str): ConceptNet node used as the `end` of the queried edges

    Returns:
    list: 'term' of the start node of every returned edge (empty if the answer has no edges)

    Raises:
    requests.exceptions.RequestException: on a timeout, a connection problem or an HTTP error status
    """
    url = "http://api.conceptnet.io/query"
    params = {
        'end': topic,
        'rel': "/r/IsA",
        'limit': 10000
    }
    # without a timeout requests waits indefinitely for an unresponsive server
    response = requests.get(url, params=params, timeout=30)
    # surface HTTP errors directly instead of failing later on a missing 'edges' key
    response.raise_for_status()
    edges = response.json().get('edges', [])
    return [edge['start']['term'] for edge in edges]


In [None]:
def get_hop_entities(cn_entity) -> list:
    """
    Returns the English ConceptNet nodes reachable from `cn_entity` over one "/r/IsA" edge.

    Best effort: any error during the request or while reading the answer is printed
    and an empty list is returned.

    Parameters:
    cn_entity: ConceptNet node used as the `start` of the queried edges

    Returns:
    list: distinct 'term' values of the end nodes whose term contains "/c/en" (order not defined)
    """
    url = "http://api.conceptnet.io/query"
    hop_entities = []
    try:
        # Type of
        params = {
            'start': cn_entity,
            'rel': "/r/IsA",
            'limit': 10000
        }
        # without a timeout requests waits indefinitely for an unresponsive server;
        # a timeout is reported by the except branch below like every other error
        result = requests.get(url, params=params, timeout=30).json()
        # the set removes duplicates, only English nodes are kept
        hop_entities = list({edge['end']['term'] for edge in result['edges']
                             if "/c/en" in edge['end']['term']})

    except Exception as e:
        print(f"Following error occured while execution of function get_hop_entities: {e}.")

    return hop_entities


In [None]:
def get_conceptnet_entity(entity: str):
    """
    Looks up the English ConceptNet node for an entity.

    The entity is lower-cased, stripped and its spaces are replaced by underscores
    before it is appended to the ConceptNet base URL.

    Parameters:
    entity (str): entity text, e.g. as recognized by SciSpacy

    Returns:
    list: one-element list holding the '@id' of the node, or an empty list if the
          answer contains an 'error' entry
    """
    entity_merged = entity.lower().strip().replace(" ", "_")
    base_url =  'http://api.conceptnet.io/c/en/'
    url = base_url+entity_merged

    # without a timeout requests waits indefinitely for an unresponsive server
    result = requests.get(url, timeout=30).json()

    if 'error' in result:
        error = result['error']
        # fall back to the whole error entry if it carries no 'details' field
        details = error.get('details', error) if isinstance(error, dict) else error
        print(f"The following error occurred during execution of function 'get_conceptnet_entity': {details}.")
        return []

    return [result['@id']]


In [None]:
def pubhealth_match(cn_entry: str, pubhealth_topics: set) -> bool:
    """
    Checks whether a ConceptNet node has a "/r/HasContext" edge into the given topics.

    Best effort: any exception during the request or while reading the answer is
    printed and treated as "no match".

    Parameters:
    cn_entry (str): ConceptNet node used as the `start` of the queried edges
    pubhealth_topics (set): ConceptNet nodes counting as public health topics
                            (any iterable of nodes is accepted)

    Returns:
    bool: True if at least one end node of the edges is contained in pubhealth_topics
    """
    try:
        url = "http://api.conceptnet.io/query"
        params = {
            'start': cn_entry,
            'rel': "/r/HasContext",
            'limit': 10000
        }
        # The 'error' entry is part of the decoded JSON body. Testing the Response
        # object itself never finds it, so the body is parsed before the check.
        # The timeout keeps an unresponsive server from blocking the worker forever.
        result = requests.get(url, params=params, timeout=30).json()
        if 'error' in result:
            error = result['error']
            details = error.get('details', error) if isinstance(error, dict) else error
            logger.info(f"The following error occurred during execution of function 'pubhealth_match': {details}.")
            return False

        end_nodes = {edge['end']['term'] for edge in result['edges']}
        # an empty set is disjoint from everything, so no separate emptiness check is needed
        return not end_nodes.isdisjoint(pubhealth_topics)

    except Exception as e:
        print(f"Following error occured while execution of function pubhealth_match: {e}.")

    return False


In [None]:
def hasMedicalContext_ConceptNet(input_data, med_corpus = med_corpus, med_entities = medical_entities) -> bool: 
    """
    Returns True if claim related to a PubHealth topic. ConceptNet is used to determine this
    if the keyword approach is not enough.

    Procedure:
    # 1. check if medical keyword (from corpus) in claim => return true 
    # 2. get ConceptNet nodes for the claim's entities
    # 3. check if one of these nodes HasContext in med_entities => return true 
    (A one-hop IsA check of the claim's entities is not implemented in this function.)

    Best effort: any exception is printed and the claim is reported as not related.

    Parameters:
    input_data(tuple): (claim, entities linked with SciSpacy, row index)
    med_corpus(list): list of medical terms scraped from medical glossaries (Wikipedia, Harvard, UMich, Schulich)
    med_entities(list): list of ConceptNet nodes linked to med_corpus

    Returns: 
    bool: True if claim related to public health, otherwise False
    """

    try:

        claim, entities, index = input_data
        print(f"Index of processed row: {index}.")

        # 1. 
        # Tokens that do not count as words. Built locally so the function does not
        # depend on the `punc` variable that is only defined in a much later cell.
        ignored_tokens = set(punctuation) | {"''", "``"}
        # tokenize once instead of once per corpus keyword
        claim_tokens = {w for w in word_tokenize(claim) if w not in ignored_tokens}
        # NOTE(review): the comparison is case-sensitive while the corpus creation cell
        # lower-cases all terms - confirm whether the tokens should be lower-cased too.
        if any(med_keyword in claim_tokens for med_keyword in med_corpus):
            # exact keyword match, claims related to public health
            print("Keyword match successful.")
            return True

        # 2. 
        for entity in entities:
            # for entities detected with SciSpacy, get corresponding ConceptNet nodes
            for cn_entity in get_conceptnet_entity(entity): 

                # 3. 
                print(f"Match being checked for the following entity: {cn_entity}")
                # use the med_entities argument; the global list was used here before,
                # which silently ignored any value passed by the caller
                if pubhealth_match(cn_entity, pubhealth_topics = med_entities): 
                    return True # medical context found in one of the entities, further search not required

    except Exception as e:
        print(f"Following error occured while execution of function hasMedicalContext_ConceptNet: {e}.")

    # claim not related to public health, return false
    return False
 

In [None]:
def has_pubhealth_context_multiprocess(df: pd.DataFrame, column_name = "entities", num_processes: int = 5):
    """
    Runs hasMedicalContext_ConceptNet for every row of a dataframe in a process pool.

    Parameters:
    df (pd.DataFrame): frame with a "claim" column and an entity column
    column_name (str): name of the column holding the entities of each claim
    num_processes (int): number of worker processes (default 5, the value used so far;
                         multiprocessing.cpu_count() is an alternative)

    Returns:
    list: one bool per row, in row order
    """
    print(f"Number of processes: {num_processes}")

    # The context manager releases the worker processes even if map() raises.
    # map() only returns once all results are available, so nothing is cut short.
    with multiprocessing.Pool(processes=num_processes) as pool:
        result = pool.map(hasMedicalContext_ConceptNet,
                          zip(df["claim"], df[column_name], df.index.values))

    return result


### (3) Filter claims containing uncertainty indicators e.g. "might", "could", ...

In [None]:
# Words and phrases treated as uncertainty indicators (duplicates have no effect in a set).
uncertainty_corpus = {
    # likelihood / frequency adverbs
    'perhaps', 'barely', 'maybe', 'probably', 'possibly', 'apparently',
    'sometimes', 'mostly', 'occasionally', 'frequently', 'now and then',
    'approximate', 'approximately', 'debatable', 'potentially', 'theoretically',
    # verbs of belief and modal verbs
    'suppose', 'guess', 'imagine', 'think', 'believe', 'assume', 'presume',
    'may', 'might', 'could', 'shall', 'should',
    # doubt
    'doubt', 'doubtful', 'not sure', 'unlikely', 'uncertain', 'whether',
    # vague quantifiers
    'many', 'few', 'some', 'much', 'numerous', 'plenty', 'lot', 'lots',
    'several', 'little', 'most', 'enough',
}


In [None]:
def match_claim(claim:str, corpus: set) -> bool:
    """
    Returns True if the claim contains at least one entry of the given corpus.

    Matching is case-insensitive on the claim side (the claim is lower-cased) and works
    on whitespace-separated tokens. Punctuation attached to a token ("could," / "total.")
    is ignored, and corpus entries consisting of several words ("not sure") are matched
    as a sequence of consecutive tokens.

    Parameters:
    claim (str): claim text; other values are converted with str()
    corpus (set): lower-case keywords or phrases

    Returns:
    bool: True if any corpus entry occurs in the claim, otherwise False
    """
    raw_tokens = str(claim).lower().split()
    clean_tokens = [token.strip(punctuation) for token in raw_tokens]
    clean_tokens = [token for token in clean_tokens if token]

    # single-word entries; raw tokens are kept so entries containing punctuation still match
    if not (set(raw_tokens) | set(clean_tokens)).isdisjoint(corpus):
        return True

    # multi-word entries can never equal a single token, look for them as whole phrases;
    # padding with spaces restricts the match to complete words
    padded_text = " " + " ".join(clean_tokens) + " "
    return any(" " in entry and f" {entry} " in padded_text for entry in corpus)


### (4) Find "typical" table fact claims in PubHealth (get inspiration from TabFact)

In [None]:
def has_pos_tag(claim: str, tag_filter: str, corpus = {}) -> bool:
    """
    Tells whether a text (claim) contains a token whose NLTK POS tag equals the filter, e.g. "JJS".

    Parameters:
    claim (str): text where pos should be searched
    tag_filter (str): tag used to filter 
    corpus: optional keywords; if non-empty, the claim additionally has to match
            at least one of them (checked with match_claim)

    Returns:
    bool: True if tag_filter found in claim text (and the corpus restriction holds) else False 
    """
    tagged_tokens = nltk.pos_tag(tokenize.word_tokenize(str(claim)))

    # no token carries the requested tag
    if not any(pos == tag_filter for _, pos in tagged_tokens):
        return False

    # claim has to include min. one token from 'corpus'
    if corpus:
        return match_claim(claim, corpus)

    # no corpus restriction required
    return True


In [None]:
# Download the NLTK resources named below, then print the Penn Treebank tag set help.
for resource in ('tagsets', 'averaged_perceptron_tagger'):
    nltk.download(resource)
nltk.help.upenn_tagset()


In [None]:
# Working copy of the dataframe that the cells below annotate:

claims = df.copy() # TODO enter here dataframe
print("Total number of entries in dataframe: {}".format(len(df)))


In [None]:
# Count and aggregations, e.g. "sum", "total", "totally", "average"

aggregation_keywords = {
    "sum", "total", "totally", "average", "overall", "altogether", "entire", "whole",
    "range", "difference", "summarization", "combine", "combined", "all", 'every', 'each',
}
# one bool per claim: does it contain an aggregation keyword?
claims["is_aggregation"] = claims["claim"].map(lambda text: match_claim(text, aggregation_keywords))

print("Entries with aggregation keyword: " + str(int(claims["is_aggregation"].eq(True).sum())))
claims.loc[claims["is_aggregation"].eq(True), "claim"]


In [None]:
# Superlative adjectives => NLTK tag JJS

claims["has_superlative"] = claims["claim"].map(lambda text: has_pos_tag(text, "JJS"))

print("Entries with superlatives: " + str(int(claims["has_superlative"].eq(True).sum())))
claims.loc[claims["has_superlative"].eq(True), "claim"]


In [None]:
# Superlative adverbs => NLTK tag RBS

claims["has_superlative_adv"] = claims["claim"].map(lambda text: has_pos_tag(text, "RBS"))

print("Entries with superlatives: " + str(int(claims["has_superlative_adv"].eq(True).sum())))
# first ten matching claims
claims.loc[claims["has_superlative_adv"].eq(True), "claim"].head(10)


In [None]:
# Comparatives => NLTK JJR and a comparison keyword (e.g. 'than') in claim text

# 'separate' was only present in the misspelled form 'seperate', so the correctly
# spelled word never matched; the misspelling is kept so earlier matches are unchanged.
comparatives_corpus = {'than', 'difference', 'gap', 'separate', 'seperate', 'above', 'below', 'equal', 'equally'}
claims["has_comparative"] = [has_pos_tag(row["claim"], "JJR", comparatives_corpus) for index, row in claims.iterrows()]

print(f"Entries with comparatives: " + str(len(claims[claims["has_comparative"]==True])))
claims[claims["has_comparative"]==True]["claim"]


In [None]:
# Comparatives => NLTK RBR and a comparison keyword (e.g. 'than') in claim text

# 'separate' was only present in the misspelled form 'seperate', so the correctly
# spelled word never matched; the misspelling is kept so earlier matches are unchanged.
comparatives_corpus = {'than', 'difference', 'gap', 'separate', 'seperate', 'above', 'below', 'equal', 'equally'}
claims["has_comparative_adv"] = [has_pos_tag(row["claim"], "RBR", comparatives_corpus) for index, row in claims.iterrows()]

print(f"Entries with comparatives: " + str(len(claims[claims["has_comparative_adv"]==True])))
claims[claims["has_comparative_adv"]==True]["claim"][:10]


In [None]:
# Numerals => NLTK tag CD, e.g. 5th, 1994, 11,...

claims["has_numerals"] = claims["claim"].map(lambda text: has_pos_tag(text, "CD"))

print("Entries with numerals: " + str(int(claims["has_numerals"].eq(True).sum())))
claims.loc[claims["has_numerals"].eq(True), "claim"]


In [None]:
# Uniqueness wording, e.g. "only"

unique_keywords = {"only", "single", "unique", "exclusively", "individual"}
claims["has_unique_keyword"] = claims["claim"].map(lambda text: match_claim(text, unique_keywords))

print("Entries with unique keyword: " + str(int(claims["has_unique_keyword"].eq(True).sum())))
claims.loc[claims["has_unique_keyword"].eq(True), "claim"]


In [None]:
# Majority wording; only the keyword "majority" itself is checked

majority_keywords = {"majority"}
claims["has_majority_keyword"] = claims["claim"].map(lambda text: match_claim(text, majority_keywords))

print("Entries with majority keyword: " + str(int(claims["has_majority_keyword"].eq(True).sum())))
claims.loc[claims["has_majority_keyword"].eq(True), "claim"]


In [None]:
# Filtering claims longer than X tokens

# tokens that are not counted as words: punctuation characters and the two quote tokens
punc = set(list(punctuation) + ["''", "``"])

def has_short_text(claim: str, max_tokens: int = 22) -> bool:
    """
    Tells whether a claim consists of fewer than `max_tokens` word tokens.

    The claim is tokenized with word_tokenize; tokens contained in `punc` are not counted.

    Parameters:
    claim (str): claim text; other values are converted with str()
    max_tokens (int): exclusive upper bound for the number of word tokens (default 22)

    Returns:
    bool: True if the claim is non-empty and has fewer than max_tokens word tokens, else False
    """
    # empty / falsy claims never count as short text
    if not claim or len(str(claim)) == 0:
        return False
    word_count = sum(1 for w in word_tokenize(str(claim)) if w not in punc)
    return word_count < max_tokens

# flag every claim that stays below the token limit
claims["has_short_text"] = claims["claim"].map(has_short_text)
print("Entries with short text: " + str(int(claims["has_short_text"].eq(True).sum())))

claims.loc[claims["has_short_text"].eq(True), "claim"]


In [None]:
# Wider display settings for inspecting claim texts (set_option takes several name/value pairs).
pd.set_option(
    'display.max_colwidth', 200,
    'display.min_rows', 150,
    'display.expand_frame_repr', True,
)

In [None]:
# claims that are NOT short; show the first hundred
x = claims.loc[claims["has_short_text"].eq(False), "claim"]
x.head(100)

### (5) Combine previous steps: final dataset:
* only claims related to PubHealth topics
* 'is_uncertain' == False
* at least one of the remaining bool columns == True 

In [None]:
# start the combined filtering from an independent copy of the training data
df = train_data.copy(deep=True)
print("Number of initial entries is: {}".format(len(df)))


#### 1.) Filter train data for uncertain claims 

In [None]:
# flag every claim containing an entry of uncertainty_corpus
df["is_uncertain"] = df["claim"].map(lambda text: match_claim(text, uncertainty_corpus))

print("Number of remaining entries after removing uncertain one's: ")
print(int(df["is_uncertain"].eq(False).sum()))

# keep only the claims without uncertainty indicator
df = df[df["is_uncertain"].eq(False)]
df.head(1)


#### 2.) Finding "typical" table facts


In [None]:
# First run the cells of part "(4) Find "typical" table fact claims in PubHealth (get inspiration from TabFact)"
# with claims = desired_df.copy() entered at its top.

df = claims.copy(deep=True) # afterwards execute this line


#### 3.) Filter train_data to get the desired subset for further filtering based on public health relatedness

In [None]:
# Keep claims that are not uncertain and carry at least one of the "table fact" markers.
df = df.loc[
    df["is_uncertain"].eq(False)
    & df[[
        "is_aggregation",
        "has_superlative",
        "has_superlative_adv",
        "has_comparative",
        "has_comparative_adv",
        "has_numerals",
        "has_unique_keyword",
        "has_majority_keyword",
    ]].eq(True).any(axis=1)
]

print("Number of remaining train_data entries after filtering: {}".format(len(df)))


#### 4.) Filter train_data to get only short claims (less than 22 tokens)

In [None]:
# keep only the claims flagged as short text
df = df[df["has_short_text"].eq(True)]
print("Number of remaining train_data entries after filtering: {}".format(len(df)))


In [None]:
# TODO plot barplot with length of claim text



#### 5.) NER with SciSpacy


In [None]:
# entities recognized by get_entities, one list per claim
df["entities"] = df["claim"].map(get_entities)


#### 6.) Filter train data for PubHealth related claims (with corpus & ConceptNet) 


In [None]:
# the guard keeps the pool from being started when this code is imported instead of run as main
if __name__ == "__main__": 
    result = has_pubhealth_context_multiprocess(df, column_name="entities")
    df["health_related"] = result


In [None]:
print("Number of remaining train_data entries after removing one's not related to public health: ")
print(int(df["health_related"].eq(True).sum()))

# TODO get subset of train_data
# keep only the claims related to public health
df = df.loc[df["health_related"].eq(True)]


#### 7.) Filter for true/false entries (other labels are not relevant)


In [None]:
# size before / after restricting the labels to "true" and "false"
print(len(df))
df = df[df["label"].isin(["true", "false"])].copy()
df.shape[0]


#### 8.) Update entries in MongoDB 


In [None]:
# (8a) reset the field 'relevant_claim' to False for all documents
train_col.update_many({}, {'$set': {'relevant_claim': False}})

# (8b) set only those to True which are in the subset.
# The flag is written as a literal True: copying row["relevant_claim"] only wrote back
# the value loaded from the database and fails if that column is not part of the frame.
# A single update with $in replaces one round trip per row.
relevant_ids = df["_id"].tolist()
if relevant_ids:
    train_col.update_many({'_id': {'$in': relevant_ids}},
                          {'$set': {'relevant_claim': True}})


In [None]:
# FINALLY SAVE RESULTS
# NOTE(review): `train_2_health` is first assigned in the cell below this one, so this
# cell fails when the notebook is run top to bottom on a fresh kernel - confirm the
# intended cell order and which frame should be saved.

today = date.today()
# the current date becomes the prefix of the file name
path = f"datasets/{today}_train_subset_claims_filtered_second.pickle"

train_2_health.to_pickle(path)


In [None]:
# Split `train_2` by its "health_related" flag.
# NOTE(review): `train_2` is not assigned anywhere in the visible notebook - presumably
# it is the frame with the "health_related" column built above (`df`); confirm.
train_2_health = train_2.loc[(train_2["health_related"]==True)] 
train_2_non_health = train_2.loc[(train_2["health_related"]==False)]                    

# size of the full frame and of its health-related part
print(len(train_2))
print(len(train_2_health))