#### Importing packages

In [None]:
import argparse
import bs4
import elq.main_dense as main_dense
import glob
import io
import json
import matplotlib.pyplot as plt
import multiprocessing
import nltk
import numpy as np

import os
import pandas as pd
import pickle
import pickle5 as pickle # if required e.g. error with normal pickle package
import pymongo
import qwikidata

import random
import re
import requests
import requests.exceptions
import seaborn as sns
import shutil
import string
import sys

import tempfile
import tabula
import tagme
import tldextract
import torch
import transformers
import utils 
import urllib.parse
import uuid
import unicodedata
import wikipedia
import zipfile

from bs4 import BeautifulSoup
from collections import Counter, deque, OrderedDict
from concurrent.futures import *
from datetime import date
from langdetect import DetectorFactory, detect, detect_langs
from matplotlib.ticker import AutoMinorLocator
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from operator import itemgetter
from pathlib import Path
from requests_futures.sessions import *
from requests.exceptions import ReadTimeout, TooManyRedirects, ConnectionError, ConnectTimeout,\
    InvalidSchema, InvalidURL

from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import matthews_corrcoef, f1_score, confusion_matrix, precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.model_selection import train_test_split
from string import punctuation
from SPARQLWrapper import SPARQLWrapper, JSON

from torch.nn import CrossEntropyLoss, MSELoss
from torch.utils.data import Dataset, DataLoader, TensorDataset, random_split
from transformers import *
from typing import Dict
from urllib.parse import urlsplit
from urllib.parse import urlparse
from utils import *

from operator import itemgetter
from qwikidata.entity import WikidataItem, WikidataLexeme, WikidataProperty
from qwikidata.linked_data_interface import get_entity_dict_from_api
from sentence_transformers.cross_encoder import CrossEncoder
from rank_bm25 import BM25Okapi


#nltk.download('punkt')
%matplotlib inline


In [None]:
import logging

# Configure the root logger (getLogger() without a name) at INFO level;
# functions defined in later cells log their progress through this `logger`.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.info("test")  # smoke-test message


## Overview

1. <strong>Link claims to Wikimedia entities </strong>


2. <strong>Get referenced websites from Wikipedia </strong>


3. <strong>Get top Tables from extracted Evidence Websites</strong> 


Set the following variables first:


In [None]:
# Path to db config containing pre-processed data which should be used for evidence table extraction
# Or load data from _trainset_evidence_extraction.pkl and _testset_evidence_extraction.pkl
path_config_db = ''
path_trainset = ''
path_testset = ''

# Only read a pickle whose path has been filled in, so that this cell also runs
# when the data is loaded from MongoDB instead and the paths are left empty.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
if path_trainset:
    train_data = pd.read_pickle(path_trainset)
if path_testset:
    test_data = pd.read_pickle(path_testset)

# Path to ELQ model used for Wikipedia entity linking 
path_elq_model = ''


--------------

In [None]:
# MongoDB connection - skip if data directly loaded from .pkl files

db_client = pymongo.MongoClient(path_config_db) 
db = db_client.pubhealth
train_col = db.trainset
test_col = db.testset

read_train_set = True
read_test_set = True

# Each collection is read through its own find() cursor: a cursor can only be
# consumed once, so it must not be shared between the two loads.
if read_train_set:
    train_data = pd.DataFrame(list(train_col.find()))
    print(f"Length of training set: {len(train_data)}")

if read_test_set:
    test_data = pd.DataFrame(list(test_col.find()))
    print(f"Length of test set: {len(test_data)}")


In [None]:
# Preview the first rows of the training set as a quick sanity check
train_data.head()

##### Load preprocessed claims and extracted tables into a new MongoDB collection for MTurk

In [None]:
# inserted some intermediate results back into DB 
# NOTE(review): `insert_entries`, `final_col` and `table_result` are not defined in the
# cells visible here - they must be set before running this cell.
# NOTE(review): `table_result[index]` is looked up with the DataFrame index label;
# presumably train_data has a default 0..n-1 index - confirm.

if insert_entries: 
    for index, row in train_data.iterrows(): 
        # end="\r" rewrites the same console line, giving a running progress display
        print('Inserted into final_dataset', index, end="\r")
        
        # store the tables of this row on the document with the same _id
        final_col.update_one({'_id': row["_id"]},
                             {'$set': {'tables': table_result[index]}})
        

### (1) <strong>Link claims to Wikimedia items</strong> 

a. Linking with WAT (successor of TagME)

b. Linking with ELQ entity linker


#### a.) Linking with WAT (successor of TagME)
TagME is cited as a baseline in papers on Wikidata/Wikipedia entity linking, e.g. 'Efficient One-Pass End-to-End Entity Linking for Questions' (Li et al., 2020)

In [None]:
# Link every training claim with WAT, one worker process per CPU core.
# Requires the cell defining get_wikidata_items_from_txt_multiprocess (below) to have been run.
if __name__ == "__main__": 
    num_processes = multiprocessing.cpu_count()
    output = get_wikidata_items_from_txt_multiprocess(train_data, num_processes)
    
    # Each annotation is a (spot, title, qid, rho) tuple; keep only the Wikipedia title (position 1)
    wiki_entities_WAT = [[i[1] for i in entry] if any(entry) else [] for entry in output]



In [None]:
class WATAnnotation:
    """Holds the fields of a single annotation dictionary from a WAT response."""

    def __init__(self, d):
        # span boundaries of the annotated text
        self.start, self.end = d['start'], d['end']

        # annotation accuracy
        self.rho = d['rho']
        prior_explanation = d['explanation']['prior_explanation']
        self.prior_prob = prior_explanation['entity_mention_probability']

        # annotated text
        self.spot = d['spot']

        # Wikipedia entity info
        self.wiki_id, self.wiki_title = d['id'], d['title']

    def json_dict(self):
        """Return the annotation as a plain dict (the annotated text `spot` is not included)."""
        exported = ('wiki_title', 'wiki_id', 'start', 'end', 'rho', 'prior_prob')
        return {field: getattr(self, field) for field in exported}


def get_wikidata_qid(wiki_title: str) -> str: 
    """
    Returns for a Wikipedia title the corresponding Wikidata item

    Queries the English Wikipedia API (action=query, prop=pageprops) and reads the
    'wikibase_item' page property. Errors are printed, not raised.
    
    Parameters: 
    wiki_title (str): Wikipedia page title for which corresponding item is searched
    
    Returns:
    str: QID of retrieved item, or None if no item was found or the request failed
    """
    qid = None
    # Pre-bound so the error report below stays valid when the request itself
    # raises before any response object exists.
    response = None

    wikipedia_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "prop": "pageprops",
        "titles": wiki_title
    }

    try:
        # The context manager releases the session's connections after the call
        with requests.Session() as session:
            response = session.get(url=wikipedia_url, params=params)
            response_data = response.json()

        if response_data and 'query' in response_data and 'pages' in response_data['query']:
            # If several pages are returned, the last one carrying an item wins
            for page in response_data['query']['pages'].values():
                if 'pageprops' in page and 'wikibase_item' in page['pageprops']:
                    qid = page['pageprops']['wikibase_item']

    except Exception as e: 
        print(f"Following error occured while execution of function get_wikidata_qid: {e}.")
        print(f"Value of response object: {response}.")
        
    return qid


def get_wikidata_item(qid: str) -> WikidataItem: 
    """
    Fetch the Wikidata item that belongs to a QID.

    Parameters: 
    qid (str): qid of the Wikidata item to retrieve

    Returns: 
    WikidataItem: item built from the API's entity dict; None when `qid` is
    empty or the lookup raised (the error is printed)
    """
    # Nothing to look up for an empty / missing qid
    if not qid:
        return None

    try:
        return WikidataItem(get_entity_dict_from_api(qid))
    except Exception as e: 
        print(f"Following exception occurred while retrieving WikidataItem: {e}.")
        return None

    
def get_wikidata_items_from_txt(input_data) -> list: 
    """
    Return for a given text all entities linked by the WAT tagging service
    
    Parameters: 
    input_data: object consisting of a string (= text for entity linking) and an int (= index)

    Returns: 
    list: one tuple (spot, Wikipedia title, Wikidata QID, rho) per annotation; no rho
    threshold is applied. Empty list if the request fails, the response is not
    JSON, or it contains no annotations.
    """
    text, i = input_data
    
    print(f"Index of entry being printed: {i}.")
    
    wat_url = 'https://wat.d4science.org/wat/tag/tag'
    tagme.GCUBE_TOKEN = "..." # todo enter personal tagme token    
    
    payload = [("gcube-token", tagme.GCUBE_TOKEN),
               ("text", text),
               ("lang", 'en'),
               ("tokenizer", "nlp4j"),
               ('debug', 9),
               ("method",
                "spotter:includeUserHint=true:includeNamedEntity=true:includeNounPhrase=true,prior:k=50,filter-valid,centroid:rescore=true,topk:k=5,voting:relatedness=lm,ranker:model=0046.model,confidence:model=pruner-wiki.linear")]

    # Request and decoding share one try block: a failure in either leaves nothing
    # to parse, so the claim simply gets no entities instead of crashing the worker.
    response = None
    try:
        response = requests.get(wat_url, params=payload)
        response_data = response.json()
    except Exception: 
        print(f"Expection occurred while calling json() on response: {response}")
        return []

    # Error payloads may lack the 'annotations' key
    if not isinstance(response_data, dict):
        return []
    annotations = response_data.get('annotations') or []

    wikidata_entities = [(a['spot'], a['title'], get_wikidata_qid(a['title']), a['rho'])
                         for a in annotations if a]
    
    return wikidata_entities


def get_wikidata_items_from_txt_multiprocess(df: pd.DataFrame, num_processes: int):
    """
    Runs 'get_wikidata_items_from_txt' on every claim of `df` in a process pool.

    Each task is a (claim, index label) pair; the returned list holds one result
    per row, in row order.
    """
    worker_pool = multiprocessing.Pool(processes = num_processes)

    tasks = zip(df["claim"], df.index.values)
    linked = worker_pool.map(get_wikidata_items_from_txt, tasks)

    # all results are collected - shut the workers down
    worker_pool.close()
    worker_pool.join()

    return linked
    

#### b.) Linking with ELQ entity linker
Switch to the conda environment `blink37` provided by the authors of the ELQ entity linker before running the cells below. Linking has already been done for the entire training and test set.

Paper for ELQ linker: https://arxiv.org/pdf/2010.02413.pdf

Github repo: https://github.com/facebookresearch/BLINK/tree/master/elq

In [None]:
# Load ELQ model 

models_path = path_elq_model # todo the path where you stored the BLINK models
# os.path.join keeps the file paths valid whether or not models_path ends with a
# path separator (plain string concatenation silently produced e.g. "modelselq_wiki_large.bin").
config = {
    "interactive": False,
    "biencoder_model": os.path.join(models_path, "elq_wiki_large.bin"),
    "biencoder_config": os.path.join(models_path, "elq_large_params.txt"),
    "cand_token_ids_path": os.path.join(models_path, "entity_token_ids_128.t7"),
    "entity_catalogue": os.path.join(models_path, "entity.jsonl"),
    "entity_encoding": os.path.join(models_path, "all_entities_large.t7"),
    "output_path": "logs/", # logging directory
    "faiss_index": "none",
    "index_path": os.path.join(models_path, "faiss_hnsw_index.pkl"),
    "num_cand_mentions": 10,
    "num_cand_entities": 10,
    "threshold_type": "joint",
    "threshold": -4.5,
}
# main_dense expects an argparse-style namespace rather than a dict
args = argparse.Namespace(**config)
models = main_dense.load_models(args, logger=logger)


In [None]:
# Cut claim length to 128 if longer than that
# (len() and the slice below operate on the claim string, i.e. on characters)
# NOTE(review): `df` is the working DataFrame and is not defined in the cells visible
# here - presumably set to train_data or test_data beforehand; confirm.
count = 0  # number of truncated claims (counted but not printed below)
for index, row in df.iterrows(): 
    if len(row["claim"])>128:
        count += 1
        # write the shortened claim back into the DataFrame itself
        df.at[index,"claim"] = row["claim"][:128]

print(len(df))


In [None]:
### IF HUGE DATASET AND MULTIPROCESSING CHOOSE CODE BELOW

# Convert claims similarly to data_to_link dict object
# .copy() makes an independent frame, so the lower-casing below does not trigger
# pandas' SettingWithCopyWarning for assigning into a selection of df.
prediction_input = df[["claim_id", "claim"]].copy()
prediction_input["claim"] = prediction_input["claim"].str.lower()
prediction_input = prediction_input.rename(columns = {'claim_id': 'id', 'claim': 'text'})

# one {'id': ..., 'text': ...} dict per claim
data_to_link = prediction_input.to_dict('records')

# Requires the cell defining predict_multiprocessing (below) to have been run
predictions = predict_multiprocessing(data_to_link)
predictions[:10]


In [None]:
### OTHERWISE THIS:

# Convert claims similarly to data_to_link dict object
# .copy() makes an independent frame, so the lower-casing below does not trigger
# pandas' SettingWithCopyWarning for assigning into a selection of df.
prediction_input = df[["claim_id", "claim"]].copy()
prediction_input["claim"] = prediction_input["claim"].str.lower()
prediction_input = prediction_input.rename(columns = {'claim_id': 'id', 'claim': 'text'})

# one {'id': ..., 'text': ...} dict per claim
data_to_link = prediction_input.to_dict('records')

predictions = main_dense.run(args, None, *models, test_data=data_to_link)

# Add predicted Wikipedia titles as column to dataframe 
# NOTE(review): assumes predictions come back in the same order as the rows of df - confirm.
df["wiki_entities_eql"] = ""
count = 0

for index, row in df.iterrows():
    # first element of every predicted tuple is kept as the linked entity
    df.at[index,"wiki_entities_eql"] = [linking[0] for linking in predictions[count]['pred_tuples_string']]
    count += 1
    
# Compare claims, wikipedia titles old and wikipedia titles new to each other
entity_df = pd.DataFrame({'claim': df["claim"], 
                          'entities_eql': df["wiki_entities_eql"]})


In [None]:
# Show the first row of the working DataFrame to check the newly added column
df.head(1)


In [None]:
def predict_parallel(data_to_link):
    """
    Run the ELQ linker on a single record.

    The record is wrapped in a one-element list because `main_dense.run`
    receives its input through `test_data`. Uses the notebook-level `args`
    and `models`.
    """
    single_record_batch = [data_to_link]
    return main_dense.run(args, None, *models, test_data=single_record_batch)


def predict_multiprocessing(data_to_link: list):
    """
    Link all records in parallel, one 'predict_parallel' call per record.

    Uses as many worker processes as there are CPU cores and returns the
    per-record results in input order.
    """
    worker_count = multiprocessing.cpu_count()
    print(f"Number of processes: {worker_count}")

    worker_pool = multiprocessing.Pool(processes=worker_count)
    per_record_predictions = worker_pool.map(predict_parallel, data_to_link)

    # all results are collected - shut the workers down
    worker_pool.close()
    worker_pool.join()

    return per_record_predictions
    

#### Save Wiki entities to MongoDB OR .pkl file 

In [None]:
# Save the ELQ entities of every row of `df` on the document with the same _id.
# NOTE(review): the update always goes to `test_col`; presumably `df` holds the test
# set at this point - switch to `train_col` when processing the training set.

for index, row in df.iterrows():
    test_col.update_one({'_id': row["_id"]},
                        {'$set': {'wiki_entities_eql': row['wiki_entities_eql']}})

# Alternative to MongoDB: pickle the DataFrame
# path_intermediate_results = "" # set path to save intermediate results 
# df.to_pickle(path_intermediate_results)


### (2) Get Wikipedia articles (for entities linked with ELQ & WAT)

In [None]:
# Rank the Wikipedia pages of every test claim, one worker process per CPU core.
# Requires the cells defining multiprocessing_sort_wikipediapages and its helpers
# (below) to have been run.
if __name__ == "__main__": 
    num_processes = multiprocessing.cpu_count()
    result = multiprocessing_sort_wikipediapages(test_data, num_processes, col_eql = 'wiki_entities_eql', 
                                                 col_WAT = 'wiki_entities_WAT')
    # normalise falsy results (None, empty) to an empty list
    test_data["wiki_entities"] = [entry if entry and entry!=[] else [] for entry in result]
    

In [None]:
# Update MongoDB 
# `result` is a plain list in row order, so ids and results are paired by position;
# indexing it with the DataFrame's index label is only correct for a 0..n-1 index.
for doc_id, entities in zip(test_data["_id"], result):
    test_col.update_one({'_id': doc_id},
                        {'$set': {'wiki_entities': entities}})


In [None]:
# Interface lemma tokenizer from nltk with sklearn
class LemmaTokenizer:
    """Callable tokenizer: word-tokenizes a document and lemmatizes every kept token."""

    # punctuation / quote tokens that are dropped
    ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`']

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        lemmas = []
        for token in word_tokenize(doc):
            if token in self.ignore_tokens:
                continue
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

# Lemmatize the stop words
# (run through the same tokenizer as the documents so both are in lemma form)
stop_words = set(stopwords.words('english')) 
tokenizer = LemmaTokenizer()
token_stop = tokenizer(' '.join(stop_words))


In [None]:
def retrieve_and_rank(claim, docs, top_k = 20): 
    """
    Return at most `top_k` documents ranked for `claim` by 'get_top_docs_bm25'.

    Only BM25 retrieval is applied; no cross-encoder re-ranking takes place here.
    """
    bm25_ranked = get_top_docs_bm25(claim, docs, top_k)
    assert isinstance(bm25_ranked, list)

    return bm25_ranked[:top_k]


In [None]:
def get_top_docs_bm25(claim: str, docs: list, top_k = 20): 
    """
    Return the `top_k` documents that BM25 scores highest for `claim`.

    Claim and documents are tokenized by splitting on single spaces.

    Parameters:
    claim (str): query text
    docs (list): document strings to rank
    top_k (int): maximum number of documents to return

    Returns:
    list: the best-scoring documents (original strings); empty list if `docs` is empty
    """
    logger.info("Executing function 'get_top_docs_bm25'...")

    # BM25Okapi divides by the corpus size while initialising, so an empty corpus
    # would raise ZeroDivisionError - there is nothing to rank anyway.
    if not docs:
        return []

    tokenized_docs = [doc.split(" ") for doc in docs]
    bm25 = BM25Okapi(tokenized_docs)

    tokenized_query = claim.split(" ")
    return bm25.get_top_n(tokenized_query, docs, n=top_k)


In [None]:
def rerank_crossencoder(model, claim, top_docs): 
    """
    Re-rank `top_docs` by the score `model.predict` assigns to each (claim, document) pair.

    Returns the documents as a list, highest score first. Documents are used as
    dictionary keys, so duplicates collapse into a single entry.
    """
    logger.info("Executing function 'rerank_crossencoder'...")

    claim_doc_pairs = [(claim, doc) for doc in top_docs]
    scores = model.predict(claim_doc_pairs)

    score_by_doc = {}
    for doc, score in zip(top_docs, scores):
        score_by_doc[doc] = score

    ranked = sorted(score_by_doc.items(), key = itemgetter(1), reverse=True)
    return [doc for doc, _ in ranked]


In [None]:
def multiprocessing_sort_wikipediapages(df: pd.DataFrame, num_processes: int, col_eql: str, col_WAT: str):
    """
    Calls function 'sort_wikipediapages' for every row of `df` in a process pool.

    Each task is (claim, entities from `col_eql`, entities from `col_WAT`, index label);
    the returned list holds one ranked title list per row, in row order.
    """
    worker_pool = multiprocessing.Pool(processes = num_processes)

    tasks = zip(df["claim"], df[col_eql], df[col_WAT], df.index.values)
    ranked_pages = worker_pool.map(sort_wikipediapages, tasks)

    # all results are collected - shut the workers down
    worker_pool.close()
    worker_pool.join()

    return ranked_pages


In [None]:
def sort_wikipediapages(input_data) -> list: 
    """
    Rank the Wikipedia pages related to a claim by their similarity to the claim's text.

    Candidate pages are the union of the ELQ entities, the WAT entities and the
    results of a Wikipedia search for the claim.

    Parameters:
    input_data: consists of claim, entities_eql, entities_WAT, index in the given order

    Return: 
    list: up to 10 page titles, the first being the most similar to the claim;
    empty if no page text could be retrieved
    """
    claim, entities_eql, entities_WAT, index = input_data
    top_k = 10

    logger.info(f"Row with index {index} being processed...")

    assert isinstance(claim, str)
    assert isinstance(entities_eql, list)
    assert isinstance(entities_WAT, list)

    # a failing search just contributes no extra candidates
    try:
        wiki_pages = wikipedia.search(claim, results=10)
    except Exception as e:
        wiki_pages = []

    # merge entities (retrieved differently) into one list without duplicates
    pages = list(set([*entities_eql, *entities_WAT, *wiki_pages]))

    # map page text -> page title; pages that cannot be loaded are skipped
    title_by_content = {}
    for page_title in pages: 
        try:
            page = wikipedia.page(page_title)
            title_by_content[page.content] = page.title
        except Exception as e: 
            continue

    print("Start retrieval and ranking...")
    result_list = []
    if title_by_content != {}:
        docs_sorted = retrieve_and_rank(claim, list(title_by_content.keys()), top_k)
        # translate the ranked texts back into their titles
        result_list = [title_by_content[doc] for doc in docs_sorted][:top_k]

    print(f"lengths of result list is: {len(result_list)}")
    return result_list


In [None]:
# Number of entities 
# (rows whose "wiki_entities" value is not a list are left out of the statistic)
entries_len = [len(entries) for entries in train_data["wiki_entities"] if type(entries)==list]

# Box plot of the number of linked entities per claim
ax = sns.boxplot(x=entries_len)
ax.set_title("Number of Wiki entities linked for claims")


### (3) <strong>Get referenced websites from Wikipedia and rank them</strong> 

a. get websites cited in Wikipedia articles (similarity to claim) 

b. sort all websites based on similarity to claim 


#### 1.) Extract referenced websites from Wikipedia articles  (from sorted_wikipedia_articles) 

In [None]:
# get references from top_j Wikipedia articles => todo: set top_j in function 'get_wikipedia_externallinks'
# Requires the cells defining multiprocessing_get_wikipedia_externallinks and
# get_wikipedia_externallinks (below) to have been run.
# NOTE(review): the assignment to train_data["urls_wikipedia"] is commented out, yet
# get_top_wikimedia_references_multiprocessing reads that column - confirm it is
# populated elsewhere (e.g. already stored in the loaded data).

if __name__ == "__main__": 
    num_processes = multiprocessing.cpu_count()
    output = multiprocessing_get_wikipedia_externallinks(train_data, num_processes)
#     train_data["urls_wikipedia"] = [entry if any(entry) else [] for entry in output]


In [None]:
# Number of referenced Websites 
# `output` holds one list of reference URLs per claim (result of the previous cell)
entries_len = [len(entries) for entries in output]
print(pd.DataFrame(entries_len).describe())

# Box plot of the number of references per claim
ax = sns.boxplot(x=entries_len)
ax.set_title("Number of references extracted from corresp. Wikipedia articles")


#### 2.) Sort all websites based on similarity to claim

* return top-k (k=50) websites from all references
* two options tried out: a) (2,3) tfidf b) bm25 => decided to go for bm25 

In [None]:
def main(): 
    """Rank the Wikipedia references (column "urls_wikipedia") of every claim in train_data."""
    ranked_references = get_top_wikimedia_references_multiprocessing(train_data)
    return ranked_references


In [None]:
# main() ranks the references of train_data, so the result belongs on train_data
# (the statistics cell reads train_data["sorted_wiki_references"]); storing it on
# test_data would misattribute the rows or fail on a length mismatch.
if __name__ == "__main__":
    result = main()
    train_data["sorted_wiki_references"] = result


In [None]:
# Number of entities 
# (rows with an empty or non-list "sorted_wiki_references" value are left out)
entries_len = [len(entries) for entries in train_data["sorted_wiki_references"] if entries and type(entries)==list]

# Print the summary without rebinding `df`: other cells use `df` as the working
# dataset, and overwriting it with this statistics frame breaks them.
print(pd.DataFrame(entries_len).describe())

ax = sns.boxplot(x=entries_len)
ax.set_title("Number of tables extracted for each claim")


#### Deprecated: Extract references from Wikidata entities

In [None]:
# Wikidata entities have been retrieved before and ranked saved in column "wiki_entities_sorted" and "wiki_entities_WAT"
# ELQ entities
# Switch for the deprecated Wikidata route; with False the cells of this section do nothing
use_wikidata_tables = False

if __name__ == "__main__" and use_wikidata_tables: 
    # Requires the cell defining get_reference_items_title_given_multiprocess (below) to have been run
    output = get_reference_items_title_given_multiprocess(df, wiki_entities = "wiki_entities")
    # normalise results without any truthy element to an empty list
    df["sources_wikidata"] = [entry if any(entry) else [] for entry in output]


In [None]:
if use_wikidata_tables:
    # Every source entry is a single-key dict {statement: [urls]}; keep the first URL
    # of each entry whose URL list is non-empty
    wikidata_sources = [[list(entry.values())[0][0] for entry in source_list if list(entry.values())[0]] for source_list in df["sources_wikidata"]]
    df["urls_wikidata"] = wikidata_sources.copy()
    
    # Update MongoDB with wikidata URLs
    for index, row in df.iterrows():
        test_col.update_one({'_id': row["_id"]},
                            {'$set': {'urls_wikidata': row["urls_wikidata"]}})


In [None]:
def get_reference_items_title_given_multiprocess(df: pd.DataFrame, wiki_entities: str): 
    """
    Runs 'get_reference_items_titles_given' for every row of `df` in a process pool.

    The pool uses half of the available CPU cores. Each task is a
    (entities from column `wiki_entities`, index label) pair; the returned list
    holds one result per row, in row order.
    """
    half_the_cores = multiprocessing.cpu_count() // 2
    worker_pool = multiprocessing.Pool(processes = half_the_cores)

    tasks = zip(df[wiki_entities], df.index.values) # TODO set column with entities
    reference_items = worker_pool.map(get_reference_items_titles_given, tasks)

    # all results are collected - shut the workers down
    worker_pool.close()
    worker_pool.join()

    return reference_items


def get_reference_items_titles_given(input_data) -> list: 
    """
    Entity matching has been previously done, therefore only extraction of resources necessary

    For every linked Wikipedia title the Wikidata item is fetched, and for each of
    its truthy claims the reference URLs are collected.

    Parameters:
    input_data: (wiki_titles, index); a three-element form (wiki_titles, claim, index)
    is accepted as well, the middle element is ignored

    Returns:
    list: one single-key dict {statement: [reference urls]} per Wikidata claim, where
    statement is "<item title> <property label> <value>" ("" if it could not be built).
    Errors are printed and the items collected so far are returned.
    """
    return_urls = True 
    # The multiprocessing wrapper sends (titles, index) pairs; star-unpacking accepts
    # those as well as the three-element form this function originally unpacked.
    wiki_titles, *_, index = input_data
    
    logger.info(f"Function get_reference_items_titles_given being executed for index: {index}.")
    ref_items = []
    
    try:
        # extract Wikidata items for linked wiki entity
        q_items = [get_wikidata_item(get_wikidata_qid(t)) for t in wiki_titles if t]
        
        for item in q_items: 
            if not item:
                continue
            
            claim_groups = list(item.get_truthy_claim_groups().values())
            print(f"Number of claim groups extracted: {len(claim_groups)}")
            for claim_gr in claim_groups: 
                for wd_claim in claim_gr: 
                    if not return_urls:
                        ref_items.extend(get_reference_elements(wd_claim))
                        continue

                    ref_url_list = [get_ref_url(elem) for elem in get_reference_elements(wd_claim) if elem]
                    statement = ""
                    try:
                        if wd_claim.mainsnak:
                            property_value, value = get_claim_values(wd_claim)
                            statement = " ".join([str(item.get_enwiki_title()), str(property_value), str(value)])
                    except Exception as e: 
                        print(f"The following exception occurred while executing function 'get_reference_items_titles_given': {e}")

                    # the references are kept even if the statement text could not be built
                    ref_items.append({statement: ref_url_list})
    
    except Exception as e: 
        print(f"The following exception occurred while executing function 'get_reference_items_titles_given': {e}")

    return ref_items



In [None]:
def get_ref_url(ref) -> str: 
    """
    Return the URL of a reference.

    Parameters:
    ref: either a URL string, or a reference object whose `snaks` mapping
    (property id -> list of snaks) is searched for P854 ("reference URL")

    Returns:
    str: `ref` itself if it is a string, otherwise the value of the first P854 snak;
    None if the reference has no P854 snak
    """
    if isinstance(ref, str): 
        # ref is already a URL
        return ref

    # Stop at the first match so later snaks are never touched
    for snaks in ref.snaks.values():
        for snak in snaks:
            if snak.property_id == 'P854':
                return snak.datavalue.value

    return None


def has_formatter_url(pid: str): 
    """
    Tell whether the property `pid` carries a P1630 (=formatter_URL) claim.

    The property's entity dict is fetched from the Wikidata API.
    Returns True if 'P1630' is among its claims, else False.
    """
    property_claims = get_entity_dict_from_api(pid)['claims']
    return 'P1630' in property_claims


# Information on Wikidata elemens e.g. snaks, main snaks, etc. 
# https://qwikidata.readthedocs.io/en/stable/entity.html#examples

def get_reference_elements(claim: qwikidata.claim.WikidataClaim) ->list: 
    """
    Collect the external URLs referenced by a Wikidata claim.

    For every reference of the claim, each snak property is handled as follows:
    * P854 ("reference URL"): the snak values are taken as URLs.
    * P248 ("stated in"): the claim's own property is looked up; its formatter URL
      (P1630) is filled with the claim's main value, otherwise its exact match
      (P2888) is used.
    * any other property with a formatter URL (P1630): the formatter URL is filled
      with the value of the first snak.

    Parameters:
    claim (WikidataClaim): claim for which references should be retrieved
    
    Returns: 
    list: de-duplicated URLs (order not guaranteed). If an error occurs it is
    printed and the URLs collected so far are returned.
    """
    ref_list = []
    # Property dicts come from the Wikidata API; remember them for the duration of
    # this call so each property is fetched once, not once per check and reference.
    fetched_claims = {}

    def _property_claims(property_id):
        if property_id not in fetched_claims:
            fetched_claims[property_id] = get_entity_dict_from_api(property_id)['claims']
        return fetched_claims[property_id]

    try:
        for ref in claim.references:
            for pid, snaks in ref.snaks.items():
                
                if pid == "P854": # "reference URL" property
                    for snak in snaks:
                        ref_list.append(snak.datavalue.value)  

                elif pid == "P248": # "stated in" property
                    prop_claims = _property_claims(claim.property_id)
                    
                    if 'P1630' in prop_claims: # "formatter URL" property
                        formatter_url = prop_claims['P1630'][0]['mainsnak']['datavalue']['value']
                        entry_id = claim.mainsnak.datavalue.value
                        ref_list.append(formatter_url.replace('$1', str(entry_id)))
                        
                    elif 'P2888' in prop_claims: # exact match 
                        ref_list.append(prop_claims['P2888'][0]['mainsnak']['datavalue']['value'])
                        
                else:
                    # check if one of the other reference properties include P1630
                    prop_claims = _property_claims(pid)
                    if 'P1630' in prop_claims:
                        formatter_url = prop_claims['P1630'][0]['mainsnak']['datavalue']['value']
                        entry_id = snaks[0].datavalue.value
                        ref_list.append(formatter_url.replace('$1', str(entry_id)))

    except Exception as e: 
        print(f"The following error occurred in function 'get_reference_elements': {e}")
        
    ref_list = list(set(ref_list))
    
    return ref_list


def get_claim_values(claim: qwikidata.claim.WikidataClaim): 
    """
    Gives back the property name and referenced value of a Wikidata claim
    
    Parameters: 
    claim (WikidataClaim): claim for which the values should be retrieved
    
    Returns: 
    (property_value, value): a tuple of two values. `property_value` is the English
    label of the claim's property ("" if unavailable). `value` is the main snak's
    value (" " if the snak has no datavalue); if it points to another item, it is
    that item's English Wikipedia title, or the item's QID when the item cannot be
    retrieved.
    """
    main = claim.mainsnak
    value = main.datavalue.value if main.datavalue else " "
    
    if isinstance(value, dict) and "id" in value:
        qid = value["id"]
        # get_wikidata_item returns None when the lookup fails - keep the QID then
        # instead of failing on the missing item.
        linked_item = get_wikidata_item(qid)
        value = linked_item.get_enwiki_title() if linked_item else qid

    property_value = ""
    if main.property_id:
        property_dict = get_entity_dict_from_api(main.property_id)
        # Not every property has an English label; .get() avoids a KeyError
        en_label = ((property_dict or {}).get("labels") or {}).get("en") or {}
        property_value = en_label.get("value") or ""
        
    return (property_value, value)


In [None]:
def multiprocessing_get_wikipedia_externallinks(df: pd.DataFrame, num_processes: int, col_name = "wiki_entities"):
    """
    Runs 'get_wikipedia_externallinks' for every row of `df` in a process pool.

    Each task is a (titles from column `col_name`, index label) pair; the returned
    list holds one URL list per row, in row order.
    NOTE: `num_processes` is currently not used - the pool always has 10 workers.
    """
    worker_pool = multiprocessing.Pool(processes = 10)

    tasks = zip(df[col_name], df.index.values)
    links_per_row = worker_pool.map(get_wikipedia_externallinks, tasks)

    # all results are collected - shut the workers down
    worker_pool.close()
    worker_pool.join()

    return links_per_row


In [None]:
def get_wikipedia_externallinks(input_data) -> list: 
    """
    Collects the reference URLs of the given Wikipedia pages.

    `input_data` is a (page titles, index) pair. Pages are read in the given order
    until more than 800 distinct URLs have been gathered; pages that cannot be
    loaded are logged and skipped. Returns the de-duplicated URLs (order not
    guaranteed).
    """
    titles, index = input_data
    print(f"Function get_wikipedia_externallinks executed for index {index}.")
    collected_urls = []

    for wiki_title in titles:
        try:
            # enough distinct URLs gathered - skip the remaining pages
            if len(set(collected_urls)) > 800: 
                break
            collected_urls.extend(wikipedia.page(wiki_title).references)

        except Exception as e: 
            logger.info(f"Following exception occurred while retrieving website text: {e}")
            continue

    # remove duplicate urls
    unique_urls = list(set(collected_urls))
    print(f"{len(unique_urls)} URLs have been retrieved for index {index}.")

    return unique_urls


In [None]:
def rerank_docs(claim: str, documents: list, method: str, top_k: int):
    """
    Orders `documents` by their relevance to the claim text.

    method "bm25":  delegates to 'retrieve_and_rank' and returns at most `top_k` documents.
    method "tfidf": ranks all documents by the linear-kernel similarity of their
                    TF-IDF vectors (2- and 3-grams) to the claim; `top_k` is not applied.
    other values:   the documents are returned unranked.
    """
    # debug output of the call arguments
    for debug_value in (claim, len(documents), method, top_k):
        print(debug_value)

    if method == "bm25":
        # Retrieve and rerank with BM25 (from BIER benchmark paper inspired: https://arxiv.org/abs/2104.08663)
        logger.info("Reranking websites using BM25")
        return retrieve_and_rank(claim, documents, top_k = top_k)

    if method != "tfidf":
        logger.info("No appropriate method selected for ranking documents. Unranked list will be returned.")
        return documents

    logger.info("Reranking websites using Tfidf")

    # Vectorize the claim together with the documents; row 0 is the claim
    vectorizer = TfidfVectorizer(stop_words=token_stop, tokenizer=tokenizer, ngram_range=(2,3))
    tfidf_matrix = vectorizer.fit_transform([claim] + documents)

    # similarity of the claim to every row; the first entry (claim vs. itself) is dropped
    similarities = linear_kernel(tfidf_matrix[0:1], tfidf_matrix).flatten()
    score_by_doc = dict(zip(documents, (score.item() for score in similarities[1:])))

    # most similar document first
    return sorted(score_by_doc, key=score_by_doc.get, reverse=True)
        

In [None]:
def get_top_wikimedia_references_multiprocessing(df: pd.DataFrame):
    """
    Calls function 'get_top_wikimedia_references' for every row of `df` in a pool of 10 processes.

    Each task is (claim, URLs from column "urls_wikipedia", index label); the
    returned list holds one result per row, in row order.
    """
    worker_count = 10  # fixed pool size instead of multiprocessing.cpu_count()
    worker_pool = multiprocessing.Pool(processes = worker_count)

    tasks = zip(df["claim"], df["urls_wikipedia"], df.index.values)
    top_references = worker_pool.map(get_top_wikimedia_references, tasks)

    # all results are collected - shut the workers down
    worker_pool.close()
    worker_pool.join()

    return top_references


In [None]:
def get_top_wikimedia_references(input_data):
    """
    Given a list of URLs retrieved from either wikipedia or wikidata and a sentence (e.g. a claim),
    sort URLs based on their similarity to the sentence and return top k websites.

    Parameters:
    input_data (tuple): (claim, page_urls, index) - the claim text, the list of reference URLs and
                        the index of the processed row (only used for logging)

    Returns:
    list: final URLs of the successfully downloaded HTML pages, ordered as returned by
          'rerank_docs' (method 'bm25') and cut to at most 150 entries; an empty list if no
          URL was given or no page could be loaded
    """
    claim, page_urls, index = input_data
    top_k = 150 # number of pages to return after sorting
    time_out = 10 # timeout for http requests
    blacklist = ['[document]', 'noscript', 'header', 'html', 'meta', 'head', 'input', 'script', 'style', 'footer'] # ignore texts in these tags

    logger.info(f"Function 'get_top_wikimedia_references' being executed for index: {index}")

    # Rows without references (None / empty list) have nothing to download; iterating None
    # would raise a TypeError inside the worker process.
    if not isinstance(page_urls, (list, tuple)) or not page_urls:
        return []

    websites_dict = {} # cleaned page text -> final URL of that page

    with FuturesSession() as session:
        futures = [session.get(page_url, timeout=time_out) for page_url in page_urls if "web.archive.org" not in page_url]

        # Load websites and extract text
        for future in as_completed(futures):
            try:
                resp = future.result()
                content_type = resp.headers.get('Content-Type')
                if not content_type or 'text/html' not in content_type:
                    continue

                soup = BeautifulSoup(resp.content, 'html.parser', from_encoding="iso-8859-1")

                # collect every non-empty text node that is not inside a blacklisted tag
                text_nodes = soup.find_all(text=True)
                texts = " ".join(
                    str(t) for t in text_nodes
                    if t.name not in blacklist and t.parent.name not in blacklist and str(t).strip() != ""
                )

                # cleaning website's main text
                texts = texts.replace('\n', ' ')
                texts = texts.replace('</s>', '')
                # NOTE(review): second alternative looks like it was meant to be r"\w+\\\w+" - confirm
                texts = re.sub(r"\\\w+|w+\\w+", "", texts)
                texts = texts.strip()

                websites_dict[texts] = resp.url

            except TooManyRedirects:
                logger.info(f"The following error occured while retrieving a website: TooManyRedirects")

            except (ReadTimeout, ConnectTimeout) as e:
                logger.info(f"The following error occured while retrieving a website: ReadTimeout, ConnectTimeout - {e}")

            except ConnectionError as e:
                logger.info(f"The following error occured while retrieving a website: ConnectionError - {e}")

            except InvalidSchema as e:
                # The former urllib fallback referenced undefined names and its response was
                # never used, so such URLs are simply reported and skipped.
                logger.info(f"The following error occured while retrieving a website: InvalidSchema - {e}")

            except (UnicodeError, InvalidURL, ValueError):
                logger.info(f"The following error occured while retrieving a website: UnicodeError, InvalidURL")

            except Exception as e:
                logger.info(f"The following error occured while retrieving a website: {e}")

    # Rerank websites based on similarity between website's text and claim
    if not websites_dict:
        return []

    ranked_docs = rerank_docs(claim, list(websites_dict.keys()), 'bm25', top_k)
    result_list = [websites_dict[doc] for doc in ranked_docs][:top_k]

    print(f"lengths of result list is: {len(result_list)}")

    return result_list


###  <strong>Get top Tables from extracted Evidence Websites & rank them</strong> 

* Extract tables from top websites
* Rank tables based on similarity to claim text


=> with notebook Webtables_extraction_from_HTML.ipynb


* Analyse the extracted tables to understand which kinds of tables are present
* Are further preprocessing steps needed to filter out junk tables?


#### 1.) Extract tables from top websites (from Wikidata and Wikipedia references) 

In [None]:
import urllib3, socket
from urllib3.connection import HTTPConnection

# Every new urllib3 connection additionally requests send/receive buffers of 1,000,000 bytes (~1MB)
HTTPConnection.default_socket_options = [
    *HTTPConnection.default_socket_options,
    (socket.SOL_SOCKET, socket.SO_SNDBUF, 1000000),
    (socket.SOL_SOCKET, socket.SO_RCVBUF, 1000000),
]


In [None]:
def main():
    """Run 'convert_tables_multiprocess' on train_data for the column "sorted_wiki_references"."""
    tables_per_row = convert_tables_multiprocess(train_data, column_name="sorted_wiki_references")
    return tables_per_row


In [None]:
# Start the table extraction only when this code runs as the main module;
# `result` is used by the following cells.
if __name__ == "__main__": 
    result = main()
#     test_data["tables_wiki_references"] = [[t for t in entry if t!=[]] if entry and any(entry) else [] for entry in result]
        

In [None]:
# Persist the extraction result so that it can be reloaded later
with open("train_data_tables_wiki_references.pkl", mode="wb") as pickle_file:
    pickle.dump(result, pickle_file)
    

In [None]:

# path = f"train_data_tables_wiki_references_{begin_index}till{end_index}.pkl"
path = f"train_data_tables_wiki_references_{begin_index}till_end.pkl"

# NOTE: unpickling can execute arbitrary code - only load files written by this notebook.
with open(path, "rb") as file:
    x = pickle.load(file)

# x[0] belongs to the row at position begin_index, x[1] to the following row, and so on.
# Pairing by position (instead of `index - begin_index`) stays correct when the index
# labels of train_data are not a gap-free 0..n-1 range.
# for offset, index in enumerate(train_data.index[begin_index:end_index]):
for offset, index in enumerate(train_data.index[begin_index:]):
    train_data.at[index, "tables_wiki_references"] = x[offset]


In [None]:
train_data["tables_wiki_references"] = ""

# `result` comes from convert_tables_multiprocess and holds one list of tables per row of
# train_data (in row order), so each row receives its own entry instead of the whole list.
for position, index in enumerate(train_data.index):
    train_data.at[index, "tables_wiki_references"] = result[position]
    

In [None]:
# Number of tables per claim (only rows holding a non-empty list are counted)
non_empty_lists = (
    entries for entries in train_data["tables_wiki_references"]
    if entries and type(entries) == list
)
entries_len = [len(entries) for entries in non_empty_lists]
df = pd.DataFrame(entries_len)

print(df.describe())

ax = sns.boxplot(x=entries_len)
ax.set_title("Number of tables extracted for each claim")


In [None]:
from utils import * 

def _is_rankable_table(t: dict) -> bool:
    """Return False for tables that are too large or too verbose to be ranked."""
    rows = t["rows_list"]
    header_horizontal = t["header_horizontal"]
    header_vertical = t["header_vertical"]

    if len(rows) > 15: # more than 15 rows
        return False
    if header_horizontal and len(header_horizontal) > 10: # more than 10 horizontal header cells
        return False
    if header_vertical and len(header_vertical) > 10: # more than 10 vertical header cells
        return False
    if any(len(cell) > 250 for row in rows for cell in row): # a cell with more than 250 characters
        return False
    if any(len(row) > 15 for row in rows): # a row with more than 15 columns
        return False
    if header_horizontal and max(len(cell) for cell in header_horizontal) > 100: # header cell > 100 characters
        return False
    if header_vertical and max(len(cell) for cell in header_vertical) > 100: # header cell > 100 characters
        return False
    return True


def _table_comparison_text(t: dict) -> str:
    """Return the text a table is compared with: caption, else header, else the table text."""
    comparison_txt = get_table_caption(t)
    if not comparison_txt.strip() or len(comparison_txt) < 10:
        comparison_txt = get_table_header(t)
        if not comparison_txt.strip() or len(comparison_txt) < 10:
            comparison_txt = get_table_text(t)
    return comparison_txt


def rerank_tables(claim: str, tables: list, top_k: int):
    """
    Reranks the list of tables according to their relevance given the claim text using BM25.

    Oversized tables are dropped first (see '_is_rankable_table'). Each remaining table is
    represented by its caption, header or full text; tables with identical text are kept once.

    Parameters:
    claim (str): claim text used as query
    tables (list): table dictionaries with the keys 'rows_list', 'header_horizontal' and
                   'header_vertical'
    top_k (int): maximum number of tables to return

    Returns:
    list: the (at most) top_k table dictionaries, most relevant first; an empty list if no
          table is left after filtering or if ranking fails
    """
    assert isinstance(tables, list)

    logging.info(f"Reranking websites using BM25")
    try:
        table_entries = dict() # comparison text -> table
        for t in tables:
            if _is_rankable_table(t):
                table_entries[_table_comparison_text(t)] = t

        if not table_entries:
            return [] # BM25 cannot be built on an empty corpus

        documents = list(table_entries.keys())
        candidate_tables = list(table_entries.values())

        # BM25
        tokenized_docs = [doc.split(" ") for doc in documents]
        bm25 = BM25Okapi(tokenized_docs)

        tokenized_query = claim.split(" ")
        # candidate_tables is parallel to the indexed corpus, so the best tables come back directly
        top_doc = bm25.get_top_n(tokenized_query, candidate_tables, n=top_k)

        return top_doc

    except Exception as e:
        logging.info(f"Following error occured whiled ranking tables: {e}")
        return []
        

In [None]:
# Rank the tables of every claim and keep at most 15 tables per claim
table_result = []

for index, row in train_data.iterrows():
    print(f"Index of processed row: {index}")
    ranked = rerank_tables(claim=row["claim"], tables=row["tables_wiki_references"], top_k=15)
    table_result.append(ranked)


In [None]:
# Update MongoDB 

# NOTE(review): assumes table_result[0] belongs to the row with index label 400 - confirm
# that table_result was computed for exactly this slice of train_data.
first_row = 400

for index, row in train_data[first_row:].iterrows():
    print('Updating index',index,end='\r')
    try:
        train_col.update_one({'_id': row["_id"]},
                             {'$set': {'sorted_tables_wikipedia': table_result[index-first_row]}})
    except Exception as e:
        # report the cause as well - the bare index does not tell why the update failed
        print(f"{index}: {e}")
        

In [None]:
# Number of ranked tables per claim (only non-empty list results are counted)
entries_len = list(
    map(len, filter(lambda entries: entries and type(entries) == list, table_result))
)
df = pd.DataFrame(entries_len)

print(df.describe())

ax = sns.boxplot(x=entries_len)
ax.set_title("Number of tables extracted for each claim")


In [None]:
def clean_cell_value(cell_val: str) -> str:
    """
    Normalises a cell text: transliterates to ASCII (characters without ASCII form are dropped),
    removes wiki templates and collapses all whitespace to single blanks.

    Parameters: 
    cell_val (str): string to be cleaned

    Returns: 
    str: cleaned value
    """
    val = unicodedata.normalize('NFKD', cell_val)
    val = val.encode('ascii', errors='ignore').decode('ascii')

    val = clean_wiki_template(val)
    val = re.sub(r'\s+', ' ', val).strip()

    return val

def clean_wiki_template(text):
    """
    Removes wiki templates ('{{...}}') from text.

    If the whole text is one template, its last '|'-separated field is kept
    (e.g. '{{nowrap|Hello}}' -> 'Hello'). Otherwise every template is removed on its own,
    so text standing between two templates is preserved.

    Parameters:
    text (str): text possibly containing wiki templates

    Returns:
    str: text without template markup
    """
    # '}} ... {{' inside the text means several templates, not a single wrapping one
    is_single_template = re.match(r'^{{.*}}$', text) and not re.search(r'}}.*{{', text)

    if is_single_template:
        text = text[2:-2].split('|')[-1]  # remove {{ }}, and retain the last
    else:
        # non-greedy: a greedy '.*' would also delete the text between two templates
        text = re.sub(r'{{.*?}}', '', text)

    return text


In [None]:
def tag_visible(element):
    """
    Tells whether a text node counts as visible page text: its parent tag must not be one of
    the excluded tags and the node must not be an HTML comment.
    """
    excluded_parents = ('style', 'script', 'head', 'title', 'meta', '[document]', 'table', 'th', 'td', 'footer')
    inside_excluded_tag = element.parent.name in excluded_parents
    return not (inside_excluded_tag or isinstance(element, bs4.element.Comment))


In [None]:
def convert_tables_multiprocess(df: pd.DataFrame, column_name: str):
    """
    Calls function 'convert_tables_parallel' for every row of df using a pool of worker processes
    (one process per CPU core).

    Parameters:
    df (pd.DataFrame): needs the columns column_name, "claim" and "wiki_entities"
    column_name (str): column holding the list of URLs of each row

    Returns:
    list: one result of 'convert_tables_parallel' per row, in row order; an empty list if the
          processing failed (the error is logged)
    """
    num_processes = multiprocessing.cpu_count()
#     num_processes = 10
    logger.info(f"Number of processes: {num_processes}")

    result = [] # returned as is when the processing below fails
    pool = None
    try:
        # build the work items first so that a missing column fails before workers are started
        work_items = zip(df[column_name], df["claim"], df["wiki_entities"], df.index.values)
        pool = multiprocessing.Pool(processes=num_processes)
        result = pool.map(convert_tables_parallel, work_items)

    except Exception as e: 
        logger.info(f"The following exception occurred while function 'convert_tables_multiprocess': {e}")

    finally:
        # release the worker processes on success and on failure
        if pool is not None:
            pool.close()
            pool.join()

    return result


In [None]:
def _meta_content(soup, name: str) -> str:
    """Return the 'content' attribute of <meta name=...>, or "" if tag or attribute is missing."""
    tag = soup.find('meta', {"name": name})
    # `"content" in tag` would test the tag's children, not its attributes - use .get()
    content = tag.get("content") if tag else None
    return content if isinstance(content, str) else ""


def convert_tables_parallel(input_data) -> list:
    """
    iterates over a list of urls and returns their tables in a list.
    Additional settings: time_out

    A website is skipped if it is not served as text/html, if it is detected as non-English,
    if fewer than three of its words match the keywords taken from claim and wiki entities,
    or if it contains no table.

    Parameters:
    input_data (tuple): (page_urls, claim, wiki_entities, index)
        page_urls (list): list of source urls 
        claim (str): claim text, used to build the keyword list
        wiki_entities (list): entity names, used to build the keyword list
        index: index of the row processed (only used for logging)

    Returns: 
    list: list containing all tables (as dictionaries, see 'html_tables_to_dict') found on
          source url pages; empty if nothing was found or the setup failed

    """
    conv_tables = [] # defined up front so that the final return also works if the setup fails

    try:
        page_urls, claim, wiki_entities, index = input_data
        logger.info(f"Index of processed row: {index}")

        if not page_urls or not isinstance(page_urls, list):
            return [] # if URL list is empty 

        # set of keywords the website will be compared against
        keywords = {w.lower() for wiki_entity in wiki_entities for w in tokenizer(wiki_entity) if w not in token_stop}
        keywords.update(w.lower() for w in tokenizer(claim) if w not in token_stop)

        seen_tables = set() # (url, html) of tables already collected; table ids are random
        time_out = 10 # adjust if longer timeout needed

        # Iterate over list of urls in parallel 
        with FuturesSession() as session:
            futures = [session.get(page_url, timeout=time_out) for page_url in page_urls if "web.archive.org" not in page_url] 

            for future in as_completed(futures):
                try:
                    page = future.result(timeout=3)  
                    content_type = page.headers.get('Content-Type')

                    # Check header before further processing
                    if not content_type or 'text/html' not in content_type:
                        # skip this website 
                        logger.info("Website skipped because content_type not text/html.")
                        continue 

                    soup = BeautifulSoup(page.content, 'html.parser', from_encoding="iso-8859-1")

                    # check if English website, otherwise skip website  
                    texts = soup.find_all(text=True)
                    text = u" ".join(t.strip() for t in filter(tag_visible, texts))
                    if text.strip() != "" and detect(text) != "en": # detect language only once
                        logger.info("Given URL is a non-English website.")
                        continue

                    keyword_match = [w for w in text.split(" ") if w.lower() in keywords]
                    if len(keyword_match) < 3: 
                        # if less than three keywords occur in website's texts => skip
                        continue

                    # Extract all HTML tables 
                    tables = soup.find_all('table') 
                    if not tables: # no table on website => skip
                        continue

                    # Adding further textual sources from website: title and meta data if available 
                    head = soup.find('head')
                    title_tag = head.find('title') if head else None
                    title = title_tag.text if title_tag and title_tag.text else ""
                    title = clean_cell_value(title.strip())

                    meta_parts = (_meta_content(soup, "description"), _meta_content(soup, "keywords"))
                    meta_data = clean_cell_value(" ".join(part for part in meta_parts if part).strip())

                    table_dict_list = html_tables_to_dict(tables = tables, title = title, meta_data = meta_data, 
                                                          page_url = page.url) # gives back a list of table dictionaries 

                    # add every table once (e.g. when the same page is referenced twice)
                    for table_dict in table_dict_list:
                        table_key = (table_dict["url"], table_dict["html_table"])
                        if table_key not in seen_tables:
                            seen_tables.add(table_key)
                            conv_tables.append(table_dict)

                except TooManyRedirects:
                    logger.info(f"The following error occured while retrieving a website: TooManyRedirects")

                except (ReadTimeout, ConnectTimeout) as e:
                    logger.info(f"The following error occured while retrieving a website: ReadTimeout, ConnectTimeout - {e}")

                except ConnectionError as e:
                    logger.info(f"The following error occured while retrieving a website (Connection error) - {e}")

                except (UnicodeError, InvalidURL, ValueError):
                    logger.info(f"The following error occured while retrieving a website: UnicodeError, InvalidURL")

                except Exception as e: 
                    logger.info(f"The following exception occurred while executing 'convert_tables_parallel': {e}")

    except Exception as e:
        logger.info(f"The following exception occurred while starting function 'convert_tables_parallel': {e}")

    return conv_tables


In [None]:
def rerank_tables_multiprocess(df: pd.DataFrame):
    """
    Calls 'rerank_tables_parallel' (defined outside this cell) for every row of df using a pool
    of five worker processes.

    Parameters:
    df (pd.DataFrame): needs the columns "claim" and "tables_wiki_references"

    Returns:
    list: one result per row, in row order; None if the processing failed (the error is printed)
    """
    num_processes = 5 # size of the pool that is actually started
#     logging.info(f"Number of processes: {num_processes}")
    print(f"Number of processes: {num_processes}")

    pool = None
    try:
        pool = multiprocessing.Pool(processes=num_processes)
        result = pool.map(rerank_tables_parallel, zip(df["claim"], df["tables_wiki_references"]))

        return result

    except Exception as e:
        print(f"The following exception occurred while function 'rerank_tables_multiprocess': {e}")
#         logging.info(f"The following exception occurred while function 'rerank_tables_multiprocess': {e}")
        return None

    finally:
        # release the worker processes on success and on failure
        if pool is not None:
            pool.close()
            pool.join()


In [None]:
def _clean_data_cells(tr) -> list:
    """Return the cleaned texts of all <td> cells of a row, leaving out empty ones."""
    cleaned_cells = (clean_cell_value(td.text) for td in tr.find_all('td'))
    return [value for value in cleaned_cells if value.strip() != ""]


def html_tables_to_dict(tables: list, title = "", meta_data = "", page_url = "", w_statement = "", include_small_tables = False) -> list:
    """
    Transforms html table into a dict object and filters out small tables if 'include_small_tables' is set False.
    Returns list of table dictionaries of following format: 
     
     tab_dict = {
            'id': uuid, 
            'url': page_url,
            'title': title,
            'meta_data': meta_data,
            'caption': caption,
            'wikidata_statement': w_statement,
            'header_horizontal': header_horizontal,
            'header_vertical': header_vertical,
            'rows_list': rows,
            'html_table': html source of the table
        }
    
    Parameters: 
    tables (list): list of bs4.element.table elements to convert 
    title (str, optional): title extracted from the website
    meta_data (str, optional): meta data (e.g. description) extracted from the website
    page_url (str, optional): website from where tables have been extracted
    w_statement (str, optional): Wikidata statement of the reference the tables have been extracted from 
    include_small_tables (bool, optional): whether to include small tables (fewer than 3 rows or
                                           fewer than 1.5 cells per row on average) or not 
    
    Returns:
    list: a list of dictionaries, each dictionary equals one converted table
    """
    tables_dict_list = []

    for table in tables:
        if not isinstance(table, bs4.element.Tag):
            # skip everything that is not a tag
            continue

        if table.caption is None: 
            caption = None
        else: 
            # drop trailing footnote markers such as "[1][2]"
            caption = re.sub(r'(\[\d+\])+$', '', clean_cell_value(table.caption.text)) ###### TODO check function clean_cell_value ######

        header_horizontal = []
        header_vertical = []
        thead = table.find('thead')
        if thead:
            header_horizontal = [clean_cell_value(th.text) for th in thead.find_all(['th', 'td'])]

        tbody = table.find('tbody')
        tr_cells_list = tbody.find_all('tr') if tbody else table.find_all('tr')

        rows = []
        is_first_row = True
        for tr in tr_cells_list:

            if not header_horizontal and is_first_row: # 'tr' is the first row of the table and no 'thead' is given
                is_first_row = False
                header_horizontal = [clean_cell_value(th.text) for th in tr.find_all(['th'])]

                if not header_horizontal and tr.find('b'): # set first row as header if written in bold 
                    header_horizontal = [clean_cell_value(td.text) for td in tr.find_all(['td'])]
                else:
                    row = _clean_data_cells(tr)
                    if row:
                        rows.append(row)

            else: # either 'tr' is not first row OR 'tr' is first row but a header is already given in 'thead'
                is_first_row = False

                # every <th> found outside the header row is collected as vertical header
                header_vertical.extend(clean_cell_value(th.text) for th in tr.find_all('th'))

                row = _clean_data_cells(tr)
                if row:
                    rows.append(row)

        if not include_small_tables:
            # Filter for tables which have less than three rows 
            if len(rows) < 3:
                continue

            # Filter for tables which have less than 1.5 columns in average.
            # Only evaluated here, where rows is non-empty: with include_small_tables=True a
            # table without rows would otherwise cause a division by zero.
            avg_col_length = sum(len(row) for row in rows) / len(rows)
            if round(avg_col_length, 1) < 1.5:
                continue

        tables_dict_list.append({
            'id': uuid.uuid1(),
            'url': page_url, # website from where tables have been extracted  
            'title': title,
            'meta_data': meta_data,
            'caption': caption,
            'wikidata_statement': w_statement,
            'header_horizontal': header_horizontal,
            'header_vertical': header_vertical,
            'rows_list': rows,
            'html_table': str(table)
        })

    return tables_dict_list
