#### Importing packages

In [None]:
# Load packages 

import bs4 
import boto3
import botocore
import botocore.exceptions
import copy
import dns
import json

import numpy as np
import os
import pandas as pd
import pymongo
import random
import requests
import uuid

from bs4 import BeautifulSoup
from collections import OrderedDict
from datetime import datetime
from pprint import pprint


## Overview 
This notebook runs the crowdsourcing experiments for the three task types (`table_annotation`, `claim_generation`, `adjusted_claim_annotation`) in four steps:


1. <strong>Generate tasksets </strong>


2. <strong>Label gold tasks </strong>


3. <strong>Send tasks to Mechanical Turk for annotation</strong> 


4. <strong>Regularly update the stored tasks with the workers' answers</strong> 

Set the following variables first:


In [None]:
# Folder containing all data files for crowdsourcing
# (the generated "TaskSets/..." json files are read from / written to this folder)
data_folder = ''

# Folder containing all config files for crowdsourcing
# (concatenated directly with the file name, so it should end with a path separator)
config_folder = ''

# Set path to .json file with crowdsourcing qualification tests
path_qualification_tests = ''

# Set task type as one of the following: 'table_annotation', 'claim_generation', 'adjusted_claim_annotation'
task_type = ''

# Set to 1 if crowdsourcing tasks in production should be created, else 0 for test
# (0 also selects the sandbox result collection and drops the qualification requirements)
create_hits_in_production = 0

# Set 1 if current taskSet (if existing) should be updated
# (0 loads the previously saved tasksets file instead of regenerating it)
update_tasksets = 0

# Set path to db credentials, used for storing crowdsourcing tasks and results 
# (a .json file with the key "connection_string")
path_mongodb_credentials = ''

# Set path to amazon credentials saved in a .json file
# (keys "access_key" and "secret_key")
path_amazon_credentials = ''

# Set Amazon mturk endpoint used for crowdsourcing experiments
mturk_endpoint = ''


---------------

In [None]:
# Parse the qualification-test questions from the configured .json file
with open(path_qualification_tests, 'r') as qualification_file:
    qualification_tests = json.loads(qualification_file.read())


In [None]:
# MongoDB and MTurk connections
# NOTE(review): the MTurk class is defined in a later cell of this notebook;
# that cell has to be executed before this one.

with open(path_mongodb_credentials, 'r') as credentials_file:
    mongodb_credentials = json.loads(credentials_file.read())

# Open the database that stores the crowdsourcing tasks and results
db_client = pymongo.MongoClient(mongodb_credentials["connection_string"])
db = db_client['pubhealth']

# Production runs and sandbox runs are kept in separate collections
if create_hits_in_production:
    hit_result_collection = db.hit_results
else:
    hit_result_collection = db.hit_results_sandbox

# Create the MTurk client for the selected environment
mt = MTurk()
mt.launch_client(production=create_hits_in_production)


In [None]:
# Mturk client

class MTurk():
    """Small convenience wrapper around the boto3 Mechanical Turk client.

    The AWS keys are read from the .json file at ``path_amazon_credentials``
    (keys ``access_key`` and ``secret_key``). Call :meth:`launch_client`
    before using any of the HIT helpers.
    """

    def __init__(self):
        with open(path_amazon_credentials)  as f: # get the credentials from AMT 
            cfg = json.load(f)

        self.access_key = cfg['access_key']
        self.secret_key = cfg['secret_key']

        # NOTE(review): both environments use the same user-configured
        # `mturk_endpoint`; only the preview URL differs -- confirm this is intended.
        self.environments = {
            "production": {
                "endpoint": mturk_endpoint, # set mturk endpoint
                "preview": "https://www.mturk.com/mturk/preview"
            },
            "sandbox": {
                "endpoint": mturk_endpoint, # set mturk endpoint
                "preview": "https://workersandbox.mturk.com/mturk/preview"
            },
        }

    def launch_client(self, production = False):
        """Create ``self.client`` for the production or sandbox environment.

        The local AWS profile ``mturk`` is preferred; if it does not exist the
        keys loaded in ``__init__`` are used instead. Prints the available
        account balance as a connectivity check.
        """
        self.mturk_environment = self.environments["production"] if production else self.environments["sandbox"]
        try:
            session = boto3.Session(profile_name='mturk')
        except botocore.exceptions.ProfileNotFound:
            # The profile does not exist, so it must not be requested again:
            # fall back to the explicit keys only.
            session = boto3.Session(
                aws_access_key_id=self.access_key,
                aws_secret_access_key=self.secret_key,
            )
        self.client = session.client(
            service_name= 'mturk',
            region_name= 'us-east-1',
            endpoint_url= self.mturk_environment['endpoint'],
        )
        print(self.client.get_account_balance()['AvailableBalance'])

    def create_hit(self, html_layout, **TaskAttributes):
        """Create one HIT whose question is built from ``html_layout``.

        ``TaskAttributes`` are forwarded unchanged to ``client.create_hit``.
        Returns the raw response of that call.
        """
        QUESTION_XML = """..."""
        question_xml = QUESTION_XML.format(html_layout)

        response = self.client.create_hit(
            **TaskAttributes,
            Question=question_xml
        )

        return response

    def get_hit_status(self, HITId):
        """Return the ``HITStatus`` field of the HIT with id ``HITId``."""
        hit = self.client.get_hit(HITId=HITId)
        hit_status = hit['HIT']['HITStatus']
        return hit_status

    @staticmethod
    def _parse_answers(answer_xml):
        """Turn the answer XML of one assignment into ``{question_id: value}``.

        FreeText values that are valid JSON are decoded, other text is kept as
        a string and empty answers become ``None``. Works for any number of
        ``Answer`` elements (including exactly one).
        """
        # local import: keeps this class self-contained within the notebook
        import xml.etree.ElementTree as ET

        def local_name(element):
            # drop the "{namespace}" prefix ElementTree puts in front of tag names
            return element.tag.rsplit('}', 1)[-1]

        values = {}
        for answer in ET.fromstring(answer_xml):
            if local_name(answer) != 'Answer':
                continue
            fields = {local_name(child): child.text for child in answer}
            identifier = (fields.get('QuestionIdentifier') or '').strip()
            if not identifier:
                continue
            free_text = (fields.get('FreeText') or '').strip()
            if not free_text:
                values[identifier] = None
                continue
            try:
                values[identifier] = json.loads(free_text)
            except ValueError:
                values[identifier] = free_text
        return values

    def get_hit_answers(self, HITId, approve=False):
        """Collect the answers of all submitted/approved assignments of a HIT.

        Returns a list with one dict per assignment (keys ``worker_id``,
        ``assignment_id``, ``values``, ``HITId``). With ``approve=True`` every
        assignment that is still in status ``Submitted`` is approved.
        """
        # Fetch every page of assignments; a single call only returns the first page
        request = {
            'HITId': HITId,
            'AssignmentStatuses': ['Submitted', 'Approved'],
        }
        assignments = []
        while True:
            page = self.client.list_assignments_for_hit(**request)
            assignments.extend(page.get('Assignments', []))
            next_token = page.get('NextToken')
            if not next_token:
                break
            request['NextToken'] = next_token

        # Get details and results of each Assignment and add to answers array
        answers = []
        for assignment in assignments:
            assignment_id = assignment['AssignmentId']

            answers.append({
                'worker_id' : assignment['WorkerId'],
                'assignment_id' : assignment_id,
                'values' : self._parse_answers(assignment['Answer']),
                'HITId' : HITId
            })

            # Only assignments that are still waiting for a decision get approved
            if approve and assignment['AssignmentStatus'] == 'Submitted':
                self.client.approve_assignment(
                    AssignmentId = assignment_id,
                    OverrideRejection = False
                )

        return answers

### Functions 


In [None]:
second_highest_table = 1

def select_table(table_list: list, claim_id: str) -> dict:
    """Return the first table of table_list that is suitable for annotation.

    A table is skipped when it was already used for this claim (only checked
    while `second_highest_table` is set) or when it is too large for the UI:
    more than 15 rows, more than 10 horizontal/vertical header cells, a row
    with more than 15 cells, or any (header) cell longer than 100 characters.
    Returns None when no table qualifies.
    """
    for candidate in table_list:
        # Has this table been annotated for this claim before? (hit_results.references)
        try:
            if second_highest_table:
                already_used_query = {
                    'timestamp': {'$gte': datetime(2021, 6, 23)},
                    'references.claim_db_id': claim_id,
                    'type': "table_annotation",
                    'references.table.rows_list': candidate["rows_list"],
                }
                # todo: add that this entry has not been deleted before assignment was == 3 due to issues

                if list(hit_result_collection.find(already_used_query)):
                    print("table has been used before is therefore skipped")
                    continue
        except Exception as e:
            # a failing lookup is only reported; the size checks below still run
            print(f"Following error occurred during MongoDB querying: {e}")

        # more than 15 rows
        if len(candidate["rows_list"]) > 15:
            continue
        # more than 10 header cells in either direction
        if candidate["header_horizontal"] and len(candidate["header_horizontal"]) > 10:
            continue
        if candidate["header_vertical"] and len(candidate["header_vertical"]) > 10:
            continue

        # any body cell longer than 100 characters
        longest_cell = max((len(cell) for row in candidate["rows_list"] for cell in row), default=0)
        if longest_cell > 100:
            continue

        # any row with more than 15 cells
        widest_row = max((len(row) for row in candidate["rows_list"]), default=0)
        if widest_row > 15:
            continue

        # any header cell longer than 100 characters
        if candidate["header_horizontal"]:
            if max(len(cell) for cell in candidate["header_horizontal"]) > 100:
                continue

        if candidate["header_vertical"]:
            if max(len(cell) for cell in candidate["header_vertical"]) > 100:
                continue

        return candidate

    return None
    

In [None]:
maxSingleInstances = 1 # maximum number of times any task may appear among the task sets

def get_random_task(counter, n, non_gd_tasks):
    """Return up to n distinct random tasks as a list of (task, index) pairs.

    counter[i] is the number of times non_gd_tasks[i] has been used so far.
    Tasks that already reached `maxSingleInstances` keep a tiny weight
    (0.0001) instead of 0, so they are only drawn to fill up a set once the
    unused tasks run out. When fewer than n tasks exist, all of them are
    returned; an empty task list yields an empty result.
    """
    # sampling without replacement cannot return more items than exist
    sample_size = min(n, len(non_gd_tasks))
    if sample_size <= 0:
        return []

    # weight of each task: remaining allowed uses, floored at a tiny positive value
    weights = [max(maxSingleInstances - c, 0.0001) for c in counter]
    total_weight = sum(weights)
    probabilities = [w / total_weight for w in weights]

    indexes = np.random.choice(
        a=len(non_gd_tasks),
        size=sample_size,
        replace=False,
        p=probabilities,
    )

    return [(non_gd_tasks[i], i) for i in indexes]


def get_random_gd_task(n, gd_tasks):
    """Draw n distinct entries from gd_tasks at random (without replacement)."""
    gold_subset = random.sample(gd_tasks, k=n)
    return gold_subset


def generate_taskset(counter, non_gd_tasks, gd_tasks, n=(5,2)): # n = (number of non-gold tasks, number of gold tasks), e.g. (5, 2)
    '''
    Build one shuffled taskSet out of random non-gold and gold tasks.

    counter = usage count per non-gold task (how often each one was drawn so far)
    n = (x,y) where x = number of non_gd tasks and y = number of gd tasks
    Returns (taskSet, indexes of the drawn non-gold tasks); the indexes let
    the caller update the counter.
    '''
    n_regular, n_gold = n[0], n[1]

    # (task, index) pairs of the sampled non-gold tasks
    sampled_pairs = get_random_task(counter, n_regular, non_gd_tasks)
    taskSet = [task for task, _ in sampled_pairs]
    sampled_indexes = [index for _, index in sampled_pairs]

    taskSet += get_random_gd_task(n_gold, gd_tasks)

    # gold tasks may end up at any position of the set
    random.shuffle(taskSet)

    return taskSet, sampled_indexes


def generate_taskset_no_gold(counter, non_gd_tasks, n=5):
    """
    Draw n (default = 5) random tasks for one taskSet without gold tasks.

    Returns (taskSet, indexes of the drawn tasks); the caller uses the indexes
    to update the counter so a task shows up at most 'maxSingleInstances'-times.
    """
    taskSet = []
    used_indexes = []
    for task, index in get_random_task(counter, n, non_gd_tasks):
        taskSet.append(task)
        used_indexes.append(index)

    return taskSet, used_indexes


def generate_taskset_w_ids(non_gd_tasks, gd_tasks) -> tuple:
    """
    Generate TaskSets (each with a fresh uuid) from gold and non-gold tasks.

    TaskSets are created until every non-gold task has been used at least
    'maxSingleInstances'-times.
    Returns (taskSets, counter): the list of {'_id', 'taskSet'} dicts and the
    final usage count per non-gold task.
    """
    taskSets = []
    counter = [0] * len(non_gd_tasks) # usage count per non-gold task

    while any(c < maxSingleInstances for c in counter):
        # generate task set with some random samples
        taskSet, indexes = generate_taskset(counter, non_gd_tasks, gd_tasks)
        taskSets.append({
            '_id': str(uuid.uuid4()),
            'taskSet' : taskSet
        })
        # remember which samples were used, so they are preferred less from now on
        for i in indexes:
            counter[i] += 1

    return taskSets, counter


def generate_taskset_w_ids_no_gold(non_gd_tasks, n = 5) -> tuple:
    """
    Generate TaskSets (each with a fresh uuid) without gold labelled samples.

    n (int): number of non_gold entries per taskSet
    TaskSets are created until every task has been used at least
    'maxSingleInstances'-times.
    Returns (task_set_list, counter): the list of {'_id', 'taskSet'} dicts and
    the final usage count per task.
    """
    task_set_list = []
    counter = [0] * len(non_gd_tasks) # usage count per task

    while any(c < maxSingleInstances for c in counter):
        # generate task set with some random samples
        task_set, indexes = generate_taskset_no_gold(counter, non_gd_tasks, n)
        task_set_list.append({
            '_id': str(uuid.uuid4()),
            'taskSet' : task_set
        })
        # remember which samples were used, so they are preferred less from now on
        for i in indexes:
            counter[i] += 1

    return task_set_list, counter
    

### 1. Generation of task sets


In [None]:
# Load the claims for the selected task type from MongoDB into a DataFrame

if task_type == "adjusted_claim_annotation":
    # this task works on the claims produced by the claim generation task
    claim_col = db.claim_generation
    selection = {}
else:
    claim_col = db.final_dataset
    # claim generation only uses entries whose "table_relevant" field is set
    selection = {"table_relevant": {'$ne': None}} if task_type == "claim_generation" else {}

cursor = claim_col.find(selection)
df = pd.DataFrame(list(cursor))

print(f"Length of test set: {len(df)}")
df.head()


In [None]:
def select_table_claim_generation(table_list: list, claim_id: str) -> list:
    """Return all tables of table_list that still need generated claims.

    A table is kept when fewer than two entries with this claim id
    ("initial_claim_id") and this table id ("table.id") exist in the
    `gen_claim_col` collection. A table whose lookup fails is reported and
    left out.
    """
    tables_for_generation = []
    for table in table_list:
        try:
            query = {
                "initial_claim_id": claim_id,
                "table.id": table["id"],
            }
            generated_claims = list(gen_claim_col.find(query))
            if len(generated_claims) < 2:
                tables_for_generation.append(table)

        except Exception as e:
            print(f"Following error occurred during MongoDB querying: {e}")

    return tables_for_generation


In [None]:
# If code being executed for task "claim_generation", execute this code

if task_type == "claim_generation":

    # Build one task per (claim, table) pair that still needs generated claims
    # and save the resulting list to a json file

    gen_claim_col = db.claim_generation
    table_column = "table_relevant" # TODO later use tables from 1st task results => after majority filtering

    if update_tasksets:
        # shape of a single task
        task_template = {
            "claim_db_id": "",
            "claim": "",
            "table": {
                "id": "",
                "url": "",
                "caption": "",
                "header_horizontal": "", 
                "header_vertical": "", 
                "rows_list": "", 
                "html_table": ""
            }
        }

        task_list = []
        for index, row in df.iterrows():
            candidate_tables = row[table_column]
            if not candidate_tables or len(candidate_tables)==0:
                print("skip")
                continue

            claim_db_id = str(row["_id"])
            db_table_list = select_table_claim_generation(candidate_tables, claim_db_id)
            if not db_table_list:
                print("skip")
                continue

            for db_table in db_table_list:
                task = copy.deepcopy(task_template)
                task["claim_db_id"] = claim_db_id
                task["claim"] = row["claim"]

                # the table dict itself is reused: id as string, database "_id" dropped
                table = db_table
                table["id"] = str(db_table["id"])
                table.pop('_id', None)

    #             task["table"] = row["table"] #TODO => later uncomment!!!
                task["table"] = table
                task_list.append(task)


        print(f"{len(task_list)} tasks in total created.\n")

        path = os.path.join(data_folder, "TaskSets/pre_tasksets_{}.json".format(task_type))
        print(f"Saving generated task list at path {path}.\n")

        with open(path, "w", encoding="utf-8") as file:
            json.dump(task_list, file, indent=4, ensure_ascii=False)


In [None]:
# This cell only runs for task type "adjusted_claim_annotation"

if task_type == "adjusted_claim_annotation":

    # Build one task per claim that has no "adjusted_claim_annotation" HIT yet
    # and save the resulting list to a json file

    table_column = "table"
    task_temp_provided = True

    if update_tasksets:
        # shape of a single task
        task_template = {
            "claim_db_id": "",
            "claim": "",
            "label": "",
            "table": {
                "id": "",
                "url": "",
                "caption": "",
                "header_horizontal": "", 
                "header_vertical": "", 
                "rows_list": "", 
                "html_table": ""
            },
            "g_id": -1
        }

        task_list = []
        for index, row in df.iterrows():
            # skip claims that already have an entry of this type in hit_results
            existing_hits = list(hit_result_collection.find({'type': "adjusted_claim_annotation", "references.claim_db_id": str(row["_id"])}))
            if existing_hits:
                continue 

            task = copy.deepcopy(task_template)
            task.update({
                "claim_db_id": str(row["_id"]),
                "claim": row["claim"],
                "label": row["label"],
                "table": row["table"],
            })

            task_list.append(task)


        print(f"{len(task_list)} tasks in total created.\n")

        path = os.path.join(data_folder, "TaskSets/pre_tasksets_{}.json".format(task_type))
        print(f"Saving generated task list at path {path}.\n")

        with open(path, "w", encoding="utf-8") as file:
            json.dump(task_list, file, indent=4, ensure_ascii=False)


In [None]:
# If code being executed for task "table_annotation", execute this code

if task_type == "table_annotation":
    # Build one task per claim (with the first suitable table of that claim)
    # and save the resulting list to a json file

    table_column = "tables"
    task_temp_provided = True

    if update_tasksets:
        # shape of a single task
        task_template = {
            "claim_db_id": "",
            "claim": "",
            "label": "",
            "table": {
                "id": "",
                "url": "",
                "caption": "",
                "header_horizontal": "", 
                "header_vertical": "", 
                "rows_list": "", 
                "html_table": ""
            },
            "g_id": -1
        }

        task_list = []        
        for index, row in df.iterrows():
            # claims without any table cannot be annotated
            if not row[table_column] or len(row[table_column])==0:
                continue

            task = copy.deepcopy(task_template)
            task["claim_db_id"] = str(row["_id"])
            task["claim"] = row["claim"]
            task["label"] = row["label"]

            # pick, from the list of tables, the one which should be sent to AMT
            db_table = select_table(row[table_column], task["claim_db_id"])
            if not db_table: 
                print("skip")
                continue 

            if task_temp_provided:
                # reuse the stored table dict as is, only the id becomes a string
                table = db_table
                table["id"] = str(db_table["id"]) 
            else:
                # copy exactly the fields of the template
                table = {
                    "id": str(db_table["id"]),
                    "url": db_table["url"],
                    "caption": db_table["caption"],
                    "header_horizontal": db_table["header_horizontal"],
                    "header_vertical": db_table["header_vertical"],
                    "rows_list": db_table["rows_list"],
                    "html_table": db_table["html_table"], #     TODO: adjust html table here!!!
                }

            task["table"] = table
            task_list.append(task)

        print(f"{len(task_list)} tasks in total created.\n")

        path = os.path.join(data_folder, "TaskSets/pre_tasksets_{}.json".format(task_type)) 
        print(f"Saving generated task list at path {path}.\n")

        with open(path, "w", encoding="utf-8") as file:
            json.dump(task_list, file, indent=4, ensure_ascii=False)


#### Split tasks in gold and non-gold samples

In [None]:
golden_standard_quota = 20 

if update_tasksets and task_type in ["table_annotation", "adjusted_claim_annotation"]: 
    with open(os.path.join(data_folder, "TaskSets/pre_tasksets_{}.json".format(task_type)), "rb") as file: 
        task_list = json.load(file)

    # after shuffling, the first `golden_standard_quota` tasks become the gold
    # candidates and the rest the non-gold tasks, so no task ends up in both files
    random.shuffle(task_list)
    gold_part = task_list[:golden_standard_quota]
    non_gold_part = task_list[golden_standard_quota:]

    with open(os.path.join(data_folder, "TaskSets/task_list_gold_{}.json".format(task_type)), 'w+', encoding='utf8') as file:
        json.dump(gold_part, file, indent=4, ensure_ascii=False)

    with open(os.path.join(data_folder, "TaskSets/task_list_non_gold_{}.json".format(task_type)), 'w+', encoding='utf8') as file:
        json.dump(non_gold_part, file, indent=4, ensure_ascii=False)


### TODO (!): Before the next step, manually set the gold labels in `TaskSets/task_list_gold_{task_type}.json`

#### Generating tasksets of 7 tasks each (two out of them are gold standards)

In [None]:
# Sampling claims: currently no sampling (decide later e.g. using active learning)

# file holding the final tasksets of the current task type
tasksets_path = os.path.join(data_folder, "TaskSets/tasksets_{}.json".format(task_type))
tasksets_regenerated = False

if update_tasksets and task_type in ["table_annotation", "adjusted_claim_annotation"]: 
    # tasksets that mix gold and non-gold tasks
    with open(os.path.join(data_folder, "TaskSets/task_list_gold_{}.json".format(task_type)), "rb") as file: 
        task_list_gold = json.load(file)

    with open(os.path.join(data_folder, "TaskSets/task_list_non_gold_{}.json".format(task_type)), "rb") as file: 
        task_list_non_gold = json.load(file)

    task_sets, counter = generate_taskset_w_ids(task_list_non_gold, task_list_gold)
    tasksets_regenerated = True

    print(f"{len(task_sets)} tasksets created out of {len(task_list_gold)} gold tasks and {len(task_list_non_gold)} non-gold tasks.")

elif update_tasksets and task_type == "claim_generation":
    # claim generation has no gold tasks
    with open(os.path.join(data_folder, "TaskSets/pre_tasksets_{}.json".format(task_type)), "rb") as file: 
        task_list_non_gold = json.load(file)

    task_sets, counter = generate_taskset_w_ids_no_gold(task_list_non_gold, n = 5)
    tasksets_regenerated = True
    print(f"{len(task_sets)} tasksets created out of {len(task_list_non_gold)} non-gold tasks.")

if tasksets_regenerated:
    # save tasksets 
    with open(tasksets_path, 'w+', encoding='utf8') as file:
        json.dump(task_sets, file, indent=4, ensure_ascii=False)
else: 
    # nothing was regenerated: fall back to the tasksets saved earlier
    print("Loading taskset...")        
    with open(tasksets_path, 'rb') as file:
        task_sets = json.load(file)

    print(f"Length of retrieved tasks is {len(task_sets)}")


### 2. Running Crowdsourcing

In [None]:
# Functions to preprocess tables for displaying in annotation UI

def _remove_attrs(soup):
    """Drop every attribute except colspan/align/text/rowspan from all tags of soup (in place) and return soup."""
    kept_attributes = ("colspan", "align", "text", "rowspan")
    for element in soup.findAll(True):
        if not element.attrs:
            continue
        element.attrs = {
            name: value
            for name, value in element.attrs.items()
            if name in kept_attributes
        }

    return soup


def _remove_tags(soup):
    """Remove (decompose) every element of soup whose tag is in the list of unwanted tags."""
    unwanted_tags = [
        "image", "script", "caption", "noscript", "picture", "math",
        "figure", "name-content", "menu", "ie:menuitem", "iframe", "code",
        "gcse:search", "img", "index", "article", "map", "object",
    ]
    for tag_name in unwanted_tags:
        for element in soup(tag_name):
            element.decompose()

def _remove_img(soup):
    """Remove (decompose) all <img> elements of soup."""
    for image in soup("img"):
        image.decompose()
        

def _remove_caption(soup):
    """Remove (decompose) all <caption> elements of soup."""
    for caption in soup("caption"):
        caption.decompose()

def _remove_script(soup):
    """Remove (decompose) all <script> elements of soup."""
    for script in soup("script"):
        script.decompose()

        
def taskset_tostring(taskSet_list: list): 
    """
    Turn the tasks of one taskSet into the string that is put into the UI template.

    taskSet_list: list of task dicts, each with task['table']['html_table'].
    Returns str(taskSet_list) with the html tables wrapped in backticks
    instead of single quotes, non-breaking spaces replaced by blanks, a
    missing caption replaced by 'no caption given' and every non-ASCII
    character dropped.
    """
    for task in taskSet_list:
        html_table = task['table']['html_table']

        # The parser is named explicitly so the result does not depend on which
        # parsers happen to be installed (and no parser warning is emitted).
        html_table_bs = BeautifulSoup(html_table, 'html.parser')
        html_table_bs = _remove_attrs(html_table_bs)
        _remove_tags(html_table_bs)

        for tag in html_table_bs.findAll(["table", "th", "td"]):
            tag['style'] = "border: 1px solid black;"

        # NOTE(review): the cleaned soup above is currently NOT written back;
        # the original html string is kept. Writing back the cleaned table
        # (e.g. str(html_table_bs.table).replace("\n", "")) was disabled in
        # the original code -- confirm which representation the UI expects.
        task['table']['html_table'] = html_table

    result_str = str(taskSet_list).replace("\'<table", "`<table").replace("table>\'", "table>`").replace("\xa0", " ").replace("\'caption\': None", "\'caption\': \'no caption given\'")
    result_str = result_str.encode('ascii', 'ignore').decode('ascii')

    return result_str


In [None]:
# Ban workers which are spammers
# After pilot round 

to_ban = False
if to_ban:
    with open('./config/banlist.json','r') as f:
        banlist = json.load(f)

    banned_count = 0 # only workers whose block request succeeded are counted
    for w in banlist:
        print(w)
        try:
            response = mt.client.create_worker_block(WorkerId=w, Reason='Malicious behaviour.')
        except botocore.exceptions.ClientError as e:
            print(f"Following exception thrown ", e)
            continue

        # an unexpected status is reported instead of aborting the whole loop
        status_code = response['ResponseMetadata']['HTTPStatusCode']
        if status_code != 200:
            print(f"Blocking worker {w} returned HTTP status {status_code}")
            continue
        banned_count += 1

    print(f"{banned_count} workers banned in total.")

else: 
    print("No worker banned.")
            

In [None]:
# SETTING GENERAL THINGS, UI, ETC. 

""" Create the tasks by populating the HTML templates using the config file """

with open(config_folder + 'task_config_{}.json'.format(task_type),'r') as f: # load config file with worker qualifications, location, etc. 
    task_temp = json.load(f)
    
taskSets_all_lan = {}

# work on a copy so the loaded config itself stays untouched
task_content = copy.deepcopy(task_temp)
task_content['language'] = "en"

TaskAttributes = task_content['task_attributes']

# instruction texts are stored in separate files; line breaks become blanks
with open('./config/' + task_content['instructions_project_text_file'],'r') as f:
    task_content['instructions_project_text'] = f.read().replace('\n',' ')
with open('./config/' + task_content['instructions_rules_text_file'],'r') as f:
    task_content['instructions_rules_text'] = f.read().replace('\n',' ')

# read the html template inside a with-block so the file handle is closed again
with open(task_content['html_layout'], 'r') as f:
    html_layout = f.read()

# enter instruction texts in the html template 
# (str() keeps the replacement working when time_thr is stored as a number)
html_layout = html_layout.\
    replace('${instructions_project_text}$', task_content['instructions_project_text']).\
    replace('${instructions_rules_text}$', task_content['instructions_rules_text']).\
    replace('${time_thr}$', str(task_content['time_thr']))

with open(data_folder + task_content['tasks'],'r') as f:
    taskSets = json.load(f)

# If you're only testing, just pick one hit and run it once, with no qualification barriers
if not create_hits_in_production:
    # a config without qualification requirements is fine in test mode
    TaskAttributes.pop('QualificationRequirements', None)
    TaskAttributes['MaxAssignments'] = 1 
    random.seed(42)
    #taskSets = random.sample(taskSets,1)

print('Generated {} tasks with the following configs:'.format(len(taskSets)))
pprint(TaskAttributes,indent=1) #verify the properties before running the HITs
print('')


In [None]:
# HERE HITS ARE CREATED (=> AMT CALLED) AND SAVED IN MONGODB WITH CORRESP. HIT_ID, later based on these HIT_ID the results are updated

""" Create the batch of HITs """

# Every successfully created HIT of this run is collected in `results` and
# stored in MongoDB under a common batch id.
results = []
batch_id = str(uuid.uuid4())

hit_type_id = ''
target_assignments = TaskAttributes['MaxAssignments']

# NOTE: only the first 10 taskSets are sent out per run
for taskSet in taskSets[:10]: 

    TaskAttributes_hit = copy.deepcopy(TaskAttributes) # Adjust based on how many were already done in other batches
    # remaining assignments = target minus the assignments already completed
    # for this taskSet in HITs that are stored in the database
    TaskAttributes_hit['MaxAssignments'] = target_assignments -\
        sum([hit['hit']['NumberOfAssignmentsCompleted'] for hit in hit_result_collection.find({
            'taskSet_id':taskSet['_id'],
            'type': task_content['type'],
            'language': "English"
        })])
    if TaskAttributes_hit['MaxAssignments'] > 0:
        # re-seed from system entropy (the test setup seeds with a fixed value)
        random.seed(None)
        language_questions = random.sample(qualification_tests,k=3) # Adjust with table questions
        try:
#             response = mt.create_hit(html_layout.replace('${references}$', mock_taskSet).\
#                                      replace('${lan_test_questions}$', json.dumps(language_questions)),
#                                      **TaskAttributes_hit)
            # fill the tasks and the qualification questions into the html template and create the HIT
            response = mt.create_hit(html_layout.replace('${references}$', taskset_tostring(taskSet['taskSet'])).\
                                     replace('${lan_test_questions}$', json.dumps(language_questions)),
                                     **TaskAttributes_hit)
            
        except Exception as e: 
            # a failing HIT does not stop the batch
            print(f"Exception occurred, continue with next entry in TaskSets: {e}")
            continue 

        hit_type_id = response['HIT']['HITTypeId']
        # database entry of the HIT; the HIT id returned by AMT is used as _id
        result = {
            '_id': response['HIT']['HITId'],
            'batch_id': batch_id,
            'type': task_content['type'],
            'references': taskSet['taskSet'],
            'language': "English",
            'taskSet_id':taskSet['_id'],
            'hit': response['HIT'],
            'timestamp': datetime.now()
        }
        results.append(result)
        try:
            hit_result_collection.insert_one(result) # ADD hit with ID returned from AMT to my MongoDB database
        except Exception:
            # show the entry that could not be stored before re-raising
            print(result)
            raise

# For you to go to the HITs you just created and test them
print('Launched tasks for table fact checking')
if not create_hits_in_production:
    print('You can view the HITs here:')
    print(mt.mturk_environment['preview']+"?groupId={}".format(hit_type_id))
else:
    print('Launched! Good Luck!')
     

In [None]:
''' Update all non-disposed hits in the database with correct results '''
""" Rejected assignments are ignored """

approve_payment = False # Set to true if automatically approve payment 

# every stored HIT since 2020-06-30 that is not marked as disposed
pending_hits_filter = {
    'hit.HITStatus': {'$not': {'$eq': 'Disposed'}},
    'timestamp': {'$gte': datetime(2020, 6, 30)},
}

for hit in hit_result_collection.find(pending_hits_filter):

    print('Updating',hit['_id'],end='\r')
    try:
        # fetch the current HIT state and the workers' answers from AMT ...
        refreshed_hit = mt.client.get_hit(HITId = hit['_id'])['HIT']
        worker_answers = mt.get_hit_answers(hit['_id'], approve=approve_payment)
        # ... and store both in the database entry of the HIT
        hit_result_collection.update_one(
            {'_id': hit['_id']},
            {"$set": {"hit": refreshed_hit, 'answers': worker_answers}},
        )
    except Exception as e:
        # report the problem and go on with the next HIT
        print(e,end='\n\n')
print('Done'+(' '*100))


#### Delete previously sent HITs

In [None]:
# Number of HITs stored in the database that are not marked as disposed
query = {'hit.HITStatus': {'$not': {'$eq': 'Disposed'}}}
# query['type'] = "claim_generation"
open_hits = list(hit_result_collection.find(query))
len(open_hits)


In [None]:
""" If you set 'force' to TRUE, it will abort mission and force an expiry in all HITs and then delete them.
If you only want to remove the completed ones (make them Disposed so the update routine won't loop through tons of
HITs), keep it as FALSE."""

force = True

# Dispose all hits in the database (single pass)
query = {'hit.HITStatus': {'$not': {'$eq': 'Disposed'}}, 'timestamp': {'$gte': datetime(2020, 6, 23)}}
# query["type"] = "claim_generation"

# in both modes only HITs without pending assignments are touched;
# without force they additionally must have no available assignments left
query['hit.NumberOfAssignmentsPending'] = 0
if not force:
    query['hit.NumberOfAssignmentsAvailable'] = 0
# query['hit.NumberOfAssignmentsCompleted'] = 0

hit_result_collection_list = list(hit_result_collection.find(query))
print(f"Length of retrieved HITs ", len(hit_result_collection_list))

if force:
    nothing_to_do = mt.client.list_hits()['NumResults']==0
else:
    nothing_to_do = len(hit_result_collection_list) == 0

if nothing_to_do:
    print('Finished')
else:
    for hit in hit_result_collection_list:
        try:
            mt.client.delete_hit(HITId = hit['_id'])
            print('Removed',hit['_id'])
        except Exception as e:
            print(hit['_id'], e)
            if not force:
                continue
            # forced mode: expire the HIT first (expiry date in the past), then delete it
            print("force")
            try:
                mt.client.update_expiration_for_hit(HITId = hit['_id'], ExpireAt=datetime(2018, 4, 10, 7, 22, 15))
                mt.client.get_hit(HITId = hit['_id'])
                mt.client.delete_hit(HITId = hit['_id'])
                print('Removed',hit['_id'])
            except Exception as e:
                print(hit['_id'],e)
    

In [None]:
# Expire and delete the HITs returned by AMT itself (first page, at most 100 HITs)
hit_list = mt.client.list_hits(MaxResults=100)
print(f"Length of retrieved hits: {len(hit_list['HITs'])}")
force = True

for hit in hit_list["HITs"]:
    hitid = hit["HITId"]
    try:
        # a HIT has to be expired (expiry date in the past) before it can be deleted
        mt.client.update_expiration_for_hit(HITId = hitid, ExpireAt=datetime(2018, 1, 1))
        mt.client.get_hit(HITId = hitid)
        mt.client.delete_hit(HITId = hitid)
        print('Removed',hitid)
    except Exception as e:
        print(f"Following error occured while deleting HIT {hitid}, lets continue: {e}.")
        if force:
            try:
                # boto3 client methods only accept keyword arguments
                mt.client.update_expiration_for_hit(HITId = hitid, ExpireAt=datetime(2017, 1, 1))
                mt.client.delete_hit(HITId = hitid)
                print('Removed', hitid)
            except Exception as retry_error:
                # best effort: report the failed retry and go on with the next HIT
                print(hitid, retry_error)
        continue


#### Delete specific Task

In [None]:
force = True

# Expire and delete the HITs whose ids are listed below (single pass)
hit_result_collection_list = [...] # TODO enter here HIT_IDs which should be deleted

for hit_id in hit_result_collection_list:
    try:
        # expire the HIT of this iteration (expiry date in the past), then delete it
        mt.client.update_expiration_for_hit(HITId = hit_id, ExpireAt=datetime(2018, 1, 1))
        mt.client.delete_hit(HITId = hit_id)
        print('Removed',hit_id)
    except Exception as e:
        print(hit_id)
        if force:
            print("force")
            try:
                mt.client.update_expiration_for_hit(HITId = hit_id, ExpireAt=datetime(2017, 1, 1))
                mt.client.delete_hit(HITId = hit_id)
                print('Removed',hit_id)
            except Exception as e:
                # best effort: report the failed retry and go on with the next HIT
                print(hit_id,e)
        continue
    

#### Update HITs

In [None]:
hitid = '...' # TODO enter HIT_ID you want to update

# Expire the HIT by moving its expiry date into the past
expiry_date = datetime(2015, 1, 1)
mt.client.update_expiration_for_hit(HITId = hitid, ExpireAt=expiry_date)

# Other updates that can be applied instead:
# mt.client.update_hit_review_status(HITId = hitid, Revert=True)
# mt.client.update_hit_type_of_hit(HITId = hitid, HITTypeId='1623495307575')

# mt.get_hit_answers(hitid, approve=True)

# Show the HIT as it is stored at AMT after the update
hit_details = mt.client.get_hit(HITId = hitid)
pprint(hit_details)

### Preparing html tables for UI

In [None]:
with open(data_folder + "TaskSets/task_sets_en_table.json",'r') as f:
    task_sets = json.load(f)

# Keep every task whose table has at least one non-empty header cell
# (horizontal or vertical)
subset = [
    task
    for task_set in task_sets
    for task in task_set['taskSet']
    if (task['table']['header_horizontal']!=[] and any(task['table']['header_horizontal']))
    or (task['table']['header_vertical']!=[] and any(task['table']['header_vertical']))
]

len(subset)

In [None]:
from lxml import html
from lxml.etree import tostring


def del_col_row(table, row_i=None, col_i=None):
    """Remove one column and/or one row from a table element.

    Rows are taken to be the children of the table's first child element
    (presumably ``<tbody>`` - verify against the markup being processed).
    The column is removed before the row.

    Args:
        table: Table markup as a string, or an already parsed element.
            Strings are parsed with ``html.fragment_fromstring``.
        row_i: Zero-based position of the row to drop; ``None`` keeps all
            rows. A position that matches no row leaves the rows untouched.
        col_i: Position of the cell to drop from every row; ``None`` keeps
            all columns.

    Returns:
        The table element, modified in place (for string input, the newly
        parsed element).

    Raises:
        IndexError: If an index is given and the table has no child element,
            or if some row has no cell at ``col_i``.
    """
    if isinstance(table, str):
        table = html.fragment_fromstring(table)

    if row_i is None and col_i is None:
        return table

    # Plain indexing/iteration replaces the deprecated getchildren()/iterchildren().
    body = table[0]

    # remove column i
    if col_i is not None:
        for row in body:
            row.remove(row[col_i])

    # remove row i
    if row_i is not None:
        # Iterate over a snapshot so the parent is not mutated while being traversed.
        for index, row in enumerate(list(body)):
            if index == row_i:
                body.remove(row)
                break

    return table


In [None]:
# EXAMPLE 1: fetch a Wikipedia page and trim its fourth table

url = "https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_income"
resp = requests.get(url)
page = BeautifulSoup(resp.text, 'html.parser')
tables = page.findAll("table")

table = tables[3]

# TODO (from the original author): del_col_row() may need adjusting to this table's
# nesting, e.g. replacing table.getchildren() by table.getchildren()[0].getchildren()

# Drop unnecessary columns one at a time; each index refers to the table as it is
# after the previous removals.
for col_i in (3, 3, 4, 4, 0):
    table = html.tostring(del_col_row(str(table), col_i = col_i)).decode('utf-8')

# Drop unnecessary rows: remove the row at `row_i` `repeats` times in a row.
for row_i, repeats in ((2, 8), (9, 6), (13, 35)):
    for i in range(repeats):
        table = html.tostring(del_col_row(str(table), row_i = row_i)).decode('utf-8')
    
# BeautifulSoup(table)


In [None]:
# EXAMPLE 2: fetch the CDC flu burden page and trim its first table

url = "https://www.cdc.gov/flu/about/burden/index.html"
resp = requests.get(url)
page = BeautifulSoup(resp.text, 'html.parser')
tables = page.findAll("table")

table = tables[0]

# drop columns and rows unnecessary
# table = html.tostring(del_col_row(str(table), col_i = None)).decode('utf-8')

# Remove the current first row, eight times over.
for i in range(8):
    table = html.tostring(del_col_row(str(table), row_i = 0)).decode('utf-8')

# Remove the cells at positions 2, 3, 4 and 5, each counted after the previous removal.
for col_i in range(2, 6):
    table = html.tostring(del_col_row(str(table), col_i = col_i)).decode('utf-8')

# Delete second header row..
table = html.fragment_fromstring(table)
second_child = table.getchildren()[1]
second_child.remove(second_child.getchildren()[1])
# table.getchildren()[1].remove(table.getchildren()[1].getchildren()[0])

table = html.tostring(table).decode('utf-8')


In [None]:
# EXAMPLE 3
# Download the page, parse it with BeautifulSoup and keep its second <table> element
# in the notebook-level variable `table` (no rows or columns are dropped here).

url = "https://www.macrotrends.net/countries/AUS/australia/crime-rate-statistics"
resp = requests.get(url)
page = BeautifulSoup(resp.text, 'html.parser')
tables = page.findAll("table")
table = tables[1]


In [None]:
# EXAMPLE 4
# Download the page, parse it with BeautifulSoup and keep its first <table> element
# in the notebook-level variable `table` (no rows or columns are dropped here).

url = "https://www.cebm.net/covid-19/global-covid-19-case-fatality-rates"
resp = requests.get(url)
page = BeautifulSoup(resp.text, 'html.parser')
tables = page.findAll("table")
table = tables[0]


In [None]:
# EXAMPLE 5
# Download the page, parse it with BeautifulSoup and take its first <table> element.

url = "https://www.nimh.nih.gov/health/statistics/suicide.shtml"
resp = requests.get(url)
page = BeautifulSoup(resp.text, 'html.parser')
tables = page.findAll("table")
table = tables[0]

# Remove the row at position 5 five times; after each removal the following row
# moves up into that position. `table` ends up as an HTML string.
for i in range(5):
    table = html.tostring(del_col_row(str(table), row_i = 5)).decode('utf-8')


In [None]:
# Table Test
# Trims one table from a Wikipedia page step by step. Every removal below is
# index-based and shifts the following indices, so the statement order matters.

url = "https://en.wikipedia.org/wiki/Demographics_of_the_United_States"
resp = requests.get(url)
page = BeautifulSoup(resp.text, 'html.parser')
tables = page.findAll("table")
table = tables[23]

# Remove the row at position 10, five times over.
for i in range(5):
    table = html.tostring(del_col_row(str(table), row_i = 10)).decode('utf-8')

# Then drop the first row and the first cell of every remaining row.
table = html.tostring(del_col_row(str(table), row_i = 0)).decode('utf-8')
table = html.tostring(del_col_row(str(table), col_i = 0)).decode('utf-8')

# Re-parse and remove the cell at position 4 from the first row only.
table = html.fragment_fromstring(table)
table.getchildren()[0].getchildren()[0].remove(table.getchildren()[0].getchildren()[0].getchildren()[4])

# Remove the cell at position 2 from every row that has child elements.
for row in table.getchildren()[0].iterchildren():
    if row.getchildren():
#         print(f"This: {html.tostring(row.getchildren()[0])}")
        row.remove(row.getchildren()[2])
        
# table = html.tostring(del_col_row(str(table), col_i = 0)).decode('utf-8')

# Serialise back to a string and display the parsed result as the cell output.
table = html.tostring(table).decode('utf-8')
BeautifulSoup(table)


In [None]:
# Clean up the HTML in the notebook-level variable `table` and print it as a single line.
# The commented-out lines take the table from a task in `subset` instead.

# task = subset[100]

# if not task['table']['caption']:
#     task['table']['caption'] = ''
# html_table = task['table']['html_table']

# NOTE(review): no parser is passed to BeautifulSoup here, unlike the scraping cells,
# which use 'html.parser' - confirm whether that is intended.
html_table_bs = BeautifulSoup(table)
# html_table_bs = table
# _remove_attrs, _remove_img and _remove_caption are defined elsewhere in the notebook;
# presumably they strip attributes, images and the caption - verify against their definitions.
html_table_bs = _remove_attrs(html_table_bs)
_remove_img(html_table_bs)
_remove_caption(html_table_bs)

# Give the table and each header/data cell a visible border.
for tag in html_table_bs.findAll(["table", "th", "td"]):
    tag['style'] = "border: 1px solid black;"

# Strip newlines.
# NOTE(review): "\'" is the same string as "'", so the second replace changes nothing.
# If a backslash escape was intended it would need "\\'" - confirm with the consumer
# of this output.
html_table = str(html_table_bs).replace("\n", "").replace("'", "\'")
# html_table_bs = BeautifulSoup(html_table)

print(html_table)


In [None]:
# Load taskset for table task for MOCK HTML page
# NOTE(review): `i` and `entry` are not set in this cell; they are whatever earlier
# cells left behind (e.g. loop variables) - set them explicitly before running.


html_table = task_sets[i]["taskSet"][entry]['table']['html_table']
# Strip newlines from the stored markup.
html_table = html_table.replace("\n", "")
# NOTE(review): "\'" equals "'", so this replace is a no-op.
html_table = html_table.replace("'", "\'")
# html_table = html_table.replace("%", "\%")
# html_table = html_table.replace("", "\'")

# Parse the markup, pass it through _remove_attrs (defined elsewhere in the notebook)
# and display the result as the cell output.
html_table_bs = BeautifulSoup(html_table)
html_table_bs = _remove_attrs(html_table_bs)
html_table_bs

### 3. Updating HITs

In [None]:
# Gather every HTML table string found under "tables_wikipedia_references",
# along with the names of all tags that occur in those tables.
html_table_list = []
tags = []

for table_list in test_data["tables_wikipedia_references"]:
    # Skip falsy entries and plain floats (presumably NaN placeholders - verify).
    if not table_list or type(table_list) is float:
        continue
    for table in table_list:
        if not table or "html_table" not in table:
            continue
        if type(table["html_table"]) is not str:
            continue
        soup = BeautifulSoup(table["html_table"])
        tags.extend(tag.name for tag in soup.find_all())
        html_table_list.append(table["html_table"])

len(html_table_list)

In [None]:
# Reduce the collected tag names to their distinct values (the resulting order is
# arbitrary because it goes through a set) and display them.
tags = list(set(tags))
tags

#### Send some taskSets again

In [None]:
# Load the table-annotation task sets 1 and 3.
# The files are read explicitly as UTF-8, matching how the updated task sets are
# written later in this notebook, instead of relying on the platform's default encoding.
with open(r'./data/production/TaskSets/final_table_annotation/tasksets_table_annotation_1.json', "r", encoding="utf-8") as file: 
    taskset_1 = json.load(file)
    
with open(r'./data/production/TaskSets/final_table_annotation/tasksets_table_annotation_3.json', "r", encoding="utf-8") as file: 
    taskset_3 = json.load(file)


In [None]:
# Pick the task sets that contain one of two specific claims and patch the matching
# tasks in place. A task set is appended once per matching task.
relevant_taskSets = []

for entry in taskset_1 + taskset_3:
    for task in entry["taskSet"]:
        claim_id = task["claim_db_id"]
        if claim_id not in ("6072bd2a000ca92c09d11fb5", "6072bd2d000ca92c09d145b8"):
            continue
        if claim_id == "6072bd2a000ca92c09d11fb5":
            # Only this claim additionally gets its horizontal header cleared.
            task["table"]["header_horizontal"] = []
        task["g_id"] = 3
        relevant_taskSets.append(entry)

    

In [None]:
# Write the patched task sets to disk as indented JSON.
faulty_hits_path = r'./data/production/TaskSets/final_table_annotation/tasksets_table_annotation_faultyHITs_updated.json'
with open(faulty_hits_path, "w", encoding="utf-8") as file:
    file.write(json.dumps(relevant_taskSets, indent=4))
