In [1]:
import requests
import json
import pandas as pd
from typing import Any, List, Mapping, Optional
from pandas import DataFrame

In [33]:
def get_sentence(mention:Mapping[str, Any], documents:Mapping[str, Any]):
	"""
	Returns the text of the sentence that contains the mention.

	The mention's 'document' and 'sentence' fields select one sentence entry
	inside `documents`; that entry's 'words' are joined with single spaces.
	"""
	# Read both mention fields first, then walk into the documents mapping
	doc_key, sent_pos = mention['document'], mention['sentence']
	words = documents[doc_key]['sentences'][sent_pos]['words']
	return ' '.join(words)


def build_groundings(mention:Mapping[str, Any]) -> Optional[str]:
	"""
	Builds a printable summary of the groundings attached to a mention.

	Only mentions whose 'type' is "TextBoundMention" are inspected. Among the
	mention's attachments, those that are lists are treated as grounding
	attachments: the first element of such a list is iterated as a sequence
	of grounding records, each read for its 'name', 'id' and 'score'.

	Returns:
		A comma-separated string of "(name, id, score)" entries, with the
		score formatted to two decimals, or None when the mention is not a
		TextBoundMention or carries no grounding.
	"""
	if mention['type'] != "TextBoundMention":
		return None

	groundings = []
	for attachment in mention['attachments']:
		# Grounding attachments are lists; any other attachment is skipped.
		# An empty list has no candidates, so skip it instead of failing on [0].
		if not isinstance(attachment, list) or not attachment:
			continue
		for candidate in attachment[0]:
			groundings.append(f"({candidate['name']}, {candidate['id']}, {candidate['score']:.2f})")

	return ', '.join(groundings) if groundings else None
			
	

In [18]:
def annotate_json(file_path:str, endpoint:str = "http://localhost:9000/cosmos_json_to_mentions", timeout:Optional[float] = None) -> Mapping[str, Any]:
    """
    Annotates an existing json file on the server with the text reading pipeline.

    Args:
        file_path: Path sent to the service under the 'pathToCosmosJson' key.
        endpoint: URL of the annotation service.
        timeout: Seconds to wait for the service. None (the default) waits
            indefinitely, which matches the previous behavior.

    Returns:
        The decoded JSON body of the response. Callers in this notebook
        unpack it as keyword arguments, so it is expected to be a mapping
        (NOTE(review): confirm the exact top-level keys against the service).

    Raises:
        requests.HTTPError: if the service answers with an error status.
    """

    response = requests.post(endpoint, json={'pathToCosmosJson': file_path}, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to decode an error page as data
    response.raise_for_status()

    return response.json()


def build_data_frame(mentions:List[Mapping[str, Any]], documents:Mapping[str, Any], include_arguments:bool = False) -> DataFrame:
    """
    Builds a data frame from the output of text reading.

    Args:
        mentions: Mention records; each is read for 'text', 'tokenInterval',
            'type', 'labels', plus whatever get_sentence and build_groundings read.
        documents: Documents mapping handed to get_sentence.
        include_arguments: When True, adds numbered `arg_name_<i>` /
            `arg_grounding_<i>` columns for each mention's 'arguments'.
            Defaults to False, which keeps the original column set.

    Returns:
        One row per mention. With no mentions, an empty frame that still
        carries the base columns.
    """
    base_columns = ['text', 'sentence', 'start_token', 'end_token', 'mention_type', 'label', 'grounding_id']

    def get_arguments_data(m):
        """ Flattens a mention's arguments into numbered name/grounding columns """
        args_data = dict()
        for ix, (arg_name, arg_values) in enumerate(m.get('arguments', {}).items(), start=1):
            args_data[f"arg_name_{ix}"] = arg_name
            # Only the first value of each argument is grounded; an empty value list has none
            args_data[f"arg_grounding_{ix}"] = build_groundings(arg_values[0]) if arg_values else None
        return args_data

    rows = []
    for mention in mentions:
        row = {
            'text': mention['text'],
            'sentence': get_sentence(mention, documents),
            'start_token': mention['tokenInterval']['start'],
            'end_token': mention['tokenInterval']['end'],
            'mention_type': mention['type'],
            'label': mention['labels'][0],
            'grounding_id': build_groundings(mention),
        }
        if include_arguments:
            row.update(get_arguments_data(mention))
        rows.append(row)

    if not rows:
        # Keep the schema stable so downstream column access works on empty input
        return DataFrame(columns=base_columns)

    return DataFrame(rows)

def text_reading_mentions(file_path:str) -> DataFrame:
    """
    Annotates the file at `file_path` and returns the resulting data frame.

    The response from annotate_json is unpacked as keyword arguments of
    build_data_frame.
    """
    annotations = annotate_json(file_path)
    return build_data_frame(**annotations)

In [None]:
def get_paper_by_doi(doi:str) -> Optional[str]:
	"""
	Stand-in for fetching a paper's Cosmos output by its DOI.

	Looks the DOI up in a hard-coded table and returns the matching Cosmos
	JSON path, or None when the DOI is not in the table.
	"""
	known_papers = dict()
	known_papers["10.1016/j.chaos.2021.110689"] = "/media/evo870/github/COSMOS-data/output_files/documents_5Febcovid19--COSMOS-data.json"

	if doi in known_papers:
		return known_papers[doi]
	return None

In [None]:
# Do not truncate long text values when the data frame is displayed.
pd.set_option('display.max_colwidth', None)

# Resolve the (mocked) Cosmos output path for the paper's DOI.
paper_path = get_paper_by_doi("10.1016/j.chaos.2021.110689")

# Annotate the file; the resulting data frame is the cell's displayed output.
text_reading_mentions(paper_path)

In [26]:
# Request annotations for the CHIME_SIR Cosmos output from the text reading service.
extractions = annotate_json("/media/evo870/github/COSMOS-data/output_files/CHIME_SIR/documents_5FebCHIME_SIR--COSMOS-data.json")

In [27]:
# Display the raw response returned by annotate_json.
extractions

{'mentions': [{'id': 'T:1975857200',
   'type': 'TextBoundMention',
   'text': 'model hasbeen',
   'labels': ['Phrase', 'Entity'],
   'tokenInterval': {'start': 1, 'end': 3},
   'characterStartOffset': 525,
   'characterEndOffset': 538,
   'sentence': 3,
   'document': '-647136105',
   'keep': True,
   'foundBy': 'simple-np',
   'attachments': [{'filename': 'CHIME_SIR.pdf',
     'pageNum': [2],
     'blockIdx': [1],
     'attType': 'MentionLocation'}]},
  {'id': 'T:2011592377',
   'type': 'TextBoundMention',
   'text': 'several epidemiologists',
   'labels': ['Phrase', 'Entity'],
   'tokenInterval': {'start': 5, 'end': 7},
   'characterStartOffset': 551,
   'characterEndOffset': 574,
   'sentence': 3,
   'document': '-647136105',
   'keep': True,
   'foundBy': 'simple-np',
   'attachments': [{'filename': 'CHIME_SIR.pdf',
     'pageNum': [2],
     'blockIdx': [1],
     'attType': 'MentionLocation'}]},
  {'id': 'T:-1487347468',
   'type': 'TextBoundMention',
   'text': 'PhD',
   'labels'

In [None]:
# Persist the raw extractions as JSON.
# NOTE(review): `extractions` was requested for the CHIME_SIR file, but the target
# here is named BUCKY.json — confirm this is the intended output file.
with open('/home/enrique/github/skema/skema/text_reading/mention_linking/data/extractions/BUCKY.json', 'w') as f:
	json.dump(extractions, f)

In [34]:
# Build the mentions data frame; the response's top-level keys are unpacked
# as build_data_frame's keyword arguments.
frame = build_data_frame(**extractions)

In [30]:
# Write the data frame to a CSV file (relative path, so it lands in the working directory).
frame.to_csv('chime_manual.csv')

In [23]:
# Sanity check: list the TextBoundMentions that carry more than one attachment.
[e for e in extractions['mentions'] if e['type'] == 'TextBoundMention' and len(e['attachments']) > 1]

[]