In [1]:
import requests
import json
import pandas as pd
from typing import Any, List, Mapping, Optional
from pandas import DataFrame

In [93]:
def get_sentence(mention:Mapping[str, Any], documents:Mapping[str, Any]):
	""" Rebuilds the text of the sentence in which the mention occurs

	The mention's 'document' field selects an entry of `documents`, and its
	'sentence' field indexes into that entry's 'sentences'. The words of the
	selected sentence are joined with single spaces.
	"""
	doc_key, sentence_pos = mention['document'], mention['sentence']
	words = documents[doc_key]['sentences'][sentence_pos]['words']

	return ' '.join(words)


def build_groundings(mention:Mapping[str, Any]) -> Optional[str]:
	""" Formats the groundings attached to a mention as a single string

	Only mentions whose 'type' is "TextBoundMention" are considered. Every
	attachment that is a non-empty list is treated as a grounding attachment:
	its first element is iterated as a collection of records, each providing
	'name', 'id' and 'score' fields.

	Returns the "(name, id, score)" entries joined by ", " with the score
	rendered with two decimals, or None when the mention is of another type,
	has no attachments, or none of its attachments carry groundings.
	"""
	if mention['type'] != "TextBoundMention":
		return None

	groundings = []
	# A missing or null 'attachments' field is treated as "no attachments"
	for attachment in mention.get('attachments') or ():
		# Only list-shaped attachments hold groundings; an empty list has no
		# first element to read, so it is skipped instead of raising IndexError
		if isinstance(attachment, list) and attachment:
			for g in attachment[0]:
				groundings.append(f"({g['name']}, {g['id']}, {g['score']:.2f})")

	return ', '.join(groundings) if groundings else None
			
	

In [95]:
def annotate_json(file_path:str, endpoint:str = "http://localhost:9000/cosmos_json_to_mentions", timeout:Optional[float] = None) -> Mapping[str, Any]:
	""" Annotates an existing json file on the server with the text reading pipeline

	Posts `file_path` to `endpoint` under the 'pathToCosmosJson' key and
	returns the decoded JSON body of the response.

	`timeout` is the number of seconds to wait for the server; the default of
	None waits indefinitely, as before.

	Raises requests.HTTPError when the server answers with an error status, so
	an error page is not mistaken for annotation output.
	"""

	response = requests.post(endpoint, json={'pathToCosmosJson': file_path}, timeout=timeout)
	# Fail here on 4xx/5xx instead of handing an error payload to the caller
	response.raise_for_status()

	return response.json()


def build_data_frame(mentions:List[Mapping[str, Any]], documents:Mapping[str, Any]) -> DataFrame:
	""" Builds a data frame from the output of text reading

	Produces one row per mention with the columns 'text', 'sentence',
	'start_token', 'end_token', 'mention_type', 'label' (the first entry of the
	mention's 'labels') and 'grounding_id' (the formatted groundings, or None).

	The column set is fixed, so an empty `mentions` list yields an empty frame
	that still has every column instead of a frame with no columns at all.
	"""
	columns = ['text', 'sentence', 'start_token', 'end_token', 'mention_type', 'label', 'grounding_id']

	records = [
		{
			'text': mention['text'],
			'sentence': get_sentence(mention, documents),
			'start_token': mention['tokenInterval']['start'],
			'end_token': mention['tokenInterval']['end'],
			'mention_type': mention['type'],
			'label': mention['labels'][0],
			'grounding_id': build_groundings(mention),
		}
		for mention in mentions
	]

	return DataFrame(records, columns=columns)

def text_reading_mentions(file_path:str) -> DataFrame:
	""" Runs the annotation for `file_path` and turns its output into a data frame

	The annotation result is unpacked as keyword arguments of
	`build_data_frame`, so its keys must match that function's parameters.
	"""
	annotations = annotate_json(file_path)

	return build_data_frame(**annotations)

In [3]:
def get_paper_by_doi(doi:str) -> Optional[str]:
	""" Mock lookup of the Cosmos output file for a paper, keyed by its DOI

	Returns the path registered for `doi`, or None when the DOI is unknown.
	"""
	known_papers = {
		"10.1016/j.chaos.2021.110689": "/media/evo870/github/COSMOS-data/output_files/documents_5Febcovid19--COSMOS-data.json"
	}

	if doi in known_papers:
		return known_papers[doi]
	return None

In [109]:
# Resolve the (mocked) Cosmos output path of the paper to annotate
paper_path = get_paper_by_doi(doi="10.1016/j.chaos.2021.110689")

# Do not truncate long column contents when the frame is rendered
pd.set_option('display.max_colwidth', None)

# Last expression of the cell, so the resulting data frame is displayed
text_reading_mentions(file_path=paper_path)

Unnamed: 0,text,sentence,start_token,end_token,mention_type,label,grounding_id
0,fractional dynamics,"Projections and fractional dynamics of COVID-19 with optimal control strategies * a , c Khondoker Nazmoon Nabi , Pushpendra Kumar b , Vedat Suat Erturk",2,4,TextBoundMention,Phrase,
1,Projections,"Projections and fractional dynamics of COVID-19 with optimal control strategies * a , c Khondoker Nazmoon Nabi , Pushpendra Kumar b , Vedat Suat Erturk",0,1,TextBoundMention,Phrase,
2,b,"Projections and fractional dynamics of COVID-19 with optimal control strategies * a , c Khondoker Nazmoon Nabi , Pushpendra Kumar b , Vedat Suat Erturk",20,21,TextBoundMention,Phrase,"(AIDSVAX B/B, vo:0000405, 0.90), (BoNT/B, vo:0010904, 0.79), (Enterovirus B, ncbitaxon:138949, 0.78), (Diphtheria-Tetanus-Pertussis-Hepatitis B vaccine, vo:0003172, 0.77), (Diphtheria-Tetanus-Pertussis-Haemophilus b vaccine, vo:0000942, 0.77)"
3,optimal control strategies,"Projections and fractional dynamics of COVID-19 with optimal control strategies * a , c Khondoker Nazmoon Nabi , Pushpendra Kumar b , Vedat Suat Erturk",7,10,TextBoundMention,Phrase,
4,COVID-19,"Projections and fractional dynamics of COVID-19 with optimal control strategies * a , c Khondoker Nazmoon Nabi , Pushpendra Kumar b , Vedat Suat Erturk",5,6,TextBoundMention,Phrase,"(COVID-19, doid:0080600, 0.86), (severe COVID-19, doid:0081013, 0.83), (critical COVID-19, doid:0081012, 0.82), (non-severe COVID-19, doid:0081014, 0.81), (COVID-19 vaccine, vo:0004908, 0.76)"
...,...,...,...,...,...,...,...
4069,2021,"Chaos , Solitons and Fractals 145 ( 2021 ) 110689",7,8,TextBoundMention,Phrase,
4070,Fractals 145,"Chaos , Solitons and Fractals 145 ( 2021 ) 110689",4,6,TextBoundMention,Phrase,
4071,110689,"Chaos , Solitons and Fractals 145 ( 2021 ) 110689",9,10,TextBoundMention,Value,
4072,145,"Chaos , Solitons and Fractals 145 ( 2021 ) 110689",5,6,TextBoundMention,Value,
