# MIT (TA1): From Paper and Code to annotated extraction

## 0. Preprocessing

In [15]:
# Standard-library helpers plus `requests`, which performs every HTTP call below.
import ast, json, requests, os
# NOTE(review): the star import presumably supplies GPT_KEY, which later cells use
# without defining it — confirm, and prefer `from gpt_key import GPT_KEY`.
from gpt_key import *
# Base URL that every endpoint path in this notebook is appended to.
API_ROOT = "http://localhost:8000/"

#### We can run a local script to consolidate the "content" fields to get just the text of the paper:

## 1. Extracting variables and annotating them from the text and the DKG

#### We extract variables from the paper alongside a list of possible definitions, and ground each of these variables to the MIRA DKG.

In [None]:
# Read the shortened Bucky paper text; the file is closed before the
# (potentially slow) network call is made.
with open("../../resources/models/Bucky/bucky_short.txt", "r") as f:
    text = f.read()

# `annotation/find_text_vars/` receives the paper text and the GPT key as query parameters.
dct_extract = {"text": text, "gpt_key": GPT_KEY}
text_vars_response = requests.post(API_ROOT + "annotation/find_text_vars/", params=dct_extract)
# Fail here on an HTTP error instead of handing an error body to later cells.
text_vars_response.raise_for_status()
# Raw response body; later cells parse it and forward it to another endpoint.
json_str = text_vars_response.text

In [None]:
# A JSON body may contain null/true/false, which ast.literal_eval rejects,
# so parse as JSON first and keep the literal parser as a fallback for
# replies that are Python literals rather than JSON.
try:
    text_vars = json.loads(json_str)
except json.JSONDecodeError:
    text_vars = ast.literal_eval(json_str)
text_vars

## 2. Adding annotations from dataset columns

#### Alongside the text, we might also have discovered a collection of datasets that we think might be relevant:

In [None]:
# Load the confirmed-cases CSV as raw text, then preview its first 150 characters.
confirmed_csv_path = "../../resources/dataset/covid_confirmed_usafacts.csv"
with open(confirmed_csv_path) as confirmed_csv:
    dataset_1 = confirmed_csv.read()
print(dataset_1[:150])

In [None]:
# Load the deaths CSV as raw text, then preview its first 150 characters.
deaths_csv_path = "../../resources/dataset/covid_deaths_usafacts.csv"
with open(deaths_csv_path) as deaths_csv:
    dataset_2 = deaths_csv.read()
print(dataset_2[:150])

#### Let's collect just the column names into a single file:

In [None]:
# NOTE: `dir` shadows the builtin, but the name is kept because the next cell reads it.
dir = "../../resources/dataset/"
# Write one "<filename>:\t<header row>" line per CSV found in the dataset directory.
with open(os.path.join(dir, "headers.txt"), "w") as fw:
    # Sorted so the header file has the same order on every run
    # (os.listdir returns entries in arbitrary order).
    for filename in sorted(os.listdir(dir)):
        file = os.path.join(dir, filename)
        if os.path.isfile(file) and file.endswith(".csv"):
            # Read only the first line, and close each CSV right after.
            with open(file, "r") as csv_file:
                header = csv_file.readline()
            # Terminate every entry explicitly so a header without a trailing
            # newline does not run into the next file's entry.
            fw.write("{}:\t{}\n".format(filename, header.rstrip("\r\n")))

In [None]:
# Read back the combined header file and preview its first 419 characters.
headers_path = os.path.join(dir, "headers.txt")
with open(headers_path) as headers_file:
    dataset_str = headers_file.read()
print(dataset_str[:419])

#### Now we can call our `annotation/link_datasets_to_vars` endpoint to map variables discovered earlier to any matching dataset columns (GIGO warning here):

In [None]:
# Send the extracted variables (`json_str`) and the dataset headers to the linking endpoint.
# NOTE: this cell overwrites `json_str` with the linked result, so re-running it
# without first re-running the extraction cell feeds the linked output back in.
dct_cols = {"json_str": json_str, "dataset_str": dataset_str, "gpt_key": GPT_KEY}
link_response = requests.post(API_ROOT + "annotation/link_datasets_to_vars/", params=dct_cols)
# Surface HTTP failures here rather than storing an error body in `json_str`.
link_response.raise_for_status()
json_str = link_response.text
json_str

In [None]:
# A JSON body may contain null/true/false, which ast.literal_eval rejects,
# so parse as JSON first and keep the literal parser as a fallback for
# replies that are Python literals rather than JSON.
try:
    linked_vars = json.loads(json_str)
except json.JSONDecodeError:
    linked_vars = ast.literal_eval(json_str)
linked_vars

## 3. Annotate table headers from the DKG

#### Let's collect column names from a csv file:

In [18]:
# Read the county-level CSV as one string and print it in full.
counties_csv_path = "../../resources/dataset/us-counties.csv"
with open(counties_csv_path, "r") as counties_csv:
    col_str = counties_csv.read()
print(col_str)

date,county,state,fips,cases,deaths
2020-01-21,Snohomish,Washington,53061,1,0
2020-01-22,Snohomish,Washington,53061,1,0
2020-01-23,Snohomish,Washington,53061,1,0
2020-01-24,Cook,Illinois,17031,1,0
2020-01-24,Snohomish,Washington,53061,1,0
2020-01-25,Orange,California,06059,1,0
2020-01-25,Cook,Illinois,17031,1,0


In [19]:
# Read the documentation text that accompanies the county-level CSV and print it.
counties_doc_path = "../../resources/dataset/us-counties_doc.txt"
with open(counties_doc_path, "r") as counties_doc:
    col_doc = counties_doc.read()
print(col_doc)

The data is the product of dozens of journalists working across several time zones to monitor news conferences, analyze data releases and seek clarification from public officials on how they categorize cases.

It is also a response to a fragmented American public health system in which overwhelmed public servants at the state, county and territorial level have sometimes struggled to report information accurately, consistently and speedily. On several occasions, officials have corrected information hours or days after first reporting it. At times, cases have disappeared from a local government database, or officials have moved a patient first identified in one state or county to another, often with no explanation. In those instances, which have become more common as the number of cases has grown, our team has made every effort to update the data to reflect the most current, accurate information while ensuring that every known case is counted.

When the information is available, we count

#### Now we can call our `annotation/link_dataset_col_to_dkg` endpoint to ground each of these column names to the MIRA DKG:

In [20]:
# Send the CSV text and its documentation to the column-grounding endpoint.
dct_cols_dkg = {"csv_str": col_str, "doc": col_doc, "gpt_key": GPT_KEY}
ground_response = requests.post(API_ROOT + "annotation/link_dataset_col_to_dkg/", params=dct_cols_dkg)
# Fail on an HTTP error instead of trying to parse an error body.
ground_response.raise_for_status()
ground_res = ground_response.text
# JSON null/true/false are rejected by ast.literal_eval, so try JSON first
# and fall back to the literal parser for Python-literal replies.
try:
    col_groundings = json.loads(ground_res)
except json.JSONDecodeError:
    col_groundings = ast.literal_eval(ground_res)
col_groundings

{'date': {'col_name': 'date',
  'concept': 'Date',
  'unit': 'YYYY-MM-DD',
  'description': 'The date when the cumulative number of confirmed cases and deaths were reported.',
  'dkg_groundings': [['apollosv:00000429', 'date'],
   ['oboinowl:date', 'date'],
   ['dc:date', 'Date'],
   ['geonames:2130188', 'Hakodate'],
   ['oboinowl:hasDate', 'has_date'],
   ['idocovid19:0001277', 'COVID-19 incidence', 'class'],
   ['ido:0000480', 'infection incidence', 'class'],
   ['idocovid19:0001283', 'SARS-CoV-2 incidence', 'class'],
   ['hp:0001402', 'Hepatocellular carcinoma', 'class'],
   ['oae:0000178', 'AE incidence rate', 'class'],
   ['orphanet.ordo:409966', 'point prevalence', 'class'],
   ['obcs:0000064', 'period prevalence', 'class'],
   ['cemo:weighted_prevalence', 'weighted prevalence', 'class'],
   ['idocovid19:0001272', 'COVID-19 prevalence', 'class'],
   ['ido:0000486', 'infection prevalence', 'class']]},
 'county': {'col_name': 'county',
  'concept': 'County',
  'unit': 'Text',
  'de

## 4. Getting a Petri net (as a py-acset) from code

#### Let's now turn our attention to code. We have a python function that describes the Bucky dynamics:

In [None]:
# Read the Bucky model source as text and print it.
bucky_code_path = "../../resources/models/Bucky/bucky.py"
with open(bucky_code_path, "r") as bucky_source:
    code = bucky_source.read()
print(code)

#### Using calls to the public MIT API, we can get Petri net components (places, transitions, hypothesized arcs) from this piece of code.

In [None]:
# Query parameters shared by the three petri/* extraction calls (this cell and the next two).
dict_petri = {"code": code, "gpt_key": GPT_KEY}
places_response = requests.post(API_ROOT + "petri/get_places", params=dict_petri)
# Fail on an HTTP error instead of trying to parse an error body.
places_response.raise_for_status()
# Kept as raw text because the py-acset conversion cell sends it back to the API.
places = places_response.text
# JSON null/true/false are rejected by ast.literal_eval, so try JSON first
# and fall back to the literal parser for Python-literal replies.
try:
    places_parsed = json.loads(places)
except json.JSONDecodeError:
    places_parsed = ast.literal_eval(places)
places_parsed

In [None]:
# Ask the API for the transitions found in the same code (reuses `dict_petri`).
transitions_response = requests.post(API_ROOT + "petri/get_transitions", params=dict_petri)
# Fail on an HTTP error instead of trying to parse an error body.
transitions_response.raise_for_status()
# Kept as raw text because the py-acset conversion cell sends it back to the API.
transitions = transitions_response.text
# JSON null/true/false are rejected by ast.literal_eval, so try JSON first
# and fall back to the literal parser for Python-literal replies.
try:
    transitions_parsed = json.loads(transitions)
except json.JSONDecodeError:
    transitions_parsed = ast.literal_eval(transitions)
transitions_parsed

In [None]:
# Ask the API for the arcs found in the same code (reuses `dict_petri`).
arcs_response = requests.post(API_ROOT + "petri/get_arcs", params=dict_petri)
# Fail on an HTTP error instead of trying to parse an error body.
arcs_response.raise_for_status()
# Kept as raw text because the py-acset conversion cell sends it back to the API.
arcs = arcs_response.text
# JSON null/true/false are rejected by ast.literal_eval, so try JSON first
# and fall back to the literal parser for Python-literal replies.
try:
    arcs_parsed = json.loads(arcs)
except json.JSONDecodeError:
    arcs_parsed = ast.literal_eval(arcs)
arcs_parsed

#### We can then convert these outputs into a py-acset (thanks to Justin Lieffers from Arizona for some of the conversion code and to Owen Lynch for the py-acset code!)

In [None]:
# Send the three raw response strings back to the API for conversion into a py-acset.
dict_acset = {"places_str": places, "transitions_str": transitions, "arcs_str": arcs}
pyacset_response = requests.post(API_ROOT + "petri/get_pyacset", params=dict_acset)
# Fail here on an HTTP error instead of storing an error body in `pyacset_str`.
pyacset_response.raise_for_status()
pyacset_str = pyacset_response.text

In [None]:
# A JSON body may contain null/true/false, which ast.literal_eval rejects,
# so parse as JSON first and keep the literal parser as a fallback for
# replies that are Python literals rather than JSON.
try:
    pyacset = json.loads(pyacset_str)
except json.JSONDecodeError:
    pyacset = ast.literal_eval(pyacset_str)
pyacset

## 5. MIT annotation end to end pipeline

#### Finally, we bring every annotation step together: for the original paper text, let's integrate the above extraction modules and output the MIT extraction:

In [4]:
# Upload the full paper text as a multipart file; the request stays inside the
# `with` block because `requests` reads from the open handle while sending.
with open('../../resources/models/Bucky/bucky.txt', 'rb') as f:
    files = {'file': ('filename', f)}
    params = {"gpt_key": GPT_KEY}
    response = requests.post(API_ROOT + "annotation/upload_file_extract/",  params=params,  files=files)
# Fail on an HTTP error instead of trying to parse an error body.
response.raise_for_status()
json_str = response.text
# JSON null/true/false are rejected by ast.literal_eval, so try JSON first
# and fall back to the literal parser for Python-literal replies.
try:
    mit_extraction = json.loads(json_str)
except json.JSONDecodeError:
    mit_extraction = ast.literal_eval(json_str)
mit_extraction


[{'type': 'variable',
  'name': 'S_i j',
  'id': 'v0',
  'text_annotations': [' Proportion of individuals who are susceptible to the virus'],
  'dkg_annotations': [['geonames:2479536', 'Skikda'],
   ['geonames:487495', 'Sterlitamak']],
  'title': 'd096ade60e503c888b756724e3818835__filename',
  'data_annotations': []},
 {'type': 'variable',
  'name': 'E_i j',
  'id': 'v1',
  'text_annotations': [' Proportion of individuals who have been exposed to the virus'],
  'dkg_annotations': [['geonames:1816670', 'Beijing'],
   ['geonames:1799491', 'Neijiang']],
  'title': 'd096ade60e503c888b756724e3818835__filename',
  'data_annotations': []},
 {'type': 'variable',
  'name': 'I_i, j^hosp',
  'id': 'v2',
  'text_annotations': [' Proportion of individuals that are exhibiting severe disease symptoms and are in need of hospitalization'],
  'dkg_annotations': [],
  'title': 'd096ade60e503c888b756724e3818835__filename',
  'data_annotations': [[[6,
     'usa-hospitalizations.csv',
     2,
     'new_hosp

## 6. Interacting with the University of Arizona extraction

#### With both the University of Arizona and MIT extractions, we first build the entity-matching mapping for all the extracted variable entities, and then integrate the two extractions together with the unified TA1 data model.

In [17]:
# Upload both extraction files to the integration endpoint and print the mapping it returns.
# The API key is deliberately NOT printed: cell output is saved with the notebook
# and would leak the secret to anyone who can read the file.
with open('../../resources/xDD/mit-extraction/bucky__mit-extraction_id.json', 'rb') as f_mit, open('../../resources/xDD/arizona-extraction/bucky_arizona_output_example.json', 'rb') as f_arizona:
    files = { 'mit_file': ('bucky__mit-extraction_id.json', f_mit, 'application/json'),
        'arizona_file': ('bucky_arizona_output_example.json', f_arizona, 'application/json')}
    params = {"gpt_key": GPT_KEY}
    response = requests.post(API_ROOT + "integration/get_mapping", params=params, files=files)
    # Fail on an HTTP error instead of printing an error body as if it were the mapping.
    response.raise_for_status()
    print(response.text)

[REDACTED: an API key was printed here — revoke and rotate that key]
{"attributes":[{"type":"anchored_extraction","amr_element_id":null,"payload":{"id":{"id":"R:190348269"},"names":[{"id":{"id":"T:-1709799622"},"name":"Bucky","extraction_source":{"page":0,"block":0,"char_start":738,"char_end":743,"document_reference":{"id":"buckymodel_webdocs.pdf"}},"provenance":{"method":"Skema TR Pipeline rules","timestamp":"2023-06-23T07:08:06.271183"}}],"descriptions":[{"id":{"id":"T:-486841659"},"source":"time","grounding":[{"grounding_text":"time since time scale zero","grounding_id":"apollosv:00000272","source":[],"score":0.8945620059967041,"provenance":{"method":"SKEMA-TR-Embedding","timestamp":"2023-06-23T07:08:06.276472"}}],"extraction_source":{"page":0,"block":0,"char_start":732,"char_end":736,"document_reference":{"id":"buckymodel_webdocs.pdf"}},"provenance":{"method":"Skema TR Pipeline rules","timestamp":"2023-06-23T07:08:06.271183"}}],"value_specs":[],"groundings":[],"data_columns":null}},{"type":"anchore