# MIT (TA1): From Paper and Code to annotated Petri Nets

#### Mike Cafarella, Chunwei Liu, Markos Markakis, Peter Chen

## 0. Preprocessing

In [2]:
import ast, json, requests, os
from IPython import display
from gpt_key import *
# Directory holding the per-paper "<title>_vars.txt" inputs. Defaults to the
# original author's checkout; set MITASKEM_PARAM_DIR to point elsewhere.
# os.path.join(..., "") guarantees the trailing separator that the later
# string concatenation PARAM + paper["title"] relies on.
PARAM = os.path.join(
    os.environ.get(
        "MITASKEM_PARAM_DIR",
        "/Users/chunwei/research/mitaskem/resources/xDD/params/",
    ),
    "",
)
# Base URL of the API; endpoint paths are appended to it directly, so it is
# normalized to end in exactly one slash. Override with MITASKEM_API_ROOT.
API_ROOT = os.environ.get("MITASKEM_API_ROOT", "http://localhost:8000/").rstrip("/") + "/"

In [3]:
from ensemble.ensemble import load_paper_info

# Load the list of paper records and work with the first one.
paper_index_path = "/Users/chunwei/research/mitaskem/resources/xDD/xdd_paper.json"
papers = load_paper_info(paper_index_path)
paper = papers[0]
paper

{'title': 'COVID-19 Vaccine Effectiveness by Product and Timing in New York State',
 'doi': '10.1101/2021.10.08.21264595',
 'url': 'https://www.medrxiv.org/content/10.1101/2021.10.08.21264595v1'}

#### We can run a local script to consolidate the "content" fields to get just the text of the paper:

## 1. Extracting variables and annotating them from the text and the DKG

#### Using our API (powered by GPT-3), we can extract variables from the paper alongside a list of possible definitions, and ground each of these variables to the MIRA DKG (thanks Harvard team!). If you're interested, the JSON format of our intermediate output can be found [here](https://github.com/mikecafarella/mitaskem/blob/main/JSONformat.md).

In [4]:
# Read the text prepared for this paper.
with open(PARAM + paper["title"] + "_vars.txt", "r") as f:
    text = f.read()

# The request runs after the `with` block, so the file is already closed
# during the (potentially slow) network call.
dct_extract = {"text": text, "gpt_key": GPT_KEY}
response = requests.post(API_ROOT + "annotation/find_text_vars/", params=dct_extract)
# Fail here on an HTTP error instead of letting an error body reach the
# parsing code in the following cells.
response.raise_for_status()
json_str = response.text

In [5]:
# json_str is JSON (the next cell parses it with json.loads), so use the JSON
# parser here too: ast.literal_eval rejects JSON's null/true/false literals.
json.loads(json_str)

[{'type': 'variable',
  'name': 'VE',
  'id': 'v0',
  'text_annotations': [' Vaccine Effectiveness', ' Vaccine Effectiveness'],
  'dkg_annotations': [['']]},
 {'type': 'variable',
  'name': 'Pfizer-BioNTech',
  'id': 'v1',
  'text_annotations': [' Vaccine Product',
   ' Vaccine primarily distributed to NYS'],
  'dkg_annotations': []},
 {'type': 'variable',
  'name': 'Moderna',
  'id': 'v2',
  'text_annotations': [' Vaccine Product', ' Vaccine product'],
  'dkg_annotations': [['ncit:C177124', 'Moderna'], ['vo:0004953', 'Moderna']]},
 {'type': 'variable',
  'name': 'Janssen',
  'id': 'v3',
  'text_annotations': [' Vaccine Product', ' Vaccine product'],
  'dkg_annotations': [['vo:0004932', 'Janssen Pharmaceutica'],
   ['ncit:C164191', 'Janssen Pharmaceuticals']]},
 {'type': 'variable',
  'name': 'Delta variant',
  'id': 'v4',
  'text_annotations': [' Variant of the virus',
   ' Percentage of the population that has the Delta variant of the virus.VE for cases'],
  'dkg_annotations': [['cid

In [6]:
# Tag every extracted variable with the title, DOI and URL of its source
# paper, then serialize the list back to a JSON string.
dkg_json = json.loads(json_str)
for variable in dkg_json:
    variable.update(title=paper["title"], doi=paper["doi"], url=paper["url"])
dkg_json_string = json.dumps(dkg_json)
dkg_json_string

'[{"type": "variable", "name": "VE", "id": "v0", "text_annotations": [" Vaccine Effectiveness", " Vaccine Effectiveness"], "dkg_annotations": [[""]], "title": "COVID-19 Vaccine Effectiveness by Product and Timing in New York State", "doi": "10.1101/2021.10.08.21264595", "url": "https://www.medrxiv.org/content/10.1101/2021.10.08.21264595v1"}, {"type": "variable", "name": "Pfizer-BioNTech", "id": "v1", "text_annotations": [" Vaccine Product", " Vaccine primarily distributed to NYS"], "dkg_annotations": [], "title": "COVID-19 Vaccine Effectiveness by Product and Timing in New York State", "doi": "10.1101/2021.10.08.21264595", "url": "https://www.medrxiv.org/content/10.1101/2021.10.08.21264595v1"}, {"type": "variable", "name": "Moderna", "id": "v2", "text_annotations": [" Vaccine Product", " Vaccine product"], "dkg_annotations": [["ncit:C177124", "Moderna"], ["vo:0004953", "Moderna"]], "title": "COVID-19 Vaccine Effectiveness by Product and Timing in New York State", "doi": "10.1101/2021.1

## 2. Adding annotations from dataset columns

#### Alongside the text, we might also have discovered a collection of datasets that we think might be relevant:

#### Let's collect just the column names into a single file:

In [7]:
# NOTE: `dir` shadows the builtin of the same name; the name is kept because
# the following cell reads the headers file back through it.
dir = "../../resources/dataset/ensemble/"
with open(os.path.join(dir, "headers.txt"), "w") as fw:
    for filename in os.listdir(dir):
        file = os.path.join(dir, filename)
        if os.path.isfile(file) and file.endswith(".csv"):
            # Read only the first line, and close each CSV as soon as it has
            # been read instead of leaving the handle open.
            with open(file, "r") as csv_file:
                header = csv_file.readline().rstrip("\r\n")
            # Always terminate the entry, so a CSV whose first line has no
            # trailing newline cannot run into the next file's entry.
            fw.write("{}:\t{}\n".format(filename, header))

In [8]:
# Read the collected headers back in and preview the first 419 characters.
headers_path = os.path.join(dir, "headers.txt")
with open(headers_path) as f:
    dataset_str = f.read()
print(dataset_str[:419])

usa-cases-deaths.csv:	date,location_key,new_confirmed,new_deceased,new_recovered,new_tested,cumulative_confirmed,cumulative_deceased,cumulative_recovered,cumulative_tested
hospitalization-rate-by-month.csv:	month,num_hosp_month,num_cases_month
age-stratified-by-month-hosp-cases-deaths.csv:	age_group,case_month,num_cases,num_hosp,num_death
vaccination-hazard-rates-age-month.csv:	Month,Age Group,Vaccinated Pop,Unvacci


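#### Now we can call our `annotation/link_datasets_to_vars` endpoint to map the variables discovered earlier to any matching dataset columns (GIGO warning: garbage in, garbage out, so the quality of the links depends on the quality of the extracted variables and column names):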

In [9]:
from connect import vars_dataset_connection

# Link the annotated variables to dataset columns; the call returns the
# updated JSON string together with a success flag.
connection_result = vars_dataset_connection(dkg_json_string, dataset_str, GPT_KEY)
json_str, success = connection_result
json_str

100%|██████████| 11/11 [00:00<00:00, 59226.37it/s]
100%|██████████| 36/36 [02:13<00:00,  3.71s/it]


'[{"type": "variable", "name": "VE", "id": "v0", "text_annotations": [" Vaccine Effectiveness", " Vaccine Effectiveness"], "dkg_annotations": [[""]], "title": "COVID-19 Vaccine Effectiveness by Product and Timing in New York State", "doi": "10.1101/2021.10.08.21264595", "url": "https://www.medrxiv.org/content/10.1101/2021.10.08.21264595v1", "data_annotations": [["usa-vaccinations.csv", "new_persons_vaccinated"], ["usa-vaccinations.csv", "cumulative_persons_vaccinated"]]}, {"type": "variable", "name": "Pfizer-BioNTech", "id": "v1", "text_annotations": [" Vaccine Product", " Vaccine primarily distributed to NYS"], "dkg_annotations": [], "title": "COVID-19 Vaccine Effectiveness by Product and Timing in New York State", "doi": "10.1101/2021.10.08.21264595", "url": "https://www.medrxiv.org/content/10.1101/2021.10.08.21264595v1", "data_annotations": [["usa-vaccinations.csv", "new_persons_vaccinated_pfizer"], ["usa-vaccinations.csv", "cumulative_persons_vaccinated_pfizer"]]}, {"type": "variab

In [10]:
# json_str is JSON (it is parsed with json.loads further down), so use the
# JSON parser here too: ast.literal_eval rejects null/true/false literals.
json.loads(json_str)

[{'type': 'variable',
  'name': 'VE',
  'id': 'v0',
  'text_annotations': [' Vaccine Effectiveness', ' Vaccine Effectiveness'],
  'dkg_annotations': [['']],
  'title': 'COVID-19 Vaccine Effectiveness by Product and Timing in New York State',
  'doi': '10.1101/2021.10.08.21264595',
  'url': 'https://www.medrxiv.org/content/10.1101/2021.10.08.21264595v1',
  'data_annotations': [['usa-vaccinations.csv', 'new_persons_vaccinated'],
   ['usa-vaccinations.csv', 'cumulative_persons_vaccinated']]},
 {'type': 'variable',
  'name': 'Pfizer-BioNTech',
  'id': 'v1',
  'text_annotations': [' Vaccine Product',
   ' Vaccine primarily distributed to NYS'],
  'dkg_annotations': [],
  'title': 'COVID-19 Vaccine Effectiveness by Product and Timing in New York State',
  'doi': '10.1101/2021.10.08.21264595',
  'url': 'https://www.medrxiv.org/content/10.1101/2021.10.08.21264595v1',
  'data_annotations': [['usa-vaccinations.csv',
    'new_persons_vaccinated_pfizer'],
   ['usa-vaccinations.csv', 'cumulative_pe

In [11]:
# Run the same extraction + dataset-linking pipeline on the second paper, then
# merge both papers' annotated variables into a single list.
paper = papers[1]
with open(PARAM + paper["title"] + "_vars.txt", "r") as f:
    text = f.read()
dct_extract = {"text": text, "gpt_key": GPT_KEY}
response1 = requests.post(API_ROOT + "annotation/find_text_vars/", params=dct_extract)
# Surface HTTP failures here rather than as a JSON parsing error below.
response1.raise_for_status()
json_str1 = response1.text

# Parse the variables just extracted for THIS paper (json_str1). Parsing
# json_str here would re-use the first paper's annotated variables and merely
# relabel them with the second paper's title/doi/url.
dkg_json1 = json.loads(json_str1)
for variable in dkg_json1:
    variable["title"] = paper["title"]
    variable["doi"] = paper["doi"]
    variable["url"] = paper["url"]
dkg_json_string1 = json.dumps(dkg_json1)
json_str1, success = vars_dataset_connection(dkg_json_string1, dataset_str, GPT_KEY)

# Combine the first paper's annotated variables (json_str) with the second
# paper's (json_str1).
data_json = json.loads(json_str)
data_json1 = json.loads(json_str1)
data_json.extend(data_json1)

100%|██████████| 11/11 [00:00<00:00, 26022.19it/s]
100%|██████████| 36/36 [02:22<00:00,  3.96s/it]


In [12]:
# Persist the combined annotations as pretty-printed, non-ASCII-escaped JSON.
with open('mit-extraction.json', 'w', encoding='utf-8') as json_file:
    json_file.write(json.dumps(data_json, ensure_ascii=False, indent=4))

## 3. Getting a Petri net (as a py-acset) from code

#### Let's now turn our attention to code. We have a Python function that describes the Bucky dynamics:

In [None]:
# Load the source file to analyse and echo it for the reader.
bucky_path = "../../demos/2023-03-19/bucky.py"
with open(bucky_path, "r") as f:
    code = f.read()
print(code)

#### Using calls to the public MIT API, we can get Petri net components (places, transitions, hypothesized arcs) from this piece of code.

In [None]:
# Request payload shared by the petri/* endpoint calls in the following cells.
dict_petri = {"code": code, "gpt_key": GPT_KEY}
places_response = requests.post(API_ROOT + "petri/get_places", params=dict_petri)
places = places_response.text
ast.literal_eval(places)

In [None]:
# Ask the API for the transitions found in the code and display the parsed result.
transitions_response = requests.post(API_ROOT + "petri/get_transitions", params=dict_petri)
transitions = transitions_response.text
ast.literal_eval(transitions)

In [None]:
# Ask the API for the arcs found in the code and display the parsed result.
arcs_response = requests.post(API_ROOT + "petri/get_arcs", params=dict_petri)
arcs = arcs_response.text
ast.literal_eval(arcs)

#### We can then convert these outputs into a py-acset (thanks to Justin Lieffers from Arizona for some of the conversion code and to Owen Lynch for the py-acset code!)

In [None]:
# Bundle the three raw response strings and request the py-acset built from them.
dict_acset = {
    "places_str": places,
    "transitions_str": transitions,
    "arcs_str": arcs,
}
pyacset_response = requests.post(API_ROOT + "petri/get_pyacset", params=dict_acset)
pyacset_str = pyacset_response.text

In [None]:
# Parse the py-acset response string into Python objects and display it.
pyacset_parsed = ast.literal_eval(pyacset_str)
pyacset_parsed

## 4. Linking the annotations to the py-acset and paper info [WIP]

#### Finally, we bring everything together: for every place and transition in the py-acset, let's map it to the annotations from earlier:

In [None]:
# Send the py-acset together with the variable annotations to the linking
# endpoint; "info_str" is passed as an empty string.
dct_link = {
    "pyacset_str": pyacset_str,
    "annotations_str": json_str,
    "info_str": "",
}
link_response = requests.post(API_ROOT + "annotation/link_annos_to_pyacset/", params=dct_link)
metadata_str = link_response.text
print(metadata_str)

In [None]:
# Parse the metadata response string into Python objects and display it.
metadata_parsed = ast.literal_eval(metadata_str)
metadata_parsed

#### Data in this format can be ingested, visualized and edited by TA4! Let's export the related files:

In [None]:
# Export the py-acset and its metadata, each re-serialized as JSON.
export_targets = (
    ("output-mit-pyacset.json", pyacset_str),
    ("output-mit-metadata.json", metadata_str),
)
for out_path, payload_str in export_targets:
    with open(out_path, "w+") as f:
        json.dump(ast.literal_eval(payload_str), f)

## 5. Interacting with the University of Arizona codepaths

#### The University of Arizona team can also produce an annotated py-acset as an output. We can integrate the two outputs by matching on the names of places and transitions, to get a more complete picture of the model. The metadata extracted by both teams can then be accessed by using the associated `uid` of each place/transition as a key into the metadata JSON file.