In [1]:
import json
import pandas as pd
from pathlib import Path
from askem_extractions.data_model import AttributeCollection

* 'schema_extra' has been renamed to 'json_schema_extra'


In [2]:
# Read the linked AMR.
# JSON is UTF-8 by specification, so the encoding is passed explicitly: relying on the
# platform default can mis-decode non-ASCII symbols on systems whose locale is not UTF-8.
with (Path("data") / "linked_seirhd_eval_sc1_latex_amr.json").open(encoding="utf-8") as f:
    amr = json.load(f)

# Parsing the metadata does not need the file handle, so it happens after the file is closed.
metadata_extractions = AttributeCollection.from_json(amr['metadata'])

In [3]:
from askem_extractions.data_model import AttributeType

# Report the overall number of extractions, then keep only those that were
# linked to an AMR element (i.e. carry a non-null amr_element_id).
print(f"Total metadata extractions: {len(metadata_extractions.attributes)}")
linked_metadata = list(
    filter(
        lambda attribute: attribute.amr_element_id is not None,
        metadata_extractions.attributes,
    )
)
print(f"Linked metadata extractions: {len(linked_metadata)}")

Total metadata extractions: 4070
Linked metadata extractions: 51


In [43]:
# Index the AMR elements by their 'id'
from collections import defaultdict

# Each id maps to the list of AMR entries that declare it.
states, params = defaultdict(list), defaultdict(list)

# States are listed under the model section.
for state_entry in amr['model']['states']:
    state_id = state_entry['id']
    states[state_id].append(state_entry)

# Parameters are listed under the ODE semantics.
for param_entry in amr['semantics']['ode']['parameters']:
    param_id = param_entry['id']
    params[param_id].append(param_entry)

# Convert to a plain dict so the printed repr omits the defaultdict wrapper.
print(dict(params))

{'D0': [{'id': 'D0', 'name': 'D0', 'description': 'The total D population at timestep 0'}], 'E0': [{'id': 'E0', 'name': 'E0', 'description': 'The total E population at timestep 0'}], 'H0': [{'id': 'H0', 'name': 'H0', 'description': 'The total H population at timestep 0'}], 'I0': [{'id': 'I0', 'name': 'I0', 'description': 'The total I population at timestep 0'}], 'N': [{'id': 'N', 'name': 'N', 'description': 'N rate'}], 'R0': [{'id': 'R0', 'name': 'R0', 'description': 'The total R population at timestep 0'}], 'S0': [{'id': 'S0', 'name': 'S0', 'description': 'The total S population at timestep 0'}], 'p_{HD}': [{'id': 'p_{HD}', 'name': 'p_{HD}', 'description': 'p_{HD} rate'}], 'p_{HR}': [{'id': 'p_{HR}', 'name': 'p_{HR}', 'description': 'p_{HR} rate'}], 'p_{IH}': [{'id': 'p_{IH}', 'name': 'p_{IH}', 'description': 'p_{IH} rate'}], 'p_{IR}': [{'id': 'p_{IR}', 'name': 'p_{IR}', 'description': 'p_{IR} rate'}], 'r_{EI}': [{'id': 'r_{EI}', 'name': 'r_{EI}', 'description': 'r_{EI} rate'}], 'r_{H

In [137]:
# Pair every linked extraction with the AMR state(s) and parameter(s) whose id it references.
# Each entry is a (kind, element name, extraction) tuple.
#
# `.get` is used instead of `[...]`: indexing a defaultdict inserts an empty list for every
# id that is missing, which would silently add bogus keys to `states` and `params`.
raw_rows = list()
for extraction in linked_metadata:
    key = extraction.amr_element_id
    for element in states.get(key, ()):
        raw_rows.append(("state", element['name'], extraction))
    for element in params.get(key, ()):
        raw_rows.append(("parameter", element['name'], extraction))

print(len(raw_rows))

51


In [138]:
# Select the attributes that describe a scenario context
scontext = list(
    filter(
        lambda m: m.type == AttributeType.scenario_context,
        metadata_extractions.attributes,
    )
)

# Map the id of each referenced extraction to every scenario-context payload that cites it
context_index = defaultdict(list)
for context_attr in scontext:
    context_payload = context_attr.payload
    for reference in context_payload.extractions:
        context_index[reference.id].append(context_payload)

context_index

defaultdict(list,
            {'R:-1621708045': [ScenarioContext(id=ID(id='1452381753295979931'), extractions=[ID(id='R:-1621708045')], location=LocationContext(location='Moderna', provenance=Provenance(method='SKEMA-TR-Context-1.0', timestamp=datetime.datetime(2023, 7, 18, 16, 20, 11, 947767)), grounding=None, extraction_source=None), time=None),
              ScenarioContext(id=ID(id='-2004954064515446602'), extractions=[ID(id='R:-1621708045')], location=None, time=TemporalContext(datetime='July 2021', start_datetime=None, end_datetime=None, provenance=Provenance(method='SKEMA-TR-Context-1.0', timestamp=datetime.datetime(2023, 7, 18, 16, 20, 11, 947788)), grounding=None)),
              ScenarioContext(id=ID(id='-2004954064515446602'), extractions=[ID(id='R:-1621708045')], location=None, time=TemporalContext(datetime='September 2021', start_datetime=None, end_datetime=None, provenance=Provenance(method='SKEMA-TR-Context-1.0', timestamp=datetime.datetime(2023, 7, 18, 16, 20, 11, 94780

In [139]:
from askem_extractions.data_model import LocationContext, TemporalContext

# Flatten the linked extractions into one record per (AMR element, name, description)
# and attach the scenario context for SKEMA extractions.
rows = list()
for type_, element, extraction in raw_rows:
    extraction = extraction.payload
    # zip stops at the shorter list, so names without a matching description are skipped.
    for name, desc in zip(extraction.names, extraction.descriptions):

        # NOTE(review): the substring match is case-sensitive; confirm that the SKEMA
        # provenance method for names is always spelled "Skema".
        pipeline = "SKEMA" if "Skema" in name.provenance.method else "MIT"
        if pipeline == "SKEMA":
            id_ = extraction.id.id
            ctx_parts = list()
            # .get avoids inserting an empty entry into the defaultdict for ids
            # that have no scenario context.
            for c in context_index.get(id_, ()):
                # Independent checks: a context carrying both a location and a time
                # contributes both (an elif here would drop the time).
                if c.location is not None:
                    ctx_parts.append(f"Location: {c.location.location}")
                if c.time is not None:
                    ctx_parts.append(f"Time: {c.time.datetime}")
            ctx = ", ".join(ctx_parts)
        else:
            # Scenario context is only looked up for SKEMA extractions.
            ctx = ""

        rows.append({
            "Type": type_,
            "AMR Elem": element,
            "Metadata Name": name.name,
            "Metadata Description": desc.source,
            "Pipeline": pipeline,
            "SKEMA Values": ", ".join(v.value.source for v in extraction.value_specs),
            "Scenario Context": ctx
        })


# Drop exact duplicate records and order by AMR element for display.
frame = pd.DataFrame(rows).drop_duplicates().sort_values("AMR Elem")
frame

Unnamed: 0,Type,AMR Elem,Metadata Name,Metadata Description,Pipeline,SKEMA Values,Scenario Context
0,state,E,E,Exposed,SKEMA,,Location: Wolfel
3,state,E,E,Exposed,SKEMA,λIS,
4,state,E,S0,Susceptible population,MIT,λIS,
31,state,E,E,Exposed,MIT,,
30,state,E,AZ,Tempe,SKEMA,,"Location: Arizona State University, Location: ..."
20,state,E,E,exposed,SKEMA,,
17,state,E,E,Exposed population,MIT,"1, 0",
16,state,E,)S,Number of exposed human population,SKEMA,"1, 0",
2,state,I,I,Infectious,SKEMA,,Location: Wolfel
6,state,I,I,Infectious,SKEMA,kE δI,


In [167]:
# Aggregate for nicer experience: one row per metadata name.

g = frame.groupby('Metadata Name').agg({
    # A name can be linked to several AMR elements. Take the smallest one instead of an
    # arbitrary set member: set iteration order for strings depends on the per-process
    # hash seed, so the previous choice could change between kernel restarts.
    "AMR Elem": lambda d: min(d),
    "Metadata Description": lambda s: set(s),
    "SKEMA Values": lambda s: set(s),
    # Empty context strings carry no information, so leave them out of the set.
    "Scenario Context": lambda s: set(x for x in s if x)
}).reset_index()

# 'counts' is the number of distinct descriptions seen for each metadata name.
g['counts'] = g['Metadata Description'].map(len)
g = g.sort_values('counts', ascending=False)
g

Unnamed: 0,Metadata Name,AMR Elem,Metadata Description,SKEMA Values,Scenario Context,counts
4,E,E,"{Exposed, exposed, Exposed population}","{, 1, 0, λIS}",{Location: Wolfel},3
10,R,R,"{Recovered population, Recovered}","{, 85-90, 0}",{Location: Wolfel},2
11,S,S,"{Number of susceptible human population, Susce...","{, Λ/µ}",{},2
0,)S,E,{Number of exposed human population},"{1, 0}",{},1
14,USA,S,{Tempe},{},"{Location: Arizona State University, Location:...",1
22,βS,β,{inoculum release rate of symptomatic infectio...,{1},{},1
21,βA,β,{inoculum release rate of asymptomatic infecti...,{2.71},{},1
20,phis,S,{are hospitalized at the rate phis while the i...,{},{},1
19,passage of time,R,{Number of recovered human population},"{85-90, 0}",{},1
18,k1,β,{Arbitrary constant},{},{},1


In [170]:
def print_row(r):
    """Print a human-readable summary of one aggregated metadata row.

    Args:
        r: Row supporting key access (e.g. a pandas Series or a dict) with the keys
            'Metadata Name', 'AMR Elem', 'counts' and the collections
            'Metadata Description', 'SKEMA Values' and 'Scenario Context'.

    Items of each collection are printed in sorted order so that the output does
    not depend on set iteration order. Empty values and contexts are skipped;
    descriptions are printed as they are.
    """
    def print_items(items, skip_empty):
        # Sort on the string form so mixed or non-string entries cannot break the ordering.
        for item in sorted(items, key=str):
            if skip_empty and not item:
                continue
            print(f"\t\t-{item}")

    print(f"Metadata Name: \"{r['Metadata Name']}\" - Num extractions: {r['counts']}")
    print(f"AMR Element: {r['AMR Elem']}")
    print("\tDescriptions:")
    print_items(r['Metadata Description'], skip_empty=False)
    print("\tValues:")
    print_items(r['SKEMA Values'], skip_empty=True)
    print("\tScenario Context:")
    print_items(r['Scenario Context'], skip_empty=True)

# Try the formatter on the first row of g.
print_row(g.iloc[0])

Metadata Name: "E" - Num extractions: 3
AMR Element: E
	Descriptions:
		-Exposed
		-exposed
		-Exposed population
	Values:
		-1, 0
		-λIS
	Scenario Context:
		-Location: Wolfel


In [172]:
# Print the summary of every aggregated row, separated by blank lines
for _label, row in g.iterrows():
    print_row(row)
    print()

Metadata Name: "E" - Num extractions: 3
AMR Element: E
	Descriptions:
		-Exposed
		-exposed
		-Exposed population
	Values:
		-1, 0
		-λIS
	Scenario Context:
		-Location: Wolfel

Metadata Name: "R" - Num extractions: 2
AMR Element: R
	Descriptions:
		-Recovered population
		-Recovered
	Values:
		-85-90, 0
	Scenario Context:
		-Location: Wolfel

Metadata Name: "S" - Num extractions: 2
AMR Element: S
	Descriptions:
		-Number of susceptible human population
		-Susceptible
	Values:
		-Λ/µ
	Scenario Context:

Metadata Name: ")S" - Num extractions: 1
AMR Element: E
	Descriptions:
		-Number of exposed human population
	Values:
		-1, 0
	Scenario Context:

Metadata Name: "USA" - Num extractions: 1
AMR Element: S
	Descriptions:
		-Tempe
	Values:
	Scenario Context:
		-Location: Arizona State University, Location: Statistical Sciences, Time: 16 April 2020, Time: 6 April 2020

Metadata Name: "βS" - Num extractions: 1
AMR Element: β
	Descriptions:
		-inoculum release rate of symptomatic infectious in