In [2]:
import json, requests, os
import io
# The OpenAI key is read from the environment so that it is never stored in
# the notebook itself; fail fast with a clear message when it is missing.
GPT_KEY = os.environ.get('OPENAI_API_KEY_MIT')
assert GPT_KEY, "Please set the OPENAI_API_KEY_MIT environment variable"
# Base URL of the local API server (the same host the request helpers in this
# notebook post to). Note the trailing slash.
API_ROOT = "http://localhost:8000/"

In [3]:
from pathlib import Path

def skema_integrated_pdf_extractions(pdf_path):
    """Return the integrated PDF extractions for ``pdf_path``, caching them on disk.

    The result is cached next to the PDF as ``<stem>.skema_integrated.json``.
    When that file already exists it is loaded and returned without any
    network traffic; otherwise the PDF is uploaded to
    ``https://api.askem.lum.ai/text-reading/integrated-pdf-extractions`` with
    both ``annotate_skema`` and ``annotate_mit`` enabled, and the JSON reply
    is written to the cache before being returned.

    Args:
        pdf_path: ``str`` or ``pathlib.Path`` pointing at the PDF. A leading
            ``~`` is expanded.

    Returns:
        The decoded JSON payload (from the cache or from the service).

    Raises:
        AssertionError: if the PDF does not exist.
        Exception: if the service answers with a non-200 status; the message
            carries the status code and response text.
    """
    pdf_path = Path(pdf_path).expanduser()
    assert pdf_path.exists(), f"PDF not found: {pdf_path}"

    output_path = pdf_path.with_suffix('.skema_integrated.json')
    if output_path.exists():
        with open(output_path) as f:
            return json.load(f)

    URL = "https://api.askem.lum.ai"

    params = {
        "annotate_skema": True,
        "annotate_mit": True,
    }

    # Keep the upload handle inside a context manager so it is closed as soon
    # as the request finishes, even when the request raises.
    with open(pdf_path, "rb") as pdf_file:
        files = [("pdfs", (str(pdf_path), pdf_file))]
        response = requests.post(
            f"{URL}/text-reading/integrated-pdf-extractions",
            params=params,
            files=files,
        )

    if response.status_code != 200:
        raise Exception(f'{response.status_code=} {response.text=}')

    data = response.json()
    with open(output_path, 'w') as f:
        json.dump(data, f, indent=2)
    return data

In [4]:
# Re-import the annotation router module so that edits made to its source on
# disk are picked up without restarting the kernel.
import importlib
import mitaskem.api.routers.annotation
importlib.reload(mitaskem.api.routers.annotation)

* 'schema_extra' has been renamed to 'json_schema_extra'


<module 'mitaskem.api.routers.annotation' from '/Users/orm/repos/mitaskem/mitaskem/api/routers/annotation.py'>

In [58]:
# Run the extraction + scenario listing over every PDF in ~/example_data and
# print the resulting table (or the error) per file.
from pathlib import Path
example_data = Path("~/example_data/").expanduser()
for file in example_data.iterdir():
    if file.suffix == ".pdf":
        print(file)
        try:
            # Cached on disk after the first run, so re-running is cheap.
            data = skema_integrated_pdf_extractions(file)
            # NOTE(review): `list_scenarios_local` is not defined or imported in
            # any visible cell -- presumably it comes from
            # mitaskem.api.routers.annotation; confirm, and import it
            # explicitly so this cell survives Restart & Run All.
            df = list_scenarios_local(None, extractions=data, return_early=True)
            print(df)
        except Exception as e:
            # Best effort: report the failure and continue with the next PDF.
            print(e)


/Users/orm/example_data/Patty-reviewed-1-28-24-Why is it difficult to accurately predict the COVID-19 epidemic (1).pdf
    location             id name  value  \
56     Wuhan   E:-999142283    R    2.0   
58     Wuhan  E:-1416661983    R    2.0   
108      AIC   E:1393547595    N   40.0   
109      AIC  E:-2085445646    N   40.0   

                                               passage  
56   At the time of this manuscript , the consensus...  
58   At the time of this manuscript , the consensus...  
108  This definition should be used when K < N = 40...  
109  This definition should be used when K < N = 40...  
/Users/orm/example_data/Patty-reviewed-1-28-24-bertozzi-et-al-2020-the-challenges-of-modeling-and-forecasting-the-spread-of-covid-19.pdf
                                              location             id  \
36                                           Arlington    R:889375648   
37                                         Los Angeles    R:889375648   
38                      

"Scenarios for the impact of short-term social distancing : fraction of population vs. date .
( Left ) California SIR model based on mortality data with parameters from Table 1 ( R0 = 2.7 , gamma = .12 , I0 = .1 ) under two scenarios : R0 constant in time ( light blue ) and R0 cut in half from 27 March ( 1 wk from the start of the California shutdown ) to 5 May but then returned to its original value , to represent a short-term distancing strategy ( dark blue ) .
( Right ) New York SIR model with parameters from Table 1 ( R0 = 4.1 , gamma = .1 , I0 = 05 ) under the same two scenarios but with short-term distancing occurring over the dates of 30 March ( 1 wk from the start of the New York shutdown ) to 5 May ."

[
{'location':'California', 'variable':'R0', 'value':2.7 },
{'location':'California', 'variable':'gamma', 'value':.12},
{'location':'California', 'variable':'I0', 'value':.1},

{'location':'California', 'variable':'R0', 'value':1.35 },
{'location':'California', 'variable':'gamma', 'value':.12},
{'location':'California', 'variable':'I0', 'value':.1},

{'location':'New York', 'variable':'R0', 'value':4.1 },
{'location':'New York', 'variable':'gamma', 'value':.1},
{'location':'New York', 'variable':'I0', 'value':.05},

{'location':'New York', 'variable':'R0', 'value':2.05 },
{'location':'New York', 'variable':'gamma', 'value':.1},
{'location':'New York', 'variable':'I0', 'value':.05},

]



In [5]:
def scenario_extraction(integrated_json):
    """Send integrated extractions to the local ``list_scenarios`` endpoint.

    ``integrated_json`` is serialised to JSON and uploaded as the
    ``extractions_file`` multipart field of a POST to
    ``<API_ROOT>annotation/list_scenarios/``; the OpenAI key is passed as the
    ``gpt_key`` query parameter.

    Args:
        integrated_json: JSON-serialisable extractions, e.g. the value
            returned by ``skema_integrated_pdf_extractions``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: if the service answers with a non-200 status; the message
            carries the status code and response text.
    """
    params = {
        "gpt_key": GPT_KEY,
    }

    # Serialise the *argument*. The previous version read a notebook-global
    # named `data`, which fails on a fresh kernel and otherwise silently
    # uploads whatever that global happened to hold.
    payload = io.BytesIO(json.dumps(integrated_json).encode('utf-8'))
    files = [("extractions_file", ('', payload))]

    # Build the URL from the shared API_ROOT constant instead of repeating
    # the host; rstrip guards against a doubled slash.
    url = f"{API_ROOT.rstrip('/')}/annotation/list_scenarios/"
    response = requests.post(url, params=params, files=files)
    if response.status_code == 200:
        return response.json()
    raise Exception(f'{response.status_code=} {response.text=}')

In [11]:
# Reviewed example PDFs. They are resolved against ~/example_data (the same
# directory the batch cell iterates over) instead of a hard-coded
# /Users/<name> prefix, so the notebook runs for any user who has the files.
# Entries stay plain strings, as before.
EXAMPLE_DATA_DIR = Path("~/example_data").expanduser()
paths = [
    str(EXAMPLE_DATA_DIR / pdf_name)
    for pdf_name in (
        'Patty-reviewed-1-28-24-Why is it difficult to accurately predict the COVID-19 epidemic (1).pdf',
        'Patty-reviewed-1-28-24-bertozzi-et-al-2020-the-challenges-of-modeling-and-forecasting-the-spread-of-covid-19.pdf',
        'Patty-reviewed-1-28-24-Mathematical modeling of COVID-19 transmission dynamics with a case study of Wuhan.pdf',
    )
]


In [15]:
# Sanity check: print whether each example PDF is present on disk before
# spending time on extraction.
for path in paths:
    print(Path(path).exists())

# NOTE(review): pandas is imported mid-notebook here; moving it to the import
# cell at the top would make Restart & Run All less fragile.
import pandas as pd

True
True
True


In [27]:
# For each example PDF: fetch (or load cached) integrated extractions, ask
# the local service for scenarios, and print them as a table.
for path in paths:
    data = skema_integrated_pdf_extractions(path)
    extr  = scenario_extraction(data)
    print(pd.DataFrame(extr))

                         varname value    geo
0  basic reproduction number R 0     2  Wuhan
1                              K   < N   None
2                              N    40   None
3                              K   > N   None
                  varname  value            geo
0                 R ( t )   1.00          China
1                 R ( t )   1.00          Italy
2     reproduction number   2.50  United States
3   B ( CID :88 ) R ( T )   3.00             IK
4                       K   1.00           None
5                      R0   2.70     California
6                   gamma   0.12     California
7                      I0   0.10     California
8                      R0   4.10       New York
9                   gamma   0.10       New York
10                     I0   0.05       New York
      varname                                      value   geo
0   L C ( 0 )  number of confirmed cases 04 January 2020  None
1  L C ( 65 )    number of confirmed cases 09 March 2020  None
2   L

In [80]:
def file_extract_enhanced(*, text : str = None, path : str = None):
    """Upload text or a file to the local ``upload_file_extract_enhanced`` endpoint.

    Exactly one source is uploaded as the ``file`` multipart field of a POST
    to ``<API_ROOT>annotation/upload_file_extract_enhanced``. ``text`` takes
    precedence when it is a string; otherwise ``path`` is opened in binary
    mode. The OpenAI key is passed as the ``gpt_key`` query parameter.

    Args:
        text: Raw text to upload (UTF-8 encoded before sending).
        path: ``str`` or ``pathlib.Path`` of a file to upload; used only when
            ``text`` is not a string.

    Returns:
        The decoded JSON body of a 200 response.

    Raises:
        AssertionError: if neither a string ``text`` nor a ``str``/``Path``
            ``path`` is supplied.
        Exception: for any non-200 response. The message carries the status
            code plus the response body (pretty-printed when it is JSON, raw
            text otherwise).
    """
    params = {
        "gpt_key": GPT_KEY,
    }

    if isinstance(text, str):
        upload = io.BytesIO(text.encode('utf-8'))
    elif isinstance(path, (str, Path)):
        upload = open(path, 'rb')
    else:
        # Raised explicitly (rather than `assert False`) so the check also
        # fires under `python -O`; the exception type is unchanged.
        raise AssertionError(
            "file_extract_enhanced needs `text` (str) or `path` (str or Path)"
        )

    # Both BytesIO and real files are context managers; closing here fixes
    # the handle leak of the file-based branches.
    with upload:
        files = [("file", ('', upload))]
        url = f"{API_ROOT.rstrip('/')}/annotation/upload_file_extract_enhanced"
        response = requests.post(url, params=params, files=files)

    if response.status_code == 200:
        return response.json()
    if response.status_code == 500:
        raise Exception(f'{response.status_code=} {str(response.text)=}')

    # Other error statuses usually carry a JSON detail body, but not always
    # (e.g. a proxy's HTML 404). Fall back to the raw text so the real HTTP
    # error is reported instead of a JSON decoding failure.
    try:
        detail = json.dumps(response.json(), indent=2)
    except ValueError:
        detail = response.text
    raise Exception(f'{response.status_code=} {detail}')

In [81]:
# Smoke test with a tiny inline sentence that names three variables and their
# values; the response is inspected in the following cells.
res = file_extract_enhanced(text="the model has variables v0, v1 and v2, with values .1, .2, and .3 respectively")

In [82]:
# Display the raw response for inspection.
res

{'attributes': [{'type': 'anchored_entity',
   'amr_element_id': None,
   'payload': {'id': {'id': 'mit0'},
    'mentions': [{'id': {'id': 'mit0'},
      'name': 'v0',
      'extraction_source': None,
      'provenance': {'method': 'MIT extractor V1.0 - text, dataset, formula annotation (chunwei@mit.edu)',
       'timestamp': '2024-02-28T18:02:22.687396'}}],
    'text_descriptions': [{'id': {'id': 'mit0'},
      'description': 'model variable v0',
      'grounding': None,
      'extraction_source': None,
      'provenance': {'method': 'MIT extractor V1.0 - text, dataset, formula annotation (chunwei@mit.edu)',
       'timestamp': '2024-02-28T18:02:22.687396'}}],
    'value_descriptions': [{'id': {'id': 'mit0-value'},
      'value': {'amount': '0.1', 'grounding': None, 'extraction_source': None},
      'units': None,
      'type': None,
      'bounds': None,
      'provenance': {'method': 'MIT extractor V1.0 - text, dataset, formula annotation (chunwei@mit.edu)',
       'timestamp': '202

In [72]:
import jmespath as jp

In [77]:
# Keep only attributes of type 'anchored_entity' and pull the amount of each
# one's first value description.
jp.search("attributes[? type == 'anchored_entity'].payload.value_descriptions[0].value.amount",res)

['0.1', '0.2', '0.3']