# MIT (TA1): Data and model cards


## 0. Preprocessing

In [7]:
import ast, json, requests, os
from gpt_key import *
# Base URL of the extraction service; the endpoint paths used in the cells
# below are appended to this root.
API_ROOT = "http://3.83.68.208/"
# Alternative root, presumably for a locally running server: "http://localhost:8000/"

## 1. MIT annotation end to end pipeline

#### This step brings every annotation step together: given the original paper text, it runs all the extraction modules and outputs the MIT extraction:
[http://3.83.68.208/#/Paper-2-annotated-vars/upload_file_annotate_annotation_upload_file_extract__post](http://3.83.68.208/#/Paper-2-annotated-vars/upload_file_annotate_annotation_upload_file_extract__post)

In [10]:
# Input: the paper as plain text (assumed to come from the Arizona team).
scenario_1_paper = "sidarthe.txt"

# Upload the paper to the end-to-end extraction endpoint and keep the raw
# response body as text.
with open(scenario_1_paper, 'rb') as paper_file:
    params = {"gpt_key": GPT_KEY}
    files = {'file': ('filename', paper_file)}
    response = requests.post(
        API_ROOT + "annotation/upload_file_extract/",
        files=files,
        params=params,
    )
    json_str = response.text

# Show the extraction exactly as the service returned it.
print(json_str)


{"attributes":[{"type":"anchored_extraction","amr_element_id":null,"payload":{"id":{"id":"v0"},"names":[{"id":{"id":"v0"},"name":"SIDARTHE","extraction_source":null,"provenance":{"method":"MIT extractor V1.0 - text, dataset, formula annotation (chunwei@mit.edu)","timestamp":"2023-08-15T14:32:17.234476"}}],"descriptions":[{"id":{"id":"v0"},"source":"Model for predicting the course of the COVID-19 epidemic","grounding":null,"extraction_source":null,"provenance":{"method":"MIT extractor V1.0 - text, dataset, formula annotation (chunwei@mit.edu)","timestamp":"2023-08-15T14:32:17.234476"}}],"value_specs":null,"groundings":[{"grounding_text":"COVID-19 epidemic","grounding_id":"cido:0000154","source":[],"score":1.0,"provenance":{"method":"MIT extractor V1.0 - text, dataset, formula annotation (chunwei@mit.edu)","timestamp":"2023-08-15T14:32:17.234476"}},{"grounding_text":"COVID-19 disease course","grounding_id":"idocovid19:0001143","source":[],"score":1.0,"provenance":{"method":"MIT extractor

## 2. Get model card

#### Model cards provide general information about the model.
[http://3.83.68.208/#/Data-and-model-cards/get_model_card_cards_get_model_card_post](http://3.83.68.208/#/Data-and-model-cards/get_model_card_cards_get_model_card_post)

In [6]:
# Inputs for the model card: the paper text and the accompanying model code.
text_name = "sidarthe.txt"
code_name = "sidarthe-code.txt"

# Upload both files to the model-card endpoint.
with open(text_name, 'rb') as f_text, open(code_name, 'rb') as f_code:
    files = {'text_file': ('filename', f_text), 'code_file': ('filename', f_code)}
    params = {"gpt_key": GPT_KEY}
    response = requests.post(API_ROOT + "cards/get_model_card", params=params, files=files)

# Parse the body with the JSON parser instead of ast.literal_eval: JSON
# literals such as null/true/false are not Python literals, so literal_eval
# raises ValueError as soon as the response contains one of them.
# Left as the last expression so the notebook displays the parsed card.
json.loads(response.text)

{'DESCRIPTION': 'Modelling the COVID-19 epidemic and implementation of population-wide interventions in Italy',
 'AUTHOR_INST': 'University of Trento, Trento, Italy',
 'AUTHOR_AUTHOR': 'Giulia Giordano, Franco Blanchini, Raffaele Bruno, Patrizio Colaneri, Alessandro Di Filippo, Angela Di Matteo, Marta Colaneri',
 'AUTHOR_EMAIL': 'giulia.giordano@unitn.it',
 'DATE': 'UNKNOWN',
 'SCHEMA': 'The model considers eight stages of infection: susceptible (S), infected (I), diagnosed (D), ailing (A), recognized (R), threatened (T), healed (H), and extinct (E), collectively termed SIDARTHE.',
 'PROVENANCE': 'The model was trained using data from the COVID-19 epidemic in Italy.',
 'DATASET': 'UNKNOWN',
 'COMPLEXITY': 'UNKNOWN',
 'USAGE': 'The model can be used to predict the course of the COVID-19 epidemic and evaluate the impact of different control strategies.',
 'LICENSE': 'UNKNOWN'}

## 3. Get data card

####  TODO (we need to find the corresponding dataset)
Data cards summarize general information about the dataset.
[http://3.83.68.208/#/Data-and-model-cards/get_data_card_cards_get_data_card_post](http://3.83.68.208/#/Data-and-model-cards/get_data_card_cards_get_data_card_post)

The data card endpoint supports both tabular and matrix (e.g. transmission probabilities) data.
It will automatically detect which one has been passed, based on whether or not the first row in the dataset contains purely numeric data.

In addition to the fields which are populated based on the documentation provided (e.g. `DESCRIPTION`, `AUTHOR_NAME`, etc.), the data card also includes the data profiling results.
When the data is tabular, this consists of column-level DKG groundings and summary statistics.
When it is a matrix, it instead consists of matrix-level summary statistics.

**Note:** When the data is tabular, `EXAMPLES` is a dictionary that maps each column name to an example value from that column. When the data is a matrix, `EXAMPLES` is a list containing a single *sample row*.


In [34]:
# Tabular example: a CSV dataset plus the free-text documentation describing it.
csv_name = "../../resources/dataset/ensemble/abm.csv"
doc_name = "../../resources/dataset/ensemble/abm_doc.txt"

# Upload the dataset and its documentation to the data-card endpoint.
with open(csv_name, 'rb') as f_csv, open(doc_name, 'rb') as f_doc:
    files = {'csv_file': ('filename', f_csv), 'doc_file': ('filename', f_doc)}
    params = {"gpt_key": GPT_KEY}
    response = requests.post(API_ROOT + "cards/get_data_card/", params=params, files=files)
    json_str = response.text

# Parse the body with the JSON parser instead of ast.literal_eval: JSON
# literals such as null/true/false (and NaN) are not Python literals, so
# literal_eval raises ValueError as soon as the response contains one.
# Left as the last expression so the notebook displays the parsed card.
json.loads(json_str)


{'DESCRIPTION': 'A Simple SEIR model for wastewater-based epidemiological modeling of COVID-19 outbreak.',
 'AUTHOR_NAME': 'Mehrdad Fazli, Samuel Sklar, Michael D Porter, Brent A French, Heman Shakeri',
 'AUTHOR_EMAIL': 'UNKNOWN',
 'DATE': '2021',
 'PROVENANCE': 'The data was collected through wastewater surveillance as a continuous pooled sampling technique.',
 'SENSITIVITY': 'UNKNOWN',
 'LICENSE': 'UNKNOWN',
 'SCHEMA': ['dates',
  'VAX_count',
  'day',
  'sdm',
  'events',
  'I_1',
  'I_2',
  'I_3',
  'Y_1',
  'Y_2',
  'Y_3',
  'V_1',
  'V_2',
  'V_3',
  'Infected',
  'Y',
  'V',
  'logV'],
 'EXAMPLES': {'dates': '2020-05-01',
  'VAX_count': 0.0,
  'day': 52.0,
  'sdm': 0.086,
  'events': 0.0,
  'I_1': 33.0,
  'I_2': 8.0,
  'I_3': 116.0,
  'Y_1': 1.0,
  'Y_2': 0.0,
  'Y_3': 1.0,
  'V_1': 48330.96993832172,
  'V_2': 30791.74221975949,
  'V_3': 199986.2197207459,
  'Infected': 157.0,
  'Y': 2.0,
  'V': 83871.45174036581,
  'logV': 11.337040569240429},
 'DATA_PROFILING_RESULT': {'dates'