# MIT (TA1): Data and model cards


## 0. Preprocessing

In [23]:
import ast, json, requests, os
GPT_KEY = os.environ.get('OPENAI_API_KEY')
API_ROOT = "http://localhost:8000/" # "http://3.83.68.208/"

## 1. Get data card

#### Data cards summarize general information about the dataset.
[http://3.83.68.208/#/Data-and-model-cards/get_data_card_cards_get_data_card_post](http://3.83.68.208/#/Data-and-model-cards/get_data_card_cards_get_data_card_post)

The data card endpoint supports both tabular and matrix (e.g. transmission probabilities) data.
It will automatically detect which one has been passed, based on whether or not the first row in the dataset contains purely numeric data.

In addition to the fields which are populated based on the documentation provided (e.g. `DESCRIPTION`, `AUTHOR_NAME`, etc.), the data card also includes the data profiling results.
When the data is tabular, this consists of column-level DKG groundings and summary statistics.
When it is a matrix, it instead consists of matrix-level summary statistics.

**Note:** When the data is tabular, `EXAMPLES` is a dictionary from `column name: example column value`. When the data is a matrix, `EXAMPLES` is a list consisting of a *sample row*.


In [25]:
# Tabular example
csv_name = "../../mitaskem/resources/dataset/ensemble/abm.csv"
doc_name = "../../mitaskem/resources/dataset/ensemble/abm_doc.txt"

with open(csv_name, 'rb') as f_csv, open(doc_name,  'rb') as f_doc:
    files = {'csv_file': ('filename', f_csv), 'doc_file': ('filename', f_doc)}
    params = {"gpt_key": GPT_KEY}
    response = requests.post(API_ROOT + "cards/get_data_card/",  params=params,  files=files)
    json_str = response.text
# print(json_str)
ast.literal_eval(json_str)


{'DESCRIPTION': 'A Simple SEIR model for wastewater-based epidemiological modeling of COVID-19 outbreak.',
 'AUTHOR_NAME': 'Mehrdad Fazli, Samuel Sklar, Michael D Porter, Brent A French, Heman Shakeri',
 'AUTHOR_EMAIL': 'UNKNOWN',
 'DATE': '2021',
 'PROVENANCE': 'The data was collected through wastewater surveillance as a continuous pooled sampling technique.',
 'SENSITIVITY': 'UNKNOWN',
 'LICENSE': 'UNKNOWN',
 'DATASET_TYPE': 'tabular',
 'SCHEMA': ['dates',
  'VAX_count',
  'day',
  'sdm',
  'events',
  'I_1',
  'I_2',
  'I_3',
  'Y_1',
  'Y_2',
  'Y_3',
  'V_1',
  'V_2',
  'V_3',
  'Infected',
  'Y',
  'V',
  'logV'],
 'DATA_PROFILING_RESULT': {'dates': {'col_name': 'dates',
   'concept': 'Date of data collection',
   'unit': 'Date',
   'description': 'The date when the data was collected.',
   'dkg_groundings': [['dc:date', 'Date', 'property'],
    ['oboinowl:date', 'date', 'property'],
    ['opmi:0000488', 'visit end date', 'class'],
    ['obib:0000714', 'date of specimen collectio

In [26]:
!git clone https://github.com/mobs-lab/mixing-patterns ~/.cache/mixing-patterns

fatal: destination path '/Users/orm/.cache/mixing-patterns' already exists and is not an empty directory.


In [28]:
# Matrix example - make sure to download the data from https://github.com/mobs-lab/mixing-patterns first
csv_name = os.path.expanduser('~/.cache/mixing-patterns/data/contact_matrices/United_States_subnational_New_York_M_overall_contact_matrix_18.csv')
doc_name = os.path.expanduser('~/.cache/mixing-patterns/README.md')

with open(csv_name, 'rb') as f_csv, open(doc_name,  'rb') as f_doc:
    files = {'csv_file': ('filename', f_csv), 'doc_file': ('filename', f_doc)}
    params = {"gpt_key": GPT_KEY}
    response = requests.post(API_ROOT + "cards/get_data_card/",  params=params,  files=files)
    json_str = response.text
# print(json_str)
ast.literal_eval(json_str)


{'DESCRIPTION': 'Different locations might include only a single country level matrix or additional subnational matrices as indicated below.',
 'AUTHOR_NAME': 'UNKNOWN',
 'AUTHOR_EMAIL': 'UNKNOWN',
 'DATE': 'UNKNOWN',
 'PROVENANCE': 'UNKNOWN',
 'SENSITIVITY': 'UNKNOWN',
 'LICENSE': 'UNKNOWN',
 'DATASET_TYPE': 'matrix',
 'DATA_PROFILING_RESULT': {'matrix_stats': {'num_null_entries': 0,
   'type': 'numeric',
   'min': 0.0492537573404737,
   'max': 7.422581547144762,
   'mean': 0.6399015998362235,
   'std': 0.7700007890649679,
   '25%': 0.24465994009854003,
   '50%': 0.44931271067347417,
   '75%': 0.8522124667881429}},
 'EXAMPLES': [0.2516467228441472,
  0.3373295854802683,
  0.3958656213922929,
  0.597678404998955,
  0.9215187432135592,
  0.982902109605631,
  0.8386027987698976,
  0.7759817646527434,
  0.842062807003874,
  0.970409854763161,
  1.256898425448872,
  1.3637642003999793,
  0.9053222383990704,
  0.2967428433543169,
  0.1560832704228411,
  0.1152383792629327,
  0.1001252658685

## 2. Get model card

#### Model cards provide general information about the model.
[http://3.83.68.208/#/Data-and-model-cards/get_model_card_cards_get_model_card_post](http://3.83.68.208/#/Data-and-model-cards/get_model_card_cards_get_model_card_post)

#### Model cards provide general information about the model.
[http://100.26.10.46/#/Data-and-model-cards/get_model_card_cards_get_model_card_post](http://100.26.10.46/#/Data-and-model-cards/get_model_card_cards_get_model_card_post)

In [31]:
text_name = "text_s41598-022-06159-x.txt"
code_name = "scenario2code.txt"

with open(text_name, 'rb') as f_text, open(code_name, 'rb') as f_code:
    files = { 'text_file': ('filename', f_text),'code_file': ('filename', f_code)}
    params = {"gpt_key": GPT_KEY}
    response = requests.post(API_ROOT + "cards/get_model_card", params=params, files=files)
#     print(response.text)
    
ast.literal_eval(response.text)

{'DESCRIPTION': 'Understanding the dynamics of SARS‑CoV‑2 variants of concern in Ontario, Canada: a modeling study',
 'AUTHOR_INST': 'University of Waterloo',
 'AUTHOR_AUTHOR': 'Anita T. Layton, Mehrshad Sadria',
 'AUTHOR_EMAIL': 'anita.layton@uwaterloo.ca',
 'DATE': 'UNKNOWN',
 'SCHEMA': 'UNKNOWN',
 'PROVENANCE': 'The model was developed and applied to better understand the spread of multiple variants of concern (VOC) of SARS-CoV-2 in Ontario, Canada.',
 'DATASET': 'UNKNOWN',
 'COMPLEXITY': 'The complexity of the model is not specified.',
 'USAGE': 'The model should be used to assess the effectiveness of vaccination and non-pharmaceutical interventions (NPI) in controlling the spread of SARS-CoV-2 variants of concern in Ontario, Canada.',
 'LICENSE': 'UNKNOWN'}