# MIT (TA1): Data and model cards


## 0. Preprocessing

In [33]:
# Standard-library helpers (ast, json, os) plus `requests` for calling the HTTP API.
import ast, json, requests, os
# Local module; presumably defines GPT_KEY, which the cells below send as the
# `gpt_key` query parameter -- TODO confirm.
from gpt_key import *
# Base URL that every endpoint path below is appended to. The trailing comment
# holds an alternative host that can be swapped in.
API_ROOT = "http://localhost:8000/" # "http://3.83.68.208/"

## 1. Get data card

#### Data cards summarize general information about the dataset.
[http://3.83.68.208/#/Data-and-model-cards/get_data_card_cards_get_data_card_post](http://3.83.68.208/#/Data-and-model-cards/get_data_card_cards_get_data_card_post)

The data card endpoint supports both tabular data and matrix data (e.g. transmission probabilities).
It automatically detects which kind was passed by checking whether the first row of the dataset contains purely numeric data.

In addition to the fields which are populated based on the documentation provided (e.g. `DESCRIPTION`, `AUTHOR_NAME`, etc.), the data card also includes the data profiling results.
When the data is tabular, this consists of column-level DKG groundings and summary statistics.
When it is a matrix, it instead consists of matrix-level summary statistics.

**Note:** When the data is tabular, `EXAMPLES` is a dictionary mapping each column name to an example value from that column. When the data is a matrix, `EXAMPLES` is a list containing a single *sample row*.


In [34]:
# Tabular example: request a data card for a CSV dataset and its documentation.
csv_name = "../../resources/dataset/ensemble/abm.csv"
doc_name = "../../resources/dataset/ensemble/abm_doc.txt"

# Both files are uploaded in binary mode, so they have to stay open until the
# POST request has completed.
with open(csv_name, 'rb') as f_csv:
    with open(doc_name, 'rb') as f_doc:
        files = dict(
            csv_file=('filename', f_csv),
            doc_file=('filename', f_doc),
        )
        params = dict(gpt_key=GPT_KEY)
        response = requests.post(
            API_ROOT + "cards/get_data_card/",
            params=params,
            files=files,
        )
        json_str = response.text

# Parse the response body as a Python literal. As the last expression of the
# cell, the parsed value is what the notebook displays.
# NOTE(review): literal_eval rejects JSON-only tokens (true/false/null); if the
# service returns strict JSON, a JSON parser may be the safer choice -- confirm.
ast.literal_eval(json_str)


{'DESCRIPTION': 'A Simple SEIR model for wastewater-based epidemiological modeling of COVID-19 outbreak.',
 'AUTHOR_NAME': 'Mehrdad Fazli, Samuel Sklar, Michael D Porter, Brent A French, Heman Shakeri',
 'AUTHOR_EMAIL': 'UNKNOWN',
 'DATE': '2021',
 'PROVENANCE': 'The data was collected through wastewater surveillance as a continuous pooled sampling technique.',
 'SENSITIVITY': 'UNKNOWN',
 'LICENSE': 'UNKNOWN',
 'SCHEMA': ['dates',
  'VAX_count',
  'day',
  'sdm',
  'events',
  'I_1',
  'I_2',
  'I_3',
  'Y_1',
  'Y_2',
  'Y_3',
  'V_1',
  'V_2',
  'V_3',
  'Infected',
  'Y',
  'V',
  'logV'],
 'EXAMPLES': {'dates': '2020-05-01',
  'VAX_count': 0.0,
  'day': 52.0,
  'sdm': 0.086,
  'events': 0.0,
  'I_1': 33.0,
  'I_2': 8.0,
  'I_3': 116.0,
  'Y_1': 1.0,
  'Y_2': 0.0,
  'Y_3': 1.0,
  'V_1': 48330.96993832172,
  'V_2': 30791.74221975949,
  'V_3': 199986.2197207459,
  'Infected': 157.0,
  'Y': 2.0,
  'V': 83871.45174036581,
  'logV': 11.337040569240429},
 'DATA_PROFILING_RESULT': {'dates'

In [35]:
# Matrix example: request a data card for a contact matrix.
# Prerequisite: download the data from https://github.com/mobs-lab/mixing-patterns first,
# so that the relative paths below resolve.
csv_name = "../../../mixing-patterns/data/contact_matrices/United_States_subnational_New_York_M_overall_contact_matrix_18.csv"
doc_name = "../../../mixing-patterns/README.md"

# Both files are uploaded in binary mode, so they have to stay open until the
# POST request has completed.
with open(csv_name, 'rb') as f_csv:
    with open(doc_name, 'rb') as f_doc:
        files = dict(
            csv_file=('filename', f_csv),
            doc_file=('filename', f_doc),
        )
        params = dict(gpt_key=GPT_KEY)
        response = requests.post(
            API_ROOT + "cards/get_data_card/",
            params=params,
            files=files,
        )
        json_str = response.text

# Parse the response body as a Python literal. As the last expression of the
# cell, the parsed value is what the notebook displays.
# NOTE(review): literal_eval rejects JSON-only tokens (true/false/null); if the
# service returns strict JSON, a JSON parser may be the safer choice -- confirm.
ast.literal_eval(json_str)


{'DESCRIPTION': 'Different locations might include only a single country level matrix or additional subnational matrices as indicated below.',
 'AUTHOR_NAME': 'UNKNOWN',
 'AUTHOR_EMAIL': 'UNKNOWN',
 'DATE': 'UNKNOWN',
 'PROVENANCE': 'UNKNOWN',
 'SENSITIVITY': 'UNKNOWN',
 'LICENSE': 'UNKNOWN',
 'CELL_INTERPRETATION': 'Each contact matrix has dimensions 85x85. For the 4 settings (households, schools, workplaces, and the community) contact matrices, the matrix element F^k_ij represents the per capita probability of contact for an individual of age i with individuals of age j in that setting k. Thus the element F^k_00 is the per capita probability of contact for an individual of age [0, 1) with individuals of age [0, 1), the element F^k_01 is the per capita probability of contact for an individual of age [0, 1) with individuals of age [1, 2), etc. Each row and column refers to a single year of age, except for the last row and column which both refer to ages 84 years old and over.',
 'DATA_

## 2. Get model card

#### Model cards provide general information about the model.
[http://3.83.68.208/#/Data-and-model-cards/get_model_card_cards_get_model_card_post](http://3.83.68.208/#/Data-and-model-cards/get_model_card_cards_get_model_card_post)

#### Alternate link to the same model card endpoint documentation (different host):
[http://100.26.10.46/#/Data-and-model-cards/get_model_card_cards_get_model_card_post](http://100.26.10.46/#/Data-and-model-cards/get_model_card_cards_get_model_card_post)

In [37]:
# Model card example: upload a paper's text and the model's code.
text_name = "text_s41598-022-06159-x.txt"
code_name = "scenario2code.txt"

# Both files are uploaded in binary mode, so they have to stay open until the
# POST request has completed.
with open(text_name, 'rb') as f_text:
    with open(code_name, 'rb') as f_code:
        files = dict(
            text_file=('filename', f_text),
            code_file=('filename', f_code),
        )
        params = dict(gpt_key=GPT_KEY)
        response = requests.post(
            API_ROOT + "cards/get_model_card",
            params=params,
            files=files,
        )

# Parse the response body as a Python literal. As the last expression of the
# cell, the parsed value is what the notebook displays.
# NOTE(review): literal_eval rejects JSON-only tokens (true/false/null); if the
# service returns strict JSON, a JSON parser may be the safer choice -- confirm.
ast.literal_eval(response.text)

{'DESCRIPTION': 'Understanding the dynamics of SARS‑CoV‑2 variants of concern in Ontario, Canada: a modeling study',
 'AUTHOR_INST': 'University of Waterloo',
 'AUTHOR_AUTHOR': 'Anita T. Layton, Mehrshad Sadria',
 'AUTHOR_EMAIL': 'anita.layton@uwaterloo.ca',
 'DATE': 'UNKNOWN',
 'SCHEMA': 'UNKNOWN',
 'PROVENANCE': 'The model was developed and applied to better understand the spread of multiple variants of concern (VOC) of SARS-CoV-2 in Ontario, Canada. The model incorporates competition among VOC and assesses the effectiveness of vaccination and non-pharmaceutical interventions (NPI) in controlling the spread of the virus.',
 'DATASET': 'UNKNOWN',
 'COMPLEXITY': 'The complexity of the model is not specified.',
 'USAGE': 'The model should be used to understand the dynamics of SARS-CoV-2 variants of concern and to assess the effectiveness of vaccination and NPI in controlling the spread of the virus.',
 'LICENSE': 'UNKNOWN'}