# MIT (TA1): Data and model cards


## 0. Preprocessing

In [1]:
import ast, json, requests, os
# OpenAI key forwarded to the card endpoints; None when the variable is unset.
GPT_KEY = os.getenv("OPENAI_API_KEY")
# Base URL of the service under test (alternative host: "http://3.83.68.208/").
API_ROOT = "http://localhost:8000/"

## 1. Get data card

#### Data cards summarize general information about the dataset.
[http://3.83.68.208/#/Data-and-model-cards/get_data_card_cards_get_data_card_post](http://3.83.68.208/#/Data-and-model-cards/get_data_card_cards_get_data_card_post)

The data card endpoint supports both tabular and matrix (e.g. transmission probabilities) data.
It will automatically detect which one has been passed, based on whether or not the first row in the dataset contains purely numeric data.

In addition to the fields which are populated based on the documentation provided (e.g. `DESCRIPTION`, `AUTHOR_NAME`, etc.), the data card also includes the data profiling results.
When the data is tabular, this consists of column-level DKG groundings and summary statistics.
When it is a matrix, it instead consists of matrix-level summary statistics.

**Note:** When the data is tabular, `EXAMPLES` is a dictionary mapping each column name to an example value from that column. When the data is a matrix, `EXAMPLES` is a list containing a single *sample row*.


In [2]:
from ta1viewer import display_json

In [3]:
# Tabular example: upload a CSV and its documentation to the data card endpoint.
csv_name = "../../mitaskem/resources/dataset/ensemble/abm.csv"
doc_name = "../../mitaskem/resources/dataset/ensemble/abm_doc.txt"

with open(csv_name, 'rb') as f_csv, open(doc_name, 'rb') as f_doc:
    # Both files are sent as multipart form fields under a placeholder filename.
    files = {'csv_file': ('filename', f_csv), 'doc_file': ('filename', f_doc)}
    params = {"gpt_key": GPT_KEY}
    response = requests.post(API_ROOT + "cards/get_data_card/", params=params, files=files)

# Fail loudly on an HTTP error instead of treating the error body as a data card.
response.raise_for_status()
json_str = response.text
print(json_str)



{"DESCRIPTION":"A Simple SEIR model for wastewater-based epidemiological modeling of COVID-19 outbreak.","AUTHOR_NAME":"Mehrdad Fazli, Samuel Sklar, Michael D Porter, Brent A French, Heman Shakeri","AUTHOR_EMAIL":"UNKNOWN","DATE":"2021","PROVENANCE":"The data was collected through wastewater surveillance as a continuous pooled sampling technique.","SENSITIVITY":"UNKNOWN","LICENSE":"UNKNOWN","DATASET_TYPE":"tabular","SCHEMA":["dates","VAX_count","day","sdm","events","I_1","I_2","I_3","Y_1","Y_2","Y_3","V_1","V_2","V_3","Infected","Y","V","logV"],"DATA_PROFILING_RESULT":{"dates":{"col_name":"dates","concept":"Date of data collection","unit":"Date","description":"The date when the data was collected.","dkg_groundings":[["dc:date","Date","property"],["oboinowl:date","date","property"],["opmi:0000488","visit end date","class"],["obib:0000714","date of specimen collection","class"]],"column_stats":{"num_null_entries":0,"type":"date","num_unique_entries":540,"most_common_entries":{"2020-03-10

In [4]:
# Render the data card JSON held in json_str with the ta1viewer helper.
display_json(json_str)

In [5]:
# Clone the mixing-patterns repository into ~/.cache only when listing that directory fails.
!ls ~/.cache/mixing-patterns || git clone https://github.com/mobs-lab/mixing-patterns ~/.cache/mixing-patterns

CHANGELOG.md     README.md        data
LICENSE          analysis_results requirements.txt


In [6]:
# Matrix example - make sure to download the data from https://github.com/mobs-lab/mixing-patterns first
csv_name = os.path.expanduser('~/.cache/mixing-patterns/data/contact_matrices/United_States_subnational_New_York_M_overall_contact_matrix_18.csv')
doc_name = os.path.expanduser('~/.cache/mixing-patterns/README.md')

with open(csv_name, 'rb') as f_csv, open(doc_name, 'rb') as f_doc:
    # Both files are sent as multipart form fields under a placeholder filename.
    files = {'csv_file': ('filename', f_csv), 'doc_file': ('filename', f_doc)}
    params = {"gpt_key": GPT_KEY}
    response = requests.post(API_ROOT + "cards/get_data_card/", params=params, files=files)

# Fail loudly on an HTTP error instead of rendering the error body as a data card.
response.raise_for_status()
json_str = response.text

display_json(json_str)

## 2. Get model card

#### Model cards provide general information about the model.
[http://3.83.68.208/#/Data-and-model-cards/get_model_card_cards_get_model_card_post](http://3.83.68.208/#/Data-and-model-cards/get_model_card_cards_get_model_card_post)

#### Alternate host for the same model card endpoint documentation:
[http://100.26.10.46/#/Data-and-model-cards/get_model_card_cards_get_model_card_post](http://100.26.10.46/#/Data-and-model-cards/get_model_card_cards_get_model_card_post)

In [7]:
# Model card example: upload the paper text and the model code to the model card endpoint.
text_name = "text_s41598-022-06159-x.txt"
code_name = "scenario2code.txt"

with open(text_name, 'rb') as f_text, open(code_name, 'rb') as f_code:
    # Both files are sent as multipart form fields under a placeholder filename.
    files = {'text_file': ('filename', f_text), 'code_file': ('filename', f_code)}
    params = {"gpt_key": GPT_KEY}
    response = requests.post(API_ROOT + "cards/get_model_card", params=params, files=files)

# Fail loudly on an HTTP error instead of rendering the error body as a model card.
response.raise_for_status()
display_json(response.text)