In [None]:
# Note: include in the tests folder a .env file that contains the API keys for the services used in the tests    
import os
import pandas as pd  # type: ignore
import palimpzest as pz

if not os.environ.get('OPENAI_API_KEY'):
    with open('.env') as f:
        for line in f:
            key, value = line.strip().split('=')
            os.environ[key] = value

### Columns definition
In the following cell we target schema of the case study we are interested in. Note how we are not specifying target attributes with fine grained metadata but rather with natural language, possibly ambiguous, specifications.

In [2]:
case_data_cols = [
    {"name": "case_submitter_id", "type": str, "desc": "The ID of the case"},
    {"name": "age_at_diagnosis", "type": int | float, "desc": "The age of the patient at the time of diagnosis"},
    {"name": "race", "type": str, "desc": "An arbitrary classification of a taxonomic group that is a division of a species."},
    {"name": "ethnicity", "type": str, "desc": "Whether an individual describes themselves as Hispanic or Latino or not."},
    {"name": "gender", "type": str, "desc": "Text designations that identify gender."},
    {"name": "vital_status", "type": str, "desc": "The vital status of the patient"},
    {"name": "ajcc_pathologic_t", "type": str, "desc": "Code of pathological T (primary tumor) to define the size or contiguous extension of the primary tumor (T), using staging criteria from the American Joint Committee on Cancer (AJCC)."},
    {"name": "ajcc_pathologic_n", "type": str, "desc": "The codes that represent the stage of cancer based on the nodes present (N stage) according to criteria based on multiple editions of the AJCC's Cancer Staging Manual."},
    {"name": "ajcc_pathologic_stage", "type": str, "desc": "The extent of a cancer, especially whether the disease has spread from the original site to other parts of the body based on AJCC staging criteria."},
    {"name": "tumor_grade", "type": int | float, "desc": "Numeric value to express the degree of abnormality of cancer cells, a measure of differentiation and aggressiveness."},
    {"name": "tumor_focality", "type": str, "desc": "The text term used to describe whether the patient's disease originated in a single location or multiple locations."},
    {"name": "tumor_largest_dimension_diameter", "type": int | float, "desc": "The tumor largest dimension diameter."},
    {"name": "primary_diagnosis", "type": str, "desc": "Text term used to describe the patient's histologic diagnosis, as described by the World Health Organization's (WHO) International Classification of Diseases for Oncology (ICD-O)."},
    {"name": "morphology", "type": str, "desc": "The Morphological code of the tumor, as described by the World Health Organization's (WHO) International Classification of Diseases for Oncology (ICD-O)."},
    {"name": "tissue_or_organ_of_origin", "type": str, "desc": "The text term used to describe the anatomic site of origin, of the patient's malignant disease, as described by the World Health Organization's (WHO) International Classification of Diseases for Oncology (ICD-O)."},
    {"name": "study", "type": str, "desc": "The last name of the author of the study, from the table name"}
]

file_cols = [
    {"name": "filename", "type": str, "desc": "The name of the file"},
    {"name": "contents", "type": bytes, "desc": "The contents of the file"}
]

xls_cols = file_cols + [
    {"name": "number_sheets", "type": int, "desc": "The number of sheets in the Excel file"},
    {"name": "sheet_names", "type": list[str], "desc": "The names of the sheets in the Excel file"},
]

table_cols = [
    {"name": "rows", "type": list[str], "desc": "The rows of the table"},
    {"name": "header", "type": list[str], "desc": "The header of the table"},
    {"name": "name", "type": str, "desc": "The name of the table"},
    {"name": "filename", "type": str, "desc": "The name of the file the table was extracted from"}
]


print("Case Data columns:")
for col in case_data_cols[:5]:
    print(col)

Case Data columns:
{'name': 'case_submitter_id', 'type': <class 'str'>, 'desc': 'The ID of the case'}
{'name': 'age_at_diagnosis', 'type': int | float, 'desc': 'The age of the patient at the time of diagnosis'}
{'name': 'race', 'type': <class 'str'>, 'desc': 'An arbitrary classification of a taxonomic group that is a division of a species.'}
{'name': 'ethnicity', 'type': <class 'str'>, 'desc': 'Whether an individual describes themselves as Hispanic or Latino or not.'}
{'name': 'gender', 'type': <class 'str'>, 'desc': 'Text designations that identify gender.'}


In [10]:
from palimpzest.core.elements.records import DataRecord

def print_result(records):
    df = DataRecord.to_df(records)
    cols = records[0].schema.field_names()
    display(df[cols])

def print_stats(stats):
    print("Chosen plan")
    print(result.executed_plans[0])
    print("Total execution time:", stats.total_execution_time,"seconds")
    print("Total execution cost:", stats.total_execution_cost, "USD")
    # print("Total time optimizing pipeline:", stats.total_optimization_time, "seconds")

## Printing base dataset
In the following cell we print the base dataset we are interested in. Note how we are not specifying target attributes with fine grained metadata but rather with natural language, possibly ambiguous, specifications.

In [11]:
papers_html = pz.Dataset("../testdata/biofabric-html")
output = papers_html

policy = pz.MinCost()
config = pz.QueryProcessorConfig(
    policy=policy,
    nocache=True,
    processing_strategy="no_sentinel",
)
result = output.run(config)

print_result(result.data_records)
# print_stats(result.execution_stats)

VBox(children=(IntProgress(value=0, bar_style='info', description='Processing:', max=11), HTML(value='<pre>Ini…

Unnamed: 0,filename,html,text,timestamp
0,cao.htm,"<!DOCTYPE html> <html lang=""en"" class=""pb-page...",Proteogenomic characterization of pancreatic d...,1724867000.0
1,clark.htm,"<!DOCTYPE html> <html lang=""en"" class=""pb-page...",Integrated Proteogenomic Characterization of C...,1724867000.0
2,dou.htm,"<!DOCTYPE html> <html lang=""en"" class=""pb-page...",Proteogenomic Characterization of Endometrial ...,1724867000.0
3,gilette.htm,"<!DOCTYPE html> <html lang=""en"" class=""pb-page...",Proteogenomic Characterization Reveals Therape...,1724867000.0
4,huang.htm,"<!DOCTYPE html> <html lang=""en"" class=""pb-page...",Proteogenomic insights into the biology and tr...,1724867000.0
5,krug.htm,"<!DOCTYPE html> <html lang=""en"" class=""pb-page...",Proteogenomic Landscape of Breast Cancer Tumor...,1724867000.0
6,li.htm,"<!DOCTYPE html> <html lang=""en"" class=""pb-page...",Proteogenomic data and resources for pan-cance...,1724867000.0
7,mcdermott.htm,"<!DOCTYPE html> <html lang=""en"" class=""pb-page...",Proteogenomic Characterization of Ovarian HGSC...,1724867000.0
8,satpathy.htm,"<!DOCTYPE html> <html lang=""en"" class=""pb-page...",A proteogenomic portrait of lung squamous cell...,1724867000.0
9,vasaikar.htm,"<!DOCTYPE html> <html lang=""en"" class=""pb-page...",Proteogenomic Analysis of Human Colon Cancer R...,1724867000.0


### Downloading relevant tables from the papers

In [12]:
from palimpzest.utils import udfs

papers_html = pz.Dataset("../testdata/biofabric-html-tiny")
table_urls = papers_html.sem_add_columns(
    [{"name": "url", "type": str, "desc": "The URLs of the XLS tables from the page"}],
    cardinality=pz.Cardinality.ONE_TO_MANY)
files = table_urls.download()
output = files

policy = pz.MinCost()
config = pz.QueryProcessorConfig(policy=policy, nocache=True)
result = output.run(config)

print_result(result.data_records)

VBox(children=(IntProgress(value=0, bar_style='info', description='Processing:', max=3), HTML(value='<pre>Init…

Unnamed: 0,contents,filename,html,text,timestamp,url
0,b'',cao.htm,"<!DOCTYPE html> <html lang=""en"" class=""pb-page...",Proteogenomic characterization of pancreatic d...,1742491000.0,[]
1,"b'<!DOCTYPE html><html lang=""en-US""><head><tit...",cao.htm,"<!DOCTYPE html> <html lang=""en"" class=""pb-page...",Proteogenomic characterization of pancreatic d...,1742491000.0,/cms/10.1016/j.cell.2021.08.023/attachment/d1c...
2,"b'<!DOCTYPE html><html lang=""en-US""><head><tit...",cao.htm,"<!DOCTYPE html> <html lang=""en"" class=""pb-page...",Proteogenomic characterization of pancreatic d...,1742491000.0,/cms/10.1016/j.cell.2021.08.023/attachment/0f1...
3,"b'<!DOCTYPE html><html lang=""en-US""><head><tit...",cao.htm,"<!DOCTYPE html> <html lang=""en"" class=""pb-page...",Proteogenomic characterization of pancreatic d...,1742491000.0,/cms/10.1016/j.cell.2021.08.023/attachment/40d...
4,"b'<!DOCTYPE html><html lang=""en-US""><head><tit...",cao.htm,"<!DOCTYPE html> <html lang=""en"" class=""pb-page...",Proteogenomic characterization of pancreatic d...,1742491000.0,/cms/10.1016/j.cell.2021.08.023/attachment/01c...
5,"b'<!DOCTYPE html><html lang=""en-US""><head><tit...",cao.htm,"<!DOCTYPE html> <html lang=""en"" class=""pb-page...",Proteogenomic characterization of pancreatic d...,1742491000.0,/cms/10.1016/j.cell.2021.08.023/attachment/9be...
6,"b'<!DOCTYPE html><html lang=""en-US""><head><tit...",cao.htm,"<!DOCTYPE html> <html lang=""en"" class=""pb-page...",Proteogenomic characterization of pancreatic d...,1742491000.0,/cms/10.1016/j.cell.2021.08.023/attachment/a1c...
7,"b'<!DOCTYPE html><html lang=""en-US""><head><tit...",cao.htm,"<!DOCTYPE html> <html lang=""en"" class=""pb-page...",Proteogenomic characterization of pancreatic d...,1742491000.0,/cms/10.1016/j.cell.2021.08.023/attachment/39e...
8,b'',cao.htm,"<!DOCTYPE html> <html lang=""en"" class=""pb-page...",Proteogenomic characterization of pancreatic d...,1742491000.0,[]
9,"b'<!DOCTYPE html><html lang=""en-US""><head><tit...",cao.htm,"<!DOCTYPE html> <html lang=""en"" class=""pb-page...",Proteogenomic characterization of pancreatic d...,1742491000.0,/cms/10.1016/j.cell.2021.08.023/attachment/d1c...


### Filtering stage
In the following cell we define the first part of the workload, that comprises a filtering stage responsible for selecting the tables from all the spreadsheets that contain relevant biometric information about the patient.


In [13]:
from palimpzest.utils import udfs
xls = pz.Dataset("../testdata/biofabric-tiny")
patient_tables = xls.add_columns(udf=udfs.xls_to_tables, cols=table_cols, cardinality=pz.Cardinality.ONE_TO_MANY)
patient_tables = patient_tables.sem_filter("The table contains biometric information about the patient")

output = patient_tables

policy = pz.MinCost()
config = pz.QueryProcessorConfig(policy=policy,nocache=True)
result = patient_tables.run(config)

print_result(result.data_records)
print_stats(result.execution_stats)

VBox(children=(IntProgress(value=0, bar_style='info', description='Processing:', max=9), HTML(value='<pre>Init…

Unnamed: 0,contents,filename,header,name,number_sheets,rows,sheet_names
0,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,dou_mmc1.xlsx,"[idx, Proteomics_Participant_ID, Case_excluded...",dou_mmc1.xlsx_UCEC_CPTAC3_meta_table_V2.1,1,"[[S001, C3L-00006, No, 2, 5, 128N, C3L-00006-0...",[UCEC_CPTAC3_meta_table_V2.1]
1,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,vasaikar_mmc1.xlsx,"[Column name, Column definition, Unnamed: 2, U...",vasaikar_mmc1.xlsx_A-Annotation,3,"[[SampleID, ID of the prospective participant,...","[Description, A-Annotation, B-ClinicalData]"
2,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,vasaikar_mmc1.xlsx,"[SampleID, Clinical, Mutation, CNV, miRNA-seq,...",vasaikar_mmc1.xlsx_B-ClinicalData,3,"[[Type, BIN, BIN, BIN, BIN, BIN, BIN, BIN, BIN...","[Description, A-Annotation, B-ClinicalData]"
3,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,wang_mmc2.xlsx,"[case_id, age, gender, height, weight, bmi, co...",wang_mmc2.xlsx_clinical_data,5,"[[C3L-00104, 58, Male, 188.0, 115.0, 32.54, Un...","[README, clinical_data, additional_annotations..."
4,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,wang_mmc2.xlsx,"[case, sample_type, multiomic, nmf_consensus, ...",wang_mmc2.xlsx_additional_annotations,5,"[[C3L-00104, tumor, nmf1, nmf1, 0.743, Proneur...","[README, clinical_data, additional_annotations..."
5,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,wang_mmc2.xlsx,"[aliquot_id, sample_id, original_aliquot_id, b...",wang_mmc2.xlsx_cbttc_sample_info,5,"[[7316-288-TISS [894482], 7316-288, 7316-288-T...","[README, clinical_data, additional_annotations..."


Chosen plan
 0. MarshalAndScanDataOp -> XLSFile 

 1. XLSFile -> NonLLMConvert -> Schema[['contents', 'filename', 'filename', 'header', 'name', 'number_sheets', 'rows', 'sheet_names']]
    (contents, filename, number_she) -> (contents, filename, header, na)
    UDF: <function xls_to_tables at 0x7fcdc4214720>

 2. Schema[['contents', 'filename', 'filename', 'header', 'name', 'number_sheets', 'rows', 'sheet_names']] -> LLMFilter -> Schema[['contents', 'filename', 'filename', 'header', 'name', 'number_sheets', 'rows', 'sheet_names']]
    (contents, filename, header, na) -> (contents, filename, header, na)
    Model: Model.GPT_4o_MINI
    Filter: The table contains biometric information about the patient


Total execution time: 15.247003078460693 seconds
Total execution cost: 0.0029549999999999993 USD


In [14]:
policy = pz.MaxQuality()

config = pz.QueryProcessorConfig(policy=policy, nocache=True)
result = patient_tables.run(config)
print_result(result.data_records)
print_stats(result.execution_stats)

VBox(children=(IntProgress(value=0, bar_style='info', description='Processing:', max=9), HTML(value='<pre>Init…

Unnamed: 0,contents,filename,header,name,number_sheets,rows,sheet_names
0,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,dou_mmc1.xlsx,"[idx, Proteomics_Participant_ID, Case_excluded...",dou_mmc1.xlsx_UCEC_CPTAC3_meta_table_V2.1,1,"[[S001, C3L-00006, No, 2, 5, 128N, C3L-00006-0...",[UCEC_CPTAC3_meta_table_V2.1]
1,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,vasaikar_mmc1.xlsx,"[SampleID, Clinical, Mutation, CNV, miRNA-seq,...",vasaikar_mmc1.xlsx_B-ClinicalData,3,"[[Type, BIN, BIN, BIN, BIN, BIN, BIN, BIN, BIN...","[Description, A-Annotation, B-ClinicalData]"
2,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,wang_mmc2.xlsx,"[README for Table S1, Unnamed: 1, Unnamed: 2]",wang_mmc2.xlsx_README,5,"[[nan, nan, nan], [Tab1, clinical_data, Clinic...","[README, clinical_data, additional_annotations..."
3,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,wang_mmc2.xlsx,"[case_id, age, gender, height, weight, bmi, co...",wang_mmc2.xlsx_clinical_data,5,"[[C3L-00104, 58, Male, 188.0, 115.0, 32.54, Un...","[README, clinical_data, additional_annotations..."


Chosen plan
 0. MarshalAndScanDataOp -> XLSFile 

 1. XLSFile -> NonLLMConvert -> Schema[['contents', 'filename', 'filename', 'header', 'name', 'number_sheets', 'rows', 'sheet_names']]
    (contents, filename, number_she) -> (contents, filename, header, na)
    UDF: <function xls_to_tables at 0x7fcdc4214720>

 2. Schema[['contents', 'filename', 'filename', 'header', 'name', 'number_sheets', 'rows', 'sheet_names']] -> LLMFilter -> Schema[['contents', 'filename', 'filename', 'header', 'name', 'number_sheets', 'rows', 'sheet_names']]
    (contents, filename, header, na) -> (contents, filename, header, na)
    Model: Model.GPT_4o
    Filter: The table contains biometric information about the patient


Total execution time: 20.54369831085205 seconds
Total execution cost: 0.05419000000000001 USD


## Matching stage
Once filtered, we can define the second part of the workload, that matches the patient records in the different tables and merging the information into a single table.

Minimum cost: GPT 3.5

In [None]:
from palimpzest.utils import udfs   
input_dataset = pz.Dataset("../testdata/biofabric-tiny-filtered")
patient_tables = input_dataset.add_columns(udf=udfs.xls_to_tables, cols=table_cols, cardinality=pz.Cardinality.ONE_TO_MANY)
patient_tables = patient_tables.sem_filter("The table contains biometric information about the patient")
case_data = patient_tables.sem_add_columns(case_data_cols, cardinality=pz.Cardinality.ONE_TO_MANY)

policy = pz.MinCost()
config = pz.QueryProcessorConfig(
    policy=policy,
    nocache=True,
    allow_code_synth=False,
    processing_strategy="no_sentinel",
)
data_record_collection = case_data.run(config) 

print_result(data_record_collection.data_records)
print_stats(data_record_collection.execution_stats)

VBox(children=(IntProgress(value=0, bar_style='info', description='Processing:', max=12), HTML(value='<pre>Ini…

Unnamed: 0,age_at_diagnosis,ajcc_pathologic_n,ajcc_pathologic_stage,ajcc_pathologic_t,case_submitter_id,contents,ethnicity,filename,gender,header,...,primary_diagnosis,race,rows,sheet_names,study,tissue_or_organ_of_origin,tumor_focality,tumor_grade,tumor_largest_dimension_diameter,vital_status
0,38.88,pN0,Stage I,pT1a (FIGO IA),C3L-00006,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,Not-Hispanic or Latino,dou_mmc1.xlsx,Female,"[idx, Proteomics_Participant_ID, Case_excluded...",...,Endometrioid,White,"[[S001, C3L-00006, No, 2, 5, 128N, C3L-00006-0...",[UCEC_CPTAC3_meta_table_V2.1],dou,Anterior endometrium,Unifocal,FIGO grade 1,2.9,No
1,39.76,pNX,Stage IV,pT1a (FIGO IA),C3L-00008,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,Not-Hispanic or Latino,dou_mmc1.xlsx,Female,"[idx, Proteomics_Participant_ID, Case_excluded...",...,Endometrioid,White,"[[S001, C3L-00006, No, 2, 5, 128N, C3L-00006-0...",[UCEC_CPTAC3_meta_table_V2.1],dou,Posterior endometrium,Unifocal,FIGO grade 1,3.5,No
2,51.19,pN0,Stage I,pT1a (FIGO IA),C3L-00032,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,Not-Hispanic or Latino,dou_mmc1.xlsx,Female,"[idx, Proteomics_Participant_ID, Case_excluded...",...,Endometrioid,White,"[[S001, C3L-00006, No, 2, 5, 128N, C3L-00006-0...",[UCEC_CPTAC3_meta_table_V2.1],dou,Anterior and Posterior endometrium,Unifocal,FIGO grade 2,4.5,No
3,,,,,C3L-00084,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,No,dou_mmc1.xlsx,Female,"[idx, Proteomics_Participant_ID, Case_excluded...",...,Carcinosarcoma,,"[[S001, C3L-00006, No, 2, 5, 128N, C3L-00006-0...",[UCEC_CPTAC3_meta_table_V2.1],dou,,,,,Yes
4,32.69,pNX,Stage I,pT1a (FIGO IA),C3L-00090,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,Not-Hispanic or Latino,dou_mmc1.xlsx,Female,"[idx, Proteomics_Participant_ID, Case_excluded...",...,Endometrioid,White,"[[S001, C3L-00006, No, 2, 5, 128N, C3L-00006-0...",[UCEC_CPTAC3_meta_table_V2.1],dou,Anterior and Posterior endometrium,Unifocal,FIGO grade 2,3.5,No
5,,,,,,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,,vasaikar_mmc1.xlsx,,"[Sheet, Description]",...,,,"[[A-Annotation, Column annotations], [B-Clinic...","[Description, B-ClinicalData]",,,,,,
6,729.0,N2b,Stage III,T4a,01CO001,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,,vasaikar_mmc1.xlsx,Male,"[SampleID, Clinical, Mutation, CNV, miRNA-seq,...",...,,,"[[Type, BIN, BIN, BIN, BIN, BIN, BIN, BIN, BIN...","[Description, B-ClinicalData]",Vasaikar,Sigmoid Colon,,,,Living
7,838.0,N0,Stage II,T3,01CO005,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,,vasaikar_mmc1.xlsx,Female,"[SampleID, Clinical, Mutation, CNV, miRNA-seq,...",...,,,"[[Type, BIN, BIN, BIN, BIN, BIN, BIN, BIN, BIN...","[Description, B-ClinicalData]",Vasaikar,Sigmoid Colon,,,,Deceased
8,904.0,N2b,Stage III,T4a,01CO006,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,,vasaikar_mmc1.xlsx,Female,"[SampleID, Clinical, Mutation, CNV, miRNA-seq,...",...,,,"[[Type, BIN, BIN, BIN, BIN, BIN, BIN, BIN, BIN...","[Description, B-ClinicalData]",Vasaikar,Ascending Colon,,,,Living
9,652.0,N0,Stage II,T3,01CO008,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,,vasaikar_mmc1.xlsx,Female,"[SampleID, Clinical, Mutation, CNV, miRNA-seq,...",...,,,"[[Type, BIN, BIN, BIN, BIN, BIN, BIN, BIN, BIN...","[Description, B-ClinicalData]",Vasaikar,Descending Colon,,,,Living


Chosen plan
 0. MarshalAndScanDataOp -> XLSFile 

 1. XLSFile -> NonLLMConvert -> Schema[['contents', 'filename', 'filename', 'header', 'name', 'number_sheets', 'rows', 'sheet_names']]
    (contents, filename, number_she) -> (contents, filename, header, na)
    UDF: <function xls_to_tables at 0x7faeec21bba0>

 2. Schema[['contents', 'filename', 'filename', 'header', 'name', 'number_sheets', 'rows', 'sheet_names']] -> LLMFilter -> Schema[['contents', 'filename', 'filename', 'header', 'name', 'number_sheets', 'rows', 'sheet_names']]
    (contents, filename, header, na) -> (contents, filename, header, na)
    Model: Model.GPT_4o_MINI
    Filter: The table contains biometric information about the patient


Total execution time: 50.044567823410034 seconds
Total execution cost: 0.0061902 USD


## End to end

Minimum cost : GPT 3.5

In [8]:
papers_html = pz.Dataset("../testdata/biofabric-html-tiny")
table_urls = papers_html.sem_add_columns(
    [{"name": "url", "type": str, "desc": "The URLs of the XLS tables from the page"}],
    cardinality=pz.Cardinality.ONE_TO_MANY)
files = table_urls.add_columns(udf=udfs.url_to_file, cols=file_cols)
# xls = files.download()
xls = pz.Dataset("../testdata/biofabric-tiny")
tables = xls.add_columns(udf=udfs.xls_to_tables, cols=table_cols, cardinality=pz.Cardinality.ONE_TO_MANY)
patient_tables = tables.sem_filter("The table contains biometric information about the patient")
case_data = patient_tables.sem_add_columns(case_data_cols, cardinality=pz.Cardinality.ONE_TO_MANY)

policy = pz.MinCost()
config = pz.QueryProcessorConfig(policy=policy, nocache=True)
result = case_data.run(config)

print_result(result.data_records)
print_stats(result.execution_stats)

VBox(children=(IntProgress(value=0, bar_style='info', description='Processing:', max=12), HTML(value='<pre>Ini…

Error parsing answers: Extra data: line 1 column 266 (char 265)


Unnamed: 0,age_at_diagnosis,ajcc_pathologic_n,ajcc_pathologic_stage,ajcc_pathologic_t,case_submitter_id,contents,ethnicity,filename,gender,header,...,primary_diagnosis,race,rows,sheet_names,study,tissue_or_organ_of_origin,tumor_focality,tumor_grade,tumor_largest_dimension_diameter,vital_status
0,38.88,pN0,Stage I,pT1a (FIGO IA),C3L-00006,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,Not-Hispanic or Latino,dou_mmc1.xlsx,Female,"[idx, Proteomics_Participant_ID, Case_excluded...",...,Endometrioid,White,"[[S001, C3L-00006, No, 2, 5, 128N, C3L-00006-0...",[UCEC_CPTAC3_meta_table_V2.1],dou_mmc1,Anterior endometrium,Unifocal,FIGO grade 1,2.9,No
1,39.76,pNX,Stage IV,pT1a (FIGO IA),C3L-00008,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,Not-Hispanic or Latino,dou_mmc1.xlsx,Female,"[idx, Proteomics_Participant_ID, Case_excluded...",...,Endometrioid,White,"[[S001, C3L-00006, No, 2, 5, 128N, C3L-00006-0...",[UCEC_CPTAC3_meta_table_V2.1],dou_mmc1,Posterior endometrium,Unifocal,FIGO grade 1,3.5,No
2,51.19,pN0,Stage I,pT1a (FIGO IA),C3L-00032,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,Not-Hispanic or Latino,dou_mmc1.xlsx,Female,"[idx, Proteomics_Participant_ID, Case_excluded...",...,Endometrioid,White,"[[S001, C3L-00006, No, 2, 5, 128N, C3L-00006-0...",[UCEC_CPTAC3_meta_table_V2.1],dou_mmc1,Anterior and Posterior endometrium,Unifocal,FIGO grade 2,4.5,No
3,,,,,C3L-00084,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,No,dou_mmc1.xlsx,,"[idx, Proteomics_Participant_ID, Case_excluded...",...,Carcinosarcoma,,"[[S001, C3L-00006, No, 2, 5, 128N, C3L-00006-0...",[UCEC_CPTAC3_meta_table_V2.1],dou_mmc1,,,,,Yes
4,32.69,pNX,Stage I,pT1a (FIGO IA),C3L-00090,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,Not-Hispanic or Latino,dou_mmc1.xlsx,Female,"[idx, Proteomics_Participant_ID, Case_excluded...",...,Endometrioid,White,"[[S001, C3L-00006, No, 2, 5, 128N, C3L-00006-0...",[UCEC_CPTAC3_meta_table_V2.1],dou_mmc1,Anterior and Posterior endometrium,Unifocal,FIGO grade 2,3.5,No
5,,,,,,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,,vasaikar_mmc1.xlsx,,"[Column name, Column definition, Unnamed: 2, U...",...,,,"[[SampleID, ID of the prospective participant,...","[Description, A-Annotation, B-ClinicalData]",,,,,,
6,729.0,N2b,Stage III,T4a,01CO001,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,,vasaikar_mmc1.xlsx,Male,"[SampleID, Clinical, Mutation, CNV, miRNA-seq,...",...,,,"[[Type, BIN, BIN, BIN, BIN, BIN, BIN, BIN, BIN...","[Description, A-Annotation, B-ClinicalData]",Vasaikar,Sigmoid Colon,,,,Living
7,838.0,N0,Stage II,T3,01CO005,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,,vasaikar_mmc1.xlsx,Female,"[SampleID, Clinical, Mutation, CNV, miRNA-seq,...",...,,,"[[Type, BIN, BIN, BIN, BIN, BIN, BIN, BIN, BIN...","[Description, A-Annotation, B-ClinicalData]",Vasaikar,Sigmoid Colon,,,,Deceased
8,904.0,N2b,Stage III,T4a,01CO006,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,,vasaikar_mmc1.xlsx,Female,"[SampleID, Clinical, Mutation, CNV, miRNA-seq,...",...,,,"[[Type, BIN, BIN, BIN, BIN, BIN, BIN, BIN, BIN...","[Description, A-Annotation, B-ClinicalData]",Vasaikar,Ascending Colon,,,,Living
9,652.0,N0,Stage II,T3,01CO008,b'PK\x03\x04\x14\x00\x06\x00\x08\x00\x00\x00!\...,,vasaikar_mmc1.xlsx,Female,"[SampleID, Clinical, Mutation, CNV, miRNA-seq,...",...,,,"[[Type, BIN, BIN, BIN, BIN, BIN, BIN, BIN, BIN...","[Description, A-Annotation, B-ClinicalData]",Vasaikar,Descending Colon,,,,Living


Chosen plan
 0. MarshalAndScanDataOp -> XLSFile 

 1. XLSFile -> NonLLMConvert -> Schema[['contents', 'filename', 'filename', 'header', 'name', 'number_sheets', 'rows', 'sheet_names']]
    (contents, filename, number_she) -> (contents, filename, header, na)
    UDF: <function xls_to_tables at 0x7fcdc4214720>

 2. Schema[['contents', 'filename', 'filename', 'header', 'name', 'number_sheets', 'rows', 'sheet_names']] -> LLMFilter -> Schema[['contents', 'filename', 'filename', 'header', 'name', 'number_sheets', 'rows', 'sheet_names']]
    (contents, filename, header, na) -> (contents, filename, header, na)
    Model: Model.GPT_4o_MINI
    Filter: The table contains biometric information about the patient

 3. Schema[['contents', 'filename', 'filename', 'header', 'name', 'number_sheets', 'rows', 'sheet_names']] -> LLMConvertBonded -> Schema[['age_at_diagnosis', 'ajcc_pathologic_n', 'ajcc_pathologic_stage', 'ajcc_pathologic_t', 'case_submitter_id', 'contents', 'ethnicity', 'filename', 'gende

In [None]:
output_table.__dict__.keys()

Maximum Cost: GPT 4

In [None]:
xls = pz.Dataset("testdata/biofabric-tiny")
patient_tables = xls.sem_add_columns(table_cols, cardinality=pz.Cardinality.ONE_TO_MANY)
patient_tables = patient_tables.sem_filter("The table contains biometric information about the patient")
case_data = patient_tables.sem_add_columns(case_data_cols, cardinality=pz.Cardinality.ONE_TO_MANY)

policy = pz.MaxQuality()
config = pz.QueryProcessorConfig(
    policy=policy,
    nocache=True,
    allow_token_reduction=False,
    allow_code_synth=False,
    processing_strategy="streaming",
    execution_strategy="sequential",
)
iterable = case_data.run(config)

output_rows = []
for data_record_collection in iterable:  # noqa: B007
    for output_table in data_record_collection:
        print(output_table.to_dict().keys())
        output_rows.append(output_table.to_dict()) 

output_df = pd.DataFrame(output_rows)
display(output_df)

LOGICAL PLANS: 1
INITIAL PLANS: 27
DEDUP PLANS: 27
PARETO PLANS: 17
----------
Policy is: Maximum Quality
Chosen plan: Time est: 411.647 -- Cost est: 0.973 -- Quality est: 0.486
 0. MarshalAndScanDataOp -> File 

 1. File -> InduceFromCandidateOp -> XLSFile 
    Using Model.GPT_3_5
    (contents,filena...) -> (contents,filena...)

 2. XLSFile -> InduceFromCandidateOp -> Table 
    Using Model.GPT_4
    (contents,filena...) -> (filename,header...)

 3. Table -> FilterCandidateOp -> Table 
    Using Model.GPT_4
    Filter: "The table contains biometric information about the patient"
    (filename,header...) -> (filename,header...)

 4. Table -> InduceFromCandidateOp -> CaseData 
    Using Model.GPT_4
    (filename,header...) -> (age_at_diagnosi...)

Bonded query processing error: No output objects were generated with bonded query - trying with conventional query...
BondedQuery Error: No output objects were generated with bonded query - trying with conventional query...
Falling back to co

Unnamed: 0,age_at_diagnosis,ajcc_pathologic_n,ajcc_pathologic_stage,ajcc_pathologic_t,case_submitter_id,ethnicity,gender,morphology,primary_diagnosis,race,study,tissue_or_organ_of_origin,tumor_focality,tumor_grade,tumor_largest_dimension_diameter,vital_status
0,64.0,pN0,Stage I,pT1a (FIGO IA),C3L-00006,Not-Hispanic or Latino,Female,Endometrioid,FIGO grade 1,White,UCEC_CPTAC3,Anterior endometrium,Unifocal,Cannot be determined,2.9,No
1,58.0,pNX,Stage IV,pT1a (FIGO IA),C3L-00008,Not-Hispanic or Latino,Female,Endometrioid,FIGO grade 1,White,UCEC_CPTAC3,Posterior endometrium,Unifocal,Cannot be determined,3.5,No
2,50.0,pN0,Stage I,pT1a (FIGO IA),C3L-00032,Not-Hispanic or Latino,Female,Endometrioid,FIGO grade 2,White,UCEC_CPTAC3,"Other, specify",Unifocal,Cannot be determined,4.5,Yes
3,,,,,C3L-00084,,,Carcinosarcoma,,,UCEC_CPTAC3,,,,,
4,75.0,pNX,Stage I,pT1a (FIGO IA),C3L-00090,Not-Hispanic or Latino,Female,Endometrioid,FIGO grade 2,White,UCEC_CPTAC3,"Other, specify",Unifocal,Cannot be determined,3.5,No
5,729.0,N2b,Stage III,T4a,01CO001,,Male,Mucinous,,,Vasaikar,Sigmoid Colon,,,,Living
6,838.0,N0,Stage II,T3,01CO005,,Female,Not Mucinous,,,Vasaikar,Sigmoid Colon,,,,Deceased
7,904.0,N2b,Stage III,T4a,01CO006,,Female,Mucinous,,,Vasaikar,Ascending Colon,,,,Living
8,652.0,N0,Stage II,T3,01CO008,,Female,Mucinous,,,Vasaikar,Descending Colon,,,,Living
9,58.0,,,,C3L-00104,Not-Hispanic or Latino,Male,,,White,wang,Frontal Lobe,,,,Deceased
