In [2]:
import os

import palimpzest as pz

# Only fall back to the project's env loader when no (non-empty) OpenAI key
# is already present in the process environment.
if not os.getenv('OPENAI_API_KEY'):
    from palimpzest.utils import load_env

    load_env()

# Start from a cleared palimpzest cache; `keep_registry=True` is passed so the
# registry is presumably left intact (datasets are looked up by name below).
pz.DataDirectory().clearCache(keep_registry=True)

In [3]:
class ScientificPaper(pz.PDFFile):
    """Represents a scientific research paper, which in practice is usually from a PDF file"""

    # NOTE(review): the class docstring and every `desc` string are likely
    # consumed by palimpzest at runtime (e.g. as extraction instructions), so
    # treat them as behavior, not as documentation - confirm before rewording.

    # Bibliographic fields.
    title = pz.Field(
        desc="The title of the paper. This is a natural language title, not a number or letter.",
        required=True,
    )
    publicationYear = pz.Field(
        desc="The year the paper was published. This is a number.",
        required=False,
    )
    author = pz.Field(
        desc="The name of the first author of the paper",
        required=True,
    )
    journal = pz.Field(
        desc="The name of the journal the paper was published in",
        required=True,
    )

    # Content summary; declared optional.
    subject = pz.Field(
        desc="A summary of the paper contribution in one sentence",
        required=False,
    )

    # The first cell of the pipeline converts papers to `pz.URL` records using
    # a DOI-related description, so this field is the natural source for it.
    doiURL = pz.Field(
        desc="The DOI URL for the paper",
        required=True,
    )

def print_table(output, max_rows=3, max_width=100):
    """Print a short, truncated preview of every table in ``output``.

    For each table this prints its name, its header, and its first
    ``max_rows`` rows, followed by a blank line. Header entries and cells are
    joined with ``" | "``; every joined line is cut to ``max_width`` characters
    and followed by ``"..."``.

    Args:
        output: Iterable of table-like objects exposing ``name``, ``header``
            (an iterable of column labels) and ``rows`` (a sliceable sequence
            of objects exposing ``cells``).
        max_rows: Maximum number of rows previewed per table (default 3).
        max_width: Maximum number of characters kept per printed line
            (default 100).

    Returns:
        None. The preview is written to standard output.
    """
    for table in output:
        header = table.header
        preview_rows = table.rows[:max_rows]

        print("Table name:", table.name)
        # str() every value before joining: spreadsheet-derived headers and
        # cells may be numbers or None, and str.join rejects non-str items.
        print(" | ".join(str(label) for label in header)[:max_width], "...")
        for row in preview_rows:
            print(" | ".join(str(cell) for cell in row.cells)[:max_width], "...")
        print()

In [4]:
policy = pz.MinCost()

# Stage 1: start from the "biofabric-pdf" dataset typed as ScientificPaper,
# convert each paper to a URL record, map the URLs through
# pz.DownloadHTMLFunction, then convert each page into (possibly many) URLs
# of XLS tables.
papers = pz.Dataset("biofabric-pdf", schema=ScientificPaper)
paperURLs = papers.convert(pz.URL, desc="The DOI url of the paper")
htmlDOI = paperURLs.map(pz.DownloadHTMLFunction())
tableURLS = htmlDOI.convert(
    pz.URL,
    desc="The URLs of the XLS tables from the page",
    cardinality="oneToMany",
)

# Alternative entry point, kept disabled: read the table URLs from a text-file
# dataset instead of deriving them from the PDFs.
# urlFile = pz.Dataset("biofabric-urls", schema=pz.TextFile)
# tableURLS = urlFile.convert(pz.URL, desc="The URLs of the tables")

# Stage 2: map the table URLs through pz.DownloadBinaryFunction, then convert
# File -> XLSFile -> Table (one spreadsheet may yield many tables).
binary_tables = tableURLS.map(pz.DownloadBinaryFunction())
# Named `downloaded_files` rather than `tables`: `tables` is bound below to
# the execution results, which are a different kind of object.
downloaded_files = binary_tables.convert(pz.File)
xls = downloaded_files.convert(pz.XLSFile)
patient_tables = xls.convert(pz.Table, desc="All tables in the file", cardinality="oneToMany")

# `engine` and `xls` are module-level on purpose: later cells reuse them.
engine = pz.NoSentinelExecution
output = patient_tables
tables, plan, stats = pz.Execute(
    patient_tables,
    policy=policy,
    nocache=True,
    allow_code_synth=False,
    allow_token_reduction=False,
    execution_engine=engine,
)

print_table(tables)
print(plan)
print(stats)

Available models:  [GPT_3_5, GPT_4]
LOGICAL PLANS: 4


: 

In [None]:
policy = pz.MinCost()

# Rebuild the table extraction on top of `xls` (bound in an earlier cell) and
# keep only the tables matching the natural-language filter.
patient_tables = (
    xls
    .convert(pz.Table, desc="All tables in the file", cardinality="oneToMany")
    .filter("The table explains the meaning of attributes")
)

output = patient_tables

# `engine` is not assigned in this cell; it must already exist in the kernel.
tables, plan, stats = pz.Execute(
    patient_tables,
    policy=policy,
    nocache=True,
    allow_code_synth=False,
    allow_token_reduction=False,
    execution_engine=engine,
)

print_table(tables)
print(plan)
print(stats)

In [None]:
# Clear the cache again (registry kept) before re-running the same query
# under a different policy.
pz.DataDirectory().clearCache(keep_registry=True)
policy = pz.MaxQuality()

# Same convert + filter as the MinCost run; only the policy differs.
patient_tables = (
    xls
    .convert(pz.Table, desc="All tables in the file", cardinality="oneToMany")
    .filter("The table explains the meaning of attributes")
)

output = patient_tables

# `xls` and `engine` are not assigned in this cell; they must already exist
# in the kernel.
tables, plan, stats = pz.Execute(
    patient_tables,
    policy=policy,
    nocache=True,
    allow_code_synth=False,
    allow_token_reduction=False,
    execution_engine=engine,
)

print_table(tables)
print(plan)
print(stats)

In [None]:
policy = pz.MinCost()

# Rebind `xls` to the dataset named 'biofabric-xls' (typed as pz.XLSFile),
# replacing any earlier `xls` binding in the kernel.
xls = pz.Dataset('biofabric-xls', schema=pz.XLSFile)
patient_tables = (
    xls
    .convert(pz.Table, desc="All tables in the file", cardinality="oneToMany")
    .filter("The table records if the patient is excluded from the study")
)

output = patient_tables

# `engine` is not assigned in this cell; it must already exist in the kernel.
tables, plan, stats = pz.Execute(
    patient_tables,
    policy=policy,
    nocache=True,
    allow_code_synth=False,
    allow_token_reduction=False,
    execution_engine=engine,
)

print_table(tables)
print(plan)
print(stats)

In [None]:
policy = pz.MinCost()

# Read the spreadsheets from the dataset named 'biofabric-xls' and keep only
# the tables matching the biometric-data filter.
xls = pz.Dataset('biofabric-xls', schema=pz.XLSFile)
patient_tables = (
    xls
    .convert(pz.Table, desc="All tables in the file", cardinality="oneToMany")
    .filter("The table contains patient biometric data")
)

output = patient_tables

# `engine` is not assigned in this cell; it must already exist in the kernel.
tables, plan, stats = pz.Execute(
    patient_tables,
    policy=policy,
    nocache=True,
    allow_code_synth=False,
    allow_token_reduction=False,
    execution_engine=engine,
)

print_table(tables)
print(plan)
print(stats)

In [None]:
policy = pz.MinCost()

# Read the spreadsheets from the dataset named 'biofabric-xls' and keep only
# the tables matching the proteomic-data filter.
xls = pz.Dataset('biofabric-xls', schema=pz.XLSFile)
patient_tables = (
    xls
    .convert(pz.Table, desc="All tables in the file", cardinality="oneToMany")
    .filter("The table contains proteomic data")
)

output = patient_tables

# `engine` is not assigned in this cell; it must already exist in the kernel.
tables, plan, stats = pz.Execute(
    patient_tables,
    policy=policy,
    nocache=True,
    allow_code_synth=False,
    allow_token_reduction=False,
    execution_engine=engine,
)

print_table(tables)
print(plan)
print(stats)