In [17]:
import context
from palimpzest.constants import PZ_DIR
import palimpzest as pz

import gradio as gr
import numpy as np
import pandas as pd

import argparse
import requests
import json
import time
import os

# Call clearCache on the palimpzest DataDirectory before building any pipeline.
# keep_registry=True is passed explicitly -- presumably this drops cached results
# while leaving registered datasets in place (TODO confirm against palimpzest).
pz.DataDirectory().clearCache(keep_registry=True)

In [18]:
class ScientificPaper(pz.PDFFile):
    """Represents a scientific research paper, which in practice is usually from a PDF file"""

    # Bibliographic fields declared on top of pz.PDFFile.
    # NOTE(review): the docstring and every `desc` string are kept verbatim; they are
    # presumably consumed by palimpzest when it extracts the fields -- confirm before rewording.
    title = pz.Field(
        desc="The title of the paper. This is a natural language title, not a number or letter.",
        required=True,
    )
    publicationYear = pz.Field(
        desc="The year the paper was published. This is a number.",
        required=False,
    )
    author = pz.Field(
        desc="The name of the first author of the paper",
        required=True,
    )
    journal = pz.Field(
        desc="The name of the journal the paper was published in",
        required=True,
    )
    subject = pz.Field(
        desc="A summary of the paper contribution in one sentence",
        required=False,
    )
    doiURL = pz.Field(
        desc="The DOI URL for the paper",
        required=True,
    )

class TabularRow(pz.Schema):
    """A Row is a list of cells. For generality, we assume that all cell values are strings."""

    # The only field: a required list whose elements are declared as pz.StringField.
    cells = pz.ListField(
        element_type=pz.StringField,
        desc="The cells in the row",
        required=True,
    )

class Table(pz.Schema):
    """A Table is an object composed of a header and rows."""

    # Optional name of the table.
    name = pz.StringField(
        desc="The name of the table",
        required=False,
    )
    # Required list of string elements forming the header.
    header = pz.ListField(
        element_type=pz.StringField,
        desc="The header of the table",
        required=True,
    )
    # NOTE(review): the element type is pz.TabularRow (from the library), not the
    # TabularRow class defined in this notebook -- confirm that is intended.
    rows = pz.ListField(
        element_type=pz.TabularRow,
        desc="The rows of the table",
        required=True,
    )

def execute(output, policy, max_rows=3, max_width=100):
    """Run ``output`` under ``policy`` and print a short preview of every resulting table.

    A ``pz.SimpleExecution`` is built from ``output`` and ``policy``, its
    ``executeAndOptimize(verbose=True)`` result is iterated, and for each item
    the table name, the header and the first ``max_rows`` rows are printed.
    Header and rows are rendered as ``" | "``-joined cells cut to ``max_width``
    characters and followed by ``" ..."``.

    Args:
        output: The dataset / logical plan handed to ``pz.SimpleExecution``.
        policy: The optimization policy handed to ``pz.SimpleExecution``.
        max_rows: Maximum number of rows previewed per table (default 3).
        max_width: Maximum number of characters kept per printed line (default 100).

    Returns:
        None. The preview is written to stdout only.
    """
    execution = pz.SimpleExecution(output, policy)
    records = execution.executeAndOptimize(verbose=True)

    def preview(cells):
        # Coerce each cell to str so a non-string value (number, None, NaN) cannot make
        # str.join raise TypeError; a missing cell list is rendered as an empty line.
        return " | ".join(str(cell) for cell in (cells or []))[:max_width]

    for table in records:
        print("Table name:", table.name)
        print(preview(table.header), "...")
        # `rows` may be missing on a malformed record; treat that as "no rows".
        for row in (table.rows or [])[:max_rows]:
            print(preview(row.cells), "...")
        print()

In [19]:
policy = pz.MinCost()

# Branch 1 -- built but NOT part of the executed plan below:
# PDF papers -> DOI URL -> downloaded HTML -> URLs of the XLS tables on the page.
# It is bound to its own name (`pdfTableURLs`) so it is no longer silently
# overwritten by the URL-file branch that follows.
papers = pz.Dataset("biofabric-pdf", schema=ScientificPaper)
paperURLs = papers.convert(pz.URL, desc="The DOI url of the paper")
htmlDOI = paperURLs.map(pz.DownloadHTMLFunction())
pdfTableURLs = htmlDOI.convert(pz.URL, desc="The URLs of the XLS tables from the page", cardinality="oneToMany")

# Branch 2 -- the pipeline that is actually executed:
# text file of URLs -> URL -> binary download -> File -> XLSFile -> one Table per sheet/table.
urlFile = pz.Dataset("biofabric-urls", schema=pz.TextFile)
tableURLS = urlFile.convert(pz.URL, desc="The URLs of the tables")
binary_tables = tableURLS.map(pz.DownloadBinaryFunction())
tables = binary_tables.convert(pz.File)
xls = tables.convert(pz.XLSFile)
patient_tables = xls.convert(pz.Table, desc="All tables in the file", cardinality="oneToMany")

output = patient_tables
execute(output, policy)

----------
Policy is: Minimum Cost
Chosen plan: Time est: 0.178 -- Cost est: 0.000 -- Quality est: 0.240
 0. MarshalAndScanDataOp -> File 

 1. File -> InduceFromCandidateOp -> TextFile 
    Using Model.GPT_3_5
    (contents,filena...) -> (contents,filena...)

 2. TextFile -> InduceFromCandidateOp -> URL 
    Using Model.GPT_3_5
    (contents,filena...) -> (url...)

 3. URL -> ApplyUserFunctionOp -> Download 
    (url...) -> (content,timesta...)

 4. Download -> InduceFromCandidateOp -> File 
    Using Model.GPT_3_5
    (content,timesta...) -> (contents,filena...)

 5. File -> InduceFromCandidateOp -> XLSFile 
    Using Model.GPT_3_5
    (contents,filena...) -> (contents,filena...)

 6. XLSFile -> InduceFromCandidateOp -> Table 
    Using Model.GPT_3_5
    (contents,filena...) -> (header,name,row...)

Table name: 1-s2.0-S1535610823002192-mmc2.xlsx - Data_dictionary
Column | Description ...
Patient_ID | Unique ID of each patient.  ...
tumor_code | Tumor acronym (BRCA--Breast Cancer; ccR

In [20]:
# Minimum-cost run over `xls` (which must already be bound by an earlier cell):
# convert to tables, then keep only those matching the natural-language filter.
policy = pz.MinCost()

patient_tables = (
    xls
    .convert(pz.Table, desc="All tables in the file", cardinality="oneToMany")
    .filterByStr("The table explains the meaning of attributes")
)
output = patient_tables

execute(output, policy)

----------
Policy is: Minimum Cost
Chosen plan: Time est: 0.114 -- Cost est: 0.000 -- Quality est: 0.841
 0. CacheScanDataOp -> Table 

 1. Table -> FilterCandidateOp -> Table 
    Using Model.GPT_3_5
    Filter: "The table explains the meaning of attributes"
    (header,name,row...) -> (header,name,row...)

Table name: 1-s2.0-S1535610823002192-mmc2.xlsx - Data_dictionary
Column | Description ...
Patient_ID | Unique ID of each patient.  ...
tumor_code | Tumor acronym (BRCA--Breast Cancer; ccRCC--clear cell renal cell carcinoma;  COAD--colo ...
tumor_sample_id_protein | Sample ID for the tumor tissue ( used as column names in the combined prot ...

Table name: 1-s2.0-S1535610823002192-mmc2.xlsx - Supplementary Table 1
Case_ID | tumor_code | tumor_sample_id_protein | normal_sample_id_protein | is_excluded_from_pancanc ...
11BR047 | BRCA | 11BR047-T | nan | nan | nan | 0.0 | nan | 81116212-b7e6-454b-9579-105cf3 | ef52c640 ...
11BR043 | BRCA | 11BR043-T | nan | nan | nan | 0.0 | nan | 6d34

In [21]:
# Call clearCache (with keep_registry=True) first, then rerun the same
# convert + filter over `xls` under the MaxQuality policy.
pz.DataDirectory().clearCache(keep_registry=True)
policy = pz.MaxQuality()

patient_tables = (
    xls
    .convert(pz.Table, desc="All tables in the file", cardinality="oneToMany")
    .filterByStr("The table explains the meaning of attributes")
)
output = patient_tables

execute(output, policy)

----------
Policy is: Maximum Quality
Chosen plan: Time est: 0.853 -- Cost est: 0.003 -- Quality est: 0.518
 0. MarshalAndScanDataOp -> File 

 1. File -> InduceFromCandidateOp -> TextFile 
    Using Model.GPT_4
    (contents,filena...) -> (contents,filena...)

 2. TextFile -> InduceFromCandidateOp -> URL 
    Using Model.GPT_4
    (contents,filena...) -> (url...)

 3. URL -> ApplyUserFunctionOp -> Download 
    (url...) -> (content,timesta...)

 4. Download -> InduceFromCandidateOp -> File 
    Using Model.GPT_4
    (content,timesta...) -> (contents,filena...)

 5. File -> InduceFromCandidateOp -> XLSFile 
    Using Model.GPT_4
    (contents,filena...) -> (contents,filena...)

 6. XLSFile -> InduceFromCandidateOp -> Table 
    Using Model.GPT_4
    (contents,filena...) -> (header,name,row...)

 7. Table -> FilterCandidateOp -> Table 
    Using Model.GPT_4
    Filter: "The table explains the meaning of attributes"
    (header,name,row...) -> (header,name,row...)

Table name: 1-s2.0-S15

In [22]:
# Minimum-cost run over the "biofabric-xls" dataset.
# NOTE(review): this rebinds `xls`, which earlier cells derived from the URL pipeline;
# re-running those earlier cells afterwards would pick up this dataset instead.
policy = pz.MinCost()

xls = pz.Dataset("biofabric-xls", schema=pz.XLSFile)
patient_tables = (
    xls
    .convert(pz.Table, desc="All tables in the file", cardinality="oneToMany")
    .filterByStr("The table records if the patient is excluded from the study")
)
output = patient_tables

execute(output, policy)

----------
Policy is: Minimum Cost
Chosen plan: Time est: 486657.457 -- Cost est: 277.019 -- Quality est: 0.412
 0. MarshalAndScanDataOp -> File 

 1. File -> InduceFromCandidateOp -> XLSFile 
    Using Model.GPT_3_5
    (contents,filena...) -> (contents,filena...)

 2. XLSFile -> InduceFromCandidateOp -> Table 
    Using Model.GPT_3_5
    (contents,filena...) -> (header,name,row...)

 3. Table -> FilterCandidateOp -> Table 
    Using Model.GPT_3_5
    Filter: "The table records if the patient is excluded from the study"
    (header,name,row...) -> (header,name,row...)

Table name: cao_mmc1.xlsx - Clinical_data
case_id | tumor_included_for_the_study | normal_included_for_the_study | histology_diagnosis | age | ...
C3L-00102 | yes | yes | PDAC | 42 | Male | White | United States | head | Unifocal | 3.0 | Not ident ...
C3L-00189 | yes | yes | PDAC | 68 | Female | nan | Canada | head | Unifocal | 2.7 | Not identified | ...
C3L-00277 | yes | yes | PDAC | 69 | Male | White | Other | tail | 

  warn(msg)
  warn(msg)


Table name: clark_S044_CPTAC_CCRCC_Discovery_Cohort_Specimens_r1_Sept2018.xlsx - CCRCC_TMT
Batch | TMT plex | TMT channel | ParticipantID | Parent Sample ID(s) | Aliquot ID | Group | OCT | TC ...
1 | 1 | 126 | C3L-01287 | C3L-01287-06 | CPT0079430001 | Normal | No | C3L-01287-26 | https://pathol ...
1 | 1 | 127N | C3L-00561 | C3L-00561-06 | CPT0023360001 | Normal | No | C3L-00561-26 | https://patho ...
1 | 1 | 127C | C3L-00561 | C3L-00561-01 | CPT0023350003 | Tumor | No | C3L-00561-21 | https://pathol ...

Table name: dou_mmc3.xlsx - E-Mutation-adjacent
Sample | CHROM | POS | ID | REF | ALT | QUAL | FILTER | INFO | FORMAT | NORMAL | TUMOR ...
C3L-00006 | chr9 | 94325425 | . | T | C | . | PASS | AC=1;AF=0.25;AN=4;DP=364;SOMATIC;set=varscan-mu ...
C3L-00006 | chr16 | 1256631 | . | C | T | . | PASS | AC=1;AF=0.25;AN=4;DP=227;SOMATIC;set=varscan-mu ...
C3L-00932 | chr1 | 22597319 | . | A | C | . | PASS | AC=1;AF=0.25;AN=4;DP=96;SOMATIC;set=varscan-mut ...



  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Table name: dou_mmc4.xlsx - B-Novel Splice Junctions
Peptide | P-value | Q-value | MassDiff | Hyperscore | Expectation | Chromosome | 5' Splice Site | 3' ...
MGEEEALQSQVTK | 0.0 | 0.0 | -0.0041 | 42.033 | 3.521e-26 | chr9 | 136474486 | 136475198 | 3 | S078,S ...
WCFGPDGTGPNLLTDLTK | 0.0 | 0.0 | 1.0087 | 34.943 | 6.72799999999999e-24 | chr19 | 3964834 | 3977873  ...
VEAGDEADYYCQVWDTR | 0.0 | 0.0 | 0.0063 | 59.0569999999999 | 1.699e-23 | chr22 | 22713183 | 22993459  ...

Table name: gillette_mmc7.xlsx - Table S7J
Signature.set.id | TN.STK11.outlier | TN.EGFR.outlier | TN.TP53.outlier | TN.KRAS.outlier | TN.STK11 ...
PERT-PSP_ERLOTINIB | 1 | 1 | 0 | 1 | 3.7034 | -2.7473 | -0.0373 | 2.0159 ...
PERT-PSP_WORTMANNIN | 1 | 0 | 0 | 1 | 2.7349 | -1.5854 | -0.4037 | 2.0359 ...
PERT-PSP_GEFITINIB | 0 | 0 | 0 | 0 | 0.9997 | -1.4017 | -0.4305 | 0.7878 ...

Table name: huang_mmc2.xlsx - Data availabilty
case_id | WES | WES_BN | WGS | WGS_BN | RNA-Seq | RNA-Seq_AN | MiRNA | MiRNA_AN | Methy | Methy_AN

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Table name: satpathy_mmc7.xlsx - Table S7B
Gene | medianDifference | pvalue | Normal_N | Tumor_N | pair | FDR | signedFDR | RNAevidence | Phosp ...
S100A2 | 6.517 | 5.63e-14 | 99 | 99 | 99 | 1.8e-13 | 12.74391 | 1 | 0 | 1 ...
KRT6A | 5.9276 | 4e-15 | 99 | 99 | 99 | 1.53e-14 | 13.81486 | 1 | 1 | 1 ...
SERPINB5 | 5.6931 | 1.34e-16 | 99 | 99 | 99 | 7.07e-16 | 15.15054 | 1 | 0 | 0 ...

Table name: satpathy_mmc7.xlsx - Table S7C
ID | Gene Type | Measurement type | Change | Basal inclusive | Classical | EMT enriched | Inflamed s ...
TGFBR2 | Tumor Suppressor | Global protein | Downregulated | X | X | X | X | X | 4.89274980284654e-2 ...
MSI2 | Oncogene | Global protein | Upregulated | X | X | X | X | X | 1.77339394872147e-18 ...
SPRED1 | Tumor Suppressor | Global protein | Downregulated | X | X | X | X | X | 6.43750606951931e-1 ...

Table name: vasaikar_mmc3.xlsx - A-somatic_events
Chr | Start | End | Ref | Alt | Variant_Type | Variant_Function | Gene | mRNA | mRNA_Change | Protei ...
chr19 |

In [None]:
# Minimum-cost run over the "biofabric-xls" dataset, keeping only tables that
# match the biometric-data filter string.
policy = pz.MinCost()

xls = pz.Dataset("biofabric-xls", schema=pz.XLSFile)
patient_tables = (
    xls
    .convert(pz.Table, desc="All tables in the file", cardinality="oneToMany")
    .filterByStr("The table contains patient biometric data")
)
output = patient_tables

execute(output, policy)

In [None]:
# Minimum-cost run over the "biofabric-xls" dataset, keeping only tables that
# match the proteomic-data filter string.
policy = pz.MinCost()

xls = pz.Dataset("biofabric-xls", schema=pz.XLSFile)
patient_tables = (
    xls
    .convert(pz.Table, desc="All tables in the file", cardinality="oneToMany")
    .filterByStr("The table contains proteomic data")
)
output = patient_tables

execute(output, policy)