In [2]:
# Note: include in the tests folder a .env file that contains the API keys for the services used in the tests
import os
# Import the local `context` module only when no OpenAI key is set in the environment.
# NOTE(review): presumably `context` loads the keys from the .env file mentioned above — confirm.
if not os.environ.get('OPENAI_API_KEY'):
    import context
    
from palimpzest.constants import PZ_DIR
import palimpzest as pz

import pandas as pd
import os  # NOTE(review): duplicate of the `import os` at the top of this cell

# Clear the Palimpzest cache before running anything in this notebook.
# keep_registry=True is passed, presumably so that registered datasets such as
# 'biofabric-tiny' are preserved — TODO confirm against the Palimpzest docs.
pz.DataDirectory().clearCache(keep_registry=True)

### Schema definition
In the following cell we define the target schema of the case study we are interested in. Note that we do not specify the target attributes with fine-grained metadata, but rather with natural-language (and possibly ambiguous) descriptions.

In [3]:
# Target schema of the case study: one record per patient case.
# Only `case_submitter_id` is required; every other attribute is optional.
# Each attribute is described in natural language only (no fine-grained metadata).
# NOTE(review): the class docstring and the `desc` strings are presumably forwarded to the
# LLM by Palimpzest, so treat them as runtime text rather than documentation — confirm
# before rewording any of them.
class CaseData(pz.Schema):
    """An individual row extracted from a table containing medical study data."""
    # Case identifier (the only required field)
    case_submitter_id = pz.Field(desc="The ID of the case", required=True)
    # Patient demographics and status
    age_at_diagnosis = pz.Field(desc="The age of the patient at the time of diagnosis", required=False)
    race = pz.Field(desc="An arbitrary classification of a taxonomic group that is a division of a species.", required=False)
    ethnicity = pz.Field(desc="Whether an individual describes themselves as Hispanic or Latino or not.", required=False)
    gender = pz.Field(desc="Text designations that identify gender.", required=False)
    vital_status = pz.Field(desc="The vital status of the patient", required=False)
    # AJCC pathologic staging
    ajcc_pathologic_t = pz.Field(desc="The AJCC pathologic T", required=False)
    ajcc_pathologic_n = pz.Field(desc="The AJCC pathologic N", required=False)
    ajcc_pathologic_stage = pz.Field(desc="The AJCC pathologic stage", required=False)
    # Tumor characteristics
    tumor_grade = pz.Field(desc="The tumor grade", required=False)
    tumor_focality = pz.Field(desc="The tumor focality", required=False)
    tumor_largest_dimension_diameter = pz.Field(desc="The tumor largest dimension diameter", required=False)
    # Diagnosis
    primary_diagnosis = pz.Field(desc="The primary diagnosis", required=False)
    morphology = pz.Field(desc="The morphology", required=False)
    tissue_or_organ_of_origin = pz.Field(desc="The tissue or organ of origin", required=False)
    # Disabled field, currently not part of the schema:
    # tumor_code = pz.Field(desc="The tumor code", required=False)
    # Provenance: per its description, taken from the table name rather than from the row contents
    study = pz.Field(desc="The last name of the author of the study, from the table name", required=False)

## Printing base dataset
In the following cell we print the base dataset we are interested in: for every table extracted from the spreadsheets we show its name, its header, and its first three rows, with each line truncated to 100 characters.

In [3]:
def print_tables(output, max_rows=3, max_width=100):
    """Print a short preview of every table in ``output``.

    For each table the name, the header and the first ``max_rows`` rows are
    printed. Header and rows are rendered as ``" | "``-joined lines cut to
    ``max_width`` characters and always followed by ``" ..."``. A blank line
    is printed after each table.

    Args:
        output: Iterable of table-like records exposing ``name``, ``header``
            and ``rows`` attributes.
        max_rows: Number of leading rows printed per table (default 3).
        max_width: Maximum number of characters kept from each joined line
            (default 100).
    """
    def _preview(cells):
        # str() every cell: str.join raises TypeError on non-string values
        # (ints, float NaN, None), which spreadsheet data can easily contain.
        return " | ".join(str(cell) for cell in cells)[:max_width]

    for table in output:
        print("Table name:", table.name)
        print(_preview(table.header), "...")
        for row in table.rows[:max_rows]:
            print(_preview(row), "...")
        print()


# Logical plan: read the spreadsheets registered as 'biofabric-tiny' and split
# each file into its tables (cardinality "oneToMany": one file, many tables).
xls = pz.Dataset("biofabric-tiny", schema=pz.XLSFile)
patient_tables = xls.convert(
    pz.Table,
    desc="All tables in the file",
    cardinality="oneToMany",
)
output = patient_tables

# Execution settings for this run.
engine = pz.SequentialSingleThreadExecution
policy = pz.MaxQuality()

records, plan, stats = pz.Execute(
    patient_tables,
    policy=policy,
    nocache=True,
    execution_engine=engine,
)

# Preview every extracted table.
print_tables(records)

Available models:  [GPT_3_5, GPT_4]
LOGICAL PLANS: 1
INITIAL PLANS: 1
DEDUP PLANS: 1
PARETO PLANS: 1
Table name: dou_mmc1.xlsx_UCEC_CPTAC3_meta_table_V2.1
idx | Proteomics_Participant_ID | Case_excluded | Proteomics_TMT_batch | Proteomics_TMT_plex | Prote ...
S001 | C3L-00006 | No | 2 | 5 | 128N | C3L-00006-01 | CPT0001460012 | Tumor | No | United States | F ...
S002 | C3L-00008 | No | 4 | 16 | 130N | C3L-00008-01 | CPT0001300009 | Tumor | No | United States |  ...
S003 | C3L-00032 | No | 1 | 2 | 131 | C3L-00032-01 | CPT0001420009 | Tumor | No | United States | FI ...

Table name: vasaikar_mmc1.xlsx_Description
Sheet | Description ...
A-Annotation | Column annotations ...
B-ClinicalData | Clinical Information for 110 Colon cancer patients ...

Table name: vasaikar_mmc1.xlsx_A-Annotation
Column name | Column definition | Unnamed: 2 | Unnamed: 3 | Data type | Definition ...
SampleID | ID of the prospective participant | nan | nan | Bin | Binary ...
Clinical | Clinical data availability |

### Filtering stage
In the following cell we define the first part of the workload: a filtering stage that selects, from all the spreadsheets, only the tables that contain relevant biometric information about the patients.


In [4]:
import time
# Prerequisite — register the dataset once from a shell:
#   pz reg --name biofabric-tiny --path testdata/biofabric-tiny
pz.DataDirectory().clearCache(keep_registry=True)

# Logical plan: spreadsheets -> tables -> only the tables matching the
# natural-language filter below.
xls = pz.Dataset('biofabric-tiny', schema=pz.XLSFile)
patient_tables = xls.convert(pz.Table, desc="All tables in the file", cardinality="oneToMany")
patient_tables = patient_tables.filter("The table contains biometric information about the patient")

output = patient_tables

policy = pz.MinCost()
engine = pz.SequentialSingleThreadExecution
tables, plan, stats = pz.Execute(patient_tables, 
                                  policy = policy,
                                  nocache=True,
                                  execution_engine=engine)

# Reuse the preview helper defined in the "Printing base dataset" cell instead
# of repeating its loop inline; the printed output is the same.
print_tables(tables)

print(stats)

Available models:  [GPT_3_5, GPT_4]
LOGICAL PLANS: 1
INITIAL PLANS: 2
DEDUP PLANS: 2
PARETO PLANS: 1
Table name: dou_mmc1.xlsx_UCEC_CPTAC3_meta_table_V2.1
idx | Proteomics_Participant_ID | Case_excluded | Proteomics_TMT_batch | Proteomics_TMT_plex | Prote ...
S001 | C3L-00006 | No | 2 | 5 | 128N | C3L-00006-01 | CPT0001460012 | Tumor | No | United States | F ...
S002 | C3L-00008 | No | 4 | 16 | 130N | C3L-00008-01 | CPT0001300009 | Tumor | No | United States |  ...
S003 | C3L-00032 | No | 1 | 2 | 131 | C3L-00032-01 | CPT0001420009 | Tumor | No | United States | FI ...

Table name: wang_mmc2.xlsx_clinical_data
case_id | age | gender | height | weight | bmi | country_of_origin | race | ethnicity | ethnicity_se ...
C3L-00104 | 58 | Male | 188.0 | 115.0 | 32.54 | United States | White | Not-Hispanic or Latino | Cau ...
C3L-00365 | 59 | Female | 162.0 | 54.0 | 20.61 | United States | White | Not-Hispanic or Latino | Ca ...
C3L-00674 | 45 | Male | 193.0 | 102.0 | 27.44 | nan | White | Not-Hi

In [5]:
# Re-run the filtering plan built in the previous cell (`patient_tables`), this
# time with the pipelined parallel execution engine.
policy = pz.MinCost()
engine = pz.PipelinedParallelExecution
tables, plan, stats = pz.Execute(patient_tables, 
                                  policy = policy,
                                  nocache=True,
                                  execution_engine=engine)

# Reuse the preview helper defined in the "Printing base dataset" cell instead
# of repeating its loop inline; the printed output is the same.
print_tables(tables)

print(stats)

Available models:  [GPT_3_5, GPT_4]
LOGICAL PLANS: 1
INITIAL PLANS: 2
DEDUP PLANS: 2
PARETO PLANS: 1
Table name: dou_mmc1.xlsx_UCEC_CPTAC3_meta_table_V2.1
idx | Proteomics_Participant_ID | Case_excluded | Proteomics_TMT_batch | Proteomics_TMT_plex | Prote ...
S001 | C3L-00006 | No | 2 | 5 | 128N | C3L-00006-01 | CPT0001460012 | Tumor | No | United States | F ...
S002 | C3L-00008 | No | 4 | 16 | 130N | C3L-00008-01 | CPT0001300009 | Tumor | No | United States |  ...
S003 | C3L-00032 | No | 1 | 2 | 131 | C3L-00032-01 | CPT0001420009 | Tumor | No | United States | FI ...

Table name: wang_mmc2.xlsx_clinical_data
case_id | age | gender | height | weight | bmi | country_of_origin | race | ethnicity | ethnicity_se ...
C3L-00104 | 58 | Male | 188.0 | 115.0 | 32.54 | United States | White | Not-Hispanic or Latino | Cau ...
C3L-00365 | 59 | Female | 162.0 | 54.0 | 20.61 | United States | White | Not-Hispanic or Latino | Ca ...
C3L-00674 | 45 | Male | 193.0 | 102.0 | 27.44 | nan | White | Not-Hi

In [7]:
# Show the physical plan and the statistics currently bound to `plan` / `stats`
# (both are assigned by the pz.Execute calls in the cells above).
# NOTE(review): the saved output of this cell lists no filter operator and a total
# cost of 0.0, so it may come from a different run than the filtering cells above —
# confirm with Restart & Run All.
print("Chosen plan:")
print(plan, "\n")
print("Stats:", stats)

Chosen plan:
PhysicalPlan:
0. dir -> MarshalAndScanDataOp -> XLSFile(contents, filename, number_sheets, sheet_names)
(contents,filename,number_sheet) 

1. XLSFile -> ConvertXLSToTable -> Table
(contents,filename,number_sheet) -> (filename,header,name,rows) 
 

Stats: Total_plan_time=0.5795912742614746 
Total_plan_cost=0.0 
0. MarshalAndScanDataOp time=0.06175112724304199 cost=0.0 
1. ConvertXLSToTable time=0.516960620880127 cost=0.0 



## Matching stage
Once the tables are filtered, we can define the second part of the workload, which matches the patient records across the different tables and merges the information into a single table.

Minimum cost: GPT 3.5

In [8]:
# Prerequisite — register the pre-filtered dataset once from a shell:
#   pz reg --name biofabric-tiny-filtered --path testdata/biofabric-tiny-filtered
pz.DataDirectory().clearCache(keep_registry=True)

# Logical plan: spreadsheets -> tables -> CaseData records.
input_dataset = pz.Dataset("biofabric-tiny-filtered", schema=pz.XLSFile)
patient_tables = input_dataset.convert(
    pz.Table,
    desc="All tables in the file",
    cardinality="oneToMany",
)
case_data = patient_tables.convert(
    CaseData,
    desc="The patient data in the table",
    cardinality="oneToMany",
)

# Execute under the minimum-cost policy, with code synthesis switched off.
policy = pz.MinCost()
engine = pz.PipelinedParallelExecution
matched_tables, plan, stats = pz.Execute(
    case_data,
    policy=policy,
    nocache=True,
    allow_code_synth=False,
    execution_engine=engine,
)

# One dict per returned record, gathered into a DataFrame for rich display.
output_rows = [record._asDict() for record in matched_tables]
output_df = pd.DataFrame(output_rows)
display(output_df)

Available models:  [GPT_3_5, GPT_4]
LOGICAL PLANS: 1
INITIAL PLANS: 14
DEDUP PLANS: 2
PARETO PLANS: 1


Unnamed: 0,rows,filename,header,name,age_at_diagnosis,ajcc_pathologic_n,ajcc_pathologic_stage,ajcc_pathologic_t,case_submitter_id,ethnicity,gender,morphology,primary_diagnosis,race,study,tissue_or_organ_of_origin,tumor_focality,tumor_grade,tumor_largest_dimension_diameter,vital_status
0,"[[A-Annotation, Column annotations], [B-Clinic...",vasaikar_mmc1.xlsx,"[Sheet, Description]",vasaikar_mmc1.xlsx_Description,Value based on data in the rows,The AJCC pathologic N,The AJCC pathologic stage,The AJCC pathologic T,ID123,Whether an individual describes themselves as ...,Text designations that identify gender,The morphology,,An arbitrary classification of a taxonomic gro...,vasaikar,,,The tumor grade,The tumor largest dimension diameter,
1,"[[Type, BIN, BIN, BIN, BIN, BIN, BIN, BIN, BIN...",vasaikar_mmc1.xlsx,"[SampleID, Clinical, Mutation, CNV, miRNA-seq,...",vasaikar_mmc1.xlsx_B-ClinicalData,Age,PN,Stage,PT,SampleID,Hispanic or Latino,Gender,,B-ClinicalData,An arbitrary classification of a taxonomic gro...,vasaikar,Colon,TMT Proteome,Grade 2,TumorPurity,Vital.Status
2,"[[S001, C3L-00006, No, 2, 5, 128N, C3L-00006-0...",dou_mmc1.xlsx,"[idx, Proteomics_Participant_ID, Case_excluded...",dou_mmc1.xlsx_UCEC_CPTAC3_meta_table_V2.1,Age of the patient at the time of diagnosis,The AJCC pathologic N,The AJCC pathologic stage,Path_Stage_Primary_Tumor-pT,Proteomics_Participant_ID,Whether an individual describes themselves as ...,Text designations that identify gender,The morphology field value extracted from the ...,Endometrial Carcinoma,An arbitrary classification of a taxonomic gro...,CPTAC3,Country,Tumor focality information extracted from the ...,The tumor grade,tumor_largest_dimension_diameter,The vital status of the patient
3,"[[C3L-00104, 58, Male, 188.0, 115.0, 32.54, Un...",wang_mmc2.xlsx,"[case_id, age, gender, height, weight, bmi, co...",wang_mmc2.xlsx_clinical_data,age,AJCC pathologic N,AJCC pathologic stage value,AJCC pathologic T,case_id,ethnicity_self_identify,Text designations that identify gender,Additional information or clarification needed,The primary diagnosis,An arbitrary classification of a taxonomic gro...,wang,,tumor_focality,The tumor grade,tumor_largest_dimension_diameter,The vital status of the patient


In [9]:
# Show the physical plan and statistics returned by the pz.Execute call of the
# minimum-cost matching cell above.
# NOTE(review): the saved output ends with a `dict_keys([...])` line that nothing in
# this cell prints, so the output may predate the current code — confirm by re-running.
print(plan, "\n")
print("Stats:", stats)

PhysicalPlan:
0. dir -> MarshalAndScanDataOp -> XLSFile(contents, filename, number_sheets, sheet_names)
(contents,filename,number_sheet) 

1. XLSFile -> ConvertXLSToTable -> Table
(contents,filename,number_sheet) -> (filename,header,name,rows) 

2. Table -> LLMConvertConventional -> CaseData
(filename,header,name,rows) -> (age_at_diagnosis,ajcc_patholog) 
Using model: gpt-3.5-turbo-0125
Prompt strategy: dspy-chain-of-thought-question
Query strategy: bonded-with-fallback
 

Stats: Total_plan_time=84.75389361381531 
Total_plan_cost=0.028860499999999994 
0. MarshalAndScanDataOp time=0.04571676254272461 cost=0.0 
1. ConvertXLSToTable time=0.46941590309143066 cost=0.0 
2. LLMConvertConventional time=84.23336219787598 cost=0.028860499999999994 

dict_keys(['plan_id', 'operator_stats', 'total_plan_time', 'total_plan_cost', 'plan_idx'])


Maximum Quality: GPT 4

In [10]:
pz.DataDirectory().clearCache(keep_registry=True)

# Logical plan: spreadsheets -> tables -> CaseData records.
# Unlike the minimum-cost cell, the table conversion here is given no `desc`.
input_dataset = pz.Dataset("biofabric-tiny-filtered", schema=pz.XLSFile)
patient_tables = input_dataset.convert(pz.Table, cardinality="oneToMany")
case_data = patient_tables.convert(
    CaseData,
    desc="The patient data in the table",
    cardinality="oneToMany",
)

# Execute under the maximum-quality policy, with code synthesis switched off.
policy = pz.MaxQuality()
engine = pz.PipelinedParallelExecution
matched_tables, plan, stats = pz.Execute(
    case_data,
    policy=policy,
    nocache=True,
    allow_code_synth=False,
    execution_engine=engine,
)

# One dict per returned record, gathered into a DataFrame for rich display.
output_rows = [record._asDict() for record in matched_tables]
output_df = pd.DataFrame(output_rows)
display(output_df)

Available models:  [GPT_3_5, GPT_4]
LOGICAL PLANS: 1
INITIAL PLANS: 14
DEDUP PLANS: 2
PARETO PLANS: 1
Error parsing answer: No output was found!
Error parsing answer: No output was found!
Error parsing answer: No output was found!
Error parsing answer: No output was found!
Error parsing answer: No output was found!
Error parsing answer: No output was found!
Error parsing answer: No output was found!
Error parsing answer: No output was found!
Error parsing answer: No output was found!
Error parsing answer: No output was found!
Error parsing answer: No output was found!


Unnamed: 0,rows,filename,header,name,age_at_diagnosis,ajcc_pathologic_n,ajcc_pathologic_stage,ajcc_pathologic_t,case_submitter_id,ethnicity,gender,morphology,primary_diagnosis,race,study,tissue_or_organ_of_origin,tumor_focality,tumor_grade,tumor_largest_dimension_diameter,vital_status
0,"[[S001, C3L-00006, No, 2, 5, 128N, C3L-00006-0...",dou_mmc1.xlsx,"[idx, Proteomics_Participant_ID, Case_excluded...",dou_mmc1.xlsx_UCEC_CPTAC3_meta_table_V2.1,Age,,[],[],Proteomics_Participant_ID,Description of whether an individual describes...,,,,,dou,,Tumor_Focality,,,
1,"[[A-Annotation, Column annotations], [B-Clinic...",vasaikar_mmc1.xlsx,"[Sheet, Description]",vasaikar_mmc1.xlsx_Description,,,[],[],,,,,,,vasaikar,,,,[],[]
2,"[[Type, BIN, BIN, BIN, BIN, BIN, BIN, BIN, BIN...",vasaikar_mmc1.xlsx,"[SampleID, Clinical, Mutation, CNV, miRNA-seq,...",vasaikar_mmc1.xlsx_B-ClinicalData,Derived from the 'Age' column in the table,,Stage,,SampleID,[],,,,[],vasaikar,,[],,,
3,"[[C3L-00104, 58, Male, 188.0, 115.0, 32.54, Un...",wang_mmc2.xlsx,"[case_id, age, gender, height, weight, bmi, co...",wang_mmc2.xlsx_clinical_data,,,[],[],,,,,,An arbitrary classification of a taxonomic gro...,wang,,,,,unknown


In [13]:
# Show the physical plan and statistics returned by the pz.Execute call of the
# maximum-quality matching cell above.
print(plan, "\n")
# Use the `stats` variable explicitly. The IPython `_` shortcut holds whatever
# the most recent cell output happened to be, so it is not reproducible on a
# fresh Restart & Run All.
print("Stats:", stats)

PhysicalPlan:
0. dir -> MarshalAndScanDataOp -> XLSFile(contents, filename, number_sheets, sheet_names)
(contents,filename,number_sheet) 

1. XLSFile -> ConvertXLSToTable -> Table
(contents,filename,number_sheet) -> (filename,header,name,rows) 

2. Table -> LLMConvertConventional -> CaseData
(filename,header,name,rows) -> (age_at_diagnosis,ajcc_patholog) 
Using model: gpt-3.5-turbo-0125
Prompt strategy: dspy-chain-of-thought-question
Query strategy: bonded-with-fallback
 

Stats: Total_plan_time=381.9855651855469 
Total_plan_cost=0.70077 
0. MarshalAndScanDataOp time=0.13936471939086914 cost=0.0 
1. ConvertXLSToTable time=0.8186140060424805 cost=0.0 
2. LLMConvertConventional time=381.0207107067108 cost=0.70077 



## End to end

Minimum cost : GPT 3.5

In [17]:
pz.DataDirectory().clearCache(keep_registry=True)

policy = pz.MinCost()

# Full logical plan: spreadsheets -> tables -> biometric tables only -> CaseData records.
xls = pz.Dataset("biofabric-tiny", schema=pz.XLSFile)
patient_tables = xls.convert(
    pz.Table,
    desc="All tables in the file",
    cardinality="oneToMany",
)
patient_tables = patient_tables.filter(
    "The table contains biometric information about the patient"
)
case_data = patient_tables.convert(
    CaseData,
    desc="The patient data in the table",
    cardinality="oneToMany",
)

# `matched_tables` first holds the execution engine instance and is then rebound
# to whatever executeAndOptimize returns (iterated as records below).
matched_tables = pz.SequentialSingleThreadExecution(case_data, policy)
matched_tables = matched_tables.executeAndOptimize(verbose=True)

# One dict per returned record, gathered into a DataFrame for rich display.
output_rows = [record._asDict() for record in matched_tables]
output_df = pd.DataFrame(output_rows)
display(output_df)

LOGICAL PLANS: 1
INITIAL PLANS: 27
DEDUP PLANS: 27
PARETO PLANS: 17
----------
Policy is: Minimum Cost
Chosen plan: Time est: 303.905 -- Cost est: 0.173 -- Quality est: 0.288
 0. MarshalAndScanDataOp -> File 

 1. File -> InduceFromCandidateOp -> XLSFile 
    Using Model.GPT_3_5
    (contents,filena...) -> (contents,filena...)

 2. XLSFile -> InduceFromCandidateOp -> Table 
    Using Model.GPT_3_5
    (contents,filena...) -> (filename,header...)

 3. Table -> FilterCandidateOp -> Table 
    Using Model.GPT_3_5
    Filter: "The table contains biometric information about the patient"
    (filename,header...) -> (filename,header...)

 4. Table -> InduceFromCandidateOp -> CaseData 
    Using Model.GPT_3_5
    (filename,header...) -> (age_at_diagnosi...)



Unnamed: 0,age_at_diagnosis,ajcc_pathologic_n,ajcc_pathologic_stage,ajcc_pathologic_t,case_submitter_id,ethnicity,gender,morphology,primary_diagnosis,race,study,tissue_or_organ_of_origin,tumor_focality,tumor_grade,tumor_largest_dimension_diameter,vital_status
0,38.88,pN0,Stage I,pT1a (FIGO IA),C3L-00006,Not-Hispanic or Latino,Female,Endometrioid,Endometrioid,White,dou_mmc1.xlsx_UCEC_CPTAC3_meta_table_V2.1,Anterior endometrium,Unifocal,Cannot be determined,2.9,PASS
1,39.76,pNX,Stage IV,pT1a (FIGO IA),C3L-00008,Not-Hispanic or Latino,Female,Endometrioid,Endometrioid,White,dou_mmc1.xlsx_UCEC_CPTAC3_meta_table_V2.1,Posterior endometrium,Unifocal,Cannot be determined,3.5,PASS
2,51.19,pN0,Stage I,pT1a (FIGO IA),C3L-00032,Not-Hispanic or Latino,Female,Endometrioid,Endometrioid,White,dou_mmc1.xlsx_UCEC_CPTAC3_meta_table_V2.1,Anterior and Posterior endometrium,Unifocal,Cannot be determined,4.5,PASS
3,32.69,pNX,Stage I,pT1a (FIGO IA),C3L-00090,Not-Hispanic or Latino,Female,Endometrioid,Endometrioid,White,dou_mmc1.xlsx_UCEC_CPTAC3_meta_table_V2.1,Anterior and Posterior endometrium,Unifocal,Cannot be determined,3.5,PASS
4,58.0,,12 Months,,C3L-00104,Not-Hispanic or Latino,Male,Diagnostic pathology report,Deceased,White,wang,Frontal Lobe,,PASS,0.0,Deceased
5,59.0,,12 Months,,C3L-00365,Not-Hispanic or Latino,Female,Diagnostic pathology report,Deceased,White,wang,Parietal Lobe,,FAIL,1.0,Deceased
6,45.0,,24 Months,,C3L-00674,Not-Hispanic or Latino,Male,MRI review,Deceased,White,wang,Frontal Lobe,,PASS,1.0,Deceased
7,69.0,,12 Months,,C3L-00677,Not-Hispanic or Latino,Female,MRI review,Deceased,White,wang,Frontal Lobe,,PASS,1.0,Deceased
8,77.0,,24 Months,,C3L-01040,,Male,MRI review,Living,,wang,Frontal Lobe,,PASS,2.0,Living


Maximum Quality: GPT 4

In [18]:
pz.DataDirectory().clearCache(keep_registry=True)

policy = pz.MaxQuality()

# Full logical plan: spreadsheets -> tables -> biometric tables only -> CaseData records.
xls = pz.Dataset("biofabric-tiny", schema=pz.XLSFile)
patient_tables = xls.convert(
    pz.Table,
    desc="All tables in the file",
    cardinality="oneToMany",
)
patient_tables = patient_tables.filter(
    "The table contains biometric information about the patient"
)
case_data = patient_tables.convert(
    CaseData,
    desc="The patient data in the table",
    cardinality="oneToMany",
)

# `matched_tables` first holds the execution engine instance and is then rebound
# to whatever executeAndOptimize returns (iterated as records below).
matched_tables = pz.SequentialSingleThreadExecution(case_data, policy)
matched_tables = matched_tables.executeAndOptimize(verbose=True)

# One dict per returned record, gathered into a DataFrame for rich display.
output_rows = [record._asDict() for record in matched_tables]
output_df = pd.DataFrame(output_rows)
display(output_df)

LOGICAL PLANS: 1
INITIAL PLANS: 27
DEDUP PLANS: 27
PARETO PLANS: 17
----------
Policy is: Maximum Quality
Chosen plan: Time est: 411.647 -- Cost est: 0.973 -- Quality est: 0.486
 0. MarshalAndScanDataOp -> File 

 1. File -> InduceFromCandidateOp -> XLSFile 
    Using Model.GPT_3_5
    (contents,filena...) -> (contents,filena...)

 2. XLSFile -> InduceFromCandidateOp -> Table 
    Using Model.GPT_4
    (contents,filena...) -> (filename,header...)

 3. Table -> FilterCandidateOp -> Table 
    Using Model.GPT_4
    Filter: "The table contains biometric information about the patient"
    (filename,header...) -> (filename,header...)

 4. Table -> InduceFromCandidateOp -> CaseData 
    Using Model.GPT_4
    (filename,header...) -> (age_at_diagnosi...)

Bonded query processing error: No output objects were generated with bonded query - trying with conventional query...
BondedQuery Error: No output objects were generated with bonded query - trying with conventional query...
Falling back to co

Unnamed: 0,age_at_diagnosis,ajcc_pathologic_n,ajcc_pathologic_stage,ajcc_pathologic_t,case_submitter_id,ethnicity,gender,morphology,primary_diagnosis,race,study,tissue_or_organ_of_origin,tumor_focality,tumor_grade,tumor_largest_dimension_diameter,vital_status
0,64.0,pN0,Stage I,pT1a (FIGO IA),C3L-00006,Not-Hispanic or Latino,Female,Endometrioid,FIGO grade 1,White,UCEC_CPTAC3,Anterior endometrium,Unifocal,Cannot be determined,2.9,No
1,58.0,pNX,Stage IV,pT1a (FIGO IA),C3L-00008,Not-Hispanic or Latino,Female,Endometrioid,FIGO grade 1,White,UCEC_CPTAC3,Posterior endometrium,Unifocal,Cannot be determined,3.5,No
2,50.0,pN0,Stage I,pT1a (FIGO IA),C3L-00032,Not-Hispanic or Latino,Female,Endometrioid,FIGO grade 2,White,UCEC_CPTAC3,"Other, specify",Unifocal,Cannot be determined,4.5,Yes
3,,,,,C3L-00084,,,Carcinosarcoma,,,UCEC_CPTAC3,,,,,
4,75.0,pNX,Stage I,pT1a (FIGO IA),C3L-00090,Not-Hispanic or Latino,Female,Endometrioid,FIGO grade 2,White,UCEC_CPTAC3,"Other, specify",Unifocal,Cannot be determined,3.5,No
5,729.0,N2b,Stage III,T4a,01CO001,,Male,Mucinous,,,Vasaikar,Sigmoid Colon,,,,Living
6,838.0,N0,Stage II,T3,01CO005,,Female,Not Mucinous,,,Vasaikar,Sigmoid Colon,,,,Deceased
7,904.0,N2b,Stage III,T4a,01CO006,,Female,Mucinous,,,Vasaikar,Ascending Colon,,,,Living
8,652.0,N0,Stage II,T3,01CO008,,Female,Mucinous,,,Vasaikar,Descending Colon,,,,Living
9,58.0,,,,C3L-00104,Not-Hispanic or Latino,Male,,,White,wang,Frontal Lobe,,,,Deceased
