In [1]:
# Note: the tests folder must contain a .env file with the API keys for the services used in the tests.
import os

import pandas as pd  # type: ignore

from palimpzest.constants import Cardinality
from palimpzest.core.lib.fields import Field
from palimpzest.core.lib.schemas import Schema, Table, XLSFile
from palimpzest.datamanager.datamanager import DataDirectory
from palimpzest.policy import MaxQuality, MinCost
from palimpzest.query.processor.config import QueryProcessorConfig
from palimpzest.sets import Dataset

# Only fall back to the project's env loader when OPENAI_API_KEY is unset or empty.
if not os.environ.get('OPENAI_API_KEY'):
    # Imported lazily, so the helper is only pulled in when the key is missing.
    from palimpzest.utils import load_env
    # NOTE(review): presumably reads the .env file with the API keys - confirm in palimpzest.utils.
    load_env()

# Clear the data directory cache before running anything.
# NOTE(review): keep_registry=True presumably preserves the registered datasets - confirm.
DataDirectory().clear_cache(keep_registry=True)

### Schema definition
In the following cell we define the target schema of the case study we are interested in. Note how we do not specify the target attributes with fine-grained metadata, but rather with natural-language (and possibly ambiguous) descriptions.

In [2]:
# Target schema of the case study: one record per patient case.
# Every attribute is declared as a Field that carries only a free-text `desc`.
# NOTE(review): the class docstring and the `desc` strings are presumably consumed at
# runtime (e.g. in prompts), so they are treated as code here - confirm before rewording.
class CaseData(Schema):
    """An individual row extracted from a table containing medical study data."""
    # --- Case identifier and patient demographics ---
    case_submitter_id = Field(desc="The ID of the case")
    age_at_diagnosis = Field(desc="The age of the patient at the time of diagnosis")
    race = Field(desc="An arbitrary classification of a taxonomic group that is a division of a species.")
    ethnicity = Field(desc="Whether an individual describes themselves as Hispanic or Latino or not.")
    gender = Field(desc="Text designations that identify gender.")
    vital_status = Field(desc="The vital status of the patient")
    # --- AJCC staging ---
    ajcc_pathologic_t = Field(desc="Code of pathological T (primary tumor) to define the size or contiguous extension of the primary tumor (T), using staging criteria from the American Joint Committee on Cancer (AJCC).")
    ajcc_pathologic_n = Field(desc="The codes that represent the stage of cancer based on the nodes present (N stage) according to criteria based on multiple editions of the AJCC's Cancer Staging Manual.")
    ajcc_pathologic_stage = Field(desc="The extent of a cancer, especially whether the disease has spread from the original site to other parts of the body based on AJCC staging criteria.")
    # --- Tumor characteristics ---
    tumor_grade = Field(desc="Numeric value to express the degree of abnormality of cancer cells, a measure of differentiation and aggressiveness.")
    tumor_focality = Field(desc="The text term used to describe whether the patient's disease originated in a single location or multiple locations.")
    tumor_largest_dimension_diameter = Field(desc="The tumor largest dimension diameter.")
    # --- ICD-O diagnosis, morphology and site of origin ---
    primary_diagnosis = Field(desc="Text term used to describe the patient's histologic diagnosis, as described by the World Health Organization's (WHO) International Classification of Diseases for Oncology (ICD-O).")
    morphology = Field(desc="The Morphological code of the tumor, as described by the World Health Organization's (WHO) International Classification of Diseases for Oncology (ICD-O).")
    tissue_or_organ_of_origin = Field(desc="The text term used to describe the anatomic site of origin, of the patient's malignant disease, as described by the World Health Organization's (WHO) International Classification of Diseases for Oncology (ICD-O).")
    # Disabled field, currently not part of the schema. TODO(review): remove or re-enable.
    # tumor_code = Field(desc="The tumor code")
    # --- Provenance: expected to be derived from the table name rather than from a column ---
    study = Field(desc="The last name of the author of the study, from the table name")

## Printing base dataset
In the following cell we print the base dataset we are interested in: every table extracted from the input spreadsheets, before any filtering is applied. For each table, only the header and the first three rows are shown.

In [5]:
def print_tables(output, max_rows=3, max_width=100):
    """Print a compact preview of every table in ``output``.

    For each table this prints its name, its header, and its first ``max_rows``
    rows, followed by a blank line. Cells are joined with ``" | "``; each joined
    line is cut to ``max_width`` characters and always followed by ``" ..."``.

    Args:
        output: Iterable of objects exposing ``name``, ``header`` and ``rows``.
        max_rows: Maximum number of rows previewed per table (default 3).
        max_width: Maximum number of characters kept per printed line (default 100).
    """

    def _preview(cells):
        # str() every cell first: str.join raises TypeError on non-string items
        # (numbers, None, NaN), which spreadsheet cells can easily contain.
        return " | ".join(map(str, cells))[:max_width]

    for table in output:
        header = table.header
        subset_rows = table.rows[:max_rows]

        print("Table name:", table.name)
        print(_preview(header), "...")
        for row in subset_rows:
            print(_preview(row), "...")
        print()


# Logical plan: spreadsheet files -> every table they contain (one file may yield many tables).
xls = Dataset("biofabric-tiny", schema=XLSFile)
patient_tables = xls.convert(
    Table,
    desc="All tables in the file",
    cardinality=Cardinality.ONE_TO_MANY,
)
output = patient_tables

# Run the plan under the minimum-cost policy, with caching disabled.
policy = MinCost()
config = QueryProcessorConfig(policy=policy, nocache=True, processing_strategy="no_sentinel")
data_record_collection = output.run(config)

# Preview the header and the first rows of each extracted table.
print_tables(data_record_collection.data_records)

Available models:  [GPT_3_5, GPT_4]
LOGICAL PLANS: 1
INITIAL PLANS: 1
DEDUP PLANS: 1
PARETO PLANS: 1
Table name: dou_mmc1.xlsx_UCEC_CPTAC3_meta_table_V2.1
idx | Proteomics_Participant_ID | Case_excluded | Proteomics_TMT_batch | Proteomics_TMT_plex | Prote ...
S001 | C3L-00006 | No | 2 | 5 | 128N | C3L-00006-01 | CPT0001460012 | Tumor | No | United States | F ...
S002 | C3L-00008 | No | 4 | 16 | 130N | C3L-00008-01 | CPT0001300009 | Tumor | No | United States |  ...
S003 | C3L-00032 | No | 1 | 2 | 131 | C3L-00032-01 | CPT0001420009 | Tumor | No | United States | FI ...

Table name: vasaikar_mmc1.xlsx_Description
Sheet | Description ...
A-Annotation | Column annotations ...
B-ClinicalData | Clinical Information for 110 Colon cancer patients ...

Table name: vasaikar_mmc1.xlsx_A-Annotation
Column name | Column definition | Unnamed: 2 | Unnamed: 3 | Data type | Definition ...
SampleID | ID of the prospective participant | nan | nan | Bin | Binary ...
Clinical | Clinical data availability |

### Filtering stage
In the following cell we define the first part of the workload: a filtering stage that selects, from all the spreadsheets, the tables that contain relevant biometric information about the patient.


In [4]:

# Prerequisite: register the dataset once from a shell before running this cell:
#   pz reg --name biofabric-tiny --path testdata/biofabric-tiny
DataDirectory().clear_cache(keep_registry=True)

# Logical plan: spreadsheet files -> every table they contain -> only the tables
# accepted by the natural-language filter.
xls = Dataset('biofabric-tiny', schema=XLSFile)
patient_tables = xls.convert(Table, desc="All tables in the file", cardinality=Cardinality.ONE_TO_MANY)
patient_tables = patient_tables.filter("The table contains biometric information about the patient")

output = patient_tables

policy = MinCost()
config = QueryProcessorConfig(
    policy=policy,
    nocache=True,
    processing_strategy="no_sentinel",
)
# `output` is the same object as `patient_tables`; run through it so the alias is not dead code.
data_record_collection = output.run(config)

# Reuse the preview helper defined earlier instead of duplicating its loop here.
print_tables(data_record_collection)

print(data_record_collection.execution_stats)

Available models:  [GPT_3_5, GPT_4]
LOGICAL PLANS: 1
INITIAL PLANS: 2
DEDUP PLANS: 2
PARETO PLANS: 1
Table name: dou_mmc1.xlsx_UCEC_CPTAC3_meta_table_V2.1
idx | Proteomics_Participant_ID | Case_excluded | Proteomics_TMT_batch | Proteomics_TMT_plex | Prote ...
S001 | C3L-00006 | No | 2 | 5 | 128N | C3L-00006-01 | CPT0001460012 | Tumor | No | United States | F ...
S002 | C3L-00008 | No | 4 | 16 | 130N | C3L-00008-01 | CPT0001300009 | Tumor | No | United States |  ...
S003 | C3L-00032 | No | 1 | 2 | 131 | C3L-00032-01 | CPT0001420009 | Tumor | No | United States | FI ...

Table name: wang_mmc2.xlsx_clinical_data
case_id | age | gender | height | weight | bmi | country_of_origin | race | ethnicity | ethnicity_se ...
C3L-00104 | 58 | Male | 188.0 | 115.0 | 32.54 | United States | White | Not-Hispanic or Latino | Cau ...
C3L-00365 | 59 | Female | 162.0 | 54.0 | 20.61 | United States | White | Not-Hispanic or Latino | Ca ...
C3L-00674 | 45 | Male | 193.0 | 102.0 | 27.44 | nan | White | Not-Hi

In [5]:
# Re-run the filtered plan (`patient_tables` comes from the previous cell) with the
# same minimum-cost policy, this time using the "pipelined_parallel" execution strategy.
policy = MinCost()
config = QueryProcessorConfig(
    policy=policy,
    nocache=True,
    processing_strategy="no_sentinel",
    execution_strategy="pipelined_parallel",
)
data_record_collection = patient_tables.run(config)

# Reuse the preview helper defined earlier instead of duplicating its loop here.
print_tables(data_record_collection)

print(data_record_collection.execution_stats)

Available models:  [GPT_3_5, GPT_4]
LOGICAL PLANS: 1
INITIAL PLANS: 2
DEDUP PLANS: 2
PARETO PLANS: 1
Table name: dou_mmc1.xlsx_UCEC_CPTAC3_meta_table_V2.1
idx | Proteomics_Participant_ID | Case_excluded | Proteomics_TMT_batch | Proteomics_TMT_plex | Prote ...
S001 | C3L-00006 | No | 2 | 5 | 128N | C3L-00006-01 | CPT0001460012 | Tumor | No | United States | F ...
S002 | C3L-00008 | No | 4 | 16 | 130N | C3L-00008-01 | CPT0001300009 | Tumor | No | United States |  ...
S003 | C3L-00032 | No | 1 | 2 | 131 | C3L-00032-01 | CPT0001420009 | Tumor | No | United States | FI ...

Table name: wang_mmc2.xlsx_clinical_data
case_id | age | gender | height | weight | bmi | country_of_origin | race | ethnicity | ethnicity_se ...
C3L-00104 | 58 | Male | 188.0 | 115.0 | 32.54 | United States | White | Not-Hispanic or Latino | Cau ...
C3L-00365 | 59 | Female | 162.0 | 54.0 | 20.61 | United States | White | Not-Hispanic or Latino | Ca ...
C3L-00674 | 45 | Male | 193.0 | 102.0 | 27.44 | nan | White | Not-Hi

In [7]:
# Report the plan(s) executed by the most recent run and the associated statistics.
print("Chosen plan:")
# The extra "\n" argument leaves visual space between the plan and the stats.
print(data_record_collection.executed_plans, "\n")
print("Stats:", data_record_collection.execution_stats)

Chosen plan:
PhysicalPlan:
0. dir -> MarshalAndScanDataOp -> XLSFile(contents, filename, number_sheets, sheet_names)
(contents,filename,number_sheet) 

1. XLSFile -> ConvertXLSToTable -> Table
(contents,filename,number_sheet) -> (filename,header,name,rows) 
 

Stats: Total_plan_time=0.5795912742614746 
Total_plan_cost=0.0 
0. MarshalAndScanDataOp time=0.06175112724304199 cost=0.0 
1. ConvertXLSToTable time=0.516960620880127 cost=0.0 



## Matching stage
Once the tables are filtered, we can define the second part of the workload, which matches the patient records in the different tables and merges the information into a single table.

Minimum cost: GPT 3.5

In [8]:
# Prerequisite: register the pre-filtered dataset once from a shell:
#   pz reg --name biofabric-tiny-filtered --path testdata/biofabric-tiny-filtered
DataDirectory().clear_cache(keep_registry=True)

# Logical plan: spreadsheet files -> tables -> CaseData records.
input_dataset = Dataset("biofabric-tiny-filtered", schema=XLSFile)
patient_tables = input_dataset.convert(
    Table,
    desc="All tables in the file",
    cardinality=Cardinality.ONE_TO_MANY,
)
case_data = patient_tables.convert(
    CaseData,
    desc="The patient data in the table",
    cardinality=Cardinality.ONE_TO_MANY,
)

# Minimum-cost policy; code synthesis is disabled for this run.
policy = MinCost()
config = QueryProcessorConfig(
    policy=policy,
    nocache=True,
    allow_code_synth=False,
    processing_strategy="no_sentinel",
    execution_strategy="pipelined_parallel",
)
data_record_collection = case_data.run(config)

# Flatten every output record into a plain dict so pandas can tabulate the result.
output_rows = []
for output_table in data_record_collection:
    output_rows.append(output_table.to_dict())

output_df = pd.DataFrame(output_rows)
display(output_df)

Available models:  [GPT_3_5, GPT_4]
LOGICAL PLANS: 1
INITIAL PLANS: 14
DEDUP PLANS: 2
PARETO PLANS: 1


Unnamed: 0,rows,filename,header,name,age_at_diagnosis,ajcc_pathologic_n,ajcc_pathologic_stage,ajcc_pathologic_t,case_submitter_id,ethnicity,gender,morphology,primary_diagnosis,race,study,tissue_or_organ_of_origin,tumor_focality,tumor_grade,tumor_largest_dimension_diameter,vital_status
0,"[[A-Annotation, Column annotations], [B-Clinic...",vasaikar_mmc1.xlsx,"[Sheet, Description]",vasaikar_mmc1.xlsx_Description,Value based on data in the rows,The AJCC pathologic N,The AJCC pathologic stage,The AJCC pathologic T,ID123,Whether an individual describes themselves as ...,Text designations that identify gender,The morphology,,An arbitrary classification of a taxonomic gro...,vasaikar,,,The tumor grade,The tumor largest dimension diameter,
1,"[[Type, BIN, BIN, BIN, BIN, BIN, BIN, BIN, BIN...",vasaikar_mmc1.xlsx,"[SampleID, Clinical, Mutation, CNV, miRNA-seq,...",vasaikar_mmc1.xlsx_B-ClinicalData,Age,PN,Stage,PT,SampleID,Hispanic or Latino,Gender,,B-ClinicalData,An arbitrary classification of a taxonomic gro...,vasaikar,Colon,TMT Proteome,Grade 2,TumorPurity,Vital.Status
2,"[[S001, C3L-00006, No, 2, 5, 128N, C3L-00006-0...",dou_mmc1.xlsx,"[idx, Proteomics_Participant_ID, Case_excluded...",dou_mmc1.xlsx_UCEC_CPTAC3_meta_table_V2.1,Age of the patient at the time of diagnosis,The AJCC pathologic N,The AJCC pathologic stage,Path_Stage_Primary_Tumor-pT,Proteomics_Participant_ID,Whether an individual describes themselves as ...,Text designations that identify gender,The morphology field value extracted from the ...,Endometrial Carcinoma,An arbitrary classification of a taxonomic gro...,CPTAC3,Country,Tumor focality information extracted from the ...,The tumor grade,tumor_largest_dimension_diameter,The vital status of the patient
3,"[[C3L-00104, 58, Male, 188.0, 115.0, 32.54, Un...",wang_mmc2.xlsx,"[case_id, age, gender, height, weight, bmi, co...",wang_mmc2.xlsx_clinical_data,age,AJCC pathologic N,AJCC pathologic stage value,AJCC pathologic T,case_id,ethnicity_self_identify,Text designations that identify gender,Additional information or clarification needed,The primary diagnosis,An arbitrary classification of a taxonomic gro...,wang,,tumor_focality,The tumor grade,tumor_largest_dimension_diameter,The vital status of the patient


In [9]:
# Report the plan(s) executed by the minimum-cost matching run above and its statistics.
# The extra "\n" argument leaves visual space between the plan and the stats.
print(data_record_collection.executed_plans, "\n")
print("Stats:", data_record_collection.execution_stats)

PhysicalPlan:
0. dir -> MarshalAndScanDataOp -> XLSFile(contents, filename, number_sheets, sheet_names)
(contents,filename,number_sheet) 

1. XLSFile -> ConvertXLSToTable -> Table
(contents,filename,number_sheet) -> (filename,header,name,rows) 

2. Table -> LLMConvertConventional -> CaseData
(filename,header,name,rows) -> (age_at_diagnosis,ajcc_patholog) 
Using model: gpt-3.5-turbo-0125
Prompt strategy: dspy-chain-of-thought-question
Query strategy: bonded-with-fallback
 

Stats: Total_plan_time=84.75389361381531 
Total_plan_cost=0.028860499999999994 
0. MarshalAndScanDataOp time=0.04571676254272461 cost=0.0 
1. ConvertXLSToTable time=0.46941590309143066 cost=0.0 
2. LLMConvertConventional time=84.23336219787598 cost=0.028860499999999994 

dict_keys(['plan_id', 'operator_stats', 'total_plan_time', 'total_plan_cost', 'plan_idx'])


Maximum Quality: GPT 4

In [10]:
DataDirectory().clear_cache(keep_registry=True)

# Same logical plan as the minimum-cost matching run, so the two policies are compared
# on identical pipelines: spreadsheet files -> tables -> CaseData records.
input_dataset = Dataset('biofabric-tiny-filtered', schema=XLSFile)
# The table conversion now carries the same `desc` as every other cell in this notebook
# (it was previously omitted only here).
patient_tables = input_dataset.convert(Table, desc="All tables in the file", cardinality=Cardinality.ONE_TO_MANY)
case_data = patient_tables.convert(CaseData, desc="The patient data in the table", cardinality=Cardinality.ONE_TO_MANY)

# Maximum-quality policy; everything else matches the minimum-cost configuration.
policy = MaxQuality()
config = QueryProcessorConfig(
    policy=policy,
    nocache=True,
    allow_code_synth=False,
    processing_strategy="no_sentinel",
    execution_strategy="pipelined_parallel",
)
data_record_collection = case_data.run(config)

# Flatten every output record into a plain dict so pandas can tabulate the result.
output_rows = []
for output_table in data_record_collection:
    output_rows.append(output_table.to_dict())

output_df = pd.DataFrame(output_rows)
display(output_df)

Available models:  [GPT_3_5, GPT_4]
LOGICAL PLANS: 1
INITIAL PLANS: 14
DEDUP PLANS: 2
PARETO PLANS: 1
Error parsing answer: No output was found!
Error parsing answer: No output was found!
Error parsing answer: No output was found!
Error parsing answer: No output was found!
Error parsing answer: No output was found!
Error parsing answer: No output was found!
Error parsing answer: No output was found!
Error parsing answer: No output was found!
Error parsing answer: No output was found!
Error parsing answer: No output was found!
Error parsing answer: No output was found!


Unnamed: 0,rows,filename,header,name,age_at_diagnosis,ajcc_pathologic_n,ajcc_pathologic_stage,ajcc_pathologic_t,case_submitter_id,ethnicity,gender,morphology,primary_diagnosis,race,study,tissue_or_organ_of_origin,tumor_focality,tumor_grade,tumor_largest_dimension_diameter,vital_status
0,"[[S001, C3L-00006, No, 2, 5, 128N, C3L-00006-0...",dou_mmc1.xlsx,"[idx, Proteomics_Participant_ID, Case_excluded...",dou_mmc1.xlsx_UCEC_CPTAC3_meta_table_V2.1,Age,,[],[],Proteomics_Participant_ID,Description of whether an individual describes...,,,,,dou,,Tumor_Focality,,,
1,"[[A-Annotation, Column annotations], [B-Clinic...",vasaikar_mmc1.xlsx,"[Sheet, Description]",vasaikar_mmc1.xlsx_Description,,,[],[],,,,,,,vasaikar,,,,[],[]
2,"[[Type, BIN, BIN, BIN, BIN, BIN, BIN, BIN, BIN...",vasaikar_mmc1.xlsx,"[SampleID, Clinical, Mutation, CNV, miRNA-seq,...",vasaikar_mmc1.xlsx_B-ClinicalData,Derived from the 'Age' column in the table,,Stage,,SampleID,[],,,,[],vasaikar,,[],,,
3,"[[C3L-00104, 58, Male, 188.0, 115.0, 32.54, Un...",wang_mmc2.xlsx,"[case_id, age, gender, height, weight, bmi, co...",wang_mmc2.xlsx_clinical_data,,,[],[],,,,,,An arbitrary classification of a taxonomic gro...,wang,,,,,unknown


In [13]:
# Report the plan(s) executed by the maximum-quality matching run above and its statistics.
# The extra "\n" argument leaves visual space between the plan and the stats.
print(data_record_collection.executed_plans, "\n")
print("Stats:", data_record_collection.execution_stats)

PhysicalPlan:
0. dir -> MarshalAndScanDataOp -> XLSFile(contents, filename, number_sheets, sheet_names)
(contents,filename,number_sheet) 

1. XLSFile -> ConvertXLSToTable -> Table
(contents,filename,number_sheet) -> (filename,header,name,rows) 

2. Table -> LLMConvertConventional -> CaseData
(filename,header,name,rows) -> (age_at_diagnosis,ajcc_patholog) 
Using model: gpt-3.5-turbo-0125
Prompt strategy: dspy-chain-of-thought-question
Query strategy: bonded-with-fallback
 

Stats: Total_plan_time=381.9855651855469 
Total_plan_cost=0.70077 
0. MarshalAndScanDataOp time=0.13936471939086914 cost=0.0 
1. ConvertXLSToTable time=0.8186140060424805 cost=0.0 
2. LLMConvertConventional time=381.0207107067108 cost=0.70077 



## End to end

Minimum cost: GPT 3.5

In [14]:
DataDirectory().clear_cache(keep_registry=True)

# Full pipeline: spreadsheet files -> tables -> filtered tables -> CaseData records.
xls = Dataset('biofabric-tiny', schema=XLSFile)
patient_tables = xls.convert(Table, desc="All tables in the file", cardinality=Cardinality.ONE_TO_MANY)
patient_tables = patient_tables.filter("The table contains biometric information about the patient")
case_data = patient_tables.convert(CaseData, desc="The patient data in the table", cardinality=Cardinality.ONE_TO_MANY)

# Minimum-cost policy with the streaming processor: run() yields record collections
# that are consumed incrementally below.
policy = MinCost()
config = QueryProcessorConfig(
    policy=policy,
    nocache=True,
    allow_code_synth=False,
    allow_token_reduction=False,
    processing_strategy="streaming",
    execution_strategy="sequential",
)
iterable = case_data.run(config)

output_rows = []
for data_record_collection in iterable:  # noqa: B007
    # `output_table` is deliberately left bound after the loop: a later cell inspects it.
    for output_table in data_record_collection:
        # Convert once and reuse the dict for both the debug print and the result list.
        record_dict = output_table.to_dict()
        print(record_dict.keys())
        output_rows.append(record_dict)

output_df = pd.DataFrame(output_rows)
display(output_df)

Available models:  [GPT_3_5, GPT_4]
LOGICAL PLANS: 2
INITIAL PLANS: 6
INITIAL PLANS: 4
DEDUP PLANS: 8
PARETO PLANS: 1
Time for planning: 0.02268195152282715
Iteration number:  1 Last record:  False
dict_keys(['rows', 'filename', 'header', 'name', 'age_at_diagnosis', 'ajcc_pathologic_n', 'ajcc_pathologic_stage', 'ajcc_pathologic_t', 'case_submitter_id', 'ethnicity', 'gender', 'morphology', 'primary_diagnosis', 'race', 'study', 'tissue_or_organ_of_origin', 'tumor_focality', 'tumor_grade', 'tumor_largest_dimension_diameter', 'vital_status'])
Iteration number:  2 Last record:  False
Iteration number:  3 Last record:  True


Unnamed: 0,rows,filename,header,name,age_at_diagnosis,ajcc_pathologic_n,ajcc_pathologic_stage,ajcc_pathologic_t,case_submitter_id,ethnicity,gender,morphology,primary_diagnosis,race,study,tissue_or_organ_of_origin,tumor_focality,tumor_grade,tumor_largest_dimension_diameter,vital_status
0,"[[S001, C3L-00006, No, 2, 5, 128N, C3L-00006-0...",dou_mmc1.xlsx,"[idx, Proteomics_Participant_ID, Case_excluded...",dou_mmc1.xlsx_UCEC_CPTAC3_meta_table_V2.1,Age of the patient at the time of diagnosis,The codes that represent the stage of cancer b...,"The extent of a cancer, especially whether the...",Path_Stage_Primary_Tumor-pT,Proteomics_Participant_ID,Whether an individual describes themselves as ...,Text designations that identify gender,ICD-O Morphological code,Text term used to describe the patient's histo...,An arbitrary classification of a taxonomic gro...,CPTAC3,The text term used to describe the anatomic si...,The text term used to describe whether the pat...,Numeric value to express the degree of abnorma...,value,The vital status of the patient


In [None]:
# Debug aid: list the attribute names of `output_table`, i.e. the last record left bound
# by the record loop of a previously executed cell (fails on a kernel where no such loop ran).
output_table.__dict__.keys()

Maximum Quality: GPT 4

In [18]:
DataDirectory().clear_cache(keep_registry=True)

# Full pipeline: spreadsheet files -> tables -> filtered tables -> CaseData records.
xls = Dataset('biofabric-tiny', schema=XLSFile)
patient_tables = xls.convert(Table, desc="All tables in the file", cardinality=Cardinality.ONE_TO_MANY)
patient_tables = patient_tables.filter("The table contains biometric information about the patient")
case_data = patient_tables.convert(CaseData, desc="The patient data in the table", cardinality=Cardinality.ONE_TO_MANY)

# Maximum-quality policy with the streaming processor: run() yields record collections
# that are consumed incrementally below.
policy = MaxQuality()
config = QueryProcessorConfig(
    policy=policy,
    nocache=True,
    allow_token_reduction=False,
    allow_code_synth=False,
    processing_strategy="streaming",
    execution_strategy="sequential",
)
iterable = case_data.run(config)

output_rows = []
for data_record_collection in iterable:  # noqa: B007
    for output_table in data_record_collection:
        # Convert once and reuse the dict for both the debug print and the result list.
        record_dict = output_table.to_dict()
        print(record_dict.keys())
        output_rows.append(record_dict)

output_df = pd.DataFrame(output_rows)
display(output_df)

LOGICAL PLANS: 1
INITIAL PLANS: 27
DEDUP PLANS: 27
PARETO PLANS: 17
----------
Policy is: Maximum Quality
Chosen plan: Time est: 411.647 -- Cost est: 0.973 -- Quality est: 0.486
 0. MarshalAndScanDataOp -> File 

 1. File -> InduceFromCandidateOp -> XLSFile 
    Using Model.GPT_3_5
    (contents,filena...) -> (contents,filena...)

 2. XLSFile -> InduceFromCandidateOp -> Table 
    Using Model.GPT_4
    (contents,filena...) -> (filename,header...)

 3. Table -> FilterCandidateOp -> Table 
    Using Model.GPT_4
    Filter: "The table contains biometric information about the patient"
    (filename,header...) -> (filename,header...)

 4. Table -> InduceFromCandidateOp -> CaseData 
    Using Model.GPT_4
    (filename,header...) -> (age_at_diagnosi...)

Bonded query processing error: No output objects were generated with bonded query - trying with conventional query...
BondedQuery Error: No output objects were generated with bonded query - trying with conventional query...
Falling back to co

Unnamed: 0,age_at_diagnosis,ajcc_pathologic_n,ajcc_pathologic_stage,ajcc_pathologic_t,case_submitter_id,ethnicity,gender,morphology,primary_diagnosis,race,study,tissue_or_organ_of_origin,tumor_focality,tumor_grade,tumor_largest_dimension_diameter,vital_status
0,64.0,pN0,Stage I,pT1a (FIGO IA),C3L-00006,Not-Hispanic or Latino,Female,Endometrioid,FIGO grade 1,White,UCEC_CPTAC3,Anterior endometrium,Unifocal,Cannot be determined,2.9,No
1,58.0,pNX,Stage IV,pT1a (FIGO IA),C3L-00008,Not-Hispanic or Latino,Female,Endometrioid,FIGO grade 1,White,UCEC_CPTAC3,Posterior endometrium,Unifocal,Cannot be determined,3.5,No
2,50.0,pN0,Stage I,pT1a (FIGO IA),C3L-00032,Not-Hispanic or Latino,Female,Endometrioid,FIGO grade 2,White,UCEC_CPTAC3,"Other, specify",Unifocal,Cannot be determined,4.5,Yes
3,,,,,C3L-00084,,,Carcinosarcoma,,,UCEC_CPTAC3,,,,,
4,75.0,pNX,Stage I,pT1a (FIGO IA),C3L-00090,Not-Hispanic or Latino,Female,Endometrioid,FIGO grade 2,White,UCEC_CPTAC3,"Other, specify",Unifocal,Cannot be determined,3.5,No
5,729.0,N2b,Stage III,T4a,01CO001,,Male,Mucinous,,,Vasaikar,Sigmoid Colon,,,,Living
6,838.0,N0,Stage II,T3,01CO005,,Female,Not Mucinous,,,Vasaikar,Sigmoid Colon,,,,Deceased
7,904.0,N2b,Stage III,T4a,01CO006,,Female,Mucinous,,,Vasaikar,Ascending Colon,,,,Living
8,652.0,N0,Stage II,T3,01CO008,,Female,Mucinous,,,Vasaikar,Descending Colon,,,,Living
9,58.0,,,,C3L-00104,Not-Hispanic or Latino,Male,,,White,wang,Frontal Lobe,,,,Deceased
