In [None]:
import json
import requests

# The "files" mappings are what I filter on in my download command. For any "files" mapping that begins with "case", I might want to explore the full set of options with the "cases" endpoint (or in the cohort builder on the TCGA website).
def return_all_mappings_for_endpoint(endpoint = "files", timeout = 60):
    """Fetch the field mappings that a GDC API endpoint exposes.

    Sends a GET request to ``https://api.gdc.cancer.gov/{endpoint}/_mapping``.

    Args:
        endpoint: GDC endpoint name such as "files" or "cases". Other endpoints
            are listed at https://docs.gdc.cancer.gov/API/Users_Guide/Getting_Started/
        timeout: Value forwarded to ``requests.get`` as its ``timeout``
            (seconds), so an unresponsive server raises instead of blocking.

    Returns:
        The value stored under "fields" in the JSON response when the status
        code is 200. Otherwise None, after printing the status code and body.
    """
    endpoint_full = f"https://api.gdc.cancer.gov/{endpoint}/_mapping"

    # requests applies no timeout by default; without one a stalled connection
    # would hang the notebook cell indefinitely.
    response = requests.get(endpoint_full, timeout=timeout)

    if response.status_code != 200:
        print(f"Error: {response.status_code}, {response.text}")
        return None

    return response.json()["fields"]


def return_all_options_for_a_mapping(mapping, endpoint = "files", timeout = 60):
    """List every value the GDC API reports for one field of an endpoint.

    Requests facet counts for ``mapping`` from
    ``https://api.gdc.cancer.gov/{endpoint}`` and collects the bucket keys.

    Args:
        mapping: Field name to facet on, e.g. "diagnoses.tissue_or_organ_of_origin".
        endpoint: GDC endpoint name such as "files" or "cases". Other endpoints
            are listed at https://docs.gdc.cancer.gov/API/Users_Guide/Getting_Started/
        timeout: Value forwarded to ``requests.get`` as its ``timeout``
            (seconds), so an unresponsive server raises instead of blocking.

    Returns:
        A sorted list of the "key" entries found under
        ``data.aggregations[mapping].buckets`` when the status code is 200.
        Otherwise None, after printing the status code and body.
    """
    endpoint_full = f"https://api.gdc.cancer.gov/{endpoint}"

    params = {
        "facets": mapping,
        "size": 0  # No actual data retrieval, just the facet counts
    }

    # requests applies no timeout by default; without one a stalled connection
    # would hang the notebook cell indefinitely.
    response = requests.get(endpoint_full, params=params, timeout=timeout)

    if response.status_code != 200:
        print(f"Error: {response.status_code}, {response.text}")
        return None

    buckets = response.json()["data"]["aggregations"][mapping]["buckets"]
    return sorted(bucket["key"] for bucket in buckets)

# Field names available on the "cases" endpoint; one of them is diagnoses.tissue_or_organ_of_origin.
all_cases_mappings = return_all_mappings_for_endpoint("cases")

# Every value the API reports for that field, then show the list.
tissue_or_organ_of_origin_all_values = return_all_options_for_a_mapping(
    "diagnoses.tissue_or_organ_of_origin",
    "cases",
)
print(tissue_or_organ_of_origin_all_values)

['_missing', 'abdomen, nos', 'adrenal gland, nos', 'ampulla of vater', 'anal canal', 'anterior floor of mouth', 'anterior mediastinum', 'anterior wall of bladder', 'anus, nos', 'aortic body and other paraganglia', 'appendix', 'ascending colon', 'autonomic nervous system, nos', 'base of tongue, nos', 'biliary tract, nos', 'bladder neck', 'bladder, nos', 'blood', 'body of pancreas', 'body of stomach', 'bone marrow', 'bone, nos', 'bones of skull and face and associated joints', 'border of tongue', 'brain stem', 'brain, nos', 'breast, nos', 'cardia, nos', 'cecum', 'cerebellum, nos', 'cerebrum', 'cervix uteri', 'cheek mucosa', 'choroid', 'ciliary body', 'colon, nos', 'connective, subcutaneous and other soft tissues of abdomen', 'connective, subcutaneous and other soft tissues of head, face, and neck', 'connective, subcutaneous and other soft tissues of lower limb and hip', 'connective, subcutaneous and other soft tissues of pelvis', 'connective, subcutaneous and other soft tissues of thorax

In [23]:
import pandas as pd
# Display settings: no row/column truncation, wider console, capped cell width.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 200)
pd.set_option("display.max_colwidth", 50)

# Tab-separated metadata table for the matched tumor/normal RNA-Seq samples.
tumor_normal_metadata = "/home/jrich/data/varseek_data/sequencing/bulk/tcga/matched_tumor_normal_metadata_RNA-Seq.tsv"
tumor_normal_metadata_df = pd.read_table(tumor_normal_metadata)

# How many rows fall under each distinct 'primary_site' value.
counts = tumor_normal_metadata_df["primary_site"].value_counts()
print(counts)

Kidney                                                         129
Breast                                                         113
Bronchus and lung                                              109
Thyroid gland                                                   59
Liver and intrahepatic bile ducts                               57
Prostate gland                                                  52
Stomach                                                         33
Colon                                                           25
Bladder                                                         19
Other and ill-defined sites in lip, oral cavity and pharynx     13
Esophagus                                                       13
Other and unspecified parts of tongue                           13
Larynx                                                          11
Pancreas                                                         4
Rectosigmoid junction                                         

In [24]:
# Row count for each (primary_site, primary_diagnosis) combination, largest first.
pair_counts = (
    tumor_normal_metadata_df
    .groupby(["primary_site", "primary_diagnosis"])
    .size()
    .reset_index(name="count")
    .sort_values("count", ascending=False)
)
print(pair_counts)

                                         primary_site                                  primary_diagnosis  count
5                                              Breast                   Infiltrating duct carcinoma, NOS     90
35                                             Kidney                     Clear cell adenocarcinoma, NOS     72
49                                     Prostate gland                                Adenocarcinoma, NOS     51
41                  Liver and intrahepatic bile ducts                      Hepatocellular carcinoma, NOS     49
62                                      Thyroid gland                      Papillary adenocarcinoma, NOS     49
19                                  Bronchus and lung                       Squamous cell carcinoma, NOS     46
13                                  Bronchus and lung                                Adenocarcinoma, NOS     42
36                                             Kidney                      Papillary adenocarcinoma, NOS

In [22]:
# Lookup from an AJCC stage label ("Stage I" ... "Stage IVC") to its numeric stage as a string.
# Each Roman numeral is listed bare and with the A/B/C sub-stage suffixes.
stage_dict = {
    f"Stage {numeral}{substage}": digit
    for numeral, digit in (("I", "1"), ("II", "2"), ("III", "3"), ("IV", "4"))
    for substage in ("", "A", "B", "C")
}

# Labels missing from the lookup are kept exactly as they are.
tumor_normal_metadata_df["ajcc_pathologic_stage_simplified"] = (
    tumor_normal_metadata_df["ajcc_pathologic_stage"]
    .map(lambda stage: stage_dict.get(stage, stage))
)

# Row count for each (site, diagnosis, simplified stage) combination, largest first.
trio_counts = (
    tumor_normal_metadata_df
    .groupby(["primary_site", "primary_diagnosis", "ajcc_pathologic_stage_simplified"])
    .size()
    .reset_index(name="count")
    .sort_values("count", ascending=False)
)
print(trio_counts)

                                          primary_site                                  primary_diagnosis ajcc_pathologic_stage_simplified  count
10                                              Breast                   Infiltrating duct carcinoma, NOS                                2     53
108                                      Thyroid gland                      Papillary adenocarcinoma, NOS                                1     31
58                                              Kidney                     Clear cell adenocarcinoma, NOS                                1     25
38                                   Bronchus and lung                       Squamous cell carcinoma, NOS                                1     24
25                                   Bronchus and lung                                Adenocarcinoma, NOS                                1     23
61                                              Kidney                     Clear cell adenocarcinoma, NOS                   