<a href="https://colab.research.google.com/github/muffin-head/essex_work/blob/main/essex_dissertation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
import json

def query_gdc_for_uuids(project, data_type):
    """
    Query the GDC API to find file UUIDs based on project and data type.

    Parameters:
    - project: The project ID (e.g., "TCGA-BRCA").
    - data_type: The type of data to search for (e.g., "Gene Expression Quantification").

    Returns:
    A list of file UUIDs.
    """
    # GDC endpoint for files
    endpoint = "https://api.gdc.cancer.gov/files"

    # Define the query parameters
    filters = {
        "op": "and",
        "content": [
            {"op": "in", "content": {"field": "cases.project.project_id", "value": [project]}},
            {"op": "in", "content": {"field": "files.data_type", "value": [data_type]}}
        ]
    }
    params = {
        "filters": json.dumps(filters),
        "fields": "file_id",
        "format": "JSON",
        "size": "2000"  # Adjust based on expected results
    }

    # Perform the API request
    response = requests.get(endpoint, params=params)
    file_uuids = []

    if response.status_code == 200:
        # Parse the response JSON to extract file UUIDs
        results = response.json()["data"]["hits"]
        file_uuids = [result["file_id"] for result in results]
        print(f"Found {len(file_uuids)} files.")
    else:
        print("Failed to query GDC API.")

    return file_uuids


In [None]:
import requests
import json

# Define a function to query GDC API and download files
def download_files_from_gdc(project, data_type):
    # Endpoint for searching files in GDC
    files_endpt = "https://api.gdc.cancer.gov/files"

    # Setting up the query parameters
    filters = {
        "op": "and",
        "content": [
            {"op": "in", "content": {"field": "cases.project.project_id", "value": [project]}},
            {"op": "in", "content": {"field": "files.data_type", "value": [data_type]}}
        ]
    }

    params = {
        "filters": json.dumps(filters),
        "fields": "file_id",
        "format": "JSON",
        "size": "10"  # Adjust size as needed
    }

    # Make the request to the GDC API
    response = requests.get(files_endpt, params=params)

    # Check if the request was successful
    if response.status_code == 200:
        file_uuids = [file["file_id"] for file in response.json()["data"]["hits"]]

        # Download each file (example for the first file)
        if file_uuids:
            download_endpt = "https://api.gdc.cancer.gov/data/{}".format(file_uuids[0])
            with requests.get(download_endpt, stream=True) as r:
                r.raise_for_status()
                with open(f"{file_uuids[0]}.txt", 'wb') as f:  # Example file name
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            print("Downloaded first file successfully.")
        else:
            print("No files found for the given query.")
    else:
        print("Failed to query GDC API.")

# Example usage
download_files_from_gdc("TCGA-BRCA", "Gene Expression Quantification")


Downloaded first file successfully.


In [None]:
import requests
import json

def download_clinical_data_from_gdc(project):
    """
    Download clinical data for a specific project from the GDC.

    Parameters:
    - project: The project ID (e.g., "TCGA-LUAD").
    """
    # Endpoint for searching files in GDC
    files_endpt = "https://api.gdc.cancer.gov/files"

    # Define the query for clinical data
    filters = {
        "op": "and",
        "content": [
            {"op": "in", "content": {"field": "cases.project.project_id", "value": [project]}},
            {"op": "in", "content": {"field": "files.data_category", "value": ["clinical_data"]}}
        ]
    }

    params = {
        "filters": json.dumps(filters),
        "fields": "file_id",
        "format": "JSON",
        "size": "10"  # Adjust size as needed
    }

    # Make the API request
    response = requests.get(files_endpt, params=params)

    if response.status_code == 200:
        file_uuids = [file["file_id"] for file in response.json()["data"]["hits"]]

        for uuid in file_uuids:
            download_endpt = f"https://api.gdc.cancer.gov/data/{uuid}"
            with requests.get(download_endpt, stream=True) as r:
                r.raise_for_status()
                with open(f"{project}_clinical_{uuid}.xml", 'wb') as f:  # Example file name
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            print(f"Downloaded clinical data file {uuid} successfully.")
    else:
        print("Failed to query GDC API.")

# Example usage
download_clinical_data_from_gdc("TCGA-LUAD")


In [None]:
import requests
import pandas as pd
import json

def fetch_clinical_data(project):
    """
    Fetch clinical data fields for a specific project from the GDC.

    Parameters:
    - project: The project ID (e.g., "TCGA-BRCA").

    Returns:
    A pandas DataFrame with the clinical data of interest.
    """
    # Endpoint for searching cases in GDC
    cases_endpt = "https://api.gdc.cancer.gov/cases"

    # Define the query for clinical data
    fields = [
        "case_id",
        "diagnoses.primary_diagnosis",
        "diagnoses.year_of_diagnosis",
        "diagnoses.prior_malignancy"
    ]
    filters = {
        "op": "in",
        "content": {
            "field": "project.project_id",
            "value": [project]
        }
    }
    params = {
        "filters": json.dumps(filters),
        "fields": ",".join(fields),
        "format": "JSON",
        "size": "100"  # Adjust size as needed
    }

    # Make the API request
    response = requests.get(cases_endpt, params=params)

    if response.status_code == 200:
        data = response.json()["data"]["hits"]

        # Extract the data of interest
        clinical_data = []
        for case in data:
            for diagnosis in case.get("diagnoses", []):
                clinical_data.append({
                    "case_id": case["case_id"],
                    "year_of_diagnosis": diagnosis.get("year_of_diagnosis"),
                    "prior_malignancy": diagnosis.get("prior_malignancy")
                })

        return pd.DataFrame(clinical_data)
    else:
        print("Failed to query GDC API.")
        return pd.DataFrame()

# Fetch clinical data
clinical_data = fetch_clinical_data("TCGA-BRCA")
print(clinical_data.head())


                                case_id  year_of_diagnosis prior_malignancy
0  a8b1f6e7-2bcf-460d-b1c6-1792a9801119               2011               no
1  17f275c1-a0d4-487d-8f02-ea279584b4cd               2010               no
2  6f6e7356-3521-4674-8eec-ad01340d4b8e               2010               no
3  8a0ef221-8070-43fc-9d98-def90abe4871               2007               no
4  195b6731-6ae6-416e-a345-0eb8874a0ca2               2010               no


In [None]:
import requests
import pandas as pd

def fetch_complete_diagnosis(project):
    cases_endpt = "https://api.gdc.cancer.gov/cases"

    fields = [
        "case_id",
        "diagnoses.diagnosis_id",
        "diagnoses.primary_diagnosis",
        "diagnoses.tumor_stage",
        "diagnoses.morphology",
        # Add more diagnosis-related fields as needed
    ]
    filters = {
        "op": "in",
        "content": {
            "field": "project.project_id",
            "value": [project]
        }
    }
    params = {
        "filters": json.dumps(filters),
        "fields": ",".join(fields),
        "format": "JSON",
        "size": "200"  # Adjust size based on expected number of cases
    }

    response = requests.get(cases_endpt, params=params)
    diagnosis_details = []

    if response.status_code == 200:
        data = response.json()["data"]["hits"]
        for case in data:
            for diagnosis in case.get("diagnoses", []):
                diagnosis_details.append({
                    "Case ID": case["case_id"],
                    "Diagnosis ID": diagnosis.get("diagnosis_id"),
                    "Primary Diagnosis": diagnosis.get("primary_diagnosis"),
                    "Tumor Stage": diagnosis.get("tumor_stage"),
                    "Morphology": diagnosis.get("morphology"),
                    # Extract more fields as needed
                })

        return pd.DataFrame(diagnosis_details)
    else:
        print("Failed to query GDC API.")
        return pd.DataFrame()

# Example usage
project_id = "TCGA-BRCA"  # Example: Breast Cancer
complete_diagnosis_df = fetch_complete_diagnosis(project_id)
print(complete_diagnosis_df.head())


                                Case ID                          Diagnosis ID  \
0  a8b1f6e7-2bcf-460d-b1c6-1792a9801119  92a02928-76c3-5ce9-a0ad-a507f90f0a49   
1  17f275c1-a0d4-487d-8f02-ea279584b4cd  bb20b576-2686-58d5-9cb8-f46d471fbc17   
2  6f6e7356-3521-4674-8eec-ad01340d4b8e  6f89cb22-22b6-5e9e-8e4c-7684b0de4abf   
3  8a0ef221-8070-43fc-9d98-def90abe4871  691b718c-8815-516e-87de-8c900021bc92   
4  195b6731-6ae6-416e-a345-0eb8874a0ca2  e2f11c94-39b9-597e-9cbb-0df76724e022   

                         Primary Diagnosis Tumor Stage Morphology  
0  Infiltrating duct and lobular carcinoma        None     8522/3  
1         Infiltrating duct carcinoma, NOS        None     8500/3  
2                   Lobular carcinoma, NOS        None     8520/3  
3         Infiltrating duct carcinoma, NOS        None     8500/3  
4         Infiltrating duct carcinoma, NOS        None     8500/3  


In [None]:
import requests
import pandas as pd
import json

def fetch_expanded_clinical_data(project):
    """
    Fetch expanded clinical and diagnosis data for a specific project from the GDC.

    Parameters:
    - project: The project ID (e.g., "TCGA-BRCA").

    Returns:
    A pandas DataFrame with the expanded clinical data.
    """
    # Endpoint for searching cases in GDC
    cases_endpt = "https://api.gdc.cancer.gov/cases"

    # Define the query for expanded clinical data
    fields = [
        "case_id",
        "demographic.gender",
        "demographic.race",
        "demographic.ethnicity",
        "diagnoses.primary_diagnosis",
        "diagnoses.tumor_stage",
        "diagnoses.treatments.treatment_type",
        "diagnoses.treatments.outcome",
        "diagnoses.morphology",
        "diagnoses.year_of_diagnosis",
        # Add more fields as needed
    ]
    filters = {
        "op": "in",
        "content": {
            "field": "project.project_id",
            "value": [project]
        }
    }
    params = {
        "filters": json.dumps(filters),
        "fields": ",".join(fields),
        "format": "JSON",
        "size": "200"  # Adjust size based on expected number of cases
    }

    response = requests.get(cases_endpt, params=params)
    expanded_clinical_details = []

    if response.status_code == 200:
        data = response.json()["data"]["hits"]
        for case in data:
            demographics = case.get("demographic", {})
            for diagnosis in case.get("diagnoses", []):
                treatments = diagnosis.get("treatments", [])
                for treatment in treatments:
                    expanded_clinical_details.append({
                        "Case ID": case["case_id"],
                        "Gender": demographics.get("gender"),
                        "Race": demographics.get("race"),
                        "Ethnicity": demographics.get("ethnicity"),
                        "Primary Diagnosis": diagnosis.get("primary_diagnosis"),
                        "Tumor Stage": diagnosis.get("tumor_stage"),
                        "Treatment Type": treatment.get("treatment_type"),
                        "Treatment Outcome": treatment.get("outcome"),
                        "Morphology": diagnosis.get("morphology"),
                        "Year of Diagnosis": diagnosis.get("year_of_diagnosis"),
                        # Extract more fields as needed
                    })

        return pd.DataFrame(expanded_clinical_details)
    else:
        print("Failed to query GDC API.")
        return pd.DataFrame()

# Example usage
project_id = "TCGA-LUAD"  # Example: Lung Adenocarcinoma
expanded_clinical_df = fetch_expanded_clinical_data(project_id)
print(expanded_clinical_df.head())


                                Case ID  Gender          Race  \
0  ac0d7a82-82cb-4aec-b859-e37375f3de8b  female  not reported   
1  ac0d7a82-82cb-4aec-b859-e37375f3de8b  female  not reported   
2  ae39e358-08d7-4367-ae68-82b469e791e4    male         white   
3  ae39e358-08d7-4367-ae68-82b469e791e4    male         white   
4  aea68827-dc0f-484d-a00d-06deeaa4b3ce    male  not reported   

                Ethnicity                   Primary Diagnosis Tumor Stage  \
0            not reported               Acinar cell carcinoma        None   
1            not reported               Acinar cell carcinoma        None   
2  not hispanic or latino                 Adenocarcinoma, NOS        None   
3  not hispanic or latino                 Adenocarcinoma, NOS        None   
4            not reported  Adenocarcinoma with mixed subtypes        None   

                Treatment Type Treatment Outcome Morphology  Year of Diagnosis  
0       Radiation Therapy, NOS              None     8550/3      

In [None]:
import requests
import pandas as pd

def fetch_genomic_data_for_cases(case_ids, project='TCGA-BRCA', data_type='Gene Expression Quantification'):
    files_endpt = "https://api.gdc.cancer.gov/files"
    genomic_data = []

    for case_id in case_ids:
        filters = {
            "op": "and",
            "content": [
                {"op": "in", "content": {"field": "cases.case_id", "value": [case_id]}},
                {"op": "in", "content": {"field": "files.data_type", "value": [data_type]}},
                {"op": "in", "content": {"field": "cases.project.project_id", "value": [project]}}
            ]
        }
        params = {
            "filters": json.dumps(filters),
            "fields": "file_id,file_name,cases.case_id,data_category,data_type",
            "format": "JSON",
            "size": "100"
        }

        response = requests.get(files_endpt, params=params)
        if response.status_code == 200:
            files = response.json().get('data', {}).get('hits', [])
            for file in files:
                genomic_data.append({
                    "Case ID": case_id,
                    "File ID": file.get('file_id'),
                    "File Name": file.get('file_name'),
                    "Data Type": file.get('data_type')
                })
        else:
            print(f"Failed to fetch genomic data for case ID {case_id}, status code {response.status_code}")

    return pd.DataFrame(genomic_data)

# Obtain a list of case IDs from your diagnosis data DataFrame
case_ids_list = expanded_clinical_df['Case ID'].unique().tolist()

# Fetch genomic data for these case IDs
genomic_data_df = fetch_genomic_data_for_cases(case_ids_list)
print(genomic_data_df.head())


Empty DataFrame
Columns: []
Index: []


In [None]:
!pip install biolearns

Collecting biolearns
  Downloading biolearns-0.0.62.tar.gz (24 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: biolearns
  Building wheel for biolearns (setup.py) ... [?25l[?25hdone
  Created wheel for biolearns: filename=biolearns-0.0.62-py3-none-any.whl size=27902 sha256=4d7e1961e0c2fc4b554bec46c9dd41a82c94146eb225266366a338aea63d8698
  Stored in directory: /root/.cache/pip/wheels/06/ea/ed/23d136360d5b3e8bc61421e4f68ede02975720f15483d6234e
Successfully built biolearns
Installing collected packages: biolearns
Successfully installed biolearns-0.0.62


In [None]:
from biolearns.dataset import TCGA
brca = TCGA('LUAD')

Retrieve mRNAseq from http://firebrowse.org/ ...
Cohort: Lung adenocarcinoma (LUAD)
File type: illuminahiseq_rnaseqv2-RSEM_genes_normalized

ValueError: Multiple files found in TAR archive. Only one file per TAR archive: ['gdac.broadinstitute.org_LUAD.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2016012800.0.0/LUAD.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt', 'gdac.broadinstitute.org_LUAD.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2016012800.0.0/MANIFEST.txt']

In [None]:
!pip install lifelines

Collecting lifelines
  Downloading lifelines-0.28.0-py3-none-any.whl (349 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m349.2/349.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting autograd-gamma>=0.3 (from lifelines)
  Downloading autograd-gamma-0.5.0.tar.gz (4.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting formulaic>=0.2.2 (from lifelines)
  Downloading formulaic-1.0.1-py3-none-any.whl (94 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.2/94.2 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting interface-meta>=1.2.0 (from formulaic>=0.2.2->lifelines)
  Downloading interface_meta-1.3.0-py3-none-any.whl (14 kB)
Building wheels for collected packages: autograd-gamma
  Building wheel for autograd-gamma (setup.py) ... [?25l[?25hdone
  Created wheel for autograd-gamma: filename=autograd_gamma-0.5.0-py3-none-any.whl size=4031 sha256=058387d823b2b1024e44d95cdeb5cf04da932e5636f38b02cefe573cc665574e
  Stored in 

In [None]:
import tarfile
import os

# Specify the path to your TAR archive
tar_path = 'path_to_your_tar_archive.tar'
extraction_path = 'path_where_to_extract'

# Extract the TAR archive
with tarfile.open(tar_path, 'r') as tar:
    tar.extractall(path=extraction_path)

# Assuming you know the file name or can identify it programmatically
data_file_name = 'LUAD.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt'
data_file_path = os.path.join(extraction_path, data_file_name)

# Now, you can load the data file using pandas or your preferred method
import pandas as pd
data_df = pd.read_csv(data_file_path, sep='\t')  # Adjust separator as needed
print(data_df.head())


FileNotFoundError: [Errno 2] No such file or directory: 'path_to_your_tar_archive.tar'

In [None]:
x

/bin/bash: line 1: gdc-client: command not found


In [None]:
!pip install gdc

Collecting gdc
  Downloading gdc-1.3-py3-none-any.whl (46 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/46.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.9/46.9 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cycler==0.10.0 (from gdc)
  Downloading cycler-0.10.0-py2.py3-none-any.whl (6.5 kB)
Collecting kiwisolver==1.1.0 (from gdc)
  Downloading kiwisolver-1.1.0.tar.gz (30 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting matplotlib==3.1.1 (from gdc)
  Downloading matplotlib-3.1.1.tar.gz (37.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.8/37.8 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy==1.17.0 (from gdc)
  Downloading numpy-1.17.0.zip (6.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m


In [None]:
!gdc-client download -m manifest.txt

/bin/bash: line 1: gdc-client: command not found


In [104]:
import requests
import pandas as pd
import json
import numpy as np
def fetch_expanded_clinical_data(project):
    """
    Fetch expanded clinical and diagnosis data for a specific project from the GDC.

    Parameters:
    - project: The project ID (e.g., "TCGA-BRCA").

    Returns:
    A pandas DataFrame with the expanded clinical data.
    """
    # Endpoint for searching cases in GDC
    cases_endpt = "https://api.gdc.cancer.gov/cases"

    # Updated to include additional fields
    fields = [
        "case_id",
        "demographic.gender",

        "demographic.race",
        "demographic.ethnicity",
        "diagnoses.primary_diagnosis",
        "diagnoses.days_to_diagnosis",
        "diagnoses.tissue_or_organ_of_origin",
        "diagnoses.treatments.treatment_type",
        "diagnoses.treatments.outcome",
        "diagnoses.morphology",
        "diagnoses.year_of_diagnosis",
        "diagnoses.ajcc_pathologic_stage",
        "diagnoses.age_at_diagnosis",
        "diagnoses.tumor_grade_category",

        "diagnoses.days_to_last_follow_up",# Added
        "project.disease_type",  # Added
        "project.primary_site",  # Added
        "samples.longest_dimension",  # Added
        # Note: "cancer.type" and "Subtype_Selected" might not directly correspond to GDC fields.
        # You may need metadata annotations or additional logic to derive these values.
    ]
    filters = {
        "op": "in",
        "content": {
            "field": "project.project_id",
            "value": [project]
        }
    }
    params = {
        "filters": json.dumps(filters),
        "fields": ",".join(fields),
        "format": "JSON",
        "size": "3000"  # Adjust size based on expected number of cases
    }

    response = requests.get(cases_endpt, params=params)
    expanded_clinical_details = []

    if response.status_code == 200:
        data = response.json()["data"]["hits"]
        for case in data:
            demographics = case.get("demographic", {})
            samples = case.get("samples", [{}])[0]
            for diagnosis in case.get("diagnoses", []):
                treatments = diagnosis.get("treatments", [])
                for treatment in treatments:
                    expanded_clinical_details.append({
                        "Case ID": case["case_id"],
                        "Gender": demographics.get("gender"),
                        "Race": demographics.get("race"),
                        "Ethnicity": demographics.get("ethnicity"),


                        "Primary Diagnosis": diagnosis.get("primary_diagnosis"),
                        "tumor_grade_category": diagnosis.get("tumor_grade_category"),


                        "days_to_last_follow_up": diagnosis.get("days_to_last_follow_up"),
                        "age_at_diagnosis": diagnosis.get("age_at_diagnosis"),
                        "ajcc_pathologic_stage": diagnosis.get("ajcc_pathologic_stage"),
                        "days_to_diagnosis": diagnosis.get("days_to_diagnosis"),
                        "Treatment Type": treatment.get("treatment_type"),
                        #"Treatment outcome": treatment.get("drug_category"),
                        "Morphology": diagnosis.get("morphology"),
                        "Year of Diagnosis": diagnosis.get("year_of_diagnosis"),
                        "Tissue or Organ of Origin": diagnosis.get("tissue_or_organ_of_origin"),
                        "Primary Site": case.get("project", {}).get("primary_site"),
                        "Longest Dimension": samples.get("longest_dimension"),
                        # "Cancer Type" and "Subtype Selected" handling depends on your specific data annotations or additional fields
                    })

        return pd.DataFrame(expanded_clinical_details)
    else:
        print("Failed to query GDC API.")
        return pd.DataFrame()

# Example usage
project_id = "TCGA-LUAD"  # Example: Lung Adenocarcinoma
expanded_clinical_df = fetch_expanded_clinical_data(project_id)
expanded_clinical_df['age'] = np.where(pd.notnull(expanded_clinical_df['age_at_diagnosis']),(expanded_clinical_df['age_at_diagnosis'] / 365.25).round(),np.nan)
expanded_clinical_df.drop(['age_at_diagnosis'],axis=1)

Unnamed: 0,Case ID,Gender,Race,Ethnicity,Primary Diagnosis,tumor_grade_category,days_to_last_follow_up,ajcc_pathologic_stage,days_to_diagnosis,Treatment Type,Morphology,Year of Diagnosis,Tissue or Organ of Origin,Primary Site,Longest Dimension,age
0,ac0d7a82-82cb-4aec-b859-e37375f3de8b,female,not reported,not reported,Acinar cell carcinoma,,657.0,Stage IIIA,0.0,"Radiation Therapy, NOS",8550/3,2012.0,"Lower lobe, lung",[Bronchus and lung],1.9,79.0
1,ac0d7a82-82cb-4aec-b859-e37375f3de8b,female,not reported,not reported,Acinar cell carcinoma,,657.0,Stage IIIA,0.0,"Pharmaceutical Therapy, NOS",8550/3,2012.0,"Lower lobe, lung",[Bronchus and lung],1.9,79.0
2,ae39e358-08d7-4367-ae68-82b469e791e4,male,white,not hispanic or latino,"Adenocarcinoma, NOS",,40.0,Stage IIIA,0.0,"Pharmaceutical Therapy, NOS",8140/3,2010.0,"Lower lobe, lung",[Bronchus and lung],1.6,59.0
3,ae39e358-08d7-4367-ae68-82b469e791e4,male,white,not hispanic or latino,"Adenocarcinoma, NOS",,40.0,Stage IIIA,0.0,"Radiation Therapy, NOS",8140/3,2010.0,"Lower lobe, lung",[Bronchus and lung],1.6,59.0
4,aea68827-dc0f-484d-a00d-06deeaa4b3ce,male,not reported,not reported,Adenocarcinoma with mixed subtypes,,730.0,Stage IB,0.0,"Pharmaceutical Therapy, NOS",8255/3,2008.0,"Lower lobe, lung",[Bronchus and lung],1.1,83.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1039,fcfe0b53-dfc1-42fa-9efc-2b2ff259297e,male,white,not hispanic or latino,"Micropapillary carcinoma, NOS",,564.0,Stage IB,0.0,"Radiation Therapy, NOS",8265/3,2011.0,"Upper lobe, lung",[Bronchus and lung],1.1,65.0
1040,fd5c44ef-ea50-4fba-9e8d-e371cf34ebdb,female,white,not reported,Adenocarcinoma with mixed subtypes,,,Stage IIIA,0.0,"Pharmaceutical Therapy, NOS",8255/3,2000.0,"Upper lobe, lung",[Bronchus and lung],0.7,53.0
1041,fd5c44ef-ea50-4fba-9e8d-e371cf34ebdb,female,white,not reported,Adenocarcinoma with mixed subtypes,,,Stage IIIA,0.0,"Radiation Therapy, NOS",8255/3,2000.0,"Upper lobe, lung",[Bronchus and lung],0.7,53.0
1042,ff07ea4b-4e50-410d-99d6-96a351dad7b1,male,black or african american,not hispanic or latino,"Adenocarcinoma, NOS",,824.0,Stage IA,0.0,"Radiation Therapy, NOS",8140/3,2011.0,"Upper lobe, lung",[Bronchus and lung],,61.0


In [105]:
expanded_clinical_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1044 entries, 0 to 1043
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Case ID                    1044 non-null   object 
 1   Gender                     1044 non-null   object 
 2   Race                       1044 non-null   object 
 3   Ethnicity                  1044 non-null   object 
 4   Primary Diagnosis          1044 non-null   object 
 5   tumor_grade_category       0 non-null      object 
 6   days_to_last_follow_up     790 non-null    float64
 7   age_at_diagnosis           982 non-null    float64
 8   ajcc_pathologic_stage      1028 non-null   object 
 9   days_to_diagnosis          1006 non-null   float64
 10  Treatment Type             1044 non-null   object 
 11  Morphology                 1044 non-null   object 
 12  Year of Diagnosis          1024 non-null   float64
 13  Tissue or Organ of Origin  1044 non-null   objec

In [57]:
expanded_clinical_df['age'] = np.where(pd.notnull(expanded_clinical_df['age_at_diagnosis']),
                                       (expanded_clinical_df['age_at_diagnosis'] / 365.25).round(),
                                       np.nan)