In [1]:
import pandas as pd
import os
import logging
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

def get_gbm_data_summary():
    # Set up logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger()

    # Define the API endpoint
    endpoint = "https://api.gdc.cancer.gov/files"

    # Define the parameters
    params = {
        "filters": {
            "op": "and",
            "content": [
                {
                    "op": "in",
                    "content": {
                        "field": "cases.project.project_id",
                        "value": ["TCGA-GBM"]
                    }
                },
                {
                    "op": "in",
                    "content": {
                        "field": "access",
                        "value": ["open"]
                    }
                },
                {
                    "op": "not",
                    "content": {
                        "field": "data_type",
                        "value": ["Slide Image"]
                    }
                }
            ]
        },
        "fields": "file_id,file_name,file_size,data_type",
        "size": "10000",  # Increase size to ensure all files are retrieved
        "pretty": "true"
    }

    # Create a session with retry mechanism
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retries))

    # Make the request with a timeout
    logger.info("Making POST request to retrieve data")
    try:
        response = session.post(endpoint, json=params, timeout=10)  # 10 seconds timeout
    except requests.exceptions.Timeout:
        logger.error("Request timed out")
        print("Request timed out")
        response = None

    # Check the response status
    if response and response.status_code == 200:
        data = response.json()
        # Convert to DataFrame for easier manipulation
        df = pd.DataFrame(data['data']['hits'])
        logger.info("Data retrieved successfully")
        #print(df.head())

        # Filter out Slide Image data
        df = df[df['data_type'] != 'Slide Image']

        # Group by data_type and count the number of files and their total size for each type
        file_counts = df['data_type'].value_counts()
        file_sizes = df.groupby('data_type')['file_size'].sum()

        # Define the base download directory
        base_download_dir = "../data/raw/all_open_gbm_data"

        # Function to check the number of files and their total size in the data folder
        def count_files_and_sizes_in_folder(base_download_dir):
            folder_counts = {}
            folder_sizes = {}
            for data_type in os.listdir(base_download_dir):
                data_type_dir = os.path.join(base_download_dir, data_type)
                if os.path.isdir(data_type_dir):
                    files = os.listdir(data_type_dir)
                    folder_counts[data_type] = len(files)
                    folder_sizes[data_type] = sum(os.path.getsize(os.path.join(data_type_dir, f)) for f in files)
            return folder_counts, folder_sizes

        # Get the count of files and their sizes in the data folder
        folder_counts, folder_sizes = count_files_and_sizes_in_folder(base_download_dir)

        # Create DataFrames for the folder counts and sizes
        folder_counts_df = pd.DataFrame(list(folder_counts.items()), columns=['data_type', 'files_in_folder'])
        folder_sizes_df = pd.DataFrame(list(folder_sizes.items()), columns=['data_type', 'size_in_folder'])

        # Merge the expected file counts and sizes with the folder counts and sizes
        result_df = pd.DataFrame(file_counts).reset_index()
        result_df.columns = ['data_type', 'expected_files']
        result_df['expected_size'] = result_df['data_type'].map(file_sizes)
        result_df = result_df.merge(folder_counts_df, on='data_type', how='left').fillna(0)
        result_df = result_df.merge(folder_sizes_df, on='data_type', how='left').fillna(0)

        # Convert sizes to gigabytes for readability
        result_df['expected_size'] = result_df['expected_size'] / (1024 ** 3)
        result_df['size_in_folder'] = result_df['size_in_folder'] / (1024 ** 3)

        # Create a DataFrame with the initial data
        initial_data_df = df[['data_type', 'file_id', 'file_name', 'file_size']]

        return result_df, initial_data_df

    else:
        logger.error("Failed to retrieve data")
        print("Failed to retrieve data")
        return None, None

In [2]:
# Example usage
result_df, initial_data_df = get_gbm_data_summary()

INFO:root:Making POST request to retrieve data
INFO:root:Data retrieved successfully


In [3]:
result_df

Unnamed: 0,data_type,expected_files,expected_size,files_in_folder,size_in_folder
0,Gene Level Copy Number,1369,4.38805,1369,4.38805
1,Copy Number Segment,985,0.037304,985,0.037304
2,Masked Copy Number Segment,984,0.011081,984,0.011081
3,Allele-specific Copy Number Segment,879,0.006843,879,0.006843
4,Masked Intensities,743,2.221866,743,2.221866
5,Biospecimen Supplement,713,0.0469,713,0.0469
6,Clinical Supplement,506,0.01939,506,0.01939
7,Pathology Report,503,0.038505,503,0.038505
8,Raw Intensities,470,2.572408,470,2.572408
9,Masked Somatic Mutation,377,0.013021,377,0.013021


In [4]:
initial_data_df

Unnamed: 0,data_type,file_id,file_name,file_size
0,Pathology Report,468996e4-58e0-4088-8103-4f41a8bd85d4,TCGA-06-0124.5BA4F4A0-E68C-4BAD-88BC-8DD9F399A...,24859
1,Gene Level Copy Number,86573015-22a6-482d-97b0-dc07f961651e,TCGA-GBM.1320140b-2dfe-42f6-b369-0a5725bd6492....,3445562
2,Allele-specific Copy Number Segment,5fee464d-d957-4b98-ad0e-3eb1bd791fa5,TCGA-GBM.5402f832-b7d3-4a86-9626-2a0fcf5b173f....,10375
4,Clinical Supplement,0e52548c-2ac3-471f-83be-bb6b3cc2c162,nationwidechildrens.org_clinical.TCGA-14-3476.xml,25127
5,Gene Level Copy Number,8cdde20d-8ad1-4572-a03e-cabdc3a2bc4f,TCGA-GBM.8fcb6cc0-ab33-4e69-88df-c3cae742c6ba....,3445487
...,...,...,...,...
9994,Masked Copy Number Segment,e85fd32e-0af8-40d8-b289-65ddc65d5a28,SEXES_p_TCGA_b111_SNP_N_GenomeWideSNP_6_E07_78...,3451
9995,Allele-specific Copy Number Segment,bb84766d-650a-4c0f-aeb4-08b3af38ca3c,TCGA-GBM.808d2195-34fb-4e65-ae63-3a6b3e65a088....,2593
9997,Masked Intensities,1e75b973-9a34-4921-96a0-386c6300ffd8,e7aa5415-73d2-4023-b622-bb1d3a6bf530_noid_Red....,8091452
9998,Clinical Supplement,3e2a859b-dbed-462a-939c-771d6be99c12,nationwidechildrens.org_clinical.TCGA-06-5411.xml,41510


#### Get sample mapping (file_ids, file_names and sample_ids)

In [5]:
import requests
import pandas as pd
import logging
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

# Define the API endpoint
endpoint = "https://api.gdc.cancer.gov/files"

# Define the parameters
params = {
    "filters": {
        "op": "and",
        "content": [
            {
                "op": "in",
                "content": {
                    "field": "cases.project.project_id",
                    "value": ["TCGA-GBM"]
                }
            },
            {
                "op": "in",
                "content": {
                    "field": "access",
                    "value": ["open"]
                }
            },
            {
                "op": "not",
                "content": {
                    "field": "data_type",
                    "value": ["Slide Image"]
                }
            }
        ]
    },
    "fields": "file_id,file_name,file_size,data_type,cases.samples.sample_id",
    "size": "10000",  # Increase size to ensure all files are retrieved
    "pretty": "true"
}

# Create a session with retry mechanism
session = requests.Session()
retries = Retry(total=5, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retries))

# Function to retrieve data and save sample ID mapping to CSV
def retrieve_and_save_sample_ids():
    logger.info("Making POST request to retrieve data")
    try:
        response = session.post(endpoint, json=params, timeout=10)  # 10 seconds timeout
    except requests.exceptions.Timeout:
        logger.error("Request timed out")
        print("Request timed out")
        response = None

    # Check the response status
    if response and response.status_code == 200:
        data = response.json()
        # Convert to DataFrame for easier manipulation
        df = pd.json_normalize(data['data']['hits'])
        logger.info("Data retrieved successfully")

        # Extract the dictionary mapping file IDs and file names to sample IDs
        sample_id_list = []
        for index, row in df.iterrows():
            file_id = row['file_id']
            file_name = row['file_name']
            cases = row['cases']
            if pd.notna(cases):
                for case in cases:
                    samples = case['samples']
                    for sample in samples:
                        sample_id = sample['sample_id']
                        sample_id_list.append({'file_id': file_id, 'file_name': file_name, 'sample_id': sample_id})

        sample_id_df = pd.DataFrame(sample_id_list)

        # Save the DataFrame to a CSV file
        sample_id_df.to_csv('../data/raw/all_open_gbm_data/sample_id_mapping.csv', index=False)

        print("Sample ID mapping saved to 'sample_id_mapping.csv'")
    else:
        logger.error("Failed to retrieve data")
        print("Failed to retrieve data")

In [6]:
# Retrieve and save sample IDs
retrieve_and_save_sample_ids()

INFO:root:Making POST request to retrieve data
INFO:root:Data retrieved successfully


Sample ID mapping saved to 'sample_id_mapping.csv'


### Match files to samples_types and patient_uuids

In [7]:
import pandas as pd

# Load the sample ID mapping from the CSV file
sample_id_mapping_df = pd.read_csv('../data/raw/all_open_gbm_data/sample_id_mapping.csv')

# Load the biospecimen data from the txt file
biospecimen_df = pd.read_csv('../data/raw/all_open_gbm_data/Biospecimen Supplement/nationwidechildrens.org_biospecimen_sample_gbm.txt', delimiter='\t')

# Merge the sample ID mapping with the biospecimen data based on 'sample_id'
merged_df = pd.merge(biospecimen_df, sample_id_mapping_df, left_on='bcr_sample_uuid', right_on='sample_id')

# Display the merged DataFrame with relevant fields
#print(merged_df[['bcr_patient_uuid', 'file_id', 'bcr_sample_uuid']].head())

# Save merged dataframe to .csv
merged_df.to_csv('../data/raw/all_open_gbm_data/sample_patient_tumor_type_mapping.csv', index=False)

In [8]:
merged_df.shape, sample_id_mapping_df.shape, biospecimen_df.shape

((9896, 30), (10936, 3), (1184, 27))

In [9]:
merged_df[['file_name', 'bcr_patient_uuid', 'sample_type', 'bcr_sample_uuid']].head()

Unnamed: 0,file_name,bcr_patient_uuid,sample_type,bcr_sample_uuid
0,TCGA-GBM.86675433-1ee1-448b-901a-aa906722a510....,30a1fe5e-5b12-472c-aa86-c2db8167ab23,Primary Tumor,3df90f1c-94da-4bd5-bd8e-a0bc92d715f9
1,8e9811c9-3ae0-4468-bc96-83da17865f23_noid_Grn....,30a1fe5e-5b12-472c-aa86-c2db8167ab23,Primary Tumor,3df90f1c-94da-4bd5-bd8e-a0bc92d715f9
2,TCGA-GBM.86675433-1ee1-448b-901a-aa906722a510....,30a1fe5e-5b12-472c-aa86-c2db8167ab23,Primary Tumor,3df90f1c-94da-4bd5-bd8e-a0bc92d715f9
3,TCGA-02-0001-01C-01-BS1.0cc8ca55-d024-440c-a4f...,30a1fe5e-5b12-472c-aa86-c2db8167ab23,Primary Tumor,3df90f1c-94da-4bd5-bd8e-a0bc92d715f9
4,8e9811c9-3ae0-4468-bc96-83da17865f23.methylati...,30a1fe5e-5b12-472c-aa86-c2db8167ab23,Primary Tumor,3df90f1c-94da-4bd5-bd8e-a0bc92d715f9


In [16]:
merged_df.file_name[0]

'TCGA-GBM.86675433-1ee1-448b-901a-aa906722a510.ascat2.allelic_specific.seg.txt'

In [10]:
merged_df.sample_type.value_counts()

sample_type
Primary Tumor           6769
Blood Derived Normal    2816
Solid Tissue Normal      168
Recurrent Tumor          143
Name: count, dtype: int64

In [11]:
merged_df.bcr_patient_uuid.nunique()

594

In [12]:
merged_df[['bcr_patient_uuid', 'file_id', 'bcr_sample_uuid']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9896 entries, 0 to 9895
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   bcr_patient_uuid  9896 non-null   object
 1   file_id           9896 non-null   object
 2   bcr_sample_uuid   9896 non-null   object
dtypes: object(3)
memory usage: 232.1+ KB


In [13]:
# Group by data_type and find the number of unique file_id per data_type 
unique_counts = initial_data_df.groupby('data_type')['file_id'].nunique().reset_index(name='unique_file_count') 

# Group by data_type and find the total counts per data_type 
total_counts = initial_data_df.groupby('data_type')['file_id'].count().reset_index(name='total_file_count') 

# Merge the two dataframes 
result = pd.merge(unique_counts, total_counts, on='data_type') 

print(result)

                              data_type  unique_file_count  total_file_count
0   Allele-specific Copy Number Segment                879               879
1                Biospecimen Supplement                713               713
2                   Clinical Supplement                506               506
3                   Copy Number Segment                985               985
4        Gene Expression Quantification                154               154
5                Gene Level Copy Number               1369              1369
6     Isoform Expression Quantification                 10                10
7            Masked Copy Number Segment                984               984
8                    Masked Intensities                743               743
9               Masked Somatic Mutation                377               377
10               Methylation Beta Value                369               369
11                     Pathology Report                503               503

### Check for any other data

In [14]:
import os
import requests
import pandas as pd
import logging
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

# Define the API endpoint for file metadata
metadata_endpoint = "https://api.gdc.cancer.gov/files"

# Define the parameters for metadata retrieval
params = {
    "filters": {
        "op": "and",
        "content": [
            {
                "op": "in",
                "content": {
                    "field": "cases.project.project_id",
                    "value": ["TCGA-GBM"]
                }
            },
            {
                "op": "in",
                "content": {
                    "field": "access",
                    "value": ["open"]
                }
            },
            {
                "op": "not",
                "content": {
                    "field": "data_type",
                    "value": [
                        "Clinical Supplement",
                        "Biospecimen Supplement",
                        "Gene Level Copy Number",
                        "Allele-specific Copy Number Segment",
                        "Masked Somatic Mutation",
                        "Pathology Report",
                        "Slide Image"
                    ]
                }
            }
        ]
    },
    "fields": "file_id,file_name,file_size,data_type,cases.samples.sample_id",
    "size": "10000",  # Increase size to ensure all files are retrieved
    "pretty": "true"
}

# Function to retrieve metadata and check the size of the remaining files
def check_file_sizes(folder_path):
    logger.info("Making POST request to retrieve metadata")
    try:
        with requests.Session() as session:
            retries = Retry(total=5, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
            session.mount('https://', HTTPAdapter(max_retries=retries))
            response = session.post(metadata_endpoint, json=params, timeout=20)  # Increased timeout to 20 seconds

            # Check the response status
            if response.status_code == 200:
                data = response.json()
                # Extract file metadata
                files = data['data']['hits']
                logger.info("Metadata retrieved successfully")
                print(f"Number of files: {len(files)}")

                # Exclude files in the specified folder and its subfolders
                excluded_files = []
                for root, dirs, filenames in os.walk(folder_path):
                    for filename in filenames:
                        excluded_files.append(filename)

                # Filter out the excluded files and "Slide Image" data type
                remaining_files = [file for file in files if file['file_name'] not in excluded_files and file['data_type'] != "Slide Image"]

                # Calculate the total size of the remaining files in MB
                total_size = sum(file['file_size'] for file in remaining_files)
                total_size_mb = total_size / (1024 * 1024)  # Convert to MB

                # Get the number of remaining files
                num_remaining_files = len(remaining_files)

                print(f"Total size of the remaining files: {total_size_mb:.2f} MB")
                print(f"Number of remaining files: {num_remaining_files}")
            else:
                logger.error(f"Failed to retrieve metadata: {response.status_code}")
                print(f"Failed to retrieve metadata: {response.status_code}")

    except requests.exceptions.Timeout:
        logger.error("Request timed out")
        print("Request timed out")
    except requests.exceptions.ConnectionError:
        logger.error("Connection error occurred")
        print("Connection error occurred")
    except requests.exceptions.HTTPError as err:
        logger.error(f"HTTP error occurred: {err}")
        print(f"HTTP error occurred: {err}")
    except Exception as err:
        logger.error(f"An error occurred: {err}")
        print(f"An error occurred: {err}")

# Run the function with the path to the folder containing datatype folders
folder_path = "../data/raw/all_open_gbm_data"  # Replace with the actual path
check_file_sizes(folder_path)


INFO:root:Making POST request to retrieve metadata
INFO:root:Metadata retrieved successfully


Number of files: 10000
Total size of the remaining files: 0.00 MB
Number of remaining files: 0
