In [24]:
# install the CPTAC helper package

import importlib.util, sys, subprocess

# Install cptac into the environment of the running kernel (sys.executable),
# but only when it cannot already be imported.
pkg = "cptac"
if importlib.util.find_spec(pkg) is None:
    print("Installing cptac...")
    # A single `install -U` both installs and upgrades; running a plain
    # install first and then an upgrade only repeated the same work.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", pkg])
    # Let the import system notice the freshly installed package without
    # requiring a kernel restart.
    importlib.invalidate_caches()
else:
    print("cptac is already installed")


cptac is already installed


In [25]:
# List the datasets CPTAC offers, so that a cancer type with both proteomics
# and transcriptomics can be picked.

import cptac
import pandas as pd

# Catalogue table with "Cancer", "Source" and "Datatype" columns.
df = cptac.list_datasets()
print(df.head(10))


  Cancer            Source         Datatype
0   ucec               bcm       proteomics
1   ucec               bcm  transcriptomics
2   None  brca_mapping.csv             None
3   brca             broad  transcriptomics
4  ccrcc             broad  transcriptomics
5   coad             broad  transcriptomics
6    gbm             broad  transcriptomics
7  hnscc             broad  transcriptomics
8   lscc             broad  transcriptomics
9   luad             broad  transcriptomics


In [26]:
# Find which (cancer, source) pairs offer which datatypes

# Rows with a missing Cancer/Source/Datatype (e.g. the mapping-file row) do
# not describe a dataset, so drop them before counting.
df2 = df.dropna(subset=["Cancer", "Source", "Datatype"]).copy()

# Count datatypes per (Cancer, Source): one row per pair, one column per datatype
counts = (
    df2.groupby(["Cancer", "Source"])["Datatype"]
       .value_counts()
       .unstack(fill_value=0)
)

print(counts.head(10))

# Keep only Cancer/Source pairs that have both proteomics and transcriptomics.
# reindex() supplies an all-zero column for a datatype that is absent from the
# table, so the mask is always a per-row boolean Series (the previous
# `counts.get(name, 0) > 0` form degraded to a scalar and raised KeyError when
# neither column existed).
required = ["proteomics", "transcriptomics"]
both = counts[(counts.reindex(columns=required, fill_value=0) > 0).all(axis=1)]
print(both)

Datatype                CNV  acetylproteomics  ancestry_prediction  cibersort  \
Cancer      Source                                                              
all_cancers harmonized    0                 0                    1          0   
            mssm          0                 0                    0          0   
            washu         0                 0                    0          0   
brca        bcm           1                 0                    0          0   
            broad         0                 0                    0          0   
            umich         0                 2                    0          0   
            washu         1                 0                    0          1   
ccrcc       bcm           1                 0                    0          0   
            broad         0                 0                    0          0   
            umich         0                 0                    0          0   

Datatype                cir

In [27]:
# Confirm which datatypes are listed for the ('ucec', 'bcm') pair

import cptac

# Rebuild the (Cancer, Source) x Datatype count table from the catalogue.
df = cptac.list_datasets()
df2 = df.dropna(subset=["Cancer", "Source", "Datatype"]).copy()

counts = (
    df2.groupby(["Cancer", "Source"])["Datatype"]
    .value_counts()
    .unstack(fill_value=0)
)

print("Datatype columns (first 20):")
print(counts.columns.tolist()[:20])

# Single row of the table: datatype -> number of catalogue entries.
print("\nCounts for ('ucec','bcm') (non-zero only):")
row = counts.loc[("ucec", "bcm")]
nonzero = row[row > 0]
print(nonzero.sort_values(ascending=False))

# .get() falls back to 0 when a datatype column is absent from the table.
print("\nSpecifically:")
print("proteomics:", int(row.get("proteomics", 0)))
print("transcriptomics:", int(row.get("transcriptomics", 0)))

Datatype columns (first 20):
['CNV', 'acetylproteomics', 'ancestry_prediction', 'cibersort', 'circular_RNA', 'clinical', 'hla_typing', 'mature_miRNA', 'miRNA', 'phosphoproteomics', 'precursor_miRNA', 'proteomics', 'somatic_mutation', 'total_miRNA', 'transcriptomics', 'tumor_purity', 'xcell']

Counts for ('ucec','bcm') (non-zero only):
Datatype
proteomics           2
phosphoproteomics    2
circular_RNA         1
CNV                  1
miRNA                1
transcriptomics      1
Name: (ucec, bcm), dtype: int64

Specifically:
proteomics: 2
transcriptomics: 1


In [28]:
# Inspect the Ucec class, then instantiate it

import inspect

import cptac

# Constructor signature first, so the available options are visible.
print("Ucec signature:", inspect.signature(cptac.Ucec))

# The class may have no docstring; fall back to an empty string in that case.
print("\nUcec docstring (first ~20 lines):")
doc_lines = (cptac.Ucec.__doc__ or "").splitlines()
print("\n".join(doc_lines[:20]))

# Dataset object used by the following cells.
ds = cptac.Ucec()
print("Loaded dataset object:", type(ds))


Ucec signature: (no_internet=False)

Ucec docstring (first ~20 lines):

    The Ucec class is a child class of the base Cancer class, intended for handling 
    Uterine Corpus Endometrial Carcinoma (UCEC) data from different sources.

    Attributes:
        _sources (dict): A dictionary of source objects that load and hold data 
                         pertaining to UCEC cancer.
    
Loaded dataset object: <class 'cptac.cancers.ucec.Ucec'>


In [29]:
# Collect every attribute of the dataset object whose name starts with "get_".
methods = list(filter(lambda attr: attr.startswith("get_"), dir(ds)))

print("Available get_* methods:")
# Alphabetical listing, one accessor per line.
for m in sorted(methods):
    print(" -", m)


Available get_* methods:
 - get_CNV
 - get_acetylproteomics
 - get_ancestry_prediction
 - get_cancer_type
 - get_cibersort
 - get_circular_RNA
 - get_clinical
 - get_data_list
 - get_dataframe
 - get_derived_molecular
 - get_docs
 - get_followup
 - get_genotype_all_vars
 - get_hla_typing
 - get_medical_history
 - get_miRNA
 - get_phosphoproteomics
 - get_proteomics
 - get_somatic_mutation
 - get_somatic_mutation_binary
 - get_targeted_phosphoproteomics
 - get_targeted_proteomics
 - get_transcriptomics
 - get_tumor_purity
 - get_xcell


In [30]:
import cptac

# Catalogue of CPTAC datasets; sources_for() below filters it by its
# "Cancer" and "Datatype" columns and reads its "Source" column.
meta = cptac.list_datasets()

def sources_for(cancer, dtype, datasets=None):
    """Return the sorted, de-duplicated sources that list `dtype` for `cancer`.

    Args:
        cancer: Value compared for equality with the "Cancer" column (e.g. "ucec").
        dtype: Value compared for equality with the "Datatype" column.
        datasets: Optional DataFrame with "Cancer", "Source" and "Datatype"
            columns. Defaults to the notebook-level `meta` catalogue, so
            existing two-argument calls behave as before.

    Returns:
        A sorted list of the distinct non-null "Source" values of the matching
        rows; an empty list when no row matches.

    Note:
        Matching is exact, so data catalogued under a different "Cancer" label
        is not reported for `cancer`.
    """
    table = meta if datasets is None else datasets
    wanted = (table["Cancer"] == cancer) & (table["Datatype"] == dtype)
    return sorted(table.loc[wanted, "Source"].dropna().unique())

# Report, for each datatype of interest, which sources the catalogue lists for UCEC.
for dtype in ("clinical", "proteomics", "transcriptomics"):
    print(f"UCEC {dtype} sources:", sources_for("ucec", dtype))



UCEC clinical sources: []
UCEC proteomics sources: ['bcm', 'umich']
UCEC transcriptomics sources: ['bcm', 'broad', 'washu']


In [15]:

import sys
import types

print(sys.executable)

# Adding automatic retry for Zenodo downloads
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import cptac.tools.download_tools as download_tools

# Retry GET requests on rate-limit / transient server errors with exponential
# back-off, honouring a Retry-After header when the server sends one.
retry = Retry(
    total=12,
    backoff_factor=2,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["GET"],
    respect_retry_after_header=True,
)

session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retry))

# Make cptac's download code use the retrying session.
# Assigning the Session object itself to `download_tools.requests` would hide
# every module-level name (`requests.exceptions`, `requests.Session`, ...)
# from download_tools. Instead, hand it a module-like shim that mirrors the
# real `requests` namespace and reroutes only the HTTP verb helpers through
# the session.
# NOTE(review): this only affects code in download_tools that looks up
# `requests.<verb>` at call time; confirm against the installed cptac version.
retrying_requests = types.ModuleType("requests_with_retry")
retrying_requests.__dict__.update(
    {name: value for name, value in vars(requests).items() if not name.startswith("__")}
)
for verb in ("request", "get", "options", "head", "post", "put", "patch", "delete"):
    setattr(retrying_requests, verb, getattr(session, verb))

download_tools.requests = retrying_requests

print("Retry session set up")




/mnt/vol1/protein_cancer/envs/python_ai_env/bin/python
Retry session set up


In [31]:
# Load UCEC and map each source to the datatypes it is able to load

import cptac

ds = cptac.Ucec()

# source name -> alphabetically sorted keys of that source's load_functions
available = dict(
    (name, sorted(loader.load_functions.keys()))
    for name, loader in ds._sources.items()
)
available


{'bcm': ['CNV',
  'circular_RNA',
  'miRNA',
  'phosphoproteomics',
  'proteomics',
  'transcriptomics'],
 'broad': ['transcriptomics'],
 'mssm': ['clinical', 'follow-up', 'medical_history'],
 'umich': ['acetylproteomics', 'phosphoproteomics', 'proteomics'],
 'washu': ['CNV',
  'cibersort',
  'hla_typing',
  'miRNA',
  'somatic_mutation',
  'transcriptomics',
  'tumor_purity',
  'xcell'],
 'harmonized': ['ancestry_prediction', 'somatic_mutation']}

In [17]:
# pick the first source that supports clinical

def first_source_with(ds, dtype):
    """Return the name of the first source of `ds` that can load `dtype`.

    Sources are scanned in the iteration order of `ds._sources`; a source
    qualifies when `dtype` is contained in its `load_functions`.
    Returns None when no source qualifies.
    """
    candidates = (
        name
        for name, loader in ds._sources.items()
        if dtype in loader.load_functions
    )
    return next(candidates, None)

# Resolve the clinical source once; None means no source of `ds` lists
# "clinical" among its load functions.
clinical_source = first_source_with(ds, "clinical")
print("Clinical source for UCEC:", clinical_source)


Clinical source for UCEC: mssm


In [32]:
# Download with a simple backoff loop

import time

def call_with_backoff(func, *args, **kwargs):
    """Call `func(*args, **kwargs)`, retrying with linear back-off on HTTP 429.

    An exception counts as a rate-limit error when its message contains "429".
    After the k-th rate-limited attempt the function waits 30*k seconds and
    tries again, for at most 10 attempts in total.

    Args:
        func: Callable to invoke.
        *args, **kwargs: Forwarded unchanged to `func`.

    Returns:
        Whatever `func` returns on the first successful attempt.

    Raises:
        RuntimeError: If every attempt was rate-limited. The last underlying
            exception is attached as `__cause__`.
        Exception: Any error whose message does not contain "429" is re-raised
            immediately, without retrying.
    """
    max_attempts = 10
    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            if "429" not in str(e):
                raise
            last_error = e
            if attempt == max_attempts:
                # No attempt follows, so sleeping again would only delay the
                # failure report.
                break
            wait = 30 * attempt
            print(f"Hit Zenodo rate limit (429). Waiting {wait}s then retrying...")
            time.sleep(wait)
    raise RuntimeError("Still rate-limited after retries.") from last_error


In [19]:
# Get the data and save it

import os
from pathlib import Path

# Clinical data is optional: it is fetched only when a source for it was found.
clinical = None
if clinical_source is not None:
    clinical = call_with_backoff(ds.get_clinical, source=clinical_source)

proteomics = call_with_backoff(ds.get_proteomics, source="bcm")
transcriptomics = call_with_backoff(ds.get_transcriptomics, source="bcm")

print("clinical:", None if clinical is None else clinical.shape)
print("proteomics:", proteomics.shape)
print("transcriptomics:", transcriptomics.shape)

# The project root can be overridden through MULTIOMICS_PROJECT_DIR so the
# notebook also runs on machines without the original mount point; the
# default is unchanged.
PROJECT = Path(
    os.environ.get("MULTIOMICS_PROJECT_DIR", "/mnt/vol1/Multi-Omics-Cancer-Subtype-Discovery")
)
RAW = PROJECT / "data" / "raw"
RAW.mkdir(parents=True, exist_ok=True)

# Write each table that was actually loaded to <RAW>/<name>.csv.
tables = {
    "clinical": clinical,
    "proteomics": proteomics,
    "transcriptomics": transcriptomics,
}
saved = []
for name, table in tables.items():
    if table is None:
        continue
    path = RAW / f"{name}.csv"
    table.to_csv(path)
    saved.append(path)

# Report only what this cell wrote, not every CSV that happens to be in RAW.
print("Saved files:")
for p in sorted(saved):
    print(" -", p.name)



Downloading clinical_Pan-cancer.May2022.tsv.gz: 100%|███| 243k/243k [00:01<00:00, 138kB/s]
Downloading UCEC_proteomics_gene_abundance_log2_reference_intensity_normalized_Tumor.txt.g
Downloading UCEC_proteomics_gene_abundance_log2_reference_intensity_normalized_Normal.txt.
Downloading gencode.v34.basic.annotation-mapping.txt.gz: 100%|█| 1.75M/1.75M [00:14<00:00,
Downloading UCEC-gene_rsem_removed_circRNA_tumor_normal_UQ_log2(x+1)_BCM.txt.gz: 100%|█| 8


clinical: (103, 124)
proteomics: (113, 11949)
transcriptomics: (119, 59286)
