In [3]:
# Ensure the `cptac` package is importable in this kernel's environment.
#
# The install is run through `sys.executable -m pip` so that it uses the same
# interpreter that is executing this cell.

import importlib.util
import sys
import subprocess

# Single source of truth for the package name: the availability check, the
# install command and the status messages all use it.
pkg = "cptac"
if importlib.util.find_spec(pkg) is None:
    print(f"Installing {pkg}...")
    # check_call raises CalledProcessError if pip exits with a non-zero status.
    subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
    # Refresh the import system's finder caches so the freshly installed
    # package can be imported in this session without restarting the kernel.
    importlib.invalidate_caches()
else:
    print(f"{pkg} is already installed")


cptac is already installed


In [88]:
# List the datasets that the cptac package reports as available.
# The listing has Cancer, Source and Datatype columns; it is the starting point
# for finding a cohort that has both proteomics and transcriptomics.

import cptac
import pandas as pd

df = cptac.list_datasets()

# Show only the first ten entries of the listing.
first_rows = df.head(10)
print(first_rows)


  Cancer            Source         Datatype
0   ucec               bcm       proteomics
1   ucec               bcm  transcriptomics
2   None  brca_mapping.csv             None
3   brca             broad  transcriptomics
4  ccrcc             broad  transcriptomics
5   coad             broad  transcriptomics
6    gbm             broad  transcriptomics
7  hnscc             broad  transcriptomics
8   lscc             broad  transcriptomics
9   luad             broad  transcriptomics


In [96]:
# Identify the (Cancer, Source) pairs that provide BOTH proteomics and
# transcriptomics data.

# Exclude listing rows that lack any of the three fields (for example the
# brca_mapping.csv entry, whose Cancer and Datatype are None).
df2 = df.dropna(subset=["Cancer", "Source", "Datatype"]).copy()

# Count datatypes per (Cancer, Source): one row per pair, one column per
# datatype, with 0 where a pair does not list that datatype.
counts = (
    df2.groupby(["Cancer", "Source"])["Datatype"]
       .value_counts()
       .unstack(fill_value=0)
)

print(counts.head(10))

# Keep only Cancer/Source pairs that have both proteomics and transcriptomics.
# reindex() guarantees both columns are present (zero-filled when absent), so
# the mask is always a boolean Series aligned with `counts`. A scalar fallback
# such as `counts.get(col, 0) > 0` yields a plain bool for a missing column,
# and indexing `counts` with that bool raises KeyError.
required = counts.reindex(columns=["proteomics", "transcriptomics"], fill_value=0)
both = counts[(required > 0).all(axis=1)]
print(both)

Datatype                CNV  acetylproteomics  ancestry_prediction  cibersort  \
Cancer      Source                                                              
all_cancers harmonized    0                 0                    1          0   
            mssm          0                 0                    0          0   
            washu         0                 0                    0          0   
brca        bcm           1                 0                    0          0   
            broad         0                 0                    0          0   
            umich         0                 2                    0          0   
            washu         1                 0                    0          1   
ccrcc       bcm           1                 0                    0          0   
            broad         0                 0                    0          0   
            umich         0                 0                    0          0   

Datatype                cir

In [97]:
# Sanity check: which datatypes are listed for the ('ucec', 'bcm') pair?

import cptac

# Rebuild the per-(Cancer, Source) datatype count table from the dataset
# listing, dropping rows that lack any of the three fields.
df = cptac.list_datasets()
df2 = df.dropna(subset=["Cancer", "Source", "Datatype"]).copy()

datatype_by_pair = df2.groupby(["Cancer", "Source"])["Datatype"]
counts = datatype_by_pair.value_counts().unstack(fill_value=0)

print("Datatype columns (first 20):")
print(list(counts.columns)[:20])

# Select the single row for the pair of interest; .loc raises KeyError if the
# pair is not present in the table.
print("\nCounts for ('ucec','bcm') (non-zero only):")
row = counts.loc[("ucec", "bcm")]
nonzero = row[row > 0]
print(nonzero.sort_values(ascending=False))

# Report the two datatypes of interest explicitly (0 if the column is absent).
print("\nSpecifically:")
for datatype in ("proteomics", "transcriptomics"):
    print(f"{datatype}:", int(row.get(datatype, 0)))

Datatype columns (first 20):
['CNV', 'acetylproteomics', 'ancestry_prediction', 'cibersort', 'circular_RNA', 'clinical', 'hla_typing', 'mature_miRNA', 'miRNA', 'phosphoproteomics', 'precursor_miRNA', 'proteomics', 'somatic_mutation', 'total_miRNA', 'transcriptomics', 'tumor_purity', 'xcell']

Counts for ('ucec','bcm') (non-zero only):
Datatype
proteomics           2
phosphoproteomics    2
circular_RNA         1
CNV                  1
miRNA                1
transcriptomics      1
Name: (ucec, bcm), dtype: int64

Specifically:
proteomics: 2
transcriptomics: 1


In [101]:
# Instantiate the UCEC dataset object, after printing the constructor signature
# and the leading lines of the class docstring for reference.

import cptac
import inspect

ucec_signature = inspect.signature(cptac.Ucec)
print("Ucec signature:", ucec_signature)

print("\nUcec docstring (first ~20 lines):")
# __doc__ is None for a class without a docstring; fall back to "" so that
# splitlines() can still be called.
docstring = cptac.Ucec.__doc__ or ""
doc_head = docstring.splitlines()[:20]
print("\n".join(doc_head))


# `ds` is the dataset handle that the following cell inspects.
ds = cptac.Ucec()
print("Loaded dataset object:", type(ds))


Ucec signature: (no_internet=False)

Ucec docstring (first ~20 lines):

    The Ucec class is a child class of the base Cancer class, intended for handling 
    Uterine Corpus Endometrial Carcinoma (UCEC) data from different sources.

    Attributes:
        _sources (dict): A dictionary of source objects that load and hold data 
                         pertaining to UCEC cancer.
    
Loaded dataset object: <class 'cptac.cancers.ucec.Ucec'>


In [102]:
# Collect the attribute names of the loaded dataset object that start with
# "get_" and print them in sorted order, one per line.
methods = [attr_name for attr_name in dir(ds) if attr_name.startswith("get_")]

print("Available get_* methods:")
for method_name in sorted(methods):
    print(" -", method_name)


Available get_* methods:
 - get_CNV
 - get_acetylproteomics
 - get_ancestry_prediction
 - get_cancer_type
 - get_cibersort
 - get_circular_RNA
 - get_clinical
 - get_data_list
 - get_dataframe
 - get_derived_molecular
 - get_docs
 - get_followup
 - get_genotype_all_vars
 - get_hla_typing
 - get_medical_history
 - get_miRNA
 - get_phosphoproteomics
 - get_proteomics
 - get_somatic_mutation
 - get_somatic_mutation_binary
 - get_targeted_phosphoproteomics
 - get_targeted_proteomics
 - get_transcriptomics
 - get_tumor_purity
 - get_xcell
