# Exploración de los datos

In [1]:
import cptac

Downloading cptac_genes.csv: 100%|██████████████████████████████████████████████████| 462k/462k [00:00<00:00, 1.07MB/s]
Downloading brca_mapping.csv: 100%|███████████████████████████████████████████████| 6.37k/6.37k [00:00<00:00, 22.5kB/s]
Downloading index.tsv: 100%|██████████████████████████████████████████████████████| 30.2k/30.2k [00:00<00:00, 99.0kB/s]


In [5]:
# Display the cancer-type abbreviations known to cptac together with their
# full names (the cell output is a dict of abbreviation -> name).
cptac.get_cancer_info()

{'brca': 'Breast invasive carcinoma',
 'ccrcc': 'Clear cell renal cell carcinoma',
 'coad': 'Colon adenocarcinoma',
 'gbm': 'Glioblastoma multiforme',
 'hnscc': 'Head and Neck squamous cell carcinoma',
 'lscc': 'Lung squamous cell carcinoma',
 'luad': 'Lung adenocarcinoma',
 'ov': 'Ovarian serous cystadenocarcinoma',
 'pda': 'Pancreatic ductal adenocarcinoma',
 'pdac': 'Pancreatic ductal adenocarcinoma',
 'ucec': 'Uterine Corpus Endometrial Carcinoma'}

Para este proyecto nos interesan los datos de `brca` (carcinoma invasivo de mama).

In [3]:
# Display, for every (source, data type) pair, the cancers it is available for.
cptac.get_source_options()

Unnamed: 0_level_0,Unnamed: 1_level_0,Cancer
Source,Datatype,Unnamed: 2_level_1
bcm,CNV,"[coad, ucec, pdac, ccrcc, ov, lscc, brca, gbm,..."
bcm,circular_RNA,"[ucec, pdac, ccrcc, lscc, gbm, luad, hnscc]"
bcm,miRNA,"[coad, ucec, pdac, ccrcc, lscc, brca, gbm, lua..."
bcm,phosphoproteomics,"[coad, ucec, pdac, ccrcc, ov, lscc, brca, gbm,..."
bcm,proteomics,"[coad, ucec, pdac, ccrcc, ov, lscc, brca, gbm,..."
bcm,transcriptomics,"[coad, ucec, pdac, ccrcc, ov, lscc, brca, gbm,..."
broad,transcriptomics,"[coad, ucec, pdac, ccrcc, ov, lscc, brca, gbm,..."
harmonized,ancestry_prediction,[all_cancers]
harmonized,somatic_mutation,[all_cancers]
mssm,clinical,[all_cancers]


Queremos datos transcriptómicos que, como vemos, están disponibles desde las fuentes bcm, broad y washu. Creo que los tres conjuntos contienen las mismas muestras, aunque con distinto procesado. **TODO**: comprobarlo.

In [13]:
# Create the BRCA dataset object; every later cell pulls its tables from it.
brca = cptac.Brca()

In [14]:
# Display the data types available for BRCA and the sources offering each one.
brca.list_data_sources()

Unnamed: 0,Data type,Available sources
0,CNV,"[bcm, washu]"
1,miRNA,[bcm]
2,phosphoproteomics,"[bcm, umich]"
3,proteomics,"[bcm, umich]"
4,transcriptomics,"[bcm, broad, washu]"
5,ancestry_prediction,[harmonized]
6,somatic_mutation,"[harmonized, washu]"
7,clinical,[mssm]
8,follow-up,[mssm]
9,medical_history,[mssm]


De los tipos de datos que incluye Brca, nos interesan "transcriptomics" y "clinical", en principio. "follow-up" y "medical_history" también pueden ser útiles.

# Dataframe brca transcriptomics

In [19]:
# Pull the transcriptomics table from the "bcm" source and preview its
# first five rows as the cell output.
brca_transc_bcm = brca.get_transcriptomics("bcm")
brca_transc_bcm.head(5)

Name,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2ML1-AS1,A2ML1-AS2,A2MP1,A3GALT2,...,ZXDB,ZXDC,ZYG11A,ZYG11AP1,ZYG11B,ZYX,ZYXP1,ZZEF1,hsa-mir-1253,hsa-mir-423
Database_ID,ENSG00000121410.12,ENSG00000268895.6,ENSG00000148584.15,ENSG00000175899.15,ENSG00000245105.4,ENSG00000166535.20,ENSG00000256661.1,ENSG00000256904.1,ENSG00000256069.7,ENSG00000184389.9,...,ENSG00000198455.4,ENSG00000070476.15,ENSG00000203995.10,ENSG00000232242.2,ENSG00000162378.13,ENSG00000159840.16,ENSG00000274572.1,ENSG00000074755.15,ENSG00000272920.1,ENSG00000266919.3
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
01BR001,3.51,8.04,0.0,12.93,5.01,4.88,0.0,0.0,3.13,0.0,...,9.02,8.88,7.07,0.0,9.86,10.63,0.0,10.05,0.0,0.0
01BR008,2.84,8.24,2.5,13.02,5.67,10.1,0.0,0.0,0.0,0.0,...,7.93,8.84,8.6,0.0,8.44,11.36,0.0,10.2,0.0,0.0
01BR009,4.85,8.11,0.0,13.68,5.87,9.5,0.0,0.0,3.11,0.0,...,7.53,8.85,3.64,0.0,8.9,10.58,0.0,10.54,0.0,0.0
01BR010,4.24,7.96,0.0,12.64,4.82,5.03,0.0,0.0,1.33,0.0,...,8.53,8.31,8.28,0.0,9.28,12.4,0.0,10.42,0.0,0.0
01BR015,3.89,7.56,1.58,13.55,5.32,3.31,0.0,0.0,2.99,0.0,...,9.17,9.05,5.08,0.0,10.13,10.44,0.0,10.46,0.0,0.0


In [20]:
# Pull the transcriptomics table from the "broad" source and preview its
# first five rows as the cell output.
brca_transc_broad = brca.get_transcriptomics("broad")
brca_transc_broad.head(5)

Name,A1BG,A1BG,A1BG,A1BG,A1BG,A1BG-AS1,A1BG-AS1,A1BG-AS1,A1BG-AS1,A1BG-AS1,...,ZZEF1,ZZEF1,ZZEF1,ZZEF1,ZZEF1,ZZEF1,ZZEF1,ZZEF1,hsa-mir-1253,hsa-mir-423
Transcript_ID,ENST00000263100.8,ENST00000595014.1,ENST00000596924.1,ENST00000598345.1,ENST00000600966.1,ENST00000593374.5,ENST00000593960.5,ENST00000594950.5,ENST00000595302.1,ENST00000599728.5,...,ENST00000572426.5,ENST00000572699.1,ENST00000572831.1,ENST00000573183.1,ENST00000573536.1,ENST00000573606.2,ENST00000574474.1,ENST00000575428.1,ENST00000609567.1,ENST00000586878.1
Database_ID,ENSG00000121410.12,ENSG00000121410.12,ENSG00000121410.12,ENSG00000121410.12,ENSG00000121410.12,ENSG00000268895.6,ENSG00000268895.6,ENSG00000268895.6,ENSG00000268895.6,ENSG00000268895.6,...,ENSG00000074755.15,ENSG00000074755.15,ENSG00000074755.15,ENSG00000074755.15,ENSG00000074755.15,ENSG00000074755.15,ENSG00000074755.15,ENSG00000074755.15,ENSG00000272920.1,ENSG00000266919.3
Patient_ID,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
01BR001,0.0,0.06,0.15,10.13,0.0,0.0,0.21,1.26,1.28,0.0,...,0.27,0.62,0.0,0.0,1.38,0.19,0.14,0.0,0.0,0.0
01BR008,0.0,0.09,0.06,3.12,0.11,0.0,0.0,2.98,1.94,0.0,...,0.05,0.12,0.0,0.0,0.74,0.27,0.03,0.0,0.0,0.0
01BR009,0.0,0.13,0.47,11.88,0.0,0.0,0.0,1.22,0.1,0.0,...,0.32,1.19,0.0,0.0,3.53,1.55,0.32,0.28,0.0,0.0
01BR010,0.0,0.16,0.25,12.34,0.0,0.0,0.41,2.05,0.79,0.0,...,0.3,1.66,0.0,0.0,1.6,1.21,0.1,0.0,0.0,0.0
01BR015,0.0,0.04,0.39,19.99,0.0,0.0,0.0,1.93,0.79,0.0,...,0.77,1.29,0.48,0.0,2.08,0.5,0.36,0.0,0.0,0.0


Los datos de broad están cuantificados a nivel de transcrito (nivel `Transcript_ID`), por lo que incluyen varias isoformas de un mismo gen. Por ejemplo, todas las columnas con Database_ID = ENSG00000121410.12 son isoformas del gen A1BG.

In [21]:
# Pull the transcriptomics table from the "washu" source and preview its
# first five rows as the cell output.
brca_transc_washu = brca.get_transcriptomics("washu")
brca_transc_washu.head(5)

Name,5S_rRNA,5S_rRNA,5S_rRNA,5S_rRNA,5S_rRNA,5S_rRNA,5S_rRNA,5S_rRNA,5S_rRNA,5S_rRNA,...,uc_338,uc_338,uc_338,uc_338,uc_338,uc_338,uc_338,uc_338,uc_338,yR211F11.2
Database_ID,ENSG00000201285.1,ENSG00000212595.1,ENSG00000252830.2,ENSG00000271924.1,ENSG00000272253.1,ENSG00000272351.1,ENSG00000272435.1,ENSG00000274059.1,ENSG00000274097.1,ENSG00000274164.1,...,ENSG00000278040.1,ENSG00000278113.1,ENSG00000278218.1,ENSG00000278236.1,ENSG00000278333.1,ENSG00000278413.1,ENSG00000278498.1,ENSG00000278587.1,ENSG00000278643.1,ENSG00000213076.3
Patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
01BR001,0.0,0.0,0.0,1.756644,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.238985,0.0,0.0,0.0,0.0,0.0,0.0
01BR008,0.0,0.0,0.0,1.783371,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01BR009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.732229,...,0.0,0.0,0.0,0.263943,0.0,0.0,0.0,0.0,0.0,0.156545
01BR010,0.0,0.0,0.0,2.023434,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.229401,0.0,0.0,0.242067,0.0,0.0,0.068029
01BR015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.248619,0.0,0.0,0.0,0.0,0.0,0.0


En el caso de la fuente washu, los datos tampoco se desglosan en isoformas del mismo gen, pero muchas columnas comparten un nombre genérico (p. ej., 5S_rRNA o uc_338) y solo se distinguen por su Database_ID. La mejor opción parece ser escoger el dataset bcm. Con el siguiente código se guarda el dataframe de interés en un fichero TSV:

In [22]:
# Write the bcm transcriptomics table to "data.tsv" (tab-separated) in the
# current working directory.
# NOTE(review): the frame shown above has two column-header levels
# (Name, Database_ID), so the file presumably needs a multi-row header
# when it is read back - confirm in the consumer.
brca_transc_bcm.to_csv("data.tsv", sep="\t")

# Dataframe brca clinical

In [23]:
# Pull the clinical table from the "mssm" source and preview its first
# five rows as the cell output.
clinical = brca.get_clinical("mssm")
clinical.head(5)

Downloading clinical_Pan-cancer.May2022.tsv.gz: 100%|████████████████████████████████| 243k/243k [00:00<00:00, 523kB/s]


Name,tumor_code,discovery_study,type_of_analyzed_samples,confirmatory_study,type_of_analyzed_samples,age,sex,race,ethnicity,ethnicity_race_ancestry_identified,...,additional_treatment_pharmaceutical_therapy_for_new_tumor,additional_treatment_immuno_for_new_tumor,number_of_days_from_date_of_initial_pathologic_diagnosis_to_date_of_additional_surgery_for_new_tumor_event_loco-regional,number_of_days_from_date_of_initial_pathologic_diagnosis_to_date_of_additional_surgery_for_new_tumor_event_metastasis,"Recurrence-free survival, days","Recurrence-free survival from collection, days","Recurrence status (1, yes; 0, no)","Overall survival, days","Overall survival from collection, days","Survival status (1, dead; 0, alive)"
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01BR001,BR,Yes,,,,55,Female,Black or African American,Not Hispanic or Latino,,...,,,,,,,0,421.0,,0.0
01BR008,BR,Yes,,,,48,Female,Black or African American,Not Hispanic or Latino,,...,,,,,,,0,,,
01BR009,BR,Yes,,,,64,Female,Black or African American,Not Hispanic or Latino,,...,,,,,,,0,,,
01BR010,BR,Yes,,,,65,Female,Black or African American,Not Hispanic or Latino,,...,,,,,,,0,,,
01BR015,BR,Yes,,,,35,Female,White,Not Hispanic or Latino,,...,,,,,,,0,347.0,,0.0


De nuevo, guardamos los datos de interés en un fichero TSV con:

In [24]:
# Write the clinical table to "clinical.tsv" (tab-separated) in the current
# working directory.
clinical.to_csv("clinical.tsv", sep="\t")