# Processing TCGA miRNA-seq files

In [9]:
# Loading in libraries

import pandas as pd
import glob
import os.path
import sys
import json
import numpy as np

In [2]:
# Loading manifest file
filelist = pd.read_csv("./tcga/gdc_manifest.2024-01-18.txt", sep="\t", header=0) #11, 661

## Compiling all TCGA miRNA-seq data into a dataframe
Each file is in a separate folder. I use the manifest file to generate the file pathway and pull out the data.

In [3]:

data_list = []
ids = []

for i in range(len(filelist.index)):
    id = filelist['id'].iloc[i]
    filename = filelist['filename'].iloc[i]
    path = "./tcga/" + id + "/" + filename

    if os.path.isfile(path):
        f = pd.read_csv(path, sep="\t", header=0)
        f.insert(0, 'file_UUID', [id] * len(f.index))
        data_list.append(f)
    else: 
        ids.append(id)
data = pd.concat(data_list)

data.shape #(21934341, 5)

(21934341, 5)

## Taking the miRNA expression from the TCGA data of the 'interesting' 22 miRNAs from the Kidney cancer cachexia miRNA-seq dataset

The interesting miRNAs here are those DE miRNAs whose target genes are associated with "Cartilage Develoment in Endochondral Bone Morphogenesis" and "Myoblast Proliferation".

In [4]:
# load in interesting miRNAs from kidney (22)
kidney_miRNAs = pd.read_csv("./kidney_results_interesting_miRNAs.csv", sep=",", header=0)

# change format of miRNA IDs to match TCGA
interesting_miRNAs = []
for i in list(set(kidney_miRNAs['miRNA'].tolist())):
    m = i.split('-')
    interesting_miRNAs.append("hsa-mir-" + m[2])

# get interesting miRNAs from TCGA data
tcga_interesting_miRNAs = data.loc[data['miRNA_ID'].isin(interesting_miRNAs)]

tcga_interesting_miRNAs.shape #(209898, 5)
tcga_interesting_miRNAs.head()

Unnamed: 0,file_UUID,miRNA_ID,read_count,reads_per_million_miRNA_mapped,cross-mapped
108,5c57e4a1-6498-4ace-8a63-04550891ec02,hsa-mir-1271,4,0.45304,N
167,5c57e4a1-6498-4ace-8a63-04550891ec02,hsa-mir-130b,205,23.218309,N
186,5c57e4a1-6498-4ace-8a63-04550891ec02,hsa-mir-140,6399,724.751023,N
194,5c57e4a1-6498-4ace-8a63-04550891ec02,hsa-mir-146a,800,90.608035,N
206,5c57e4a1-6498-4ace-8a63-04550891ec02,hsa-mir-152,2094,237.166533,N


In [5]:
# Check if some interesting miRNAs are not present in TCGA data: 3 miRNAs
not_in_tcga = []

for i in interesting_miRNAs:
    if i not in list(set(list(tcga_interesting_miRNAs['miRNA_ID']))):
        not_in_tcga.append(i)

not_in_tcga

['hsa-mir-30c', 'hsa-mir-138', 'hsa-mir-194']

Some miRNAs (i.e., 'hsa-mir-138', 'hsa-mir-30c', 'hsa-mir-194') are not present in the TCGA files that I have loaded. This is likely an annotation issue. 

## Retrieving the Clinical Data of each case
The clinical data of each case has to be matched to the corresponding file using the details displayed on the Repository 'Files' tab. This contains the file UUID and its corresponding case ID. The case ID is then used to match the clinical details to the respective file UUID. 

To obtain the primary site at the study/project level, I used the details displayed on the Repository 'Cases' tab to match the case ID to the corresponding primary site, as this isn't included in the clinical data. 


In [19]:
# loading in clinical data and taking only relevant columns

clinical_data = pd.read_csv("./tcga/clinical.cases_selection.2024-01-21/clinical.tsv", sep="\t", header=0)
cd = np.asarray(clinical_data)
cd[cd == "'--"] = np.nan
cd = pd.DataFrame(cd,index= list(clinical_data.index), columns=list(clinical_data.columns))
clinical_data_clean = cd.dropna(axis=1, how='all')

# clinical_data_clean = clinical_data[['case_id', 'project_id', 'gender', 'primary_diagnosis', 'primary_disease', 
#                                      'tumor_grade', 'tumor_stage',
#                                      'tissue_or_organ_of_origin', 'metastasis_at_diagnosis_site',
#                                      'sites_of_involvement']].drop_duplicates()

# add sex, primary vs metastasis, tumor type, tumor grade, primary site
clinical_data_clean.shape # 10,753 rows

(20963, 50)

In [20]:

clinical_data_clean = clinical_data_clean.drop(['age_at_index', 'age_is_obfuscated', 'cause_of_death', 'days_to_birth', 
                          'days_to_death', 'ethnicity', 'vital_status', 'year_of_birth', 'year_of_death',
                          'age_at_diagnosis', 'days_to_diagnosis', 'days_to_last_follow_up', 'site_of_resection_or_biopsy',
                          'year_of_diagnosis', 'treatment_or_therapy', 'treatment_type'], axis = 1)

clinical_data_clean.columns
clinical_data_clean.shape # 10,753 rows

(20963, 34)

In [21]:
# Opening the TCGA Repository 'Files' tab details which contain the project_id, case_id, filename, and file_UUID

with open('./tcga/files.2024-01-23.json') as f:
    metadata = json.load(f)

# fix dictionary
md_list = []
for i in range(len(metadata)):
    md_list.append({'project_id' : metadata[i]['cases'][0]['project']['project_id'],
                    'case_id' : metadata[i]['cases'][0]['case_id'],
                    'filename': metadata[i]['file_name'],
                    'file_UUID': metadata[i]['file_id']})

# flatten json dict to dataframe
df_md = pd.json_normalize(md_list)
df_md.head()

Unnamed: 0,project_id,case_id,filename,file_UUID
0,TCGA-BRCA,ff0f875f-113a-4c0f-8c0a-4259cd496e6d,becd04bc-9d52-411e-8dab-2edd585170b3.mirbase21...,5c57e4a1-6498-4ace-8a63-04550891ec02
1,TCGA-BRCA,408cb583-6dc3-4698-8bd2-e284042bd5ef,e496cc70-7510-41d9-9f9b-d054ff7332dc.mirbase21...,07ff7aa3-a3be-44c0-89d8-c67a0dedcc72
2,TCGA-BRCA,bef7b135-a727-45e8-850a-cc4cd56c49aa,7348be0a-2a46-47df-b13e-2660f391a865.mirbase21...,a0992897-79c4-4e33-974a-a65248ee43fa
3,TCGA-BRCA,83f26720-007c-42c9-a7de-7484bf8afa89,cddbfd6d-5362-4c75-a0db-022c4e0137c0.mirbase21...,8acfb9e1-3fc2-4736-a1e3-44593ca8d1fe
4,TCGA-BRCA,bfd49783-1767-469b-9d79-7822301c5efc,fda26b22-b0f8-498a-b627-f105d010f051.mirbase21...,21abffb3-3e07-4022-86ff-51a41036cfc4


In [22]:
# Opening the Repository 'Cases' tab file which includes the primary site of the study

with open('./tcga/cases.2024-02-01.json') as cases:
    metadata_cases = json.load(cases)

# fix dictionary
md_cases_list = []
for i in range(len(metadata_cases)):
    md_cases_list.append({'primary_site' : metadata_cases[i]['primary_site'],
                    'case_id': metadata_cases[i]['case_id']})

# flatten json dict to dataframe
df_md_cases = pd.json_normalize(md_cases_list)
df_md_cases.head()

Unnamed: 0,primary_site,case_id
0,Breast,a8b1f6e7-2bcf-460d-b1c6-1792a9801119
1,Breast,17f275c1-a0d4-487d-8f02-ea279584b4cd
2,Breast,6f6e7356-3521-4674-8eec-ad01340d4b8e
3,Breast,8a0ef221-8070-43fc-9d98-def90abe4871
4,Breast,195b6731-6ae6-416e-a345-0eb8874a0ca2


In [23]:
# joining the repository 'cases' and 'files' tabs by case_id

df_md_merged = df_md.merge(df_md_cases, how = 'left', on = 'case_id')
df_md_merged.head()

Unnamed: 0,project_id,case_id,filename,file_UUID,primary_site
0,TCGA-BRCA,ff0f875f-113a-4c0f-8c0a-4259cd496e6d,becd04bc-9d52-411e-8dab-2edd585170b3.mirbase21...,5c57e4a1-6498-4ace-8a63-04550891ec02,Breast
1,TCGA-BRCA,408cb583-6dc3-4698-8bd2-e284042bd5ef,e496cc70-7510-41d9-9f9b-d054ff7332dc.mirbase21...,07ff7aa3-a3be-44c0-89d8-c67a0dedcc72,Breast
2,TCGA-BRCA,bef7b135-a727-45e8-850a-cc4cd56c49aa,7348be0a-2a46-47df-b13e-2660f391a865.mirbase21...,a0992897-79c4-4e33-974a-a65248ee43fa,Breast
3,TCGA-BRCA,83f26720-007c-42c9-a7de-7484bf8afa89,cddbfd6d-5362-4c75-a0db-022c4e0137c0.mirbase21...,8acfb9e1-3fc2-4736-a1e3-44593ca8d1fe,Breast
4,TCGA-BRCA,bfd49783-1767-469b-9d79-7822301c5efc,fda26b22-b0f8-498a-b627-f105d010f051.mirbase21...,21abffb3-3e07-4022-86ff-51a41036cfc4,Breast


In [24]:
# merging clinical details and file details

file_details = df_md_merged.merge(clinical_data_clean, on = 'case_id', how='left')
file_details_clean = file_details.drop(labels = 'project_id_y', axis = 1)
file_details_clean.rename(columns = {"project_id_x": "project_id"},  
          inplace = True) 
file_details_clean.head()

Unnamed: 0,project_id,case_id,filename,file_UUID,primary_site,case_submitter_id,gender,race,ajcc_clinical_m,ajcc_clinical_n,...,primary_diagnosis,primary_gleason_grade,prior_malignancy,prior_treatment,progression_or_recurrence,secondary_gleason_grade,synchronous_malignancy,tissue_or_organ_of_origin,tumor_grade,wilms_tumor_histologic_subtype
0,TCGA-BRCA,ff0f875f-113a-4c0f-8c0a-4259cd496e6d,becd04bc-9d52-411e-8dab-2edd585170b3.mirbase21...,5c57e4a1-6498-4ace-8a63-04550891ec02,Breast,TCGA-C8-A26W,female,asian,,,...,"Infiltrating duct carcinoma, NOS",,no,No,not reported,,No,"Breast, NOS",Not Reported,
1,TCGA-BRCA,ff0f875f-113a-4c0f-8c0a-4259cd496e6d,becd04bc-9d52-411e-8dab-2edd585170b3.mirbase21...,5c57e4a1-6498-4ace-8a63-04550891ec02,Breast,TCGA-C8-A26W,female,asian,,,...,"Infiltrating duct carcinoma, NOS",,no,No,not reported,,No,"Breast, NOS",Not Reported,
2,TCGA-BRCA,408cb583-6dc3-4698-8bd2-e284042bd5ef,e496cc70-7510-41d9-9f9b-d054ff7332dc.mirbase21...,07ff7aa3-a3be-44c0-89d8-c67a0dedcc72,Breast,TCGA-E9-A1RH,female,white,,,...,"Infiltrating duct carcinoma, NOS",,no,No,not reported,,No,"Breast, NOS",Not Reported,
3,TCGA-BRCA,408cb583-6dc3-4698-8bd2-e284042bd5ef,e496cc70-7510-41d9-9f9b-d054ff7332dc.mirbase21...,07ff7aa3-a3be-44c0-89d8-c67a0dedcc72,Breast,TCGA-E9-A1RH,female,white,,,...,"Infiltrating duct carcinoma, NOS",,no,No,not reported,,No,"Breast, NOS",Not Reported,
4,TCGA-BRCA,bef7b135-a727-45e8-850a-cc4cd56c49aa,7348be0a-2a46-47df-b13e-2660f391a865.mirbase21...,a0992897-79c4-4e33-974a-a65248ee43fa,Breast,TCGA-AO-A0JJ,female,white,,,...,"Lobular carcinoma, NOS",,no,No,not reported,,No,"Breast, NOS",Not Reported,


In [25]:
# merging clinical, file, case data with expression data

tcga_interesting_miRNAs_complete = file_details_clean.merge(tcga_interesting_miRNAs.drop('cross-mapped', axis = 1), how = 'left', on = 'file_UUID')

In [26]:
tcga_interesting_miRNAs_complete.columns

Index(['project_id', 'case_id', 'filename', 'file_UUID', 'primary_site',
       'case_submitter_id', 'gender', 'race', 'ajcc_clinical_m',
       'ajcc_clinical_n', 'ajcc_clinical_stage', 'ajcc_clinical_t',
       'ajcc_pathologic_m', 'ajcc_pathologic_n', 'ajcc_pathologic_stage',
       'ajcc_pathologic_t', 'ajcc_staging_system_edition',
       'ann_arbor_b_symptoms', 'ann_arbor_clinical_stage',
       'ann_arbor_extranodal_involvement', 'classification_of_tumor',
       'figo_stage', 'icd_10_code', 'igcccg_stage',
       'last_known_disease_status', 'masaoka_stage', 'morphology',
       'primary_diagnosis', 'primary_gleason_grade', 'prior_malignancy',
       'prior_treatment', 'progression_or_recurrence',
       'secondary_gleason_grade', 'synchronous_malignancy',
       'tissue_or_organ_of_origin', 'tumor_grade',
       'wilms_tumor_histologic_subtype', 'miRNA_ID', 'read_count',
       'reads_per_million_miRNA_mapped'],
      dtype='object')

In [27]:
tcga_interesting_miRNAs_complete.shape # (209898, 16)

(408636, 40)

In [28]:
sys.getsizeof(tcga_interesting_miRNAs_complete)

880028444

tcga_interesting_miRNAs_complete.to_csv("tcga_interesting_miRNAs_complete.csv")