In [None]:
import datetime
import glob
import os
import time

import numpy as np
import pandas as pd

READ CNV DATA

In [None]:
# Collect every copy-number TSV below ../../outputs/<directory>/.
# Each entry is (path, directory name); the directory name becomes the 'file_name'
# value that is later matched against the 'File ID' column of the label table.
# os.path is used instead of a fixed split('/')[3] so the lookup does not depend on
# the path depth or separator, and sorted() makes the row order reproducible
# (glob returns matches in arbitrary order).
filelist_cnv = [
    (path, os.path.basename(os.path.dirname(path)))
    for path in sorted(glob.glob("../../outputs/*/*copy*.tsv"))
]
len(filelist_cnv)

In [None]:
def read_cnv_df(filepath,filename):
    '''
    Load one tab-separated CNV file and return it as a single-row frame.

    The file's second column is used as the row index, only the 'copy_number'
    column is kept, and the result is transposed so that every index value
    becomes a column. The row is tagged with `filename` in a leading
    'file_name' column.
    '''
    raw = pd.read_csv(filepath, header=0, sep='\t', index_col=1)
    # One row named after the sample, one column per index entry of the file.
    wide = raw[['copy_number']].T
    wide.index = [filename]
    # reset_index() turns the sample name into a column called 'index'.
    return wide.reset_index().rename(columns={'index': 'file_name'})

In [None]:
# One single-row frame per CNV file, in the order of filelist_cnv.
df_list = [read_cnv_df(path, sample) for path, sample in filelist_cnv]

In [None]:
# Stack the per-file rows into one table. Every input frame carries index label 0,
# so ignore_index=True is needed to give the combined table a unique 0..n-1 index.
master_cnv_df = pd.concat(df_list, ignore_index=True)
# Persist the unfiltered table; the index carries no information, so it is omitted.
master_cnv_df.to_csv("../../data/master_cnv_df_allsamples.csv", index=False)

Reading Gene Data


In [None]:
# Collect every RNA TSV below ../../outputs/<directory>/.
# Each entry is (path, directory name); the directory name becomes the 'file_name'
# value that is later matched against the 'File ID' column of the label table.
# os.path is used instead of a fixed split('/')[3] so the lookup does not depend on
# the path depth or separator, and sorted() makes the row order reproducible
# (glob returns matches in arbitrary order).
filelist_gene = [
    (path, os.path.basename(os.path.dirname(path)))
    for path in sorted(glob.glob("../../outputs/*/*rna*.tsv"))
]
len(filelist_gene)

In [None]:

def read_gene_df(filepath,filename):
    '''
    Load one tab-separated expression file and return it as a single-row frame.

    Column names are taken from the second line of the file; the first six
    lines are skipped when the table itself is read. Only 'tpm_unstranded' is
    kept, indexed by 'gene_name' and transposed so that every gene becomes a
    column. The row is tagged with `filename` in a leading 'file_name' column.
    '''
    with open(filepath) as handle:
        handle.readline()                      # first line is discarded
        header = handle.readline().split()     # second line: column names

    table = pd.read_csv(filepath, header=None, sep='\t', skiprows=6)
    table.columns = header
    table.index = table['gene_name']

    # One row named after the sample, one column per gene.
    wide = table[['tpm_unstranded']].T
    wide.index = [filename]
    # reset_index() turns the sample name into a column called 'index'.
    return wide.reset_index().rename(columns={'index': 'file_name'})

In [None]:
# One single-row frame per expression file, in the order of filelist_gene.
df_list = [read_gene_df(path, sample) for path, sample in filelist_gene]

In [None]:
# Stack the per-file rows into one table. Every input frame carries index label 0,
# so ignore_index=True is needed to give the combined table a unique 0..n-1 index.
master_gene_df = pd.concat(df_list, ignore_index=True)
# Persist the unfiltered table; the index carries no information, so it is omitted.
master_gene_df.to_csv("../../data/master_gene_df_allsamples.csv", index=False)
master_gene_df.head()

Reading Clinical and Sample Data Now


In [None]:
# Load the tab-separated sample sheet.
sample_data = pd.read_csv("../../gdc_sample_sheet.2022-11-25.tsv", delimiter='\t')

# Part of the file name that precedes the first '.'.
sample_data['filename_short'] = sample_data['File Name'].str.split('.', n=1).str[0]

# Drop rows whose sample type consists only of "Solid Tissue Normal" entries, no matter
# how many times the entry is repeated (a fixed list of 1-3 repetitions would miss longer
# ones). Rows with any other sample type, or with a missing sample type, are kept.
sample_data = sample_data[
    ~sample_data['Sample Type'].str.fullmatch(
        r'Solid Tissue Normal(?:, Solid Tissue Normal)*', na=False
    )
].reset_index(drop=True)
sample_data.head()

In [None]:
# A 'Case ID' cell may hold several comma-separated IDs; keep only the first one.
# The vectorised .str accessor replaces a row-wise DataFrame.apply and leaves
# missing values as missing instead of raising on them.
sample_data['Case_ID_Final'] = sample_data['Case ID'].str.split(',', n=1).str[0]
sample_data.head()

Getting the creation date of each file


# Load the file metadata and derive each file's creation date.
time_df = pd.read_json("../../metadata.cart.2022-11-25.json")

# 'analysis' holds a mapping per file; keep the date part (text before 'T') of its
# 'created_datetime' entry. Mapping over the single column avoids a row-wise apply.
time_df['created_date'] = time_df['analysis'].map(
    lambda analysis: analysis['created_datetime'].split('T')[0]
)

# Rename by column name (not by position) so the key matches the sample sheet's 'File Name'.
time_df = time_df[['file_name', 'created_date']].rename(columns={'file_name': 'File Name'})
time_df.head()

Adding timestamp to sample data


In [None]:
# Attach created_date to every sample row; files without a metadata entry are dropped
# by the inner join.
sample_data = sample_data.merge(time_df, how='inner', on='File Name')
sample_data.head()

Adding Labels to Data

In [None]:
# Load the clinical table and keep only the case identifier and its diagnosis.
clinical_data = pd.read_csv(
    "../../clinical.cases_selection.2022-10-25/clinical.tsv", delimiter='\t'
)[['case_submitter_id', 'primary_diagnosis']]
clinical_data

# Collect every diagnosis recorded for a case into one list per case.
u = clinical_data.groupby("case_submitter_id").agg(list).reset_index()
u['len'] = u['primary_diagnosis'].map(len)                             # number of clinical rows for the case
u['all_equal'] = u['primary_diagnosis'].map(lambda dx: len(set(dx)))   # number of distinct diagnoses (1 means all rows agree)
u['final_label'] = u['primary_diagnosis'].map(lambda dx: dx[0])        # first recorded diagnosis is used as the label

# Left join: samples whose case has no clinical row keep a missing final_label.
sample_data = sample_data.merge(
    u[['case_submitter_id', 'final_label']],
    how='left',
    left_on='Case_ID_Final',
    right_on='case_submitter_id',
)
sample_data.head()

Selecting most recent gene and cnv file for a case ID


In [None]:
# Within each (case, data category) pair, order by created_date and keep the last row,
# i.e. the one with the greatest created_date.
sample_data_latest = (
    sample_data
    .sort_values(by=['Case_ID_Final', 'Data Category', 'created_date'])
    .drop_duplicates(subset=['Case_ID_Final', 'Data Category'], keep='last')
    .reset_index(drop=True)
)
sample_data_latest

Keeping only those patients that have both CNV and Gene Expression data


In [None]:
# Count the non-null 'File Name' entries remaining for every case.
test = sample_data_latest.groupby('Case_ID_Final').agg({"File Name": 'count'})
# Cases with fewer than two files are dropped.
# NOTE(review): this equals "missing CNV or expression" only if each case has at most
# one row per data category at this point - confirm against the preceding step.
removed_list = set(test[test['File Name'] < 2].index)

sample_data_latest = sample_data_latest.loc[
    ~sample_data_latest['Case_ID_Final'].isin(removed_list)
].reset_index(drop=True)
sample_data_latest

Bucketing diagnoses into ALC (adenocarcinoma) and SCLC (squamous cell carcinoma), and removing 'Solid carcinoma, NOS' cases, which can be mixed


In [None]:
# Exclude rows labelled 'Solid carcinoma, NOS'; rows with a missing label are kept.
sample_data_latest = sample_data_latest.loc[
    sample_data_latest['final_label'].ne('Solid carcinoma, NOS')
].reset_index(drop=True)
def mapping(x):
    '''
    Map a primary-diagnosis value onto a class label.

    Parameters
    ----------
    x : value of the diagnosis column (a string, or a missing value).

    Returns
    -------
    'ALC' if x is one of the adenocarcinoma diagnoses, 'SCLC' if x is one of the
    squamous cell carcinoma diagnoses, otherwise None.

    Anything that is in neither list (missing values, unlisted diagnoses) is
    deliberately left unlabelled instead of falling through to 'SCLC', so such
    rows are not silently counted as squamous cell carcinoma.
    '''
    ALC = ('Adenocarcinoma with mixed subtypes','Adenocarcinoma, NOS','Bronchio-alveolar carcinoma, mucinous','Bronchiolo-alveolar adenocarcinoma, NOS',
        'Bronchiolo-alveolar carcinoma, non-mucinous','Clear cell adenocarcinoma, NOS','Micropapillary carcinoma, NOS','Papillary adenocarcinoma, NOS')
    SCLC = ('Basaloid squamous cell carcinoma','Papillary squamous cell carcinoma','Squamous cell carcinoma, NOS','Squamous cell carcinoma, keratinizing, NOS',
        'Squamous cell carcinoma, large cell, nonkeratinizing, NOS','Squamous cell carcinoma, small cell, nonkeratinizing')

    if x in ALC:
        return 'ALC'
    if x in SCLC:
        return 'SCLC'
    # Neither list matched: do not guess a class.
    return None
# Label every row from its diagnosis. Series.map calls mapping() once per value
# (missing values included), replacing a row-wise DataFrame.apply over the whole frame.
sample_data_latest['LABEL'] = sample_data_latest['final_label'].map(mapping)
sample_data_latest.head()

In [None]:
# Keep the identifier and label columns, drop every row of project 'CPTAC-3',
# and write the resulting label table to disk.
final_labels = (
    sample_data_latest[['File ID','File Name','Case_ID_Final','LABEL','final_label','Project ID']]
    .loc[lambda labels: labels['Project ID'] != 'CPTAC-3']
    .reset_index(drop=True)
)
final_labels.to_csv("../../data/final_labels.csv", index=False)
final_labels.head()

Filtering CNV_master_df
The encoding of each gene's copy-number value follows a reference paper (the link to the paper is missing here and should be added).
from gtfparse import read_gtf

In [None]:

# Parse the annotation file and keep only rows whose transcript biotype is 'protein_coding'.
gtf = read_gtf("../../data/Homo_sapiens.GRCh38.108.chr.gtf")
gtf = gtf.loc[gtf['transcript_biotype'] == 'protein_coding'].reset_index(drop=True)
gtf[['gene_name','transcript_id','transcript_version']]

# Names of all protein-coding genes, then the CNV columns that are among them.
protein_coding = set(gtf['gene_name'])
a = master_cnv_df.columns[1:]   # every column after the first one
protein_coding_selected = ['file_name'] + list(a.intersection(protein_coding))

#protein_coding_selected

In [None]:
# Build the final CNV table in one pass:
#   1. keep only rows whose file_name appears among the 'File ID's of final_labels,
#   2. keep 'file_name' plus the protein-coding gene columns,
#   3. drop columns that are missing for every remaining row,
#   4. replace the remaining missing values with 0.
# NOTE(review): filling with 0 treats a missing copy number as a copy number of zero -
# confirm this matches the intended encoding.
master_cnv_df_sub = (
    master_cnv_df.loc[master_cnv_df['file_name'].isin(final_labels['File ID'])]
    .reset_index(drop=True)
    .loc[:, protein_coding_selected]
    .dropna(axis=1, how='all')
    .fillna(0)
)

# Preview the result
master_cnv_df_sub.head()

In [None]:
# Write the filtered CNV table to disk without the index column.
master_cnv_df_sub.to_csv("../../data/master_cnv_df.csv",index = False)


Subsetting protein coding genes for Gene Expression dataset


In [None]:
# Preview the combined expression table before it is subset.
master_gene_df.head()

In [None]:
# Every gene column of the expression table (everything after the first column).
all_genes = set(master_gene_df.columns[1:])

# Protein-coding genes, kept in the order in which they appear in master_gene_df.
# Listing a set intersection instead would give a column order that can differ
# between kernel sessions, so the exported CSV would not be reproducible.
# .unique() keeps each gene name once, exactly as the set-based selection did.
selected_genes = [gene for gene in master_gene_df.columns[1:].unique() if gene in protein_coding]
selected_genes.insert(0, 'file_name')
len(selected_genes)

In [None]:
# Build the final expression table in one pass:
#   1. keep only rows whose file_name appears among the 'File ID's of final_labels,
#   2. keep 'file_name' plus the selected gene columns,
#   3. drop columns that are missing (NaN) for every remaining row.
# NOTE(review): step 3 removes all-NaN columns only; genes whose values are all zero
# are NOT removed - confirm which of the two was intended.
master_gene_df_sub = (
    master_gene_df.loc[master_gene_df['file_name'].isin(final_labels['File ID'])]
    .reset_index(drop=True)
    .loc[:, selected_genes]
    .dropna(axis=1, how='all')
)

master_gene_df_sub

# Write the filtered expression table to disk without the index column.
master_gene_df_sub.to_csv("../../data/master_gene_df.csv", index=False)
# NOTE(review): this displays the unfiltered table, not master_gene_df_sub - confirm intent.
master_gene_df

In [None]:
# Number of distinct cases per LABEL value.
final_labels.groupby('LABEL').agg({'Case_ID_Final':'nunique'})