In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

In [2]:
dfCNA_ori = pd.read_csv('LIHC__genome_wide_snp_6__GeneLevelCNA.txt', sep='\t', header=0)
dfDNAmeth_ori = pd.read_csv('LIHC_Methylation450__SingleValue__TSS1500__Both.txt', sep='\t', header=0)
dfRNASeq_ori = pd.read_csv('LIHC_RNASeq__illuminahiseq_rnaseqv2__GeneExp.txt', sep='\t', header=0)

In [3]:
print('Original shape of \n\tRNASeq data =', dfRNASeq_ori.shape, '\n\tMethylation data =', dfDNAmeth_ori.shape, ' and\n\tCNA data =', dfCNA_ori.shape)

Original shape of 
	RNASeq data = (20530, 426) 
	Methylation data = (20421, 431)  and
	CNA data = (24924, 764)


#### Get rid of the non-sample columns (e.g. GeneSymbol, strand, SingleValueType etc)

In [4]:
dfRNASeq = dfRNASeq_ori.drop(dfRNASeq_ori.columns[[0,1]], axis=1)    # remove 'GeneSymbol', 'EntrezID'
dfDNAmeth = dfDNAmeth_ori.drop(dfDNAmeth_ori.columns[[0,1]], axis=1) # remove 'GeneSymbol', 'SingleValueType'
dfCNA = dfCNA_ori.drop(dfCNA_ori.columns[[0, 1, 2]], axis=1)       # remove 'GeneSymbol', 'Chromosome', 'Strand'
print('New shape of \n\tRNASeq data =', dfRNASeq.shape, '\n\tMethylation data =', dfDNAmeth.shape, ' and\n\tCNA data =', dfCNA.shape)

New shape of 
	RNASeq data = (20530, 424) 
	Methylation data = (20421, 429)  and
	CNA data = (24924, 761)


In [5]:
#dfRNASeq_ori.columns[[0,1,2]]

In [6]:
#dfDNAmeth_ori.columns[[0, 1,2]]

In [7]:
#dfCNA_ori.columns[[0, 1,2,3]]

#### Remove rows containing >=20% values as NA

In [8]:
def removeRows(df):
    threshold = int(0.8*df.shape[1])
    dfNonNA = df.dropna(axis=0, thresh=threshold)
    return dfNonNA

In [9]:
dfRNASeq = removeRows(dfRNASeq)
dfDNAmeth = removeRows(dfDNAmeth)
dfCNA = removeRows(dfCNA)
print('New shape of \n\tRNASeq data =', dfRNASeq.shape, '\n\tMethylation data =', dfDNAmeth.shape, ' and\n\tCNA data =', dfCNA.shape)

New shape of 
	RNASeq data = (20530, 424) 
	Methylation data = (18996, 429)  and
	CNA data = (23604, 761)


#### Remove cols containing >=20% values as NA

In [10]:
def removeCols(df):
    threshold = int(0.8*df.shape[0])
    dfNonNA = df.dropna(axis=1, thresh=threshold)
    return dfNonNA

In [11]:
dfRNASeq = removeCols(dfRNASeq)
dfDNAmeth = removeCols(dfDNAmeth)
dfCNA = removeCols(dfCNA)
print('New shape of \n\tRNASeq data =', dfRNASeq.shape, '\n\tMethylation data =', dfDNAmeth.shape, ' and\n\tCNA data =', dfCNA.shape)

New shape of 
	RNASeq data = (20530, 424) 
	Methylation data = (18996, 429)  and
	CNA data = (23604, 761)


In [12]:
#print(dfDNAmeth.iloc[:,0])

#### Fetch column names from the matrix

In [13]:
rnaSeqSamples = list(dfRNASeq)
dnaMethSamples = list(dfDNAmeth)
cnaSamples = list(dfCNA)
#print(rnaSeqSamples)

#### Fetch participant, sample and tumor type information from TCGA bar-code<br>(https://docs.gdc.cancer.gov/Encyclopedia/pages/TCGA_Barcode/)

In [14]:
rnaSeqID = [x[8:16] for x in rnaSeqSamples]
methID = [x[8:16] for x in dnaMethSamples]
cnaID = [x[8:16] for x in cnaSamples]
print('#RNASeq Samples =', len(rnaSeqID), ', #DNAMeth Samples =', len(methID), ', #CNA Samples =', len(cnaID))

#RNASeq Samples = 424 , #DNAMeth Samples = 429 , #CNA Samples = 761


In [15]:
#methID

#### Remove duplicate samples-vial combinations (henceforth referred to as samples)

In [16]:
rnaSeqID = list(set(rnaSeqID))
methID = list(set(methID))
cnaID = list(set(cnaID))
print('After removing duplicate samples\n#RNASeq Samples =', len(rnaSeqID), ', #DNAMeth Samples =', len(methID), ', #CNA Samples =', len(cnaID))

After removing duplicate samples
#RNASeq Samples = 424 , #DNAMeth Samples = 429 , #CNA Samples = 760


#### Identify common samples

In [17]:
def compareLists(bigList, smallList):
    commonList = []
    for i in smallList:
        for j in bigList:
            if i == j:
                commonList.append(bigList.index(j))
                break
    return commonList

In [18]:
commonSamples = list(set(rnaSeqID) & set(methID) & set(cnaID))
print('#commonSamples', len(commonSamples), '\ncommonSamples =', commonSamples)

#commonSamples 404 
commonSamples = ['A4NR-01A', 'A75G-01A', 'A116-01A', 'AAE7-01A', 'A82E-01A', 'AAW0-01A', 'A9FW-01A', 'A7SB-01A', 'A10X-11A', 'A5MZ-01A', 'A9H9-01A', 'A4NK-01A', '5264-01A', 'A3KG-01A', 'AAVV-01A', 'A1EC-11A', 'AACV-01A', 'A10U-01A', 'AAD2-01A', 'A7IL-01A', 'A3R2-01A', 'A26S-11A', 'AAPD-01A', 'A11C-11A', 'A4NN-01A', 'A3MA-01A', 'A9H6-01A', 'AAVU-01A', 'A4NV-01A', 'A9H3-01A', 'A2L6-11A', 'A4NJ-01A', '5258-01A', 'AAW3-01A', 'AA46-01A', 'AACX-01A', 'A8YO-01A', 'A119-01A', 'A5NQ-01A', 'A113-11A', 'A1EG-11A', 'A5KG-01A', 'A5SL-01A', 'A23B-01A', 'A1EH-01A', 'A7IE-01A', 'AACS-01A', 'A3A3-11A', 'AADN-01A', 'A9H7-01A', 'A3A2-01A', 'AAV5-01A', 'A1EA-01A', 'A3A6-01A', 'A3A1-01A', 'A8JO-01A', 'A217-01A', 'A1HT-01A', 'A5N0-01A', 'AAV0-01A', 'A75H-01A', 'A118-01A', 'A39X-01A', 'A3JL-01A', 'A9D0-01A', 'A7IK-01A', 'A3OU-01A', 'A10Z-01A', 'AADW-01A', 'A4U2-01A', 'AA44-01A', 'A7XP-01A', 'AACL-01A', 'AAV1-01A', 'A39W-11A', 'AACB-01A', 'A8LF-01A', 'A216-01A', 'A9CY-01A', 'A9D4-01A', 'A9

In [19]:
#type(cnaID)

In [20]:
commonRNASeqSamples = compareLists(rnaSeqID, commonSamples)
commonDNAmethSamples = compareLists(methID, commonSamples)
commonCNASamples = compareLists(cnaID, commonSamples)

In [21]:
print('extracted # samples from:\n\tRNA-seq data =', len(commonRNASeqSamples),'\n\tDNAMeth data =', len(commonDNAmethSamples), '\n\tCNA data =', len(commonCNASamples))

extracted # samples from:
	RNA-seq data = 404 
	DNAMeth data = 404 
	CNA data = 404


In [22]:
dfRNASeq = dfRNASeq.iloc[:, commonRNASeqSamples]
dfDNAmeth = dfDNAmeth.iloc[:, commonDNAmethSamples]
dfCNA = dfCNA.iloc[:, commonCNASamples]

In [23]:
#print(list(dfDNAmeth))

In [24]:
#dfDNAmeth.iloc[:, 0]#commonDNAmethSamples]

In [25]:
print('New shape of \n\tRNASeq data =', dfRNASeq.shape, '\n\tMethylation data =', dfDNAmeth.shape, ' and\n\tCNA data =', dfCNA.shape)

New shape of 
	RNASeq data = (20530, 404) 
	Methylation data = (18996, 404)  and
	CNA data = (23604, 404)


#### Remove records from RNA-seq data where combined expression values across all samples are too low.<br>
Hence remove 1st quantile i.e. add all values from a row --> order in ascending order --> remove 1st 25% lowest values

(Quantile: rank the vals in ascending order. Compute the rank at 25% of the total vals in the list. Round it up to the nearest higher integrer. Fetch the value corresp to that rank.)

In [26]:
sum_o_rows = dfRNASeq.sum(axis=1)
#print(type(sum_o_rows))
print(sum_o_rows.quantile(0.25))

4537.887000000001


In [27]:
sum_o_rows = dfRNASeq.sum(axis=1)
dfRNASeq_indices = pd.DataFrame(sum_o_rows > sum_o_rows.quantile(0.25));
dfRNASeq2 = dfRNASeq[dfRNASeq_indices.values]
print('New shape of \n\tRNASeq2 data =', dfRNASeq2.shape, '\n\tMethylation data =', dfDNAmeth.shape, ' and\n\tCNA data =', dfCNA.shape)

New shape of 
	RNASeq2 data = (15397, 404) 
	Methylation data = (18996, 404)  and
	CNA data = (23604, 404)


In [28]:
print(type(dfRNASeq_indices))

<class 'pandas.core.frame.DataFrame'>


In [29]:
print(type(dfRNASeq_indices.values))

<class 'numpy.ndarray'>


#### Impute missing values w avg of that col => avg of values for all genes in that sample are taken

In [30]:
imputer = SimpleImputer(missing_values = np.nan, strategy="mean")
dfDNAmeth_imputed=imputer.fit_transform(dfDNAmeth)
dfRNASeq2_imputed=imputer.fit_transform(dfRNASeq2)
dfCNA_imputed=imputer.fit_transform(dfCNA)

In [31]:
print('New shape of \n\dfRNASeq2_imputed data =', dfRNASeq2_imputed.shape, '\n\tMethylation data =', dfDNAmeth_imputed.shape, ' and\n\tCNA data =', dfCNA_imputed.shape)

New shape of 
\dfRNASeq2_imputed data = (15397, 404) 
	Methylation data = (18996, 404)  and
	CNA data = (23604, 404)


#### Transpose the matrix so that samples are aligned against rows --> labeling of samples is possible.<br>Imputing before transpose is very important. If imputing performed after transpose, the missing values for a sample and a gene will be avg of all samples for that gene which is wrong.

In [32]:
dfDNAmeth_transposed = dfDNAmeth_imputed.transpose()
dfRNASeq2_transposed = dfRNASeq2_imputed.transpose()
dfCNA_transposed = dfCNA_imputed.transpose()

In [33]:
print('New shape of \n\dfRNASeq2_transposed data =', dfRNASeq2_transposed.shape, '\n\tMethylation data =', dfDNAmeth_transposed.shape, ' and\n\tCNA data =', dfCNA_transposed.shape)

New shape of 
\dfRNASeq2_transposed data = (404, 15397) 
	Methylation data = (404, 18996)  and
	CNA data = (404, 23604)


In [34]:
def normalizeVals(df):
    rows,cols = df.shape;print(df.shape)
    mins = np.zeros(shape=cols, dtype=np.float32)
    maxs = np.zeros(shape=cols, dtype=np.float32)
    
    for j in range(cols):
        mins[j] = np.min(df[:, j])
        maxs[j] = np.max(df[:, j])
        
    normalized_df = np.copy(df)
    for i in range(rows):
        for j in range(cols):
            normalized_df[i,j] = (df[i,j] - mins[j]) / (maxs[j] - mins[j])
    return normalized_df

In [35]:
normDNAmeth = normalizeVals(dfDNAmeth_transposed)
normRNASeq = normalizeVals(dfRNASeq2_transposed)
normCNA = normalizeVals(dfCNA_transposed)

(404, 18996)
(404, 15397)
(404, 23604)


#### Export data

In [36]:
df_normDNAmeth = pd.DataFrame(normDNAmeth)
df_normDNAmeth.to_csv('normDNAmeth.csv', index=False)

df_normRNASeq = pd.DataFrame(normRNASeq)
df_normRNASeq.to_csv('normRNASeq.csv', index=False)

df_normCNA = pd.DataFrame(normCNA)
df_normCNA.to_csv('normCNA.csv', index=False)