# Process TCGA FPKM for clustering
## Input: 
1. ../data/ProbeAnnotations_NS_IO_360_v1.0_clean.csv
2. ../data/fpkm_TCGA_CCL3L1.csv
3. ../data/IDsTCGA.xlsx
4. ../data/TCGA_HCC_no_gene_expression.csv
5. ../data/TCGA_HCC_no_slide.csv
6. ../data/TCGA_samples.csv


## Output:
1. ../results/tcga/fpkm_final_raw_CCL3L1_336.csv
2. ../results/tcga/fpkm_final_add1-log2-zscore_336.csv

In [1]:
import os
import pandas as pd
import numpy as np
from scipy import stats
import csv
# !pip install openpyxl # even pip installed xlrd dependency, still XLRDError: Excel xlsx file; not supported

# Part Data
## Load and visualize data

## Gene-related files

In [2]:
# Nanostring panel: gene list
nanostring = pd.read_csv("../data/ProbeAnnotations_NS_IO_360_v1.0_clean.csv", sep="\t")
print(nanostring.shape) # (784, 12)
display(nanostring.head(5))

(784, 12)


Unnamed: 0,ProbeID,Codeset.Name,Probe.Label,Analyte.Type,Is.Control,Control.Type,Related.Probes,Probe.Annotation,KEGG.Pathways,Cell.Type,Official.Gene.Name,Control.Conc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
0,NM_000077.4:1052,NS_IO_360_V1.0,CDKN2A,mRNA,False,,,Cell Proliferation;Metabolic Stress,hsa04110;hsa04115;hsa05166;hsa05200;hsa05203;h...,,CDKN2A,;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;...
1,NM_004120.4:1744,NS_IO_360_V1.0,GBP2,mRNA,False,,,Interferon Signaling,,,GBP2,;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;...
2,NM_138636.4:2210,NS_IO_360_V1.0,TLR8,mRNA,False,,,Myeloid Compartment,hsa04620,,TLR8,;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;...
3,NM_001146055.1:480,NS_IO_360_V1.0,SNCA,mRNA,False,,,,hsa05010;hsa05012,,SNCA,;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;...
4,NM_001165.4:6567,NS_IO_360_V1.0,BIRC3,mRNA,False,,,Apoptosis;NF-kappaB Signaling,hsa04064;hsa04120;hsa04210;hsa04510;hsa04621;h...,,BIRC3,;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;...


#### Genes: Nanostring 784 --> FPKM 754
- +1. CCL3 included by mistake. 
- -14. Missing during normalization
- -14. Artificial probes
- -3. CD45s (variants of PTPRC)

#### Load full fpkm data --> (754, 424)

In [3]:
data_full = pd.read_csv('../data/fpkm_TCGA_CCL3L1.csv', sep=',', index_col=0)
print(data_full.shape) # (754, 424)
display(data_full.head(5))

(754, 424)


Unnamed: 0,AACO-01A,5263-01A,5261-01A,A5UC-01A,A4NE-01A,A6M5-01A,A1EH-01A,A8O5-01A,5264-01A,A3I0-11A,...,A39V-01A,A112-01A,A9CY-01A,A39Z-11A,A3M9-01A,A114-11A,A9HB-01A,AADR-01A,AAUZ-01A,A75C-01A
A2M,960.89033,8.063961,351.811349,1609.876721,96.720287,1994.566655,468.53028,2074.350943,26.585468,1150.658889,...,270.085116,34.286241,797.329657,1629.106472,30.443332,1322.322173,481.80338,201.065289,816.703134,733.93431
ABCF1,68.348111,11.610049,15.231046,22.682212,24.680961,10.575817,17.701322,27.142904,7.101905,32.287054,...,30.956738,14.220006,13.805224,22.741792,12.237185,11.513642,11.783001,19.483708,34.192999,11.003601
ACVR1C,0.185683,0.076367,0.028349,0.011091,0.089753,0.212977,0.009394,0.042833,0.01666,1.006738,...,0.185369,0.022324,0.330558,0.706112,0.008248,0.387284,0.08748,0.509742,0.251584,0.065433
ADAM12,0.04329,0.117043,0.424874,0.220646,0.398002,2.08055,0.058136,0.021398,0.048752,0.012036,...,0.020337,0.407111,0.087382,0.152428,1.300681,0.05585,0.083316,0.051072,0.043991,0.063461
ADM,4.200867,12.315492,0.774076,2.088622,1.877584,2.108351,2.311229,2.463347,2.33799,75.254975,...,18.702486,0.641932,2.27178,11.744819,21.226002,4.173679,2.178665,6.627051,6.081892,14.884325


## Sample-related files

In [4]:
# Load real HCC sample list
df_hcc = pd.read_excel('../data/IDsTCGA.xlsx', engine='openpyxl')
print(df_hcc.shape) # (358, 1)
display(df_hcc.head(5))

(358, 1)


Unnamed: 0,ID
0,TCGA-2V-A95S-01
1,TCGA-2Y-A9GS-01
2,TCGA-2Y-A9GT-01
3,TCGA-2Y-A9GU-01
4,TCGA-2Y-A9GV-01


In [5]:
# Load samples that have no mRNA
df_no_mrna = pd.read_csv("../data/TCGA_HCC_no_gene_expression.csv", sep='\t', header=None)
print(df_no_mrna.shape) # (6, 1)
display(df_no_mrna.head(5))

(6, 1)


Unnamed: 0,0
0,TCGA-DD-A3A0-01
1,TCGA-DD-AACM-01
2,TCGA-G3-A25W-01
3,TCGA-DD-AADE-01
4,TCGA-DD-A1E9-01


In [6]:
# Load sample list with no slides
df_no_slide = pd.read_csv("../data/TCGA_HCC_no_slide.csv", sep='\t', header=None)
print(df_no_slide.shape) # (11, 1)
display(df_no_slide.head(5))

# Extract sample id in format 'A2HS-01'
set_sample_no_slide = set(i[8:] for i in df_no_slide[0])

(11, 1)


Unnamed: 0,0
0,TCGA-3K-AAZ8-01
1,TCGA-BD-A2L6-01
2,TCGA-BD-A3EP-01
3,TCGA-BD-A3ER-01
4,TCGA-ES-A2HS-01


# Part Pre-processing
## Genes filtering
#### Name index as 'Gene Name'

In [7]:
data_full.index.name='Gene Name'
print(data_full.shape) # (754, 424)
display(data_full.head(3))

(754, 424)


Unnamed: 0_level_0,AACO-01A,5263-01A,5261-01A,A5UC-01A,A4NE-01A,A6M5-01A,A1EH-01A,A8O5-01A,5264-01A,A3I0-11A,...,A39V-01A,A112-01A,A9CY-01A,A39Z-11A,A3M9-01A,A114-11A,A9HB-01A,AADR-01A,AAUZ-01A,A75C-01A
Gene Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A2M,960.89033,8.063961,351.811349,1609.876721,96.720287,1994.566655,468.53028,2074.350943,26.585468,1150.658889,...,270.085116,34.286241,797.329657,1629.106472,30.443332,1322.322173,481.80338,201.065289,816.703134,733.93431
ABCF1,68.348111,11.610049,15.231046,22.682212,24.680961,10.575817,17.701322,27.142904,7.101905,32.287054,...,30.956738,14.220006,13.805224,22.741792,12.237185,11.513642,11.783001,19.483708,34.192999,11.003601
ACVR1C,0.185683,0.076367,0.028349,0.011091,0.089753,0.212977,0.009394,0.042833,0.01666,1.006738,...,0.185369,0.022324,0.330558,0.706112,0.008248,0.387284,0.08748,0.509742,0.251584,0.065433


#### Drop an extra gene (CCL3) --> (753, 424)
##### Not included in NanoString but accidently included in FPKM

In [8]:
data_full.drop("CCL3", inplace=True)
print(data_full.shape) # (753, 424)

(753, 424)


#### Drop housekeeping genes --> (734, 424)
##### Drop 19 (1 missing)

In [9]:
name_hk = set(nanostring[nanostring["Control.Type"]=="Housekeeping"]["Probe.Label"].values) 
print("Number of housekeeping genes in NanoString:") # 20
print(str(len(name_hk)))
print("Housekeeping gene names:")
print(str(name_hk))
print()
print("Missing housekeeping genes:")
print((name_hk - set(data_full.index))) # NRDE2 (aka. C14orf102) not found in the FPKM
name_hk = name_hk - ((name_hk - set(data_full.index)))
print("Number of housekeeping genes to be dropped:")
print(str(len(name_hk))) # 19

Number of housekeeping genes in NanoString:
20
Housekeeping gene names:
{'PUM1', 'TBP', 'UBB', 'TLK2', 'ERCC3', 'TMUB2', 'POLR2A', 'SDHA', 'DNAJC14', 'TFRC', 'TBC1D10B', 'GUSB', 'STK11IP', 'PSMC4', 'SF3A1', 'NRDE2', 'OAZ1', 'MRPL19', 'ABCF1', 'G6PD'}

Missing housekeeping genes:
{'NRDE2'}
Number of housekeeping genes to be dropped:
19


In [10]:
data_full.drop(name_hk, inplace=True)
print(data_full.shape) # (734, 424)
display(data_full.head(5))

(734, 424)


Unnamed: 0_level_0,AACO-01A,5263-01A,5261-01A,A5UC-01A,A4NE-01A,A6M5-01A,A1EH-01A,A8O5-01A,5264-01A,A3I0-11A,...,A39V-01A,A112-01A,A9CY-01A,A39Z-11A,A3M9-01A,A114-11A,A9HB-01A,AADR-01A,AAUZ-01A,A75C-01A
Gene Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A2M,960.89033,8.063961,351.811349,1609.876721,96.720287,1994.566655,468.53028,2074.350943,26.585468,1150.658889,...,270.085116,34.286241,797.329657,1629.106472,30.443332,1322.322173,481.80338,201.065289,816.703134,733.93431
ACVR1C,0.185683,0.076367,0.028349,0.011091,0.089753,0.212977,0.009394,0.042833,0.01666,1.006738,...,0.185369,0.022324,0.330558,0.706112,0.008248,0.387284,0.08748,0.509742,0.251584,0.065433
ADAM12,0.04329,0.117043,0.424874,0.220646,0.398002,2.08055,0.058136,0.021398,0.048752,0.012036,...,0.020337,0.407111,0.087382,0.152428,1.300681,0.05585,0.083316,0.051072,0.043991,0.063461
ADM,4.200867,12.315492,0.774076,2.088622,1.877584,2.108351,2.311229,2.463347,2.33799,75.254975,...,18.702486,0.641932,2.27178,11.744819,21.226002,4.173679,2.178665,6.627051,6.081892,14.884325
ADORA2A,2.901246,0.634927,1.887203,1.87275,1.417934,1.370511,1.103044,1.623636,0.210387,1.11801,...,2.21186,0.582018,0.86759,1.49111,1.521162,1.103204,1.04491,1.043928,1.770068,1.102014


## Sample filtering
### Filter samples by 2 conditions
1. Real HCCs
2. Have related slides

### Drop samples only with poor-quality slides
- Optional but used in current workflow

#### Filter real HCC samples --> (734, 351)

In [11]:
# Check duplicates in hcc ref
print("Duplicates:")
display(df_hcc[df_hcc.ID.duplicated(keep=False)])

Duplicates:


Unnamed: 0,ID
227,TCGA-DD-AAW2-01
228,TCGA-DD-AAW2-01


In [12]:
set_samples_hcc = set(i[8:] for i in df_hcc.ID) # Items in FPKM have no "TCGA-XX-"
print("Number of real HCC samples:")
print(len(set_samples_hcc))

Number of real HCC samples:
357


In [14]:
# drop not HCC samples
print(data_full.shape) # (734, 424)
dfObj = pd.DataFrame()
# A set is an unordered collection, so sample order changes and thus results in rounding error after zscore!
# (Optional) If you want to reproduce the exact same results, load the sample list to reorder the dataframe
# after the step "Drop samples with only poor-quality slide".
for sampleid in set_samples_hcc: 
    dfObj = pd.concat([dfObj,data_full.loc[:, data_full.columns.str.startswith(sampleid)]], axis=1)
data_full = dfObj
del(dfObj)
print(data_full.shape) # (734, 351)
display(data_full.head(5))

(734, 351)
(734, 351)


Unnamed: 0_level_0,A6M6-01A,AA46-01A,A39Z-01A,A75H-01A,A1EJ-01A,A9DB-01A,A118-01A,5262-01A,A7MF-01A,AACW-01A,...,A7IE-01A,A9GW-01A,AADS-01A,A7K0-01A,AACU-01A,5261-01A,A9G7-01A,A11A-01A,A39X-01A,A9GY-01A
Gene Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A2M,146.120702,222.300123,4250.893244,100.522376,120.878435,258.887679,329.62408,424.89807,400.125599,208.190962,...,396.114779,244.983896,310.137116,90.913546,117.694033,351.811349,35.042571,71.977083,297.247587,41.190644
ACVR1C,0.01895,0.345253,0.127787,0.148953,0.011042,0.121135,0.08493,0.035937,0.019061,0.082971,...,0.011733,0.053547,0.16556,0.124258,0.067015,0.028349,0.062943,0.246415,0.014357,0.024878
ADAM12,1.397431,0.027654,0.037917,0.023151,0.010297,0.195268,0.108333,0.513873,0.015554,0.039793,...,0.16139,0.377293,0.0,0.038626,0.06085,0.424874,0.187834,0.027288,0.024864,0.420049
ADM,12.915171,13.328006,6.209435,9.490293,6.142268,3.816987,1.357324,4.998112,4.280159,1.194942,...,4.872289,2.445533,6.861121,8.608716,2.595243,0.774076,4.343928,2.672804,1.408484,11.322187
ADORA2A,1.569534,0.427331,1.337454,1.259919,0.94552,0.77795,1.937381,2.237277,1.112935,1.185278,...,1.916869,1.409112,3.267579,1.124022,0.785072,1.887203,1.269886,0.476204,1.223346,2.653176


##### Validate not include the 6 samples without mRNA

In [15]:
# Validate no overlapping with samples that have no mRNA
set_sample_no_mrna = set(i[8:] for i in df_no_mrna[0])
assert len(set(i[:-1] for i in data_full.columns.values).intersection(set_sample_no_mrna)) == 0
del(df_no_mrna,set_sample_no_mrna)

#### Drop samples without related slides --> (734, 340)

In [16]:
# drop no slide samples
print(data_full.shape) # (734, 351)
unwanted_sample=[]
for sampleid in set_sample_no_slide:
    unwanted_sample.append(data_full.columns[data_full.columns.str.startswith(sampleid)].values[0])
data_full.drop(unwanted_sample, axis=1, inplace=True)
print(data_full.shape) # (734, 340)
del(df_no_slide, set_sample_no_slide, unwanted_sample)

(734, 351)
(734, 340)


#### (Optional) Drop samples with only poor-quality slides --> (734, 336)

In [17]:
# drop 4 samples: "A216-01A","A8YO-01A", "AACY-01A", "A5RF-01A"
print(data_full.shape) # (734, 340)
poorquality_sample = ["A216-01A","A8YO-01A", "AACY-01A", "A5RF-01A"]
data_full.drop(poorquality_sample, axis=1, inplace=True)
print(data_full.shape) # (734, 336)
del(poorquality_sample)

(734, 340)
(734, 336)


#### (Optional) If you wanted to reproduce the exactly same results, the sample order has to be the same. Otherwise, there will be a rounding error after zscore. 
###### And this may affect the sample order in the clustering dendrogram. Although it may not change the cluster assigned to each sample, the sample order is the order of processing slides taken by the model, and will finally change the trained model.

In [18]:
# Load sample list of previous processing
sample_list = pd.read_csv('../data/TCGA_samples.csv', header=None)[0].values.tolist()
print(len(sample_list))
print(sample_list[:3])

336
['A1EJ-01A', 'A116-01A', 'AAD5-01A']


In [19]:
# Reorder the columns in the dataframe
data_full = data_full[sample_list]
print(data_full.shape) # (734, 336)
display(data_full.head(5))

(734, 336)


Unnamed: 0_level_0,A1EJ-01A,A116-01A,AAD5-01A,A66X-01A,A2KB-01A,A7PY-01A,A3JL-01A,A3A7-01A,AAEB-01A,A3MA-01A,...,A7II-01A,A5NP-01A,AAD6-01A,A8LF-01A,A75E-01A,A6M6-01A,A10Y-01A,A6GG-01A,A4ND-01A,A9CD-01A
Gene Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A2M,120.878435,70.594743,92.149114,582.100406,91.391688,2116.915293,195.693648,248.361142,149.607817,41.925908,...,255.234319,157.75497,552.443357,84.174382,749.379768,146.120702,33.54894,350.789615,88.221089,462.848896
ACVR1C,0.011042,0.089073,0.008587,0.009242,0.019567,0.011448,0.095005,0.509921,0.086818,0.01356,...,0.01242,0.006459,0.404416,0.189459,0.125112,0.01895,0.059387,0.530567,0.055671,0.076745
ADAM12,0.010297,0.064079,0.740129,1.199091,0.022158,0.016013,0.053159,0.456508,0.080962,0.615186,...,0.201539,0.016866,0.060829,0.117103,0.182435,1.397431,0.013031,0.032743,0.383264,0.133803
ADM,6.142268,8.471828,13.881182,1.141214,4.146708,1.529741,1.689659,12.351448,57.859831,5.531999,...,2.050489,1.464072,9.444584,9.070045,3.870842,12.915171,2.107564,4.388291,6.015631,14.742393
ADORA2A,0.94552,1.478947,0.094151,1.760734,0.481203,1.870259,1.500053,3.019209,1.65908,0.67871,...,0.759918,1.121857,2.117022,2.07253,1.506499,1.569534,0.564344,0.834129,1.195694,1.595166


# Part Data exploration
## Count missing values

In [20]:
# Missing elements
print("Number of NaN or missing data: " + str(data_full.isnull().sum().sum())) # 0, should not be any missing data
print("Number of 0: " + str((data_full==0.0).sum().sum())) # 10428 for 340, 10346 for 336, 0 doesn't mean missing

Number of NaN or missing data: 0
Number of 0: 10346


In [21]:
# Missing names of row or col
print("Missing values in gene name: " + str(data_full.index.isna().sum())) # 0 
print("Missing values in sample id: " + str(data_full.columns.isna().sum())) # 0

Missing values in gene name: 0
Missing values in sample id: 0


In [22]:
# How many all-zero rows and their indices
# print("All-zero rows: " + str(((data_full.T==0.0).sum()==340).sum())) # 1, normal. Not expressed at all.
# display(data_full.iloc[np.where((data_full.T==0.0).sum()==340)]) # also all-zero in fpkm_TCGA
print("All-zero rows: " + str(((data_full.T==0.0).sum()==336).sum())) # 1, normal. Not expressed at all.
display(data_full.iloc[np.where((data_full.T==0.0).sum()==336)]) # DEFB134

# How many all-zero cols
print("All-zero cols: " + str(((data_full==0.0).sum()==734).sum())) # 0

All-zero rows: 1


Unnamed: 0_level_0,A1EJ-01A,A116-01A,AAD5-01A,A66X-01A,A2KB-01A,A7PY-01A,A3JL-01A,A3A7-01A,AAEB-01A,A3MA-01A,...,A7II-01A,A5NP-01A,AAD6-01A,A8LF-01A,A75E-01A,A6M6-01A,A10Y-01A,A6GG-01A,A4ND-01A,A9CD-01A
Gene Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DEFB134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


All-zero cols: 0


#### Remove the all-zero (not expressed) gene --> (733, 336/340)

In [23]:
# Otherwise there will be problem when performing Z-Score and scaling
data_full.drop(labels='DEFB134', inplace=True)
data_full.shape # (733, 336/340)

(733, 336)

In [24]:
# check data min (to see if need to add 1 before zscore)
display(data_full.T.describe())
print("Minimum of the data: "+str(data_full.min().min())) # 0

Gene Name,A2M,ACVR1C,ADAM12,ADM,ADORA2A,AKT1,ALDOA,ALDOC,ANGPT1,ANGPT2,...,WNT4,WNT5A,WNT5B,WNT7B,XCL1,ZAP70,ZC3H12A,ZEB1,ZEB2,CCL3L1
count,336.0,336.0,336.0,336.0,336.0,336.0,336.0,336.0,336.0,336.0,...,336.0,336.0,336.0,336.0,336.0,336.0,336.0,336.0,336.0,336.0
mean,475.948888,0.162166,0.346709,6.35956,1.437697,10.053008,69.65238,27.314263,0.384149,1.043441,...,1.558116,1.184296,1.743884,0.186054,0.489998,0.776322,4.445453,1.816098,0.515522,1.069744
std,689.382669,0.196429,1.800181,9.160999,1.057646,5.556579,94.705265,31.457206,0.382167,1.090865,...,3.472694,1.664764,3.156275,1.23324,0.797918,1.162557,4.981874,0.975123,0.628903,2.907913
min,8.063961,0.0,0.0,0.641932,0.094151,1.596935,7.803906,0.149045,0.0,0.069319,...,0.0,0.0,0.00688,0.0,0.0,0.015525,0.391197,0.37795,0.056094,0.0
25%,89.630523,0.036036,0.027462,2.248501,0.776934,6.260545,28.448874,6.539468,0.126075,0.396476,...,0.042986,0.171825,0.167828,0.0,0.074706,0.178045,1.632519,1.129421,0.220969,0.169138
50%,248.338199,0.088117,0.066169,3.782491,1.129963,8.904925,46.4214,16.662141,0.281967,0.724156,...,0.255015,0.695436,0.662919,0.009534,0.196791,0.398455,2.67218,1.691145,0.353231,0.45637
75%,516.908848,0.224736,0.188796,6.560203,1.797293,12.900811,78.837317,35.036506,0.527008,1.270555,...,1.361565,1.491462,1.736394,0.038669,0.581974,0.901772,5.084525,2.267876,0.612979,0.956998
max,5167.410193,1.263938,25.872119,103.768361,8.900005,41.633061,965.402045,185.567247,3.237895,8.359021,...,24.04155,15.61577,22.036023,20.405032,6.869837,12.190879,38.911911,6.666682,7.559379,44.230049


Minimum of the data: 0.0


## Export pre-processed data

In [25]:
data_full.to_csv('../results/tcga/fpkm_final_raw_CCL3L1_{}.csv'.format(data_full.shape[1]), index=True, sep='\t')
print(data_full.shape)
display(data_full.head(3))

(733, 336)


Unnamed: 0_level_0,A1EJ-01A,A116-01A,AAD5-01A,A66X-01A,A2KB-01A,A7PY-01A,A3JL-01A,A3A7-01A,AAEB-01A,A3MA-01A,...,A7II-01A,A5NP-01A,AAD6-01A,A8LF-01A,A75E-01A,A6M6-01A,A10Y-01A,A6GG-01A,A4ND-01A,A9CD-01A
Gene Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A2M,120.878435,70.594743,92.149114,582.100406,91.391688,2116.915293,195.693648,248.361142,149.607817,41.925908,...,255.234319,157.75497,552.443357,84.174382,749.379768,146.120702,33.54894,350.789615,88.221089,462.848896
ACVR1C,0.011042,0.089073,0.008587,0.009242,0.019567,0.011448,0.095005,0.509921,0.086818,0.01356,...,0.01242,0.006459,0.404416,0.189459,0.125112,0.01895,0.059387,0.530567,0.055671,0.076745
ADAM12,0.010297,0.064079,0.740129,1.199091,0.022158,0.016013,0.053159,0.456508,0.080962,0.615186,...,0.201539,0.016866,0.060829,0.117103,0.182435,1.397431,0.013031,0.032743,0.383264,0.133803


# Part Processing
- +1
- log2
- zscore per gene

In [26]:
def transform_data(df, centering):
    data = 1 + df
    data = np.log2(data)
    
    if centering == "zscore":
        data = pd.DataFrame(stats.zscore(data, axis=1), columns=data.columns, index=data.index)
    elif centering == "median":
        row_medians = np.median(data, axis=1)
        row_medians_col_vec = row_medians.reshape((row_medians.shape[0], 1))
        data = pd.DataFrame(data-row_medians_col_vec, columns=data.columns, index=data.index)

    return data

## Export processed data

In [27]:
transform_data(data_full, centering="zscore").to_csv('../results/tcga/fpkm_final_add1-log2-zscore_{}.csv'.format(data_full.shape[1]), index=True, sep='\t')