In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the gene-expression table and index its rows by the 'gene symbol' column.
# The original note described this as 46 Prolaris genes in all 498 TCGA PRAD patients.
# NOTE(review): the recorded `gexp.shape` output is (16, 498), i.e. 16 rows, not 46 --
# confirm which gene list 'PRAD_genes_list3a.csv' actually holds.
gexp = pd.read_csv('PRAD_genes_list3a.csv').set_index('gene symbol')

In [3]:
# Sanity check: (number of rows, number of columns) of the table indexed by gene symbol.
gexp.shape

(16, 498)

In [4]:
# Element-wise log2(x + 1) transform; the +1 maps a value of 0 to 0 instead of -inf.
# NOTE: this rebinds `gexp`, so re-running this cell without reloading the CSV
# applies the transform a second time.
gexp = np.log2(gexp.add(1))

In [5]:
# Load the clinical table used to determine the class labels.
# Class rules, as implemented by the selection cell further down:
#   class 1 (high risk):   PFI.1 == 1 and PFI.time.1 <  cutoff1
#   class 2 (medium risk): PFI.1 == 1 and cutoff1 <= PFI.time.1 < cutoff2
#   class 3 (low risk):    PFI.time.1 >= cutoff2 (PFI.1 is not checked)
df = pd.read_csv('PRAD_clinical_pfi_os.csv').set_index('full_bar_code')

In [6]:
# Keep only the two rows needed for labelling. In this table the row labels are
# clinical fields ('PFI.1', 'PFI.time.1') and the columns are the samples.
pfi = df.loc[['PFI.1', 'PFI.time.1'], :]

In [7]:
# Class boundaries applied to PFI.time.1: 2 * 365 and 5 * 365
# (presumably days, i.e. 2 and 5 years -- confirm the unit of PFI.time.1).
cutoff1 = 2 * 365
cutoff2 = 5 * 365

In [8]:
# Partition the samples (columns of `pfi`) into three lists of column labels.
# Classes 1 and 2 both require PFI.1 == 1; class 3 depends on PFI.time.1 only.
# Samples with PFI.1 != 1 and PFI.time.1 < cutoff2 end up in none of the lists.
class1 = pfi.columns[
    pfi.loc['PFI.time.1'].lt(cutoff1)
    & pfi.loc['PFI.1'].eq(1)
].tolist()
class2 = pfi.columns[
    pfi.loc['PFI.time.1'].ge(cutoff1)
    & pfi.loc['PFI.time.1'].lt(cutoff2)
    & pfi.loc['PFI.1'].eq(1)
].tolist()
class3 = pfi.columns[
    pfi.loc['PFI.time.1'].ge(cutoff2)
].tolist()

In [9]:
# Number of samples per class, in the order [class 1, class 2, class 3].
[len(members) for members in (class1, class2, class3)]

[52, 30, 63]

In [10]:
# Build the int8 label vector: 1 repeated len(class1) times, then 2 repeated
# len(class2) times, then 3 repeated len(class3) times.
label = np.repeat(
    np.array([1, 2, 3], dtype=np.int8),
    [len(class1), len(class2), len(class3)],
)

In [11]:
# Display the label vector; entries are ordered class 1, then class 2, then class 3.
label

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], dtype=int8)

In [12]:
# Sample order must match `label`: class 1 samples, then class 2, then class 3.
samples = [*class1, *class2, *class3]
# Select those samples' columns from the expression table, then append the class
# labels as one extra row named 'label' (assigned positionally, column by column).
gexptest = gexp.loc[:, samples]
gexptest.loc['label'] = label

In [13]:
# Write the selected expression columns, including the appended 'label' row, to CSV.
gexptest.to_csv('gexp3deciphe1.csv')