In [47]:
import math
import numpy as np
import os
import pandas as pd
import pickle
import pystan

In [48]:
# Directory (relative path) that holds every input table read and every
# output file written by this notebook.
data_dir = '../../data'

In [49]:
pth = os.path.join(data_dir, 'pptc-pdx-clinical-web.txt')

pdx_clinical = pd.read_csv(pth, sep='\t')

# Only consider models that are part of the PPTC.
# Take an explicit copy: the rows below are edited through .loc, and writing
# into a boolean-filtered selection of another frame is what triggers
# pandas' SettingWithCopyWarning.
pdx_clinical = pdx_clinical[pdx_clinical['RNA.Part.of.PPTC'] == 'yes'].copy()

# Fix for the latest round of the paper: these models are relabelled from
# 'BCP-ALL' to 'Ph-likeALL'.
for model in ('ALL-102', 'ALL-105', 'ALL-115', 'PAKYEP'):
    is_model = pdx_clinical['Model'] == model

    # .item() raises unless exactly one row matches, so this also guards
    # against a missing or duplicated model before the label is overwritten.
    current = pdx_clinical.loc[is_model, 'Histology.Detailed'].item()
    assert current == 'BCP-ALL', (model, current)

    pdx_clinical.loc[is_model, 'Histology.Detailed'] = 'Ph-likeALL'

In [50]:
# Create Attributes File for TumorMap Analysis

pth = os.path.join(data_dir, 'tumormap-attr-2019-07-30-pdx.tsv')

# One row per distinct (model, detailed histology) pair, in the sorted key
# order produced by groupby.  The frame is built in a single call instead of
# being grown one row at a time, which re-allocates on every insertion.
model_histology_pairs = [
    key for key, _ in pdx_clinical.groupby(['Model', 'Histology.Detailed'])
]
attr = pd.DataFrame(model_histology_pairs, columns=['Model', 'Histology'])

attr.to_csv(pth, sep='\t', index=False)

In [52]:
# Load PDX expression data normalized to TPM.
# The file is tab-separated; its first column becomes the row index, and the
# remaining column labels are matched against PDX model names further down.
pth = os.path.join(data_dir, 'pdx-TPM-2019-02-15.tsv')
exp = pd.read_csv(pth, sep='\t', index_col=0)

In [53]:
# Report which PPTC models lack an expression column, then keep only the
# models that do have one.

pdxs = pdx_clinical['Model'].values
print('Started with ', len(pdxs))

# Names of models with no matching column, listed in clinical-table order
without_expression = [name for name in pdxs if name not in exp.columns]
for name in without_expression:
    print(name)

pdxs = [name for name in pdxs if name in exp.columns]
print('Ended with ', len(pdxs))

Started with  244
Ended with  244


In [54]:
# Remove genes that are unexpressed in 80% or more of the samples
# Remove genes that are in the bottom 20% for low variance
def expression_variance_filter(norm_df,
                               proportion_unexpressed=0.8,
                               variance_filter_level=0.2):
    """
    Select the genes that are both sufficiently expressed and sufficiently
    variable across samples.

    This function was taken from the UCSC Treehouse protocol pipeline:
    https://github.com/UCSC-Treehouse/protocol/blob/master/3_generate-thresholds.ipynb

    Parameters
    ----------
    norm_df : pandas.DataFrame
        Numeric expression matrix with one row per gene and one column per
        sample.
    proportion_unexpressed : float, optional
        A gene is kept only when the number of its values that are <= 0 is
        strictly smaller than ``proportion_unexpressed * n_columns``.
    variance_filter_level : float, optional
        Fraction (rounded up to a whole number of genes) of the remaining
        genes with the smallest standard deviation that is discarded.

    Returns
    -------
    pandas.Index
        Row labels of the retained genes, ordered from the largest to the
        smallest standard deviation.
    """
    max_ok_zeroes = len(norm_df.columns) * proportion_unexpressed

    # Count non-positive entries per gene in one vectorised pass rather than
    # calling a Python function once per row.
    unexpressed_counts = (norm_df <= 0.).sum(axis=1)
    expression_filtered_df = norm_df[unexpressed_counts < max_ok_zeroes]

    # Population standard deviation (ddof=0), matching numpy's np.std default.
    variance = expression_filtered_df.std(axis=1, ddof=0)

    cut_proportion = int(math.ceil(len(variance) * variance_filter_level))
    keep_proportion = len(variance) - cut_proportion

    # Genes that remain after dropping the least variable ones
    expr_var_filtered_genelist = variance.nlargest(keep_proportion)

    return expr_var_filtered_genelist.index

In [55]:
# NOTE: this cell rebinds `exp` to its transformed version, so running it a
# second time without reloading the raw matrix would apply log2 twice.

# Subset expression matrix to the PPTC PDXs
exp = exp[pdxs]

# Apply log2(TPM + 1) transformation
exp = np.log2(exp + 1.0)

# Filter genes that are mostly zero or have
# variance below the 20th percentile
filt_genes = expression_variance_filter(exp)
exp = exp.reindex(filt_genes)

# Center data: subtract each gene's (row's) mean from that row.
# Done as one vectorised subtraction instead of a Python call per row.
exp = exp.sub(exp.mean(axis=1), axis=0)

# Drop genes that contain any missing value
exp = exp.dropna()

pth = os.path.join(data_dir, 'tumormap-exp-2019-07-30-pdx.tsv')
exp.to_csv(pth, sep='\t')

# Last expression of the cell so that the (genes, samples) shape is displayed
exp.shape

In [58]:
# For every 'Histology.Priors' group, count the non-null entries in each
# remaining column of the clinical table.
pdx_clinical.groupby('Histology.Priors').count()

Unnamed: 0_level_0,Model,Phase,PersonID,DNA.Part.of.PPTC,RNA.Part.of.PPTC,Have.snp.file,snp.array.filename,snp.array.sample.ID,Have.maf,WES.human.bam.filename,...,Histology.Oncoprints2,STR.OK,Model.Type,Histology,Histology.Detailed,PI,Sex,Age,Reported_Ethnicity,Inferred_Ethnicity
Histology.Priors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BCP-ALL,37,37,37,37,37,37,36,36,37,37,...,37,37,37,37,37,37,37,37,36,36
Brain,48,48,48,48,48,48,44,44,48,32,...,48,48,48,48,48,48,48,42,44,44
Carcinoma,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
ETP-ALL,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
Ewing Sarcoma,9,9,9,9,9,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
Extracranial Rhabdoid,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
MLL-ALL,10,10,10,10,10,10,10,10,10,10,...,10,10,10,10,10,10,10,10,10,10
Neuroblastoma,33,33,33,33,33,33,33,33,33,33,...,33,33,33,33,33,33,33,28,33,33
Osteosarcoma,30,30,30,30,30,30,30,30,30,28,...,30,30,30,30,30,30,30,25,30,30
Other Sarcoma,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,1,3,3


In [59]:
# Print how many models fall under each
# (detailed histology, histology prior) combination.
grouped_by_pair = pdx_clinical.groupby(['Histology.Detailed', 'Histology.Priors'])
for pair, members in grouped_by_pair:
    print(pair, len(members))

('ASPS', 'Other Sarcoma') 2
('ATRT', 'Brain') 6
('Astrocytoma', 'Brain') 4
('BCP-ALL', 'BCP-ALL') 33
('CNS EFT-CIC', 'Brain') 2
('CNS embryonal NOS', 'Brain') 1
('CNS germinoma', 'Brain') 1
('Clear Cell Sarcoma', 'Other Sarcoma') 1
('Colon Carcinoma', 'Carcinoma') 1
('DIPG', 'Brain') 2
('ETMR', 'Brain') 3
('ETP-ALL', 'ETP-ALL') 6
('Ependymoblastoma', 'Brain') 1
('Ependymoma', 'Brain') 6
('Ewing Sarcoma', 'Ewing Sarcoma') 9
('Extracranial Rhabdoid', 'Extracranial Rhabdoid') 3
('Fusion+ RMS', 'Rhabdomyosarcoma') 6
('Fusion- RMS', 'Rhabdomyosarcoma') 6
('Glioblastoma', 'Brain') 4
('Hepatoblastoma', 'Renal') 1
('MLL-ALL', 'MLL-ALL') 10
('Medulloblastoma', 'Brain') 18
('Neuroblastoma', 'Neuroblastoma') 33
('Osteosarcoma', 'Osteosarcoma') 30
('Ph+-ALL', 'Ph+/like-ALL') 3
('Ph-likeALL', 'BCP-ALL') 4
('Ph-likeALL', 'Ph+/like-ALL') 15
('Small Cell Carcinoma', 'Carcinoma') 2
('T-ALL', 'T-ALL') 19
('Wilms', 'Renal') 12


In [60]:
# Broad tissue category paired with the specific diseases assigned to it.
# get_tissue scans the entries in this order and stops at the first match.
_TISSUE_GROUPS = (
    ('brain', ('ATRT',
               'Astrocytoma',
               'CNS germinoma',
               'DIPG',
               'Ependymoma',
               'ETMR',
               'Glioblastoma',
               'High-grade glioma',
               'Medulloblastoma',
               'PNET',
               'Anaplastic Rhabdoid Meningioma',
               'CNS EFT-CIC',
               'CNS embryonal NOS',
               'Ependymoblastoma')),
    ('t-cell ALL', ('ETP-ALL',
                    'T-ALL')),
    ('b-cell ALL', ('BCP-ALL',
                    'MLL-ALL',
                    'Ph+-ALL',
                    'Ph-likeALL')),
    ('bone', ('Ewing Sarcoma',
              'Osteosarcoma')),
    ('soft sarcoma', ('ASPS',
                      'Other Sarcoma',
                      'Rhabdoid',
                      'Rhabdomyosarcoma',
                      'Alveolar Rhabdomyosarcoma',
                      'Embryonal Rhabdomyosarcoma',
                      'Extracranial Rhabdoid',
                      'Fusion+ RMS',
                      'Fusion- RMS',
                      'Clear Cell Sarcoma')),
    ('renal', ('Wilms',
               'Other Renal')),
    ('neuroblastoma', ('Neuroblastoma',)),
    ('hepatoblastoma', ('Hepatoblastoma',)),
    ('carcinoma', ('Small Cell Carcinoma',
                   'Colon Carcinoma')),
)


def get_tissue(d):
    """
    Return the broad tissue category for a specific disease label.

    The authors report that adding a tissue-specific prior improves the
    identification of differentially expressed genes across PDXs; this
    lookup supplies the disease-to-tissue assignment.

    :param d: detailed disease / histology label
    :return: name of the tissue category the disease belongs to
    :raises ValueError: when the label is not assigned to any category
    """
    for tissue, diseases in _TISSUE_GROUPS:
        if d in diseases:
            return tissue

    raise ValueError(d)

In [61]:
# Match meta data and expression to each PDX.
# Produces `tmps`: one long-format frame per sample with a row per gene and
# the columns sample / tissue / disease / gene / gene_id / expression.
pptc_models = set(pdxs)            # set membership instead of a list scan per sample
gene_names = exp.index.values
n_genes = len(gene_names)
gene_ids = list(range(1, n_genes + 1))   # gene ids are numbered from 1

tmps = []
for p in exp.columns:
    if p not in pptc_models:
        raise ValueError('Expression column %r is not a PPTC PDX model' % (p,))

    # .item() raises unless exactly one clinical row matches this model
    disease = pdx_clinical.loc[pdx_clinical['Model'] == p, 'Histology.Detailed'].item()
    tissue = get_tissue(disease)

    # Build the frame in one step; scalar values are broadcast to every row.
    # The column is read positionally (exp[p].values) so that repeated gene
    # labels in the index cannot change the number of rows.
    sample_df = pd.DataFrame({'sample': p,
                              'tissue': tissue,
                              'disease': disease,
                              'gene': gene_names,
                              'gene_id': gene_ids,
                              'expression': exp[p].values},
                             columns=['sample',
                                      'tissue',
                                      'disease',
                                      'gene',
                                      'gene_id',
                                      'expression'],
                             index=range(n_genes))

    tmps.append(sample_df)

In [62]:
# Merge all of these features into one large dataframe.
# The per-sample frames are stacked row-wise (axis=0); ignore_index=True
# discards their individual row labels in favour of a fresh 0..N-1 index,
# and verify_integrity=True asks pandas to raise on duplicate index entries.
data = pd.concat(tmps,
                 axis=0,
                 verify_integrity=True,
                 ignore_index=True)

In [63]:
# Sanity check: within every gene, the expression values must average to
# (numerically) zero.
for gene_name, gene_rows in data.groupby('gene'):
    gene_mean = gene_rows['expression'].values.mean()
    assert np.isclose(gene_mean, [0])

In [64]:
# Stan only takes integer values, so give every disease and every tissue
# name an integer identifier, numbered from 1 in order of first appearance.
disease_map = {name: ident
               for ident, name in enumerate(data['disease'].unique(), start=1)}

tissue_map = {name: ident
              for ident, name in enumerate(data['tissue'].unique(), start=1)}

In [65]:
# Attach the integer identifiers as '<label>_id' columns next to the
# corresponding name columns.
for label, name_to_id in (('disease', disease_map), ('tissue', tissue_map)):
    data[label + '_id'] = data[label].map(name_to_id)

In [66]:
# Instead of using the tissue_id, stan uses
# the disease to tissue map: entry k holds the tissue_id of the k-th
# smallest disease_id (first tissue_id seen for that disease).
disease_tissue_map = [
    data.loc[data['disease_id'] == disease_id, 'tissue_id'].unique()[0]
    for disease_id in sorted(data['disease_id'].unique())
]

In [67]:
# Data dictionary handed to Stan.
#   N / G / T / D : number of rows and of distinct genes, tissues, diseases
#   genes, diseases, y : per-row gene id, disease id and expression value
#   tissues : tissue id per disease (the disease-to-tissue map), not per row
stan_d = dict(
    N=len(data),
    G=data['gene_id'].nunique(dropna=False),
    T=data['tissue_id'].nunique(dropna=False),
    D=data['disease_id'].nunique(dropna=False),
    genes=data['gene_id'].values,
    tissues=disease_tissue_map,
    diseases=data['disease_id'].values,
    y=data['expression'].values,
)

In [68]:
# Write the Stan data dictionary to disk with pystan's R-dump writer so the
# model can be run from this file.
pth = os.path.join(data_dir, 'stan-data-v8-2019-07-30.dump')
pystan.stan_rdump(stan_d, pth)

In [69]:
# Save the mapping to the original disease, tissue, and gene names
# (integer id -> name), so Stan's integer-coded results can be translated back.
pth = os.path.join(data_dir, 'stan-data-v8-map-2019-07-30.pkl')
inv_disease_map = {v: k for k, v in disease_map.items()}
inv_tissue_map = {v: k for k, v in tissue_map.items()}

# Pair each gene_id with the gene name found on the same row.  Zipping two
# separate unique() calls would shift every later pair if a gene name were
# ever repeated under different ids.
gene_pairs = data[['gene_id', 'gene']].drop_duplicates()
assert gene_pairs['gene_id'].is_unique, 'a gene_id maps to more than one gene'
gene_mapper = dict(zip(gene_pairs['gene_id'].tolist(),
                       gene_pairs['gene'].tolist()))

maps = {'tissue': inv_tissue_map,
        'disease': inv_disease_map,
        'gene': gene_mapper}

with open(pth, 'wb') as f:
    pickle.dump(maps, f)

In [70]:
# Display the integer id -> disease name lookup that was saved above
inv_disease_map

{1: 'BCP-ALL',
 2: 'MLL-ALL',
 3: 'Ph+-ALL',
 4: 'T-ALL',
 5: 'Ph-likeALL',
 6: 'ASPS',
 7: 'CNS embryonal NOS',
 8: 'ATRT',
 9: 'Ewing Sarcoma',
 10: 'Neuroblastoma',
 11: 'ETP-ALL',
 12: 'Extracranial Rhabdoid',
 13: 'ETMR',
 14: 'DIPG',
 15: 'Glioblastoma',
 16: 'Ependymoma',
 17: 'Ependymoblastoma',
 18: 'CNS EFT-CIC',
 19: 'Astrocytoma',
 20: 'CNS germinoma',
 21: 'Medulloblastoma',
 22: 'Fusion- RMS',
 23: 'Wilms',
 24: 'Fusion+ RMS',
 25: 'Small Cell Carcinoma',
 26: 'Colon Carcinoma',
 27: 'Clear Cell Sarcoma',
 28: 'Hepatoblastoma',
 29: 'Osteosarcoma'}

In [71]:
# Every sample must list the same genes, with the same gene_id values, in
# the same order.  The first sample encountered serves as the reference
# against which all the others are compared.
reference = None
for sample_name, sample_rows in data.groupby('sample'):
    if reference is not None:
        assert (sample_rows['gene'].values == reference['gene'].values).all()
        assert (sample_rows['gene_id'].values == reference['gene_id'].values).all()
    else:
        reference = sample_rows