In [1]:
# Add the ARCH_package directory to the module search path so its modules can be imported
import sys
sys.path.append('../ARCH_package')
import filtering, plot

In [2]:
import dill
import os

# Load dataset and create directory paths

In [3]:
def _load_dill(filename):
    """Return the object deserialized with dill from the file at `filename`."""
    # NOTE(review): dill, like pickle, can execute arbitrary code while
    # loading -- only open export files that come from a trusted source.
    with open(filename, 'rb') as handle:
        return dill.load(handle)


# Synonymous trajectories, as exported by the basic.load module
syn = _load_dill('../Exports/LBC_synonymous.dill')

# Non-synonymous trajectories, as exported by the basic.load module
lbc = _load_dill('../Exports/LBC_non-synonymous.dill')

In [4]:
# Directory that receives every figure exported by this notebook.
results_dir = '../Results/NGF/'

# exist_ok=True turns the call into a no-op when the directory already
# exists, which removes the check-then-create race of testing
# os.path.exists() first. If the location exists but is not a directory
# this now fails immediately instead of at the first figure export.
os.makedirs(results_dir, exist_ok=True)

# Prefix prepended to every exported file name
# (e.g. '../Results/NGF/NGF-stack.svg').
path = results_dir + 'NGF-'

# Neutral growth filter
## Train a filter based on the gradient distribution of synonymous mutations

In [6]:
# Entry 18 of the non-synonymous cohort is used as the example participant.
part = lbc[18]

# Build the participant's profile figure from its own data and the
# synonymous trajectories, display it, then save it under the export prefix.
fig = plot.synonymous_profile(part, syn)
fig.show()
fig.write_image(f'{path}synonymous_profile.svg')

We use fluctuations from synonymous mutations to develop a neutral growth filter (NGF) to detect mutations that grow more than a neutral clone.

In [7]:
# Build the neutral growth filter from the non-synonymous cohort (lbc) and
# the synonymous trajectories (syn).
# NOTE(review): n_std=2 presumably sets the filter threshold in standard
# deviations -- confirm against filtering.neutral_filter.
cohort_neutral = filtering.neutral_filter(
    lbc,
    syn,
    n_std=2,
    mean_intercept=True,
)

# First coefficient of the fitted variance regression; this is the slope
# interpreted in the markdown below.
variance_regression = cohort_neutral.neutral_dist.var_regr
slope = variance_regression.coef_[0]
slope

0.00022727614924627953

In [7]:
# Display the coefficient array of the mean regression fitted by the filter.
cohort_neutral.neutral_dist.mean_regr.coef_

array([-0.3397679])

The slope of the linear relation between variance and VAF (variant allele frequency) can be approximated by the following ratio:

$$ \text{slope} = \frac{\lambda}{N},$$
where $N$ corresponds to the number of haematopoietic stem cells (HSCs) in an individual and $\lambda$ to the rate of self-renewing divisions (per year) during normal haematopoiesis.

* Self-renewal replication rate 
    - ~ 1.04 - 2.08/year (average of 1 every 40 weeks). Catlin - Abkowitz (Blood 2011)
    - ~ 0.6 - 6/year (symmetric self-renewal division every 2-20 months. Lee-Six, Nature 2018)
    - ~ 0.866/year (1 every 60 weeks. Dingli, Pacheco 2006)
* Active HSC estimate 
    - ~ 11,200 - 22,400 (Abkowitz Catlin, Blood 2002)
    - ~ 385 (Pacheco, Dingli 2006)
    - ~ 50,000  - 200,000 (Lee-Six, Nature 2018)
                   
1. Lee-Six estimates a slope of:
$$1 \times 10^{-5}$$

2. Abkowitz and Catlin's best estimate is 1 division every 40 weeks and 11,000 HSCs. These estimates predict a slope of:
$$\sim 1.2 \times 10^{-4}$$

3. Pacheco estimates a slope of:
$$2 \times 10^{-3}$$ 

4. Our estimate of the slope is:
$$2.3 \times 10^{-4}$$

In [8]:
# Assumed HSC self-renewal rate, in divisions per year.
# NOTE(review): presumably 1 division every 40 weeks (52 / 40 = 1.3) -- confirm.
SELF_RENEWAL_RATE = 1.3
cohort_neutral.rate = SELF_RENEWAL_RATE

# Rearranging slope = rate / N gives the number of HSCs: N = rate / slope.
cohort_neutral.HSC = cohort_neutral.rate / slope

message = f'We estimate {cohort_neutral.HSC} HSCs with a self-renewal rate of {cohort_neutral.rate} divisions/year'
print(message)

We estimate 5719.91387706636 HSCs with a self-renewal rate of 1.3 divisions/year


We can check the filter's performance on neutral trajectories:

In [9]:
# One entry per figure produced by the neutral filter, in display order:
# (index into neutral_dist.figures, exported file name, export width).
ngf_figure_exports = [
    (3, 'stack.svg', 600),                         # NGF stack plot
    (0, 'filter.svg', 1000),                       # Filter on synonymous plot
    (1, 'mean_regression.svg', 1000),              # Mean linear regression
    (2, 'Variance linear regression.svg', 1000),   # Variance linear regression
]

# Show each figure, then save it under the export prefix.
for figure_index, file_name, export_width in ngf_figure_exports:
    fig = cohort_neutral.neutral_dist.figures[figure_index]
    fig.show()
    fig.write_image(path + file_name, width=export_width)

## Plot participants profiles highlighting trajectories being selected by the NGF

In [10]:
# Plot participant NGF with variant class colors
part = lbc[18]
fig = plot.participant_filter(part)
fig.write_image(path + 'participant_filter_sample.svg')

fig.show()

# Cohort overview

In [11]:
# Gene counts bar plot
fig = cohort_neutral.gene_bar
fig.show()
fig.write_image(path + 'gene_bar.svg')

In [12]:
# Gradient plot stored on the fitted cohort object: display it, then save
# it as a PNG.
# NOTE(review): scale=10 presumably upsamples the raster export for
# print-quality resolution -- confirm.
fig = cohort_neutral.gradient_plot
fig.show()
fig.write_image(f'{path}gradient_inset.png', width=600, scale=10)

# Export the model class object for further training

In [13]:
# Serialize the fitted cohort object, including the rate and HSC attributes
# assigned above, so it can be reloaded outside this notebook.
with open('../Exports/cohort_neutral.dill', 'wb') as export_handle:
    dill.dump(cohort_neutral, export_handle)