# Large Population: Neutral *vs* Selection — HIGH MUTATION RATE

# SIMULATION NOTEBOOK

Due to the size of the populations under the high mutation rate, the simulations are carried out in this separate notebook. The necessary data is exported and the analysis is carried out in a separate analysis notebook.

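**WARNING:** the simulations alone take 1-2 hours to complete; in the recorded run below, extracting the final data (about 40 minutes) and sampling (about 3 hours) added several more hours.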

The parameters are summarized in the table below:

Parameter  | Neutral | Selection  |
-----------|---------|------------|
Final size | $10^8$  |   $10^8$   |
q (start)  | **0.1** | **0.1**|
q-factor   | $1$   |   $1$    |  
P(death)   | $0.3$   |   $0.3$    |
Selection  | $0$   |  $0.3$   |  

In [3]:
import os
import sys
# Make the directory two levels above the current working directory importable,
# so that the ThesisScripts imports that follow can be resolved (presumably the
# package lives there — verify against the repository layout).
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:  # do not append a duplicate entry when the cell is re-run
    sys.path.append(module_path)

from ThesisScripts.MyModel import *
from ThesisScripts.Analyses import *
from ThesisScripts.Visualizations import *
from ThesisScripts.PhyloTree import *

from IPython.display import Image

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("whitegrid")

# matplotlib preamble, taken from http://blog.juliusschulz.de/blog/ultimate-ipython-notebook
from IPython.display import set_matplotlib_formats

# Ask the inline backend for both PDF and PNG renderings of each figure.
# NOTE(review): recent IPython releases reportedly deprecate this function in
# favour of matplotlib_inline.backend_inline.set_matplotlib_formats — confirm
# before upgrading the environment.
set_matplotlib_formats('pdf', 'png')

# Notebook-wide figure defaults, applied in a single call (values unchanged).
plt.rcParams.update({
    'savefig.dpi': 75,
    'figure.autolayout': False,
    'figure.figsize': (14, 7),
    'axes.labelsize': 18,
    'axes.titlesize': 20,
    'font.size': 16,
    'lines.linewidth': 2.0,
    'lines.markersize': 8,
    'legend.fontsize': 14,
})

In [2]:
# List the contents of the Figures/ directory.
%ls Figures/

[1m[34mLarge-HIGH_DEATH-Neutral_vs_Selection-SAMPLED[m[m/
[1m[34mLarge-HIGH_MUTRATE-Neutral_vs_Selection-SAMPLED[m[m/
[1m[34mLarge-MUTATOR-Neutral_vs_Selection-SAMPLED[m[m/
[1m[34mLarge-Neutral_vs_Selection-SAMPLED[m[m/


# Simulations

## Neutral

In [3]:
%%time
# set seed to get same result for each test simulation
np.random.seed(123)

l = 10**8      # size limit
q = 10**(-1)   # mutation rate
qfac = 1       # mutation-rate increasing factor (here: no Mutator phenotype)
alpha = 0.3    # death rate
sel = 0        # selection factor
large_N = Population(l, q, mutfactor=qfac, death_frac=alpha, selection=sel)
large_N.simulate()
print("Total population size:", large_N.size)
print("Generations:", large_N.gen)
print()
print("Start clone size:", large_N.start_clone.size)
print("Number of subclones:", len(large_N.clones[1:]))

Total population size: 116402657
Generations: 35

Start clone size: 14925593
Number of subclones: 16629347
CPU times: user 12min 19s, sys: 5min 44s, total: 18min 4s
Wall time: 19min 19s


## Selection

In [4]:
%%time
# set seed to get same result for each test simulation
np.random.seed(123)

l = 10**8      # size limit
q = 10**(-1)   # mutation rate
qfac = 1       # mutation-rate increasing factor (here: no Mutator phenotype)
alpha = 0.3    # death rate
sel = 0.3      # selection factor
large_S = Population(l, q, mutfactor=qfac, death_frac=alpha, selection=sel)
large_S.simulate()
print("Total population size:", large_S.size)
print("Generations:", large_S.gen)
print()
print("Start clone size:", large_S.start_clone.size)
print("Number of subclones:", len(large_S.clones[1:]))

Total population size: 116402657
Generations: 35

Start clone size: 1239
Number of subclones: 16622862
CPU times: user 17min 19s, sys: 22min 16s, total: 39min 35s
Wall time: 1h 27min 41s


## *Final data*

Extracting the necessary data from the populations.

In [6]:
%%time
# Extract the final per-clone data of each simulated population. The results are
# previewed with .head() and written to disk with .to_pickle() in later cells.
# The two calls are kept as separate statements so that the first result is
# already bound if the second (long-running) call fails.
large_N_data = final_data(large_N)  # neutral population
large_S_data = final_data(large_S)  # population with selection

CPU times: user 4min 24s, sys: 10min 59s, total: 15min 24s
Wall time: 39min 44s


In [14]:
# Preview the first rows of the neutral population's data.
large_N_data.head()

Unnamed: 0_level_0,Birthday,q,Final size,Family size,Allele frequency,Mutations,Children,Weight,RGB color
Clone ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A,0,0.1,14925593,116402657,1.0,0,2487292,1,"(0, 0, 0)"
A.0,5,0.1,3081477,16546142,0.142146,1,513145,1,"(123, 57, 214)"
A.1,6,0.1,422844,6409119,0.05506,1,70075,1,"(78, 253, 164)"
A.2,7,0.1,1191286,7186810,0.061741,1,199144,1,"(118, 94, 155)"
A.1.0,7,0.1,919742,4769678,0.040976,2,152972,1,"(95, 253, 173)"


In [13]:
# Preview the first rows of the selection population's data.
large_S_data.head()

Unnamed: 0_level_0,Birthday,q,Final size,Family size,Allele frequency,Mutations,Children,Weight,RGB color
Clone ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A,0,0.1,1239,116402657,1.0,0,2755,1.0,"(0, 0, 0)"
A.0,5,0.1,609,1327455,0.01140399,1,1143,1.077961,"(126, 47, 73)"
A.1,6,0.1,0,0,0.0,1,0,0.873495,"(195, 2, 84)"
A.2,7,0.1,1,1,8.590869e-09,1,2,0.564377,"(162, 58, 138)"
A.3,8,0.1,68,23936,0.000205631,1,166,1.030765,"(129, 43, 186)"


## SAMPLED DATA

In [10]:
# Sampling configuration, passed unchanged to sample() for both populations.
# presumably a minimum count below which a clone is treated as undetected — TODO confirm against sample()
detection_limit = 100
# presumably the number of cells drawn per sample — TODO confirm against sample()
sample_size = 10 ** 6

In [15]:
%%time
# Apply sample() to each population's data with the shared sample_size and
# detection_limit settings. The two calls are kept as separate statements so
# that the first result is already bound if the second (long-running) call fails.
large_N_data_sampled = sample(large_N_data, sample_size, detection_limit)  # neutral population
large_S_data_sampled = sample(large_S_data, sample_size, detection_limit)  # population with selection

CPU times: user 2h 51min 45s, sys: 1min 45s, total: 2h 53min 30s
Wall time: 2h 53min 56s


In [16]:
# Preview the first rows of the sampled neutral data.
large_N_data_sampled.head()

Unnamed: 0_level_0,Birthday,q,Final size,Family size,Allele frequency,Mutations,Children,Weight,RGB color,sampled_size,sampled_fam_size,sampled_AF
Clone ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
A,0,0.1,14925593,116402657,1.0,0,2487292,1,"(0, 0, 0)",209954,1000000,1.0
A.0,5,0.1,3081477,16546142,0.142146,1,513145,1,"(123, 57, 214)",43802,142660,0.14266
A.1,6,0.1,422844,6409119,0.05506,1,70075,1,"(78, 253, 164)",6100,55482,0.055482
A.2,7,0.1,1191286,7186810,0.061741,1,199144,1,"(118, 94, 155)",16698,61978,0.061978
A.1.0,7,0.1,919742,4769678,0.040976,2,152972,1,"(95, 253, 173)",13063,41262,0.041262


In [17]:
# Preview the first rows of the sampled selection data.
large_S_data_sampled.head()

Unnamed: 0_level_0,Birthday,q,Final size,Family size,Allele frequency,Mutations,Children,Weight,RGB color,sampled_size,sampled_fam_size,sampled_AF
Clone ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
A,0,0.1,1239,116402657,1.0,0,2755,1.0,"(0, 0, 0)",13,1000000,1.0
A.0,5,0.1,609,1327455,0.011404,1,1143,1.077961,"(126, 47, 73)",12,10713,0.010713
A.3,8,0.1,68,23936,0.000206,1,166,1.030765,"(129, 43, 186)",2,157,0.000157
A.5,8,0.1,272,7761,6.7e-05,1,490,1.086933,"(51, 172, 117)",4,47,4.7e-05
A.8,9,0.1,1434,7173991,0.061631,1,1419,1.555572,"(60, 65, 116)",26,60795,0.060795


## EXPORT DATA

In [8]:
# List the contents of the export directory.
%ls High_mutrate-Simulation_data-sampled/

In [7]:
%%time
prefix = 'High_mutrate-Simulation_data-sampled/'

# neutral population
filepath_N = prefix + 'large_N_high_q_data.pkl.gz'
large_N_data.to_pickle(filepath_N, compression='gzip')

# selective population
filepath_S = prefix + 'large_S_high_q_data.pkl.gz'
large_S_data.to_pickle(filepath_S, compression='gzip')

CPU times: user 7min 41s, sys: 2min 2s, total: 9min 43s
Wall time: 11min 29s


In [18]:
%%time
prefix = 'High_mutrate-Simulation_data-sampled/'

# neutral population
filepath_N = prefix + 'sampled_large_N_high_q_data.pkl.gz'
large_N_data_sampled.to_pickle(filepath_N, compression='gzip')

# selective population
filepath_S = prefix + 'sampled_large_S_high_q_data.pkl.gz'
large_S_data_sampled.to_pickle(filepath_S, compression='gzip')

CPU times: user 9.67 s, sys: 120 ms, total: 9.79 s
Wall time: 10 s
