# Generate Simulated Data

In [15]:
import pandas as pd
import numpy as np
from pathlib import Path

Here we generate simulated data as a data frame whose columns represent the sampled generations (in our example 5, 10, 25, 40, 45) and whose rows hold the respective counts for each genotype; the data frame is then written to a tab-separated file. The number of genotypes, the sampled generations, the range of fitness values, and the read depth can all be controlled. The trajectories are modeled using the exponential model, as demonstrated in the documentation notebook.

## Helper Functions

In [17]:
def normalize_func(x):
    """
    Rescales lineage frequencies so that they sum to 1 along axis 0

    Parameters:
        x [array_like]: frequency vector (or array with lineages along
            axis 0) to be normalized
    Returns:
        x divided element-wise by its sum over axis 0
    """
    # Sum over axis 0 only, so each column is normalized independently.
    column_totals = np.sum(x, axis = 0)
    return x / column_totals

def create_trajectories(f0, s, times, normalize = True):
    """
    Simulates lineage trajectories given initial frequencies and fitnesses

    Each lineage follows the exponential model f(t) = f0 * exp(s * t).

    Parameters:
        f0 [array-like]: initial lineage frequencies
        s [array-like]: lineage fitnesses
        times [array-like]: times, in generations, to sample lineage frequencies
        normalize [bool]: if True, normalizes lineage frequencies at each
            sampling time
    Returns:
        f_traj [numpy array]: array of lineage frequencies sampled at times
            given by "times"; for 1-D inputs the shape is
            (len(f0), len(times)), one row per lineage and one column per
            sampling time
    """
    # Coerce to numpy arrays first so plain lists/tuples are accepted, as the
    # docstring promises (".reshape" only exists on arrays).
    f0 = np.asarray(f0)
    s = np.asarray(s)
    times = np.asarray(times)

    # Column vectors for the per-lineage quantities and a row vector for the
    # times, so that broadcasting yields a (lineage x time) grid.
    f0 = f0.reshape([len(f0), -1])
    s = s.reshape([len(s), -1])
    times = times.reshape([-1, len(times)])
    f_traj = f0 * np.exp(s * times)

    if normalize:
        f_traj = normalize_func(f_traj)

    return f_traj

def sample_lineages(f, num_samples):
    """
    Returns lineage counts Poisson sampled from their true frequencies

    Parameters:
        f [array_like]: true lineage frequencies
        num_samples [int]: total number of samples to draw, should be order
            100 * num_lineages
    Returns:
        n_sampled [numpy array]: number of samples measured from each lineage,
            same shape as f
    """
    # Convert to an array before scaling: multiplying a plain Python list by
    # an int would repeat the list instead of scaling its entries.
    n_expected = np.asarray(f) * num_samples
    n_sampled = np.random.poisson(n_expected)
    return n_sampled

In [22]:
def write_simulated_datafile(filename, N = 40, times = -1, s_range = 0.1,
                             depth = 100):
    """
    Creates a textfile of simulated trajectories formatted like a real datafile

    The output is tab-separated: one row per genotype, one column per sampling
    time (column headers are the times), with the row index written under the
    label "BC".

    Params:
        filename [str]: name of the output file
        N [int]: population size, i.e. number of genotypes
        times [array_like]: times, in generations, to sample lineages. The
            default of -1 (or None) selects 5, 10, 25, 40 and 45 generations.
        s_range [float]: range of fitness values; fitnesses are drawn
            uniformly from [0, s_range)
        depth [int_or_float]: Simulated read depth, affects noise. The
            expected number of reads per sampling time is depth * N.
    """
    f0_vals = np.random.random(N)
    s_vals = np.random.random(N) * s_range

    # Only compare against the sentinel when "times" is a scalar: an
    # element-wise "array == -1" cannot be used as an if-condition and would
    # raise ValueError for numpy array input.
    if times is None or (np.isscalar(times) and times == -1):
        times = np.array([5, 10, 25, 40, 45])
    else:
        times = np.array(times)

    trajectory = create_trajectories(f0_vals, s_vals, times)
    sampled = pd.DataFrame(sample_lineages(trajectory, depth * N),
                           columns = times)

    sampled.to_csv(filename, sep="\t", index_label = "BC")

In [23]:
# Default sampling schedule: 40 genotypes sampled at generations
# 5, 10, 25, 40 and 45.
write_simulated_datafile("simulated_data_1.txt")

# Same setup with a custom sampling schedule.
custom_generations = [7, 14, 28, 42, 49]
write_simulated_datafile("simulated_data_2.txt", times = custom_generations)