# Generate Simulated Data

In [1]:
import pandas as pd
import numpy as np

# Requires that the data_io module is located in the parent directory
import sys 
sys.path.append('..')

import data_io as di
from fitness_mcmc import create_trajectories, sample_lineages



Here we generate simulated data as a data frame whose columns are the sampling times in generations (in our example 5, 10, 25, 40, 45) and whose rows are the corresponding counts for each genotype; the helper function below writes this data frame to a tab-separated text file. The number of genotypes, the sampling times, and other parameters can be controlled. The trajectories are modeled using the exponential model, as demonstrated in the documentation notebook.

## Helper Functions

In [2]:
def write_simulated_datafile(filename, N = 40, times = None, s_range = 0.1,
                             depth = 1000, s_vals = None, f0_vals = None):
    """
    Creates a text file of simulated trajectories formatted like a real
    datafile, plus a companion metadata file.

    Two tab-separated files are written, both with an index column labelled
    "BC":
        <filename>.txt           sampled lineages, one column per time
        <filename>_metadata.txt  the "s_vals" and "f0_vals" that were used

    Params:
        filename [str]: name of the output file. A trailing ".txt" is
            optional; it is stripped and re-added so both spellings give
            the same output paths.
        N [int]: population size, i.e. number of genotypes. Sets the length
            of the randomly drawn s_vals / f0_vals and scales the value
            passed to sample_lineages (depth * N). It is NOT re-derived
            from user-supplied s_vals / f0_vals, so keep them consistent.
        times [array_like]: times, in generations, to sample lineages.
            Defaults to [5, 10, 25, 40, 45] when None or empty.
        s_range [float]: range of fitness values; default s_vals are drawn
            uniformly from [0, s_range).
        depth [int_or_float]: Simulated read depth, affects noise
        s_vals [array_like]: fitness value for each genotype. Drawn at
            random when None or empty.
        f0_vals [array_like]: value passed to create_trajectories as the
            first argument for each genotype (presumably the initial
            frequencies -- confirm against fitness_mcmc). Drawn uniformly
            from [0, 1) when None or empty.
    """
    # Explicit None / empty checks: `not some_array` raises ValueError for
    # NumPy arrays with more than one element, and would wrongly discard a
    # one-element array whose single value is 0.
    if f0_vals is None or np.size(f0_vals) == 0:
        f0_vals = np.random.random(N)
    if s_vals is None or np.size(s_vals) == 0:
        s_vals = np.random.random(N) * s_range
    if times is None or np.size(times) == 0:
        times = np.array([5, 10, 25, 40, 45])
    else:
        times = np.array(times)

    trajectory = create_trajectories(f0_vals, s_vals, times)
    sampled = pd.DataFrame(sample_lineages(trajectory, depth * N),
                           columns = times)
    metadata = pd.DataFrame({"s_vals": s_vals, "f0_vals": f0_vals})

    # Strip only a trailing ".txt"; splitting on the first occurrence would
    # truncate names that merely contain ".txt" somewhere in the path.
    if filename.endswith(".txt"):
        filename = filename[:-len(".txt")]

    sampled.to_csv(filename + ".txt", sep="\t", index_label = "BC")
    metadata.to_csv(filename + "_metadata.txt", sep="\t", index_label = "BC")

In [3]:
# #writes a simulated data file of 40 different genotypes over 5, 10, 25, 40 and 45 generations. 
# write_simulated_datafile("simulated_data_1.txt")

# #writes a simulated data file using 7, 14, 28, 42, and 49 generations
# write_simulated_datafile("simulated_data_2.txt", times = [7,14,28,42,49])

In [None]:
# Write a simulated data file sampled at generations 7, 14, 28, 42, and 49.
write_simulated_datafile(
    filename = "simulated_data_LTEE.txt",
    times = [7, 14, 28, 42, 49],
)