# Generate Simulated Data

In [1]:
import pandas as pd
import numpy as np

# Requires that the data_io module is located one directory up (in the parent directory)
import sys 
sys.path.append('..')

import data_io as di
from fitness_mcmc import create_trajectories, sample_lineages



Here we generate simulated data and write it to a tab-separated text file, with the columns representing the number of generations at each sampling time (in our example 5, 10, 25, 40, 45) and the rows representing the respective counts for each genotype. The number of genotypes, the sampled generations, etc. can be controlled. The trajectories are modeled using the exponential model, as demonstrated in the documentation notebook.

## Helper Functions

In [2]:
def write_simulated_datafile(filename, N = 40, times = (5, 10, 25, 40, 45), s_range = 0.1,
                             depth = 1000, s_vals = (), f0_vals = ()):
    """
    Creates a text file of simulated trajectories formatted like a real datafile.

    Two tab-separated files are written, both using "BC" as the index label:
        <name>.txt: the sampled lineages, with one column per entry of times.
        <name>_metadata.txt: the s_vals and f0_vals used for the simulation.

    Params:
        filename [str]: Name of the output file. A trailing ".txt" is optional;
            it is stripped before the ".txt" / "_metadata.txt" endings are added.
        N [int]: Population size, i.e. number of genotypes. Automatically assumed if f0_vals or s_vals
            are included.
        times [array_like]: Times, in generations, to sample lineages.
        s_range [float]: Range of fitness values. Ignored if s_vals is included.
        depth [int_or_float]: Simulated read depth, affects noise.
        s_vals [array_like]: Fitness values for the population, optional.
        f0_vals [array_like]: Starting frequencies of the population, optional.

    Raises:
        ValueError: If s_vals and f0_vals are both given but differ in length.
    """
    n_f0 = len(f0_vals)
    n_s = len(s_vals)

    # Any user-supplied vector overrides N with its own length.
    if n_f0 > 0 or n_s > 0:
        if n_f0 > 0 and n_s > 0 and n_f0 != n_s:
            raise ValueError("s_vals and f0_vals must have the same length.")
        N = max(n_f0, n_s)

    # Missing vectors are drawn uniformly at random: f0 from [0, 1) and
    # s from [0, s_range). f0 is drawn first so seeded runs stay reproducible.
    if n_f0 == 0:
        f0_vals = np.random.random(N)
    if n_s == 0:
        s_vals = np.random.random(N) * s_range
    times = np.array(times)

    trajectory = create_trajectories(f0_vals, s_vals, times)
    # NOTE(review): depth * N is presumably the total number of reads per
    # time point -- confirm against sample_lineages.
    sampled = pd.DataFrame(sample_lineages(trajectory, depth * N),
                           columns = times)
    metadata = pd.DataFrame({"s_vals": s_vals, "f0_vals": f0_vals})

    # Strip only a trailing ".txt". Splitting on the first ".txt" found anywhere
    # in the name would truncate paths such as "runs.txt_v2/out.txt" to "runs".
    suffix = ".txt"
    if filename.endswith(suffix):
        filename = filename[:-len(suffix)]

    sampled.to_csv(filename + ".txt", sep="\t", index_label = "BC")
    metadata.to_csv(filename + "_metadata.txt", sep="\t", index_label = "BC")

In [3]:
# #writes a simulated data file of 40 different genotypes over 5, 10, 25, 40 and 45 generations. 
# write_simulated_datafile("simulated_data_1.txt")

# #writes a simulated data file using 7, 14, 28, 42, and 49 generations
# write_simulated_datafile("simulated_data_2.txt", times = [7,14,28,42,49])

In [4]:
# Write a simulated data file whose fitness values are the estimated
# fitnesses of Ara-1, rep 1 (one entry per genotype).
s_vals = [
    0.910699904, 1.11253385, 0.99821983, 1.203381179,
    1.262659378, 1.359457018, 1.211697985, 1.261849015,
    1.21159652, 1.390293936, 1.355383317, 1.445180916,
    1.271422864, 1.49517616, 1.382129783, 1.349289071,
    1.427948471, 1.47738471, 1.440798106, 1.336651198,
    1.46977996, 1.481388763, 1.33157471, 1.553550132,
    1.496842327, 1.435615946, 1.578332058, 1.764073117,
    1.581792539, 1.812140099, 1.460282403, 2.021813917,
    1.844907913, 2.084364992, 1.877496757, 1.859594605,
    1.674140374, 1.606857522, 1.575513844, 1.885293796,
    1.416568779,
]

# Five sampling times at multiples 1..5 of 6.64 generations, with every
# genotype starting at the same frequency (all ones).
# NOTE(review): 6.64 is presumably log2(100) generations per transfer -- confirm.
write_simulated_datafile(
    "simulated_data_LTEE.txt",
    times = 6.64 * (np.arange(5) + 1),
    s_vals = s_vals,
    f0_vals = np.ones(len(s_vals)),
)