In [2]:
import numpy as np
import dadi
import random
import pickle
import os, sys
sys.path.append(os.path.join(os.getcwd(), '..')) # this is the ml_dadi dir
import data_manip
from data_manip import generating_data


In [3]:
def generate_params(n_samples):
    # Generate parameters for training & testing data
    param = []  # full range no log
    param_log = []  # full range & nu in log scale
    param_exclude = []  # exclude T/nu > 5 & nu in log scale
    param_list = [param, param_log, param_exclude]

    while len(param_exclude) < n_samples:
        # pick random values in specified range
        # nu range: 0.01-100; T range: 0.1-2
        log_nu = random.random() * 4 - 2
        T = random.random() * 1.9 + 0.1

        # save each param pair into appropriate list
        if len(param) < n_samples:  # stop appending once reach desired size
            param_log.append((log_nu, T))
            param.append((10**log_nu, T))

        if T/10**log_nu <= 5:
            param_exclude.append((log_nu, T))

    # print data set info
    print(f'n_samples full range: {len(param)}')
    print(f'n_samples exclude T/nu > 5: {len(param_exclude)}')
    texts = ['Full range params',
             'Full range params log', 'Exclusion params log']
    for text, param in zip(texts, param_list):
        print(
            f'{text}: {tuple([float("{0:.3f}".format(n)) for n in min(param)])} to {tuple([float("{0:.3f}".format(n)) for n in max(param)])}')
    return param_list


In [4]:
# generate frequency spectrum data from param sets
def generate_data_from_params(param_list, label):
    '''label = "train_data" or "test_data"'''

    # list of theta values to run scaling and add variance
    theta_list = [1, 10000, 1000, 100]  # order of increase variance

    # designate demographic model, sample size, and extrapolation grid
    func = dadi.Demographics1D.two_epoch
    ns = [20]
    pts_l = [40, 50, 60]

    # specify param in log scale, name of data sets
    logs = [[False, False], [True, False], [True, False]]
    names = ['full_no_log', 'full_log', 'exclude_log']

    for param, log, name in zip(param_list, logs, names):
        to_save = generating_data(param, theta_list, func, ns, pts_l, log)
        pickle.dump(to_save, open(f'data/{label}_{name}', 'wb'), 2)


In [14]:
train_param = generate_params(1000)


n_samples full range: 1000
n_samples exclude T/nu > 5: 1000
Full range params: (0.01, 0.732) to (99.27, 0.445)
Full range params log: (-1.998, 0.732) to (1.997, 0.445)
Exclusion params log: (-1.364, 0.157) to (1.997, 0.445)


In [16]:
generate_data_from_params(train_param, "train_data")


In [5]:
test_param = generate_params(100)


n_samples full range: 100
n_samples exclude T/nu > 5: 100
Full range params: (0.011, 0.879) to (92.441, 1.5)
Full range params log: (-1.952, 0.879) to (1.966, 1.5)
Exclusion params log: (-1.416, 0.136) to (1.966, 1.5)


In [6]:
generate_data_from_params(test_param, "test_data")
