In [1]:
import numpy as np
import dadi, random, pickle
import os, sys
sys.path.append(os.path.join(os.getcwd(), '..')) # this is the ml_dadi dir
import data_manip, ml_models
from data_manip import generating_data
from ml_models import rfr_train, mlpr_train

In [2]:
# Theta values handed to generating_data to scale the data and add variance.
# Listed in order of increasing variance (original author's note).
theta_list = [
    1,
    10000,
    1000,
    100,
]

In [3]:
# Settings passed to generating_data when building the training and test sets.
pts_l = [40, 50, 60]  # extrapolation grid
ns = [20]  # sample size
func = dadi.Demographics1D.growth  # demographic model

In [4]:
# Flags marking which parameters are on a log scale, in (nu, T) order.
logs = [
    True,   # nu: log scale
    False,  # T: not log scale
]

In [5]:
 # Generate the parameter grid for training, keeping only points with T / 10**nu <= 5.
 # nu is sampled in log10 space (the filter exponentiates it as 10**nu); T is linear.
 # Each (nu, T) tuple is the key to the data dictionary.
 train_params = [(nu, T)
                 for nu in np.linspace(-2, 2, 25)  # 25 evenly spaced values from -2 to 2
                 for T in np.linspace(0.1, 2, 24)  # 24 evenly spaced values from 0.1 to 2
                 if T / 10**nu <= 5]

 # print training set info
 print('n_samples training: ', len(train_params))
 # min/max compare the (nu, T) tuples lexicographically, i.e. by nu first, then T
 print('Range of training params:', min(train_params), 'to', max(train_params))
 print('Theta list:', theta_list)
 # Make a list of training data dictionaries, one dictionary for each theta case
 list_train_dict = generating_data(train_params, theta_list, func, ns, pts_l, logs)
 # Save with pickle protocol 2; the context manager guarantees the file is
 # flushed and closed before a later cell reads it back.
 with open('data/train_data', 'wb') as train_file:
     pickle.dump(list_train_dict, train_file, 2)

n_samples training:  410
Range of training params: (-1.6666666666666667, 0.1) to (2.0, 2.0)
Theta list: [1, 10000, 1000, 100]


In [5]:
# Reload the pickled training set. Only needed when the generation cell above
# has not been run in this session, i.e. list_train_dict is not yet defined.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('data/train_data', 'rb') as train_file:
    list_train_dict = pickle.load(train_file)

In [9]:
# Train RFR: one model per training dictionary (i.e. one per theta case).
# Original author's note: runs quickly.
list_rfr = [rfr_train(train_dict) for train_dict in list_train_dict]
# Save the trained models with pickle protocol 2; the context manager closes the file.
with open('data/list_rfr', 'wb') as model_file:
    pickle.dump(list_rfr, model_file, 2)

In [None]:
# Train MLPR with the adam solver (the default, per the original author's note):
# one model per training dictionary.
# TODO (original author): also test this one.
# max_iter trade-off: too high runs too long, too low does not converge.
list_mlpr_adam = [mlpr_train(train_dict, max_iter=500)
                  for train_dict in list_train_dict]
# Save the trained models with pickle protocol 2; the context manager closes the file.
with open('data/list_mlpr_adam', 'wb') as model_file:
    pickle.dump(list_mlpr_adam, model_file, 2)

In [None]:
# Train MLPR with the lbfgs solver: one model per training dictionary.
# Original author's notes: this will probably work the best; it needs a large
# max_iter and takes longer to run, but performs the best for two_epoch.
list_mlpr_lbfgs = [mlpr_train(train_dict, solver='lbfgs', max_iter=5000)
                   for train_dict in list_train_dict]
# Save the trained models with pickle protocol 2; the context manager closes the file.
with open('data/list_mlpr_lbfgs', 'wb') as model_file:
    pickle.dump(list_mlpr_lbfgs, model_file, 2)

In [9]:
# Generate Test Datasets
# Draw random (nu, T) pairs until 100 of them pass the T / 10**nu <= 5 filter.
# NOTE(review): no random seed is set in this cell, so the test set changes between runs.
test_params = []
while len(test_params) < 100:
    # generate random nu and T within the same range as the training data
    nu = random.random() * 4 - 2  # nu in log scale, in [-2, 2)
    T = random.random() * 1.9 + 0.1  # T in [0.1, 2.0)
    # keep the pair only when T / 10**nu <= 5 (the condition the training grid uses)
    if T / 10**nu <= 5:
        test_params.append((nu, T))
# print testing set info
print('n_samples testing: ', len(test_params))
# min/max compare the (nu, T) tuples lexicographically, i.e. by nu first, then T
print('Range of testing params:', min(test_params), 'to', max(test_params))
print('Theta list:', theta_list)
# Make a list of test data dictionaries, one dictionary for each theta case
list_test_dict = generating_data(test_params, theta_list, func, ns, pts_l, logs)
# Save testing set as a pickle file (protocol 2); the context manager closes the file.
with open('data/test_data', 'wb') as test_file:
    pickle.dump(list_test_dict, test_file, 2)

n_samples testing:  100
Range of testing params: (-1.222463150850647, 0.26513143014436913) to (1.9193351005273405, 0.2635281256404377)
Theta list: [1, 10000, 1000, 100]


In [None]:
# Generate Test Datasets Full Range
test_params = []
while len(test_params) < 100: #This gives 100 test values
# generate random nu and T within the same range as training data range
    nu = random.random() * 4 - 2 # nu in log scale #random.random is [0,1), this formula at the end ensures values are betweeen -2 and 2
    T = random.random() * 1.9 + 0.1
    # # exclude T/nu > 5
    # if T/10**nu <= 5:
    #     params = (nu, T)
    #     test_params.append(params)
    test_params.append((nu, T))

# print testing set info 
print('n_samples testing: ', len(test_params))
print('Range of testing params:', min(test_params), 'to', max(test_params))
print('Theta list:', theta_list)
# Make a list of test data dictionaries, one dictionary for each theta case
list_test_dict = generating_data(test_params, theta_list, func, ns, pts_l, logs)
# Save testing set as a pickle file
pickle.dump(list_test_dict, open('data/new_func/test_data_full', 'wb'), 2)