In [1]:
import numpy as np
import random
import dadi
import pickle
import os, sys
sys.path.append(os.path.join(os.getcwd(), '..')) # this is the ml_dadi dir
import data_manip, ml_models
from data_manip import generating_data_parallel_log
from ml_models import rfr_train, mlpr_train

In [2]:
# generate a list of theta values to run scaling and add variance
theta_list = [1,10000,1000,100] # order of increase variance

In [3]:
# designate demographic model, sample size, and extrapolation grid 
func = dadi.Demographics2D.IM
ns = [20,20]
pts_l = [40, 50, 60]
logs = [False, True, True, False, False, False]

In [4]:
# Generate parameter list for training:
# Test a small number only here, increase when run on HPC
train_params = [(s, nu1, nu2, T, m12, m21) for s in np.linspace(0.01, 0.99, 3)
                            for nu1 in np.linspace(-2, 2, 3)
                            for nu2 in np.linspace(-2, 2, 3)
                            for T in np.linspace(0.1, 2, 3)
                            for m12 in np.linspace(1, 10, 3)
                            for m21 in np.linspace(1, 10, 3)]

In [5]:
# print training set info 
print('n_samples training: ', len(train_params))
print('Range of training params:', min(train_params), 'to', max(train_params))
print('Theta list:', theta_list)

n_samples training:  729
Range of training params: (0.01, -2.0, -2.0, 0.1, 1.0, 1.0) to (0.99, 2.0, 2.0, 2.0, 10.0, 10.0)
Theta list: [1, 10000, 1000, 100]


In [6]:
list_train_dict = generating_data_parallel_log(train_params, 
                        theta_list, func, ns, pts_l, logs)
pickle.dump(list_train_dict, open('data/train_data', 'wb'), 2)

In [7]:
list_train_dict = pickle.load(open('data/train_data','rb'))

In [8]:
# Train RFR and save the list of trained RFR into pickle file
list_rfr = [rfr_train(train_dict) for train_dict in list_train_dict]
pickle.dump(list_rfr, open('data/list_rfr', 'wb'), 2)

In [9]:
# Train MLPR and save the list of trained MLPR into pickle file
list_mlpr = [mlpr_train(train_dict, max_iter=1000) 
                for train_dict in list_train_dict]
pickle.dump(list_mlpr, open('data/list_mlpr', 'wb'), 2)

Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.

In [10]:
# Generate Test Datasets
test_params = []
for i in range(100):
    s = random.random() * 0.98 + 0.01
    nu1 = random.random() * 4 - 2
    nu2 = random.random() * 4 - 2
    T = random.random() * 1.9 + 0.1
    m12 = random.random() * 9 + 1
    m21 = random.random() * 9 + 1
    params = (s, nu1, nu2, T, m12, m21)
    test_params.append(params)
# print testing set info 
print('n_samples testing: ', len(test_params))
print('Range of testing params:', min(test_params), 'to', max(test_params))
print('Theta list:', theta_list)
# Make a list of test data dictionaries, one dictionary for each theta case
list_test_dict = generating_data_parallel_log(test_params, 
                    theta_list, func, ns, pts_l, logs)
# Save testing set as a pickle file
pickle.dump(list_test_dict, open('data/test_data', 'wb'), 2)

n_samples testing:  100
Range of testing params: (0.1206317193721481, 1.7458605843877417, 1.9254522844856357, 0.5573476249472297, 8.580966170018268, 6.989681408323595) to (0.895922808125844, 1.04527808685734, 0.6574315610822128, 1.6652855913282607, 6.7627242152791185, 4.189751563494047)
Theta list: [1, 10000, 1000, 100]
