In [2]:
import numpy as np
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
import sys
sys.path.append('../utils_folder/')
import time
from utils_GD import *
from utils_IBP import *
from utils_jack import *
from utils_gt import *
from utils_unseen import *

# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Inject a CSS rule that sets elements of class `container` to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# ---------------------------------------------------------------------------
# Experiment driver: for every training size N and every population, fit the
# GD and IBP models on each split, run the jackknife / Good-Toulmin / LP
# predictors, and store every prediction curve in `results[N][population]`.
# The whole `results` dict is re-saved after each population.
# ---------------------------------------------------------------------------
N_ls = [50, 100, 200]
population_ls = [
    'afr', 'amr', 'asj', 'eas', 'eas_oea', 'fin', 'nfe_bgr',
    'nfe_est', 'nfe_nwe', 'nfe_seu', 'nfe_swe', 'sas', 'oth',
]

num_its = 10      # forwarded to GD.regression / IBP.regression
kappa = .5        # forwarded to pred_counts_unseen
status = False    # forwarded to GD.regression / IBP.regression
results = {}

for N in N_ls:

    results[N] = {}

    for p, population in enumerate(population_ls):

        # `pop_res` is the very same dict object as results[N][population].
        pop_res = {}
        results[N][population] = pop_res

        data_dir = 'data/'+population
        sfs = np.load(data_dir+'/sfs/N_'+str(N)+'.npy', allow_pickle=1)
        cts = np.load(data_dir+'/cts/N_'+str(N)+'.npy', allow_pickle=1)
        stc = time.time()  # timestamp; not read anywhere in this cell
        N_tot = int(np.load(data_dir+'/total_sample_size.npy', allow_pickle = 1))
        M = N_tot - N

        # Only populations with more than N samples in total are processed;
        # otherwise the entry stays an empty dict.
        if M > 0:
            num_splits = sfs.shape[0]
            curve_len = N_tot+1

            # --- pre-allocate one row per split for each estimator ---

            # GD
            pop_res['GD_params'] = np.zeros([num_splits, 3])
            pop_res['GD_preds'] = np.zeros([num_splits, curve_len])
            pop_res['GD_lo'] = np.zeros([num_splits, curve_len])
            pop_res['GD_hi'] = np.zeros([num_splits, curve_len])

            # IBP
            pop_res['IBP_params'] = np.zeros([num_splits, 3])
            pop_res['IBP_preds'] = np.zeros([num_splits, curve_len])
            pop_res['IBP_lo'] = np.zeros([num_splits, curve_len])
            pop_res['IBP_hi'] = np.zeros([num_splits, curve_len])

            # Jackknife: one curve per order 1..4
            pop_res['J_preds'] = np.zeros([num_splits, 4, curve_len])

            # Good-Toulmin: indexed [split, last argument of predict_gt, 2, curve]
            pop_res['GT_preds'] = np.zeros([num_splits, 2, 2, curve_len])

            # LP
            pop_res['LP_preds'] = np.zeros([num_splits, curve_len])

            progress = str(100*(p+1)/len(population_ls))[:5]
            print('\tStarting ', population, '; N = ', N, '; M = ', M, '; Progress : ', progress, ' %', sep=' ', end='', flush=True)

            pop_res['N'] = N
            pop_res['M'] = M
            pop_res['cts'] = np.loadtxt(data_dir+'/cts/all')
            # First N+1 entries of the full count curve; used as the prefix of
            # every stored GD / IBP curve below.
            train_counts = pop_res['cts'][:N+1]
            pop_res['sfs'] = sfs
            pop_res['K'] = [np.sum(row) for row in sfs]

            for j in tqdm_notebook(range(num_splits)):

                cts_ = cts[j]
                sfs_ = sfs[j, 1:]   # entry 0 of the split's sfs row is dropped
                K = cts_[-1]        # last value of this split's count curve

                # ---- GD ----
                gd = GD()

                GD_params = gd.regression(train_counts= cts_, num_its=num_its, norm=2, status=status)
                GD_preds = np.concatenate([train_counts, K + gd.mean(N, M, K, GD_params)])
                GD_lo, GD_hi = gd.credible_interval(N, M, K, GD_params, width=.99)
                GD_lo = np.concatenate([train_counts, K + GD_lo])
                GD_hi = np.concatenate([train_counts, K + GD_hi])

                pop_res['GD_params'][j] = GD_params
                pop_res['GD_preds'][j] = GD_preds
                pop_res['GD_lo'][j] = GD_lo
                pop_res['GD_hi'][j] = GD_hi

                # ---- IBP ----
                ibp = IBP()

                IBP_params = ibp.regression(train_counts= cts_, num_its=num_its, norm=2, status=status)
                IBP_preds = np.concatenate([train_counts, K + ibp.mean(N, M, IBP_params)])
                IBP_lo, IBP_hi = ibp.credible_interval(N, M, IBP_params, width=.99)
                IBP_lo = np.concatenate([train_counts, K + IBP_lo])
                IBP_hi = np.concatenate([train_counts, K + IBP_hi])

                pop_res['IBP_params'][j] = IBP_params
                pop_res['IBP_preds'][j] = IBP_preds
                pop_res['IBP_lo'][j] = IBP_lo
                pop_res['IBP_hi'][j] = IBP_hi

                # ---- JACKKNIFE (orders 1 to 4) ----
                for order in range(1, 5):
                    pop_res['J_preds'][j, order-1] = predict_jack(N, M, sfs_, train_counts, order)

                # ---- GOOD-TOULMIN (last argument 0, then 1) ----
                pop_res['GT_preds'][j,0] = predict_gt(N, M, sfs_, train_counts, 0)
                pop_res['GT_preds'][j,1] = predict_gt(N, M, sfs_, train_counts, 1)

                # ---- LP ----
                pop_res['LP_preds'][j] = pred_counts_unseen(sfs_, kappa, N, M)

        # Checkpoint: re-save everything computed so far after each population.
        np.save('results/new_all_populations_regression', results)

In [None]:
# Retain minimal info for plotting.
#
# Loads the full results dictionary saved by the experiment cell and, for each
# (N, population) pair, keeps only the sizes and the sorted precision of every
# estimator's prediction at the last point of its curve.  The reduced dict is
# saved to 'results/_new_all_populations_regression.npy'.


def _sorted_precision(final_preds, target):
    """Return ``1 - sorted relative error`` of `final_preds` against `target`.

    Parameters
    ----------
    final_preds : ndarray, shape (num_boots,) or (num_boots, k)
        Final-point predictions, one row per split.
    target : scalar
        Reference value the predictions are compared to.

    Returns
    -------
    ndarray, same shape as `final_preds`
        Relative errors ``|pred - target| / target`` sorted in ascending order
        along axis 0 (independently for each column when 2-D), subtracted
        from 1.
    """
    rel_err = np.abs(final_preds - target) / target
    return 1 - np.sort(rel_err, axis=0)


res = np.load('results/new_all_populations_regression.npy', allow_pickle=1).item()
_res = {}

# Populations retained for plotting (the loop below uses only the keys).
population_names = {}
population_names['amr'] = 'Amr.'
population_names['eas'] = 'SE. As.'
population_names['eas_oea'] = 'Ot. E. As.'
population_names['fin'] = 'Fin.'
population_names['nfe_seu'] = 'S. Eu.'
population_names['nfe_swe'] = 'Swe.'
population_names['sas'] = 'S. As.'
population_names['oth'] = 'Other'

N_M = -1  # index of the last point of every stored prediction curve

for N in [50,100]:
    _res[N] = {}
    for population in population_names:
        full = res[N][population]
        num_boots = full['GD_preds'].shape[0]

        summary = {}
        # Each GD_preds row is allocated with N_tot + 1 entries and
        # M = N_tot - N, so one extra entry must be subtracted; the previous
        # `len(row) - N` stored M + 1.
        summary['M'] = full['GD_preds'].shape[1] - 1 - N
        summary['K'] = full['cts'][N]
        summary['K_new'] = full['cts'][-1] - full['cts'][N]
        unm = summary['K_new']

        # NOTE(review): the GD / IBP curves are stored as `K + gd.mean(...)` /
        # `K + ibp.mean(...)`, while `unm` is cts[-1] - cts[N].  Confirm that
        # comparing the final curve value directly against `unm` is intended.
        summary['GD_precision'] = _sorted_precision(full['GD_preds'][:, N_M], unm)
        summary['IBP_precision'] = _sorted_precision(full['IBP_preds'][:, N_M], unm)
        summary['LP_precision'] = _sorted_precision(full['LP_preds'][:, N_M], unm)

        # One column per jackknife order / GT variant.  The previous code built
        # a (k, num_boots) array and called .reshape(num_boots, k), which
        # re-chunks the flat buffer instead of transposing it and therefore
        # mixed values from different orders within a column.  Slicing to
        # (num_boots, k) and sorting along axis 0 keeps each column intact.
        summary['J_precision'] = _sorted_precision(full['J_preds'][:, :, N_M], unm)
        summary['GT_precision'] = _sorted_precision(full['GT_preds'][:, :, 0, N_M], unm)

        # Same shape contract the old reshape calls enforced.
        assert summary['J_precision'].shape == (num_boots, 4)
        assert summary['GT_precision'].shape == (num_boots, 2)

        _res[N][population] = summary

np.save('results/_new_all_populations_regression.npy', _res)
