In [16]:
### HA
### A general script/function/wrapper that takes in: 
### * genetic data in a matrix A containing N SNPs (these are the rows), and k ancestries (these are the first k columns);
###   the final column of A is total allele frequency; therefore, A is size N x (k+1)
### * a starting guess
### and returns:
### * the hidden proportions of every ancestry in the data.

import numpy as np
import scipy as scipy
from scipy.optimize import minimize
import timeit

def HA(A,x_guess):
    """Estimate hidden ancestry proportions by constrained least squares.

    Minimizes ``||A[:, :k] @ x - A[:, k]||**2`` over ``x`` subject to
    ``sum(x) == 1`` and ``x >= 0``, using SciPy's SLSQP solver.

    Parameters
    ----------
    A : array_like, shape (N, k + 1)
        One row per SNP. The first ``k`` columns are the allele frequencies of
        the ``k`` reference ancestries; the last column is the total allele
        frequency.
    x_guess : array_like with ``k`` elements
        Starting point for the solver. Shapes ``(k,)``, ``(k, 1)`` and
        ``(1, k)`` are all accepted; the guess is flattened before use.

    Returns
    -------
    scipy.optimize.OptimizeResult
        The solver result; ``.x`` is the 1-D array of estimated proportions.

    Raises
    ------
    ValueError
        If ``A`` is not 2-D with at least two columns, or if ``x_guess`` does
        not contain exactly ``k`` values.
    """
    A = np.asarray(A, dtype=float)
    if A.ndim != 2 or A.shape[1] < 2:
        raise ValueError("A must be a 2-D array with at least two columns "
                         "(k ancestry columns plus the total allele frequency)")

    # Number of ancestries: subtract 1 because the last column of A is the
    # total allele frequency data.
    k = A.shape[1] - 1

    ancestries = A[:, :k]   # N x k matrix of per-ancestry allele frequencies
    taf = A[:, k]           # total allele frequency, as a 1-D vector of length N

    # SLSQP works on 1-D parameter vectors; flatten so column-vector guesses
    # such as np.ones((k, 1)) are accepted as well.
    x0 = np.asarray(x_guess, dtype=float).ravel()
    if x0.size != k:
        raise ValueError("x_guess must contain exactly k = %d values, got %d"
                         % (k, x0.size))

    # This is the objective function: the squared residual of the mixture.
    def obj_fun(x):
        residual = ancestries @ x - taf
        return residual @ residual

    # Gradient of the objective, returned as a 1-D vector of length k.
    def grad_obj_fun(x):
        residual = ancestries @ x - taf
        return 2.0 * (ancestries.T @ residual)

    # Proportions must sum to one. The constraint is linear, so its Jacobian is
    # supplied exactly instead of being estimated by finite differences.
    cons = ({'type': 'eq',
             'fun': lambda x: np.sum(x) - 1.0,
             'jac': lambda x: np.ones_like(x)},)

    # Non-negativity of every proportion is enforced through the bounds alone.
    # (Per-index lambda constraints built in a loop would all capture the same
    # loop variable and end up testing a single element.)
    bnds = ((0, None),) * k

    return minimize(obj_fun, x0, method='SLSQP', jac=grad_obj_fun,
                    bounds=bnds, constraints=cons, tol=1e-5)

In [17]:
### Initialize data for testing; define matrix A along with a starting guess

SEED=0 # fixed seed so that "Restart & Run All" regenerates exactly the same data
N=10000 # number of SNPs
j=15 # number of ancestries

np.random.seed(SEED)

# Draw all N x j ancestry allele frequencies in one call (no column-by-column hstack)
A=np.random.uniform(low=0, high=1, size=(N,j))

# First, we choose an answer! This vector must be j x 1 (one proportion per
# ancestry); its entries are non-negative and sum to 1.

ans=[[0.1], [0.1], [0.1], [0.25], [0.05], [0.1], [0.05], [0.05], [0.01], [0.01], [0.01], [0.01], [0.01], [0.05], [0.1]]

mytaf=A@ans # Total allele frequency, N x 1

# Append the total allele frequency as the final column, so A is N x (j+1)
A=np.hstack((A,mytaf))

# Uniform starting guess, kept 1-D because scipy.optimize.minimize works on
# 1-D parameter vectors
x_t=(1/j)*np.ones(j)

In [18]:
### Run HA on the test data, report the wall-clock time, and show the chosen answer

start = timeit.default_timer()

# The timed region covers both the solve and the printing of its result
fit = HA(A, x_t)
print(fit)

stop = timeit.default_timer()

elapsed = stop - start
print('Time: ', elapsed)

print('our correct answer was chosen to be', ans)

     fun: 6.263846093319523e-07
     jac: array([ 0.01193129,  0.01929479,  0.03027043, -0.01620858,  0.07737389,
       -0.00918604, -0.02817628,  0.01694641, -0.02168771, -0.04452949,
       -0.03632284,  0.03320806, -0.04053936, -0.00658027, -0.00756653])
 message: 'Optimization terminated successfully.'
    nfev: 63
     nit: 28
    njev: 28
  status: 0
 success: True
       x: array([0.10000438, 0.09999609, 0.09999595, 0.2500036 , 0.04998458,
       0.10000132, 0.05000967, 0.04999994, 0.00998947, 0.01000382,
       0.01000102, 0.00999105, 0.01000678, 0.05001041, 0.10000194])
Time:  0.18364905328633085
our correct answer was chosen to be [[0.1], [0.1], [0.1], [0.25], [0.05], [0.1], [0.05], [0.05], [0.01], [0.01], [0.01], [0.01], [0.01], [0.05], [0.1]]
