In [13]:
import numpy as np
import scipy as scipy
from scipy.optimize import minimize
import timeit

# Try Solving Many Ancestries problem, multiple SNPs

In this notebook we attempt to solve the following constrained, quadratic optimization problem:

$$\min_{\pi \in \mathbb{R}^k} f(\pi)=\sum_{i=1}^{N}\left(\sum_{j=1}^k a_{j,i}\pi_j-\tilde{a}_i\right)^2$$

$$\text{subject to:} \sum_{j=1}^k \pi_k=1 \quad \pi_j \geq 0, j=1,\ldots,k,$$

where $a_{j,i} \in \mathbb{R}$, $j=1,\ldots, k$; $i=1,\ldots N$ and $\tilde{a}_i \in \mathbb{R}$, $i =1, \ldots, N$ are quantities obtained from a genetics simulation. The $a_{j,i}$'s correspond to the observed allele frequency in ancestry $j$ at SNP $i$. There are $k$ ancestries and $N$ SNPs.

In [33]:
N=1000000 # number of SNPs
k=15 # number of ancestries

A=np.array(np.random.uniform(low=0, high=1, size=(N,1))) # initialize an array for experimental draws

for i in range(1,k):
    A=np.hstack((A,np.random.uniform(low=0, high=1, size=(N,1))))

# First, we choose an answer! This vector must be Nx1

ans=[[0.1], [0.1], [0.1], [0.25], [0.05], [0.1], [0.05], [0.05], [0.01], [0.01], [0.01], [0.01], [0.01], [0.05], [0.1]]

taf=A@ans # Total allele frequency

print(np.shape(A),np.shape(taf), np.shape(ans), np.shape(taf))

(1000000, 15) (1000000, 1) (15, 1) (1000000, 1)


In [34]:
# This is the objective function!

def gen_function(x):
    b=0
    for i in range(0,k):
        b=b + x[i]*A[:,i:(i+1)]
    b=b-taf
    return np.sum(b**2, axis=0)[0]

In [35]:
# This is a feasible initial point since its components add to 1 and are positive.

x_t=(1/k)*np.ones((k,1))
print('check shape of x_t:', np.shape(x_t), np.sum(x_t,axis=0))

# Make sure function works by computing f(x_t)

print('our initial value is', np.transpose(x_t)) # transpose for readability only
print('which has function value', gen_function(x_t))

check shape of x_t: (15, 1) [1.]
our initial value is [[0.06666667 0.06666667 0.06666667 0.06666667 0.06666667 0.06666667
  0.06666667 0.06666667 0.06666667 0.06666667 0.06666667 0.06666667
  0.06666667 0.06666667 0.06666667]]
which has function value 4704.590148775938


In [36]:
# Here is the gradient of the objective function

def gen_gradfun(x):
    
    gradvec = np.zeros((k,1))
    
    d=0
    
    for i in range(0,k):
        d=d + x[i]*A[:,i:(i+1)]
    d=d-taf
    
    for i in range(0,k):
        gradvec[i,:] = np.sum(2*A[:,i:(i+1)]*d, axis=0)
    return gradvec

In [37]:
s=gen_gradfun(x_t)
zero=gen_gradfun(ans)

print('grad of starting point is:',np.transpose(s)) # this is the gradient of where we begin
print('should be zeros:',np.transpose(zero)) # this should be zero if we have the right answer

grad of starting point is: [[ -5553.87228219  -5569.94891954  -5548.51811088 -30615.9706914
    2772.9344849   -5475.50047128   2817.07523245   2799.14522459
    9472.93969656   9541.15054213   9485.9454706    9441.10765977
    9456.53309987   2828.42229237  -5554.89172561]]
should be zeros: [[-2.02678189e-12 -1.81913868e-12 -1.93921923e-12 -2.57952893e-12
  -1.90159013e-12 -1.65816568e-12 -1.89724446e-12 -1.93425240e-12
  -1.87783881e-12 -1.80711866e-12 -1.97046979e-12 -1.87444958e-12
  -1.85696081e-12 -1.66810283e-12 -1.63727646e-12]]


## SLSQP

In [38]:
# These are wrappers that make our constraints and our bounds

cons = ({'type': 'eq', 'fun': lambda x:  np.sum(x,axis=0) -1},)

for i in range(0,k-1):
    cons = cons + ({'type': 'ineq', 'fun': lambda x: x[i]},)

bnds = ((0, None),)

for i in range(0,k-1):
    bnds = bnds + ((0, None),)

In [39]:
# This cell runs and times SLSQP

start = timeit.default_timer()

print(scipy.optimize.minimize(gen_function, x_t, method='SLSQP', jac=gen_gradfun, bounds=bnds, constraints=cons, tol=1e-5))

stop = timeit.default_timer()

print('Time: ', stop - start)

print('our correct answer was chosen to be', ans)

     fun: 2.3303816374487528e-07
     jac: array([-0.06178706,  0.02979919,  0.09644771, -0.05492109, -0.11486172,
       -0.04203461, -0.05020217, -0.02116906,  0.1599231 , -0.00062829,
        0.02027204,  0.09433528, -0.05170179, -0.05372333,  0.04808821])
 message: 'Optimization terminated successfully.'
    nfev: 91
     nit: 37
    njev: 37
  status: 0
 success: True
       x: array([0.09999963, 0.10000018, 0.10000058, 0.24999967, 0.04999931,
       0.09999975, 0.0499997 , 0.04999987, 0.01000096, 0.01      ,
       0.01000012, 0.01000057, 0.00999969, 0.04999968, 0.10000029])
Time:  64.7779243949326
our correct answer was chosen to be [[0.1], [0.1], [0.1], [0.25], [0.05], [0.1], [0.05], [0.05], [0.01], [0.01], [0.01], [0.01], [0.01], [0.05], [0.1]]


In [40]:
# How do we call the computed answer, x, without copy/pasting?!

x=np.array([0.09999963, 0.10000018, 0.10000058, 0.24999967, 0.04999931,
       0.09999975, 0.0499997 , 0.04999987, 0.01000096, 0.01      ,
       0.01000012, 0.01000057, 0.00999969, 0.04999968, 0.10000029])

In [41]:
# Print out the error in the worst component

np.max(abs(x-np.transpose(ans)))

9.59999999999503e-07

In [42]:
# Print out average error

(1/k)*np.sum(abs(x-np.transpose(ans)),axis=0)[0]

2.4666666666635825e-08