In [1]:
import numpy as np
import scipy as scipy
from scipy.optimize import minimize
import timeit

# Try Solving 4 Ancestry problem, multiple SNPs

In this notebook we attempt to solve the following constrained, quadratic optimization problem:

$$\min_{\pi \in \mathbb{R}^4} f(\pi)=\sum_{i=1}^{N}(a_{1,i}\pi_1+a_{2,i}\pi_2+a_{3,i}\pi_3+a_{4,i}\pi_4-\tilde{a}_i)^2$$

$$\text{subject to:} \quad \pi_1+\pi_2+\pi_3+\pi_4=1 \quad \pi_1\geq 0 \quad \pi_2 \geq 0 \quad \pi_3 \geq 0 \quad \pi_4 \geq 0,$$

where $a_{j,i} \in \mathbb{R}$, $j=1,2,3,4$, $i=1,\ldots,N$, and $\tilde{a}_i \in \mathbb{R}$, $i=1,\ldots,N$, are quantities obtained from a genetics simulation. The $a_{1,i}$'s correspond to the observed allele frequency in ancestry 1; the $a_{2,i}$'s correspond to the observed allele frequency in ancestry 2; the $a_{3,i}$'s correspond to the observed allele frequency in ancestry 3; the $a_{4,i}$'s correspond to the observed allele frequency in ancestry 4; the $\tilde{a}_i$'s correspond to the observed total allele frequency in the population. Here, $N$ is the number of SNPs.

In [3]:
import os
import pandas as pd

### where the simulation output lives, and which file to load from it
sims_dir = '/nfs/storage/math/gross-s2/projects/mixtures/example_sims/'
sims_file = "Afr_CEU_sas_eas_10000tot_2500Afr_2500sas_2500eas_sims_and_reference.txt"

### switch the working directory so the file can be opened by its bare name
os.chdir(sims_dir)

### load the tab-separated table into a DataFrame
ev = pd.read_csv(sims_file, sep='\t')

### preview the first 5 rows
ev.head(n=5)

Unnamed: 0,CHR,SNP,CEU_MAF,afr_MAF,sas_MAF,eas_MAF,af
0,1,rs1000364,0.3939,0.070438,0.423307,0.384912,0.32305
1,1,rs1002655,0.3434,0.230153,0.468285,0.350196,0.34735
2,1,rs1008082,0.2828,0.082341,0.236197,0.112097,0.18235
3,1,rs10082057,0.0,0.181558,0.0,0.0,0.0463
4,1,rs10082123,0.1515,0.230183,0.220848,0.227184,0.2108


### For the data set we are considering here, the "answer" is $\pi_1=\pi_2=\pi_3=\pi_4=0.25.$

In [13]:
# Pull the allele-frequency columns out of `ev`, one Series per column:
#   a_1 <- CEU_MAF, a_2 <- afr_MAF, a_3 <- sas_MAF, a_4 <- eas_MAF,
#   a_t <- af (the total allele frequency used as the fitting target).
freq_columns = ('CEU_MAF', 'afr_MAF', 'sas_MAF', 'eas_MAF', 'af')
a_1, a_2, a_3, a_4, a_t = (ev[column] for column in freq_columns)

# One entry per SNP, so the length of any column is the SNP count.
print('number of SNPs is', len(a_t))

number of SNPs is 61857


In [14]:
# This is the objective function!

def function(x):
    """Sum-of-squares objective f(pi) for the 4-ancestry mixture problem.

    Reads the module-level frequency vectors ``a_1``..``a_4`` and ``a_t``.

    Parameters
    ----------
    x : sequence indexable at positions 0..3
        Candidate ancestry proportions (pi_1, pi_2, pi_3, pi_4).

    Returns
    -------
    Scalar: the sum over SNPs of the squared difference between the
    pi-weighted combination of ``a_1``..``a_4`` and ``a_t``.
    """
    mixture = a_1*x[0]+a_2*x[1]+a_3*x[2]+a_4*x[3]
    residual = mixture-a_t
    return np.sum(residual**2, axis=0)

In [15]:
# Starting point for the solver. It is feasible: the components sum to 1
# and every component is non-negative.

x_t = np.array([0.5, 0.5, 0.0, 0.0])

# Sanity check: the objective should evaluate without error at x_t.

print(function(x_t))

122.44679373681608


In [16]:
# Here is the gradient of the objective function

def gradfun(x):
    """Gradient of the sum-of-squares objective with respect to pi.

    Reads the module-level frequency vectors ``a_1``..``a_4`` and ``a_t``.

    Parameters
    ----------
    x : sequence indexable at positions 0..3
        Candidate ancestry proportions (pi_1, pi_2, pi_3, pi_4).

    Returns
    -------
    numpy.ndarray with 4 entries; entry j is
    ``sum_i 2 * a_{j,i} * (a_{1,i} x[0] + a_{2,i} x[1] + a_{3,i} x[2] + a_{4,i} x[3] - a_t_i)``.
    """
    # Every partial derivative shares the same residual, so evaluate it once
    # instead of once per component.
    residual = a_1*x[0]+a_2*x[1]+a_3*x[2]+a_4*x[3]-a_t
    return np.array([np.sum(2*a_j*residual, axis=0) for a_j in (a_1, a_2, a_3, a_4)])

In [17]:
# Compare the gradient at the starting point with the gradient at the
# equal-proportions point (0.25, 0.25, 0.25, 0.25).
pi_equal = np.full(4, 0.25)
print(gradfun(x_t), gradfun(pi_equal))

[  38.07593473  394.85521771 -183.12325218 -361.33064559] [-0.42752437 -0.25613023 -0.27734046 -0.36168826]


## SLSQP

In [11]:
# Equality constraint: the four proportions must sum to 1.
sum_to_one = {'type': 'eq', 'fun': lambda x:  x[0] + x[1] + x[2] + x[3] -1}

# Inequality constraints: each proportion must be >= 0. The index is bound
# through a default argument so each lambda keeps its own component.
non_negative = tuple({'type': 'ineq', 'fun': lambda x, k=k: x[k]} for k in range(4))

cons = (sum_to_one,) + non_negative

# Box bounds: lower bound 0 and no upper bound on every component.
bnds = ((0, None),) * 4

In [12]:
# Time only the solver call: the clock is stopped before anything is
# printed, so output formatting does not inflate the measurement.
start = timeit.default_timer()

# Keep the result object so it can be inspected after this cell runs.
slsqp_result = minimize(function, x_t, method='SLSQP', jac=gradfun, bounds=bnds, constraints=cons, tol=1e-10)

stop = timeit.default_timer()

print(slsqp_result)

print('Time: ', stop - start)

     fun: 0.2792200117148072
     jac: array([-0.37543218, -0.37542445, -0.37543093, -0.37543524])
 message: 'Optimization terminated successfully.'
    nfev: 13
     nit: 8
    njev: 8
  status: 0
 success: True
       x: array([0.25022466, 0.24995787, 0.24973761, 0.25007985])
Time:  0.05441038706339896
