In [52]:
import numpy as np

In [53]:
# Each column of `obs` is one possible observation of the binary
# nucleotides associated with the four species (4 rows, 16 columns).
#
# Together the columns enumerate the full set {-1, +1}^4.

obs = np.array([[-1, -1, -1, -1, -1, -1, -1, -1, +1, +1, +1, +1, +1, +1, +1, +1],
                [-1, -1, -1, -1, +1, +1, +1, +1, -1, -1, -1, -1, +1, +1, +1, +1],
                [-1, -1, +1, +1, -1, -1, +1, +1, -1, -1, +1, +1, -1, -1, +1, +1],
                [-1, +1, -1, +1, -1, +1, -1, +1, -1, +1, -1, +1, -1, +1, -1, +1]])

# Adjust the following parameters before computing the mean and covariance.
#
# No `global` declarations are needed here: names assigned at the top
# level of a cell are already module-level, so later cells can read them.

tau = 0.5  # \theta_{1} * \theta_{5} * \theta_{2}

k = 300  # length of DNA sequence

In [54]:
# Assume our observations are k i.i.d. draws X_1,...,X_k, each taking
# its value from the columns of obs.
#
# Also assume \theta_{3} = \theta_{4} = 0.
#
# q_j = P(X_i = obs[:, j]), i.e. q_j is the probability of observing
# column j of obs:
#
#     q_j = (1 / 16) * (1 + obs[0, j] * obs[1, j] * tau)

# Vectorised over all columns at once; obs.shape[1] is the number of
# possible observations (16 for the obs matrix defined above).
q = (1 / obs.shape[1]) * (1 + obs[0] * obs[1] * tau)

# q must be a probability vector; negative entries appear if tau lies
# outside [-1, 1].
assert np.all(q >= 0) and np.isclose(q.sum(), 1.0), "q is not a probability vector"

In [55]:
# Let Y_i = e_j \in \mathbb{R}^16 iff X_i = obs[:, j], where
# e_1,...,e_16 are the standard basis vectors, and define the
# frequency vector F_k = Y_1 + ... + Y_k.
#
# F_k follows a multinomial distribution with k trials and event
# probabilities q. By the Central Limit Theorem it can be approximated
# by a multivariate Gaussian with mean k * \mu and covariance k * \Sigma,
# where \mu = q is the mean vector and \Sigma = diag(q) - q q^T is the
# covariance matrix of a single indicator vector Y_i.
#
# see https://en.wikipedia.org/wiki/Multinomial_distribution#Properties
# and https://en.wikipedia.org/wiki/Central_limit_theorem#Multidimensional_CLT
#
# We now compute the mean and covariance of the approximating Gaussian.

single_trial_cov = np.diag(q) - np.outer(q, q)  # \Sigma

mean = q * k  # k * \mu: mean vector of the multivariate Gaussian
cov = single_trial_cov * k  # k * \Sigma: cov matrix of the multivariate Gaussian

print(mean, "\n")
print(cov)

# NOTE(review): presumably the printed arrays are meant to be copied and
# pasted elsewhere, wrapped in np.array(...) -- confirm intended use.

[28.125 28.125 28.125 28.125  9.375  9.375  9.375  9.375  9.375  9.375
  9.375  9.375 28.125 28.125 28.125 28.125] 

[[25.48828125 -2.63671875 -2.63671875 -2.63671875 -0.87890625 -0.87890625
  -0.87890625 -0.87890625 -0.87890625 -0.87890625 -0.87890625 -0.87890625
  -2.63671875 -2.63671875 -2.63671875 -2.63671875]
 [-2.63671875 25.48828125 -2.63671875 -2.63671875 -0.87890625 -0.87890625
  -0.87890625 -0.87890625 -0.87890625 -0.87890625 -0.87890625 -0.87890625
  -2.63671875 -2.63671875 -2.63671875 -2.63671875]
 [-2.63671875 -2.63671875 25.48828125 -2.63671875 -0.87890625 -0.87890625
  -0.87890625 -0.87890625 -0.87890625 -0.87890625 -0.87890625 -0.87890625
  -2.63671875 -2.63671875 -2.63671875 -2.63671875]
 [-2.63671875 -2.63671875 -2.63671875 25.48828125 -0.87890625 -0.87890625
  -0.87890625 -0.87890625 -0.87890625 -0.87890625 -0.87890625 -0.87890625
  -2.63671875 -2.63671875 -2.63671875 -2.63671875]
 [-0.87890625 -0.87890625 -0.87890625 -0.87890625  9.08203125 -0.29296875
  -0.29296875

In [70]:
# We can now generate data by sampling a point from this Gaussian
# approximation; re-run this cell to draw a new point.
#
# Set `sample_seed` to an integer to make the draw reproducible. With
# None, the generator is seeded from fresh entropy on every run, so each
# execution gives a different sample.

sample_seed = None
sample_rng = np.random.default_rng(sample_seed)

freq = sample_rng.multivariate_normal(mean, cov)
print(freq)

# freq gives the (approximate) frequency of each observation. Its
# components are real-valued, so in practice they would need to be
# rounded.

[29.48736175 23.34192884 44.96181194 23.30892565  9.74110754  8.7702922
  7.01824582 10.30910884  4.73705517  4.84297296  9.66284929  3.37115752
 35.75999972 26.78617761 29.87520582 28.02579948]
