In [14]:
import pandas as pd
import numpy as np
import tensorflow as tf
import scipy
import time

In [2]:
training2010 = pd.read_csv('../data/merged_wp_census_data2_081122.csv')
training2010=training2010.fillna(0)
county_adj = pd.read_csv('../data/countyadj2.csv', index_col = 0)

In [3]:
## This function was taken from online
# Generate samples from a multi-variate normal distribution with provided precision matrix WITHOUT inverting
def mv_normal_sample(mu=0, precision_matrix=None, num_models=1):

    # Precision matrix must be a square matrix
    assert precision_matrix.shape[0] == precision_matrix.shape[1], 'Precision matrix must be a square matrix'

    dim = precision_matrix.shape[0]

    chol_U = scipy.linalg.cholesky(precision_matrix, lower=False)

    # Create num_models iid standard normal vectors
    z_vector_matrix = np.random.normal(loc=0, scale=1, size=[num_models, dim])

    # Sample from the MV normal with precision matrix by solving the Cholesky decomp for each normal vector
    samples = np.squeeze(np.array(
        [scipy.linalg.solve_triangular(a=chol_U, b=z_vector_matrix[i, :], unit_diagonal=False) + mu for i in
         range(num_models)]))

    return (np.transpose(samples))

In [4]:
nchain = 5
tau2 = 100
rho = 0.3

Q = (1/tau2)*(np.diag(county_adj.sum(axis=1)) - rho*county_adj)
Q = tf.constant(Q, dtype = tf.float32)

init_state = tf.constant(np.array([mv_normal_sample(precision_matrix = Q, num_models = 3) for i in range(nchain)]),
                        dtype = tf.float32)

This gradient below works fine since it uses the tensorflow mean function

In [5]:
phi = init_state[0,:,:]
tf.math.reduce_sum(phi, 1)

<tf.Tensor: shape=(3064,), dtype=float32, numpy=
array([11.174502 , -2.774537 ,  4.1983414, ...,  1.0655003,  2.7233696,
        1.2552919], dtype=float32)>

In [6]:
with tf.GradientTape() as g:
  g.watch(phi)
  y = tf.reduce_mean(phi)
dy_dx = g.gradient(y,phi)
print(dy_dx)

tf.Tensor(
[[0.00010879 0.00010879 0.00010879]
 [0.00010879 0.00010879 0.00010879]
 [0.00010879 0.00010879 0.00010879]
 ...
 [0.00010879 0.00010879 0.00010879]
 [0.00010879 0.00010879 0.00010879]
 [0.00010879 0.00010879 0.00010879]], shape=(3064, 3), dtype=float32)


This gradient below does not work because it uses the numpy mean function, even though it converts the answer to a tf.Tensor.

In [7]:
with tf.GradientTape() as g:
  g.watch(phi)
  y = tf.constant(np.mean(phi))
dy_dx = g.gradient(y,phi)
print(dy_dx)

None


Below is the loss function that I want to use for an MCMC sampler. This function is the log likelihood of my model. Since the operations are not written in tensorflow the gradient doesn't work. I want to know how to implement this in tensorflow.

In [12]:
models = ['acs', 'pep', 'worldpop']
def target_log_prob_fn_CAR_old(phi):   
    ll_chains = []
    tmp = tf.Variable(0.)
    for chain in range(phi.shape[0]):
        phi_chain = phi[chain,:,:]
        # (1) Prob of the CAR random effect values
        ll = -0.5*sum(np.diagonal(np.transpose(phi_chain) @ Q @ phi_chain))

        # (2) Prob of observed data points
        exp_phi = np.exp(phi_chain)
        u = [exp_phi[i,:]/np.sum(exp_phi[i,:]) for i in range(exp_phi.shape[0])]
        tmp = training2010[models].values*u
        n = tmp.sum(axis=1)
        ll = ll + np.sum(training2010['census']*np.log(n) - n)
        ll_chains.append(ll)
        tmp = tmp + ll
    
    print(ll_chains)
    #return tf.reduce_mean(ll_chains)
    return(tmp)

def target_log_prob_fn_CAR(phi):
    
    ll = tf.Variable(0.)
    for chain in range(phi.shape[0]):
        # (1) Prob of the CAR random effect values
        ll_chain = -0.5*tf.reduce_mean(tf.linalg.diag_part(
            tf.linalg.matmul(phi[chain,:,:],tf.linalg.matmul(Q, phi[chain,:,:]), transpose_a = True))) 
        ll = ll + ll_chain
        
        # (2) Prob of observed data points
        exp_phi = tf.math.exp(phi[chain,:,:])
        u = [exp_phi[i,:]/tf.reduce_sum(exp_phi[i,:]) for i in range(exp_phi.shape[0])]
        tmp = training2010[models].values*u
        n = tmp.sum(axis=1)
        ll = ll + np.sum(training2010['census']*np.log(n) - n)

    return(ll)

target_log_prob_fn_CAR(init_state)

with tf.GradientTape() as g:
  g.watch(init_state)
  y = target_log_prob_fn_CAR(init_state)
dy_dx = g.gradient(y, init_state)
#print(dy_dx)

In [98]:
def target_log_prob_fn_CAR_test1(phi):
    
    ll = tf.Variable(0.)
    for chain in range(phi.shape[0]):
        # (1) Prob of the CAR random effect values
        ll_chain = -0.5*tf.reduce_mean(tf.linalg.diag_part(
            tf.linalg.matmul(phi[chain,:,:],tf.linalg.matmul(Q, phi[chain,:,:]), transpose_a = True))) 
        ll = ll + ll_chain

    return(ll)

def target_log_prob_fn_CAR_test2(phi):
    
    ll = tf.Variable(0.)
    for chain in range(phi.shape[0]):
        # (2) Prob of observed data points
        exp_phi = tf.math.exp(phi[chain,:,:])
        u = [exp_phi[i,:]/tf.reduce_sum(exp_phi[i,:]) for i in range(exp_phi.shape[0])]
        tmp = training2010[models].values*u
        n = tmp.sum(axis=1)
        ll = ll + np.sum(training2010['census']*np.log(n) - n)

    return(ll)

def target_log_prob_fn_CAR_test3(phi):
    
    ll = tf.Variable(0.)
    for chain in range(phi.shape[0]):
        # (2) Prob of observed data points
        exp_phi = tf.math.exp(phi[chain,:,:])
        u = [exp_phi[i,:]/tf.reduce_sum(exp_phi[i,:]) for i in range(exp_phi.shape[0])]
        #tmp = training2010[models].values*u
        #n = tmp.sum(axis=1)
        #ll = ll + np.sum(training2010['census']*np.log(n) - n)

    return(ll)

def target_log_prob_fn_CAR_test4(phi):
    
    # get exponentiated values and sum across models
    exp_phi = tf.math.exp(phi)
    exp_phi_rows = tf.reduce_sum(exp_phi, 2)
    
    # get model weights and calculate mean estimate
    u = exp_phi/exp_phi_rows[...,None]
    n = tf.reduce_sum(training2010[models].values*u, axis = 2)
    
    # update the log likelihood 
    ll = ll + tf.reduce_sum([np.sum(training2010['census']*np.log(n[chain,:]) - n[chain,:]) for chain in range(phi.shape[0])])
    
    return(ll)


In [99]:
t0 = time.perf_counter()
a = target_log_prob_fn_CAR_test2(init_state)
print(time.perf_counter() - t0)

t0 = time.perf_counter()
b = target_log_prob_fn_CAR_test3(init_state)
print(time.perf_counter() - t0)

t0 = time.perf_counter()
c = target_log_prob_fn_CAR_test4(init_state)
print(time.perf_counter() - t0)


6.912102099999174
6.866808200000378
0.012330500001553446


In [100]:
print(a)
print(b)
print(c)

tf.Tensor(17883271000.0, shape=(), dtype=float32)
<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.0>
tf.Tensor(17883271000.0, shape=(), dtype=float32)


In [70]:
exp_phi = tf.math.exp(init_state)
exp_phi_rows = tf.reduce_sum(exp_phi, 2)
t0 = time.perf_counter()
u1 = exp_phi/exp_phi_rows[...,None]
print(time.perf_counter() - t0)
t0 = time.perf_counter()
u = [exp_phi[0,i,:]/exp_phi_rows[0, i] for i in range(exp_phi.shape[1])]
print(time.perf_counter() - t0)

0.0007726000003458466
1.1622198000004573


In [71]:
tmp = training2010[models].values*u
tmp2 = training2010[models].values*u1[0,:,:]
tmp3 = training2010[models].values*u1

In [91]:
tf.reduce_sum(tmp2, axis = 1)

<tf.Tensor: shape=(3064,), dtype=float32, numpy=
array([ 53155.723, 176339.84 ,  27698.83 , ...,  20967.383,   8271.89 ,
         7212.482], dtype=float32)>

In [92]:
tf.reduce_sum(tmp3, axis = 2)

<tf.Tensor: shape=(5, 3064), dtype=float32, numpy=
array([[ 53155.723 , 176339.84  ,  27698.83  , ...,  20967.383 ,
          8271.89  ,   7212.482 ],
       [ 54772.88  , 176398.98  ,  27696.695 , ...,  20636.373 ,
          8521.852 ,   7093.7915],
       [ 55894.582 , 175792.12  ,  27691.943 , ...,  20825.73  ,
          8271.004 ,   7191.672 ],
       [ 53452.742 , 175886.16  ,  27395.645 , ...,  20537.242 ,
          8527.322 ,   7070.585 ],
       [ 54672.77  , 175871.31  ,  27698.883 , ...,  20967.684 ,
          8511.356 ,   7079.3257]], dtype=float32)>

In [85]:
print(tmp.shape)
print(tmp2.shape)
print(tmp3.shape)

(3064, 3)
(3064, 3)
(5, 3064, 3)


So clearly almost all of the time comes from the line calculating u. There is some extra time elsewhere but I just gotta solve that first. 
The reduce sum line is the bulk of it but the rest takes work too.

In [40]:
print(init_state.shape)
test = tf.reduce_sum(init_state, 2)
test2 = tf.reduce_sum(init_state[1,0,:])
print(test.shape)
print(test)
print(test2)

(5, 3064, 3)
(5, 3064)
tf.Tensor(
[[ 11.174502    -2.774537     4.1983414  ...   1.0655003    2.7233696
    1.2552919 ]
 [  1.4182105   -3.418344     3.8420029  ...   5.8528194   -2.1113777
   -2.4307096 ]
 [-13.023096    -7.9385347   -7.853675   ...   5.96973     10.176804
    0.7982836 ]
 [  7.277014   -10.823961     0.4170308  ...   5.4644723    1.9503944
  -10.522987  ]
 [  1.4727321   -4.1790075   -6.7124834  ... -16.180014     7.406987
   -0.79876614]], shape=(5, 3064), dtype=float32)
tf.Tensor(1.4182105, shape=(), dtype=float32)


In [42]:
exp_phi = tf.math.exp(init_state)
exp_phi

<tf.Tensor: shape=(5, 3064, 3), dtype=float32, numpy=
array([[[1.78468496e+04, 7.05183220e+00, 5.66449702e-01],
        [1.86681099e+01, 1.51097155e+00, 2.21145060e-03],
        [1.02079260e+03, 1.94553822e-01, 3.35227162e-01],
        ...,
        [3.67204142e+00, 1.29364824e+01, 6.10966012e-02],
        [1.11567590e+03, 3.84107661e+00, 3.55429389e-03],
        [7.61305261e-03, 6.26841259e+00, 7.35275269e+01]],

       [[8.93869340e-01, 1.17332617e+04, 3.93757044e-04],
        [9.48317432e+00, 8.55378449e-01, 4.03942866e-03],
        [3.05430603e+02, 6.55627623e-02, 2.32804227e+00],
        ...,
        [3.42543068e+01, 5.78476238e+00, 1.75730228e+00],
        [7.19081610e-02, 1.25822008e+00, 1.33815253e+00],
        [8.31700420e+00, 2.21257830e+00, 4.78069112e-03]],

       [[6.14739547e-04, 1.01133566e-02, 3.55267137e-01],
        [3.75151787e+01, 2.70392303e-03, 3.51671316e-03],
        [1.79270709e+00, 2.87253875e-02, 7.54079409e-03],
        ...,
        [2.69139500e+01, 2.902635