# Stable-beta Indian Buffet Process

### Loading packages

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
import scipy as sp
from scipy.io import mmread
%matplotlib inline

### Reading data

# Bobby's changes start here

In [3]:
test_mat = mmread("../data/nipspapersmatrix.mtx")

In [4]:
type(test_mat)

scipy.sparse.coo.coo_matrix

In [5]:
test_mat.data = np.ones(test_mat.data.size) # change all non-zero values to 1

In [6]:
col_sums = test_mat.sum(axis=0)

In [7]:
col_sums = np.asarray(col_sums)

In [8]:
col_sums = col_sums.flatten() # Col sums ie number of documents where a given word appears

In [9]:
col_sums

array([1.701e+03, 3.767e+03, 5.792e+03, ..., 1.000e+00, 1.000e+00,
       1.000e+00])

In [10]:
col_sums.shape

(323093,)

In [14]:
type(col_sums[0]) # careful as col_sums has float values not integers

numpy.float64

In [15]:
col_sums.astype(int)

array([1701, 3767, 5792, ...,    1,    1,    1])

In [12]:
len(col_sums)

323093

In [13]:
sum(col_sums==0)

0

# Bobby's changes finished

### Writing likelihood function

In [302]:
#' Fit parameters by maximum likelihood
#'
#' Calculate the log-likelihood
#'
#' @param Z matrix whose rows correspond to customers and columns to dishes
#' @param alpha mass parameter
#' @param c concentration parameter
#' @param sigma stability exponent
#' @return The log-likelihood of binary matrices Z1,...,Zn


# modified to make calculations easier by using log gamma for the second term, but for the first term we need gamma
# however gamma gives very large values that cannot be stored

def L(Z, alpha, c, sigma):
  n = Z.shape[0] # number of rows
  vector = np.array(range(1,n+1))  # need to say 1, n+1 to get 1:n
  exponent_vec = (sp.special.gamma(1 + c) * sp.special.gamma(vector - 1 + c + sigma)) / (sp.special.gamma(vector + c) * sp.special.gamma(c + sigma))
  m = np.asarray(Z.sum(axis=0)).flatten()  # sum of columns
  K = len(m)
  prod_vec = (sp.special.loggamma(m - sigma) + sp.special.loggamma(n - m + c + sigma) + sp.special.loggamma(1 + c)) - (sp.special.loggamma(1 - sigma) + sp.special.loggamma(c + sigma) + sp.special.loggamma(n + c))
  loglikelihood = (-alpha * sum(exponent_vec)) + sum(prod_vec) + K* np.log(alpha)
  return(loglikelihood)

In [303]:
# mp gamma that deals with large number doesnt accept an array as input so need to include a loop

def loglik(Z, alpha, c, sigma):
  n = Z.shape[0] # number of rows
  exponent_vec = np.zeros(n)
  for i in range(1, n+1): # need to say 1, n+1 to get 1:n
        exponent_vec[i-1] = (mp.gamma(1 + c) * mp.gamma(i - 1 + c + sigma)) / (mp.gamma(i + c) * mp.gamma(c + sigma))
  m = np.asarray(Z.sum(axis=0)).flatten()  # sum of columns
  K = len(m)
  prod_vec = (sp.special.loggamma(m - sigma) + sp.special.loggamma(n - m + c + sigma) + sp.special.loggamma(1 + c)) - (sp.special.loggamma(1 - sigma) + sp.special.loggamma(c + sigma) + sp.special.loggamma(n + c))
  loglikelihood = (-alpha * sum(exponent_vec)) + sum(prod_vec) + K* np.log(alpha)
  return(loglikelihood)

In [306]:
# Easy 4x4 example to check that code works

c=1
alpha=1
sigma=0.5

Z= np.array([[ 0.,  1.,  0.,  1.],
       [ 0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  1.],
       [ 0.,  0.,  1.,  0.]])

loglik(Z, 1, 1, 0.5)
L(Z, 1, 1, 0.5)

(-7.7927508603023306+0j)

In [288]:
Z = dense_mat
c=1
alpha=1
sigma=0.5

loglik(Z, 1, 1, 0.5)

(-15537511.598+0j)


### Maximise log likelihood 

In [289]:
from scipy.optimize import minimize

In [307]:
def negloglik(param):
  alpha = param[0]
  c = param[1]
  sigma = param[2]
  Z = dense_mat
  n = Z.shape[0] # number of rows
  exponent_vec = np.zeros(n)
  for i in range(1, n+1): # need to say 1, n+1 to get 1:n
        exponent_vec[i-1] = (mp.gamma(1 + c) * mp.gamma(i - 1 + c + sigma)) / (mp.gamma(i + c) * mp.gamma(c + sigma))
  m = np.asarray(Z.sum(axis=0)).flatten()  # sum of columns
  K = len(m)
  prod_vec = (sp.special.loggamma(m - sigma) + sp.special.loggamma(n - m + c + sigma) + sp.special.loggamma(1 + c)) - (sp.special.loggamma(1 - sigma) + sp.special.loggamma(c + sigma) + sp.special.loggamma(n + c))
  loglikelihood = (-alpha * sum(exponent_vec)) + sum(prod_vec) + K* np.log(alpha)
  return(-loglikelihood)

In [295]:
initial = np.array([1, 1, 0.5])

In [329]:
result = minimize(negloglik, initial, bounds=((0, None), (None, None), (0, 1)))

  grad[k] = (f(*((xk + d,) + args)) - f0) / d[k]
  isave, dsave, maxls)


In [335]:
result

      fun: (nan+nan*j)
 hess_inv: <3x3 LbfgsInvHessProduct with dtype=float64>
      jac: array([ -166529.41703796,  -110765.7328248 , -2383899.6887207 ])
  message: b'ABNORMAL_TERMINATION_IN_LNSRCH'
     nfev: 168
      nit: 1
   status: 2
  success: False
        x: array([ 1.93793088,  1.34682223,  0.50000145])