In [1]:
###############################################
## Numpy/PyTorch implementation of Delaporte Distribution
## See: https://github.com/cran/gamlss.dist
## See (count distributions - page197): http://www.gamlss.com/wp-content/uploads/2013/01/gamlss-manual.pdf 
##
## Author: Chris Meaney
## Date: August 2021
###############################################

In [2]:
## Dependency modules
import numpy as np
import pandas as pd
import torch
from scipy.special import gammaln
from sinfo import sinfo

In [3]:
##########################################################
## Use pandas to import data, and store as DataFrame
## Data are 1) response/target variable (number of fish = count random variable), 2) lake size (single continous feature/predictor)
##########################################################
dat = pd.read_csv('C://Users//ChristopherMeaney//Desktop//PyTorch_Stuff//pytorch_count_dists//species.csv', encoding='latin1')
dat.head(n=15)

Unnamed: 0,fish,lake,x,scale_x
0,10,5,1.609438,-1.53343
1,37,41,3.713572,-0.901903
2,60,171,5.141664,-0.473281
3,113,25719,10.154985,1.031399
4,99,59596,10.995344,1.283621
5,13,1,0.0,-2.016481
6,30,44,3.78419,-0.880708
7,114,58016,10.968474,1.275556
8,112,19477,9.87699,0.947962
9,17,10,2.302585,-1.325392


In [4]:
## Describe the data
dat.fish.describe()

count     70.000000
mean      41.742857
std       47.849609
min        5.000000
25%       14.000000
50%       21.500000
75%       47.500000
max      245.000000
Name: fish, dtype: float64

In [5]:
mu_ = dat.fish.mean()
mu_

41.74285714285714

In [6]:
sigma_ = np.sqrt(dat.fish.var())
sigma_

47.84960912241293

In [7]:
###################################################
## Numpy implementation of Delaporte Loss/Density Function
## Function is basically a Numpy implementation of Rigby et al gamlss.dist R Code (which calls tofyDEL2.c)
## https://github.com/cran/gamlss.dist/tree/master/src/tofydel2.c
###################################################
def d_DEL_np(x, mu, sigma, nu, log=True): 
    ## Determine length of data vector and parameters 
    ly = np.max(np.array([len(x), len(mu), len(sigma), len(nu)]))  
    #x = np.repeat(a=x, repeats=ly)      
    nsigma = np.repeat(a=sigma, repeats=ly)
    nmu = np.repeat(a=mu, repeats=ly)
    nnu = np.repeat(a=nu, repeats=ly)
    ## Initial vectors to store computed DEL density values
    ny = int(len(x))
    maxyp1 = np.max(x) + 1
    tofY = np.zeros(shape=(maxyp1))
    sumlty = np.zeros(shape=(ly))
    ## Compute log-prob in instance that x=0 (note: this is just kernel of density, data does not enter equation)
    logpy0 = -nmu*nnu-(1/nsigma)*(np.log(1+nmu*nsigma*(1-nnu)))
    ## Big for loop to compute DEL density (or log-density)
    ## This is directly from Rigby et al: tofyDEL2.c code.
    for i in range(1, ny+1):
        iy = x[i-1] + 1
        tofY[0] = nmu[i-1] * nnu[i-1]+nmu[i-1]*(1-nnu[i-1])/(1+nmu[i-1] * nsigma[i-1]*(1-nnu[i-1]))
        sumT = 0 
        ## Start inner loop to compute rest of DEL density
        if (x[i-1]==0):
            sumT = 0
        else:
            for j in range(1, iy):
                dum = 1 + (1/(nmu[i-1] * nsigma[i-1] * (1-nnu[i-1])))
                tofY[j] = ( (j) + (nmu[i-1] * nnu[i-1]) + 1/(nsigma[i-1] * (1 - nnu[i-1])) - (nmu[i-1] * nnu[i-1] * (j))/tofY[j-1]) / dum
                sumT = sumT + np.log(tofY[j-1])
        sumlty[i-1] = sumT
    ## Add the kernel of the DEL density back to other constant component
    logfy = logpy0 - gammaln(x+1) + sumlty
    ## log={T,F} flag: T=return log-density; F=return density
    if(log==False):
        fy = np.exp(logfy)
    else:
        fy = logfy
    ## Return log density function to user
    return fy

d_DEL_np(x=np.arange(10), mu=np.array([1]), sigma=np.array([1]), nu=np.array([0.5]), log=True)

array([-0.90546511, -1.08778666, -1.8148354 , -2.76919808, -3.8186649 ,
       -4.90299249, -5.99806526, -7.09592071, -8.19439115, -9.2929798 ])

In [8]:
###################################################
## PyTorch implementation of Delaporte Loss/Density Function
## Function is basically a Numpy implementation of Rigby et al gamlss.dist R Code (which calls tofyDEL2.c)
## https://github.com/cran/gamlss.dist/tree/master/src/tofydel2.c
###################################################
def d_DEL_th(x, mu, sigma, nu, log=True): 
    ## Determine length of data vector and parameters 
    ly = int(torch.max(torch.Tensor([len(x), len(mu), len(sigma), len(nu)])).item())
    #x = np.repeat(a=x, repeats=ly)      
    nsigma = sigma.repeat(ly)
    nmu = mu.repeat(ly)
    nnu = nu.repeat(ly)
    ## Initial vectors to store computed DEL density values
    ny = int(len(x))
    maxyp1 = x.max().item() + 1
    tofY = torch.zeros(int(maxyp1))
    sumlty = torch.zeros(ly)
    ## Compute log-prob in instance that x=0 (note: this is just kernel of density, data does not enter equation)
    logpy0 = -nmu*nnu-(1/nsigma)*(torch.log(1+nmu*nsigma*(1-nnu)))
    ## Big for loop to compute DEL density (or log-density)
    ## This is directly from Rigby et al: tofyDEL2.c code.
    for i in torch.arange(1, ny+1):
        iy = x[i.item()-1] + 1
        tofY[0] = nmu[i.item()-1] * nnu[i.item()-1]+nmu[i.item()-1]*(1-nnu[i.item()-1])/(1+nmu[i.item()-1] * nsigma[i.item()-1]*(1-nnu[i.item()-1]))
        sumT = 0 
        ## Start inner loop to compute rest of DEL density
        if (x[i.item()-1]==0):
            sumT = 0
        else:
            for j in torch.arange(1, iy):
                dum = 1 + (1/(nmu[i.item()-1] * nsigma[i.item()-1] * (1-nnu[i.item()-1])))
                tofY[j.item()] = ( (j) + (nmu[i.item()-1] * nnu[i.item()-1]) + 1/(nsigma[i.item()-1] * (1 - nnu[i.item()-1])) - (nmu[i.item()-1] * nnu[i.item()-1] * (j))/tofY[j.item()-1]) / dum
                sumT = sumT + torch.log(tofY[j.item()-1])
        sumlty[i.item()-1] = sumT
    ## Add the kernel of the DEL density back to other constant component
    logfy = logpy0 - torch.lgamma(x+1) + sumlty
    ## log={T,F} flag: T=return log-density; F=return density
    if(log==False):
        fy = torch.exp(logfy)
    else:
        fy = logfy
    ## Return log density function to user
    return fy

d_DEL_th(x=torch.arange(10), mu=torch.Tensor([1]), sigma=torch.Tensor([1]), nu=torch.Tensor([0.5]), log=True)

tensor([-0.9055, -1.0878, -1.8148, -2.7692, -3.8187, -4.9030, -5.9981, -7.0959,
        -8.1944, -9.2930])

In [9]:
## WARNING: read me... 
## Below is user-defined R implementation of Delaporte density
## We also compare against gamlss.dist::dDEL() code rolled out in gamlss.dist package
## Documentation on page 197: http://www.gamlss.com/wp-content/uploads/2013/01/gamlss-manual.pdf
## You will see that above Numpy and PyTorch implementations agree with R output (up to many decimal places)

In [10]:
'''
> ########################################################
> ## Delaporte Distribution
> ########################################################
> d_DEL <- function(x, mu=1, sigma=1, nu=0.5, log=FALSE) {
+     ## Warning messages on paramter and data space constraint violations
+     if (any(mu <= 0) )  stop(paste("mu must be greater than 0 ", "\n", "")) 
+     if (any(sigma <= 0) )  stop(paste("sigma must be greater than 0 ", "\n", "")) 
+     if (any(nu <= 0) | any(nu >= 1))  stop(paste("nu must be between 0 and 1", "\n", "")) 
+     if (any(x < 0) )  stop(paste("x must be >=0", "\n", ""))
+     ## Determine length of data vector and parameters 
+     ly <- max(length(x), length(mu), length(sigma), length(nu)) 
+     x <- rep(x, length=ly)      
+     nsigma <- rep(sigma, length=ly)
+     nmu <- rep(mu, length=ly)   
+     nnu <- rep(nu, length=ly) 
+     ## Initial vectors to store computed DEL density values
+     ny <- as.integer(length(x))
+     maxyp1 <- max(x) + 1
+     tofY <- rep(NA_real_, maxyp1)
+     sumlty <- rep(NA_real_, ly)
+     ## Compute log-prob in instance that x=0 (note: this is just kernel of density, data does not enter equation)
+     logpy0 <- -nmu*nnu-(1/nsigma)*(log(1+nmu*nsigma*(1-nnu)))
+     ## Big for loop to compute DELAPORTE density (or log-density)
+     ## This is directly from Rigby et al: tofyDEL2.c code.
+     for (i in 1:ny) {
+         iy <- x[i] + 1
+         tofY[1] <- nmu[i] * nnu[i]+nmu[i]*(1-nnu[i])/(1+nmu[i] * nsigma[i]*(1-nnu[i]))
+         sumT <- 0 
+         ## Start inner loop to compute rest of DEL density
+         if (x[i]==0) {
+             sumT <- 0
+         } else {
+             for (j in 1:(iy-1)) {
+                 dum = 1 + (1/(nmu[i] * nsigma[i] * (1-nnu[i])))
+                 tofY[j + 1] <- ( (j) + (nmu[i] * nnu[i]) + 1/(nsigma[i] * (1 - nnu[i])) - (nmu[i] * nnu[i] * (j))/tofY[j]) / dum
+                 sumT <- sumT + log(tofY[j])
+             }
+         }
+         sumlty[i] <- sumT
+     }
+     ## Add the kernel of the DEL density back to other constant component
+     logfy <-  logpy0 - lgamma(x+1) + sumlty
+     ## Log={T,F} flag: T=return log-density; F=return density
+     if(log==FALSE) {
+         fy <- exp(logfy)  
+     } else {
+         fy <- logfy
+     }
+     ## Further exception handling for whether sigma<0.0001 (small) ==> in these cases, DEL can be approx. by POIS
+     fy <- ifelse(nsigma>0.0001, fy, dpois(x, lambda=mu, log=log)) 
+     ## Return density to user
+     return(fy)
+ } 

> d_DEL(x=0:9, mu=1, sigma=1, nu=0.5, log=TRUE)
 [1]  -0.9054651  -1.0877867  -1.8148354  -2.7691981  -3.8186649  -4.9029925  -5.9980653  -7.0959207  -8.1943912  -9.2929798

> dDEL(x=0:9, mu=1, sigma=1, nu=0.5, log=TRUE)
 [1]  -0.9054651  -1.0877867  -1.8148354  -2.7691981  -3.8186649  -4.9029925  -5.9980653  -7.0959207  -8.1943912  -9.2929798

'''
;

''

In [11]:
##############################################
## Delaporte Model - try to learn MLE of fish count data; via AutoGrad/SGD implementation in PyTorch
############################################## 

In [12]:
## Instantiate data tensor, and variable for (binomial) model parameters
x = torch.autograd.Variable(torch.from_numpy(dat.fish.to_numpy())).type(torch.FloatTensor)
l_mu = torch.autograd.Variable(torch.rand(1), requires_grad=True)
l_sigma = torch.autograd.Variable(torch.rand(1), requires_grad=True) 
l_nu = torch.autograd.Variable(torch.rand(1), requires_grad=True) 

In [13]:
def del_nll(x, mu, sigma, nu): 
    ## Determine length of data vector and parameters 
    ly = int(torch.max(torch.Tensor([len(x), len(mu), len(sigma), len(nu)])).item())
    #x = np.repeat(a=x, repeats=ly)      
    nsigma = sigma.repeat(ly)
    nmu = mu.repeat(ly)
    nnu = nu.repeat(ly)
    ## Initial vectors to store computed DEL density values
    ny = int(len(x))
    maxyp1 = x.max().item() + 1
    tofY = torch.zeros(int(maxyp1))
    sumlty = torch.zeros(ly)
    ## Compute log-prob in instance that x=0 (note: this is just kernel of density, data does not enter equation)
    logpy0 = -nmu*nnu-(1/nsigma)*(torch.log(1+nmu*nsigma*(1-nnu)))
    ## Big for loop to compute DEL density (or log-density)
    ## This is directly from Rigby et al: tofyDEL2.c code.
    for i in torch.arange(1, ny+1):
        iy = x[int(i.item()-1)] + 1
        tofY[0] = nmu[int(i.item()-1)] * nnu[int(i.item()-1)]+nmu[int(i.item()-1)]*(1-nnu[int(i.item()-1)])/(1+nmu[int(i.item()-1)] * nsigma[int(i.item()-1)]*(1-nnu[int(i.item()-1)]))
        sumT = 0 
        ## Start inner loop to compute rest of DEL density
        if (x[int(i.item()-1)]==0):
            sumT = 0
        else:
            for j in torch.arange(1, iy):
                dum = 1 + (1/(nmu[int(i.item()-1)] * nsigma[int(i.item()-1)] * (1-nnu[int(i.item()-1)])))
                tofY[int(j.item())] = ( (j) + (nmu[int(i.item()-1)] * nnu[int(i.item()-1)]) + 1/(nsigma[int(i.item()-1)] * (1 - nnu[int(i.item()-1)])) - (nmu[int(i.item()-1)] * nnu[int(i.item()-1)] * (j))/tofY[int(j.item()-1)]) / dum
                sumT = sumT + torch.log(tofY[int(j.item()-1)])
        sumlty[int(i.item()-1)] = sumT
    ## Add the kernel of the DEL density back to other constant component
    logfy = logpy0 - torch.lgamma(x+1) + sumlty
    ## Return neg log lik to user
    nll = -torch.sum(logfy)
    return nll

In [14]:
torch.autograd.set_detect_anomaly(True)

## Learning rate
learning_rate_mu = 2e-5
learning_rate_sigma = 2e-5
learning_rate_nu = 2e-5

## Training loop
for t in range(25000):
    ## Backprop on negative log likelihood loss
    NLLdel = del_nll(x=x, mu=l_mu, sigma=l_sigma, nu=l_nu) 
    NLLdel.backward()
    ## Logging to console
    if t % 1000 == 0:
        print("Iteration = ", t, 
              "loglik  =", NLLpig.data.numpy(), 
              "lmu =", l_mu.data.numpy(), 
              "lsigma =", l_sigma.data.numpy(),
              "lnu =", l_nu.data.numpy(),  
              "dL/dlmu = ", l_mu.grad.data.numpy(), 
              "dL/dlsigma = ", l_sigma.grad.data.numpy(),
              "dL/dlnu = ", l_nu.grad.data.numpy()
             )
    ## SGD update of parms
    l_mu.data -= learning_rate_mu * l_mu.grad.data
    l_sigma.data -= learning_rate_sigma * l_sigma.grad.data
    l_nu.data -= learning_rate_nu * l_nu.grad.data
    ## Zero the gradients
    l_mu.grad.data.zero_()
    l_sigma.grad.data.zero_()
    l_nu.grad.data.zero_()
    

  File "C:\Users\ChristopherMeaney\anaconda3\envs\pytorch_env\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\ChristopherMeaney\anaconda3\envs\pytorch_env\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\ChristopherMeaney\anaconda3\envs\pytorch_env\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\ChristopherMeaney\anaconda3\envs\pytorch_env\lib\site-packages\traitlets\config\application.py", line 845, in launch_instance
    app.start()
  File "C:\Users\ChristopherMeaney\anaconda3\envs\pytorch_env\lib\site-packages\ipykernel\kernelapp.py", line 612, in start
    self.io_loop.start()
  File "C:\Users\ChristopherMeaney\anaconda3\envs\pytorch_env\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\ChristopherMeaney\anaconda3\envs\pytorch_env\lib\asyncio\base_events.py

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor []], which is output 0 of SelectBackward, is at version 2992; expected version 2991 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

In [None]:
########################
## Print session info to console 
########################
sinfo()