In [1]:
###############################################
## PyTorch Negative-Binomial-2 Distribution Likelihood Optimization Examples
## Author: Chris Meaney
## Date: August 2021
###############################################

In [2]:
## Dependency modules
import numpy as np
import pandas as pd
import torch
from sinfo import sinfo

In [3]:
##########################################################
## Read the species CSV into a pandas DataFrame named `dat`
## Columns used downstream: fish (count response) and scale_x (scaled lake-size predictor)
## Judging by the preview, x looks like log(lake) and scale_x a standardized x -- TODO confirm
## NOTE(review): the path is an absolute, machine-specific location; adjust before re-running elsewhere
##########################################################
dat = pd.read_csv(
    filepath_or_buffer='C://Users//ChristopherMeaney//Desktop//PyTorch_Stuff//pytorch_count_dists//species.csv',
    encoding='latin1',
)
dat.head(15)

Unnamed: 0,fish,lake,x,scale_x
0,10,5,1.609438,-1.53343
1,37,41,3.713572,-0.901903
2,60,171,5.141664,-0.473281
3,113,25719,10.154985,1.031399
4,99,59596,10.995344,1.283621
5,13,1,0.0,-2.016481
6,30,44,3.78419,-0.880708
7,114,58016,10.968474,1.275556
8,112,19477,9.87699,0.947962
9,17,10,2.302585,-1.325392


In [4]:
## Summary statistics (count, mean, std, quartiles, min/max) of the fish-count response
dat["fish"].describe()

count     70.000000
mean      41.742857
std       47.849609
min        5.000000
25%       14.000000
50%       21.500000
75%       47.500000
max      245.000000
Name: fish, dtype: float64

In [5]:
## Sample mean of the fish counts (reference value for the fitted NB2 mean)
mu_ = dat["fish"].mean()
mu_

41.74285714285714

In [6]:
## Sample standard deviation of the fish counts (square root of the pandas sample variance)
## Note: this is NOT the NB2 sigma parameter estimated further down
sigma_ = np.sqrt(dat["fish"].var())
sigma_

47.84960912241293

In [7]:
################################################
## NB2 Model
################################################

In [8]:
## Instantiate data tensor, and the (NB2) model parameters
## Note: torch.autograd.Variable is deprecated; plain tensors carry requires_grad themselves
## Note: seed the RNG so the random starting values (and hence the training log) are reproducible
torch.manual_seed(2021)
x = torch.tensor(dat.fish.to_numpy(), dtype=torch.float32)  ## fish counts as float32
l_mu = torch.rand(1, requires_grad=True)  ## log(mu), random start in [0, 1)
l_sigma = torch.rand(1, requires_grad=True)  ## log(sigma), random start in [0, 1)
## Note: nb2 mu/sigma parm are on log-scale
## Note: again, especially for sigma-parm this log-scale parameterization accelerates convergence to optima

In [9]:
def negbin2_nll(x, log_mu, log_sigma):
    """Negative log-likelihood of counts under the NB2 (mu, sigma) parameterization.

    With mu = exp(log_mu), sigma = exp(log_sigma) and k = mu / sigma, every
    observation contributes

        lgamma(x + k) - lgamma(k) - lgamma(x + 1)
        + x * log(sigma) - (x + k) * log(1 + sigma)

    to the log-likelihood; the function returns minus the sum over all x.

    Parameters
    ----------
    x : torch.Tensor
        Observed counts.
    log_mu : torch.Tensor
        Log of the mean parameter (broadcast against x).
    log_sigma : torch.Tensor
        Log of the sigma parameter (broadcast against x).

    Returns
    -------
    torch.Tensor
        0-dim tensor holding the summed negative log-likelihood.
    """
    # k = mu / sigma evaluated on the log scale: exp(log_mu) / exp(log_sigma)
    # turns into 0, inf or NaN as soon as either exponential over/underflows.
    k = torch.exp(log_mu - log_sigma)
    # log(1 + exp(log_sigma)) without forming exp(log_sigma) explicitly.
    log1p_sigma = torch.nn.functional.softplus(log_sigma)
    loglik = (
        torch.lgamma(x + k)
        - torch.lgamma(k)
        - torch.lgamma(x + 1)
        + x * log_sigma  # log(exp(log_sigma)) is just log_sigma
        - (x + k) * log1p_sigma
    )
    return -torch.sum(loglik)

In [10]:
## Learning rates (gradient-descent step size for each parameter)
learning_rate_mu = 2e-5
learning_rate_sigma = 2e-5

## Training loop: full-batch gradient descent on the NB2 negative log likelihood
for t in range(25000):
    ## Forward pass and backprop on negative log likelihood loss
    NLLnb2 = negbin2_nll(x=x, log_mu=l_mu, log_sigma=l_sigma)
    NLLnb2.backward()
    ## Logging to console
    ## Note: the value labelled "loglik" is the NEGATIVE log likelihood being minimized
    if t % 1000 == 0:
        print("Iteration = ", t,
              "loglik  =", NLLnb2.detach().numpy(),
              "lmu =", l_mu.detach().numpy(),
              "lsigma =", l_sigma.detach().numpy(),
              "dL/dlm = ", l_mu.grad.numpy(),
              "dL/dlsigma = ", l_sigma.grad.numpy())
    ## Gradient-descent update of parms
    ## no_grad() keeps the in-place step out of the autograd graph (preferred over mutating .data)
    with torch.no_grad():
        l_mu -= learning_rate_mu * l_mu.grad
        l_sigma -= learning_rate_sigma * l_sigma.grad
    ## Zero the gradients (backward() accumulates into .grad across iterations)
    l_mu.grad.zero_()
    l_sigma.grad.zero_()


Iteration =  0 loglik  = 1776.8999 lmu = [0.3750397] lsigma = [0.18744373] dL/dlm =  [-238.82716] dL/dlsigma =  [-1039.477]
Iteration =  1000 loglik  = 407.33865 lmu = [2.2235005] lsigma = [3.169487] dL/dlm =  [-74.377144] dL/dlsigma =  [-17.383003]
Iteration =  2000 loglik  = 336.21155 lmu = [3.3832092] lsigma = [3.2382736] dL/dlm =  [-32.821854] dL/dlsigma =  [0.37756348]
Iteration =  3000 loglik  = 330.85852 lmu = [3.658852] lsigma = [3.286677] dL/dlm =  [-3.7660522] dL/dlsigma =  [-3.6140747]
Iteration =  4000 loglik  = 330.58524 lmu = [3.6965358] lsigma = [3.3480875] dL/dlm =  [-0.992157] dL/dlsigma =  [-2.4197083]
Iteration =  5000 loglik  = 330.5022 lmu = [3.710931] lsigma = [3.385687] dL/dlm =  [-0.5267639] dL/dlsigma =  [-1.4239502]
Iteration =  6000 loglik  = 330.47348 lmu = [3.7191176] lsigma = [3.4078293] dL/dlm =  [-0.31253052] dL/dlsigma =  [-0.8430786]
Iteration =  7000 loglik  = 330.4636 lmu = [3.724015] lsigma = [3.4209967] dL/dlm =  [-0.18847656] dL/dlsigma =  [-0.503

In [11]:
## Fitted NB2 mean parameter: first on the log scale, then exponentiated back to the mean scale
[l_mu.detach().numpy(), np.exp(l_mu.detach().numpy())]

[array([3.7313917], dtype=float32), array([41.737152], dtype=float32)]

In [12]:
## Fitted NB2 sigma parameter: first on the log scale, then exponentiated
[l_sigma.detach().numpy(), np.exp(l_sigma.detach().numpy())]

[array([3.4406924], dtype=float32), array([31.20856], dtype=float32)]

In [13]:
## Compare against the intercept-only NB2 model fitted in R with gamlss (family "NBII"); reference output below

In [14]:
'''
> ## NBII Model
> library(gamlss)

> R_NBII <- gamlss(fish ~ 1, data = species, family = "NBII")
GAMLSS-RS iteration 1: Global Deviance = 664.0622 
GAMLSS-RS iteration 2: Global Deviance = 661.4262 
GAMLSS-RS iteration 3: Global Deviance = 660.9797 
GAMLSS-RS iteration 4: Global Deviance = 660.9224 
GAMLSS-RS iteration 5: Global Deviance = 660.9161 
GAMLSS-RS iteration 6: Global Deviance = 660.9156 
> summary(R_NBII)
******************************************************************
Family:  c("NBII", "Negative Binomial type II") 

Call:  gamlss(formula = fish ~ 1, family = "NBII", data = species) 

Fitting method: RS() 

------------------------------------------------------------------
Mu link function:  log
Mu Coefficients:
            Estimate Std. Error t value            Pr(>|t|)    
(Intercept)    3.732      0.105   35.54 <0.0000000000000002 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

------------------------------------------------------------------
Sigma link function:  log
Sigma Coefficients:
            Estimate Std. Error t value            Pr(>|t|)    
(Intercept)   3.4414     0.1899   18.12 <0.0000000000000002 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

------------------------------------------------------------------
No. of observations in the fit:  70 
Degrees of Freedom for the fit:  2
      Residual Deg. of Freedom:  68 
                      at cycle:  6 
 
Global Deviance:     660.9156 
            AIC:     664.9156 
            SBC:     669.4126 
******************************************************************
'''
;

''

In [15]:
################################
## Extend above example to Negative Binomial 2 regression
## y=Fish-count; x=Lake-size (scaled) ==> log(mu) = b0 + b1*x, where mu is the NB2 mean of y
## Goal is to estimate Negative Binomial 2 regression parms: {b0, b1}, plus the log-scale sigma parm
################################

In [16]:
## Instantiate data tensors, and the (NB2) regression parameters
## Note: torch.autograd.Variable is deprecated; plain tensors carry requires_grad themselves
## Note: seed the RNG so the random starting values of b0/b1 are reproducible
## Note: x is rebound here to the scaled lake-size feature (it held the fish counts in the intercept-only fit)
torch.manual_seed(2021)
x = torch.tensor(dat.scale_x.to_numpy(), dtype=torch.float32)  ## feature
y = torch.tensor(dat.fish.to_numpy(), dtype=torch.float32)  ## count response
b0 = torch.rand(1, requires_grad=True)  ## intercept, random start in [0, 1)
b1 = torch.rand(1, requires_grad=True)  ## slope, random start in [0, 1)
l_sigma = torch.tensor([-0.3], requires_grad=True)  ## log(sigma), fixed start

In [17]:
def negbin2_nll(x, y, b0, b1, log_sigma):
    """Negative log-likelihood of an NB2 regression with a log link on the mean.

    The mean is mu = exp(b0 + b1 * x); with sigma = exp(log_sigma) and
    k = mu / sigma, every observation contributes

        lgamma(y + k) - lgamma(k) - lgamma(y + 1)
        + y * log(sigma) - (y + k) * log(1 + sigma)

    to the log-likelihood; the function returns minus the sum over all rows.

    Note: this reuses the name of the earlier three-argument negbin2_nll and
    therefore shadows it once this cell has been run.

    Parameters
    ----------
    x : torch.Tensor
        Feature values.
    y : torch.Tensor
        Observed counts (same shape as x, or broadcastable against it).
    b0, b1 : torch.Tensor
        Intercept and slope of the linear predictor for log(mu).
    log_sigma : torch.Tensor
        Log of the sigma parameter.

    Returns
    -------
    torch.Tensor
        0-dim tensor holding the summed negative log-likelihood.
    """
    log_mu = b0 + b1 * x
    # k = mu / sigma evaluated on the log scale: exp(.) / exp(.) turns into
    # 0, inf or NaN as soon as either exponential over/underflows.
    k = torch.exp(log_mu - log_sigma)
    # log(1 + exp(log_sigma)) without forming exp(log_sigma) explicitly.
    log1p_sigma = torch.nn.functional.softplus(log_sigma)
    loglik = (
        torch.lgamma(y + k)
        - torch.lgamma(k)
        - torch.lgamma(y + 1)
        + y * log_sigma  # log(exp(log_sigma)) is just log_sigma
        - (y + k) * log1p_sigma
    )
    return -torch.sum(loglik)

In [18]:
## Learning rates (gradient-descent step size for each parameter)
learning_rate_b0 = 2e-5
learning_rate_b1 = 2e-5
learning_rate_sigma = 2e-5

## Training loop: full-batch gradient descent on the NB2 regression negative log likelihood
for t in range(20000):
    ## Forward pass and backprop on negative log likelihood loss
    NLLnb2 = negbin2_nll(x=x, y=y, b0=b0, b1=b1, log_sigma=l_sigma)
    NLLnb2.backward()
    ## Logging to console
    ## Note: the value labelled "loglik" is the NEGATIVE log likelihood being minimized
    if t % 100 == 0:
        print("Iteration = ", t,
              "loglik  =", NLLnb2.detach().numpy(),
              "b0 =", b0.detach().numpy(),
              "b1 =", b1.detach().numpy(),
              "l_sigma =", l_sigma.detach().numpy(),
              "dL/db0 = ", b0.grad.numpy(),
              "dL/db1 = ", b1.grad.numpy(),
              "dL/dlsigma = ", l_sigma.grad.numpy()
             )
    ## Gradient-descent update of parms (raw gradients, no clipping)
    ## no_grad() keeps the in-place step out of the autograd graph (preferred over mutating .data)
    with torch.no_grad():
        b0 -= learning_rate_b0 * b0.grad
        b1 -= learning_rate_b1 * b1.grad
        l_sigma -= learning_rate_sigma * l_sigma.grad
    ## Zero the gradients (backward() accumulates into .grad across iterations)
    b0.grad.zero_()
    b1.grad.zero_()
    l_sigma.grad.zero_()
    

Iteration =  0 loglik  = 2128.6384 b0 = [0.6762645] b1 = [0.69284874] l_sigma = [-0.3] dL/db0 =  [-453.98737] dL/db1 =  [-238.27551] dL/dlsigma =  [-1126.2653]
Iteration =  100 loglik  = 817.036 b0 = [1.3129431] b1 = [1.0426255] l_sigma = [1.0827036] dL/db0 =  [-236.69403] dL/db1 =  [-130.626] dL/dlsigma =  [-397.75714]
Iteration =  200 loglik  = 556.7056 b0 = [1.7145234] b1 = [1.2522092] l_sigma = [1.6305727] dL/db0 =  [-171.28198] dL/db1 =  [-80.69571] dL/dlsigma =  [-185.58948]
Iteration =  300 loglik  = 465.01147 b0 = [2.0110843] b1 = [1.3682556] l_sigma = [1.91155] dL/db0 =  [-126.3819] dL/db1 =  [-35.33545] dL/dlsigma =  [-106.31116]
Iteration =  400 loglik  = 425.17297 b0 = [2.228041] b1 = [1.4005202] l_sigma = [2.086194] dL/db0 =  [-92.439575] dL/db1 =  [0.9124756] dL/dlsigma =  [-72.29883]
Iteration =  500 loglik  = 403.65167 b0 = [2.3903387] b1 = [1.3761214] l_sigma = [2.211781] dL/db0 =  [-71.63245] dL/db1 =  [21.208862] dL/dlsigma =  [-54.650085]
Iteration =  600 loglik  = 

Iteration =  5100 loglik  = 320.83902 b0 = [3.6250246] b1 = [0.43479237] l_sigma = [3.0258038] dL/db0 =  [-0.56726074] dL/db1 =  [0.31314087] dL/dlsigma =  [-1.8023071]
Iteration =  5200 loglik  = 320.83157 b0 = [3.6261287] b1 = [0.43418458] l_sigma = [3.0293164] dL/db0 =  [-0.5378113] dL/db1 =  [0.29470825] dL/dlsigma =  [-1.7099609]
Iteration =  5300 loglik  = 320.82562 b0 = [3.627176] b1 = [0.43361202] l_sigma = [3.0326483] dL/db0 =  [-0.5099182] dL/db1 =  [0.277771] dL/dlsigma =  [-1.6224976]
Iteration =  5400 loglik  = 320.82 b0 = [3.62817] b1 = [0.4330721] l_sigma = [3.0358095] dL/db0 =  [-0.48361206] dL/db1 =  [0.2621765] dL/dlsigma =  [-1.5388489]
Iteration =  5500 loglik  = 320.81464 b0 = [3.629113] b1 = [0.4325623] l_sigma = [3.0388093] dL/db0 =  [-0.4588318] dL/db1 =  [0.24765015] dL/dlsigma =  [-1.4606323]
Iteration =  5600 loglik  = 320.81012 b0 = [3.630007] b1 = [0.4320807] l_sigma = [3.041656] dL/db0 =  [-0.43569946] dL/db1 =  [0.23410034] dL/dlsigma =  [-1.3864136]
Iter

Iteration =  10000 loglik  = 320.76868 b0 = [3.645356] b1 = [0.4240328] l_sigma = [3.0901299] dL/db0 =  [-0.05029297] dL/db1 =  [0.02441406] dL/dlsigma =  [-0.14901733]
Iteration =  10100 loglik  = 320.76862 b0 = [3.6454513] b1 = [0.42398512] l_sigma = [3.0904174] dL/db0 =  [-0.04754639] dL/db1 =  [0.02336121] dL/dlsigma =  [-0.14230347]
Iteration =  10200 loglik  = 320.7685 b0 = [3.6455467] b1 = [0.4239397] l_sigma = [3.0906975] dL/db0 =  [-0.04434204] dL/db1 =  [0.02256775] dL/dlsigma =  [-0.1350708]
Iteration =  10300 loglik  = 320.76852 b0 = [3.645635] b1 = [0.42389524] l_sigma = [3.0909598] dL/db0 =  [-0.04171753] dL/db1 =  [0.02148438] dL/dlsigma =  [-0.12982178]
Iteration =  10400 loglik  = 320.768 b0 = [3.6457064] b1 = [0.4238537] l_sigma = [3.091212] dL/db0 =  [-0.04119873] dL/db1 =  [0.02003479] dL/dlsigma =  [-0.12295532]
Iteration =  10500 loglik  = 320.76862 b0 = [3.645778] b1 = [0.42381495] l_sigma = [3.0914505] dL/db0 =  [-0.04013062] dL/db1 =  [0.01889038] dL/dlsigma = 

Iteration =  14900 loglik  = 320.76813 b0 = [3.647119] b1 = [0.4231298] l_sigma = [3.095599] dL/db0 =  [-0.00585938] dL/db1 =  [0.00213623] dL/dlsigma =  [-0.01412964]
Iteration =  15000 loglik  = 320.76846 b0 = [3.6471255] b1 = [0.42312682] l_sigma = [3.0956228] dL/db0 =  [-0.00588989] dL/db1 =  [0.00212097] dL/dlsigma =  [-0.01324463]
Iteration =  15100 loglik  = 320.76773 b0 = [3.6471317] b1 = [0.42312384] l_sigma = [3.0956466] dL/db0 =  [-0.00588989] dL/db1 =  [0.0020752] dL/dlsigma =  [-0.01269531]
Iteration =  15200 loglik  = 320.76825 b0 = [3.647138] b1 = [0.42312086] l_sigma = [3.0956705] dL/db0 =  [-0.00595093] dL/db1 =  [0.00201416] dL/dlsigma =  [-0.01223755]
Iteration =  15300 loglik  = 320.7683 b0 = [3.6471438] b1 = [0.42311788] l_sigma = [3.0956943] dL/db0 =  [-0.00592041] dL/db1 =  [0.00198364] dL/dlsigma =  [-0.01156616]
Iteration =  15400 loglik  = 320.76764 b0 = [3.64715] b1 = [0.4231149] l_sigma = [3.0957181] dL/db0 =  [-0.00601196] dL/db1 =  [0.00196838] dL/dlsigma 

Iteration =  19900 loglik  = 320.7681 b0 = [3.6471999] b1 = [0.42308417] l_sigma = [3.0958977] dL/db0 =  [-0.00588989] dL/db1 =  [0.00071716] dL/dlsigma =  [-0.00582886]


In [19]:
## Fitted NB2 regression parameters, in order: intercept b0, slope b1, log-scale sigma
[parm.detach().numpy() for parm in (b0, b1, l_sigma)]

[array([3.6471999], dtype=float32),
 array([0.42308417], dtype=float32),
 array([3.0958977], dtype=float32)]

In [20]:
## Compare above NB2 regression estimates obtained from NB2 SGD; against those from GAMLSS package 
## Note: we NEED to scale the covariate/feature vector "x" (lake-size), in order to obtain reasonable parm estimates (convergence, etc.)

In [21]:
'''
> ## NBII Model
> library(gamlss)

> R_NBII <- gamlss(fish ~ scale_x, data = species, family = "NBII")
GAMLSS-RS iteration 1: Global Deviance = 649.8994 
GAMLSS-RS iteration 2: Global Deviance = 643.0438 
GAMLSS-RS iteration 3: Global Deviance = 641.7008 
GAMLSS-RS iteration 4: Global Deviance = 641.5498 
GAMLSS-RS iteration 5: Global Deviance = 641.537 
GAMLSS-RS iteration 6: Global Deviance = 641.536 
GAMLSS-RS iteration 7: Global Deviance = 641.5359 
> summary(R_NBII)
******************************************************************
Family:  c("NBII", "Negative Binomial type II") 

Call:  gamlss(formula = fish ~ scale_x, family = "NBII", data = species) 

Fitting method: RS() 

------------------------------------------------------------------
Mu link function:  log
Mu Coefficients:
            Estimate Std. Error t value             Pr(>|t|)    
(Intercept)  3.64771    0.09587  38.049 < 0.0000000000000002 ***
scale_x      0.42279    0.09233   4.579            0.0000208 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

------------------------------------------------------------------
Sigma link function:  log
Sigma Coefficients:
            Estimate Std. Error t value            Pr(>|t|)    
(Intercept)   3.0968     0.1918   16.15 <0.0000000000000002 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

------------------------------------------------------------------
No. of observations in the fit:  70 
Degrees of Freedom for the fit:  3
      Residual Deg. of Freedom:  67 
                      at cycle:  7 
 
Global Deviance:     641.5359 
            AIC:     647.5359 
            SBC:     654.2813 
******************************************************************
'''
;

''

In [22]:
########################
## Print session info to console
## Records the package, Jupyter, Python and OS versions of this session (see the output below)
########################
sinfo()

-----
numpy       1.20.3
pandas      1.3.1
sinfo       0.3.1
torch       1.9.0
-----
IPython             7.26.0
jupyter_client      6.1.12
jupyter_core        4.7.1
jupyterlab          3.1.7
notebook            6.4.3
-----
Python 3.9.6 (default, Aug 18 2021, 15:44:49) [MSC v.1916 64 bit (AMD64)]
Windows-10-10.0.19042-SP0
8 logical CPU cores, Intel64 Family 6 Model 126 Stepping 5, GenuineIntel
-----
Session information updated at 2021-08-21 02:13
