In [1]:
###############################################
## PyTorch Negative-Binomial-1 Distribution Likelihood Optimization Examples
## Author: Chris Meaney
## Date: August 2021
###############################################

In [2]:
## Dependency modules
import numpy as np
import pandas as pd
import torch
from sinfo import sinfo

In [3]:
##########################################################
## Use pandas to import data, and store as data.frame
## Data are 1) response/target variable (number of fish = count random variable), 2) lake size (single continous feature/predictor)
##########################################################
dat = pd.read_csv('C://Users//ChristopherMeaney//Desktop//PyTorch_Stuff//pytorch_count_dists//species.csv', encoding='latin1')
dat.head(n=15)

Unnamed: 0,fish,lake,x,scale_x
0,10,5,1.609438,-1.53343
1,37,41,3.713572,-0.901903
2,60,171,5.141664,-0.473281
3,113,25719,10.154985,1.031399
4,99,59596,10.995344,1.283621
5,13,1,0.0,-2.016481
6,30,44,3.78419,-0.880708
7,114,58016,10.968474,1.275556
8,112,19477,9.87699,0.947962
9,17,10,2.302585,-1.325392


In [4]:
## Describe the data
dat.fish.describe()

count     70.000000
mean      41.742857
std       47.849609
min        5.000000
25%       14.000000
50%       21.500000
75%       47.500000
max      245.000000
Name: fish, dtype: float64

In [5]:
mu_ = dat.fish.mean()
mu_

41.74285714285714

In [6]:
sigma_ = np.sqrt(dat.fish.var())
sigma_

47.84960912241293

In [7]:
################################################
## NB1 Model
################################################

In [8]:
## Instantiate data tensor, and variable for (NB1) model parameters
x = torch.autograd.Variable(torch.from_numpy(dat.fish.to_numpy())).type(torch.FloatTensor)
l_mu = torch.autograd.Variable(torch.rand(1), requires_grad=True)
l_sigma = torch.autograd.Variable(torch.rand(1), requires_grad=True) 
## Note: nb1 mean/sigma parm are on log-scale
## Note: this seems to accelerate convergence; especially for sigma-parm

In [9]:
def negbin1_nll(x, log_mu, log_sigma):
    nll = (
    -torch.sum( torch.lgamma(x + 1/torch.exp(log_sigma)) 
    - torch.lgamma(1/torch.exp(log_sigma)) 
    - torch.lgamma(x+1) 
    + x*torch.log((torch.exp(log_sigma)*torch.exp(log_mu))/(1 + torch.exp(log_sigma)*torch.exp(log_mu))) 
    + (1/torch.exp(log_sigma))*torch.log(1/(1 + torch.exp(log_sigma)*torch.exp(log_mu))) )
    )
    return nll

In [10]:
## Learning rate
learning_rate_mu = 2e-5
learning_rate_sigma = 2e-5

## Training loop
for t in range(20000):
    ## Backprop on negative log likelihood loss
    #NLLnb1 = -torch.sum( torch.lgamma(xnb1 + 1/torch.exp(l_snb1)) 
    #                    - torch.lgamma(1/torch.exp(l_snb1)) 
    #                    - torch.lgamma(xnb1+1) 
    #                    + xnb1*torch.log((torch.exp(l_snb1)*torch.exp(l_mnb1))/(1 + torch.exp(l_snb1)*torch.exp(l_mnb1))) 
    #                    + (1/torch.exp(l_snb1))*torch.log(1/(1 + torch.exp(l_snb1)*torch.exp(l_mnb1))) )
    NLLnb1 = negbin1_nll(x=x, log_mu=l_mu, log_sigma=l_sigma)
    NLLnb1.backward()
    ## Logging to console
    if t % 1000 == 0:
        print("Iteration = ", t, 
              "loglik  =", NLLnb1.data.numpy(), 
              "lmu =", l_mu.data.numpy(), 
              "lsigma =", l_sigma.data.numpy(), 
              "dL/dlmu = ", l_mu.grad.data.numpy(), 
              "dL/dlsigma = ", l_sigma.grad.data.numpy())
    ## SGD update of parms
    l_mu.data -= learning_rate_mu * l_mu.grad.data
    l_sigma.data -= learning_rate_sigma * l_sigma.grad.data
    ## Zero the gradients
    l_mu.grad.data.zero_()
    l_sigma.grad.data.zero_()


Iteration =  0 loglik  = 1752.426 lmu = [0.22673309] lsigma = [0.01673245] dL/dlmu =  [-1245.4331] dL/dlsigma =  [-1031.3241]
Iteration =  1000 loglik  = 372.81244 lmu = [2.7877162] lsigma = [1.0422134] dL/dlmu =  [-37.9292] dL/dlsigma =  [21.111374]
Iteration =  2000 loglik  = 346.42648 lmu = [3.33155] lsigma = [0.5725937] dL/dlmu =  [-19.034668] dL/dlsigma =  [22.862778]
Iteration =  3000 loglik  = 335.03677 lmu = [3.5962446] lsigma = [0.18257956] dL/dlmu =  [-8.2595215] dL/dlsigma =  [15.662231]
Iteration =  4000 loglik  = 331.5428 lmu = [3.696501] lsigma = [-0.05634008] dL/dlmu =  [-2.5725098] dL/dlsigma =  [8.609451]
Iteration =  5000 loglik  = 330.69653 lmu = [3.7241726] lsigma = [-0.18045595] dL/dlmu =  [-0.6015625] dL/dlsigma =  [4.2368774]
Iteration =  6000 loglik  = 330.50873 lmu = [3.7301686] lsigma = [-0.24003156] dL/dlmu =  [-0.11743164] dL/dlsigma =  [1.9847107]
Iteration =  7000 loglik  = 330.46844 lmu = [3.73129] lsigma = [-0.2676252] dL/dlmu =  [-0.02124023] dL/dlsigma

In [11]:
## Final estimate of NB1 mean parm (on log scale, and mean scale)
[l_mu.data.numpy(), np.exp(l_mu.data.numpy())]

[array([3.7314703], dtype=float32), array([41.740437], dtype=float32)]

In [12]:
## Final estimate of NB1 log-sigma parm (on log scale, and exponentiated scale)
[l_sigma.data.numpy(), np.exp(l_sigma.data.numpy())]

[array([-0.29050773], dtype=float32), array([0.74788374], dtype=float32)]

In [13]:
## Compare against "intercept only" Negative Binomial 1 regression model

In [14]:
'''
> library(gamlss)

> R_NBI <- gamlss(fish ~ 1, data = species, family = "NBI")
GAMLSS-RS iteration 1: Global Deviance = 660.9155 
GAMLSS-RS iteration 2: Global Deviance = 660.9155 
> summary(R_NBI)
******************************************************************
Family:  c("NBI", "Negative Binomial type I") 

Call:  gamlss(formula = fish ~ 1, family = "NBI", data = species) 

Fitting method: RS() 

------------------------------------------------------------------
Mu link function:  log
Mu Coefficients:
            Estimate Std. Error t value            Pr(>|t|)    
(Intercept)    3.732      0.105   35.54 <0.0000000000000002 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

------------------------------------------------------------------
Sigma link function:  log
Sigma Coefficients:
            Estimate Std. Error t value Pr(>|t|)  
(Intercept)  -0.2906     0.1581  -1.838   0.0705 .
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

------------------------------------------------------------------
No. of observations in the fit:  70 
Degrees of Freedom for the fit:  2
      Residual Deg. of Freedom:  68 
                      at cycle:  2 
 
Global Deviance:     660.9155 
            AIC:     664.9155 
            SBC:     669.4125 
******************************************************************
'''
;

''

In [15]:
################################
## Extend above example to Negative Binomial 1 regression
## y=Fish-count; x=Lake-size ==> y ~ b0 + b1*x
## Goal is to estimate Negative Binomial 1 regression parms: {b0, b1}
################################

In [16]:
## Instantiate data tensor, and variable for (NB1) model parameters
x = torch.autograd.Variable(torch.from_numpy(dat.scale_x.to_numpy())).type(torch.FloatTensor)
y = torch.autograd.Variable(torch.from_numpy(dat.fish.to_numpy())).type(torch.FloatTensor)
b0 = torch.autograd.Variable(torch.rand(1), requires_grad=True) 
b1 = torch.autograd.Variable(torch.rand(1), requires_grad=True) 
l_sigma = torch.autograd.Variable(torch.Tensor([-0.3]), requires_grad=True) 

In [17]:
def negbin1_nll(x, y, b0, b1, log_sigma):
    nll = (
    -torch.sum( torch.lgamma(y + 1/torch.exp(log_sigma)) 
    - torch.lgamma(1/torch.exp(log_sigma)) 
    - torch.lgamma(y+1) 
    + y*torch.log((torch.exp(log_sigma)*torch.exp(b0 + b1*x))/(1 + torch.exp(log_sigma)*torch.exp(b0 + b1*x))) 
    + (1/torch.exp(log_sigma))*torch.log(1/(1 + torch.exp(log_sigma)*torch.exp(b0 + b1*x))) )
    )
    return nll

In [18]:
## Learning rate
learning_rate_b0 = 2e-4
learning_rate_b1 = 2e-4
learning_rate_sigma = 2e-4

## Training loop
for t in range(25000):
    ## Backprop on negative log likelihood loss
    NLLnb1 = negbin1_nll(x=x, y=y, b0=b0, b1=b1, log_sigma=l_sigma)
    NLLnb1.backward()
    ## Logging to console
    if t % 100 == 0:
        print("Iteration = ", t, 
              "loglik  =", NLLnb1.data.numpy(), 
              "b0 =", b0.data.numpy(), 
              "b1 =", b1.data.numpy(), 
              "l_sigma =", l_sigma.data.numpy(), 
              "dL/db0 = ", b0.grad.data.numpy(),
              "dL/db1 = ", b1.grad.data.numpy(),
              "dL/dlsigma = ", l_sigma.grad.data.numpy()
             )
    ## SGD update of parms
    b0.data -= learning_rate_b0 * torch.clamp(b0.grad.data, -1, 1)
    b1.data -= learning_rate_b1 * torch.clamp(b1.grad.data, -1, 1)
    l_sigma.data -= learning_rate_sigma * torch.clamp(l_sigma.grad.data, -1, 1)
    ## Zero the gradients
    b0.grad.data.zero_()
    b1.grad.data.zero_()
    l_sigma.grad.data.zero_()
    

Iteration =  0 loglik  = 1335.2981 b0 = [0.6129582] b1 = [0.50209737] l_sigma = [-0.3] dL/db0 =  [-990.9984] dL/db1 =  [-322.3601] dL/dlsigma =  [-748.3817]
Iteration =  100 loglik  = 1294.9219 b0 = [0.63295555] b1 = [0.5220947] l_sigma = [-0.27999967] dL/db0 =  [-959.5194] dL/db1 =  [-295.56458] dL/dlsigma =  [-721.4824]
Iteration =  200 loglik  = 1256.2261 b0 = [0.6529529] b1 = [0.5420921] l_sigma = [-0.25999933] dL/db0 =  [-928.78516] dL/db1 =  [-269.64697] dL/dlsigma =  [-695.25854]
Iteration =  300 loglik  = 1219.1648 b0 = [0.67295027] b1 = [0.56208944] l_sigma = [-0.239999] dL/db0 =  [-898.8146] dL/db1 =  [-244.62207] dL/dlsigma =  [-669.7287]
Iteration =  400 loglik  = 1183.6892 b0 = [0.6929476] b1 = [0.5820868] l_sigma = [-0.21999866] dL/db0 =  [-869.62305] dL/db1 =  [-220.49927] dL/dlsigma =  [-644.9076]
Iteration =  500 loglik  = 1149.7524 b0 = [0.712945] b1 = [0.60208416] l_sigma = [-0.19999832] dL/db0 =  [-841.2217] dL/db1 =  [-197.28418] dL/dlsigma =  [-620.8064]
Iteration

Iteration =  5200 loglik  = 459.62125 b0 = [1.6530154] b1 = [0.6292908] l_sigma = [0.7399795] dL/db0 =  [-170.45288] dL/db1 =  [0.12060547] dL/dlsigma =  [-74.02611]
Iteration =  5300 loglik  = 454.84967 b0 = [1.6730187] b1 = [0.6269309] l_sigma = [0.75997686] dL/db0 =  [-163.85376] dL/db1 =  [0.11560059] dL/dlsigma =  [-68.85009]
Iteration =  5400 loglik  = 450.30945 b0 = [1.693022] b1 = [0.62466526] l_sigma = [0.7799742] dL/db0 =  [-157.47314] dL/db1 =  [0.11083984] dL/dlsigma =  [-63.85492]
Iteration =  5500 loglik  = 445.99323 b0 = [1.7130253] b1 = [0.6224908] l_sigma = [0.7999716] dL/db0 =  [-151.30615] dL/db1 =  [0.10644531] dL/dlsigma =  [-59.035545]
Iteration =  5600 loglik  = 441.8924 b0 = [1.7330287] b1 = [0.6204038] l_sigma = [0.81996894] dL/db0 =  [-145.34741] dL/db1 =  [0.10229492] dL/dlsigma =  [-54.387375]
Iteration =  5700 loglik  = 437.99973 b0 = [1.753032] b1 = [0.61840075] l_sigma = [0.8399663] dL/db0 =  [-139.59155] dL/db1 =  [0.09814453] dL/dlsigma =  [-49.90589]
I

Iteration =  10400 loglik  = 354.10025 b0 = [2.693188] b1 = [0.59114814] l_sigma = [0.5037935] dL/db0 =  [-54.819824] dL/db1 =  [0.01000977] dL/dlsigma =  [12.441589]
Iteration =  10500 loglik  = 352.75928 b0 = [2.7131913] b1 = [0.5909461] l_sigma = [0.48379368] dL/db0 =  [-54.004883] dL/db1 =  [0.01025391] dL/dlsigma =  [12.779068]
Iteration =  10600 loglik  = 351.4285 b0 = [2.7331946] b1 = [0.5907402] l_sigma = [0.46379334] dL/db0 =  [-53.17334] dL/db1 =  [0.01037598] dL/dlsigma =  [13.111328]
Iteration =  10700 loglik  = 350.1075 b0 = [2.753198] b1 = [0.5905298] l_sigma = [0.443793] dL/db0 =  [-52.325195] dL/db1 =  [0.01086426] dL/dlsigma =  [13.436859]
Iteration =  10800 loglik  = 348.79767 b0 = [2.7732012] b1 = [0.5903157] l_sigma = [0.42379266] dL/db0 =  [-51.45996] dL/db1 =  [0.01074219] dL/dlsigma =  [13.755844]
Iteration =  10900 loglik  = 347.49884 b0 = [2.7932045] b1 = [0.5900975] l_sigma = [0.40379232] dL/db0 =  [-50.57666] dL/db1 =  [0.01086426] dL/dlsigma =  [14.0681305]


Iteration =  15400 loglik  = 311.60587 b0 = [3.5511994] b1 = [0.57844627] l_sigma = [-0.4962206] dL/db0 =  [0.00195312] dL/db1 =  [-0.00598145] dL/dlsigma =  [10.149597]
Iteration =  15500 loglik  = 311.4087 b0 = [3.5511632] b1 = [0.57856554] l_sigma = [-0.5162185] dL/db0 =  [0.00170898] dL/db1 =  [-0.00598145] dL/dlsigma =  [9.57431]
Iteration =  15600 loglik  = 311.22308 b0 = [3.551128] b1 = [0.57868636] l_sigma = [-0.53621584] dL/db0 =  [0.00195312] dL/db1 =  [-0.00610352] dL/dlsigma =  [8.992432]
Iteration =  15700 loglik  = 311.04922 b0 = [3.5510902] b1 = [0.5788099] l_sigma = [-0.5562132] dL/db0 =  [0.00195312] dL/db1 =  [-0.00622559] dL/dlsigma =  [8.402618]
Iteration =  15800 loglik  = 310.88687 b0 = [3.5510526] b1 = [0.57893544] l_sigma = [-0.57621056] dL/db0 =  [0.00195312] dL/db1 =  [-0.00634766] dL/dlsigma =  [7.8060303]
Iteration =  15900 loglik  = 310.73688 b0 = [3.5510147] b1 = [0.57906336] l_sigma = [-0.5962079] dL/db0 =  [0.00195312] dL/db1 =  [-0.00634766] dL/dlsigma 

Iteration =  20500 loglik  = 309.92212 b0 = [3.5505316] b1 = [0.5806778] l_sigma = [-0.8173692] dL/db0 =  [0.00024414] dL/db1 =  [0.] dL/dlsigma =  [0.]
Iteration =  20600 loglik  = 309.92212 b0 = [3.5505316] b1 = [0.5806778] l_sigma = [-0.8173692] dL/db0 =  [0.00024414] dL/db1 =  [0.] dL/dlsigma =  [0.]
Iteration =  20700 loglik  = 309.92212 b0 = [3.5505316] b1 = [0.5806778] l_sigma = [-0.8173692] dL/db0 =  [0.00024414] dL/db1 =  [0.] dL/dlsigma =  [0.]
Iteration =  20800 loglik  = 309.92212 b0 = [3.5505316] b1 = [0.5806778] l_sigma = [-0.8173692] dL/db0 =  [0.00024414] dL/db1 =  [0.] dL/dlsigma =  [0.]
Iteration =  20900 loglik  = 309.92212 b0 = [3.5505316] b1 = [0.5806778] l_sigma = [-0.8173692] dL/db0 =  [0.00024414] dL/db1 =  [0.] dL/dlsigma =  [0.]
Iteration =  21000 loglik  = 309.92212 b0 = [3.5505316] b1 = [0.5806778] l_sigma = [-0.8173692] dL/db0 =  [0.00024414] dL/db1 =  [0.] dL/dlsigma =  [0.]
Iteration =  21100 loglik  = 309.92212 b0 = [3.5505316] b1 = [0.5806778] l_sigma =

In [19]:
## Final estimate of NBI regression parms: {b0,b1}
[b0.data.numpy(), b1.data.numpy(), l_sigma.data.numpy()]

[array([3.5505316], dtype=float32),
 array([0.5806778], dtype=float32),
 array([-0.8173692], dtype=float32)]

In [20]:
## Compare above NBI regression estimates obtained from NBI SGD; against those from GAMLSS package 
## Note: we NEED to scale the covariate/feature vector "x" (lake-size), in order to obtain reasonable parm estimates (convergence, etc.)

In [21]:
'''
> ## NBI Model
library(gamlss)

> R_NBI <- gamlss(fish ~ scale_x, data = species, family = "NBI")
GAMLSS-RS iteration 1: Global Deviance = 619.8502 
GAMLSS-RS iteration 2: Global Deviance = 619.8443 
GAMLSS-RS iteration 3: Global Deviance = 619.8443 
> summary(R_NBI)
******************************************************************
Family:  c("NBI", "Negative Binomial type I") 

Call:  gamlss(formula = fish ~ scale_x, family = "NBI", data = species) 

Fitting method: RS() 

------------------------------------------------------------------
Mu link function:  log
Mu Coefficients:
            Estimate Std. Error t value             Pr(>|t|)    
(Intercept)  3.55053    0.08252  43.027 < 0.0000000000000002 ***
scale_x      0.58071    0.07704   7.538       0.000000000164 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

------------------------------------------------------------------
Sigma link function:  log
Sigma Coefficients:
            Estimate Std. Error t value   Pr(>|t|)    
(Intercept)  -0.8174     0.1694  -4.824 0.00000845 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

------------------------------------------------------------------
No. of observations in the fit:  70 
Degrees of Freedom for the fit:  3
      Residual Deg. of Freedom:  67 
                      at cycle:  3 
 
Global Deviance:     619.8443 
            AIC:     625.8443 
            SBC:     632.5898 
******************************************************************
'''
;

''

In [22]:
########################
## Print session info to console 
########################
sinfo()

-----
numpy       1.20.3
pandas      1.3.1
sinfo       0.3.1
torch       1.9.0
-----
IPython             7.26.0
jupyter_client      6.1.12
jupyter_core        4.7.1
jupyterlab          3.1.7
notebook            6.4.3
-----
Python 3.9.6 (default, Aug 18 2021, 15:44:49) [MSC v.1916 64 bit (AMD64)]
Windows-10-10.0.19042-SP0
8 logical CPU cores, Intel64 Family 6 Model 126 Stepping 5, GenuineIntel
-----
Session information updated at 2021-08-21 02:17
