In [1]:
import numpy as np
from families import Gaussian, Bernoulli, Poisson
from glm import GLM

import statsmodels.api as sm
import statsmodels

  from pandas.core import datetools


In [2]:
# Seed NumPy's global generator so that Restart & Run All reproduces the
# simulated data (and therefore every fitted coefficient) in this notebook.
SEED = 154
np.random.seed(SEED)

# Design matrix: an intercept column plus two independent Uniform[0, 1)
# predictors.
N = 10000
X = np.empty(shape=(N, 3))
X[:, 0] = 1.0
X[:, 1] = np.random.uniform(size=N)
X[:, 2] = np.random.uniform(size=N)
# Linear predictor with true coefficients (1, -2, 1). The sections that
# follow simulate their responses from it, so fitted coefficients should
# land close to these values.
nu = 1 - 2*X[:, 1] + X[:, 2]

## Linear Model

In [3]:
# Gaussian response: the linear predictor plus standard normal noise.
y = nu + np.random.normal(size=N)
# Fit the hand-rolled GLM with a Gaussian family; `fit` returns the model
# object, which is what the cell displays.
model = GLM(family=Gaussian())
model.fit(X, y)

<glm.GLM at 0x10f96acc0>

In [4]:
model.coef_

array([ 0.97283562, -1.9782353 ,  1.01741106])

In [5]:
model.parameter_covariance_

array([[  6.87553818e-04,  -5.82484656e-04,  -5.93551871e-04],
       [ -5.82484656e-04,   1.16309256e-03,  -4.96812149e-07],
       [ -5.93551871e-04,  -4.96812149e-07,   1.18333000e-03]])

In [6]:
# Coefficient standard errors: square roots of the diagonal of the estimated
# parameter covariance. Compare with the `std err` column of the statsmodels
# OLS summary in the next cell.
np.sqrt(np.diag(model.parameter_covariance_))

array([ 0.02622125,  0.03410414,  0.03439956])

In [7]:
# Reference fit: ordinary least squares from statsmodels on the same (y, X),
# used to check the coefficients and standard errors computed above.
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.298
Model:                            OLS   Adj. R-squared:                  0.298
Method:                 Least Squares   F-statistic:                     2119.
Date:                Sun, 27 Aug 2017   Prob (F-statistic):               0.00
Time:                        09:58:25   Log-Likelihood:                -14080.
No. Observations:               10000   AIC:                         2.817e+04
Df Residuals:                    9997   BIC:                         2.819e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.9728      0.026     37.101      0.0

## Linear Model with Sample Weights

In [8]:
# One weight per observation, drawn uniformly from [0, 2).
sample_weights = np.random.uniform(0, 2, size=N)

In [9]:
# Refit the Gaussian model on the same (X, y), now passing the per-observation
# weights. The return value of `fit` is bound back to `model`, so this cell
# produces no output.
model = GLM(family=Gaussian())
model = model.fit(X, y, sample_weights=sample_weights)

In [10]:
model.coef_

array([ 0.96515947, -2.00042585,  1.04629058])

## Logistic Model

In [11]:
# Success probabilities: the logistic (inverse-logit) transform of the
# linear predictor.
p = 1 / (1 + np.exp(-nu))
# One Bernoulli draw (binomial with a single trial) per observation.
y_logistic = np.random.binomial(1, p=p, size=N)

In [12]:
# Fit the GLM with a Bernoulli family to the simulated 0/1 responses.
model = GLM(family=Bernoulli())
model.fit(X, y_logistic)

<glm.GLM at 0x11b0cddd8>

In [13]:
model.coef_

array([ 1.00263994, -2.08255651,  1.06109309])

In [14]:
model.dispersion_

array(1.0)

In [15]:
model.parameter_covariance_

array([[ 0.0032954 , -0.00297931, -0.00253656],
       [-0.00297931,  0.00597923, -0.00047367],
       [-0.00253656, -0.00047367,  0.00574351]])

In [16]:
# Standard errors of the logistic coefficients, for comparison with the
# `std err` column of the statsmodels Logit summary below.
# NOTE(review): unlike the Gaussian and Poisson sections, this expression
# divides the covariance diagonal by `dispersion_`. The dispersion is
# displayed as 1.0 in the cell above, so the values here are unaffected;
# confirm whether the division is intended.
np.sqrt(np.diag(model.parameter_covariance_) / model.dispersion_)

array([ 0.0574056 ,  0.0773255 ,  0.07578593])

In [17]:
# Reference fit: statsmodels logistic regression on the same data, used to
# check the coefficients and standard errors computed above.
mod = sm.Logit(y_logistic, X)
res = mod.fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.620576
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                10000
Model:                          Logit   Df Residuals:                     9997
Method:                           MLE   Df Model:                            2
Date:                Sun, 27 Aug 2017   Pseudo R-squ.:                 0.07253
Time:                        09:58:25   Log-Likelihood:                -6205.8
converged:                       True   LL-Null:                       -6691.0
                                        LLR p-value:                1.758e-211
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0026      0.057     17.466      0.000       0.890       1.115
x1            -2.0826      0.

In [18]:
# Empirical check of the standard errors: redraw the Bernoulli responses many
# times from the same probabilities, refit each time, and look at the spread
# of the fitted coefficients across replications.
n_replications = 1000
coefs = np.empty((n_replications, 3))
for rep in range(n_replications):
    y_logistic = np.random.binomial(1, p=p, size=N)
    model = GLM(family=Bernoulli())
    model.fit(X, y_logistic)
    coefs[rep, :] = model.coef_

# Per-coefficient standard deviation over the replications; should be close
# to the standard errors derived from the parameter covariance.
print(coefs.std(axis=0))

[ 0.0567005   0.07613682  0.07495514]


## Poisson Model

In [19]:
# Poisson means: the exponential of the linear predictor.
mu = np.exp(nu)
# One Poisson count per observation with those means.
y_poisson = np.random.poisson(lam=mu, size=N)

In [20]:
# Fit the GLM with a Poisson family to the simulated counts.
model = GLM(family=Poisson())
model.fit(X, y_poisson)

<glm.GLM at 0x11d1c9e48>

In [21]:
model.coef_

array([ 0.98907883, -1.95988788,  0.98968922])

In [22]:
model.parameter_covariance_

array([[  3.46769035e-04,  -2.43641600e-04,  -3.65751291e-04],
       [ -2.43641600e-04,   7.04259854e-04,   6.86776346e-07],
       [ -3.65751291e-04,   6.86776346e-07,   6.27622948e-04]])

In [23]:
# Coefficient standard errors for the Poisson fit; compare with the
# `std err` column of the statsmodels Poisson summary in the next cell.
np.sqrt(np.diag(model.parameter_covariance_))

array([ 0.01862174,  0.02653789,  0.0250524 ])

In [24]:
# Reference fit: statsmodels Poisson regression on the same data, used to
# check the coefficients and standard errors computed above.
# Use the public `sm.Poisson` entry point, matching the `sm.OLS` and
# `sm.Logit` cells. A bare `import statsmodels` does not itself load the
# `discrete.discrete_model` submodule, so the dotted path only resolved as a
# side effect of importing `statsmodels.api`.
mod = sm.Poisson(y_poisson, X)
res = mod.fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 1.586115
         Iterations 7
                          Poisson Regression Results                          
Dep. Variable:                      y   No. Observations:                10000
Model:                        Poisson   Df Residuals:                     9997
Method:                           MLE   Df Model:                            2
Date:                Sun, 27 Aug 2017   Pseudo R-squ.:                  0.1925
Time:                        09:58:30   Log-Likelihood:                -15861.
converged:                       True   LL-Null:                       -19643.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.9891      0.019     53.114      0.000       0.953       1.026
x1            -1.9599      0.

## Poisson with Exposures

In [25]:
# Per-unit-exposure rate: the exponential of the linear predictor.
mu = np.exp(nu)
# Exposure for each observation, drawn uniformly from [0, 10).
expos = np.random.uniform(0, 10, size=N)
# Counts whose mean is the rate scaled by the exposure.
y_poisson = np.random.poisson(lam=(mu*expos), size=N)

In [26]:
# Fit the Poisson model, passing log(exposure) as the offset. Since the
# simulated mean is mu * expos = exp(nu + log(expos)), the fitted coefficients
# should again be close to the true (1, -2, 1).
# NOTE(review): presumably `offset` is added to the linear predictor inside
# `fit` -- confirm against the GLM implementation.
model = GLM(family=Poisson())
model.fit(X, y_poisson, offset=np.log(expos))

<glm.GLM at 0x11d1c9898>

In [27]:
model.coef_

array([ 0.99147346, -1.99804414,  1.01097446])

## Linear Model with Correlated Predictors

In [28]:
# Smaller design with strongly correlated predictors: the third column is
# 0.9 times the second plus Uniform[-0.1, 0.1) noise.
# Note: this rebinds N, X and nu, replacing the values used by the sections
# above.
N = 1000
X = np.empty(shape=(N, 3))
X[:, 0] = 1.0
X[:, 1] = np.random.uniform(size=N)
X[:, 2] = 0.9*X[:, 1] + np.random.uniform(-0.1, 0.1, size=N)
# Same true coefficients (1, -2, 1) as in the uncorrelated design.
nu = 1 - 2*X[:, 1] + X[:, 2]

In [29]:
# Gaussian response on the correlated design: linear predictor plus standard
# normal noise, then fit the Gaussian GLM.
y = nu + np.random.normal(size=N)
model = GLM(family=Gaussian())
model.fit(X, y)

<glm.GLM at 0x11d1c9b70>

In [30]:
model.coef_

array([ 0.97546766, -2.78108901,  1.94327397])

In [31]:
model.parameter_covariance_

array([[  4.07572986e-03,  -5.95539209e-03,  -2.08378062e-04],
       [ -5.95539209e-03,   2.47742472e-01,  -2.61850746e-01],
       [ -2.08378062e-04,  -2.61850746e-01,   2.91094356e-01]])