In [2]:
import numpy as np
from glm.families import Gaussian, Bernoulli, Poisson, Gamma
from glm.glm import GLM
from glm.simulation import Simulation

import statsmodels.api as sm
import statsmodels

  from pandas.core import datetools


In [3]:
# Seed NumPy's global generator before any random draw so that a fresh
# "Restart Kernel -> Run All" regenerates the same simulated data. Every
# np.random.* call below in this notebook draws from this same global stream.
np.random.seed(42)

# Design matrix: an intercept column plus two independent Uniform[0, 1) predictors.
N = 10000
X = np.empty(shape=(N, 3))
X[:, 0] = 1.0
X[:, 1] = np.random.uniform(size=N)
X[:, 2] = np.random.uniform(size=N)
# Linear predictor with true coefficients (1, -2, 1); the sections below
# simulate responses from it and check that the fits recover these values.
nu = 1 - 2*X[:, 1] + X[:, 2]

## Linear Model

In [4]:
# Simulate a Gaussian response: the linear predictor plus standard-normal noise.
y = nu + np.random.normal(size=N)
# Fit the project's GLM with a Gaussian family to the simulated data.
model = GLM(family=Gaussian())
model.fit(X, y)

NameError: name 'np' is not defined

In [4]:
model.coef_

array([ 0.99635543, -1.99629741,  1.00064224])

In [5]:
model.coef_covariance_matrix_

array([[  7.14102057e-04,  -6.26724241e-04,  -6.08710202e-04],
       [ -6.26724241e-04,   1.25421734e-03,   9.25665417e-06],
       [ -6.08710202e-04,   9.25665417e-06,   1.22054212e-03]])

In [6]:
model.coef_standard_error_

array([ 0.02672269,  0.03541493,  0.03493626])

In [7]:
# Cross-check with statsmodels: ordinary least squares on the same X and y.
mod = sm.OLS(y, X)
res = mod.fit()
# The coef / std err columns of the summary are the values to compare with
# model.coef_ and model.coef_standard_error_.
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.287
Model:                            OLS   Adj. R-squared:                  0.287
Method:                 Least Squares   F-statistic:                     2011.
Date:                Tue, 05 Sep 2017   Prob (F-statistic):               0.00
Time:                        20:24:31   Log-Likelihood:                -14284.
No. Observations:               10000   AIC:                         2.857e+04
Df Residuals:                    9997   BIC:                         2.860e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.9964      0.027     37.285      0.0

## Run some simulations off the linear model.

In [8]:
s = Simulation(model)

In [9]:
s.sample(X)

array([[ 3.70129661, -0.6846578 , -0.09455588, ...,  0.77735591,
        -0.78314013, -0.1368336 ],
       [ 1.26816058,  1.58064373, -0.63720806, ...,  1.0595045 ,
         1.32458491,  0.14019694],
       [ 2.19145526, -1.38957948, -0.24567161, ..., -1.59956006,
         0.93516367, -0.77032893],
       ..., 
       [ 1.5610019 , -1.97574987, -0.43519168, ...,  0.13564043,
         0.26483758, -1.01793018],
       [ 0.70239723, -2.25595563, -2.79748173, ...,  1.31902664,
         0.43670634,  2.01167441],
       [ 2.08770028,  1.14225446, -0.10541682, ..., -0.33825367,
        -0.63851084,  1.26672304]])

In [10]:
# Parametric bootstrap: request n_sim=10 models from the simulation and print
# the coefficients of each one.
models = s.parametric_bootstrap(X, n_sim=10)
# The loop target gets its own name: a `for` target stays bound after the loop,
# so iterating with `model` would replace the fitted `model` in the notebook
# namespace with the last bootstrap result.
for boot_model in models:
    print(boot_model.coef_)

[ 0.98162236 -2.03567327  1.01568139]
[ 1.01062235 -2.02111842  0.98941209]
[ 0.98177338 -1.93334512  0.9635636 ]
[ 0.97348952 -1.96427168  1.00651917]
[ 1.02700507 -1.99583012  0.97857184]
[ 1.0269677  -2.03274058  0.93637709]
[ 0.98656013 -1.97215881  0.98412844]
[ 0.99296205 -1.94002276  0.93362129]
[ 0.95122823 -1.937337    1.02629062]
[ 0.98781081 -1.97403761  0.9849708 ]


In [11]:
# Non-parametric bootstrap: request n_sim=10 models built from X and y and
# print the coefficients of each one.
models = s.non_parametric_bootstrap(X, y, n_sim=10)
# The loop target gets its own name: a `for` target stays bound after the loop,
# so iterating with `model` would replace the fitted `model` in the notebook
# namespace with the last bootstrap result.
for boot_model in models:
    print(boot_model.coef_)

[ 0.98958337 -1.99858477  1.02894515]
[ 0.98910574 -1.98980774  1.01597902]
[ 1.00686395 -2.03099162  1.02014553]
[ 0.97955368 -1.99309755  1.05420866]
[ 1.00916242 -2.03016171  1.00387838]
[ 0.96171034 -1.9965813   1.074397  ]
[ 0.99978291 -1.97152692  0.99051099]
[ 0.9870019  -1.98532416  1.0078555 ]
[ 0.98754665 -2.08987659  1.06658947]
[ 0.96139181 -1.96498539  1.02310981]


## Linear Model with Sample Weights

In [12]:
sample_weights = np.random.uniform(0, 2, size=N)

In [13]:
# Refit the Gaussian model, this time passing per-observation weights.
model = GLM(family=Gaussian())
# The value returned by fit() is rebound to `model`; the next cell reads
# model.coef_ from it.
model = model.fit(X, y, sample_weights=sample_weights)

In [14]:
model.coef_

array([ 0.98002025, -1.97878839,  0.99701917])

## Logistic Model

In [15]:
# Map the linear predictor through the logistic function to get success probabilities.
p = 1 / (1 + np.exp(-nu))
# Draw a single Bernoulli trial (binomial with n=1) per observation.
y_logistic = np.random.binomial(1, p=p, size=N)

In [16]:
# Fit the project's GLM with a Bernoulli family to the simulated 0/1 response.
model = GLM(family=Bernoulli())
model.fit(X, y_logistic)

<glm.GLM at 0x1145431d0>

In [17]:
model.coef_

array([ 0.89871355, -1.91699804,  1.03891725])

In [18]:
model.dispersion_

array(1.0)

In [19]:
model.coef_covariance_matrix_

array([[ 0.00320247, -0.00296787, -0.00247548],
       [-0.00296787,  0.00597349, -0.00036727],
       [-0.00247548, -0.00036727,  0.00557585]])

In [20]:
model.coef_standard_error_

array([ 0.05659035,  0.07728837,  0.07467164])

In [21]:
# Cross-check with statsmodels: Logit on the same X and y_logistic.
mod = sm.Logit(y_logistic, X)
res = mod.fit()
# The coef / std err columns of the summary are the values to compare with
# model.coef_ and model.coef_standard_error_.
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.628920
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                10000
Model:                          Logit   Df Residuals:                     9997
Method:                           MLE   Df Model:                            2
Date:                Tue, 05 Sep 2017   Pseudo R-squ.:                 0.06309
Time:                        20:24:31   Log-Likelihood:                -6289.2
converged:                       True   LL-Null:                       -6712.7
                                        LLR p-value:                1.172e-184
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.8987      0.057     15.881      0.000       0.788       1.010
x1            -1.9170      0.

In [22]:
s = Simulation(model)

In [23]:
s.sample(X, n_sim=10)

array([[ 1.,  1.,  0., ...,  0.,  1.,  1.],
       [ 1.,  1.,  0., ...,  1.,  1.,  0.],
       [ 1.,  1.,  1., ...,  1.,  0.,  0.],
       ..., 
       [ 1.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  1., ...,  1.,  0.,  1.],
       [ 1.,  1.,  1., ...,  1.,  0.,  1.]])

In [24]:
# Parametric bootstrap of the logistic fit: print the coefficients of each of
# the n_sim=10 models.
# The loop target gets its own name: a `for` target stays bound after the loop,
# so iterating with `model` would replace the fitted `model` in the notebook
# namespace with the last bootstrap result.
for boot_model in s.parametric_bootstrap(X, n_sim=10):
    print(boot_model.coef_)

[ 0.92564864 -1.92688149  1.06622891]
[ 0.85065123 -1.91724283  1.10727003]
[ 0.91145371 -1.87940933  0.97294795]
[ 0.95059166 -2.00422411  1.01795682]
[ 0.95294682 -1.92089939  0.93576771]
[ 0.83301548 -1.89802154  1.10498029]
[ 0.94930102 -2.00072179  1.08404554]
[ 0.92893991 -1.89640031  0.91271676]
[ 0.95307018 -1.99558355  0.9548541 ]
[ 0.98007991 -1.93357466  0.89697612]


In [25]:
# Non-parametric bootstrap of the logistic fit: print the coefficients of each
# of the n_sim=10 models built from X and y_logistic.
# The loop target gets its own name: a `for` target stays bound after the loop,
# so iterating with `model` would replace the fitted `model` in the notebook
# namespace with the last bootstrap result.
for boot_model in s.non_parametric_bootstrap(X, y_logistic, n_sim=10):
    print(boot_model.coef_)

[ 0.99432829 -2.06510277  1.00814253]
[ 0.98563528 -2.02506098  0.97763048]
[ 0.86571611 -1.87862626  1.04218002]
[ 0.85818542 -1.76339913  0.97009285]
[ 0.96069046 -1.92246395  0.94057993]
[ 0.88824452 -1.83838339  0.9966131 ]
[ 0.91820619 -1.99393334  1.06745476]
[ 0.83200891 -1.88370187  1.06091485]
[ 0.96691717 -1.99306609  0.99032914]
[ 0.90465221 -1.98037578  1.13214442]


## Poisson Model

In [26]:
# Poisson mean: the exponential of the linear predictor.
mu = np.exp(nu)
# Draw one Poisson count per observation with rate mu.
y_poisson = np.random.poisson(lam=mu, size=N)

In [27]:
# Fit the project's GLM with a Poisson family to the simulated counts.
model = GLM(family=Poisson())
model.fit(X, y_poisson)

<glm.GLM at 0x114543fd0>

In [28]:
model.coef_

array([ 1.0086694 , -1.97818485,  0.97117054])

In [29]:
model.coef_covariance_matrix_

array([[  3.43843334e-04,  -2.56069421e-04,  -3.57504957e-04],
       [ -2.56069421e-04,   7.35249718e-04,   4.66147780e-06],
       [ -3.57504957e-04,   4.66147780e-06,   6.17483912e-04]])

In [30]:
model.coef_standard_error_

array([ 0.01854301,  0.02711549,  0.02484922])

In [31]:
# Cross-check with statsmodels' Poisson regression on the same X and y_poisson.
# sm.Poisson is the statsmodels.api name for discrete_model.Poisson; the
# qualified name also keeps it distinct from the Poisson family imported from
# glm.families.
mod = sm.Poisson(y_poisson, X)
res = mod.fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 1.592442
         Iterations 7
                          Poisson Regression Results                          
Dep. Variable:                      y   No. Observations:                10000
Model:                        Poisson   Df Residuals:                     9997
Method:                           MLE   Df Model:                            2
Date:                Tue, 05 Sep 2017   Pseudo R-squ.:                  0.1894
Time:                        20:24:31   Log-Likelihood:                -15924.
converged:                       True   LL-Null:                       -19645.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0087      0.019     54.396      0.000       0.972       1.045
x1            -1.9782      0.

In [32]:
s = Simulation(model)

In [33]:
s.sample(X, n_sim=10)

array([[ 7.,  1.,  3., ...,  1.,  2.,  1.],
       [ 6.,  2.,  2., ...,  0.,  1.,  2.],
       [ 2.,  1.,  0., ...,  1.,  0.,  2.],
       ..., 
       [ 5.,  0.,  4., ...,  4.,  1.,  2.],
       [ 4.,  0.,  0., ...,  1.,  3.,  1.],
       [ 3.,  0.,  1., ...,  0.,  3.,  3.]])

In [34]:
# Parametric bootstrap of the Poisson fit: print the coefficients of each of
# the n_sim=10 models.
# The loop target gets its own name: a `for` target stays bound after the loop,
# so iterating with `model` would replace the fitted `model` in the notebook
# namespace with the last bootstrap result.
for boot_model in s.parametric_bootstrap(X, n_sim=10):
    print(boot_model.coef_)

[ 1.04131551 -2.02870274  0.96015203]
[ 0.9865432  -1.97783889  1.00230886]
[ 0.98413632 -1.95545574  0.9862919 ]
[ 0.9959502  -1.94803506  0.96143026]
[ 0.97781876 -1.95695045  1.00165066]
[ 1.01496379 -2.00056926  0.99042938]
[ 1.02135523 -1.99645755  0.95254862]
[ 0.9994143  -1.97387585  0.9956619 ]
[ 1.01578883 -1.95262976  0.95795374]
[ 1.04933868 -2.02090905  0.94344272]


In [35]:
# Non-parametric bootstrap of the Poisson fit: print the coefficients of each
# of the n_sim=10 models built from X and y_poisson.
# The loop target gets its own name: a `for` target stays bound after the loop,
# so iterating with `model` would replace the fitted `model` in the notebook
# namespace with the last bootstrap result.
for boot_model in s.non_parametric_bootstrap(X, y_poisson, n_sim=10):
    print(boot_model.coef_)

[ 1.06290381 -1.96248999  0.90134792]
[ 1.03185593 -1.94488788  0.92644831]
[ 0.99314465 -1.95786974  0.99195353]
[ 0.99376247 -1.98555699  0.99527739]
[ 1.01671451 -1.94341999  0.94767768]
[ 1.02577043 -1.94293666  0.9318222 ]
[ 1.02339721 -1.98288654  0.95649635]
[ 1.01188683 -1.98392503  0.95299995]
[ 1.04415875 -2.02044843  0.93458004]
[ 1.01155041 -1.96508247  0.94387053]


## Poisson with Exposures

In [36]:
# Per-unit-exposure rate: the exponential of the linear predictor.
mu = np.exp(nu)
# One exposure per observation, drawn uniformly from [0, 10).
expos = np.random.uniform(0, 10, size=N)
# Counts are Poisson with rate scaled by exposure (mu * expos).
# Note: this rebinds y_poisson from the previous section.
y_poisson = np.random.poisson(lam=(mu*expos), size=N)

In [37]:
# Fit the Poisson GLM on the exposure-scaled counts.
model = GLM(family=Poisson())
# log(expos) is passed as the offset; presumably it is added to the linear
# predictor with a fixed coefficient of 1 — confirm in glm.glm.
model.fit(X, y_poisson, offset=np.log(expos))

<glm.GLM at 0x11455b1d0>

In [38]:
model.coef_

array([ 1.00427703, -2.00560461,  1.00295974])

In [39]:
model.coef_standard_error_

array([ 0.0082743 ,  0.0120978 ,  0.01103477])

## Gamma Regression

In [40]:
# Target mean: the exponential of the linear predictor.
mu = np.exp(nu)
# Gamma draws with shape 2 and scale mu/2, so each observation's mean
# (shape * scale) equals mu.
y_gamma = np.random.gamma(shape=2.0, scale=(mu / 2.0), size=N)

In [41]:
# Fit the project's GLM with a Gamma family to the simulated response.
gamma_model = GLM(family=Gamma())
gamma_model.fit(X, y_gamma)

<glm.GLM at 0x11455b2b0>

In [42]:
gamma_model.coef_

array([ 1.03103796, -2.02817775,  0.99021627])

In [43]:
gamma_model.coef_standard_error_

array([ 0.01943828,  0.02576108,  0.02541289])

In [44]:
gamma_model.dispersion_

0.53935872224986214

In [45]:
# Cross-check with statsmodels: Gamma GLM with a log link on the same data.
# The statsmodels model gets its own name so the project GLM fit held in
# `gamma_model` is not replaced; the cells that read gamma_model.coef_ and
# gamma_model.dispersion_ then stay re-runnable after this cell.
# NOTE(review): recent statsmodels releases expect a link *instance*
# (links.Log()) rather than the `links.log` class — confirm against the
# installed version before upgrading.
sm_gamma_model = sm.GLM(
    y_gamma,
    X,
    family=sm.families.Gamma(
        link=statsmodels.genmod.families.links.log),
)
res = sm_gamma_model.fit()
print(res.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                      y   No. Observations:                10000
Model:                            GLM   Df Residuals:                     9997
Model Family:                   Gamma   Df Model:                            2
Link Function:                    log   Scale:                   0.49234992195
Method:                          IRLS   Log-Likelihood:                -13984.
Date:                Tue, 05 Sep 2017   Deviance:                       5392.0
Time:                        20:24:31   Pearson chi2:                 4.92e+03
No. Iterations:                     6                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0310      0.019     55.516      0.000       0.995       1.067
x1            -2.0282      0.025    -82.403      0.0

## Exponential Regression

In [46]:
# Target mean: the exponential of the linear predictor.
mu = np.exp(nu)
# Exponential draws with scale mu, i.e. each observation has mean mu.
y_exponential = np.random.exponential(scale=mu, size=N)

In [47]:
# The exponential distribution is the Gamma distribution with shape 1, so the
# Gamma family is used to fit the exponential response.
exponential_model = GLM(family=Gamma())
exponential_model.fit(X, y_exponential)

<glm.GLM at 0x114543f28>

In [48]:
exponential_model.coef_

array([ 0.97864103, -1.98623894,  1.00804803])

In [49]:
exponential_model.coef_standard_error_

array([ 0.02861641,  0.03792463,  0.03741204])

In [50]:
exponential_model.dispersion_

1.1689401343999695

In [51]:
# Cross-check with statsmodels: Gamma GLM with a log link on the exponential data.
# The statsmodels model gets its own name so the project GLM fit held in
# `exponential_model` is not replaced; the cells that read
# exponential_model.coef_ and exponential_model.dispersion_ then stay
# re-runnable after this cell.
# NOTE(review): recent statsmodels releases expect a link *instance*
# (links.Log()) rather than the `links.log` class — confirm against the
# installed version before upgrading.
sm_exponential_model = sm.GLM(
    y_exponential,
    X,
    family=sm.families.Gamma(
        link=statsmodels.genmod.families.links.log),
)
res = sm_exponential_model.fit()
print(res.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                      y   No. Observations:                10000
Model:                            GLM   Df Residuals:                     9997
Model Family:                   Gamma   Df Model:                            2
Link Function:                    log   Scale:                   1.01735526808
Method:                          IRLS   Log-Likelihood:                -14923.
Date:                Tue, 05 Sep 2017   Deviance:                       11686.
Time:                        20:24:31   Pearson chi2:                 1.02e+04
No. Iterations:                     6                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.9786      0.027     36.658      0.000       0.926       1.031
x1            -1.9862      0.035    -56.140      0.0

## Linear Model with Correlated Predictors

In [52]:
# Smaller sample with correlated predictors: the third column is half of the
# second plus Uniform[-0.5, 0.5) noise.
N = 1000
# Start from all ones so column 0 is already the intercept.
X = np.ones(shape=(N, 3))
X[:, 1] = np.random.uniform(size=N)
X[:, 2] = np.random.uniform(-0.5, 0.5, size=N) + 0.5*X[:, 1]
# Same true coefficients as before: (1, -2, 1).
nu = 1 - 2*X[:, 1] + X[:, 2]

In [53]:
# Simulate a Gaussian response on the correlated design: linear predictor plus
# standard-normal noise. Note: this rebinds y from the first section.
y = nu + np.random.normal(size=N)
# Fit the project's GLM with a Gaussian family.
model = GLM(family=Gaussian())
model.fit(X, y)

<glm.GLM at 0x11455b550>

In [54]:
model.coef_

array([ 1.06930601, -2.04554535,  1.09497987])

In [55]:
model.coef_covariance_matrix_

array([[ 0.00413713, -0.006213  , -0.00012022],
       [-0.006213  ,  0.01547032, -0.00597631],
       [-0.00012022, -0.00597631,  0.01213519]])

In [56]:
model.coef_standard_error_

array([ 0.06432056,  0.12437974,  0.11015986])

In [57]:
# Cross-check with statsmodels: ordinary least squares on the correlated design.
mod = sm.OLS(y, X)
res = mod.fit()
# The coef / std err columns of the summary are the values to compare with
# model.coef_ and model.coef_standard_error_.
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.219
Model:                            OLS   Adj. R-squared:                  0.218
Method:                 Least Squares   F-statistic:                     140.0
Date:                Tue, 05 Sep 2017   Prob (F-statistic):           2.67e-54
Time:                        20:24:31   Log-Likelihood:                -1415.2
No. Observations:                1000   AIC:                             2836.
Df Residuals:                     997   BIC:                             2851.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0693      0.064     16.625      0.0