In [1]:
import numpy as np
import pandas as pd

from glm.families import Gaussian, Bernoulli, Poisson, Gamma
from glm.glm import GLM
from glm.simulation import Simulation

import statsmodels.api as sm
import statsmodels

In [2]:
# Seed the global NumPy RNG once, up front, so every np.random draw in the
# notebook is reproducible under Restart Kernel -> Run All.
SEED = 0
np.random.seed(SEED)

N = 1000
# Design matrix: an intercept column plus two independent Uniform(-1, 1) predictors.
X = np.empty(shape=(N, 3))
X[:, 0] = 1.0
X[:, 1] = np.random.uniform(-1, 1, size=N)
X[:, 2] = np.random.uniform(-1, 1, size=N)
# Linear predictor: depends on X[:, 1] only (X[:, 2] does not enter), plus
# standard normal noise.
nu = 1 - 2*X[:, 1] + np.random.normal(0.0, 1.0, size=N)

## Linear Model

In [3]:
# Response for the linear model: the linear predictor plus standard normal noise.
y = nu + np.random.normal(size=N)
# Gaussian-family GLM fitted on the design matrix X.
model = GLM(family=Gaussian())
model.fit(X, y)

<glm.glm.GLM at 0x12a60feb0>

In [4]:
model.coef_

array([ 1.02412258, -2.02181704, -0.00926685])

In [5]:
model.coef_covariance_matrix_

array([[ 2.05253835e-03, -4.78378986e-05, -5.79404850e-06],
       [-4.78378986e-05,  6.24854792e-03, -6.71004212e-04],
       [-5.79404850e-06, -6.71004212e-04,  6.49409189e-03]])

In [6]:
model.coef_standard_error_

array([0.04530495, 0.07904776, 0.08058593])

In [7]:
model.p_values_

array([1.93080463e-113, 1.36966301e-144, 9.08450345e-001])

In [8]:
yhat = model.predict(X)
yhat[:10]

array([ 2.66224115,  2.82770149,  1.85749566,  2.66425329,  0.6837718 ,
        0.41979   , -0.37194544,  0.68375672, -0.40879303,  0.88094072])

In [9]:
# Reference fit: statsmodels OLS on the same (y, X), for comparison with the
# GLM coefficients and standard errors shown above.
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.399
Model:                            OLS   Adj. R-squared:                  0.398
Method:                 Least Squares   F-statistic:                     331.1
Date:                Wed, 29 Jan 2020   Prob (F-statistic):          5.42e-111
Time:                        11:05:25   Log-Likelihood:                -1776.9
No. Observations:                1000   AIC:                             3560.
Df Residuals:                     997   BIC:                             3574.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0241      0.045     22.605      0.0

In [10]:
res.params

array([ 1.02412258, -2.02181704, -0.00926685])

In [11]:
res.bse

array([0.04530495, 0.07904776, 0.08058593])

## Linear Regression With Formula

In [12]:
# Pack the two non-intercept predictors and the response into a labelled frame
# (the intercept column X[:, 0] is dropped).
df = pd.DataFrame(
    np.column_stack([X[:, 1:], y]),
    columns=['Moshi', 'SwimSwim', 'y'])

In [13]:
df.shape

(1000, 3)

In [14]:
# Same Gaussian GLM, fitted from the DataFrame with a formula string instead of
# explicit (X, y) arrays.
model = GLM(family=Gaussian())
model.fit(df, formula='y ~ Moshi + SwimSwim')

<glm.glm.GLM at 0x12c654c70>

In [15]:
model.coef_

array([ 1.02412258, -2.02181704, -0.00926685])

In [16]:
model.summary()

Gaussian GLM Model Summary.
Name         Parameter Estimate  Standard Error
-----------------------------------------------
Intercept         1.02            0.05
Moshi            -2.02            0.08
SwimSwim         -0.01            0.08


In [17]:
yhat = model.predict(df)
yhat[:10]

array([ 2.66224115,  2.82770149,  1.85749566,  2.66425329,  0.6837718 ,
        0.41979   , -0.37194544,  0.68375672, -0.40879303,  0.88094072])

In [18]:
model.formula

'y ~ Moshi + SwimSwim'

### Simulation with Model Formula

In [19]:
from glm.simulation import Simulation

In [20]:
sim = Simulation(model)

In [21]:
df.shape

(1000, 3)

In [24]:
boots = sim.non_parametric_bootstrap(df, n_sim=5)

In [27]:
boots[0].X_names

['Intercept', 'Moshi', 'SwimSwim']

## Run some simulations off the linear model.

In [68]:
s = Simulation(model)

In [69]:
s.sample(X)

array([[ 2.72485002,  1.06203265,  0.14325478, ..., -1.14142867,
         1.46297585,  1.59846956],
       [ 3.04345059,  1.91203858,  0.36177651, ..., -0.06844137,
         2.31213208,  0.67339918],
       [ 1.24277034,  4.75516935,  0.90761723, ..., -2.49040742,
         3.49276037,  3.03020002],
       ..., 
       [-0.71346133,  1.1558288 , -0.16495367, ...,  0.84193825,
         3.71962176,  0.99770795],
       [ 0.7338006 ,  1.54286613, -0.93109339, ..., -0.25872836,
         5.77269828,  0.35305152],
       [-0.11973334,  0.69005778,  0.99108631, ...,  1.22836895,
         4.98481066,  3.74340941]])

In [70]:
# Fit 10 parametric-bootstrap models and print each coefficient vector.
# The loop variable is not named `model`, so the fitted model that `s` was
# built from is not overwritten by the last bootstrap replicate.
models = s.parametric_bootstrap(X, n_sim=10)
for boot_model in models:
    print(boot_model.coef_)

[ 1.09076847 -1.97885617  0.06667513]
[ 1.07317892 -1.93018894 -0.02723429]
[ 1.07995212 -1.92470157  0.0230675 ]
[ 1.130081   -1.97247471 -0.05137084]
[ 1.17167148 -2.00706441  0.10064266]
[ 1.10740914 -1.9678318  -0.02178495]
[ 1.08897211 -2.04248583 -0.04755837]
[ 0.99510678 -1.84586693 -0.1274409 ]
[ 1.07852068 -1.86491168 -0.02583743]
[ 1.04571672 -1.92017373  0.05497773]


In [71]:
# Fit 10 non-parametric-bootstrap models on (X, y) and print each coefficient
# vector. The loop variable is not named `model`, so the fitted model is not
# overwritten by the last bootstrap replicate.
models = s.non_parametric_bootstrap(X, y, n_sim=10)
for boot_model in models:
    print(boot_model.coef_)

[ 1.08899747 -1.97909035  0.0242303 ]
[ 1.1438153  -2.0103238   0.03155373]
[  1.07893939e+00  -1.87856140e+00  -1.24740123e-03]
[ 1.06236628 -1.91918097 -0.01660036]
[ 1.05378187 -2.0633577  -0.07282795]
[ 1.07175649 -1.95840599  0.06544901]
[ 1.09868402 -2.00568189 -0.08815472]
[ 1.03488206 -2.00576657  0.05256831]
[ 1.12335559 -1.86088373 -0.0611884 ]
[ 1.16289827 -1.96426665  0.00614709]


## Linear Model with Sample Weights

In [72]:
sample_weights = np.random.uniform(0, 2, size=N)

In [73]:
# Gaussian GLM fitted with per-observation sample weights; `model` is bound to
# whatever `fit` returns.
model = GLM(family=Gaussian()).fit(X, y, sample_weights=sample_weights)

In [74]:
model.coef_

array([ 1.06640549, -1.90311277,  0.02257427])

## Logistic Model

In [75]:
# Bernoulli responses: map the linear predictor through the logistic function
# to get success probabilities, then draw a single trial per row.
p = 1.0 / (1.0 + np.exp(-nu))
y_logistic = np.random.binomial(n=1, p=p, size=N)

In [76]:
# Bernoulli-family GLM fitted to the 0/1 responses.
model = GLM(family=Bernoulli())
model.fit(X, y_logistic)

<glm.glm.GLM at 0x10c2b13c8>

In [77]:
model.coef_

array([ 0.80789905, -1.73306961, -0.01066346])

In [78]:
model.dispersion_

array(1.0)

In [79]:
model.coef_covariance_matrix_

array([[ 0.00590394, -0.00333689, -0.00028904],
       [-0.00333689,  0.02086212,  0.00132631],
       [-0.00028904,  0.00132631,  0.01620665]])

In [80]:
model.coef_standard_error_

array([ 0.0768371 ,  0.14443724,  0.12730533])

In [81]:
model.p_values_

array([  3.70602872e-26,   1.80302706e-33,   9.33244943e-01])

In [82]:
# Reference fit: statsmodels Logit on the same data, for comparison with the
# Bernoulli GLM above.
mod = sm.Logit(y_logistic, X)
res = mod.fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.551934
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                 1000
Model:                          Logit   Df Residuals:                      997
Method:                           MLE   Df Model:                            2
Date:                Tue, 15 May 2018   Pseudo R-squ.:                  0.1381
Time:                        11:26:01   Log-Likelihood:                -551.93
converged:                       True   LL-Null:                       -640.37
                                        LLR p-value:                 3.917e-39
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.8079      0.077     10.514      0.000       0.657       0.958
x1            -1.7331      0.

In [56]:
s = Simulation(model)

In [57]:
s.sample(X, n_sim=10)

array([[ 0.,  1.,  0., ...,  1.,  0.,  1.],
       [ 1.,  0.,  0., ...,  1.,  1.,  0.],
       [ 0.,  1.,  1., ...,  1.,  0.,  0.],
       ..., 
       [ 0.,  1.,  1., ...,  1.,  0.,  0.],
       [ 0.,  1.,  0., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  0.,  0.]])

In [58]:
# Print the coefficients of 10 parametric-bootstrap fits. The loop variable is
# not named `model`, so the fitted Bernoulli model is not overwritten.
for boot_model in s.parametric_bootstrap(X, n_sim=10):
    print(boot_model.coef_)

[ 0.99422083 -1.96298402  1.05235721]
[ 0.96443709 -1.99466032  1.03640602]
[ 1.00035789 -2.06225649  1.00416288]
[ 0.95647016 -1.9472292   1.09357851]
[ 0.93641543 -1.96767443  1.01963135]
[ 0.96543839 -1.95048238  1.0720573 ]
[ 0.97230804 -1.96945012  1.01792291]
[ 0.94768126 -2.01506523  1.12307292]
[ 0.98593321 -2.01224627  1.03240051]
[ 0.95445028 -1.95445635  1.01371649]


In [59]:
# Print the coefficients of 10 non-parametric-bootstrap fits on (X, y_logistic).
# The loop variable is not named `model`, so the fitted Bernoulli model is not
# overwritten.
for boot_model in s.non_parametric_bootstrap(X, y_logistic, n_sim=10):
    print(boot_model.coef_)

[ 0.97689309 -1.97413283  1.10627714]
[ 0.95892487 -1.99413312  0.98665457]
[ 0.94129474 -1.94450942  1.00859525]
[ 0.93336539 -1.95264506  0.99312847]
[ 0.93940596 -1.91603988  0.99230598]
[ 0.95260326 -1.95432899  0.96347066]
[ 0.97404336 -1.95443378  1.07337741]
[ 0.93040456 -1.85128555  0.98612584]
[ 0.94250477 -1.9621798   1.02531186]
[ 0.9412844  -1.94538671  1.06160179]


## Poisson Model

In [60]:
# Poisson responses with rate exp(nu), i.e. a log link between mean and linear predictor.
mu = np.exp(nu)
y_poisson = np.random.poisson(lam=mu, size=N)

In [61]:
model = GLM(family=Poisson())
model.fit(X, y_poisson)

<glm.glm.GLM at 0x114430978>

In [62]:
model.coef_

array([ 0.9993738 , -2.01557039,  0.98708617])

In [63]:
model.coef_covariance_matrix_

array([[  5.19786341e-05,   5.41592437e-05,  -1.88130624e-05],
       [  5.41592437e-05,   1.00811556e-04,  -9.71528570e-07],
       [ -1.88130624e-05,  -9.71528570e-07,   6.18642911e-05]])

In [64]:
model.coef_standard_error_

array([ 0.00720962,  0.0100405 ,  0.00786539])

In [65]:
# Reference fit: statsmodels Poisson regression on the same data, reached
# through the `sm` API namespace like the OLS and Logit cross-checks.
mod = sm.Poisson(y_poisson, X)
res = mod.fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 1.857216
         Iterations 19
                          Poisson Regression Results                          
Dep. Variable:                      y   No. Observations:                10000
Model:                        Poisson   Df Residuals:                     9997
Method:                           MLE   Df Model:                            2
Date:                Sun, 17 Sep 2017   Pseudo R-squ.:                  0.6578
Time:                        16:10:00   Log-Likelihood:                -18572.
converged:                       True   LL-Null:                       -54276.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.9994      0.007    138.617      0.000       0.985       1.014
x1            -2.0156      0

In [66]:
s = Simulation(model)

In [67]:
s.sample(X, n_sim=10)

array([[  1.,   7.,   0., ...,   5.,   2.,   2.],
       [  3.,   8.,   1., ...,   5.,   0.,   5.],
       [  1.,   6.,   0., ...,   7.,   0.,   2.],
       ..., 
       [  0.,  10.,   1., ...,  10.,   1.,   6.],
       [  2.,   4.,   0., ...,   4.,   1.,   2.],
       [  3.,   5.,   0., ...,   3.,   1.,   3.]])

In [68]:
# Print the coefficients of 10 parametric-bootstrap fits. The loop variable is
# not named `model`, so the fitted Poisson model is not overwritten.
for boot_model in s.parametric_bootstrap(X, n_sim=10):
    print(boot_model.coef_)

[ 0.9962561  -2.02223879  0.98520933]
[ 1.00174874 -2.016179    0.98438619]
[ 1.01444728 -2.00367046  0.97880814]
[ 0.99162838 -2.01637286  1.00481727]
[ 1.00802374 -2.01063381  0.96973567]
[ 1.00579857 -2.01884458  0.9866297 ]
[ 0.9960399  -2.01891399  0.99412272]
[ 0.99357372 -2.0331467   0.99903938]
[ 0.99830196 -2.01604988  0.98445944]
[ 1.00074698 -2.02003424  0.98114143]


In [36]:
# Print the coefficients of 10 non-parametric-bootstrap fits on (X, y_poisson).
# The loop variable is not named `model`, so the fitted Poisson model is not
# overwritten.
for boot_model in s.non_parametric_bootstrap(X, y_poisson, n_sim=10):
    print(boot_model.coef_)

[ 0.99944037 -1.95435021  0.97540803]
[ 1.03624533 -1.96567739  0.91936164]
[ 0.98336717 -1.96622823  1.00794745]
[ 1.02681072 -1.9644957   0.92060739]
[ 1.01231914 -1.94929026  0.96059324]
[ 1.01857949 -1.96132365  0.95859381]
[ 0.99512153 -1.97303933  0.99006156]
[ 1.01719772 -1.93460717  0.9529537 ]
[ 0.99912808 -1.95763946  0.97810085]
[ 1.00010555 -1.94437559  0.97117927]


## Poisson with Exposures

In [37]:
# Per-unit rate exp(nu), scaled by a random exposure drawn from Uniform(0, 10);
# the Poisson mean for each row is rate * exposure.
# NOTE: this rebinds y_poisson, replacing the responses from the previous section.
mu = np.exp(nu)
expos = np.random.uniform(0, 10, size=N)
y_poisson = np.random.poisson(lam=(mu*expos), size=N)

In [38]:
# Poisson GLM with log(exposure) passed as the offset.
# NOTE(review): presumably the offset is added to the linear predictor, so the
# coefficients describe the per-unit-exposure rate; confirm against glm.GLM.fit.
model = GLM(family=Poisson())
model.fit(X, y_poisson, offset=np.log(expos))

<glm.glm.GLM at 0x118155438>

In [39]:
model.coef_

array([ 1.00325959, -1.99621054,  0.98932295])

In [40]:
model.coef_standard_error_

array([ 0.0084044 ,  0.01200987,  0.01120648])

## Gamma Regression

In [41]:
# Gamma responses with shape 2 and scale mu / 2, so each row's mean
# (shape * scale) equals mu = exp(nu).
mu = np.exp(nu)
y_gamma = np.random.gamma(shape=2.0, scale=(mu / 2.0), size=N)

In [42]:
# Gamma-family GLM fitted to the Gamma responses.
gamma_model = GLM(family=Gamma())
gamma_model.fit(X, y_gamma)

<glm.glm.GLM at 0x118155358>

In [43]:
gamma_model.coef_

array([ 1.04538732, -1.99544524,  0.92854162])

In [44]:
gamma_model.coef_standard_error_

array([ 0.01960164,  0.02551379,  0.02568215])

In [45]:
gamma_model.dispersion_

0.54319477864783849

In [46]:
# Reference fit: statsmodels Gamma GLM with a log link on the same data.
# The link is passed as an *instance*: handing a Family a link class is
# deprecated and raises TypeError on statsmodels >= 0.14 (where the preferred
# spelling is `links.Log()`).
# NOTE: this rebinds `gamma_model` from the glm.GLM fit to the statsmodels model.
gamma_model = sm.GLM(y_gamma, X,
                     family=sm.families.Gamma(
                         link=statsmodels.genmod.families.links.log()))
res = gamma_model.fit()
print(res.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                      y   No. Observations:                10000
Model:                            GLM   Df Residuals:                     9997
Model Family:                   Gamma   Df Model:                            2
Link Function:                    log   Scale:                  0.499038505471
Method:                          IRLS   Log-Likelihood:                -13910.
Date:                Sun, 17 Sep 2017   Deviance:                       5430.3
Time:                        14:47:52   Pearson chi2:                 4.99e+03
No. Iterations:                     5                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0454      0.019     55.641      0.000       1.009       1.082
x1            -1.9954      0.024    -81.597      0.0

## Exponential Regression

In [47]:
# Exponential responses with mean (scale) mu = exp(nu).
mu = np.exp(nu)
y_exponential = np.random.exponential(scale=mu, size=N)

In [48]:
# The exponential distribution is a Gamma distribution with shape 1, so the
# Gamma family is fitted to the exponential responses.
exponential_model = GLM(family=Gamma())
exponential_model.fit(X, y_exponential)

<glm.glm.GLM at 0x11814d4a8>

In [49]:
exponential_model.coef_

array([ 1.02547471, -2.03635258,  0.99990048])

In [50]:
exponential_model.coef_standard_error_

array([ 0.02824993,  0.03677053,  0.03701316])

In [51]:
exponential_model.dispersion_

1.1282497110609833

In [52]:
# Reference fit: statsmodels Gamma GLM with a log link on the exponential data.
# The link is passed as an *instance*: handing a Family a link class is
# deprecated and raises TypeError on statsmodels >= 0.14 (where the preferred
# spelling is `links.Log()`).
# NOTE: this rebinds `exponential_model` from the glm.GLM fit to the statsmodels model.
exponential_model = sm.GLM(y_exponential, X,
                     family=sm.families.Gamma(
                         link=statsmodels.genmod.families.links.log()))
res = exponential_model.fit()
print(res.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                      y   No. Observations:                10000
Model:                            GLM   Df Residuals:                     9997
Model Family:                   Gamma   Df Model:                            2
Link Function:                    log   Scale:                   0.98383654671
Method:                          IRLS   Log-Likelihood:                -15006.
Date:                Sun, 17 Sep 2017   Deviance:                       11279.
Time:                        14:47:52   Pearson chi2:                 9.84e+03
No. Iterations:                     6                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0255      0.026     38.873      0.000       0.974       1.077
x1            -2.0364      0.034    -59.305      0.0

## Linear Model with Correlated Predictors

In [53]:
# Fresh design matrix for this section (rebinds N, X and nu from earlier cells).
N = 1000
X = np.empty(shape=(N, 3))
X[:, 0] = 1.0
X[:, 1] = np.random.uniform(size=N)
# The second predictor is half of the first plus Uniform(-0.5, 0.5) noise, so
# the two predictors are correlated.
X[:, 2] = 0.5*X[:, 1] + np.random.uniform(-0.5, 0.5, size=N)
# Noise-free linear predictor with coefficients (1, -2, 1).
nu = 1 - 2*X[:, 1] + X[:, 2]

In [54]:
# Response: linear predictor plus standard normal noise, fitted with a Gaussian GLM.
y = nu + np.random.normal(size=N)
model = GLM(family=Gaussian())
model.fit(X, y)

<glm.glm.GLM at 0x118155160>

In [55]:
model.coef_

array([ 1.04961873, -2.09917034,  0.92403124])

In [56]:
model.coef_covariance_matrix_

array([[ 0.00390364, -0.00599316,  0.00019188],
       [-0.00599316,  0.01504632, -0.00598741],
       [ 0.00019188, -0.00598741,  0.01107235]])

In [57]:
model.coef_standard_error_

array([ 0.06247909,  0.12266344,  0.10522523])

In [58]:
# Reference fit: statsmodels OLS on the correlated-predictor data, for
# comparison with the GLM coefficients and standard errors above.
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.228
Model:                            OLS   Adj. R-squared:                  0.226
Method:                 Least Squares   F-statistic:                     146.9
Date:                Sun, 17 Sep 2017   Prob (F-statistic):           1.23e-56
Time:                        14:47:52   Log-Likelihood:                -1398.6
No. Observations:                1000   AIC:                             2803.
Df Residuals:                     997   BIC:                             2818.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0496      0.062     16.800      0.0