In [58]:
import numpy as np
from families import Gaussian, Bernoulli, Poisson
from glm import GLM
from simulation import Simulation

import statsmodels.api as sm
import statsmodels

In [59]:
N = 10000
X = np.empty(shape=(N, 3))
X[:, 0] = 1.0
X[:, 1] = np.random.uniform(size=N)
X[:, 2] = np.random.uniform(size=N)
nu = 1 - 2*X[:, 1] + X[:, 2]

## Linear Model

In [60]:
y = nu + np.random.normal(size=N)
model = GLM(family=Gaussian())
model.fit(X, y)

<glm.GLM at 0x11d048828>

In [61]:
model.coef_

array([ 0.99012575, -2.03050689,  1.03338721])

In [62]:
model.coef_covariance_matrix_

array([[  6.61568782e-04,  -5.59391251e-04,  -5.71045192e-04],
       [ -5.59391251e-04,   1.14025367e-03,  -1.51522322e-05],
       [ -5.71045192e-04,  -1.51522322e-05,   1.15266436e-03]])

In [63]:
model.coef_standard_error_

array([ 0.02572098,  0.03376764,  0.03395091])

In [64]:
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.310
Model:                            OLS   Adj. R-squared:                  0.310
Method:                 Least Squares   F-statistic:                     2247.
Date:                Thu, 31 Aug 2017   Prob (F-statistic):               0.00
Time:                        20:32:24   Log-Likelihood:                -14024.
No. Observations:               10000   AIC:                         2.805e+04
Df Residuals:                    9997   BIC:                         2.808e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.9901      0.026     38.495      0.0

## Run some simulations off the linear model.

In [65]:
s = Simulation(model)

In [66]:
s.sample(X)

array([[ 1.0169067 ,  2.30358981,  1.25405858, ...,  0.6578417 ,
         0.97077118,  0.71437614],
       [ 0.23039934,  2.93839439,  0.65908541, ...,  1.21655112,
        -0.12994446, -0.28533433],
       [ 0.48423457,  3.88413237,  0.89324821, ...,  3.63291064,
        -0.96303421, -1.38667515],
       ..., 
       [ 0.91443084,  1.13149329,  1.23269333, ...,  1.2619434 ,
        -0.84411013, -0.9672626 ],
       [ 1.68811327,  0.9060831 ,  3.05975441, ...,  2.49456912,
         1.29999224, -0.10753916],
       [-0.66257397,  1.68117595,  0.64238738, ...,  3.65992798,
         0.92680093,  0.38102131]])

In [67]:
models = s.parametric_bootstrap(X, n_sim=10)
for model in models:
    print(model.coef_)

[ 0.97576049 -2.0365819   1.05271293]
[ 0.98565861 -2.02946046  1.04894415]
[ 0.98585588 -2.0160375   1.04578312]
[ 0.98883412 -2.0306613   1.01416709]
[ 0.98441635 -1.9888065   1.03089003]
[ 0.96040388 -2.01517418  1.07194308]
[ 0.99841736 -2.00246023  1.01234271]
[ 0.97826631 -2.04919359  1.07699907]
[ 0.9762117  -1.99203309  1.02651406]
[ 1.0052343  -1.97449567  0.95116409]


In [68]:
models = s.non_parametric_bootstrap(X, y, n_sim=10)
for model in models:
    print(model.coef_)

[ 0.93847765 -1.95989564  1.05760751]
[ 1.02351551 -2.07241499  1.02093914]
[ 0.99429092 -2.04613155  1.05253188]
[ 1.01835538 -2.07177749  0.99182212]
[ 0.98575099 -2.05043034  1.04260073]
[ 1.0012661  -2.03359687  1.06290603]
[ 0.94279831 -2.04666513  1.13509242]
[ 1.021483   -2.0483463   0.98419557]
[ 0.98585365 -2.01262255  1.01915351]
[ 0.95476329 -2.05900235  1.13193697]


## Linear Model with Sample Weights

In [69]:
sample_weights = np.random.uniform(0, 2, size=N)

In [70]:
model = GLM(family=Gaussian())
model = model.fit(X, y, sample_weights=sample_weights)

In [71]:
model.coef_

array([ 0.97727694, -2.01428371,  1.03092396])

## Logistic Model

In [72]:
p = 1 / (1 + np.exp(-nu))
y_logistic = np.random.binomial(1, p=p, size=N)

In [73]:
model = GLM(family=Bernoulli())
model.fit(X, y_logistic)

<glm.GLM at 0x11d04c748>

In [74]:
model.coef_

array([ 0.9557087 , -1.98396942,  0.95348685])

In [75]:
model.dispersion_

array(1.0)

In [76]:
model.coef_covariance_matrix_

array([[ 0.00314718, -0.00281759, -0.00245855],
       [-0.00281759,  0.00575934, -0.0004638 ],
       [-0.00245855, -0.0004638 ,  0.00553116]])

In [77]:
model.coef_standard_error_

array([ 0.05609973,  0.07589029,  0.07437175])

In [78]:
mod = sm.Logit(y_logistic, X)
res = mod.fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.628612
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                10000
Model:                          Logit   Df Residuals:                     9997
Method:                           MLE   Df Model:                            2
Date:                Thu, 31 Aug 2017   Pseudo R-squ.:                 0.06552
Time:                        20:32:30   Log-Likelihood:                -6286.1
converged:                       True   LL-Null:                       -6726.9
                                        LLR p-value:                3.879e-192
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.9557      0.056     17.036      0.000       0.846       1.066
x1            -1.9840      0.

In [79]:
X.shape[0]

10000

In [80]:
s = Simulation(model)

In [81]:
s.sample(X, n_sim=10)

array([[ 0.,  1.,  1., ...,  0.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  0.,  1.],
       ..., 
       [ 1.,  1.,  1., ...,  1.,  0.,  1.],
       [ 0.,  0.,  1., ...,  1.,  1.,  0.],
       [ 1.,  0.,  1., ...,  1.,  1.,  0.]])

In [82]:
for model in s.parametric_bootstrap(X, n_sim=10):
    print(model.coef_)

[ 1.04922623 -2.10660995  0.91388233]
[ 1.02292572 -2.0543495   0.89129902]
[ 1.04332149 -2.10936694  0.92235681]
[ 0.99480486 -2.04755492  0.93204081]
[ 1.07071614 -2.03538029  0.84614258]
[ 0.92249173 -1.89231514  0.93822164]
[ 0.9876872  -2.09843855  0.98571232]
[ 0.86725325 -1.85237114  0.94442227]
[ 1.01371999 -2.016722    0.82123156]
[ 0.99997336 -2.09310288  0.99525827]


In [83]:
for model in s.non_parametric_bootstrap(X, y_logistic, n_sim=10):
    print(model.coef_)

[ 0.95341559 -2.02352743  0.95763801]
[ 0.93408256 -1.99602387  1.05838911]
[ 0.95682048 -1.91836315  0.8066071 ]
[ 1.00430771 -2.03977869  0.95115963]
[ 0.98713396 -2.03390483  0.87865217]
[ 0.97083281 -2.017276    0.90649712]
[ 0.93743486 -2.01966411  1.05347461]
[ 0.94619079 -2.06068354  1.07041153]
[ 1.06515297 -2.02713988  0.80099542]
[ 0.90562146 -2.03396292  1.00113271]


## Poission Model

In [84]:
mu = np.exp(nu)
y_poisson = np.random.poisson(lam=mu, size=N)

In [85]:
model = GLM(family=Poisson())
model.fit(X, y_poisson)

<glm.GLM at 0x11cef15f8>

In [86]:
model.coef_

array([ 0.96788949, -1.99050176,  1.04195954])

In [87]:
model.coef_covariance_matrix_

array([[  3.41279183e-04,  -2.36389554e-04,  -3.61130598e-04],
       [ -2.36389554e-04,   7.10349029e-04,  -9.51442912e-06],
       [ -3.61130598e-04,  -9.51442912e-06,   6.22435960e-04]])

In [88]:
model.coef_standard_error_

array([ 0.01847374,  0.02665237,  0.02494867])

In [89]:
mod = statsmodels.discrete.discrete_model.Poisson(y_poisson, X)
res = mod.fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 1.589532
         Iterations 7
                          Poisson Regression Results                          
Dep. Variable:                      y   No. Observations:                10000
Model:                        Poisson   Df Residuals:                     9997
Method:                           MLE   Df Model:                            2
Date:                Thu, 31 Aug 2017   Pseudo R-squ.:                  0.1980
Time:                        20:32:46   Log-Likelihood:                -15895.
converged:                       True   LL-Null:                       -19821.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.9679      0.018     52.393      0.000       0.932       1.004
x1            -1.9905      0.

In [91]:
s = Simulation(model)

In [93]:
s.sample(X, n_sim=10)

array([[ 3.,  3.,  2., ...,  5.,  1.,  0.],
       [ 3.,  5.,  1., ...,  3.,  2.,  0.],
       [ 1.,  4.,  3., ...,  8.,  1.,  2.],
       ..., 
       [ 2.,  0.,  2., ...,  5.,  2.,  0.],
       [ 5.,  5.,  1., ...,  6.,  1.,  4.],
       [ 2.,  4.,  2., ...,  5.,  3.,  0.]])

In [99]:
for model in s.parametric_bootstrap(X, n_sim=10):
    print(model.coef_)

[ 0.96151099 -1.99728407  1.05577442]
[ 1.00558146 -2.00089176  0.99918063]
[ 0.97312533 -1.98748874  1.02778791]
[ 0.94403445 -1.96937844  1.07427859]
[ 0.98358348 -2.00822818  1.03359978]
[ 0.94349465 -1.99877039  1.07177374]
[ 0.96621698 -1.9884313   1.04983511]
[ 0.96819303 -2.00038328  1.05842744]
[ 0.96992219 -2.02523479  1.06749679]
[ 0.95474677 -1.97042477  1.04619878]


In [101]:
for model in s.non_parametric_bootstrap(X, y_poisson, n_sim=10):
    print(model.coef_)

[ 0.984897   -2.02894934  1.04021779]
[ 0.96020869 -1.94306207  1.00875646]
[ 0.94336372 -2.00068361  1.09770317]
[ 0.94243268 -1.94597514  1.04374761]
[ 0.94617518 -1.99298741  1.08540471]
[ 0.96786723 -2.06137197  1.07513042]
[ 0.96362497 -1.98529532  1.05708189]
[ 0.97659714 -1.97570261  1.03351564]
[ 0.97316038 -1.97283943  1.02878469]
[ 0.99404346 -2.05488398  1.03913288]


## Poisson with Exposures

In [31]:
mu = np.exp(nu)
expos = np.random.uniform(0, 10, size=N)
y_poisson = np.random.poisson(lam=(mu*expos), size=N)

In [32]:
model = GLM(family=Poisson())
model.fit(X, y_poisson, offset=np.log(expos))

<glm.GLM at 0x11a612320>

In [33]:
model.coef_

array([ 1.00070441, -2.00660168,  1.00680519])

## Linear Model with Correlated Predictors

In [34]:
N = 100
X = np.empty(shape=(N, 3))
X[:, 0] = 1.0
X[:, 1] = np.random.uniform(size=N)
X[:, 2] = 0.5*X[:, 1] + np.random.uniform(-0.5, 0.5, size=N)
nu = 1 - 2*X[:, 1] + X[:, 2]

In [35]:
y = nu + np.random.normal(size=N)
model = GLM(family=Gaussian())
model.fit(X, y)

<glm.GLM at 0x11a612780>

In [36]:
model.coef_

array([ 0.97863671, -1.87050695,  0.63148404])

In [37]:
model.coef_covariance_matrix_

array([[ 0.0452718 , -0.06974194,  0.00739348],
       [-0.06974194,  0.17369825, -0.08084008],
       [ 0.00739348, -0.08084008,  0.1340376 ]])

In [38]:
model.coef_standard_error_

array([ 0.21277171,  0.41677123,  0.36611146])

In [39]:
smod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 1.572937
         Iterations 7
                          Poisson Regression Results                          
Dep. Variable:                      y   No. Observations:                10000
Model:                        Poisson   Df Residuals:                     9997
Method:                           MLE   Df Model:                            2
Date:                Thu, 31 Aug 2017   Pseudo R-squ.:                  0.1969
Time:                        20:22:29   Log-Likelihood:                -15729.
converged:                       True   LL-Null:                       -19585.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0169      0.019     54.861      0.000       0.981       1.053
x1            -2.0211      0.