In [1]:
import numpy as np
from families import Gaussian, Bernoulli, Poisson
from glm import GLM
from simulate import Simulation

import statsmodels.api as sm
import statsmodels

  from pandas.core import datetools


In [2]:
# Seed NumPy's global generator so that every random draw in this notebook is
# reproducible under Restart Kernel -> Run All. The outputs saved in the notebook
# were produced before the seed was added and will change on the next run.
SEED = 0
np.random.seed(SEED)

# Design matrix: an intercept column plus two independent Uniform(0, 1) predictors.
N = 10000
X = np.empty(shape=(N, 3))
X[:, 0] = 1.0
X[:, 1] = np.random.uniform(size=N)
X[:, 2] = np.random.uniform(size=N)

# True linear predictor; the generating coefficients are (1, -2, 1).
nu = 1 - 2*X[:, 1] + X[:, 2]

## Linear Model

In [3]:
# Gaussian response: the linear predictor plus standard-normal noise.
y = nu + np.random.normal(size=N)
# Fit the project's GLM with the Gaussian family; the result is compared against
# statsmodels OLS further down.
model = GLM(family=Gaussian())
model.fit(X, y)

<glm.GLM at 0x103e2cba8>

In [4]:
# Coefficient estimates; nu was built with coefficients (1, -2, 1).
model.coef_

array([ 1.02662982, -2.00689534,  0.97315765])

In [5]:
# Covariance matrix attribute for the coefficient estimates (3 x 3, one row/column
# per column of X).
model.coef_covariance_matrix_

array([[  6.98009426e-04,  -5.97492508e-04,  -5.99827306e-04],
       [ -5.97492508e-04,   1.22414128e-03,  -1.76138485e-05],
       [ -5.99827306e-04,  -1.76138485e-05,   1.21283021e-03]])

In [6]:
# Coefficient standard errors, for comparison with the "std err" column of the
# statsmodels summary.
model.coef_standard_error_

array([ 0.02641987,  0.03498773,  0.03482571])

In [7]:
# Reference fit: statsmodels OLS on the same data. X already carries the
# intercept column (X[:, 0] == 1.0), so no constant is added here.
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.287
Model:                            OLS   Adj. R-squared:                  0.287
Method:                 Least Squares   F-statistic:                     2013.
Date:                Tue, 29 Aug 2017   Prob (F-statistic):               0.00
Time:                        21:28:59   Log-Likelihood:                -14242.
No. Observations:               10000   AIC:                         2.849e+04
Df Residuals:                    9997   BIC:                         2.851e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0266      0.026     38.858      0.0

In [8]:
# Wrap the fitted Gaussian model in a Simulation object (from the project's
# simulate module); it is used for sampling and bootstrapping.
s = Simulation(model)

In [9]:
# Draw simulated values for the rows of X; the displayed result is a 2-D array.
# NOTE(review): presumably these are responses sampled from the fitted model --
# confirm against simulate.Simulation.sample.
s.sample(X)

array([[ 0.73365651,  0.80991087,  1.05693376, ..., -1.03728119,
         1.02814989,  1.45978269],
       [-0.37673923,  0.11526435,  0.99587777, ..., -1.12673948,
        -0.16283743,  1.01772313],
       [ 2.29599493,  0.46341846,  2.74691534, ..., -0.2144835 ,
         1.06719874,  1.16468047],
       ..., 
       [-0.29884266,  0.3774177 ,  1.0201069 , ..., -0.18011773,
         1.62147134, -0.69506036],
       [ 1.01200878,  2.21459133, -2.81323676, ...,  1.6359281 ,
         0.90394193, -0.23068041],
       [-0.94119256,  2.53267066,  1.81224906, ...,  0.87192684,
         1.19614948,  1.92611619]])

In [37]:
# Request three bootstrap refits and print each one's coefficients.
# `parameteric_bootstrap` is the method name as spelled in the simulate module;
# the loop below only relies on it yielding objects that expose `coef_`.
models = s.parameteric_bootstrap(X, n_sim=3)
# The loop variable is deliberately not named `model`, so the notebook-level
# fitted `model` is not rebound to the last bootstrap refit.
for boot_model in models:
    print(boot_model.coef_)

[ 1.41358154 -2.57473853  1.35877735]
[ 0.99580065 -2.0501879   1.01397079]
[ 0.79776362 -1.57506204  1.12329694]


## Linear Model with Sample Weights

In [11]:
# One weight per observation, drawn uniformly from [0, 2).
sample_weights = np.random.uniform(low=0, high=2, size=N)

In [12]:
# Refit the Gaussian model on the same X and y, now passing per-observation weights.
model = GLM(family=Gaussian())
# Binding the return value of fit() keeps the cell from displaying an object repr.
# NOTE(review): this rebinds `model` to whatever fit() returns; other cells display
# a glm.GLM from fit(), presumably the model itself -- confirm.
model = model.fit(X, y, sample_weights=sample_weights)

In [13]:
# Coefficient estimates from the weighted fit; the generating coefficients are
# still (1, -2, 1).
model.coef_

array([ 1.03646375, -2.0034211 ,  0.95508425])

## Logistic Model

In [14]:
# Success probabilities: the logistic transform of the linear predictor.
p = 1.0 / (1.0 + np.exp(-nu))
# One Bernoulli draw (binomial with a single trial) per observation.
y_logistic = np.random.binomial(n=1, p=p, size=N)

In [15]:
# Fit the project's GLM with the Bernoulli family to the binary response.
model = GLM(family=Bernoulli())
model.fit(X, y_logistic)

<glm.GLM at 0x10f5ba940>

In [16]:
# Coefficient estimates; y_logistic was drawn with success probability
# 1 / (1 + exp(-nu)), where nu = 1 - 2*x1 + x2.
model.coef_

array([ 1.00909882, -1.95075566,  0.94139471])

In [17]:
# Dispersion attribute of the fitted model (displayed as 1.0 for this Bernoulli fit).
model.dispersion_

array(1.0)

In [18]:
# Covariance matrix attribute for the Bernoulli-fit coefficient estimates.
model.coef_covariance_matrix_

array([[ 0.00322092, -0.00293221, -0.0024815 ],
       [-0.00293221,  0.005957  , -0.00046789],
       [-0.0024815 , -0.00046789,  0.0056166 ]])

In [19]:
# Coefficient standard errors, for comparison with the "std err" column of the
# statsmodels Logit summary.
model.coef_standard_error_

array([ 0.05675315,  0.07718157,  0.074944  ])

In [20]:
# Reference fit: statsmodels Logit on the same binary response and design matrix
# (X already includes the intercept column).
mod = sm.Logit(y_logistic, X)
res = mod.fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.624807
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                10000
Model:                          Logit   Df Residuals:                     9997
Method:                           MLE   Df Model:                            2
Date:                Tue, 29 Aug 2017   Pseudo R-squ.:                 0.06190
Time:                        21:28:59   Log-Likelihood:                -6248.1
converged:                       True   LL-Null:                       -6660.4
                                        LLR p-value:                8.839e-180
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0091      0.057     17.780      0.000       0.898       1.120
x1            -1.9508      0.

In [21]:
# Monte Carlo check of the reported standard errors: repeatedly redraw the binary
# response from the true probabilities p, refit, and measure the spread of the
# coefficient estimates across refits.
n_sim = 1000
coefs = np.empty((n_sim, X.shape[1]))
for i in range(n_sim):
    # Loop-local names keep the notebook-level `y_logistic` and `model` intact,
    # so the cells that inspect the original fit can be re-run safely.
    y_sim = np.random.binomial(1, p=p, size=N)
    sim_model = GLM(family=Bernoulli())
    sim_model.fit(X, y_sim)
    coefs[i, :] = sim_model.coef_

# Empirical standard deviation of each coefficient; compare with
# model.coef_standard_error_.
print(coefs.std(axis=0))

[ 0.05972696  0.0781678   0.07708184]


## Poisson Model

In [22]:
# Poisson means: the exponential of the linear predictor.
mu = np.exp(nu)
# One Poisson count per observation, with rate mu.
y_poisson = np.random.poisson(mu, size=N)

In [23]:
# Fit the project's GLM with the Poisson family to the count response.
model = GLM(family=Poisson())
model.fit(X, y_poisson)

<glm.GLM at 0x1116be5c0>

In [24]:
# Coefficient estimates; the counts were drawn with mean exp(nu), where
# nu = 1 - 2*x1 + x2.
model.coef_

array([ 1.00373364, -1.99765604,  1.00028229])

In [25]:
# Covariance matrix attribute for the Poisson-fit coefficient estimates.
model.coef_covariance_matrix_

array([[  3.39740482e-04,  -2.42718949e-04,  -3.57137992e-04],
       [ -2.42718949e-04,   7.19341461e-04,  -5.06093087e-06],
       [ -3.57137992e-04,  -5.06093087e-06,   6.16364154e-04]])

In [26]:
# Coefficient standard errors, for comparison with the "std err" column of the
# statsmodels Poisson summary.
model.coef_standard_error_

array([ 0.01843205,  0.02682054,  0.02482668])

In [27]:
# Reference fit: statsmodels Poisson regression on the same counts and design matrix.
# The class is reached through `sm` (statsmodels.api), like sm.OLS and sm.Logit in
# the other sections; the bare name `Poisson` in this notebook is the project's
# family class, so the qualified name is required.
mod = sm.Poisson(y_poisson, X)
res = mod.fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 1.609461
         Iterations 7
                          Poisson Regression Results                          
Dep. Variable:                      y   No. Observations:                10000
Model:                        Poisson   Df Residuals:                     9997
Method:                           MLE   Df Model:                            2
Date:                Tue, 29 Aug 2017   Pseudo R-squ.:                  0.1934
Time:                        21:29:04   Log-Likelihood:                -16095.
converged:                       True   LL-Null:                       -19954.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0037      0.018     54.456      0.000       0.968       1.040
x1            -1.9977      0.

## Poisson with Exposures

In [28]:
# Per-unit-exposure Poisson rate: the exponential of the linear predictor.
mu = np.exp(nu)
# Exposure for each observation, drawn uniformly from [0, 10).
expos = np.random.uniform(low=0, high=10, size=N)
# Counts whose mean is the rate scaled by the exposure.
y_poisson = np.random.poisson(lam=mu * expos, size=N)

In [29]:
# The counts were generated with mean exp(nu) * expos = exp(nu + log(expos)),
# so log(expos) is passed as the offset.
# NOTE(review): assumes GLM adds `offset` to the linear predictor under a log
# link -- confirm in glm.py.
model = GLM(family=Poisson())
model.fit(X, y_poisson, offset=np.log(expos))

<glm.GLM at 0x10f5bae80>

In [30]:
# Coefficient estimates from the fit with an exposure offset; the generating
# coefficients in nu are (1, -2, 1).
model.coef_

array([ 0.98734115, -1.99382456,  1.00943382])

## Linear Model with Correlated Predictors

In [31]:
# Smaller sample with correlated predictors. Start from a matrix of ones so that
# column 0 is already the intercept, then fill in the two predictor columns.
N = 100
X = np.ones(shape=(N, 3))
X[:, 1] = np.random.uniform(size=N)
# The second predictor is half of the first plus Uniform(-0.5, 0.5) noise.
X[:, 2] = 0.5 * X[:, 1] + np.random.uniform(low=-0.5, high=0.5, size=N)
# Same generating coefficients as before: (1, -2, 1).
nu = 1 - 2 * X[:, 1] + X[:, 2]

In [32]:
# Gaussian response for the correlated-predictor design: linear predictor plus
# standard-normal noise.
y = nu + np.random.normal(size=N)
# Fit the project's GLM with the Gaussian family on the N = 100 sample.
model = GLM(family=Gaussian())
model.fit(X, y)

<glm.GLM at 0x10f5bae48>

In [33]:
# Coefficient estimates on the small correlated sample; the generating
# coefficients are (1, -2, 1).
model.coef_

array([ 0.901248  , -1.87875528,  0.95034967])

In [34]:
# Covariance matrix attribute for the coefficient estimates of the
# correlated-predictor fit.
model.coef_covariance_matrix_

array([[ 0.05334932, -0.07458111, -0.00610246],
       [-0.07458111,  0.17783147, -0.0699096 ],
       [-0.00610246, -0.0699096 ,  0.14609301]])

In [35]:
# Coefficient standard errors for the N = 100 fit, for comparison with the
# "std err" column of the statsmodels OLS summary.
model.coef_standard_error_

array([ 0.23097473,  0.42170068,  0.38222115])

In [36]:
# Reference fit: statsmodels OLS on the correlated-predictor data (X already
# includes the intercept column).
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.173
Model:                            OLS   Adj. R-squared:                  0.155
Method:                 Least Squares   F-statistic:                     10.11
Date:                Tue, 29 Aug 2017   Prob (F-statistic):           0.000102
Time:                        21:29:04   Log-Likelihood:                -147.32
No. Observations:                 100   AIC:                             300.6
Df Residuals:                      97   BIC:                             308.5
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.9012      0.231      3.902      0.0