# statsmodels
## statistical models, hypothesis tests, and data exploration

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
# Load the "Guerry" dataset from the R "HistData" package through
# statsmodels' Rdatasets helper and keep only its DataFrame payload.
# NOTE: get_rdataset fetches the data remotely, so this cell needs network
# access on a fresh run.
data = sm.datasets.get_rdataset(dataname="Guerry", package="HistData").data

In [4]:
# Confirm that the loader handed back a pandas DataFrame.
type(data)

pandas.core.frame.DataFrame

In [6]:
# List the available variables; the regression below uses Lottery, Literacy
# and Pop1831.
data.columns

Index(['dept', 'Region', 'Department', 'Crime_pers', 'Crime_prop', 'Literacy',
       'Donations', 'Infants', 'Suicides', 'MainCity', 'Wealth', 'Commerce',
       'Clergy', 'Crime_parents', 'Infanticide', 'Donation_clergy', 'Lottery',
       'Desertion', 'Instruction', 'Prostitutes', 'Distance', 'Area',
       'Pop1831'],
      dtype='object')

In [7]:
# Fit an ordinary least squares model specified with an R-style formula:
# Lottery is regressed on Literacy and on the natural log of Pop1831
# (np.log is applied to the column inside the formula itself).
res = smf.ols(
    formula="Lottery ~ Literacy + np.log(Pop1831)",
    data=data,
).fit()

In [10]:
# Display the full regression report: overall fit statistics, the coefficient
# table (estimates, standard errors, t-tests, 95% confidence intervals) and
# residual diagnostics.
res.summary()

0,1,2,3
Dep. Variable:,Lottery,R-squared:,0.348
Model:,OLS,Adj. R-squared:,0.333
Method:,Least Squares,F-statistic:,22.2
Date:,"Sun, 08 Aug 2021",Prob (F-statistic):,1.9e-08
Time:,13:44:00,Log-Likelihood:,-379.82
No. Observations:,86,AIC:,765.6
Df Residuals:,83,BIC:,773.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,246.4341,35.233,6.995,0.000,176.358,316.510
Literacy,-0.4889,0.128,-3.832,0.000,-0.743,-0.235
np.log(Pop1831),-31.3114,5.977,-5.239,0.000,-43.199,-19.424

0,1,2,3
Omnibus:,3.713,Durbin-Watson:,2.019
Prob(Omnibus):,0.156,Jarque-Bera (JB):,3.394
Skew:,-0.487,Prob(JB):,0.183
Kurtosis:,3.003,Cond. No.,702.0


In [12]:
# Array-based API: simulate a design matrix instead of using a formula.
# Seed NumPy's global generator first so that Restart & Run All reproduces the
# same random draws in this cell and in the later cells that call np.random.
np.random.seed(0)
nobs = 100  # number of simulated observations
# Two regressors drawn uniformly from [0, 1); add_constant prepends a column
# of ones so the model includes an intercept.
X = sm.add_constant(np.random.random((nobs, 2)))

In [13]:
# True coefficients, ordered to match the columns of X: [const, x1, x2].
beta = [1, 0.1, 0.5]
# Noise term drawn uniformly from [0, 1). Its mean is 0.5 rather than 0, so the
# fitted intercept ends up estimating the true constant plus roughly 0.5.
e = np.random.random(size=nobs)
# Simulated response: linear predictor plus noise.
y = X.dot(beta) + e

In [14]:
# Fit ordinary least squares directly on the arrays: y is the response (endog)
# and X, which already contains the constant column, is the design matrix (exog).
res_2 = sm.OLS(endog=y, exog=X).fit()

In [15]:
# Display the regression report for the simulated data; compare the estimated
# coefficients with the true `beta` used to generate `y`.
res_2.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.302
Model:,OLS,Adj. R-squared:,0.288
Method:,Least Squares,F-statistic:,21.01
Date:,"Sun, 08 Aug 2021",Prob (F-statistic):,2.62e-08
Time:,13:46:47,Log-Likelihood:,-6.5828
No. Observations:,100,AIC:,19.17
Df Residuals:,97,BIC:,26.98
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.5181,0.075,20.347,0.000,1.370,1.666
x1,0.1317,0.090,1.458,0.148,-0.048,0.311
x2,0.6090,0.094,6.461,0.000,0.422,0.796

0,1,2,3
Omnibus:,10.324,Durbin-Watson:,1.751
Prob(Omnibus):,0.006,Jarque-Bera (JB):,4.208
Skew:,-0.213,Prob(JB):,0.122
Kurtosis:,2.09,Cond. No.,5.74


In [16]:
# List every attribute and method of the fitted results object (private and
# dunder names included) to discover what can be extracted from it, e.g.
# `bse`, `conf_int`, `f_test`.
dir(res)

['HC0_se',
 'HC1_se',
 'HC2_se',
 'HC3_se',
 '_HCCM',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abat_diagonal',
 '_cache',
 '_data_attr',
 '_data_in_cache',
 '_get_robustcov_results',
 '_is_nested',
 '_use_t',
 '_wexog_singular_values',
 'aic',
 'bic',
 'bse',
 'centered_tss',
 'compare_f_test',
 'compare_lm_test',
 'compare_lr_test',
 'condition_number',
 'conf_int',
 'conf_int_el',
 'cov_HC0',
 'cov_HC1',
 'cov_HC2',
 'cov_HC3',
 'cov_kwds',
 'cov_params',
 'cov_type',
 'df_model',
 'df_resid',
 'diagn',
 'eigenvals',
 'el_test',
 'ess',
 'f_pvalue',
 'f_test',
 'fittedvalues',
 'fvalue',
 'get_influence',
 'get_prediction',
 'get_robustcov_results',
 'initia