# Statsmodels
## Statisical models, hypothesis tests, and data exploration

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
# grab the data/load
data = sm.datasets.get_rdataset("Guerry", "HistData").data

In [3]:
type(data)

pandas.core.frame.DataFrame

In [4]:
data.head()

Unnamed: 0,dept,Region,Department,Crime_pers,Crime_prop,Literacy,Donations,Infants,Suicides,MainCity,...,Crime_parents,Infanticide,Donation_clergy,Lottery,Desertion,Instruction,Prostitutes,Distance,Area,Pop1831
0,1,E,Ain,28870,15890,37,5098,33120,35039,2:Med,...,71,60,69,41,55,46,13,218.372,5762,346.03
1,2,N,Aisne,26226,5521,51,8901,14572,12831,2:Med,...,4,82,36,38,82,24,327,65.945,7369,513.0
2,3,C,Allier,26747,7925,13,10973,17044,114121,2:Med,...,46,42,76,66,16,85,34,161.927,7340,298.26
3,4,E,Basses-Alpes,12935,7289,46,2733,23018,14238,1:Sm,...,70,12,37,80,32,29,2,351.399,6925,155.9
4,5,E,Hautes-Alpes,17488,8174,69,6962,23076,16171,1:Sm,...,22,23,64,79,35,7,1,320.28,5549,129.1


In [5]:
data.columns

Index(['dept', 'Region', 'Department', 'Crime_pers', 'Crime_prop', 'Literacy',
       'Donations', 'Infants', 'Suicides', 'MainCity', 'Wealth', 'Commerce',
       'Clergy', 'Crime_parents', 'Infanticide', 'Donation_clergy', 'Lottery',
       'Desertion', 'Instruction', 'Prostitutes', 'Distance', 'Area',
       'Pop1831'],
      dtype='object')

In [10]:
# fit ols regression use ln
# with R-style forumlas 
res = smf.ols("Lottery ~ Literacy + np.log(Pop1831)", data = data).fit()

In [15]:
# default confidence interval significance level = 0.05
res.summary(alpha = .05)

# changes the format of table 
# print(res.summary(alpha = .05))

                            OLS Regression Results                            
Dep. Variable:                Lottery   R-squared:                       0.348
Model:                            OLS   Adj. R-squared:                  0.333
Method:                 Least Squares   F-statistic:                     22.20
Date:                Thu, 02 Mar 2023   Prob (F-statistic):           1.90e-08
Time:                        11:48:18   Log-Likelihood:                -379.82
No. Observations:                  86   AIC:                             765.6
Df Residuals:                      83   BIC:                             773.0
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept         246.4341     35.233     

In [22]:
# using numpy arrays 
nobs = 100
X = np.random.random((nobs,2))
X = sm.add_constant(X) 
X

array([[1.00000000e+00, 3.75233348e-01, 1.06875812e-01],
       [1.00000000e+00, 6.75372608e-01, 7.55492191e-02],
       [1.00000000e+00, 8.19855585e-01, 7.65413976e-01],
       [1.00000000e+00, 6.87798348e-01, 8.72244862e-01],
       [1.00000000e+00, 1.90260197e-02, 7.21867867e-01],
       [1.00000000e+00, 9.49692919e-01, 6.54102603e-01],
       [1.00000000e+00, 3.29513867e-01, 5.48544679e-02],
       [1.00000000e+00, 2.94800709e-01, 9.98903584e-01],
       [1.00000000e+00, 3.60925061e-02, 1.08875276e-01],
       [1.00000000e+00, 6.97162589e-01, 4.43250609e-01],
       [1.00000000e+00, 6.86989300e-01, 3.50010697e-01],
       [1.00000000e+00, 5.27101229e-01, 1.43414319e-01],
       [1.00000000e+00, 2.58052771e-01, 8.25554405e-01],
       [1.00000000e+00, 9.00378908e-01, 5.86642614e-01],
       [1.00000000e+00, 8.61185287e-01, 3.60378091e-03],
       [1.00000000e+00, 7.79636687e-01, 8.69829806e-01],
       [1.00000000e+00, 7.89010802e-02, 3.57033834e-01],
       [1.00000000e+00, 1.60266

In [24]:
beta = [1,.1,.5]
e = np.random.random(nobs)
y = np.dot(X, beta) + e
y

array([1.92347323, 1.10859668, 2.41771026, 2.34060485, 1.48994881,
       1.46432526, 1.27794662, 2.01032496, 1.93003738, 1.45774973,
       1.56133906, 1.90523653, 2.00292525, 2.28320043, 1.9009595 ,
       2.26119441, 1.5966072 , 1.71123627, 1.76664218, 1.4979376 ,
       2.14799323, 1.37555661, 1.4429603 , 1.51389146, 2.0153947 ,
       2.03131562, 1.9228942 , 2.08079778, 2.06708448, 1.75934763,
       1.91522357, 1.58978811, 2.28411642, 1.37148339, 1.87548948,
       1.69006277, 1.58024978, 2.28049347, 1.91705966, 1.76248933,
       1.65993847, 1.93079771, 1.57897365, 1.62308034, 1.60481578,
       1.85996476, 1.62891066, 1.7275808 , 1.70929889, 2.12660209,
       1.39327178, 1.3639382 , 2.33341346, 1.73440685, 1.83223695,
       1.66339614, 1.88511293, 1.41897404, 1.37502281, 1.70596646,
       2.34494712, 2.47450252, 1.83307212, 2.07501233, 1.64426663,
       1.90341017, 2.05513981, 1.87879831, 1.30166827, 1.7498617 ,
       1.77337876, 1.67854151, 1.61308477, 2.15726815, 1.83304

In [18]:
res_2 = sm.OLS(y,X).fit()

In [19]:
res_2.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.208
Model:,OLS,Adj. R-squared:,0.192
Method:,Least Squares,F-statistic:,12.74
Date:,"Thu, 02 Mar 2023",Prob (F-statistic):,1.22e-05
Time:,11:52:17,Log-Likelihood:,-16.513
No. Observations:,100,AIC:,39.03
Df Residuals:,97,BIC:,46.84
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.6065,0.071,22.556,0.000,1.465,1.748
x1,-0.0420,0.097,-0.432,0.667,-0.235,0.151
x2,0.4719,0.094,5.025,0.000,0.285,0.658

0,1,2,3
Omnibus:,11.612,Durbin-Watson:,2.249
Prob(Omnibus):,0.003,Jarque-Bera (JB):,4.316
Skew:,-0.191,Prob(JB):,0.116
Kurtosis:,2.057,Cond. No.,4.94


In [25]:
# how to functions to grab from res
dir(res)

['HC0_se',
 'HC1_se',
 'HC2_se',
 'HC3_se',
 '_HCCM',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abat_diagonal',
 '_cache',
 '_data_attr',
 '_data_in_cache',
 '_get_robustcov_results',
 '_is_nested',
 '_use_t',
 '_wexog_singular_values',
 'aic',
 'bic',
 'bse',
 'centered_tss',
 'compare_f_test',
 'compare_lm_test',
 'compare_lr_test',
 'condition_number',
 'conf_int',
 'conf_int_el',
 'cov_HC0',
 'cov_HC1',
 'cov_HC2',
 'cov_HC3',
 'cov_kwds',
 'cov_params',
 'cov_type',
 'df_model',
 'df_resid',
 'diagn',
 'eigenvals',
 'el_test',
 'ess',
 'f_pvalue',
 'f_test',
 'fittedvalues',
 'fvalue',
 'get_influence',
 'get_prediction',
 'get_robustcov_results',
 'info_c