# Working through Statsmodels documentation

## Getting Started...

In [1]:
from __future__ import print_function
import statsmodels.api as sm
import pandas as pd
from patsy import dmatrices

In [2]:
# Fetch the "Guerry" dataset from the R "HistData" package through statsmodels
# and keep only its tabular payload (the `.data` attribute).
df = sm.datasets.get_rdataset(dataname="Guerry", package="HistData").data

In [5]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,dept,Region,Department,Crime_pers,Crime_prop,Literacy,Donations,Infants,Suicides,MainCity,...,Crime_parents,Infanticide,Donation_clergy,Lottery,Desertion,Instruction,Prostitutes,Distance,Area,Pop1831
0,1,E,Ain,28870,15890,37,5098,33120,35039,2:Med,...,71,60,69,41,55,46,13,218.372,5762,346.03
1,2,N,Aisne,26226,5521,51,8901,14572,12831,2:Med,...,4,82,36,38,82,24,327,65.945,7369,513.0
2,3,C,Allier,26747,7925,13,10973,17044,114121,2:Med,...,46,42,76,66,16,85,34,161.927,7340,298.26
3,4,E,Basses-Alpes,12935,7289,46,2733,23018,14238,1:Sm,...,70,12,37,80,32,29,2,351.399,6925,155.9
4,5,E,Hautes-Alpes,17488,8174,69,6962,23076,16171,1:Sm,...,22,23,64,79,35,7,1,320.28,5549,129.1


In [6]:
# Subset of columns to keep. The list is named `model_columns` rather than
# `vars` so that the Python builtin `vars()` is not shadowed for the rest of
# the kernel session.
model_columns = ['Department', 'Lottery', 'Literacy', 'Wealth', 'Region']
df = df[model_columns]
# Show the last five rows of the reduced frame.
df.tail(5)

Unnamed: 0,Department,Lottery,Literacy,Wealth,Region
81,Vienne,40,25,68,W
82,Haute-Vienne,55,13,67,C
83,Vosges,14,62,82,E
84,Yonne,51,47,30,C
85,Corse,83,49,37,


In [7]:
# Drop every row that contains a missing value; in the preview above,
# row 85 (Corse) has no Region entry.
df = df.dropna(axis=0, how="any")
df.tail(5)

Unnamed: 0,Department,Lottery,Literacy,Wealth,Region
80,Vendee,68,28,56,W
81,Vienne,40,25,68,W
82,Haute-Vienne,55,13,67,C
83,Vosges,14,62,82,E
84,Yonne,51,47,30,C


### Building an OLS model with the endog (dependent) and exog (independent) variables

In [8]:
# Build the design matrices from a patsy formula: `y` is the response
# (Lottery) and `X` the predictors, both requested as pandas DataFrames.
y, X = dmatrices(
    'Lottery ~ Literacy + Wealth + Region',
    data=df,
    return_type='dataframe',
)

In [13]:
# Print the first three rows of the design matrix as plain text, then let
# the notebook render the response frame `y` as the cell's result.
print(X.head(3))
y

   Intercept  Region[T.E]  Region[T.N]  Region[T.S]  Region[T.W]  Literacy  \
0        1.0          1.0          0.0          0.0          0.0      37.0   
1        1.0          0.0          1.0          0.0          0.0      51.0   
2        1.0          0.0          0.0          0.0          0.0      13.0   

   Wealth  
0    73.0  
1    22.0  
2    61.0  


Unnamed: 0,Lottery
0,41.0
1,38.0
2,66.0
3,80.0
4,79.0
5,70.0
6,31.0
7,75.0
8,28.0
9,50.0


In [18]:
# A statsmodels fit usually follows three steps:
#   1. describe the model by instantiating a model class
#   2. estimate it by calling the model's fit method
#   3. inspect the estimates through the results' summary method

# For OLS that looks like this:
mod = sm.OLS(endog=y, exog=X)   # step 1: describe the model
res = mod.fit()                 # step 2: fit the model
print(res.summary())            # step 3: summarize the results

                            OLS Regression Results                            
Dep. Variable:                Lottery   R-squared:                       0.338
Model:                            OLS   Adj. R-squared:                  0.287
Method:                 Least Squares   F-statistic:                     6.636
Date:                Thu, 15 Aug 2019   Prob (F-statistic):           1.07e-05
Time:                        09:24:22   Log-Likelihood:                -375.30
No. Observations:                  85   AIC:                             764.6
Df Residuals:                      78   BIC:                             781.7
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept      38.6517      9.456      4.087      