
# Coding Demo: Running Regression
Here, we illustrate how to use Python to run regressions and visualize data.

In [6]:
# First import the necessary packages
from statsmodels.compat import lzip
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
import math
from statsmodels.formula.api import ols
from numpy.linalg import inv

The dataset of interest is the **House Prices in the City of Windsor, Canada** dataset. Read about this dataset [here](https://vincentarelbundock.github.io/Rdatasets/doc/AER/HousePrices.html) 

In [7]:
# Read the AER `HousePrices` CSV straight from the Rdatasets site into a DataFrame.
data_path = (
    'https://vincentarelbundock.github.io/Rdatasets/csv/AER/HousePrices.csv'
)
houseprices_dataset = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first rows of the loaded table.
houseprices_dataset.head()

Unnamed: 0.1,Unnamed: 0,price,lotsize,bedrooms,bathrooms,stories,driveway,recreation,fullbase,gasheat,aircon,garage,prefer
0,1,42000.0,5850,3,1,2,yes,no,yes,no,no,1,no
1,2,38500.0,4000,2,1,1,yes,no,no,no,no,0,no
2,3,49500.0,3060,3,1,1,yes,no,no,no,no,0,no
3,4,60500.0,6650,3,1,2,yes,yes,no,no,no,0,no
4,5,61000.0,6360,2,1,1,yes,no,no,no,no,0,no


In [9]:
# Baseline specification: log(price) regressed on log(lotsize) and the other
# house characteristics, main effects only.
price_model = ols(
    (
        "np.log(price) ~ (np.log(lotsize) + bedrooms + bathrooms + stories"
        " + garage + driveway + recreation + fullbase + gasheat +aircon + prefer)"
    ),
    data=houseprices_dataset,
).fit()

# Print the plain-text regression table for the fitted model.
print(price_model.summary())

                            OLS Regression Results                            
Dep. Variable:          np.log(price)   R-squared:                       0.687
Model:                            OLS   Adj. R-squared:                  0.680
Method:                 Least Squares   F-statistic:                     106.3
Date:                Mon, 20 Sep 2021   Prob (F-statistic):          9.24e-127
Time:                        22:09:23   Log-Likelihood:                 82.412
No. Observations:                 546   AIC:                            -140.8
Df Residuals:                     534   BIC:                            -89.19
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept             7.7451      0.21

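Suppose we test, for each component $\beta_j$ of $\beta$, the null hypothesis $H_0: \beta_j = 0$ against the alternative $\beta_j \neq 0$.
This can amount to many hypotheses when the dataset has many covariates, or when you include many transformations of your data (for example, the pairwise interactions below).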

In [12]:
# Richer specification: the trailing `**2` in the formula expands the
# parenthesised terms into every main effect plus all pairwise interactions.
# NOTE: this rebinds `price_model`, so later cells see this fit rather than
# the main-effects-only one.
price_model = ols(
    (
        "np.log(price) ~ (np.log(lotsize) + bedrooms + bathrooms + stories"
        " + garage + driveway + recreation + fullbase + gasheat +aircon + prefer)**2"
    ),
    data=houseprices_dataset,
).fit()

# Print the plain-text regression table for the fitted model.
print(price_model.summary())

                            OLS Regression Results                            
Dep. Variable:          np.log(price)   R-squared:                       0.731
Model:                            OLS   Adj. R-squared:                  0.693
Method:                 Least Squares   F-statistic:                     19.68
Date:                Mon, 20 Sep 2021   Prob (F-statistic):          7.55e-100
Time:                        22:12:01   Log-Likelihood:                 123.75
No. Observations:                 546   AIC:                            -113.5
Df Residuals:                     479   BIC:                             174.8
Df Model:                          66                                         
Covariance Type:            nonrobust                                         
                                        coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
Interc

In [14]:
# Collect the p-values of the currently bound `price_model`, one entry per
# fitted term and labelled by term name; these are the inputs to the
# multiple-testing corrections that follow.
pvalues = price_model.pvalues
# Show the (truncated) listing of p-values.
print(pvalues)

Intercept            1.247809e-09
driveway[T.yes]      1.657302e-01
recreation[T.yes]    2.476117e-03
fullbase[T.yes]      4.085500e-01
gasheat[T.yes]       8.697076e-02
                         ...     
bedrooms:stories     6.864999e-01
bedrooms:garage      5.706714e-01
bathrooms:stories    2.298405e-01
bathrooms:garage     9.524899e-01
stories:garage       9.567784e-01
Length: 67, dtype: float64


We can now apply multiple hypothesis testing corrections using the Python function `multipletests` from the `statsmodels.stats.multitest` module.

In [18]:
from statsmodels.stats.multitest import multipletests

# Holm step-down correction at level 0.05. The returned tuple begins with the
# per-hypothesis reject flags, followed by the adjusted p-values.
multipletests(
    pvals=pvalues,
    alpha=0.05,
    method="holm",
)

(array([ True, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False]),
 array([8.36031884e-08, 1.00000000e+00, 1.60947622e-01, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 2.37516658e-01, 1.00000000e+00, 3.07072546e-01,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.47971267e-01, 1.0000000

In [19]:
# Benjamini-Hochberg correction at level 0.05 on the same p-values. The
# returned tuple begins with the reject flags, followed by the adjusted p-values.
multipletests(
    pvals=pvalues,
    alpha=0.05,
    method="fdr_bh",
)

(array([ True, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False]),
 array([8.36031884e-08, 4.82779141e-01, 5.52999522e-02, 7.82081498e-01,
        3.75867206e-01, 9.70010553e-01, 8.65985885e-01, 4.04600481e-01,
        3.75867206e-01, 6.21625629e-02, 8.65985885e-01, 6.53138432e-02,
        7.33374573e-01, 8.65985885e-01, 2.57914340e-01, 2.57914340e-01,
        2.57914340e-01, 4.51982678e-01, 7.46854161e-01, 1.72405646e-01,
        5.49975500e-01, 7.92672880e-01, 1.72405646e-01, 5.22798779e-01,
        5.52999522e-02, 4.5198267