## Fitting a Log-Linear Model Using the Statsmodels Formula API

### You have seen how to use the statsmodels API to fit a linear regression model. In this activity, you are asked to fit a log-linear model. Your model should represent the relationship between the log-transformed dependent variable (log of crime rate per capita) and the median value of owner-occupied homes.

In [1]:
#2
%matplotlib inline
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg 
import pandas as pd
import numpy as np
import patsy
from statsmodels.graphics.correlation import plot_corr
from sklearn.model_selection import train_test_split
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and
# 3.8 removed the old aliases, so plt.style.use('seaborn') fails on current
# releases. Pick whichever name this installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [2]:
#3 read the Boston housing CSV (path is relative to the notebook) into a dataframe
bostonCsvPath = 'Boston.csv'
rawBostonData = pd.read_csv(bostonCsvPath)

In [3]:
#4 preview the first 5 rows of the raw dataframe
rawBostonData.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2


In [4]:
#5 discard every row that contains at least one missing value
rawBostonData = rawBostonData.dropna(axis=0, how='any')

In [5]:
#6 remove duplicate records, keeping the first occurrence of each row
rawBostonData = rawBostonData.drop_duplicates(keep='first')

In [6]:
#7 list the column names of the dataframe
# (the labels are shown quoted, exactly as stored, so stray spaces in a name are visible)
rawBostonData.columns

Index(['CRIM', ' ZN ', 'INDUS ', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'LSTAT', 'MEDV'],
      dtype='object')

In [7]:
#8 rename column names to something meaningful
# Keys must match the CSV headers exactly, including the stray spaces
# in ' ZN ' and 'INDUS '.
descriptiveColumnNames = {
    'CRIM': 'crimeRatePerCapita',
    ' ZN ': 'landOver25k_sqft',
    'INDUS ': 'non-retailLandProptn',
    'CHAS': 'riverDummy',
    'NOX': 'nitrixOxide_pp10m',
    'RM': 'AvgNo.RoomsPerDwelling',
    'AGE': 'ProptnOwnerOccupied',
    'DIS': 'weightedDist',
    'RAD': 'radialHighwayAccess',
    'TAX': 'propTaxRate_per10k',
    'PTRATIO': 'pupilTeacherRatio',
    'LSTAT': 'pctLowerStatus',
    'MEDV': 'medianValue_Ks',
}
renamedBostonData = rawBostonData.rename(columns=descriptiveColumnNames)

In [8]:
# display the renamed dataframe (the new column names appear in the header)
renamedBostonData

Unnamed: 0,crimeRatePerCapita,landOver25k_sqft,non-retailLandProptn,riverDummy,nitrixOxide_pp10m,AvgNo.RoomsPerDwelling,ProptnOwnerOccupied,weightedDist,radialHighwayAccess,propTaxRate_per10k,pupilTeacherRatio,pctLowerStatus,medianValue_Ks
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,6.48,22.0


In [9]:
#9 inspect the data types of the columns
renamedBostonData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 506 entries, 0 to 505
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   crimeRatePerCapita      506 non-null    float64
 1   landOver25k_sqft        506 non-null    float64
 2   non-retailLandProptn    506 non-null    float64
 3   riverDummy              506 non-null    int64  
 4   nitrixOxide_pp10m       506 non-null    float64
 5   AvgNo.RoomsPerDwelling  506 non-null    float64
 6   ProptnOwnerOccupied     506 non-null    float64
 7   weightedDist            506 non-null    float64
 8   radialHighwayAccess     506 non-null    int64  
 9   propTaxRate_per10k      506 non-null    int64  
 10  pupilTeacherRatio       506 non-null    float64
 11  pctLowerStatus          506 non-null    float64
 12  medianValue_Ks          506 non-null    float64
dtypes: float64(10), int64(3)
memory usage: 55.3 KB


In [10]:
#10 calculate basic stats for numeric columns in dataframe
# the summary is transposed so that each column becomes a row, which gives
# a better layout
numericSummary = renamedBostonData.describe(include=[np.number])
numericSummary.transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
crimeRatePerCapita,506.0,3.613524,8.601545,0.00632,0.082045,0.25651,3.677082,88.9762
landOver25k_sqft,506.0,11.363636,23.322453,0.0,0.0,0.0,12.5,100.0
non-retailLandProptn,506.0,11.136779,6.860353,0.46,5.19,9.69,18.1,27.74
riverDummy,506.0,0.06917,0.253994,0.0,0.0,0.0,0.0,1.0
nitrixOxide_pp10m,506.0,0.554695,0.115878,0.385,0.449,0.538,0.624,0.871
AvgNo.RoomsPerDwelling,506.0,6.284634,0.702617,3.561,5.8855,6.2085,6.6235,8.78
ProptnOwnerOccupied,506.0,68.574901,28.148861,2.9,45.025,77.5,94.075,100.0
weightedDist,506.0,3.795043,2.10571,1.1296,2.100175,3.20745,5.188425,12.1265
radialHighwayAccess,506.0,9.549407,8.707259,1.0,4.0,5.0,24.0,24.0
propTaxRate_per10k,506.0,408.237154,168.537116,187.0,279.0,330.0,666.0,711.0


In [11]:
#11 divide the dataset into training and test sets
seed = 10               # fixed random_state so the split is reproducible
test_data_size = 0.3    # fraction of rows held out as the test set
targetColumn = 'crimeRatePerCapita'

# predictors: every column except the target; the target stays a one-column dataframe
X = renamedBostonData.drop(columns=targetColumn)
y = renamedBostonData[[targetColumn]]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_data_size, random_state=seed
)

# put the target back next to its predictors so each split is a single dataframe
train_data = pd.concat([X_train, y_train], axis='columns')
test_data = pd.concat([X_test, y_test], axis='columns')

## 1

In [13]:
# For reference only (left commented out): define a plain linear regression model, without the log transform, and assign it to a variable named linearModel:
# linearModel = smf.ols(formula='crimeRatePerCapita ~ medianValue_Ks', data=train_data)

In [24]:
# define the log-linear model (log of the per-capita crime rate regressed on
# the median home value) and assign it to a variable named linearModel
logLinearFormula = 'np.log(crimeRatePerCapita) ~ medianValue_Ks'
linearModel = smf.ols(logLinearFormula, data=train_data)

In [25]:
# fit the model; the arguments spell out the statsmodels defaults
# (pseudo-inverse solver, non-robust covariance)
linearModelResults = linearModel.fit(method='pinv', cov_type='nonrobust')

In [27]:
# build the regression summary, then print its plain-text rendering
regressionSummary = linearModelResults.summary()
print(regressionSummary)

                                OLS Regression Results                                
Dep. Variable:     np.log(crimeRatePerCapita)   R-squared:                       0.238
Model:                                    OLS   Adj. R-squared:                  0.236
Method:                         Least Squares   F-statistic:                     109.9
Date:                        Tue, 21 Sep 2021   Prob (F-statistic):           1.48e-22
Time:                                08:06:43   Log-Likelihood:                -727.67
No. Observations:                         354   AIC:                             1459.
Df Residuals:                             352   BIC:                             1467.
Df Model:                                   1                                         
Covariance Type:                    nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------