## Coding Exercise #0407

### 1. Linear regression diagnostics and modeling using StatsModels library:

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import warnings
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
from sklearn import metrics

warnings.filterwarnings(action='ignore')                  # Turn off the warnings.
%matplotlib inline

#### 1.1. Load the 'Boston' dataset from Scikit-Learn and convert it into a DataFrame:

In [2]:
data = fetch_california_housing()

In [3]:
# The explanatory variables.
X = data['data']
header = data['feature_names']

In [4]:
# The response variable.
Y = data['target']
Y = Y.reshape(-1, 1)

In [5]:
# Transform into a DataFrame.
df = pd.DataFrame(np.append(X,Y,axis = 1))
df.columns = list(header) + ['PRICE']

In [6]:
df.head(5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,PRICE
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


#### 1.2. Linear modeling using R-styled formula:

In [9]:
# Using all the explanatory variables.
myModel1 = smf.ols(formula = "PRICE ~ MedInc + HouseAge + AveRooms + AveBedrms + Population + AveOccup + Latitude + Longitude ", data=df)
result1 = myModel1.fit()
result1.summary()

0,1,2,3
Dep. Variable:,PRICE,R-squared:,0.606
Model:,OLS,Adj. R-squared:,0.606
Method:,Least Squares,F-statistic:,3970.0
Date:,"Fri, 14 Jun 2024",Prob (F-statistic):,0.0
Time:,13:28:42,Log-Likelihood:,-22624.0
No. Observations:,20640,AIC:,45270.0
Df Residuals:,20631,BIC:,45340.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-36.9419,0.659,-56.067,0.000,-38.233,-35.650
MedInc,0.4367,0.004,104.054,0.000,0.428,0.445
HouseAge,0.0094,0.000,21.143,0.000,0.009,0.010
AveRooms,-0.1073,0.006,-18.235,0.000,-0.119,-0.096
AveBedrms,0.6451,0.028,22.928,0.000,0.590,0.700
Population,-3.976e-06,4.75e-06,-0.837,0.402,-1.33e-05,5.33e-06
AveOccup,-0.0038,0.000,-7.769,0.000,-0.005,-0.003
Latitude,-0.4213,0.007,-58.541,0.000,-0.435,-0.407
Longitude,-0.4345,0.008,-57.682,0.000,-0.449,-0.420

0,1,2,3
Omnibus:,4393.65,Durbin-Watson:,0.885
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14087.596
Skew:,1.082,Prob(JB):,0.0
Kurtosis:,6.42,Cond. No.,238000.0


**NOTE: INDUS and AGE do not seem to be significant. Adjusted $R^2 = 0.734$ and $AIC = 3026$.**

In [10]:
# Dropping the explanatory variables 'INDUS' and 'AGE'.
myModel2 = smf.ols(formula = "PRICE ~ MedInc + HouseAge + AveRooms + AveBedrms + Population + AveOccup + Latitude + Longitude ", data=df)
result2 = myModel2.fit()
result2.summary()

0,1,2,3
Dep. Variable:,PRICE,R-squared:,0.606
Model:,OLS,Adj. R-squared:,0.606
Method:,Least Squares,F-statistic:,3970.0
Date:,"Fri, 14 Jun 2024",Prob (F-statistic):,0.0
Time:,13:28:58,Log-Likelihood:,-22624.0
No. Observations:,20640,AIC:,45270.0
Df Residuals:,20631,BIC:,45340.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-36.9419,0.659,-56.067,0.000,-38.233,-35.650
MedInc,0.4367,0.004,104.054,0.000,0.428,0.445
HouseAge,0.0094,0.000,21.143,0.000,0.009,0.010
AveRooms,-0.1073,0.006,-18.235,0.000,-0.119,-0.096
AveBedrms,0.6451,0.028,22.928,0.000,0.590,0.700
Population,-3.976e-06,4.75e-06,-0.837,0.402,-1.33e-05,5.33e-06
AveOccup,-0.0038,0.000,-7.769,0.000,-0.005,-0.003
Latitude,-0.4213,0.007,-58.541,0.000,-0.435,-0.407
Longitude,-0.4345,0.008,-57.682,0.000,-0.449,-0.420

0,1,2,3
Omnibus:,4393.65,Durbin-Watson:,0.885
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14087.596
Skew:,1.082,Prob(JB):,0.0
Kurtosis:,6.42,Cond. No.,238000.0


**NOTE: All the explanatory variables are significant. Adjusted $R^2 = 0.735$ and $AIC = 3022$.**

**CONCLUSION: The second model is slightly optimized compared to the first one.**