<a href="https://colab.research.google.com/github/oreomcflurryyy/data-analysis/blob/main/boston_housing_model_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Installations

In [130]:
!pip install ISLP
!pip install l0bnb
!pip install pygam



#Importing packages

In [131]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import matplotlib.pyplot as plt
import seaborn as sns

In [132]:
from sklearn.tree import (DecisionTreeClassifier as DTC,
                          DecisionTreeRegressor as DTR,
                          plot_tree,
                          export_text)
from sklearn.metrics import (accuracy_score,
                             log_loss)
from sklearn.ensemble import \
     (RandomForestRegressor as RF,
      GradientBoostingRegressor as GBR,
      BaggingRegressor as BR)

In [133]:
import sklearn.model_selection as skm
from sklearn.model_selection import train_test_split
from functools import partial
from sklearn.model_selection import \
     (cross_validate,
      KFold,
      ShuffleSplit)
from sklearn.base import clone
import sklearn.model_selection as skm
from sklearn.neighbors import KNeighborsRegressor as KNN
import sklearn.linear_model as skl
from sklearn.preprocessing import StandardScaler

In [134]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from ISLP.models import \
     (Stepwise,
      sklearn_selected,
      sklearn_selection_path)

from l0bnb import fit_path

In [135]:
from statsmodels.api import OLS
import statsmodels.api as sm
from statsmodels.stats.outliers_influence \
     import variance_inflation_factor as VIF
from statsmodels.stats.anova import anova_lm

In [136]:
from ISLP import load_data, confusion_table
from ISLP.models import (ModelSpec as MS,
                         summarize,
                         poly)
from ISLP.models import sklearn_sm
from ISLP.bart import BART
from pygam import (s as s_gam,
                   l as l_gam,
                   f as f_gam,
                   LinearGAM,
                   LogisticGAM)

from ISLP.transforms import (BSpline,
                             NaturalSpline)
from ISLP.models import bs, ns
from ISLP.pygam import (approx_lam,
                        degrees_of_freedom,
                        plot as plot_gam,
                        anova as anova_gam)

#Data handling

In [137]:
Boston = load_data("Boston")
Boston

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,6.48,22.0


In [138]:
scaler = StandardScaler()

model = MS(Boston.columns.drop('medv'), intercept=False)
D = model.fit_transform(Boston)
feature_names = list(D.columns)
X = np.asarray(D)
X = scaler.fit_transform(X)

In [139]:
(X_train,
 X_test,
 y_train,
 y_test) = skm.train_test_split(X,
                                Boston['medv'],
                                test_size=0.3,
                                random_state=0)

#Linear Regression

In [140]:
model_LR = skl.LinearRegression()

In [141]:
kfold = skm.KFold(5,
                  shuffle=True,
                  random_state=10)
grid = skm.GridSearchCV(model_LR,
                        {'fit_intercept': [True, False]},
                        refit=True,
                        cv=kfold,
                        scoring='neg_mean_squared_error')
G = grid.fit(X_train, y_train)

In [142]:
best_ = grid.best_estimator_
np.mean((y_test - best_.predict(X_test))**2)

28.146790268541203

In [143]:
grid.best_params_

{'fit_intercept': True}

#Regression Tress

In [144]:
reg = DTR(max_depth=30)
reg.fit(X_train, y_train)

In [145]:
ccp_path = reg.cost_complexity_pruning_path(X_train, y_train)
grid = skm.GridSearchCV(reg,
                        {'ccp_alpha': ccp_path.ccp_alphas},
                        refit=True,
                        cv=kfold,
                        scoring='neg_mean_squared_error')
G = grid.fit(X_train, y_train)

In [146]:
best_ = grid.best_estimator_
np.mean((y_test - best_.predict(X_test))**2)

26.515432531588715

In [147]:
best_.get_depth()

11

#Bagging and Random Forests

In [148]:
bag = BR(n_estimators=500, random_state=0)
bag.fit(X_train, y_train)

rf = RF(n_estimators=500, max_features=6, random_state=0)
rf.fit(X_train, y_train)

In [149]:
print("Bagging MSE:", np.mean((y_test - bag.predict(X_test))**2))
print("Random Forest MSE:", np.mean((y_test - rf.predict(X_test))**2))

Bagging MSE: 14.978727673421053
Random Forest MSE: 18.761169122894742


#Boosting

In [150]:
boost_boston = GBR(n_estimators=5000,
                   learning_rate=0.001,
                   max_depth=3,
                   random_state=0)
boost_boston.fit(X_train, y_train)

In [151]:
y_hat_boost = boost_boston.predict(X_test);
np.mean((y_test - y_hat_boost)**2)

14.408450266242562

In [152]:
boost_boston = GBR(n_estimators=5000,
                   learning_rate=0.2,
                   max_depth=3,
                   random_state=0)
boost_boston.fit(X_train,
                 y_train)
y_hat_boost = boost_boston.predict(X_test);
np.mean((y_test - y_hat_boost)**2)

14.403684406494323

In [153]:
grid.best_params_

{'ccp_alpha': 0.04379388619853823}

#Ridge Regression

In [154]:
ridge = skl.Ridge()
ridge.fit(X_train, y_train)

In [155]:
alphas = np.logspace(-6, 6, 13)
param_grid = {'alpha': alphas}

grid = skm.GridSearchCV(ridge,
                        param_grid,
                        cv=kfold,
                        scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)

best_ridge_model = grid.best_estimator_
np.mean((y_test - best_ridge_model.predict(X_test))**2)

28.19815664765892

In [156]:
grid.best_params_

{'alpha': 1.0}

#Lasso Regression

In [157]:
lasso = skl.Lasso()
lasso.fit(X_train, y_train)

In [158]:
alphas = np.logspace(-6, 6, 13)
param_grid = {'alpha': alphas}

grid = skm.GridSearchCV(lasso,
                        param_grid,
                        cv=kfold,
                        scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)

best_lasso_model = grid.best_estimator_
np.mean((y_test - best_lasso_model.predict(X_test))**2)

28.21406019878846

In [159]:
grid.best_params_

{'alpha': 0.01}

#K-Nearest Neighbours

In [160]:
knn = KNN()
knn.fit(X_train, y_train)

In [161]:
n = np.arange(1, 21)
param_grid = {'n_neighbors': n}

grid = skm.GridSearchCV(knn,
                        param_grid,
                        cv=kfold,
                        scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)

best_knn = grid.best_estimator_
np.mean((y_test - best_knn.predict(X_test))**2)

28.493735380116952

In [162]:
grid.best_params_

{'n_neighbors': 3}

Supervised Learning Models Performance on the Boston Housing Dataset**

The performance of various supervised learning models applied to the Boston Housing Dataset is evaluated based on their Mean Squared Error (MSE) values. The MSEs are as follows:

---

### **Table: Model Performance Ranked by Mean Squared Error (MSE)**

| **Rank** | **Model**                  | **Mean Squared Error (MSE)** |
|:--------:|:---------------------------|-----------------------------:|
| 1        | **Boosting**               | **14.40**                    |
| 2        | **Bagging**                | **14.98**                    |
| 3        | **Random Forest**          | 18.76                        |
| 4        | **Regression Trees**       | 26.52                        |
| 5        | **Linear Regression**      | 28.15                        |
| 6        | **Ridge Regression**       | 28.19                        |
| 7        | **Lasso Regression**       | 28.21                        |
| 8        | **K-Nearest Neighbors**    | 28.49                        |

---

### **Analysis and Insights**

#### **1. Boosting (MSE: 14.40)**

- **Best Performance:** Boosting techniques achieved the lowest MSE, indicating the highest accuracy in predicting housing prices.
- **Why It Performed Well:**
  - **Sequential Learning:** Boosting builds models in sequence, where each new model focuses on correcting errors from previous ones.
  - **Complex Pattern Capturing:** Effectively models complex non-linear relationships and interactions between variables.

#### **2. Bagging (MSE: 14.98)**

- **Strong Performance:** Bagging closely follows Boosting with a slightly higher MSE.
- **Why It Performed Well:**
  - **Variance Reduction:** Bagging reduces variance by training multiple models on different subsets of the data and aggregating their predictions.
  - **Robustness to Overfitting:** Helps prevent overfitting compared to individual trees.

#### **3. Random Forest (MSE: 18.76)**

- **Good Performance:** Random Forests, an ensemble of decision trees with added randomness, performed well with an MSE under 20.
- **Why It Performed Well:**
  - **Feature Randomness:** By selecting random subsets of features, it decorrelates trees and enhances generalization.
  - **Ensemble Strength:** Combines multiple trees to improve predictive accuracy.

#### **4. Regression Trees (Decision Trees) (MSE: 26.52)**

- **Moderate Performance:** Standalone regression trees have a higher MSE compared to ensemble methods.
- **Why It Performed Moderately:**
  - **Overfitting Risk:** Individual trees can overfit the training data if not properly pruned.
  - **Lack of Ensemble Benefits:** Without aggregation, they may not generalize well to new data.

#### **5. Linear Regression (MSE: 28.15)**

- **Baseline Model:** Serves as a reference point for evaluating other models.
- **Why It Performed Less Well:**
  - **Linearity Assumption:** May not capture non-linear relationships in the data.
  - **Sensitivity to Outliers:** Linear regression can be influenced by extreme values or skewed data.

#### **6. Ridge Regression (MSE: 28.19)**

- **Similar to Linear Regression:** Shows very close MSE to standard linear regression.
- **Why It Performed Less Well:**
  - **L2 Regularization:** Adds penalty to large coefficients to reduce overfitting but may not significantly improve MSE if linearity assumption isn't sufficient.

#### **7. Lasso Regression (MSE: 28.21)**

- **Comparable Performance:** MSE is almost the same as Linear and Ridge Regression.
- **Why It Performed Less Well:**
  - **L1 Regularization:** Can zero out coefficients of less important features; however, does not substantially improve predictive accuracy in this case.
  - **Linearity Limitation:** Still based on a linear model that may not capture complex patterns.

#### **8. K-Nearest Neighbors (MSE: 28.49)**

- **Improved Performance:** KNN now shows performance similar to linear models, better than previously reported higher MSEs.
- **Why It Performed Moderately:**
  - **Local Patterns:** KNN predicts based on the average of nearest neighbors, which can be effective if nearby data points have similar target values.
  - **Sensitivity to Parameters:** Performance depends on the number of neighbors (K) and distance metrics used.
  - **Feature Scaling:** Proper scaling can enhance KNN performance.

---



### **Key Observations**

1. **Ensemble Methods Lead:** Boosting and Bagging outperform other models, indicating the advantage of ensemble techniques in capturing complex relationships.

2. **Random Forests Perform Well:** Random Forests provide a good balance between performance and computational efficiency, benefiting from ensemble averaging.

3. **Similar Performance Among Linear Models:** Linear Regression, Ridge, and Lasso Regression exhibit very similar MSEs, suggesting that regularization did not significantly enhance model performance for this dataset.

4. **KNN Improvement:** K-Nearest Neighbors now performs comparably to linear models, likely due to parameter tuning or better preprocessing steps.

5. **Regression Trees Alone Are Limited:** Without ensemble methods, single decision trees perform worse than their aggregated counterparts.

---

### **Recommendations**

#### **1. Favor Ensemble Methods**

- **Boosting:** Given the lowest MSE, Boosting methods (e.g., Gradient Boosting, XGBoost, LightGBM) are recommended for their superior predictive performance.
- **Bagging and Random Forests:** Offer robust alternatives with strong performance, especially when computational resources are limited.

#### **2. Optimize Model Parameters**

- **Hyperparameter Tuning:**
  - **Boosting:** Adjust learning rate, number of estimators, maximum depth, and regularization parameters.
  - **KNN:** Experiment with different values of K and distance metrics.
- **Cross-Validation:** Use techniques like k-fold cross-validation to ensure model generalization and avoid overfitting.

#### **3. Enhance Data Preprocessing**

- **Feature Scaling:** Important for models like KNN and algorithms sensitive to the scale of data.
- **Feature Engineering:**
  - **Transformations:** Apply logarithmic or power transformations to address skewness.
  - **Interaction Terms:** Introduce to allow linear models to capture non-linear relationships.

#### **4. Consider Model Interpretability**

- **Feature Importance:** Use models that provide insights into feature contributions (e.g., Random Forests, Boosting models with feature importance scores).
- **Simpler Models for Explanation:** Linear models, despite higher MSEs, can be valuable when interpretability is critical.

---


### **Conclusion**

- **Boosting Methods Excel:** Boosting achieved the lowest MSE, indicating its effectiveness in modeling complex relationships within the data.
- **Ensemble Methods Are Beneficial:** Both Boosting and Bagging significantly outperform individual models due to their ability to reduce bias and variance.
- **Linear Models Have Limitations:** The similar MSEs among Linear, Ridge, and Lasso Regression suggest that regularization alone doesn't enhance performance when linearity assumptions don't hold.
- **KNN Performance Depends on Preprocessing:** With appropriate scaling and parameter tuning, KNN can perform comparably to linear models.

