In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV


In [None]:
# Load the Ames housing data.
ames_raw = pd.read_csv("/content/AmesHousing.csv")

# Flag the columns that have fewer than 100 missing values.
good_cols = ames_raw.isna().sum().lt(100)

# Keep only the flagged columns, then drop every row that still contains a NaN.
ames = ames_raw.loc[:, good_cols].dropna()

In [None]:
# Response: the sale price.
y = ames["SalePrice"]
# Predictors: every remaining column except the response, "Order" and "PID".
X = ames.drop(columns = ["SalePrice", "Order", "PID"])


# Shared preprocessing: one-hot encode object-dtype columns (categories unseen at
# fit time are ignored), standardize numeric columns, and pass anything else
# through unchanged.
categorical_columns = make_column_selector(dtype_include=object)
numeric_columns = make_column_selector(dtype_include=np.number)

ct = ColumnTransformer(
  transformers = [
    ("dummify", OneHotEncoder(sparse_output = False, handle_unknown='ignore'), categorical_columns),
    ("standardize", StandardScaler(), numeric_columns),
  ],
  remainder = "passthrough"
)

# Ridge regression with its default penalty, fed by the shared preprocessing step.
rr_pipeline_1 = Pipeline(steps = [
  ("preprocessing", ct),
  ("ridge_regression", Ridge()),
])

In [None]:
# Ordinary least squares baseline. The pipeline is defined here because no
# other cell creates `lr_pipeline_1`; without this the cell fails with a
# NameError on a fresh kernel.
lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
)

# 5-fold cross-validated R^2 of the OLS baseline
cross_val_score(lr_pipeline_1, X, y, cv = 5, scoring = 'r2')


array([-2.59303720e+21, -1.13145211e+19, -7.57138616e+20, -4.47669752e+18,
       -2.55949915e+20])

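The cross-validated $R^2$ scores are hugely negative: the ordinary linear regression is massively overfit and generalizes very poorly to the held-out folds.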

1. Make a pipeline that uses all the variables in the Ames dataset, and then fits Ridge Regression with $\lambda = 1$.

2. Cross-validate this pipeline and compare the results to the ordinary linear regression.

3. Then fit the model on the whole dataset and get the coefficients. Make a plot of these coefficients compared to the ones from ordinary linear regression.

In [None]:
# Part 1: the ridge pipeline was built above.
# Part 2: 5-fold cross-validated R^2 for that pipeline.
cross_val_score(estimator = rr_pipeline_1, X = X, y = y, scoring = 'r2', cv = 5)


array([0.89815807, 0.91744024, 0.79493606, 0.78522563, 0.91389818])

The results are much better: the $R^2$ scores are high on every fold (roughly 0.79 to 0.92).

In [None]:
# Refit the ridge pipeline on the complete dataset, then pull the coefficient
# vector out of its regression step.
ridge_model = rr_pipeline_1.fit(X, y).named_steps["ridge_regression"]
rr_coef = pd.DataFrame(ridge_model.coef_)
rr_coef.head()

Unnamed: 0,0
0,-5585.147073
1,1279.59973
2,-5465.717759
3,7876.141644
4,3046.095382


Using the same pipeline as previously, perform tuning on $\lambda$.

You should always try $\lambda$ values on a log scale; that is, don’t use [1,2,3,4]; instead use something like [0.001, 0.01, 0.1, 1, 10].

In [None]:
# NOTE(review): leftover, disabled code. It refers to `lr_pipeline_poly` and a
# "polynomial" preprocessing step, neither of which is defined in the visible
# notebook; consider deleting this cell.
#degrees = {'preprocessing__polynomial__degree': np.arange(1, 10)}

#gscv = GridSearchCV(lr_pipeline_poly, degrees, cv = 5, scoring='r2')

In [None]:
# Candidate ridge penalties, spaced on a log scale.
param_grid = {"ridge_regression__alpha": [0.001, 0.01, 0.1, 1, 10]}

# 5-fold grid search over the penalty, scored by R^2.
grid_search = GridSearchCV(
    estimator = rr_pipeline_1,
    param_grid = param_grid,
    scoring = 'r2',
    cv = 5,
)
grid_search.fit(X, y)
print("best_params", grid_search.best_params_)


best_params {'ridge_regression__alpha': 10}


Create a LASSO pipeline, and tune $\lambda$.

Fit your best model on the full Ames data, and compare the coefficients to Ridge and OLS

In [None]:
# LASSO regression with its default penalty, fed by the shared preprocessing step.
lasso_pipeline_1 = Pipeline(steps = [
  ("preprocessing", ct),
  ("lasso_regression", Lasso()),
])

In [None]:
# Candidate LASSO penalties, spaced on a log scale.
param_grid1 = {"lasso_regression__alpha": [0.001, 0.01, 0.1, 1, 10]}

# 5-fold grid search over the penalty, scored by R^2.
grid_search1 = GridSearchCV(
    estimator = lasso_pipeline_1,
    param_grid = param_grid1,
    scoring = 'r2',
    cv = 5,
)
grid_search1.fit(X, y)
print("best_params", grid_search1.best_params_)



best_params {'lasso_regression__alpha': 10}


In [None]:
# Take the coefficients from the *tuned* LASSO model, as the task asks.
# Refitting `lasso_pipeline_1` directly would use Lasso's default penalty rather
# than the best alpha found by the grid search. GridSearchCV (refit=True by
# default) has already refit the best pipeline on all of X, y.
best_lasso = grid_search1.best_estimator_
l_coef = pd.DataFrame(best_lasso["lasso_regression"].coef_)
l_coef.head()


Unnamed: 0,0
0,-4816.528388
1,1227.244385
2,-4418.855409
3,7519.825758
4,3532.939294


In [None]:
# Import only the plotnine names used here instead of `import *`.
from plotnine import ggplot, aes, geom_point, geom_abline, labs

# Both coefficient frames come from the same preprocessing step fit on the same
# X, so row i refers to the same encoded feature in each. Their single column is
# named 0 (pandas default), hence the `[0]` lookups.
coef_compare = pd.DataFrame({
    "ridge": rr_coef[0],
    "lasso": l_coef[0],
})

# Scatter of LASSO vs. ridge coefficients; points on the dashed y = x line are
# features given the same weight by both models.
(
    ggplot(coef_compare, aes(x = "ridge", y = "lasso"))
    + geom_point(alpha = 0.5)
    + geom_abline(intercept = 0, slope = 1, linetype = "dashed")
    + labs(
        x = "Ridge coefficient",
        y = "LASSO coefficient",
        title = "LASSO vs. Ridge coefficients",
    )
)

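Create an Elastic Net pipeline, and tune $\lambda$ (the penalty strength, `alpha` in scikit-learn) and $\alpha$ (the L1/L2 mixing proportion, `l1_ratio` in scikit-learn).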

Fit your best model on the full Ames data, and compare the coefficients to Ridge and OLS.

In [None]:
# Elastic Net regression with default settings, fed by the shared preprocessing step.
elasticnet_pipeline_1 = Pipeline(steps = [
  ("preprocessing", ct),
  ("elasticnet_regression", ElasticNet()),
])

In [None]:
# Tune both Elastic Net hyperparameters: the penalty strength (log-spaced) and
# the L1/L2 mixing ratio.
param_grid3 = {
    "elasticnet_regression__alpha": [0.001, 0.01, 0.1, 1, 10],
    "elasticnet_regression__l1_ratio": [.2, .5, .8]
}
grid_search3 = GridSearchCV(elasticnet_pipeline_1, param_grid3, cv=5, scoring='r2')
grid_search3.fit(X, y)
print("best_params", grid_search3.best_params_)

# With the default refit=True, GridSearchCV has already refit the winning
# pipeline on all of X, y, so no extra `.fit(X, y)` call is needed here.
best_elasticnet = grid_search3.best_estimator_
best_elasticnet_coefficients = best_elasticnet.named_steps["elasticnet_regression"].coef_





best_params {'elasticnet_regression__alpha': 0.01, 'elasticnet_regression__l1_ratio': 0.5}




In [None]:
# Coefficients of the tuned Elastic Net model, as a one-column frame.
elasticnet_model = best_elasticnet.named_steps["elasticnet_regression"]
en_coef = pd.DataFrame(elasticnet_model.coef_)
en_coef.head()

Unnamed: 0,0
0,-141.971922
1,349.433582
2,-20.358041
3,-55.085186
4,1192.787937


In [None]:
# 5-fold cross-validated R^2 for the LASSO pipeline
cross_val_score(estimator = lasso_pipeline_1, X = X, y = y, scoring = 'r2', cv = 5)



array([0.89774385, 0.91093785, 0.79691806, 0.77426245, 0.90589888])