In [3]:
pip install scikit-learn



In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score

In [4]:
# Read the data
ames = pd.read_csv("/content/AmesHousing.csv")

# Get rid of columns with mostly NaN values
good_cols = ames.isna().sum() < 100
ames = ames.loc[:,good_cols]

# Drop other NAs
ames = ames.dropna()

In [9]:
X = ames.drop(["SalePrice", "Order", "PID"], axis = 1)
y = ames["SalePrice"]


ct = ColumnTransformer(
  [
    ("dummify",
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize",
    StandardScaler(),
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
)

In [6]:
cross_val_score(lr_pipeline_1, X, y, cv = 5, scoring = 'r2')

array([0.89727873, 0.91038253, 0.78900365, 0.77208628, 0.9006982 ])

In [24]:
from sklearn.linear_model import Ridge

ridge_pipeline = Pipeline([
    ("preprocessing", ct),
    ("ridge_regression", Ridge(alpha=1.0))  # changed here
])

cross_val_score(ridge_pipeline, X, y, cv = 5, scoring = 'r2')

array([0.89815807, 0.91744024, 0.79493606, 0.78522563, 0.91389818])

In [18]:

lr_pipeline_1.fit(X, y)

ridge_pipeline.fit(X, y)

# Extract coefficients
lr_coefficients = lr_pipeline_1.named_steps['linear_regression'].coef_
ridge_coefficients = ridge_pipeline.named_steps['ridge_regression'].coef_


# Create a DataFrame to compare coefficients
coefficients_df = pd.DataFrame({
                                'Linear Regression': lr_coefficients,
                                'Ridge Regression': ridge_coefficients})
coefficients_df.head()


Unnamed: 0,Linear Regression,Ridge Regression
0,-4648.554959,-5585.147073
1,1546.687999,1279.59973
2,-7516.051924,-5465.717759
3,7867.442231,7876.141644
4,3774.44054,3046.095382


In [20]:
from sklearn.linear_model import Ridge

alpha_values = [0.001, 0.01, 0.1, 1, 10]
cv_scores = {}

for alpha in alpha_values:
    ridge_pipeline = Pipeline([
        ("preprocessing", ct),
        ("ridge_regression", Ridge(alpha=alpha))
    ])
    scores = cross_val_score(ridge_pipeline, X, y, cv=5, scoring='r2')
    cv_scores[alpha] = scores.mean() # Keep the mean stored in the dictionary
    print(f"Cross-validation R2 scores for alpha={alpha}: {scores}")

# You can now compare these scores with the linear regression scores:
# print(f"Linear Regression cross-validation R2 scores: {cross_val_score(lr_pipeline_1, X, y, cv = 5, scoring = 'r2')}")

Cross-validation R2 scores for alpha=0.001: [0.8972854  0.91040618 0.78901601 0.7721318  0.90076168]
Cross-validation R2 scores for alpha=0.01: [0.89734306 0.91061417 0.7891259  0.77253192 0.90131686]
Cross-validation R2 scores for alpha=0.1: [0.89774358 0.91230557 0.79010977 0.77576412 0.90558729]
Cross-validation R2 scores for alpha=1: [0.89815807 0.91744024 0.79493606 0.78522563 0.91389818]
Cross-validation R2 scores for alpha=10: [0.8977621  0.92081211 0.80057243 0.78711955 0.91509487]


In [28]:
from sklearn.linear_model import Lasso

lasso_pipeline = Pipeline([
    ("preprocessing", ct),
    ("lasso_regression", Lasso(alpha=1.0, max_iter=10000))  # changed here
])

cross_val_score(lasso_pipeline, X, y, cv = 5, scoring = 'r2')

array([0.89774385, 0.91093785, 0.79691806, 0.77426245, 0.90589714])

In [41]:
from sklearn.linear_model import Lasso

lasso_pipeline = Pipeline([
    ("preprocessing", ct),
    ("lasso_regression", Lasso(alpha=.001))  # changed here
])

alpha_0001_score = cross_val_score(lasso_pipeline, X, y, cv = 5, scoring = 'r2')

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [42]:
from sklearn.linear_model import Lasso

lasso_pipeline = Pipeline([
    ("preprocessing", ct),
    ("lasso_regression", Lasso(alpha=.01))  # changed here
])

alpha_001_score = cross_val_score(lasso_pipeline, X, y, cv = 5, scoring = 'r2')

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [43]:
from sklearn.linear_model import Lasso

lasso_pipeline = Pipeline([
    ("preprocessing", ct),
    ("lasso_regression", Lasso(alpha=.1))  # changed here
])

alpha_01_score = cross_val_score(lasso_pipeline, X, y, cv = 5, scoring = 'r2')

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [44]:
from sklearn.linear_model import Lasso

lasso_pipeline = Pipeline([
    ("preprocessing", ct),
    ("lasso_regression", Lasso(alpha=1))  # changed here
])

alpha_1_score = cross_val_score(lasso_pipeline, X, y, cv = 5, scoring = 'r2')

  model = cd_fast.enet_coordinate_descent(


In [52]:
from sklearn.linear_model import Lasso

lasso_pipeline = Pipeline([
    ("preprocessing", ct),
    ("lasso_regression", Lasso(alpha=10))  # changed here
])

alpha_10_score = cross_val_score(lasso_pipeline, X, y, cv = 5, scoring = 'r2')

In [53]:
from sklearn.linear_model import Lasso

lasso_pipeline = Pipeline([
    ("preprocessing", ct),
    ("lasso_regression", Lasso(alpha=100))  # changed here
])

alpha_100_score = cross_val_score(lasso_pipeline, X, y, cv = 5, scoring = 'r2')

In [58]:
# Print Linear Regression cross-validation scores
lr_scores = cross_val_score(lr_pipeline_1, X, y, cv = 5, scoring = 'r2')
print(f"Linear Regression cross-validation R2 scores: {lr_scores}")

# Print Ridge Regression cross-validation scores for different alphas
alpha_values = [0.001, 0.01, 0.1, 1, 10]
cv_scores = {}

for alpha in alpha_values:
    ridge_pipeline = Pipeline([
        ("preprocessing", ct),
        ("ridge_regression", Ridge(alpha=alpha))
    ])
    scores = cross_val_score(ridge_pipeline, X, y, cv=5, scoring='r2')
    cv_scores[alpha] = scores.mean() # Keep the mean stored in the dictionary
    print(f"Cross-validation R2 scores for alpha={alpha}: {scores}")
# Print Lasso Regression cross-validation scores for different alphas
print("\nLasso Regression cross-validation R2 scores for different alphas:")
print(f"  alpha=0.001: {alpha_0001_score}")
print(f"  alpha=0.01: {alpha_001_score}")
print(f"  alpha=0.1: {alpha_01_score}")
print(f"  alpha=1: {alpha_1_score}")
print(f"  alpha=10: {alpha_10_score}")
print(f"  alpha=100: {alpha_100_score}")

Linear Regression cross-validation R2 scores: [0.89727873 0.91038253 0.78900365 0.77208628 0.9006982 ]
Cross-validation R2 scores for alpha=0.001: [0.8972854  0.91040618 0.78901601 0.7721318  0.90076168]
Cross-validation R2 scores for alpha=0.01: [0.89734306 0.91061417 0.7891259  0.77253192 0.90131686]
Cross-validation R2 scores for alpha=0.1: [0.89774358 0.91230557 0.79010977 0.77576412 0.90558729]
Cross-validation R2 scores for alpha=1: [0.89815807 0.91744024 0.79493606 0.78522563 0.91389818]
Cross-validation R2 scores for alpha=10: [0.8977621  0.92081211 0.80057243 0.78711955 0.91509487]

Lasso Regression cross-validation R2 scores for different alphas:
  alpha=0.001: [0.8972019  0.9103958  0.79032004 0.77402031 0.90555653]
  alpha=0.01: [0.89720561 0.91040134 0.79085941 0.77406031 0.90550225]
  alpha=0.1: [0.89725821 0.91045103 0.79595065 0.77407171 0.90535981]
  alpha=1: [0.89774385 0.91093785 0.79691806 0.77426245 0.90589888]
  alpha=10: [0.90077569 0.91506699 0.80141962 0.776649