In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
ames = pd.read_csv("/content/AmesHousing.csv")


In [4]:
X = ames.drop("SalePrice", axis = 1)
y = ames["SalePrice"]
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [6]:
ct1 = ColumnTransformer(
  [
    #("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)
#remainder drop means any columns we are not calling to drop those columns and not use them

lr_pipeline1 = Pipeline(
  [("preprocessing", ct1),
  ("linear_regression", LinearRegression())]
)

lr_pipeline1
scores = cross_val_score(lr_pipeline1, X, y, cv=5, scoring='r2')
scores.mean()

lr_fittd = lr_pipeline1.fit(X_train, y_train)
y_pred = lr_pipeline1.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
npsqrt = np.sqrt(mse)
npsqrt


52405.70983772867

In [7]:
scores = cross_val_score(lr_pipeline1, X, y, cv=5, scoring='r2')
scores.mean()

0.504208752508862

In [8]:
ct2 = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)
#remainder drop means any columns we are not calling to drop those columns and not use them

lr_pipeline2 = Pipeline(
  [("preprocessing", ct2),
  ("linear_regression", LinearRegression())]
)

lr_pipeline2

lr_fitted = lr_pipeline2.fit(X_train, y_train)
y_pred = lr_pipeline1.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
npsqrt = np.sqrt(mse)
npsqrt





52405.70983772867

In [13]:
#scores = cross_val_score(lr_pipeline2, X, y, cv=5, scoring='r2')
scores = cross_val_score(lr_pipeline2, X, y, cv=5, scoring='neg_root_mean_squared_error')
scores.mean()

-54140.66302092876

In [15]:
ct3 = ColumnTransformer(
  [("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area"])
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

ct3.fit_transform(X)

ct_inter = ColumnTransformer(
  [
    ("interaction", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv Area", "dummify__Bldg Type_1Fam"]),
    ("interaction2", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv Area", "dummify__Bldg Type_TwnhsE"]),
    ("interaction3", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv Area", "dummify__Bldg Type_Twnhs"]),
    ("interaction4", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv Area", "dummify__Bldg Type_Duplex"]),
    ("interaction5", PolynomialFeatures(interaction_only = True), ["standardize__Gr Liv Area", "dummify__Bldg Type_2fmCon"]),
  ],
  remainder = "drop"
).set_output(transform = "pandas")


lr_pipeline3 = Pipeline(
  [("preprocessing", ct3),
    ("preprocessing2", ct_inter),
  ("linear_regression", LinearRegression())])
lr_pipeline3


lr_fitted = lr_pipeline3.fit(X_train, y_train)
y_pred = lr_pipeline3.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
npsqrt = np.sqrt(mse)
npsqrt


49729.96593682053

In [16]:
scores = cross_val_score(lr_pipeline3, X, y, cv=5, scoring='neg_root_mean_squared_error')
scores.mean()

-53435.02493709766

In [17]:


# Combine the transformations into one ColumnTransformer
ct_combined = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("poly_totrms", PolynomialFeatures(degree=5), ["TotRms AbvGrd"]),
        ("poly_grlivarea", PolynomialFeatures(degree=5), ["Gr Liv Area"])
    ],
    remainder="drop"
).set_output(transform="pandas")

# Create pipeline with the combined ColumnTransformer
lr_pipeline_14 = Pipeline(
    [
        ("preprocessing", ct_combined),
        ("linear_regression", LinearRegression())
    ]
).set_output(transform="pandas")



lr_fitted = lr_pipeline_14.fit(X_train, y_train)
y_pred = lr_pipeline_14.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
npsqrt = np.sqrt(mse)
npsqrt


52577.34743089861

In [19]:
# Perform cross-validation
scores = cross_val_score(lr_pipeline_14, X, y, cv=5, scoring='neg_root_mean_squared_error')
scores.mean()

-56303.01901575634

Consider one hundred modeling options for house price:


House size, trying degrees 1 through 10

Number of rooms, trying degrees 1 through 10

Building Type

Hint: The dictionary of possible values that you make to give to GridSearchCV will have two elements instead of one.

Q1: Which model performed the best?

Q2: What downsides do you see of trying all possible model options? How might you go about choosing a smaller number of tuning values to try?



In [21]:
from sklearn.model_selection import GridSearchCV

ct_poly = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("polynomial", PolynomialFeatures(), ["Gr Liv Area"]),
    ("polynomial2", PolynomialFeatures(), ["TotRms AbvGrd"])
  ],
  remainder = "drop"
)

lr_pipeline_poly = Pipeline(
  [("preprocessing", ct_poly),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

degrees = {'preprocessing__polynomial__degree': np.arange(1, 11),
           "preprocessing__polynomial2__degree": np.arange(1, 11)}

gscv = GridSearchCV(lr_pipeline_poly, degrees, cv = 5, scoring='r2')

In [22]:
gscv_fitted = gscv.fit(X, y)

gscv_fitted.cv_results_
gscv_fitted.cv_results_["mean_test_score"]

array([ 5.32882439e-01,  5.37471938e-01,  5.57640609e-01,  5.49239651e-01,
        4.51860123e-01,  3.33837438e-01,  2.93217770e-02, -9.68094802e-01,
       -4.54560839e+00, -1.61879353e+01,  5.32382847e-01,  5.33567353e-01,
        5.56857257e-01,  5.50157883e-01,  4.51860121e-01,  3.33837438e-01,
        2.93217540e-02, -9.68096194e-01, -4.54560441e+00, -1.61879378e+01,
        5.35924169e-01,  5.34134134e-01,  5.54039049e-01,  5.50627573e-01,
        5.05207779e-01,  3.33837438e-01,  2.93216723e-02, -9.68095979e-01,
       -4.54560441e+00, -1.61879378e+01,  5.41528749e-01,  5.35417599e-01,
        5.50392432e-01,  5.56932107e-01,  4.96715171e-01,  3.33837440e-01,
        2.93216921e-02, -9.68095979e-01, -4.54560441e+00, -1.61879378e+01,
        5.41066183e-01,  5.30267305e-01,  5.46549255e-01,  5.56413549e-01,
        4.92694119e-01,  3.33837437e-01,  2.93216921e-02, -9.68095979e-01,
       -4.54560441e+00, -1.61879378e+01,  5.34862257e-01,  5.33313563e-01,
        5.45170683e-01,  

In [24]:
paramsdf = pd.DataFrame(gscv_fitted.cv_results_['params'])
df = paramsdf.assign(score = gscv_fitted.cv_results_['mean_test_score']).sort_values(by = "score", ascending = False)
df

Unnamed: 0,preprocessing__polynomial2__degree,preprocessing__polynomial__degree,score
2,1,3,0.557641
33,4,4,0.556932
12,2,3,0.556857
43,5,4,0.556414
22,3,3,0.554039
...,...,...,...
89,9,10,-16.188760
99,10,10,-16.188760
90,10,1,-184.221203
91,10,2,-189.473656


 Which model performed the best?

 The model that performed best is the one that has the highest R squared value which can be seen by how we organized the df.

Q2: What downsides do you see of trying all possible model options? How might you go about choosing a smaller number of tuning values to try?

This could be very time consuming if you had a large number of variables and at some point might not be practical either.