In [40]:
import warnings

warnings.filterwarnings(action='ignore', category=FutureWarning, module='sklearn')

In [53]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error

df_housing = pd.read_csv('./AmesHousing.csv')
df_housing.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [42]:
df_housing.columns

Index(['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
      

In [43]:
X = df_housing.drop('SalePrice', axis = 1)
y = df_housing['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [44]:
num_transformer = Pipeline(
    [("imputer", SimpleImputer(strategy='mean')),
     ("scaler", StandardScaler())]
)

ct = ColumnTransformer(
    [("num", num_transformer, ['Gr Liv Area', 'TotRms AbvGrd'])],
    remainder="drop"
)

lr_pipeline_1 = Pipeline(
    [("preprocessing", ct),
     ("lr", LinearRegression())]
)

lr_pipeline_1.fit(X_train, y_train)
y_pred = lr_pipeline_1.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
np.sqrt(mse)


58186.942963912246

In [45]:
scores = cross_val_score(lr_pipeline_1, X, y, cv=5, scoring='neg_mean_squared_error')
scores = np.sqrt(-scores)
scores.mean()

55806.32634926364

In [46]:
cat_transformer = Pipeline(
    [("impute", SimpleImputer(strategy="most_frequent")),
     ("ohe", OneHotEncoder(sparse_output = False))]
)

ct = ColumnTransformer(
    [("num", num_transformer, ['Gr Liv Area', 'TotRms AbvGrd']),
     ("cat", cat_transformer, ['Bldg Type'])],
    remainder="drop"
)

lr_pipeline_2 = Pipeline(
    [("preprocessing", ct),
     ("lr", LinearRegression())]
)

lr_pipeline_2.fit(X_train, y_train)
y_pred = lr_pipeline_2.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
np.sqrt(mse)


56461.0481441448

In [47]:
scores = cross_val_score(lr_pipeline_2, X, y, cv=5, scoring='neg_mean_squared_error')
scores = np.sqrt(-scores)
scores.mean()

54168.081429193844

In [48]:
ct = ColumnTransformer(
    [("num", num_transformer, ['Gr Liv Area']),
     ("cat", cat_transformer, ['Bldg Type'])],
    remainder="drop"
)

lr_pipeline_3 = Pipeline(
    [("preprocessing", ct),
     ("interaction", PolynomialFeatures(interaction_only = True)),
     ("lr", LinearRegression())]
)

lr_pipeline_3.fit(X_train, y_train)
y_pred = lr_pipeline_3.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
np.sqrt(mse)

55468.6798402389

In [49]:
scores = cross_val_score(lr_pipeline_3, X, y, cv=5, scoring='neg_mean_squared_error')
scores = np.sqrt(-scores)
scores.mean()

53430.92197532816

In [50]:
ct = ColumnTransformer(
    [("num", num_transformer, ['Gr Liv Area']),
     ("cat", cat_transformer, ['Bldg Type']),
     ("poly", PolynomialFeatures(degree=5, include_bias=False), ['Gr Liv Area', 'TotRms AbvGrd'])],
    remainder="drop"
)

lr_pipeline_4 = Pipeline(
    [("preprocessing", ct),
     ("lr", LinearRegression())]
)

lr_pipeline_4.fit(X_train, y_train)
y_pred = lr_pipeline_4.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
np.sqrt(mse)

60172.485455291884

In [51]:
scores = cross_val_score(lr_pipeline_4, X, y, cv=5, scoring='neg_mean_squared_error')
scores = np.sqrt(-scores)
scores.mean()

60202.43327789127

Best model is size and building type, and their interaction for both rmse and cross-val rmse

In [63]:
best_rmse = np.inf
best_degrees = None

feature_union = FeatureUnion(
    [("num", ColumnTransformer([("num", num_transformer, ['Gr Liv Area', 'TotRms AbvGrd'])], remainder="drop")),
     ("cat", ColumnTransformer([("cat", cat_transformer, ['Bldg Type'])], remainder="drop")),
     ("poly_size", ColumnTransformer([("poly", PolynomialFeatures(), ['Gr Liv Area'])], remainder="drop")),
     ("poly_room", ColumnTransformer([("poly", PolynomialFeatures(), ['TotRms AbvGrd'])], remainder="drop"))],
)

pipeline = Pipeline(
    [("preprocessing", feature_union),
    ("lr", LinearRegression())]
)

degrees = {
    'preprocessing__poly_size__poly__degree': np.arange(1, 11),
    'preprocessing__poly_room__poly__degree': np.arange(1, 11)
}

gscv = GridSearchCV(pipeline, degrees, cv=5, scoring='neg_mean_squared_error')
gscv.fit(X, y)

best_parameters = gscv.best_params_
best_rmse = np.sqrt(-gscv.best_score_)

print(f"Best parameters: {best_parameters}")
print(f"Best RMSE: {best_rmse}")


Best parameters: {'preprocessing__poly_room__poly__degree': 1, 'preprocessing__poly_size__poly__degree': 3}
Best RMSE: 52896.32273784652


In [64]:
gscv = GridSearchCV(pipeline, degrees, cv=5, scoring='r2')
gscv.fit(X, y)

best_parameters = gscv.best_params_
best_r2 = gscv.best_score_

print(f"Best parameters: {best_parameters}")
print(f"Best R-squared: {best_r2}")

Best parameters: {'preprocessing__poly_room__poly__degree': 1, 'preprocessing__poly_size__poly__degree': 3}
Best R-squared: 0.5576406124353915
