# Preprocessing, Models and Results - Ames Housing Data

Having cleaned and selected preliminary features, in this notebook we prepare the data for modeling, and model and score using Linear Regression, Lasso and Ridge.

## Import Data from EDA

In [11]:
# Import the usual suspects
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn import set_config
set_config(display='diagram')
from sklearn.impute import SimpleImputer

In [12]:
train = pd.read_csv('../data/train_cleaned.csv')
train.drop(columns='Unnamed: 0', inplace=True)
test = pd.read_csv('../data/test_cleaned.csv')
test.drop(columns='Unnamed: 0', inplace=True)

## Null-Baseline Model

In [13]:
sale_means = np.full_like(train['SalePrice'], train['SalePrice'].mean())

In [14]:
mean_squared_error(train['SalePrice'], sale_means, squared=False)

79276.56098763898

## Linear Regression Numericals

Let's start with a simple Linear Regression on the numerical features selected in EDA.

In [15]:
X = train[['Overall Qual', 'Gr Liv Area', 'Garage Area', 'Year Built', 'Year Remod/Add']]
y = train['SalePrice']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1331)

In [17]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)
linreg.score(X_test, y_test)

0.7743928990861233

In [18]:
preds = linreg.predict(X_test)

### RMSE

In [19]:
mean_squared_error(y_test, preds, squared=False)

37279.67084087376

### Kaggle submission

In [20]:
test['SalePrice'] = linreg.predict(test[['Overall Qual', 'Gr Liv Area', 'Garage Area', 'Year Built', 'Year Remod/Add']])

In [21]:
test[['Id', 'SalePrice']].to_csv('../data/submission_baseline.csv', index=False)

## Location, Location, Location model
Fit a Linear Regression using only the Neighborhoods features.

In [22]:
X = train[['Neighborhood']]
y = train['SalePrice']
kaggle = test[['Neighborhood']]

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1331)

In [24]:
ct = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), make_column_selector(dtype_include=object)),
    remainder='passthrough',
    verbose_feature_names_out=False
)
ct

In [25]:
pipe = make_pipeline(ct, StandardScaler(with_mean=False), LinearRegression())
pipe

In [26]:
pipe.fit(X_train, y_train)

In [27]:
pipe.score(X_test, y_test)

0.5985982825845704

In [28]:
preds = pipe.predict(X_test)

In [29]:
mean_squared_error(y_test, preds, squared=False
)

49726.203146518536

Our RMSE using only neighborhood is considerably worse than our baseline Linear Regression.

### Kaggle submission

In [30]:
test['SalePrice'] = pipe.predict(kaggle)

In [31]:
test[['Id', 'SalePrice']].to_csv('../data/submission_location_linreg.csv', index=False)

## Incorporate all location proxies identified in EDA

In addtion to the Neighborhoods feature, the following features were identified as location proxies in EDA:

- Lot Shape
- Lot Config
- Condition 1
- Condition 2

In [32]:
X = train[['Lot Shape', 'Lot Config', 'Neighborhood', 'Condition 1', 'Condition 2']]
y = train['SalePrice']
kaggle = test[['Lot Shape', 'Lot Config', 'Neighborhood', 'Condition 1', 'Condition 2']]

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1331)

In [34]:
ct2 = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), make_column_selector(dtype_include=object)),
    remainder='passthrough',
    verbose_feature_names_out=False
)
ct2

In [35]:
pipe2 = make_pipeline(ct2, StandardScaler(with_mean=False), LinearRegression())
pipe2

In [36]:
pipe2.fit(X_train, y_train)

In [37]:
pipe2.score(X_test, y_test)

0.6262624740055753

In [38]:
preds = pipe2.predict(X_test)

In [39]:
mean_squared_error(y_test, preds, squared=False)

47982.07662062991

Our RMSE using location proxies is considerably worse than our baseline Linear Regression, even if it is better than using only Neighborhood.

### Kaggle submission

In [40]:
test['SalePrice'] = pipe2.predict(kaggle)

In [41]:
test[['Id', 'SalePrice']].to_csv('../data/submission_all_location_linreg.csv', index=False)

## Combine all selected features and fit on Linear Regression

### Train-Test split

In [42]:
X = train.drop(columns='SalePrice')
y = train['SalePrice']
kaggle = test[X.columns]

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1331)

### Column transformations

In [44]:
ct3 = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), make_column_selector(dtype_include=object)),
    remainder='passthrough',
    verbose_feature_names_out=False
)
ct3

### Pipeline with Column transformer, Standard Scaler and Linear Regression

In [45]:
pipe3 = make_pipeline(ct3, StandardScaler(with_mean=False), LinearRegression())
pipe3

### Fit, Predict and Score

In [46]:
pipe3.fit(X_train, y_train)

In [47]:
pipe3.score(X_test, y_test)

0.8686755662913601

In [48]:
preds = pipe3.predict(X_test)

In [49]:
mean_squared_error(y_test, preds, squared=False)

28442.545078913496

Our RMSE using all features selected in EDA is considerably better than our baseline Linear Regression.

### Kaggle submission

In [50]:
test['SalePrice'] = pipe3.predict(kaggle)

In [51]:
test[['Id', 'SalePrice']].to_csv('../data/submission_all_EDA_linreg.csv', index=False)

## Lasso and Ridge using GridSearchCV

### Train-Test Split

In [52]:
X = train.drop(columns='SalePrice')
y = train['SalePrice']
kaggle = test[X.columns]

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1331)

### Column transformations

In [54]:
ct4 = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), make_column_selector(dtype_include=object)),
    remainder='passthrough',
    verbose_feature_names_out=False
)
ct4

### Pipeline with Column transformer, Standard Scaler and Lasso Regression

In [55]:
pipe4 = make_pipeline(ct4, StandardScaler(with_mean=False), Lasso())
pipe4

In [56]:
params = {
    'lasso__alpha': [.01, .1, 1, 10, 100],
    'lasso__max_iter': [100_000]
}

### Grid Search

In [57]:
gs1 = GridSearchCV(pipe4, params, n_jobs=-1)
gs1

In [58]:
gs1.fit(X_train, y_train)
gs1.score(X_test, y_test)

  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


0.8694410739351349

In [59]:
preds = gs1.predict(X_test)
mean_squared_error(y_test, preds, squared=False)

28359.526225239984

### Kaggle submission Lasso

In [60]:
test['SalePrice'] = gs1.predict(kaggle)

In [61]:
test[['Id', 'SalePrice']].to_csv('../data/submission_all_EDA_lasso.csv', index=False)

### Ridge Regression pipeline  

In [62]:
pipe5 = make_pipeline(ct4, StandardScaler(with_mean=False), Ridge())
pipe5

In [63]:
pipe5.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'columntransformer', 'standardscaler', 'ridge', 'columntransformer__n_jobs', 'columntransformer__remainder', 'columntransformer__sparse_threshold', 'columntransformer__transformer_weights', 'columntransformer__transformers', 'columntransformer__verbose', 'columntransformer__verbose_feature_names_out', 'columntransformer__onehotencoder', 'columntransformer__onehotencoder__categories', 'columntransformer__onehotencoder__drop', 'columntransformer__onehotencoder__dtype', 'columntransformer__onehotencoder__handle_unknown', 'columntransformer__onehotencoder__sparse', 'standardscaler__copy', 'standardscaler__with_mean', 'standardscaler__with_std', 'ridge__alpha', 'ridge__copy_X', 'ridge__fit_intercept', 'ridge__max_iter', 'ridge__normalize', 'ridge__positive', 'ridge__random_state', 'ridge__solver', 'ridge__tol'])

In [64]:
params = {
    'ridge__alpha': [.01, .1, 1, 10, 100],
    'ridge__max_iter': [100_000]
}

In [65]:
gs2 = GridSearchCV(pipe5, params, n_jobs=-1)
gs2

In [66]:
gs2.fit(X_train, y_train)
gs2.score(X_test, y_test)

0.8703680799485327

In [67]:
preds = gs2.predict(X_test)
mean_squared_error(y_test, preds, squared=False)

28258.666467040028

### Get most important features and their coeficients

In [68]:
coefs = gs2.best_estimator_.named_steps['ridge'].coef_

In [69]:
feature_names = gs2.best_estimator_.named_steps.get('columntransformer').get_feature_names_out()

In [70]:
pd.DataFrame(zip(feature_names, coefs)).sort_values(by=1, ascending=False).head(10)

Unnamed: 0,0,1
100,Gr Liv Area,24879.064311
98,Overall Qual,15318.891758
101,Garage Area,10796.906749
96,Year Built,10781.465885
71,Kitchen Qual_Ex,9359.936066
32,Neighborhood_NridgHt,8627.049868
62,Exter Qual_Ex,7206.545181
31,Neighborhood_NoRidge,5614.944538
38,Neighborhood_StoneBr,5072.002438
57,Bldg Type_1Fam,4689.916928


### Kaggle submission

In [71]:
test['SalePrice'] = gs2.predict(kaggle)

In [72]:
test[['Id', 'SalePrice']].to_csv('../data/submission_all_EDA_ridge.csv', index=False)

## Bonus attempt for Kaggle. Polynomial features, Ridge


### Train-Test Split

In [73]:
X = train.drop(columns='SalePrice')
y = train['SalePrice']
kaggle = test[X.columns]

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1331)

In [75]:
pipe6 = make_pipeline(ct4, PolynomialFeatures(), StandardScaler(with_mean=False), Ridge())
pipe6

In [76]:
pipe6.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'columntransformer', 'polynomialfeatures', 'standardscaler', 'ridge', 'columntransformer__n_jobs', 'columntransformer__remainder', 'columntransformer__sparse_threshold', 'columntransformer__transformer_weights', 'columntransformer__transformers', 'columntransformer__verbose', 'columntransformer__verbose_feature_names_out', 'columntransformer__onehotencoder', 'columntransformer__onehotencoder__categories', 'columntransformer__onehotencoder__drop', 'columntransformer__onehotencoder__dtype', 'columntransformer__onehotencoder__handle_unknown', 'columntransformer__onehotencoder__sparse', 'polynomialfeatures__degree', 'polynomialfeatures__include_bias', 'polynomialfeatures__interaction_only', 'polynomialfeatures__order', 'standardscaler__copy', 'standardscaler__with_mean', 'standardscaler__with_std', 'ridge__alpha', 'ridge__copy_X', 'ridge__fit_intercept', 'ridge__max_iter', 'ridge__normalize', 'ridge__positive', 'ridge__random_state', 'ridge__solver'

In [77]:
params = {
    'ridge__alpha': [.01, .1, 1, 10, 100],
    'ridge__max_iter': [100_000]
}

In [78]:
gs3 = GridSearchCV(pipe6, params, n_jobs=-1)
gs3

In [79]:
gs3.fit(X_train, y_train)
gs3.score(X_test, y_test)

0.8576599948036657

In [80]:
preds = gs3.predict(X_test)
mean_squared_error(y_test, preds, squared=False)

29611.415986208664

### Kaggle submission

In [81]:
test['SalePrice'] = gs3.predict(kaggle)

In [82]:
test[['Id', 'SalePrice']].to_csv('../data/submission_all_EDA_poly_ridge.csv', index=False)

## Bonus attempt for Kaggle. Ridge on all features


In [100]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

si = SimpleImputer(strategy='most_frequent')

### Train-Test Split

In [101]:
X = train.drop(columns='SalePrice')
y = train['SalePrice']
kaggle = test[X.columns]


In [102]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1331)

In [107]:
pipe6 = make_pipeline(ct4, SimpleImputer(strategy='most_frequent'), StandardScaler(with_mean=False), Ridge())
pipe6

In [108]:
params = {
    'ridge__alpha': [.01, .1, 1, 10, 100],
    'ridge__max_iter': [100_000]
}

In [109]:
gs3 = GridSearchCV(pipe6, params, n_jobs=-1)
gs3

In [110]:
gs3.fit(X_train, y_train)
gs3.score(X_test, y_test)

0.923511269853292

In [111]:
preds = gs3.predict(X_test)
mean_squared_error(y_test, preds, squared=False)

22788.581379038846

### Kaggle submission

In [98]:
test['SalePrice'] = gs3.predict(kaggle)

In [99]:
test[['Id', 'SalePrice']].to_csv('../data/submission_all_features_ridge.csv', index=False)

### Get most important features and their coeficients

In [112]:
coefs = gs3.best_estimator_.named_steps['ridge'].coef_

In [113]:
feature_names = gs3.best_estimator_.named_steps.get('columntransformer').get_feature_names_out()

In [114]:
pd.DataFrame(zip(feature_names, coefs)).sort_values(by=1, ascending=False).head(10)

Unnamed: 0,0,1
282,Gr Liv Area,11652.535379
278,Total Bsmt SF,8620.313822
270,Overall Qual,8612.523756
279,1st Flr SF,7953.134179
275,BsmtFin SF 1,6851.08039
280,2nd Flr SF,6475.638365
49,Neighborhood_NridgHt,5265.637016
269,Lot Area,5071.178403
274,Mas Vnr Area,4779.61526
48,Neighborhood_NoRidge,4654.236618


In [115]:
pd.DataFrame(zip(feature_names, coefs)).sort_values(by=1, ascending=False).tail(10)

Unnamed: 0,0,1
79,Bldg Type_TwnhsE,-2047.775917
287,Bedroom AbvGr,-2156.102487
50,Neighborhood_OldTown,-2331.213872
45,Neighborhood_NAmes,-2469.008807
300,Misc Val,-2538.656783
37,Neighborhood_Edwards,-2738.200875
231,Garage Cond_Ex,-2782.091349
288,Kitchen AbvGr,-2967.891819
250,Misc Feature_Elev,-10608.77934
94,Roof Matl_ClyTile,-12914.562415
