# Preprocessing and Feature Engineering - Ames Housing Data

Having cleaned and selected preliminary features, in this notebook we prepare the data for modeling.

## Import Data from EDA

In [2]:
# Import the usual suspects
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [11]:
train = pd.read_csv('../data/train_cleaned.csv')
train.drop(columns='Unnamed: 0', inplace=True)
test = pd.read_csv('../data/test.csv')

In [4]:
# Sanity check
train.head()

Unnamed: 0,Lot Shape,Land Contour,Lot Config,Neighborhood,Bldg Type,Year Built,Year Remod/Add,Exter Qual,Exter Cond,Overall Qual,Overall Cond,Gr Liv Area,Kitchen Qual,Garage Area,Garage Qual,Garage Cond,Sale Type,SalePrice
0,IR1,Lvl,CulDSac,Sawyer,1Fam,1976,2005,Gd,TA,6,8,1479,Gd,475.0,TA,TA,WD,130500
1,IR1,Lvl,CulDSac,SawyerW,1Fam,1996,1997,Gd,TA,7,5,2122,Gd,559.0,TA,TA,WD,220000
2,Reg,Lvl,Inside,NAmes,1Fam,1953,2007,TA,Gd,5,7,1057,Gd,246.0,TA,TA,WD,109000
3,Reg,Lvl,Inside,Timber,1Fam,2006,2007,TA,TA,5,5,1444,TA,400.0,TA,TA,WD,174000
4,IR1,Lvl,Inside,SawyerW,1Fam,1900,1993,TA,TA,6,8,1445,TA,484.0,TA,TA,WD,138500


## Linear Regression

Let's start with a simple Linear Regression on the numerical features selected in EDA. I will use this as my baseline model, using Root Mean Squared Error as score to align with Kaggle.

In [5]:
X = train[['Overall Qual', 'Gr Liv Area', 'Garage Area', 'Year Built', 'Year Remod/Add']]
y = train['SalePrice']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [7]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)

LinearRegression()

In [8]:
preds = linreg.predict(X_test)

## Baseline RMSE

In [10]:
mean_squared_error(y_test, preds, squared=False)

34478.37446761926

## Save Baseline for Kaggle submission

In [15]:
test['SalePrice'] = linreg.predict(test[['Overall Qual', 'Gr Liv Area', 'Garage Area', 'Year Built', 'Year Remod/Add']])

In [16]:
test[['Id', 'SalePrice']].to_csv('../data/submission_linreg1.csv', index=False)

## Linear Regression with Standard Scaling

In [None]:
ss = StandardScaler()

X_scaled = ss.fit_transform(X)
X_test_scaled = ss.transform(X_test)

In [None]:
linreg.fit(X_scaled, y)
preds_scaled = linreg.predict(X_test_scaled)

In [None]:
test['SalePrice'] = preds

In [None]:
test[['Id', 'SalePrice']].to_csv('../data/submission_linreg_scaled.csv', index=False)

## Linear Regression, including binarized 'Neighborhood', and Standard Scaler

In [None]:
X_test = test[['Neighborhood', 'Overall Qual', 'Gr Liv Area', 'Garage Area', 'Year Built', 'Year Remod/Add']]
X = train[['Neighborhood', 'Overall Qual', 'Gr Liv Area', 'Garage Area', 'Year Built', 'Year Remod/Add']]
y = train['SalePrice']

In [None]:
# Use pandas .get_dummies() to binarize 'Neighborhood'. With help from https://stackoverflow.com/questions/32387266/converting-categorical-values-to-binary-using-pandas
# Drop first binary column
X_test = pd.get_dummies(X_test, drop_first=True)
X = pd.get_dummies(X, drop_first=True)

**I'm getting and error when standard scaling, apparently there are neighborhood columns missing in X or X_test, will add these columns with all "0" where necesssary**

In [None]:
X.columns

In [None]:
X_test.columns

In [None]:
X_test['Neighborhood_GrnHill'] = 0
X_test['Neighborhood_Landmrk'] = 0

In [None]:
# A little help here from https://stackoverflow.com/questions/11067027/re-ordering-columns-in-pandas-dataframe-based-on-column-name
# Keeps dummies in same place as X
X_test = X_test.reindex(sorted(X_test.columns), axis=1)
X = X.reindex(sorted(X.columns), axis=1)


In [None]:
X.columns

In [None]:
X_test.columns

In [None]:
X.columns == X_test.columns

In [None]:
X.shape

In [None]:
ss = StandardScaler()

X_scaled = ss.fit_transform(X)
X_test_scaled = ss.transform(X_test)

In [None]:
linreg = LinearRegression()

linreg.fit(X_scaled, y)
preds_scaled = linreg.predict(X_test_scaled)

In [None]:
test['SalePrice'] = preds_scaled

In [None]:
test[['Id', 'SalePrice']].to_csv('../data/submission_linreg_scaled_neighbrhd.csv', index=False)

In [None]:
linreg.coef_

In [None]:
pd.DataFrame(linreg.coef_, X_test.columns).sort_values(0, ascending=False)

## LinReg on Neighborhoods only

In [None]:
X_test = test['Neighborhood']
X = train['Neighborhood']
y = train['SalePrice']

In [None]:
# Use pandas .get_dummies() to binarize 'Neighborhood'. With help from https://stackoverflow.com/questions/32387266/converting-categorical-values-to-binary-using-pandas
# Drop first binary column
X_test = pd.get_dummies(X_test, drop_first=True)
X = pd.get_dummies(X, drop_first=True)

**I'm getting and error when standard scaling, apparently there are neighborhood columns missing in X or X_test, will add these columns with all "0" where necesssary**

In [None]:
X.columns

In [None]:
X_test.columns

In [None]:
X_test['GrnHill'] = 0
X_test['Landmrk'] = 0

In [None]:
# A little help here from https://stackoverflow.com/questions/11067027/re-ordering-columns-in-pandas-dataframe-based-on-column-name
# Keeps dummies in same place as X
X_test = X_test.reindex(sorted(X_test.columns), axis=1)
X = X.reindex(sorted(X.columns), axis=1)


In [None]:
X.columns

In [None]:
X_test.columns

In [None]:
X.columns == X_test.columns

In [None]:
ss = StandardScaler()

X_scaled = ss.fit_transform(X)
X_test_scaled = ss.transform(X_test)

In [None]:
linreg = LinearRegression()

linreg.fit(X_scaled, y)
preds_scaled = linreg.predict(X_test_scaled)

In [None]:
test['SalePrice'] = preds_scaled

In [None]:
test[['Id', 'SalePrice']].to_csv('../data/submission_linreg_scaled_neighbrhd_only.csv', index=False)

In [None]:
linreg.coef_

In [None]:
pd.DataFrame(linreg.coef_, X_test.columns).sort_values(0, ascending=False)

## LinReg using only location features

First let's go back to the data description and redefine our 'interesting' features to include location proxies only.

MS Zoning, Lot Config, Neighborhood, Condition 1, Condition 2, 

In [None]:
# Import data
test = pd.read_csv('../data/test.csv')
train = pd.read_csv('../data/train.csv')
test.columns

In [None]:
# List interesting features from reading data description
interesting = [
    'MS Zoning',
    'Lot Config',
    'Neighborhood',
    'Condition 1',
    'Condition 2',
    'SalePrice',
]

In [None]:
# Keep only interesting features
train = train[interesting]


In [None]:
train.dtypes

In [None]:
X_test = test[interesting[:-1]]
X = train.drop(columns='SalePrice')
y = train['SalePrice']

In [None]:
# Use pandas .get_dummies() to binarize categorical columns. With help from https://stackoverflow.com/questions/32387266/converting-categorical-values-to-binary-using-pandas
# Drop first binary column
X_test = pd.get_dummies(X_test, drop_first=True)
X = pd.get_dummies(X, drop_first=True)

In [None]:
X.columns

In [None]:
X_test.columns

Missing columns in X_test: 

In [None]:
missing = [col for col in X.columns if col not in X_test.columns]
missing

In [None]:
X_test[missing] = 0

In [None]:
# A little help here from https://stackoverflow.com/questions/11067027/re-ordering-columns-in-pandas-dataframe-based-on-column-name
# Keeps dummies in same place as X
X_test = X_test.reindex(sorted(X_test.columns), axis=1)
X = X.reindex(sorted(X.columns), axis=1)


In [None]:
X.columns

In [None]:
X_test.columns

In [None]:
X.columns == X_test.columns

In [None]:
ss = StandardScaler()

X_scaled = ss.fit_transform(X)
X_test_scaled = ss.transform(X_test)

In [None]:
linreg = LinearRegression()

linreg.fit(X_scaled, y)
preds_scaled = linreg.predict(X_test_scaled)

In [None]:
test['SalePrice'] = preds_scaled

In [None]:
test[['Id', 'SalePrice']].to_csv('../data/submission_linreg_scaled_location.csv', index=False)

## Lasso Regression using interesting features and location features

In [None]:
# Import data
test = pd.read_csv('../data/test.csv')
train = pd.read_csv('../data/train.csv')
test.columns

In [None]:
# List interesting features from reading data description
interesting = [
    'MS Zoning',
    'Lot Config',
    'Neighborhood',
    'Condition 1',
    'Condition 2',
    'Bldg Type',
    'Overall Qual',
    'Gr Liv Area',
    'Garage Area',
    'Year Built',
    'Year Remod/Add',
    'SalePrice',
]

In [None]:
# Keep only interesting features
train = train[interesting]


In [None]:
train.dtypes

In [None]:
train['Garage Area'].fillna(0, inplace=True)
test['Garage Area'].fillna(0, inplace=True)

In [None]:
X_test = test[interesting[:-1]]
X = train.drop(columns='SalePrice')
y = train['SalePrice']

In [None]:
# Use pandas .get_dummies() to binarize categorical columns. With help from https://stackoverflow.com/questions/32387266/converting-categorical-values-to-binary-using-pandas
# Drop first binary column
X_test = pd.get_dummies(X_test, drop_first=True)
X = pd.get_dummies(X, drop_first=True)

In [None]:
X.columns

In [None]:
X_test.columns

Missing columns in X_test: 

In [None]:
missing = [col for col in X.columns if col not in X_test.columns]
missing

In [None]:
X_test[missing] = 0

In [None]:
X.shape

In [None]:
X_test.shape

In [None]:
# A little help here from https://stackoverflow.com/questions/11067027/re-ordering-columns-in-pandas-dataframe-based-on-column-name
# Keeps dummies in same place as X
X_test = X_test.reindex(sorted(X_test.columns), axis=1)
X = X.reindex(sorted(X.columns), axis=1)


In [None]:
X.columns

In [None]:
X_test.columns

In [None]:
X.columns == X_test.columns

In [None]:
ss = StandardScaler()

X_scaled = ss.fit_transform(X)
X_test_scaled = ss.transform(X_test)

In [None]:
X_scaled.shape

In [None]:
X_test_scaled.shape

In [None]:
lasso = Lasso(alpha=10, max_iter=10_000)

lasso.fit(X_scaled, y)
preds_scaled = lasso.predict(X_test_scaled)

In [None]:
test['SalePrice'] = preds_scaled

In [None]:
test[['Id', 'SalePrice']].to_csv('../data/submission_lasso_alpha_10.csv', index=False)

In [None]:
pd.DataFrame(lasso.coef_, X.columns).sort_values(by=0, ascending=False)

## Lasso Regression with CV

In [None]:
lassocv = LassoCV(max_iter=10_000)

lassocv.fit(X_scaled, y)
preds_scaled = lassocv.predict(X_test_scaled)
lassocv.alpha_

In [None]:
test['SalePrice'] = preds_scaled

In [None]:
test[['Id', 'SalePrice']].to_csv('../data/submission_lasso_cv.csv', index=False)

In [None]:
pd.DataFrame(lassocv.coef_, X.columns).sort_values(by=0, ascending=False)

## Ridge Regression with CV

In [None]:
ridge = RidgeCV()

ridge.fit(X_scaled, y)
preds_scaled = ridge.predict(X_test_scaled)

In [None]:
test['SalePrice'] = preds_scaled

In [None]:
test[['Id', 'SalePrice']].to_csv('../data/submission_ridge_cv.csv', index=False)

In [None]:
pd.DataFrame(ridge.coef_, X.columns).sort_values(by=0, ascending=False)

## Lasso Regression using interesting features and location features.

Since this has been my best scoring model so far, I'm going to run it again but using only the features from my LinReg that also scored well, and using CV.

In [None]:
# Import data
test = pd.read_csv('../data/test.csv')
train = pd.read_csv('../data/train.csv')

In [None]:
# List interesting features from reading data description
interesting = [
    'Neighborhood',
    'Overall Qual',
    'Gr Liv Area',
    'Garage Area',
    'Year Built',
    'Year Remod/Add',
    'SalePrice',
]

In [None]:
# Keep only interesting features
train = train[interesting]


In [None]:
train.dtypes

In [None]:
train['Garage Area'].fillna(0, inplace=True)
test['Garage Area'].fillna(0, inplace=True)

In [None]:
X_test = test[interesting[:-1]]
X = train.drop(columns='SalePrice')
y = train['SalePrice']

In [None]:
# Use pandas .get_dummies() to binarize categorical columns. With help from https://stackoverflow.com/questions/32387266/converting-categorical-values-to-binary-using-pandas
# Drop first binary column
X_test = pd.get_dummies(X_test, drop_first=True)
X = pd.get_dummies(X, drop_first=True)

Missing columns in X_test: 

In [None]:
missing = [col for col in X.columns if col not in X_test.columns]
missing

In [None]:
X_test[missing] = 0

In [None]:
X.shape

In [None]:
X_test.shape

In [None]:
# A little help here from https://stackoverflow.com/questions/11067027/re-ordering-columns-in-pandas-dataframe-based-on-column-name
# Keeps dummies in same place as X
X_test = X_test.reindex(sorted(X_test.columns), axis=1)
X = X.reindex(sorted(X.columns), axis=1)


In [None]:
X.columns == X_test.columns

In [None]:
ss = StandardScaler()

X_scaled = ss.fit_transform(X)
X_test_scaled = ss.transform(X_test)

In [None]:
X_scaled.shape

In [None]:
X_test_scaled.shape

In [None]:
lasso = LassoCV(max_iter=10_000)

lasso.fit(X_scaled, y)
preds_scaled = lasso.predict(X_test_scaled)

In [None]:
test['SalePrice'] = preds_scaled

In [None]:
test[['Id', 'SalePrice']].to_csv('../data/submission_lasso_neigh_num.csv', index=False)

In [None]:
pd.DataFrame(lasso.coef_, X.columns).sort_values(by=0, ascending=False)