# **Feature Engineering**

### **Imports**
---

In [246]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNetCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn import metrics

### **Load the data**
-----

In [247]:
train = pd.read_csv('../data/train_preprocessed.csv')
train.head()

Unnamed: 0,Id,parcel_id,zoning_type,lot_frontage,lot_area,util_avail,year_built,year_remodeled,masonry_veneer_area,exterior_quality,...,misc_features_nan,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_VWD,sale_type_WD
0,544,531379050,RL,43.0,11492.0,AllPub,1996,1997,132.0,3,...,1,0,0,0,0,0,0,0,0,1
1,153,535304180,RL,68.0,7922.0,AllPub,1953,2007,0.0,2,...,1,0,0,0,0,0,0,0,0,1
2,318,916386060,RL,73.0,9802.0,AllPub,2006,2007,0.0,2,...,1,0,0,0,0,0,0,0,0,1
3,255,906425045,RL,82.0,14235.0,AllPub,1900,1993,0.0,2,...,1,0,0,0,0,0,0,0,0,1
4,2827,908186070,RM,35.0,3675.0,AllPub,2005,2006,82.0,2,...,1,0,0,0,0,0,1,0,0,0


In [248]:
test = pd.read_csv('../data/test_preprocessed.csv')
test.head()

Unnamed: 0,Id,parcel_id,zoning_type,lot_frontage,lot_area,util_avail,year_built,year_remodeled,masonry_veneer_area,exterior_quality,...,misc_features_nan,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_VWD,sale_type_WD
0,2658,902301120,RM,69.0,9142.0,AllPub,1910,1950,0.0,2,...,1,0,0,0,0,0,0,0,0,1
1,2718,905108090,RL,,9662.0,AllPub,1977,1977,0.0,2,...,1,0,0,0,0,0,0,0,0,1
2,2414,528218130,RL,58.0,17104.0,AllPub,2006,2006,0.0,3,...,1,0,0,0,0,0,1,0,0,0
3,1989,902207150,RM,60.0,8520.0,AllPub,1923,2006,0.0,3,...,1,0,0,0,0,0,0,0,0,1
4,625,535105100,RL,,9500.0,AllPub,1963,1963,247.0,2,...,1,0,0,0,0,0,0,0,0,1


----
### *Interaction Features*

In [249]:
print(train.shape)
print(test.shape)

(1272, 251)
(878, 251)


In [250]:
train.corr()[['SalePrice']].sort_values(by = 'SalePrice', ascending=False)

Unnamed: 0,SalePrice
SalePrice,1.000000
exterior_quality,0.688407
year_built,0.667750
basement_height,0.663840
above_ground_living_area,0.649481
...,...
foundation_type_Slab,
basement_finished_rating_nan,
basement_finished_rating2_nan,
garage_finished_nan,


In [251]:
train[['garage_yr_built','year_built']].corr()

Unnamed: 0,garage_yr_built,year_built
garage_yr_built,1.0,0.838711
year_built,0.838711,1.0


In [252]:
train[['SalePrice','finished_area','finished_area2','unfinished_area','total_basement_area','garage_yr_built','year_built']].corr().sort_values(by = 'SalePrice', ascending = False)

Unnamed: 0,SalePrice,finished_area,finished_area2,unfinished_area,total_basement_area,garage_yr_built,year_built
SalePrice,1.0,0.279768,-0.039258,0.18702,0.559873,0.608534,0.66775
year_built,0.66775,0.19564,-0.02944,0.169534,0.441272,0.838711,1.0
garage_yr_built,0.608534,0.12504,-0.044423,0.18322,0.367779,1.0,0.838711
total_basement_area,0.559873,0.345926,0.052202,0.429738,1.0,0.367779,0.441272
finished_area,0.279768,1.0,-0.056388,-0.627205,0.345926,0.12504,0.19564
unfinished_area,0.18702,-0.627205,-0.272996,1.0,0.429738,0.18322,0.169534
finished_area2,-0.039258,-0.056388,1.0,-0.272996,0.052202,-0.044423,-0.02944


In [253]:
features = [col for col in test._get_numeric_data().columns if col != 'SalePrice' and test[col].isnull().sum() == 0]

In [254]:
train['finished_space'] = train['finished_area'] * train['finished_area2']
test['finished_space'] = test['finished_area'] * test['finished_area2']

train['basement_space'] = train['unfinished_area'] * train['total_basement_area']
test['basement_space'] = test['unfinished_area'] * test['total_basement_area']

train['garage_yr_built_comb'] = train['garage_yr_built'] * train['year_built']
test['garage_yr_built_comb'] = test['garage_yr_built'] * test['year_built']

In [255]:
train[['SalePrice','finished_space','basement_space','garage_yr_built_comb']].corr().sort_values(by = 'SalePrice', ascending = False)

Unnamed: 0,SalePrice,finished_space,basement_space,garage_yr_built_comb
SalePrice,1.0,0.004733,0.315218,0.669314
garage_yr_built_comb,0.669314,-0.022631,0.293694,1.0
basement_space,0.315218,-0.197421,1.0,0.293694
finished_space,0.004733,1.0,-0.197421,-0.022631


## Model 6: Test with interaction features added

In [256]:
features = [col for col in test._get_numeric_data().columns if col != 'SalePrice' and test[col].isnull().sum() == 0 and col not in ['zoning_type_I (all)',
            'util_avail_NoSewr', 'ames_neighborhood_BrDale', 'prox_to_transport_2_PosA', 'floors_2.5Fin', 'roof_material_Metal', 'roof_material_Roll',
            'roof_material_WdShngl', 'ext_covering_AsphShn', 'ext_covering_PreCast', 'ext_covering2_Other', 'ext_covering2_PreCast', 'masonry_veneer_type_CBlock',
            'masonry_veneer_type_nan', 'foundation_type_Slab', 'basement_finished_rating_nan', 'basement_finished_rating2_nan', 'heating_type_GasA',
            'full_bathrooms_above_ground_4', 'kitchens_above_ground_1', 'garage_finished_nan', 'garage_car_capacity_1', 'sale_type_VWD', 'basement_space','finished_space']]      

In [257]:
X = train[features]
y = train['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

print(f'Training Shape: {X_train.shape, y_train.shape}')
print(f'Test Shape: {X_test.shape, y_test.shape}')

lr = LinearRegression()
lr.fit(X_train, y_train)

#Training Score
print(f'Training R2 Score: {lr.score(X_train, y_train)}')
#Test Score
print(f'Test R2 Score: {lr.score(X_test, y_test)}')

Training Shape: ((954, 218), (954,))
Test Shape: ((318, 218), (318,))
Training R2 Score: 0.9435027030014257
Test R2 Score: 0.894808347553714


##### *Improvement from model 5 which produced below results:*
- Training Score: 0.8898394072586665
- Test Score: 0.8859250764243847


In [258]:
#cross-validation score
cross_val_score(lr, X_train, y_train).mean()

0.8841266744097138

In [259]:
#adjusted R-Squared Training data
r2 = lr.score(X_train, y_train)
print(f'Adjusted R-Squared for training: {1 - (((1-r2) * (len(y-1))) / (len(y) - len(features) -1))}')

#adjusted R-Squared Test data
r2 = lr.score(X_test, y_test)
print(f'Adjusted R-Squared for test: {1 - (((1-r2) * (len(y-1))) / (len(y) - len(features) -1))}')

#Root Mean Squared Error
predictions = lr.predict(X_test)

rmse = np.sqrt(metrics.mean_squared_error(y_test, predictions))
print(f'Root Mean Squared Error: {rmse}')

Adjusted R-Squared for training: 0.9317525529134031
Adjusted R-Squared for test: 0.8729308813754266
Root Mean Squared Error: 16948.955730431226


In [260]:
# Cross Val Score to evaluate model
cross_val_score(lr, X_train, y_train).mean()

0.8841266744097138

In [261]:
# Make predictions on test data
preds = lr.predict(test[features])

preds.shape

(878,)

In [262]:
# add sale price column to test data set for kaggle submission
test['SalePrice'] = preds

# Create Data Frame with only Id and SalePrice
interaction_features = test[['Id', 'SalePrice']]

# set index for new DF
interaction_features.set_index('Id', inplace=True)
interaction_features.head()

# save submission
interaction_features.to_csv('../data/interaction_features.csv')

In [263]:
interaction_features.shape

(878, 1)

------
### *Combining Features*

In [264]:
# combine bathrooms and drop original columns
train['total_bathrooms'] = train['full_bathrooms_basement'] + train['half_bathrooms_basement'] + train['full_bathrooms_above_ground'] + train['half_bathrooms_above_ground']
test['total_bathrooms'] = test['full_bathrooms_basement'] + test['half_bathrooms_basement'] + test['full_bathrooms_above_ground'] + test['half_bathrooms_above_ground']

train.drop(columns = ['full_bathrooms_basement','half_bathrooms_basement','full_bathrooms_above_ground','half_bathrooms_above_ground'], inplace = True)
test.drop(columns = ['full_bathrooms_basement','half_bathrooms_basement','full_bathrooms_above_ground','half_bathrooms_above_ground'], inplace = True)

# Combine living area
train['total_living_area'] = train['finished_area'] + train['finished_area2'] + train['above_ground_living_area']
test['total_living_area'] = test['finished_area'] + test['finished_area2'] + test['above_ground_living_area']

train.drop(columns = ['finished_area', 'finished_area2', 'above_ground_living_area'], inplace=True)
test.drop(columns = ['finished_area', 'finished_area2', 'above_ground_living_area'], inplace = True)

print(train[['total_bathrooms','total_living_area']].head())

   total_bathrooms  total_living_area
0                4             2759.0
1                2             1788.0
2                3             1444.0
3                2             1445.0
4                3             1619.0


In [265]:
print(train.shape)
print(test.shape)

(1272, 249)
(878, 249)


## Model 7: Test with combined features

In [266]:
features = [col for col in test._get_numeric_data().columns if col != 'SalePrice' and test[col].isnull().sum() == 0 and col not in ['zoning_type_I (all)',
            'util_avail_NoSewr', 'ames_neighborhood_BrDale', 'prox_to_transport_2_PosA', 'floors_2.5Fin', 'roof_material_Metal', 'roof_material_Roll',
            'roof_material_WdShngl', 'ext_covering_AsphShn', 'ext_covering_PreCast', 'ext_covering2_Other', 'ext_covering2_PreCast', 'masonry_veneer_type_CBlock',
            'masonry_veneer_type_nan', 'foundation_type_Slab', 'basement_finished_rating_nan', 'basement_finished_rating2_nan', 'heating_type_GasA',
            'full_bathrooms_above_ground_4', 'kitchens_above_ground_1', 'garage_finished_nan', 'garage_car_capacity_1', 'sale_type_VWD', 'basement_space','finished_space']]      

In [267]:
X = train[features]
y = train['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

print(f'Training Shape: {X_train.shape, y_train.shape}')
print(f'Test Shape: {X_test.shape, y_test.shape}')

lr = LinearRegression()
lr.fit(X_train, y_train)

#Training Score
print(f'Training R2 Score: {lr.score(X_train, y_train)}')
#Test Score
print(f'Test R2 Score: {lr.score(X_test, y_test)}')

Training Shape: ((954, 213), (954,))
Test Shape: ((318, 213), (318,))
Training R2 Score: 0.9430456103366734
Test R2 Score: 0.8971282915143795


In [268]:
#cross validation score
cross_val_score(lr, X_train, y_train).mean()

0.8854282160088804

In [269]:
#adjusted R-Squared Training data
r2 = lr.score(X_train, y_train)
print(f'Adjusted R-Squared for training: {1 - (((1-r2) * (len(y-1))) / (len(y) - len(features) -1))}')

#adjusted R-Squared Test data
r2 = lr.score(X_test, y_test)
print(f'Adjusted R-Squared for test: {1 - (((1-r2) * (len(y-1))) / (len(y) - len(features) -1))}')

#Root Mean Squared Error
predictions = lr.predict(X_test)

rmse = np.sqrt(metrics.mean_squared_error(y_test, predictions))
print(f'Root Mean Squared Error: {rmse}')

Adjusted R-Squared for training: 0.9315255353008021
Adjusted R-Squared for test: 0.8763205924445091
Root Mean Squared Error: 16761.01377200922


Combining features improved the (bathrooms and finished living area) improved the test R-squared score slightly. 

## **Regularization**

#### Model 8: Ridge Regression

In [270]:
features = [col for col in test._get_numeric_data().columns if col != 'SalePrice' and test[col].isnull().sum() == 0 and col not in ['zoning_type_I (all)',
            'util_avail_NoSewr', 'ames_neighborhood_BrDale', 'prox_to_transport_2_PosA', 'floors_2.5Fin', 'roof_material_Metal', 'roof_material_Roll',
            'roof_material_WdShngl', 'ext_covering_AsphShn', 'ext_covering_PreCast', 'ext_covering2_Other', 'ext_covering2_PreCast', 'masonry_veneer_type_CBlock',
            'masonry_veneer_type_nan', 'foundation_type_Slab', 'basement_finished_rating_nan', 'basement_finished_rating2_nan', 'heating_type_GasA',
            'full_bathrooms_above_ground_4', 'kitchens_above_ground_1', 'garage_finished_nan', 'garage_car_capacity_1', 'sale_type_VWD', 'basement_space','finished_space']]      

X = train[features]
y = train['SalePrice']

#Create train/test splits
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [271]:
# Scale the data
sc = StandardScaler()
Z_train = sc.fit_transform(X_train)
Z_test = sc.transform(X_test)

print(f'Z_train shape is: {Z_train.shape}')
print(f'y_train shape is: {y_train.shape}')
print(f'Z_test shape is: {Z_test.shape}')
print(f'y_test shape is: {y_test.shape}')

Z_train shape is: (954, 213)
y_train shape is: (954,)
Z_test shape is: (318, 213)
y_test shape is: (318,)


In [272]:
ridge = Ridge()
ridge.fit(Z_train, y_train)

print(f'Training score: {ridge.score(Z_train, y_train)}')
print(f'Test score: {ridge.score(Z_test, y_test)}')

Training score: 0.9430139356846472
Test score: 0.8979600180557962


In [273]:
# This code was adapted from Lesson 4.03 on Regularization
# fitting model using best ridge alpha
alphas = np.logspace(0, 5, 100)

ridge_cv = RidgeCV(alphas = alphas, cv = 5)
ridge_cv.fit(Z_train, y_train)

ridge_cv.alpha_

117.68119524349979

In [274]:
print(f'Best Ridge Training score: {ridge_cv.score(Z_train, y_train)}')
print(f'Best Ridge Test score: {ridge_cv.score(Z_test, y_test)}')

Best Ridge Training score: 0.9374380281415059
Best Ridge Test score: 0.9075702377341283


In [275]:
# add sale price prediction column to test df for kaggle submission
ss = StandardScaler()

preds = ridge_cv.predict(ss.fit_transform(test[features]))
test['SalePrice'] = preds

ridge_df = test[['Id', 'SalePrice']]
ridge_df.set_index('Id', inplace = True)

ridge_df.to_csv('../data/ridge_model.csv')
ridge_df.shape

(878, 1)

#### Lasso Regression

In [276]:
#Lasso
# This code was adapted from Lesson 4.03 on Regularization
l_alphas = np.logspace(-3, 0, 100)

lasso_cv = LassoCV(alphas = l_alphas)

lasso_cv.fit(Z_train, y_train)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

LassoCV(alphas=array([0.001     , 0.00107227, 0.00114976, 0.00123285, 0.00132194,
       0.00141747, 0.00151991, 0.00162975, 0.00174753, 0.00187382,
       0.00200923, 0.00215443, 0.00231013, 0.00247708, 0.00265609,
       0.00284804, 0.00305386, 0.00327455, 0.00351119, 0.00376494,
       0.00403702, 0.00432876, 0.00464159, 0.00497702, 0.0053367 ,
       0.00572237, 0.00613591, 0.00657933, 0.0070548 , 0.00756463,
       0.008...
       0.09326033, 0.1       , 0.10722672, 0.1149757 , 0.12328467,
       0.13219411, 0.14174742, 0.15199111, 0.16297508, 0.17475284,
       0.18738174, 0.2009233 , 0.21544347, 0.23101297, 0.24770764,
       0.26560878, 0.28480359, 0.30538555, 0.32745492, 0.35111917,
       0.37649358, 0.40370173, 0.43287613, 0.46415888, 0.49770236,
       0.53366992, 0.57223677, 0.61359073, 0.65793322, 0.70548023,
       0.75646333, 0.81113083, 0.869749  , 0.93260335, 1.        ]))

In [277]:
print(f'The optimal alpha for the Lasso model: {lasso_cv.alpha_}')

print(f'Lasso training score: {lasso_cv.score(Z_train, y_train)}')
print(f'Lasso test score: {lasso_cv.score(Z_test, y_test)}')

The optimal alpha for the Lasso model: 1.0
Lasso training score: 0.9430424619668215
Lasso test score: 0.8974203876064019


Lasso regression did not improve the test score. However, it provides important clues on feature selection.

In [278]:
lasso_list = list(zip(lasso_cv.coef_, features))

In [279]:
new_features = [lasso_list[n][1] for n in range(0, len(lasso_list)) if lasso_list[n][0] != 0]

In [280]:
new_features[:10]

['Id',
 'parcel_id',
 'lot_area',
 'year_built',
 'year_remodeled',
 'exterior_quality',
 'exterior_condition',
 'unfinished_area',
 'total_basement_area',
 'heating_quality']

----
## **Model 9:** Using features selected by Lasso Regression

In [281]:
X = train[new_features]
y = train['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [282]:
lr = LinearRegression()
lr.fit(X_train, y_train)

print(f'Model 7 Training Score: {lr.score(X_train, y_train)}')
print(f'Model 7 Test Score: {lr.score(X_test, y_test)}')      

Model 7 Training Score: 0.9430453071988046
Model 7 Test Score: 0.8971211699254816


Did not improve the model score. 

-----
### Write the feature-engineered DFs to a csv file for use in next notebook

In [283]:
train.to_csv('../data/train_engineered.csv', index=False)
test.to_csv('../data/test_engineered.csv', index=False)
print(train.shape)
print(test.shape)

(1272, 249)
(878, 249)
