# **Feature Engineering**

### **Imports**
---

In [204]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNetCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn import metrics

### **Load the data**
-----

In [90]:
train = pd.read_csv('../data/train_preprocessed.csv')
train.head()

Unnamed: 0,Id,parcel_id,zoning_type,lot_frontage,lot_area,util_avail,year_built,year_remodeled,masonry_veneer_area,exterior_quality,...,misc_features_Shed,misc_features_nan,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD
0,544,531379050,RL,43.0,11492.0,AllPub,1996,1997,132.0,3,...,0,1,0,0,0,0,0,0,0,1
1,153,535304180,RL,68.0,7922.0,AllPub,1953,2007,0.0,2,...,0,1,0,0,0,0,0,0,0,1
2,318,916386060,RL,73.0,9802.0,AllPub,2006,2007,0.0,2,...,0,1,0,0,0,0,0,0,0,1
3,255,906425045,RL,82.0,14235.0,AllPub,1900,1993,0.0,2,...,0,1,0,0,0,0,0,0,0,1
4,2827,908186070,RM,35.0,3675.0,AllPub,2005,2006,82.0,2,...,0,1,0,0,0,0,0,1,0,0


In [91]:
test = pd.read_csv('../data/test_preprocessed.csv')
test.head()

Unnamed: 0,Id,parcel_id,zoning_type,lot_frontage,lot_area,util_avail,year_built,year_remodeled,masonry_veneer_area,exterior_quality,...,misc_features_nan,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_VWD,sale_type_WD
0,2658,902301120,RM,69.0,9142.0,AllPub,1910,1950,0.0,2,...,1,0,0,0,0,0,0,0,0,1
1,2718,905108090,RL,,9662.0,AllPub,1977,1977,0.0,2,...,1,0,0,0,0,0,0,0,0,1
2,2414,528218130,RL,58.0,17104.0,AllPub,2006,2006,0.0,3,...,1,0,0,0,0,0,1,0,0,0
3,1989,902207150,RM,60.0,8520.0,AllPub,1923,2006,0.0,3,...,1,0,0,0,0,0,0,0,0,1
4,625,535105100,RL,,9500.0,AllPub,1963,1963,247.0,2,...,1,0,0,0,0,0,0,0,0,1


----
### *Interaction Features*

In [92]:
train.corr()[['SalePrice']].sort_values(by = 'SalePrice', ascending=False)

Unnamed: 0,SalePrice
SalePrice,1.000000
exterior_quality,0.688407
year_built,0.667750
basement_height,0.663840
above_ground_living_area,0.649481
...,...
foundation_type_CBlock,-0.388697
masonry_veneer_type_None,-0.401485
house_quality_5,-0.429538
garage_location_Detchd,-0.508495


In [93]:
train[['garage_yr_built','year_built']].corr()

Unnamed: 0,garage_yr_built,year_built
garage_yr_built,1.0,0.838711
year_built,0.838711,1.0


In [95]:
train[['SalePrice','finished_area','finished_area2','unfinished_area','total_basement_area','garage_yr_built','year_built']].corr().sort_values(by = 'SalePrice', ascending = False)

Unnamed: 0,SalePrice,finished_area,finished_area2,unfinished_area,total_basement_area,garage_yr_built,year_built
SalePrice,1.0,0.279768,-0.039258,0.18702,0.559873,0.608534,0.66775
year_built,0.66775,0.19564,-0.02944,0.169534,0.441272,0.838711,1.0
garage_yr_built,0.608534,0.12504,-0.044423,0.18322,0.367779,1.0,0.838711
total_basement_area,0.559873,0.345926,0.052202,0.429738,1.0,0.367779,0.441272
finished_area,0.279768,1.0,-0.056388,-0.627205,0.345926,0.12504,0.19564
unfinished_area,0.18702,-0.627205,-0.272996,1.0,0.429738,0.18322,0.169534
finished_area2,-0.039258,-0.056388,1.0,-0.272996,0.052202,-0.044423,-0.02944


In [96]:
features = [col for col in test._get_numeric_data().columns if col != 'SalePrice' and test[col].isnull().sum() == 0]

In [97]:
train['finished_space'] = train['finished_area'] * train['finished_area2']
test['finished_space'] = test['finished_area'] * test['finished_area2']

train['basement_space'] = train['unfinished_area'] * train['total_basement_area']
test['basement_space'] = test['unfinished_area'] * test['total_basement_area']

train['garage_yr_built_comb'] = train['garage_yr_built'] * train['year_built']
test['garage_yr_built_comb'] = test['garage_yr_built'] * test['year_built']

In [98]:
train[['SalePrice','finished_space','basement_space','garage_yr_built_comb']].corr().sort_values(by = 'SalePrice', ascending = False)

Unnamed: 0,SalePrice,finished_space,basement_space,garage_yr_built_comb
SalePrice,1.0,0.004733,0.315218,0.669314
garage_yr_built_comb,0.669314,-0.022631,0.293694,1.0
basement_space,0.315218,-0.197421,1.0,0.293694
finished_space,0.004733,1.0,-0.197421,-0.022631


## Model Test for interaction features

In [99]:
features = [col for col in test._get_numeric_data().columns if col != 'SalePrice' and test[col].isnull().sum() == 0 and col not in ['zoning_type_I (all)',
            'util_avail_NoSewr', 'ames_neighborhood_BrDale', 'prox_to_transport_2_PosA', 'floors_2.5Fin', 'roof_material_Metal', 'roof_material_Roll',
            'roof_material_WdShngl', 'ext_covering_AsphShn', 'ext_covering_PreCast', 'ext_covering2_Other', 'ext_covering2_PreCast', 'masonry_veneer_type_CBlock',
            'masonry_veneer_type_nan', 'foundation_type_Slab', 'basement_finished_rating_nan', 'basement_finished_rating2_nan', 'heating_type_GasA',
            'full_bathrooms_above_ground_4', 'kitchens_above_ground_1', 'garage_finished_nan', 'garage_car_capacity_1', 'sale_type_VWD', 'basement_space','finished_space']]      

In [100]:
X = train[features]
y = train['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

print(f'Training Shape: {X_train.shape, y_train.shape}')
print(f'Test Shape: {X_test.shape, y_test.shape}')

lr = LinearRegression()
lr.fit(X_train, y_train)

#Training Score
print(f'Training R2 Score: {lr.score(X_train, y_train)}')
#Test Score
print(f'Test R2 Score: {lr.score(X_test, y_test)}')

Training Shape: ((954, 208), (954,))
Test Shape: ((318, 208), (318,))
Training R2 Score: 0.9430535788578619
Test R2 Score: 0.8970932525945663


In [101]:
#adjusted R-Squared Training data
r2 = lr.score(X_train, y_train)
print(f'Adjusted R-Squared for training: {1 - (((1-r2) * (len(y-1))) / (len(y) - len(features) -1))}')

#adjusted R-Squared Test data
r2 = lr.score(X_test, y_test)
print(f'Adjusted R-Squared for test: {1 - (((1-r2) * (len(y-1))) / (len(y) - len(features) -1))}')

#Root Mean Squared Error
predictions = lr.predict(X_train)

rmse = np.sqrt(metrics.mean_squared_error(y_train, predictions))
print(f'Root Mean Squared Error: {rmse}')

Adjusted R-Squared for training: 0.931857151747131
Adjusted R-Squared for test: 0.8768604113831499
Root Mean Squared Error: 12471.515689185038


In [102]:
# Make predictions on test data
preds = lr.predict(test[features])

preds.shape

(878,)

In [103]:
# Cross Val Score to evaluate model
cross_val_score(lr, X_train, y_train).mean()

0.8847741010207659

In [104]:
# add sale price column to test data set based on model
test['SalePrice'] = preds

# Create Data Frame with only Id and SalePrice
interaction_features = test[['Id', 'SalePrice']]

# set index for new DF
interaction_features.set_index('Id', inplace=True)
interaction_features.head()

# save submission
interaction_features.to_csv('../data/interaction_features.csv')

##### *Lack of improvement from adding the interaction features chosen above*

In [105]:
train.drop(columns = ['finished_space','half_bathrooms_basement','garage_yr_built_comb'], inplace = False)
test.drop(columns = ['finished_space','half_bathrooms_basement','garage_yr_built_comb'], inplace = False)

Unnamed: 0,Id,parcel_id,zoning_type,lot_frontage,lot_area,util_avail,year_built,year_remodeled,masonry_veneer_area,exterior_quality,...,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_VWD,sale_type_WD,basement_space
0,2658,902301120,RM,69.0,9142.0,AllPub,1910,1950,0.0,2,...,0,0,0,0,0,0,0,0,1,1040400
1,2718,905108090,RL,,9662.0,AllPub,1977,1977,0.0,2,...,0,0,0,0,0,0,0,0,1,3869089
2,2414,528218130,RL,58.0,17104.0,AllPub,2006,2006,0.0,3,...,0,0,0,0,0,1,0,0,0,65400
3,1989,902207150,RM,60.0,8520.0,AllPub,1923,2006,0.0,3,...,0,0,0,0,0,0,0,0,1,937024
4,625,535105100,RL,,9500.0,AllPub,1963,1963,247.0,2,...,0,0,0,0,0,0,0,0,1,1094290
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
873,1662,527377110,RL,80.0,8000.0,AllPub,1974,1974,0.0,2,...,0,0,0,0,0,0,0,0,1,0
874,1234,535126140,RL,90.0,14670.0,AllPub,1966,1999,410.0,3,...,0,0,0,0,0,0,0,0,1,584016
875,1373,904100040,RL,55.0,8250.0,AllPub,1968,1968,0.0,2,...,0,0,0,0,0,0,0,0,1,199920
876,1672,527425140,RL,60.0,9000.0,AllPub,1971,1971,0.0,2,...,0,0,0,0,0,0,0,0,1,214272


------
### *Combining Features*

In [106]:
# combine bathrooms
train['total_bathrooms'] = train['full_bathrooms_basement'] + train['half_bathrooms_basement'] + train['full_bathrooms_above_ground'] + train['half_bathrooms_above_ground']
test['total_bathrooms'] = test['full_bathrooms_basement'] + test['half_bathrooms_basement'] + test['full_bathrooms_above_ground'] + test['half_bathrooms_above_ground']


# Combine living area
train['total_living_area'] = train['finished_area'] + train['finished_area2'] + train['above_ground_living_area']
test['total_living_area'] = test['finished_area'] + test['finished_area2'] + test['above_ground_living_area']


print(train[['total_bathrooms','total_living_area']].head())

   total_bathrooms  total_living_area
0                4             2759.0
1                2             1788.0
2                3             1444.0
3                2             1445.0
4                3             1619.0


Combining features (bathrooms and finished living area) did not improve the model.

In [107]:
train.drop(columns = ['full_bathrooms_basement','basement_space','full_bathrooms_above_ground','half_bathrooms_above_ground'], inplace = False )
test.drop(columns = ['full_bathrooms_basement','half_bathrooms_basement','full_bathrooms_above_ground','half_bathrooms_above_ground'], inplace = False)

Unnamed: 0,Id,parcel_id,zoning_type,lot_frontage,lot_area,util_avail,year_built,year_remodeled,masonry_veneer_area,exterior_quality,...,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_VWD,sale_type_WD,finished_space,basement_space,garage_yr_built_comb,total_bathrooms,total_living_area
0,2658,902301120,RM,69.0,9142.0,AllPub,1910,1950,0.0,2,...,0,0,0,0,1,0,1040400,3648100.0,2,1928.0
1,2718,905108090,RL,,9662.0,AllPub,1977,1977,0.0,2,...,0,0,0,0,1,0,3869089,3908529.0,2,1967.0
2,2414,528218130,RL,58.0,17104.0,AllPub,2006,2006,0.0,3,...,0,1,0,0,0,0,65400,4024036.0,4,2050.0
3,1989,902207150,RM,60.0,8520.0,AllPub,1923,2006,0.0,3,...,0,0,0,0,1,0,937024,3721005.0,1,968.0
4,625,535105100,RL,,9500.0,AllPub,1963,1963,247.0,2,...,0,0,0,0,1,0,1094290,3853369.0,3,2003.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
873,1662,527377110,RL,80.0,8000.0,AllPub,1974,1974,0.0,2,...,0,0,0,0,1,142443,0,3896676.0,4,2961.0
874,1234,535126140,RL,90.0,14670.0,AllPub,1966,1999,410.0,3,...,0,0,0,0,1,0,584016,3865156.0,3,2563.0
875,1373,904100040,RL,55.0,8250.0,AllPub,1968,1968,0.0,2,...,0,0,0,0,1,123000,199920,3873024.0,1,1953.0
876,1672,527425140,RL,60.0,9000.0,AllPub,1971,1971,0.0,2,...,0,0,0,0,1,0,214272,3890754.0,1,1480.0


## **Regularization**

#### Ridge Regression

In [112]:
features = [col for col in test._get_numeric_data().columns if col != 'SalePrice' and test[col].isnull().sum() == 0 and col not in ['zoning_type_I (all)',
            'util_avail_NoSewr', 'ames_neighborhood_BrDale', 'prox_to_transport_2_PosA', 'floors_2.5Fin', 'roof_material_Metal', 'roof_material_Roll',
            'roof_material_WdShngl', 'ext_covering_AsphShn', 'ext_covering_PreCast', 'ext_covering2_Other', 'ext_covering2_PreCast', 'masonry_veneer_type_CBlock',
            'masonry_veneer_type_nan', 'foundation_type_Slab', 'basement_finished_rating_nan', 'basement_finished_rating2_nan', 'heating_type_GasA',
            'full_bathrooms_above_ground_4', 'kitchens_above_ground_1', 'garage_finished_nan', 'garage_car_capacity_1', 'sale_type_VWD', 'basement_space','finished_space']]      

X = train[features]
y = train['SalePrice']

#Create train/test splits
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=14)

In [113]:
# Scale the data
sc = StandardScaler()
Z_train = sc.fit_transform(X_train)
Z_test = sc.transform(X_test)

print(f'Z_train shape is: {Z_train.shape}')
print(f'y_train shape is: {y_train.shape}')
print(f'Z_test shape is: {Z_test.shape}')
print(f'y_test shape is: {y_test.shape}')

Z_train shape is: (954, 210)
y_train shape is: (954,)
Z_test shape is: (318, 210)
y_test shape is: (318,)


In [115]:
ridge = Ridge()
ridge.fit(Z_train, y_train)

print(f'Training score: {ridge.score(Z_train, y_train)}')
print(f'Test score: {ridge.score(Z_test, y_test)}')

Training score: 0.9460345095442223
Test score: 0.8803905038332118


In [116]:
# This code was adapted from Lesson 4.03 on Regularization
#fitting model using best ridge alpha
alphas = np.logspace(0, 3, 100)

ridge_cv = RidgeCV(alphas = alphas, cv = 5)
ridge_cv.fit(Z_train, y_train)

ridge_cv.alpha_

162.9750834620645

In [120]:
print(f'Ridge Training score: {ridge_cv.score(Z_train, y_train)}')
print(f'Ridge Test score: {ridge_cv.score(Z_test, y_test)}')

Ridge Training score: 0.9397929596272039
Ridge Test score: 0.8905740993416914


Ridge did not improve the test R-Squared score relative to the best linear regression model above.

#### Lasso Regression

In [119]:
#Lasso
# This code was adapted from Lesson 4.03 on Regularization
l_alphas = np.logspace(-3, 0, 100)

lasso_cv = LassoCV(alphas = l_alphas)

lasso_cv.fit(Z_train, y_train)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

LassoCV(alphas=array([0.001     , 0.00107227, 0.00114976, 0.00123285, 0.00132194,
       0.00141747, 0.00151991, 0.00162975, 0.00174753, 0.00187382,
       0.00200923, 0.00215443, 0.00231013, 0.00247708, 0.00265609,
       0.00284804, 0.00305386, 0.00327455, 0.00351119, 0.00376494,
       0.00403702, 0.00432876, 0.00464159, 0.00497702, 0.0053367 ,
       0.00572237, 0.00613591, 0.00657933, 0.0070548 , 0.00756463,
       0.008...
       0.09326033, 0.1       , 0.10722672, 0.1149757 , 0.12328467,
       0.13219411, 0.14174742, 0.15199111, 0.16297508, 0.17475284,
       0.18738174, 0.2009233 , 0.21544347, 0.23101297, 0.24770764,
       0.26560878, 0.28480359, 0.30538555, 0.32745492, 0.35111917,
       0.37649358, 0.40370173, 0.43287613, 0.46415888, 0.49770236,
       0.53366992, 0.57223677, 0.61359073, 0.65793322, 0.70548023,
       0.75646333, 0.81113083, 0.869749  , 0.93260335, 1.        ]))

In [121]:
print(f'The optimal alpha for the Lasso model: {lasso_cv.alpha_}')

print(f'Lasso training score: {lasso_cv.score(Z_train, y_train)}')
print(f'Lasso test score: {lasso_cv.score(Z_test, y_test)}')

The optimal alpha for the Lasso model: 1.0
Lasso training score: 0.9461313079481675
Lasso test score: 0.8795267055289492


Lasso regression did not improve the test score either. However, it provides important clues on feature selection.

In [185]:
lasso_list = list(zip(lasso_cv.coef_, features))

In [186]:
new_features = [lasso_list[n][1] for n in range(0, len(lasso_list)) if lasso_list[n][0] != 0]

In [190]:
new_features[:10]

['Id',
 'parcel_id',
 'lot_area',
 'year_built',
 'year_remodeled',
 'exterior_quality',
 'exterior_condition',
 'finished_area',
 'finished_area2',
 'unfinished_area']

#### Elastic Net Regression

In [205]:
enet_alphas = np.linspace(0.5, 1.0, 100)

enet_ratio = 0.5

# produces optimal alpha
enet_model_alpha = ElasticNetCV(alphas=enet_alphas, l1_ratio=enet_ratio, cv=5)


enet_model = enet_model_alpha.fit(X_train, y_train)


print(f'Elastic Net training score: {enet_model.score(X_train, y_train)}')
print(f'Elastic Net test score: {enet_model.score(X_test, y_test)}')

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

Elastic Net training score: 0.881465643108751
Elastic Net test score: 0.8805181135597636


  model = cd_fast.enet_coordinate_descent(


In [207]:
# make some predictions
predictions = enet_model_alpha.predict(X_test)
predictions[:5]

array([128613.97989678, 121362.68335753,  95214.54525206, 219438.25151549,
        84076.48189741])

# **Model Using Elastic Net**

----
## **Model 7:** Using features selected by Lasso Regression

In [191]:
X = train[new_features]
y = train['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [193]:
lr = LinearRegression()
lr.fit(X_train, y_train)

print(f'Model 7 Training Score: {lr.score(X_train, y_train)}')
print(f'Model 7 Test Score: {lr.score(X_test, y_test)}')      

Model 7 Training Score: 0.94243676949847
Model 7 Test Score: 0.896778003964189


In [197]:
# test on kaggle
preds = lr.predict(test[new_features])
test['SalePrice'] = preds

lasso_features = test[['Id', 'SalePrice']]
lasso_features.set_index('Id', inplace=True)
lasso_features.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2658,133861.355638
2718,153641.10437
2414,203879.51926
1989,111490.411915
625,169432.904872


In [198]:
lasso_features.to_csv('../data/lasso_features.csv')

In [203]:
train.corr()[['SalePrice']].sort_values(by = 'SalePrice', ascending = False)

Unnamed: 0,SalePrice
SalePrice,1.000000
exterior_quality,0.688407
garage_yr_built_comb,0.669314
year_built,0.667750
basement_height,0.663840
...,...
foundation_type_CBlock,-0.388697
masonry_veneer_type_None,-0.401485
house_quality_5,-0.429538
garage_location_Detchd,-0.508495
