# **Preprocessing and Feature Engineering**

### **Imports**
---

In [277]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn import metrics

### **Load the data**
-----

In [275]:
train = pd.read_csv('../data/train_cleaned.csv')
train.head()

Unnamed: 0,Id,parcel_id,dwelling_type,zoning_type,lot_frontage,lot_area,street_surface,alley_surface,property_shape,property_flatness,...,screen_porch_size,pool_size,pool_quality,fence_quality,misc_features,misc_features_value,month_sold,year_sold,sale_type,SalePrice
0,544,531379050,60,RL,43.0,11492.0,Pave,,IR1,Lvl,...,0.0,0.0,,,,0.0,4,2009,WD,220000.0
1,153,535304180,20,RL,68.0,7922.0,Pave,,Reg,Lvl,...,0.0,0.0,,,,0.0,1,2010,WD,109000.0
2,318,916386060,60,RL,73.0,9802.0,Pave,,Reg,Lvl,...,0.0,0.0,,,,0.0,4,2010,WD,174000.0
3,255,906425045,50,RL,82.0,14235.0,Pave,,IR1,Lvl,...,0.0,0.0,,,,0.0,3,2010,WD,138500.0
4,2827,908186070,180,RM,35.0,3675.0,Pave,,Reg,Lvl,...,0.0,0.0,,,,0.0,6,2006,New,140000.0


In [276]:
test = pd.read_csv('../data/test_cleaned.csv')
test.head()

Unnamed: 0,Id,parcel_id,dwelling_type,zoning_type,lot_frontage,lot_area,street_surface,alley_surface,property_shape,property_flatness,...,three_season_porch_size,screen_porch_size,pool_size,pool_quality,fence_quality,misc_features,misc_features_value,month_sold,year_sold,sale_type
0,2658,902301120,190,RM,69.0,9142.0,Pave,Grvl,Reg,Lvl,...,0.0,0.0,0.0,,,,0.0,4,2006,WD
1,2718,905108090,90,RL,,9662.0,Pave,,IR1,Lvl,...,0.0,0.0,0.0,,,,0.0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104.0,Pave,,IR1,Lvl,...,0.0,0.0,0.0,,,,0.0,9,2006,New
3,1989,902207150,30,RM,60.0,8520.0,Pave,,Reg,Lvl,...,0.0,0.0,0.0,,,,0.0,7,2007,WD
4,625,535105100,20,RL,,9500.0,Pave,,IR1,Lvl,...,0.0,185.0,0.0,,,,0.0,7,2009,WD


----
## **Model Attempt 3:** *Running model with all numerical features*

In [278]:
features = [col for col in test._get_numeric_data().columns if col != 'SalePrice' and test[col].isnull().sum() == 0]

In [279]:
train[features]

Unnamed: 0,Id,parcel_id,dwelling_type,lot_area,house_quality,house_condition,year_built,year_remodeled,finished_area,finished_area2,...,garage_size,wood_deck_size,open_porch_size,enclose_porch_size,three_season_porch_size,screen_porch_size,pool_size,misc_features_value,month_sold,year_sold
0,544,531379050,60,11492.0,7,5,1996,1997,637.0,0.0,...,559.0,0.0,74.0,0.0,0.0,0.0,0.0,0.0,4,2009
1,153,535304180,20,7922.0,5,7,1953,2007,731.0,0.0,...,246.0,0.0,52.0,0.0,0.0,0.0,0.0,0.0,1,2010
2,318,916386060,60,9802.0,5,5,2006,2007,0.0,0.0,...,400.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,4,2010
3,255,906425045,50,14235.0,6,8,1900,1993,0.0,0.0,...,484.0,0.0,59.0,0.0,0.0,0.0,0.0,0.0,3,2010
4,2827,908186070,180,3675.0,6,5,2005,2006,547.0,0.0,...,525.0,0.0,44.0,0.0,0.0,0.0,0.0,0.0,6,2006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1267,1140,531382090,60,8453.0,6,5,1995,1995,362.0,0.0,...,525.0,0.0,70.0,0.0,0.0,0.0,0.0,0.0,4,2008
1268,1587,921126030,20,11449.0,8,5,2007,2007,1011.0,0.0,...,520.0,0.0,276.0,0.0,0.0,0.0,0.0,0.0,1,2008
1269,916,909253010,50,7558.0,6,6,1928,1950,0.0,0.0,...,342.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,2009
1270,639,535179160,20,10400.0,4,5,1956,1956,155.0,750.0,...,294.0,0.0,189.0,140.0,0.0,0.0,0.0,0.0,11,2009


In [280]:
X = train[features]
y = train['SalePrice']

X_train, X_test, y_train, t_test = train_test_split(X, y, random_state=42)

print(f'Training Shape: {X_train.shape, y_train.shape}')
print(f'Test Shape: {X_test.shape, y_test.shape}')

Training Shape: ((954, 35), (954,))
Test Shape: ((318, 35), (318,))


In [281]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [282]:
#Training Score
lr.score(X_train, y_train)

0.8809665491328084

In [283]:
#Test Score
lr.score(X_test, y_test)

0.8731847118523952

In [284]:
# Make predictions on test data
preds = lr.predict(test[features])

preds.shape

(878,)

In [285]:
# Cross Val Score to evaluate model
cross_val_score(lr, X_train, y_train).mean()

0.872599661652767

Solid R-Squared results but could be misleading due to the number of features modeled.

In [286]:
#adjusted R-Squared Training data
r2 = lr.score(X_train, y_train)
1 - (((1-r2) * (len(y-1))) / (len(y) - len(features) -1))

0.87749955541823

In [287]:
#adjusted R-Squared Test data
r2 = lr.score(X_test, y_test)
1 - (((1-r2) * (len(y-1))) / (len(y) - len(features) -1))

0.8694910626830474

In [288]:
#Root Mean Squared Error
predictions = lr.predict(X_train)

rmse = np.sqrt(metrics.mean_squared_error(y_train, predictions))
rmse

18031.02982544714

In [289]:
pd.DataFrame(data = lr.coef_, index=X.columns, columns=['Coefficient'])

Unnamed: 0,Coefficient
Id,-1.715176
parcel_id,6.052376e-07
dwelling_type,-59.15668
lot_area,1.22632
house_quality,13063.61
house_condition,4340.688
year_built,357.3633
year_remodeled,225.2188
finished_area,18.93919
finished_area2,4.655726


In [270]:
# add sale price column to test data set based on model
test['SalePrice'] = preds
test.head()

Unnamed: 0,Id,parcel_id,dwelling_type,zoning_type,lot_frontage,lot_area,street_surface,alley_surface,property_shape,property_flatness,...,screen_porch_size,pool_size,pool_quality,fence_quality,misc_features,misc_features_value,month_sold,year_sold,sale_type,SalePrice
0,2658,902301120,190,RM,69.0,9142.0,Pave,Grvl,Reg,Lvl,...,0.0,0.0,,,,0.0,4,2006,WD,132073.127128
1,2718,905108090,90,RL,,9662.0,Pave,,IR1,Lvl,...,0.0,0.0,,,,0.0,8,2006,WD,147911.981694
2,2414,528218130,60,RL,58.0,17104.0,Pave,,IR1,Lvl,...,0.0,0.0,,,,0.0,9,2006,New,211354.736957
3,1989,902207150,30,RM,60.0,8520.0,Pave,,Reg,Lvl,...,0.0,0.0,,,,0.0,7,2007,WD,117175.980949
4,625,535105100,20,RL,,9500.0,Pave,,IR1,Lvl,...,185.0,0.0,,,,0.0,7,2009,WD,183573.600866


In [271]:
# Create Data Frame with only Id and SalePrice
all_numerical_submission = test[['Id', 'SalePrice']]
all_numerical_submission.shape

(878, 2)

In [272]:
# set index for new DF
all_numerical_submission.set_index('Id', inplace=True)
all_numerical_submission.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2658,132073.127128
2718,147911.981694
2414,211354.736957
1989,117175.980949
625,183573.600866


In [273]:
# save submission
all_numerical_submission.to_csv('../data/all_numerical_submission.csv')

------
### *Handling Categorical Variables*

In [274]:
# checking data types
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1272 entries, 0 to 1271
Data columns (total 81 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Id                           1272 non-null   int64  
 1   parcel_id                    1272 non-null   int64  
 2   dwelling_type                1272 non-null   int64  
 3   zoning_type                  1272 non-null   object 
 4   lot_frontage                 1272 non-null   float64
 5   lot_area                     1272 non-null   float64
 6   street_surface               1272 non-null   object 
 7   alley_surface                103 non-null    object 
 8   property_shape               1272 non-null   object 
 9   property_flatness            1272 non-null   object 
 10  util_avail                   1272 non-null   object 
 11  lot_position                 1272 non-null   object 
 12  property_slope               1272 non-null   object 
 13  ames_neighborhood 

In [None]:
# candidates for #'s rather than dummy variables
'house_quality','house_condition','exterior_quality','exterior_condition', 'basement_height', 'basement_condition','basement_walkouts', 'heating_quality', 'kitchen_quality', 'fireplace_quality', 'garage_quality',
'garage_condition', 'pool_quality','fence_quality'

In [None]:
# candidates for interaction features or dropping collinear?
house & exterior & garage - - quality/condition
'basement_height', 'basement_condition'

# collinear?
# check with correlation/plot if 90 or greater, drop 1 of them that's less correlated with price
droppping one of the quality / condition ones

In [None]:
#check data types
train.info()

In [None]:
train = pd.get_dummies(data = train, columns = ['dwelling_type','zoning_type','street_surface','alley_surface','property_shape','property_flatness',
                     'util_avail','lot_position','property_slope','ames_neighborhood','prox_to_transport','prox_to_transport_2',
                     'bldg_type', 'floors','house_quality','house_condition','roof_style', 'roof_material','ext_covering','ext_covering2',
                     'masonry_veneer_type','exterior_quality','exterior_condition','foundation_type','basement_height', 'basement_condition',
                     'basement_walkouts','basement_finished_rating','basement_finished_rating2','heating_type','heating_quality','central_air',
                     'electrical_setup', 'full_bathrooms_basement', 'half_bathrooms_basement', 'full_bathrooms_above_ground',
                     'half_bathrooms_above_ground','bedrooms_above_ground', 'kitchens_above_ground', 'kitchen_quality', 'home_functionality',
                     'fireplace_quality', 'garage_location', 'garage_finished', 'garage_car_capacity', 'garage_quality',
                     'garage_condition', 'driveway_surface','pool_quality','fence_quality', 'misc_features', 'sale_type'], drop_first = True)              

In [None]:
print(train.shape)
train.head()

In [None]:
# comparing correlations of all categorical features to target 
train.corr()[['sale_price']].sort_values(by = 'sale_price', ascending=False)

In [9]:
# at this point save a version to model with to try all features
# also save model after before each (combining, interaction, dropping) - run all features through after each to see if any improvement

------
### *Combining Features*

In [8]:
# combine bedrooms
# combine bathrooms
# combine kitchens (above and below ground)
# combine area of home (finished below and above)

### *Interaction Terms*

### *Dropping Collinear Features*

In [None]:
# for exploration and modelling
#1) run model where all features includes - X = df.drop(columns = 'SalesPrice')

# Three different way to create X:
#X = ads[['TV', 'radio', 'newspaper']]

#X = ads.drop(columns = ['sales']) # recommended with high dimensional datasets

model without feature engineering, interaction terms, dropping collinear features manually for all variables