# House Prices: Advanced Regression Techniques
Predict sales prices and practice feature engineering, RFs, and gradient boosting

## 1. Load and explore data

In [None]:
# Load the training and test datasets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Read both competition splits from the working directory
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
# Preview the first rows of the training split
train.head()

In [None]:
# Distribution of the target. distplot() is deprecated in seaborn; histplot()
# with a KDE overlay on a density scale is its documented replacement.
sns.histplot(train['SalePrice'], kde=True, stat="density")

In [None]:
# Pairwise correlations between numeric features only: string-valued columns
# cannot be correlated and make DataFrame.corr() raise on recent pandas.
corr_mat = train.select_dtypes(include="number").corr()
f, ax = plt.subplots(figsize=(12, 9))
# Draw on the axes created above; vmax=.8 caps the colour scale.
sns.heatmap(corr_mat, vmax=.8, square=True, ax=ax)

## 2. Prepare Data

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
import matplotlib.pyplot as plt

def plot_scatter(data):
    """Scatter-plot every column of ``data`` except the last against SalePrice.

    Args:
        data: DataFrame with a "SalePrice" column. The last column is not
            plotted on the x axis (callers here place SalePrice last).
    """
    features = data.columns.values[:-1]
    # squeeze=False always yields a 2-D axes array, so a frame with a single
    # feature column no longer fails when the axes are indexed.
    fig, axes = plt.subplots(1, len(features), sharey=True, figsize=(21, 3), squeeze=False)
    for axis, col in zip(axes[0], features):
        data.plot(x=col, y="SalePrice", kind="scatter", ax=axis)

def plot_boxplot(cat, data, ordered=False):
    """Draw one SalePrice box plot per categorical column, side by side.

    Args:
        cat: Sequence of column names of ``data`` to plot.
        data: DataFrame holding the columns in ``cat`` and "SalePrice".
        ordered: When true, the categories of each plot are laid out by
            ascending median SalePrice; otherwise seaborn picks the order.
    """
    # squeeze=False keeps the axes array 2-D even for a single column.
    _, axes = plt.subplots(1, len(cat), sharey=True, figsize=(21, 4), squeeze=False)
    for axis, var in zip(axes[0], cat):
        category_order = None  # named so the builtin ``ord`` is not shadowed
        if ordered:
            medians = data.groupby(var)["SalePrice"].median().fillna(0)
            category_order = medians.sort_values().index
        sns.boxplot(
            x=var,
            y="SalePrice",
            data=data[["SalePrice", var]],
            ax=axis,
            order=category_order,
            showfliers=False,  # outliers are hidden from the plot
        )

In [None]:
# Summarise the columns (dtype and non-null count) to spot features with missing values
train.info()

### 2.0 Features with Missing Values

In [None]:
class MissingValuesAttributeRemove(BaseEstimator, TransformerMixin):
    """Transformer that drops the features that are mostly missing."""

    def fit(self, data, y=None):
        """Nothing to learn; ``y`` is accepted so the step also works when a
        pipeline is fitted with a target."""
        return self

    def transform(self, data):
        """Return a new DataFrame without the mostly-missing columns.

        Raises:
            KeyError: if any of the listed columns is absent from ``data``.
        """
        remove_attribute = [
            "Alley",       # Drop Alley (>93% of values missing)
            "FireplaceQu", # Drop FireplaceQu (>47% of values missing)
            "PoolQC",      # Drop PoolQC (>99% of values missing)
            "Fence",       # Drop Fence (>80% of values missing)
            "MiscFeature", # Drop MiscFeature (>96% of values missing)
            "MiscVal",     # Drop MiscVal (value of MiscFeature)
        ]
        return data.drop(columns=remove_attribute)

### 2.1 Quality Features
* Convert categorical features to ordinal features
* Might be worth removing Garage Quality
* Might be worth using only Overall Quality

In [None]:
class QualityAttributeTransform(BaseEstimator, TransformerMixin):
    """Encode the quality grades as ordinal numbers (NA=0 ... Ex=5)."""

    def fit(self, data, y=None):
        """Nothing to learn; ``y`` is accepted so the step also works when a
        pipeline is fitted with a target."""
        return self

    def transform(self, data):
        """Return a copy of ``data`` with the quality columns rank-encoded.

        Raises:
            ValueError: from OrdinalEncoder if a column holds a grade that is
                not in the grade list.
        """
        attribute_grade = ['ExterQual', 'BsmtQual', 'KitchenQual', 'GarageQual']
        grades = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
        # NOTE(review): rows with a missing basement or garage grade are
        # discarded, so the output can have fewer rows than the input —
        # confirm this is intended before using the step with a target.
        # .copy() makes the assignment below act on an independent frame
        # rather than on a slice of the caller's data.
        data = data.dropna(subset=['BsmtQual', 'GarageQual']).copy()
        # One encoder for all columns: the same grade scale applies to each.
        encoder = OrdinalEncoder(categories=[grades] * len(attribute_grade))
        data[attribute_grade] = encoder.fit_transform(data[attribute_grade])
        return data

In [None]:
# Encode the quality grades of the training set and keep only the quality
# features together with the target
quality_pipeline = Pipeline(steps=[("quality_attribute_transformer", QualityAttributeTransform())])
train_quality = quality_pipeline.fit_transform(train)[
    ['OverallQual', 'ExterQual', 'BsmtQual', 'KitchenQual', 'GarageQual', 'SalePrice']
]

# Correlation of every quality feature with the sale price, strongest first
corr_matrix = train_quality.corr()
corr_matrix["SalePrice"].sort_values(ascending=False)

In [None]:
# Correlation of every quality feature with OverallQual, strongest first
corr_matrix = train_quality.corr()
corr_matrix["OverallQual"].sort_values(ascending=False)

### 2.2 Condition Features
* Convert categorical features to ordinal features
* Weak correlation with Sale Price
* Drop features

In [None]:
class ConditionAttributeTransformTemp(BaseEstimator, TransformerMixin):
    """Encode the condition grades as ordinal numbers (NA=0 ... Ex=5)."""

    def fit(self, data, y=None):
        """Nothing to learn; ``y`` is accepted so the step also works when a
        pipeline is fitted with a target."""
        return self

    def transform(self, data):
        """Return a copy of ``data`` with the condition columns rank-encoded.

        Raises:
            ValueError: from OrdinalEncoder if a column holds a grade that is
                not in the grade list.
        """
        attribute_grade = ['ExterCond', 'BsmtCond', 'GarageCond']
        grades = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
        # NOTE(review): rows with a missing basement or garage condition are
        # discarded, so the output can have fewer rows than the input —
        # confirm this is intended before using the step with a target.
        # .copy() makes the assignment below act on an independent frame
        # rather than on a slice of the caller's data.
        data = data.dropna(subset=['BsmtCond', 'GarageCond']).copy()
        # One encoder for all columns: the same grade scale applies to each.
        encoder = OrdinalEncoder(categories=[grades] * len(attribute_grade))
        data[attribute_grade] = encoder.fit_transform(data[attribute_grade])
        return data

In [None]:
# Encode the condition grades of the training set and keep only the
# condition features together with the target
condition_pipeline = Pipeline(steps=[("condition_attribute_transformer", ConditionAttributeTransformTemp())])
train_condition = condition_pipeline.fit_transform(train)[
    ['OverallCond', 'ExterCond', 'BsmtCond', 'GarageCond', 'SalePrice']
]

# Correlation of every condition feature with the sale price, strongest first
corr_matrix = train_condition.corr()
corr_matrix["SalePrice"].sort_values(ascending=False)

In [None]:
class ConditionAttributeTransform(BaseEstimator, TransformerMixin):
    """Transformer that drops the condition features."""

    def fit(self, data, y=None):
        """Nothing to learn; ``y`` is accepted so the step also works when a
        pipeline is fitted with a target."""
        return self

    def transform(self, data):
        """Return a new DataFrame without the condition columns.

        Raises:
            KeyError: if any of the listed columns is absent from ``data``.
        """
        remove_attribute = ['OverallCond', 'ExterCond', 'BsmtCond', 'GarageCond']
        return data.drop(columns=remove_attribute)

### 2.3 Area Features
* Create new feature to include living, basement and garage areas
* Strong correlation with Sale Price
* Drop other features

In [None]:
# Work on an explicit copy so that adding a column does not write to a slice
# of ``train`` (which raises pandas' SettingWithCopyWarning).
train_area = train[['LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 'GarageCars', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'LotFrontage', 'SalePrice']].copy()
# Candidate feature: living + basement + garage area
train_area['TotalArea'] = train['GrLivArea'] + train['TotalBsmtSF'] + train['GarageArea']

# Correlation of every area feature with the sale price, strongest first
corr_matrix = train_area.corr()
corr_matrix["SalePrice"].sort_values(ascending=False)

In [None]:
class AreaAttributeTransform(BaseEstimator, TransformerMixin):
    """Replace the individual area features by a single ``TotalArea`` column."""

    def fit(self, data, y=None):
        """Nothing to learn; ``y`` is accepted so the step also works when a
        pipeline is fitted with a target."""
        return self

    def transform(self, data):
        """Return a new DataFrame with ``TotalArea`` appended and the
        individual area columns removed. The input frame is left unmodified.

        Raises:
            KeyError: if any of the referenced columns is absent from ``data``.
        """
        # Living + basement + garage area
        total_area = data['GrLivArea'] + data['TotalBsmtSF'] + data['GarageArea']
        remove_attribute = ['LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 'GarageCars', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'LotFrontage']
        # drop() and assign() both return new frames, so the caller's
        # DataFrame no longer gains a TotalArea column as a side effect.
        return data.drop(columns=remove_attribute).assign(TotalArea=total_area)

### 2.4 Number of Rooms Features
* Create new feature to include total number of rooms and bathrooms 
* Strong correlation with Sale Price
* Drop other features

In [None]:
# Work on an explicit copy so that adding columns does not write to a slice
# of ``train`` (which raises pandas' SettingWithCopyWarning).
train_rooms = train[['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces']].copy()
# Candidate feature: rooms above ground plus full and half bathrooms
train_rooms['TotRms'] = train['TotRmsAbvGrd'] + train['FullBath'] + train['HalfBath']
# SalePrice is appended last because plot_scatter() skips the last column
train_rooms['SalePrice'] = train['SalePrice']

# Correlation of every room feature with the sale price, strongest first
corr_matrix = train_rooms.corr()
corr_matrix["SalePrice"].sort_values(ascending=False)

In [None]:
# Scatter plots of each room feature against the sale price
plot_scatter(train_rooms)

In [None]:
class RoomsAttributeTransform(BaseEstimator, TransformerMixin):
    """Replace the individual room counts by a single ``TotRms`` column."""

    def fit(self, data, y=None):
        """Nothing to learn; ``y`` is accepted so the step also works when a
        pipeline is fitted with a target."""
        return self

    def transform(self, data):
        """Return a new DataFrame with ``TotRms`` appended and the individual
        room columns removed. The input frame is left unmodified.

        Raises:
            KeyError: if any of the referenced columns is absent from ``data``.
        """
        # Rooms above ground plus full and half bathrooms
        total_rooms = data['TotRmsAbvGrd'] + data['FullBath'] + data['HalfBath']
        remove_attribute = ['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces']
        # drop() and assign() both return new frames, so the caller's
        # DataFrame no longer gains a TotRms column as a side effect.
        return data.drop(columns=remove_attribute).assign(TotRms=total_rooms)

### 2.5 Date Features
* Only two features with strong correlation with Sale Price
* Drop other features

In [None]:
# Date-related features together with the target
train_dates = train.loc[:, ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'MoSold', 'YrSold', 'SalePrice']]

# Correlation of every date feature with the sale price, strongest first
corr_matrix = train_dates.corr()
corr_matrix["SalePrice"].sort_values(ascending=False)

In [None]:
# Correlation between construction year and remodel year
corr_matrix.loc['YearRemodAdd', 'YearBuilt']

In [None]:
# Scatter plots of each date feature against the sale price
plot_scatter(train_dates)

In [None]:
class DatesAttributeTransform(BaseEstimator, TransformerMixin):
    """Transformer that drops the date features not kept for modelling."""

    def fit(self, data, y=None):
        """Nothing to learn; ``y`` is accepted so the step also works when a
        pipeline is fitted with a target."""
        return self

    def transform(self, data):
        """Return a new DataFrame without the listed date columns.

        Raises:
            KeyError: if any of the listed columns is absent from ``data``.
        """
        remove_attribute = ['GarageYrBlt', 'MoSold', 'YrSold']
        return data.drop(columns=remove_attribute)

### 2.6 Other Sortable Categorical Features
* No feature showed a strong correlation with Sale Price
* Drop features for now 
* Might be worth considering Heating QC

In [None]:
class OrdinalAttributeConverter(BaseEstimator, TransformerMixin):
    """Convert the sortable categorical features to numeric ranks."""

    def fit(self, data, y=None):
        """Nothing to learn; ``y`` is accepted so the step also works when a
        pipeline is fitted with a target."""
        return self

    def transform(self, data):
        """Return a copy of ``data`` with every sortable categorical column
        replaced by a numeric rank.

        Raises:
            KeyError: if any of the referenced columns is absent from ``data``.
            ValueError: from OrdinalEncoder if a column holds a category that
                is not listed for it below.
        """
        # NOTE(review): rows with a missing basement/garage descriptor are
        # discarded, so the output can have fewer rows than the input —
        # confirm this is intended before using the step with a target.
        # .copy() makes the assignments below act on an independent frame
        # rather than on a slice of the caller's data.
        data = data.dropna(subset=["BsmtExposure", "BsmtFinType1", "BsmtFinType2", "GarageFinish"]).copy()

        # Street and PavedDrive share one hand-written scale. map() yields a
        # numeric column (the previous in-place .loc writes left it as object
        # dtype); a value outside the table becomes NaN.
        pave_codes = {'Pave': 2, 'Grvl': 0, 'Y': 2, 'P': 1, 'N': 0}
        for attribute in ['Street', 'PavedDrive']:
            data[attribute] = data[attribute].map(pave_codes)

        # Column -> its categories, listed from lowest rank (0) upwards.
        ordered_categories = {
            'LotShape': ['IR3', 'IR2', 'IR1', 'Reg'],
            'Utilities': ['ELO', 'NoSeWa', 'NoSewr', 'AllPub'],
            'LandSlope': ['Sev', 'Mod', 'Gtl'],
            'BsmtExposure': ['NA', 'No', 'Mn', 'Av', 'Gd'],
            'BsmtFinType1': ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
            'BsmtFinType2': ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
            'HeatingQC': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
            'CentralAir': ['N', 'Y'],
            'Functional': ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2',  'Min1', 'Typ'],
            'GarageFinish': ['NA', 'Unf', 'RFn', 'Fin'],
        }
        columns = list(ordered_categories)
        # A single encoder handles all columns; categories are given per column
        # in the same order as ``columns``.
        encoder = OrdinalEncoder(categories=[ordered_categories[name] for name in columns])
        data[columns] = encoder.fit_transform(data[columns])

        return data


In [None]:
# Sortable categorical features together with the target
train_ordinal = train.loc[:, ['Street', 'PavedDrive', 'LotShape', 'Utilities', 'LandSlope', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'CentralAir', 'Functional', 'GarageFinish', 'SalePrice']]

# Rank-encode them through a one-step pipeline
ordinal_pipeline = Pipeline(steps=[("ordinal_attribute_transformer", OrdinalAttributeConverter())])
train_ordinal = ordinal_pipeline.fit_transform(train_ordinal)

# Correlation of every encoded feature with the sale price, strongest first
corr_matrix = train_ordinal.corr()
corr_matrix["SalePrice"].sort_values(ascending=False)

In [None]:
# Box plots of SalePrice per category for the sortable categorical features,
# six features per row, using the raw (unencoded) training data
plot_boxplot(['Street', 'PavedDrive', 'LotShape', 'Utilities', 'LandSlope', 'BsmtExposure'], train)
plot_boxplot(['BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'CentralAir', 'Functional', 'GarageFinish'], train)

In [None]:
class SortableCategoricalAttributeTransform(BaseEstimator, TransformerMixin):
    """Transformer that drops the sortable categorical features."""

    def fit(self, data, y=None):
        """Nothing to learn; ``y`` is accepted so the step also works when a
        pipeline is fitted with a target."""
        return self

    def transform(self, data):
        """Return a new DataFrame without the sortable categorical columns.

        Raises:
            KeyError: if any of the listed columns is absent from ``data``.
        """
        remove_attribute = ['Street', 'PavedDrive', 'LotShape', 'Utilities', 'LandSlope', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'CentralAir', 'Functional', 'GarageFinish']
        return data.drop(columns=remove_attribute)

### 2.7 Other Non-Sortable Categorical Features
* Require one hot encoder
* Drop features without a clear correlation with Sale Price, i.e. features where the difference between the categories' median values is small and the box ranges overlap significantly between categories
* Drop features with categories that lack a significant number of cases
* Relevance of features also considered


In [None]:
# Box plots of SalePrice per category for the non-sortable categorical
# features, five features per row, with categories ordered by median SalePrice
plot_boxplot(['MSSubClass', 'MSZoning', 'LandContour', 'LotConfig', 'Neighborhood'], train, True)
plot_boxplot(['Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle'], train, True)
plot_boxplot(['RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation'], train, True)
plot_boxplot(['Heating', 'Electrical', 'GarageType', 'SaleType', 'SaleCondition'], train, True)

In [None]:
# Median sale price per SaleCondition, ascending. SalePrice is selected before
# aggregating so median() is not attempted on the string-valued columns
# (which raises on recent pandas).
train.groupby('SaleCondition')['SalePrice'].median().sort_values()

In [None]:
class NonSortableCategoricalAttributeTransform(BaseEstimator, TransformerMixin):
    """Drop the nominal features not kept for modelling and rank-encode the rest."""

    def fit(self, data, y=None):
        """Nothing to learn; ``y`` is accepted so the step also works when a
        pipeline is fitted with a target."""
        return self

    def transform(self, data):
        """Return a new DataFrame without the dropped nominal columns and with
        the retained nominal columns replaced by numeric ranks.

        Raises:
            KeyError: if any of the referenced columns is absent from ``data``.
            ValueError: from OrdinalEncoder if a column holds a category that
                is not listed for it below.
        """
        # The original list contained an empty element (", ,"), which is a
        # syntax error; MasVnrType is not dropped because it is encoded below.
        remove_attribute = [
            'MSSubClass', 'LotConfig', 'Condition1', 'Condition2', 'BldgType',
            'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
            'Foundation', 'Heating', 'Electrical', 'GarageType', 'SaleType',
        ]
        data = data.drop(columns=remove_attribute)

        # Column -> its categories, listed from lowest rank (0) upwards.
        ordered_categories = {
            'MSZoning': ['C (all)', 'RM', 'RH', 'RL', 'FV'],
            # LandContour was previously encoded with the MSZoning categories,
            # none of which occur in this column.
            # NOTE(review): order presumed to follow ascending median SalePrice
            # like the other lists here — verify against the box plots.
            'LandContour': ['Bnk', 'Lvl', 'Low', 'HLS'],
            'Neighborhood': ['MeadowV', 'IDOTRR', 'BrDale', 'OldTown', 'Edwards', 'BrkSide', 'Sawyer', 'Blueste', 'SWISU', 'NAmes', 'NPkVill', 'Mitchel', 'SawyerW', 'Gilbert', 'NWAmes', 'Blmngtn', 'CollgCr', 'ClearCr', 'Crawfor', 'Veenker', 'Somerst', 'Timber', 'StoneBr', 'NoRidge', 'NridgHt'],
            # NOTE(review): depending on the pandas version, read_csv may parse
            # the literal "None" as a missing value — confirm how MasVnrType
            # is loaded.
            'MasVnrType': ['BrkCmn', 'None', 'BrkFace', 'Stone'],
            'SaleCondition': ['AdjLand', 'Abnorml', 'Family', 'Alloca', 'Normal', 'Partial'],
        }
        columns = list(ordered_categories)
        # A single encoder handles all columns; categories are given per column
        # in the same order as ``columns``.
        encoder = OrdinalEncoder(categories=[ordered_categories[name] for name in columns])
        data[columns] = encoder.fit_transform(data[columns])

        return data

## 3. Train Model



In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
# Random forest regressor created with scikit-learn's default hyper-parameters
florest_reg = RandomForestRegressor()
def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    Args:
        scores: Array-like object exposing ``mean()`` and ``std()`` methods
            (e.g. the array returned by cross-validation).
    """
    # Each statistic is computed only when its line is printed.
    report = (
        ("Scores: ", lambda: scores),
        ("Mean: ", lambda: scores.mean()),
        ("Standard Deviation: ", lambda: scores.std()),
    )
    for label, compute in report:
        print(label, compute())

In [None]:

# Evaluate a forest trained on all five quality features
x_train_quality = train_quality[['OverallQual', 'ExterQual', 'BsmtQual', 'KitchenQual', 'GarageQual']]
# 1-D target: a single-column DataFrame makes scikit-learn emit a
# DataConversionWarning on every fit.
y_train_quality = train_quality['SalePrice']
florest_reg.fit(x_train_quality, y_train_quality)

# 10-fold cross-validation. The scorer returns negated MSE, so flip the sign
# and take the square root to report RMSE per fold.
florest_reg_scores = cross_val_score(florest_reg, x_train_quality, y_train_quality, scoring="neg_mean_squared_error", cv=10)
display_scores(np.sqrt(-florest_reg_scores))

In [None]:
# Evaluate a forest trained on OverallQual alone
x_train_quality = train_quality[['OverallQual']]
# 1-D target: a single-column DataFrame makes scikit-learn emit a
# DataConversionWarning on every fit.
y_train_quality = train_quality['SalePrice']
florest_reg.fit(x_train_quality, y_train_quality)

# 10-fold cross-validation. The scorer returns negated MSE, so flip the sign
# and take the square root to report RMSE per fold.
florest_reg_scores = cross_val_score(florest_reg, x_train_quality, y_train_quality, scoring="neg_mean_squared_error", cv=10)
display_scores(np.sqrt(-florest_reg_scores))