In [None]:
%matplotlib inline

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from sklearn.decomposition import PCA

# Exploration of training data set

Data fields
Here's a brief version of what you'll find in the data description file.

* SalePrice - the property's sale price in dollars. This is the target variable that you're trying to predict.
* MSSubClass: The building class
* MSZoning: The general zoning classification
* LotFrontage: Linear feet of street connected to property
* LotArea: Lot size in square feet
* Street: Type of road access
* Alley: Type of alley access
* LotShape: General shape of property
* LandContour: Flatness of the property
* Utilities: Type of utilities available
* LotConfig: Lot configuration
* LandSlope: Slope of property
* Neighborhood: Physical locations within Ames city limits
* Condition1: Proximity to main road or railroad
* Condition2: Proximity to main road or railroad (if a second is present)
* BldgType: Type of dwelling
* HouseStyle: Style of dwelling
* OverallQual: Overall material and finish quality
* OverallCond: Overall condition rating
* YearBuilt: Original construction date
* YearRemodAdd: Remodel date
* RoofStyle: Type of roof
* RoofMatl: Roof material
* Exterior1st: Exterior covering on house
* Exterior2nd: Exterior covering on house (if more than one material)
* MasVnrType: Masonry veneer type
* MasVnrArea: Masonry veneer area in square feet
* ExterQual: Exterior material quality
* ExterCond: Present condition of the material on the exterior
* Foundation: Type of foundation
* BsmtQual: Height of the basement
* BsmtCond: General condition of the basement
* BsmtExposure: Walkout or garden level basement walls
* BsmtFinType1: Quality of basement finished area
* BsmtFinSF1: Type 1 finished square feet
* BsmtFinType2: Quality of second finished area (if present)
* BsmtFinSF2: Type 2 finished square feet
* BsmtUnfSF: Unfinished square feet of basement area
* TotalBsmtSF: Total square feet of basement area
* Heating: Type of heating
* HeatingQC: Heating quality and condition
* CentralAir: Central air conditioning
* Electrical: Electrical system
* 1stFlrSF: First Floor square feet
* 2ndFlrSF: Second floor square feet
* LowQualFinSF: Low quality finished square feet (all floors)
* GrLivArea: Above grade (ground) living area square feet
* BsmtFullBath: Basement full bathrooms
* BsmtHalfBath: Basement half bathrooms
* FullBath: Full bathrooms above grade
* HalfBath: Half baths above grade
* BedroomAbvGr: Number of bedrooms above basement level
* KitchenAbvGr: Number of kitchens
* KitchenQual: Kitchen quality
* TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)
* Functional: Home functionality rating
* Fireplaces: Number of fireplaces
* FireplaceQu: Fireplace quality
* GarageType: Garage location
* GarageYrBlt: Year garage was built
* GarageFinish: Interior finish of the garage
* GarageCars: Size of garage in car capacity
* GarageArea: Size of garage in square feet
* GarageQual: Garage quality
* GarageCond: Garage condition
* PavedDrive: Paved driveway
* WoodDeckSF: Wood deck area in square feet
* OpenPorchSF: Open porch area in square feet
* EnclosedPorch: Enclosed porch area in square feet
* 3SsnPorch: Three season porch area in square feet
* ScreenPorch: Screen porch area in square feet
* PoolArea: Pool area in square feet
* PoolQC: Pool quality
* Fence: Fence quality
* MiscFeature: Miscellaneous feature not covered in other categories
* MiscVal: Value of miscellaneous feature
* MoSold: Month Sold
* YrSold: Year Sold
* SaleType: Type of sale
* SaleCondition: Condition of sale

In [None]:
# Load the training data from the CSV file under ./data into a DataFrame.
train = pd.read_csv('./data/train.csv')

In [None]:
# Summarise the frame: column names, non-null counts and dtypes.
train.info()

In [None]:
# Pairwise correlations between the numeric columns only. Calling
# DataFrame.corr() on the whole frame raises on the string-valued columns in
# pandas >= 2.0, so the numeric subset is selected explicitly first.
corr_matrix = train.select_dtypes(include='number').corr()
# Rank every numeric column by its correlation with the target.
corr_matrix["SalePrice"].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of the target and four selected numeric columns.
attributes = ["SalePrice", "OverallQual", "GrLivArea",
              "GarageCars", "GarageArea"]
scatter_matrix(train[attributes], figsize=(30, 20))


In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Hold out 20% of the rows, stratified on OverallQual so both parts keep the
# same mix of quality ratings. The seed is fixed so the split is reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields positional row indices, so rows are selected with .iloc;
# .loc only picks the same rows while the frame has its default RangeIndex.
for train_index, test_index in split.split(train, train["OverallQual"]):
    strat_train_set = train.iloc[train_index]
    strat_test_set = train.iloc[test_index]

In [None]:
# One 50-bin histogram per column that DataFrame.hist can plot.
train.hist(bins=50, figsize=(40, 30))
plt.show()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnRemover(BaseEstimator, TransformerMixin):
    """Transformer that drops a configurable set of columns from a DataFrame.

    Parameters
    ----------
    columnsToDelete : list of str
        Names of the columns to remove. Names that are not present in the
        input frame are ignored.
    """

    def __init__(self, columnsToDelete=['Id', 'PoolQC', "Fence", "MiscFeature", "Alley", "FireplaceQu"]):
        # Stored exactly as passed; the list is only read, never modified.
        self.columnsToDelete = columnsToDelete

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a new frame without the configured columns.

        All configured columns found in ``X`` are dropped in one call. The
        input frame is left unchanged, and a frame is returned even when the
        list of columns is empty or none of them is present.
        """
        present = [column for column in self.columnsToDelete if column in X.columns]
        return X.drop(columns=present)

In [None]:
# Work on an independent copy of the training split: the transformers in the
# following cells are applied to `housing`, and the split itself (a selection
# taken from `train`) should stay untouched.
housing = strat_train_set.copy()

## Numerical columns

In [None]:
# Names of all int64/float64 columns of the working frame, and how many there are.
numericColumns = [
    name
    for name, column_dtype in housing.dtypes.items()
    if column_dtype in ['int64', 'float64']
]
print(len(numericColumns))

In [None]:
# Numeric columns that hold a class or rating code; they are added to the
# category columns later in the notebook.
classTypeNumericColumns = ['MSSubClass', 'OverallCond', 'OverallQual']
# Numeric columns holding measured quantities: areas, counts and values.
cardinalNumericColumns = [
    'LotFrontage',
    'LotArea',
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    '1stFlrSF',
    '2ndFlrSF',
    'LowQualFinSF',
    'GrLivArea',
    'BsmtFullBath',
    'BsmtHalfBath',
    'FullBath',
    'HalfBath',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'TotRmsAbvGrd',
    'Fireplaces',
    'GarageCars',
    'GarageArea',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'PoolArea',
    'MiscVal'
]
# Year and month columns.
timeColumns = ['YearBuilt','YearRemodAdd','GarageYrBlt','MoSold', 'YrSold']
# The prediction target.
valueColumn = ['SalePrice']
# Total number of numeric columns assigned to one of the four groups above.
print(len(classTypeNumericColumns + cardinalNumericColumns+timeColumns+valueColumn))

### Cardinal numeric column data cleanup

In [None]:
# Non-null counts of the cardinal numeric columns, to spot missing values.
housing[cardinalNumericColumns].info()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class NumericColumnFiller(BaseEstimator, TransformerMixin):
    """Fill missing values in selected numeric columns.

    Parameters
    ----------
    fillByMean : list of str
        Columns whose missing values are replaced by the column mean.
    fillWithZeros : list of str
        Columns whose missing values are replaced by 0.

    Attributes
    ----------
    means_ : dict
        Column name -> mean learned by ``fit``; used by ``transform``.
    """

    def __init__(self, fillByMean = ['LotFrontage'], fillWithZeros=['MasVnrArea']):
        self.fillByMean = fillByMean
        self.fillWithZeros = fillWithZeros

    def fit(self, X, y=None):
        """Learn the mean of every ``fillByMean`` column from ``X``.

        Remembering the means here lets data transformed later be filled with
        the statistics of the fitted data instead of its own.
        """
        self.means_ = {column: X[column].mean() for column in self.fillByMean}
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured gaps filled.

        The input frame is not modified. If ``transform`` is called on an
        instance that was never fitted, the mean of ``X`` itself is used,
        which matches the previous behaviour.
        """
        learned_means = getattr(self, 'means_', {})
        filled = X.copy()
        for column in self.fillByMean:
            if column in learned_means:
                mean = learned_means[column]
            else:
                mean = filled[column].mean()
            # Assign the result back instead of a chained inplace fillna,
            # which does not reach the frame under pandas copy-on-write.
            filled[column] = filled[column].fillna(mean)
        for column in self.fillWithZeros:
            filled[column] = filled[column].fillna(0)
        return filled
        
# Fill the gaps in the working frame using the filler's default column lists.
numericColumnFiller = NumericColumnFiller()
housing = numericColumnFiller.fit_transform(housing)

In [None]:
# Re-check the non-null counts after filling.
housing[cardinalNumericColumns].info()

In [None]:
@interact(x=cardinalNumericColumns)
def showScatterPlot(x):
    """Scatter SalePrice against the cardinal numeric column chosen in the widget."""
    housing.plot(
        kind='scatter',
        x=x,
        y='SalePrice',
        marker='.',
        figsize=(16, 9),
    )

### Converting time columns into age columns

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class MonthYearToAgeConverter(BaseEstimator, TransformerMixin):
    """Replace the year/month columns by ages measured at the year of sale.

    Adds ``AgeOfProperty``, ``AgeOfRemodel`` and ``AgeOfGarage`` (each
    ``YrSold`` minus the corresponding year column) and drops ``YrSold``,
    ``MoSold``, ``YearBuilt``, ``YearRemodAdd`` and ``GarageYrBlt``.
    """

    def fit(self, X, y = None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the converted frame without modifying ``X``.

        A frame without a ``YrSold`` column is returned as-is, so applying the
        transformer to already converted data is harmless.
        """
        if 'YrSold' not in X.columns:
            return X

        year_sold = X['YrSold']
        age_of_property = year_sold - X['YearBuilt']
        # assign() builds a new frame, so the caller's frame gets no extra
        # columns and no chained inplace fillna is needed.
        converted = X.assign(
            AgeOfProperty=age_of_property,
            AgeOfRemodel=year_sold - X['YearRemodAdd'],
            # Rows without a garage year fall back to the age of the property.
            AgeOfGarage=(year_sold - X['GarageYrBlt']).fillna(age_of_property),
        )
        return converted.drop(columns=['YrSold', 'MoSold', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt'])
    
# Convert the time columns of the working frame into age columns.
monthYearToAgeConverter = MonthYearToAgeConverter()
housing = monthYearToAgeConverter.fit_transform(housing)

In [None]:
# Names of the derived age columns.
ageColumns = ['AgeOfProperty', 'AgeOfRemodel', 'AgeOfGarage']


# NOTE(review): this reuses the name showScatterPlot from the earlier cell and
# replaces that definition in the kernel namespace.
@interact(x=ageColumns)
def showScatterPlot(x):
    """Scatter SalePrice against the age column chosen in the widget."""
    housing.plot(
        kind='scatter',
        x=x,
        y='SalePrice',
        marker='.',
        figsize=(16, 9),
    )

In [None]:
# Correlations between the target, the cardinal numeric columns and the ages.
correl = housing[['SalePrice'] + cardinalNumericColumns + ageColumns].corr()
print(correl['SalePrice'].sort_values(ascending=False))

# Draw the matrix into the figure created here. Passing the figure's own
# number instead of a hard-coded 1 keeps this correct when other figures are
# still open in the kernel.
fig = plt.figure(figsize=(16, 9))
plt.matshow(correl, fignum=fig.number)

In [None]:
# Columns fed to the numeric pipeline: the cardinal columns plus the derived ages.
num_columns = cardinalNumericColumns + ageColumns

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing: fill missing values, derive the age columns, then
# standardise every column.
num_pipeline = Pipeline([
    ('columnFiller', NumericColumnFiller()),
    ('ageTransformer', MonthYearToAgeConverter()),
    ('scaler', StandardScaler())
])

# housing[num_columns] contains no 'YrSold' column, so the age step passes the
# data through unchanged in this call.
housing_num_tr = num_pipeline.fit_transform(housing[num_columns])

## Non-numeric columns

In [None]:
# Names of every column of the raw frame whose dtype is neither int64 nor float64.
otherColumns = [
    name
    for name, column_dtype in train.dtypes.items()
    if column_dtype not in ['int64', 'float64']
]
print("Non numeric columns count: " + str(len(otherColumns)))

### Non-numeric columns with missing values

In [None]:
# Non-numeric columns left out of the category features.
# NOTE(review): presumably chosen because they are mostly missing — confirm against train.info().
discardedColumns = ["PoolQC", "Fence", "MiscFeature", "Alley", "FireplaceQu"]

In [None]:
# Category features: the non-numeric columns that were not discarded, plus the
# numeric columns that hold class/rating codes.
categoryColumns = [x for x in otherColumns if x not in discardedColumns] + classTypeNumericColumns

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class CategoryFiller(BaseEstimator, TransformerMixin):
    """Replace missing values in the given columns by the string "unknown".

    Parameters
    ----------
    categoryColumns : list of str
        Names of the columns to fill.
    """

    def __init__(self, categoryColumns):
        self.categoryColumns = categoryColumns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns filled.

        The input frame is not modified. Each column is assigned back
        explicitly, because a chained inplace fillna does not reach the frame
        under pandas copy-on-write.
        """
        filled = X.copy()
        for column in self.categoryColumns:
            filled[column] = filled[column].fillna("unknown")
        return filled
# Fill the missing category values of the working frame.
categoryFiller = CategoryFiller(categoryColumns = categoryColumns)

housing = categoryFiller.fit_transform(housing)

In [None]:
# Check the non-null counts of the category columns after filling.
housing[categoryColumns].info()

### Category column visualisations

In [None]:
@interact(x=(categoryColumns))
def showPlotBox(x):
    """Box-plot SalePrice grouped by the category column chosen in the widget."""
    housing.boxplot(by=x, column='SalePrice', figsize=(16, 9))

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# The switch for dense output is named `sparse_output` from scikit-learn 1.2
# on, and the older `sparse` name was removed in 1.4. Pick whichever name the
# installed encoder accepts so the cell runs on both.
dense_kwarg = 'sparse_output' if 'sparse_output' in OneHotEncoder().get_params() else 'sparse'

# Category preprocessing: fill missing values, then one-hot encode.
cat_pipeline = Pipeline([
    ('categoryFiller', CategoryFiller(categoryColumns=categoryColumns)),
    # handle_unknown='ignore' encodes a category that was not seen during fit
    # as all zeros instead of raising, so held-out data can be transformed.
    ('oneHotEncoder', OneHotEncoder(handle_unknown='ignore', **{dense_kwarg: False}))
])

### Full Pipeline

In [None]:
from sklearn.compose import ColumnTransformer

# Send the numeric columns through num_pipeline and the category columns
# through cat_pipeline; the two outputs are concatenated column-wise.
# Columns in neither list are dropped (ColumnTransformer's default remainder).
fullPipeline = ColumnTransformer([
    ('num', num_pipeline, num_columns),
    ('cat', cat_pipeline, categoryColumns)
])

# Fit on the working frame and build the feature matrix for the models.
housing_prepared = fullPipeline.fit_transform(housing)

In [None]:
# (rows, features) of the prepared feature matrix.
housing_prepared.shape

### Target Preparation

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class TargetTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that maps the target to log(1 + x)."""
    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self
    def transform(self, X):
        """Return ``np.log1p`` applied element-wise to ``X``."""
        return np.log1p(X)
    
class TargetTransformerInverted(BaseEstimator, TransformerMixin):
    """Stateless transformer that maps values back with exp(x) - 1, undoing log1p."""
    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self
    def transform(self, X):
        """Return ``np.expm1`` applied element-wise to ``X``."""
        return np.expm1(X)

In [None]:
# Train on log1p(SalePrice) rather than on the raw price.
targetTransformer = TargetTransformer()
housing_target = targetTransformer.fit_transform(housing['SalePrice'])

In [None]:
# Length of the target vector; should match the row count of housing_prepared.
housing_target.shape

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the prepared features against the log-transformed target.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_target)

In [None]:
from sklearn.metrics import mean_squared_error

# Predict on the training features, map the predictions back from the log
# scale, and score them against the raw sale prices (training-set MSE).
prediction_transformer = TargetTransformerInverted()
housing_predictions = prediction_transformer.fit_transform(
    lin_reg.predict(housing_prepared)
)

lin_mse = mean_squared_error(y_true=housing['SalePrice'], y_pred=housing_predictions)
lin_mse

## Random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees and a fixed seed, fitted on the same features
# and log-transformed target.
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(housing_prepared, housing_target)

In [None]:
# Training-set MSE of the forest, computed on the raw price scale.
forest_pred = prediction_transformer.fit_transform(
    forest_reg.predict(housing_prepared)
)

forest_mse = mean_squared_error(y_true=housing['SalePrice'], y_pred=forest_pred)
forest_mse

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: 3 x 3 combinations, each evaluated with 5-fold CV.
# NOTE(review): the integer max_features values must not exceed the number of
# columns in housing_prepared — confirm against housing_prepared.shape.
param_grid = [
    {'n_estimators': [100, 150, 200], 'max_features': [80, 100, 128]}
]

# A fresh, unfitted forest; this rebinds forest_reg from the previous section.
forest_reg = RandomForestRegressor(random_state=42)
# Scored with negative MSE on the log-transformed target (higher is better).
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True, verbose=True)
grid_search.fit(housing_prepared, housing_target)

In [None]:
# Parameter combination with the best mean cross-validation score.
grid_search.best_params_

In [None]:
# Training-set MSE (raw price scale) of the best forest found by the grid search.
best_random_forest = grid_search.best_estimator_

forest_pred = prediction_transformer.fit_transform(
    best_random_forest.predict(housing_prepared)
)

forest_mse = mean_squared_error(y_true=housing['SalePrice'], y_pred=forest_pred)
forest_mse