In [None]:
%matplotlib inline

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from sklearn.decomposition import PCA

# Exploration of training data set

Data fields
Here's a brief version of what you'll find in the data description file.

* SalePrice - the property's sale price in dollars. This is the target variable that you're trying to predict.
* MSSubClass: The building class
* MSZoning: The general zoning classification
* LotFrontage: Linear feet of street connected to property
* LotArea: Lot size in square feet
* Street: Type of road access
* Alley: Type of alley access
* LotShape: General shape of property
* LandContour: Flatness of the property
* Utilities: Type of utilities available
* LotConfig: Lot configuration
* LandSlope: Slope of property
* Neighborhood: Physical locations within Ames city limits
* Condition1: Proximity to main road or railroad
* Condition2: Proximity to main road or railroad (if a second is present)
* BldgType: Type of dwelling
* HouseStyle: Style of dwelling
* OverallQual: Overall material and finish quality
* OverallCond: Overall condition rating
* YearBuilt: Original construction date
* YearRemodAdd: Remodel date
* RoofStyle: Type of roof
* RoofMatl: Roof material
* Exterior1st: Exterior covering on house
* Exterior2nd: Exterior covering on house (if more than one material)
* MasVnrType: Masonry veneer type
* MasVnrArea: Masonry veneer area in square feet
* ExterQual: Exterior material quality
* ExterCond: Present condition of the material on the exterior
* Foundation: Type of foundation
* BsmtQual: Height of the basement
* BsmtCond: General condition of the basement
* BsmtExposure: Walkout or garden level basement walls
* BsmtFinType1: Quality of basement finished area
* BsmtFinSF1: Type 1 finished square feet
* BsmtFinType2: Quality of second finished area (if present)
* BsmtFinSF2: Type 2 finished square feet
* BsmtUnfSF: Unfinished square feet of basement area
* TotalBsmtSF: Total square feet of basement area
* Heating: Type of heating
* HeatingQC: Heating quality and condition
* CentralAir: Central air conditioning
* Electrical: Electrical system
* 1stFlrSF: First Floor square feet
* 2ndFlrSF: Second floor square feet
* LowQualFinSF: Low quality finished square feet (all floors)
* GrLivArea: Above grade (ground) living area square feet
* BsmtFullBath: Basement full bathrooms
* BsmtHalfBath: Basement half bathrooms
* FullBath: Full bathrooms above grade
* HalfBath: Half baths above grade
* BedroomAbvGr: Number of bedrooms above basement level
* KitchenAbvGr: Number of kitchens
* KitchenQual: Kitchen quality
* TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)
* Functional: Home functionality rating
* Fireplaces: Number of fireplaces
* FireplaceQu: Fireplace quality
* GarageType: Garage location
* GarageYrBlt: Year garage was built
* GarageFinish: Interior finish of the garage
* GarageCars: Size of garage in car capacity
* GarageArea: Size of garage in square feet
* GarageQual: Garage quality
* GarageCond: Garage condition
* PavedDrive: Paved driveway
* WoodDeckSF: Wood deck area in square feet
* OpenPorchSF: Open porch area in square feet
* EnclosedPorch: Enclosed porch area in square feet
* 3SsnPorch: Three season porch area in square feet
* ScreenPorch: Screen porch area in square feet
* PoolArea: Pool area in square feet
* PoolQC: Pool quality
* Fence: Fence quality
* MiscFeature: Miscellaneous feature not covered in other categories
* MiscVal: Value of miscellaneous feature
* MoSold: Month Sold
* YrSold: Year Sold
* SaleType: Type of sale
* SaleCondition: Condition of sale

In [None]:
# Read the training and test CSV files from ./data, then stack the two frames
# (train rows first, test rows second) into one combined frame.
train, test = map(pd.read_csv, ('./data/train.csv', './data/test.csv'))
trainTestCombined = pd.concat((train, test))

In [None]:
# Print a summary of the training frame (columns, non-null counts, dtypes).
train.info()

In [None]:
# Preview the first rows of the training frame.
train.head()

## Numerical columns

In [None]:
# Names of all columns whose dtype is exactly int64 or float64.
# train.dtypes is iterated once as (name, dtype) pairs instead of rebuilding
# dict(train.dtypes) for every column.
numericColumns = [
    name
    for name, dtype in train.dtypes.items()
    if dtype in ('int64', 'float64')
]
print("Numeric columns count: " + str(len(numericColumns)))

### Cardinal Numeric columns

In [None]:
# Columns to leave out of the feature set; later cells append more names.
columnsToRemove = ['Id']

In [None]:
# Numeric column that encodes a class (the building class), so it is handled
# together with the category columns later on.
classTypeNumericColumns = ['MSSubClass']
# Numeric columns used as quantities: areas, counts and quality ratings.
cardinalNumericColumns = [
    'LotFrontage',
    'LotArea',
    'OverallQual',
    'OverallCond',
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    '1stFlrSF',
    '2ndFlrSF',
    'LowQualFinSF',
    'GrLivArea',
    'BsmtFullBath',
    'BsmtHalfBath',
    'FullBath',
    'HalfBath',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'TotRmsAbvGrd',
    'Fireplaces',
    'GarageCars',
    'GarageArea',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'PoolArea',
    'MiscVal'
]
# Year / month columns; later cells derive age-style features from them.
timeColumns = ['YearBuilt','YearRemodAdd','GarageYrBlt','MoSold', 'YrSold']
# Target variable to predict.
valueColumn = ['SalePrice']

#### Cardinal Numeric column data cleanup

In [None]:
# Replace missing LotFrontage values with the mean LotFrontage of the rows
# that share the same LotConfig value.
lotFrontageBase = train.groupby(['LotConfig'])['LotFrontage']
train['LotFrontage'] = lotFrontageBase.transform(
    lambda groupValues: groupValues.fillna(groupValues.mean())
)

In [None]:
# Treat a missing masonry veneer area as 0.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on a column selection is chained assignment, which pandas deprecates and
# which no longer updates `train` when Copy-on-Write is enabled.
train['MasVnrArea'] = train['MasVnrArea'].fillna(0)

In [None]:
# Summarise the cardinal numeric columns (non-null counts and dtypes) after
# the fills above.
train[cardinalNumericColumns].info()

In [None]:
@interact(x=cardinalNumericColumns)
def showScatterPlot(x):
    """Scatter-plot SalePrice against the column of `train` named by `x`."""
    scatterOptions = dict(x=x, y='SalePrice', marker='.', figsize=(16, 9))
    train.plot.scatter(**scatterOptions)

### Converting time columns into age columns

In [None]:
# Sale date as a single month counter: 12 per year elapsed since 2000 plus
# the month number of the sale.
train['SaleMonth'] = train['MoSold'] + 12 * (train['YrSold'] - 2000)

In [None]:
# Years between construction and sale.
train['AgeOfProperty'] = train['YrSold'].sub(train['YearBuilt'])

In [None]:
# Years between the remodel date and the sale.
train['AgeOfRemodel'] = train['YrSold'].sub(train['YearRemodAdd'])

In [None]:
# Years between the garage build year and the sale. Where GarageYrBlt is
# missing, fall back to the age of the property itself.
# The fill is part of the assignment expression rather than a separate
# fillna(inplace=True) on a column selection, which is chained assignment:
# pandas deprecates it and it does not update `train` under Copy-on-Write.
train['AgeOfGarage'] = (
    (train['YrSold'] - train['GarageYrBlt']).fillna(train['AgeOfProperty'])
)

In [None]:
# The raw year/month columns are superseded by the derived age columns, so
# mark them for removal. Names already present are skipped, so re-running
# this cell does not append duplicates.
columnsToRemove = columnsToRemove + [
    column
    for column in ['YrSold', 'MoSold', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt']
    if column not in columnsToRemove
]

In [None]:
# Derived time features built in the cells above.
ageColumns = ['SaleMonth', 'AgeOfProperty', 'AgeOfRemodel', 'AgeOfGarage']
# Reuse the showScatterPlot function from the cardinal-column scatter cell
# instead of redefining an identical function under the same name; only the
# list of selectable columns differs. The trailing semicolon hides the
# function repr that interact() returns.
interact(showScatterPlot, x=ageColumns);

In [None]:
# Correlation matrix of the numeric and age features, drawn as an image.
# matshow is pointed at the figure created here through its own number, not a
# hard-coded 1, so the plot lands in this figure even if others are open.
corrFigure = plt.figure(figsize=(16, 9))
plt.matshow(train[cardinalNumericColumns + ageColumns].corr(), fignum=corrFigure.number)

In [None]:
# Standardise every numeric/age feature column: subtract the column mean and
# divide by the column standard deviation.
normalized_train = train[cardinalNumericColumns + ageColumns].pipe(
    lambda features: (features - features.mean()) / features.std()
)

## Non-numeric columns

In [None]:
# Names of all columns whose dtype is neither int64 nor float64. This includes
# any columns added to `train` by earlier cells that have another dtype.
# train.dtypes is iterated once as (name, dtype) pairs instead of rebuilding
# dict(train.dtypes) for every column.
otherColumns = [
    name
    for name, dtype in train.dtypes.items()
    if dtype not in ('int64', 'float64')
]
print("Non numeric columns count: " + str(len(otherColumns)))

### Non-numeric columns with missing values

In [None]:
# Non-numeric columns dropped from the feature set.
# NOTE(review): presumably chosen because most of their values are missing;
# confirm against train.info().
discardedColumns = ["PoolQC", "Fence", "MiscFeature", "Alley", "FireplaceQu"]
# Names already present are skipped, so re-running this cell does not append
# duplicates.
columnsToRemove = columnsToRemove + [
    column for column in discardedColumns if column not in columnsToRemove
]

In [None]:
# Show the full list of columns excluded so far.
columnsToRemove

In [None]:
# Category features: every non-numeric column that is not marked for removal,
# followed by the numeric columns that encode a class.
categoryColumns = [
    *(name for name in otherColumns if name not in columnsToRemove),
    *classTypeNumericColumns,
]

### Category column visualisations

In [None]:
@interact(x=categoryColumns)
def showPlotBox(x):
    """Draw box plots of SalePrice grouped by the column of `train` named by `x`."""
    boxplotOptions = {'by': x, 'column': 'SalePrice', 'figsize': (16, 9)}
    train.boxplot(**boxplotOptions)

## Test data exploration

In [None]:
# Print a summary of the test frame (columns, non-null counts, dtypes).
test.info()

In [None]:
# Names of the test-set columns that contain at least one missing value.
naColumnsInTest = (
    test.isna()
    .any()
    .loc[lambda hasMissing: hasMissing]
    .index
    .tolist()
)

In [None]:
# Full feature list: numeric quantities, then derived ages, then categories.
featureColumns = [*cardinalNumericColumns, *ageColumns, *categoryColumns]

In [None]:
# Test-set columns with missing values that are also numeric or age features.
[
    column
    for column in naColumnsInTest
    if column in cardinalNumericColumns or column in ageColumns
]

In [None]:
# Synthetic data for the PCA experiment below: a and b are independent normal
# samples; c and d are linear combinations of a and b plus uniform noise.
# The global NumPy generator is seeded so the samples, and therefore the PCA
# output, are the same on every run.
np.random.seed(42)
a = np.random.normal(loc=100, scale=100, size=1000)
b = np.random.normal(loc=-10, scale=7, size=1000)
c = -4.5*a + 2*b + np.random.rand(1000)*0.1
d = 0.3*a - 4*b + np.random.rand(1000)*10

In [None]:
# Collect the four synthetic series as columns a, b, c, d of one frame.
df = pd.DataFrame(dict(a=a, b=b, c=c, d=d))

In [None]:
# Add a standardised copy (zero mean, unit standard deviation) of each
# synthetic column as norm_<name>. One loop replaces four copy-pasted lines,
# so the formula is written once.
for rawName in ['a', 'b', 'c', 'd']:
    df[f'norm_{rawName}'] = (df[rawName] - df[rawName].mean()) / df[rawName].std()

In [None]:
# Fit a whitening PCA on the four standardised synthetic columns and show the
# share of variance explained by each component.
# The estimator is bound to `pca`, the name the following lines use; it was
# previously assigned to `p`, so `pca.fit` raised NameError on a fresh kernel.
pca = PCA(whiten=True, svd_solver='full')
pca.fit(df[['norm_a', 'norm_b', 'norm_c', 'norm_d']])
pca.explained_variance_ratio_