# <center>HOUSE PRICES</center>

<img src='https://storage.googleapis.com/kaggle-competitions/kaggle/5407/logos/front_page.png'>

In [None]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics as metrics
import sklearn.model_selection as ms
import sklearn.preprocessing as pp
from sklearn.impute import SimpleImputer

## Load Data Sets

In [None]:
# Read the two competition files, using the `Id` column as the row index of each.
df = pd.read_csv('train.csv', index_col='Id')
test_df = pd.read_csv('test.csv', index_col='Id')

df.head()

In [None]:
# Keep the regression target in its own variable; the next cell drops this
# column from `df`.
price = df['SalePrice']
print("Average sale price: " + "${:,.0f}".format(price.mean()))

## Merge test and train sets for preprocessing

In [None]:
# Drop the target so the training frame has the same columns as the test frame.
df = df.drop('SalePrice', axis=1)

# Stack the training rows on top of the test rows so both go through the same
# preprocessing. `DataFrame.append` was removed in pandas 2.0; `pd.concat`
# produces the same frame (training rows first, indexes preserved).
all_df = pd.concat([df, test_df])
all_df.shape

### Split categorical and numeric features
* Numeric features will be normalized
* Categorical features will be expanded and not normalized

In [None]:
# Names of all columns used as model inputs.
all_features = 'MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition'.split(',')

# Columns treated as continuous quantities (imputed and scaled further down).
# NOTE(review): '1stFlrSF' and '2ndFlrSF' are not listed here, so they end up in
# `categorical_features` and get one-hot expanded -- confirm that is intended.
numeric_features = [
    'LotFrontage', 'LotArea', 'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'LowQualFinSF', 'GrLivArea',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
    'TotalBsmtSF', 'Fireplaces',
    'GarageCars', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal',
]

# Everything not listed as numeric is treated as categorical; the original
# column order is kept.
categorical_features = [f for f in all_features if f not in numeric_features]

(len(all_features), len(categorical_features), len(numeric_features))

In [None]:
# Select only the numeric columns (training and test rows together).
numeric_df = all_df.loc[:, numeric_features]
numeric_df.shape

### Missing values
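Only numeric variables are imputed. Missing categorical values are left as they are: `get_dummies` does not create an NA indicator column by default, so a missing value becomes zeros in all of that feature's columns after expansion.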

In [None]:
# Work on a plain float array from here on. `DataFrame.as_matrix()` was removed
# in pandas 1.0; `to_numpy()` is its replacement.
X = numeric_df.to_numpy(dtype=float)

# Fill the missing entries of each column with that column's most frequent
# value. `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22;
# `SimpleImputer` always works column-wise (so there is no `axis` argument) and
# takes `np.nan` rather than the string 'NaN' as the missing-value marker.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X = imp.fit_transform(X)
X.shape

### Normalization & scale
Normalize numeric values

In [None]:
# Fit the scaler on the imputed numeric matrix and standardise it in one step;
# `scaler` keeps the fitted statistics.
scaler = pp.StandardScaler()
X = scaler.fit_transform(X)
X[0, :]

### Expand categorical into columns
The goal here is to expand all of them into indicator columns and then apply PCA to reduce noise.

In [None]:
def process_categorical(ndf, df, categorical_features, dummy_na=False):
    """Append one-hot indicator columns for each categorical feature to ``ndf``.

    Each feature ``f`` in ``categorical_features`` is expanded with
    ``pd.get_dummies`` and its columns are named ``"<f>_<level>"``. The prefix
    keeps levels that occur in several features (e.g. the same quality grade
    in different columns) from colliding. All indicator frames are joined to
    ``ndf`` in a single inner join on the index instead of one merge per
    feature.

    Parameters
    ----------
    ndf : pandas.DataFrame
        Frame the indicator columns are appended to.
    df : pandas.DataFrame
        Frame holding the raw categorical columns.
    categorical_features : iterable of str
        Names of the columns of ``df`` to expand.
    dummy_na : bool, default False
        When True, each feature also gets an indicator column for missing
        values. The default keeps the previous behaviour, where a missing
        value yields zeros in all of that feature's columns.

    Returns
    -------
    pandas.DataFrame
        ``ndf`` followed by the indicator columns (as floats), in feature
        order. ``ndf`` itself is returned when there is nothing to expand.
    """
    categorical_features = list(categorical_features)
    if not categorical_features:
        return ndf

    # dtype=float keeps the combined frame purely numeric; recent pandas would
    # otherwise return boolean indicator columns.
    encoded = [
        pd.get_dummies(df[f], prefix=f, dummy_na=dummy_na, dtype=float)
        for f in categorical_features
    ]

    # Inner join on the index, matching the original how='inner' index merge.
    return pd.concat([ndf] + encoded, axis=1, join='inner')

# Wrap the scaled numeric matrix in a frame indexed like `all_df`, so the
# indicator columns can be joined to it on the index.
numeric_df = pd.DataFrame(X, index=all_df.index)
combined_df = process_categorical(numeric_df, all_df, categorical_features)
combined_df.head()

In [None]:
# `DataFrame.as_matrix()` was removed in pandas 1.0. `to_numpy(dtype=float)`
# replaces it and also guarantees a numeric matrix when the indicator columns
# are stored as booleans.
X = combined_df.to_numpy(dtype=float)
X.shape

### PCA

In [None]:
from sklearn.decomposition import PCA

# Number of rows that came from the training file; they are the first rows of
# the combined matrix. (Despite its name, `test_n` counts training rows.)
test_n = df.shape[0]

# Fit the components on the training rows only, then project every row
# (training and test) onto them.
pca = PCA().fit(X[:test_n], price)
X = pca.transform(X)
X.shape

### Split the combined data back into train and test sets
Separate the train and test rows that were combined for preprocessing, then hold out part of the training rows for validation.

In [None]:
# Rows after the first `test_n` belong to the competition test file.
X_test = X[test_n:]

# Hold out 30% of the labelled rows for validation; the fixed seed makes the
# split reproducible.
X_train, X_val, y_train, y_val = ms.train_test_split(
    X[:test_n], price, test_size=0.3, random_state=0
)

(X_train.shape, X_val.shape, X_test.shape)

## Model

### Linear Regression Model

In [None]:
from sklearn import linear_model

# Ordinary least-squares baseline, fitted on the training split.
lr = linear_model.LinearRegression()
lr.fit(X_train, y_train)

# Later cells score and refit an estimator called `best` and label its output
# "ridge", but no cell creates it, so a fresh Restart & Run All stops with a
# NameError. Fit a ridge regression here, with the penalty strength picked by
# RidgeCV's built-in cross-validation.
# NOTE(review): the alpha grid is a placeholder for whatever search originally
# produced `best` -- confirm or replace it.
best = linear_model.RidgeCV(alphas=(0.1, 1.0, 10.0, 100.0))
best.fit(X_train, y_train)

In [None]:
def print_score(alg, score, params):
    """Print one line reporting a model's score and the parameters used.

    ``alg`` is the label shown for the model, ``score`` is formatted as a
    float, and ``params`` is serialised with ``json.dumps``.
    """
    serialized_params = json.dumps(params)
    message = '%s score is %f with params %s' % (alg, score, serialized_params)
    print(message)

### Gradient Boosting

In [None]:
import xgboost as xgb

# Booster settings: the evaluation metric is set to RMSE.
params = {'eval_metric': 'rmse'}

# Train on the training split, then predict the held-out validation rows.
xm = xgb.DMatrix(data=X_train, label=y_train)
xmodel = xgb.train(params, dtrain=xm)
xg_y_pred = xmodel.predict(xgb.DMatrix(data=X_val))

### Scoring

In [None]:
# R^2 of each model on the validation split.
xgb_r2 = metrics.r2_score(y_val, xg_y_pred)
lr_r2 = lr.score(X_val, y_val)

print('XGBoost score is %f' % xgb_r2)
print('Linear Regression score is %f' % lr_r2)

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the validation split for the estimator bound to `best`.
# NOTE(review): `best` must already be bound to a fitted estimator when this
# cell runs; the printed label says it is a ridge model.
y_val_pred = best.predict(X_val)
mse = mean_squared_error(y_val, y_val_pred)
print('ridge mean squared error is %s' % '{:,.2f}'.format(mse))

# Same metric for the XGBoost validation predictions.
bmse = mean_squared_error(y_val, xg_y_pred)
print('xgboost mean squared error is %s' % '{:,.2f}'.format(bmse))

In [None]:
def rmsle(y, y_):
    """Root mean squared logarithmic error between two sets of values.

    Computes ``sqrt(mean((log(1 + y) - log(1 + y_)) ** 2))``. The measure is
    symmetric in its two arguments.

    Parameters
    ----------
    y, y_ : array-like or iterable of numbers
        The two sets of values to compare, e.g. predictions and targets.

    Returns
    -------
    numpy.float64
        The error. Entries whose logarithm is undefined (values below -1) are
        treated as a log of 0, as ``np.nan_to_num`` does.
    """
    def _safe_log1p(values):
        # Materialise plain iterators so they can be converted to an array.
        if not hasattr(values, '__len__'):
            values = list(values)
        # Convert to float64 first: the logs are then computed in one
        # vectorised call and at full precision even for float32 input
        # (a per-element loop would compute them in float32).
        return np.nan_to_num(np.log1p(np.asarray(values, dtype=float)))

    squared_log_diff = (_safe_log1p(y) - _safe_log1p(y_)) ** 2
    return np.sqrt(np.mean(squared_log_diff))

# RMSLE of the XGBoost predictions on the validation split.
xgb_rmsle = rmsle(xg_y_pred, y_val)
print("XGBoost RMSLE is %f" % xgb_rmsle)

## Visualization (Predicted vs Actual)

### XGBoost

In [None]:
fig, ax = plt.subplots()

# One dot per validation row: actual price on x, XGBoost prediction on y.
ax.plot(y_val, xg_y_pred, 'b.')

# Dashed diagonal across the range of actual prices; points on it are
# predictions that equal the actual value.
price_range = [y_val.min(), y_val.max()]
ax.plot(price_range, price_range, 'k--')

ax.set(xlabel='Measured', ylabel='Predicted', title='XGBoost')
plt.show()

## Write Submission Files

In [None]:
# --- Submission from the estimator bound to `best` ---
# NOTE(review): `best` must already be bound to an estimator here. It is refit
# in place on all labelled rows, so any validation score computed from it after
# this cell would no longer be on held-out data.
best.fit(X[:test_n], price)
y_submit = best.predict(X_test)
# Negative predictions are replaced with 1.
y_submit[y_submit < 0] = 1.
test_df['SalePrice'] = y_submit
test_df.to_csv('submission.csv', columns=['SalePrice'])

# --- Submission from XGBoost, retrained on all labelled rows ---
full_train = xgb.DMatrix(X[:test_n], label=price)
xmodel = xgb.train(params, full_train)
y_submit = xmodel.predict(xgb.DMatrix(X_test))
y_submit[y_submit < 0] = 1.
# Overwrites the SalePrice column written for the previous submission.
test_df['SalePrice'] = y_submit
test_df.to_csv('xg_submission.csv', columns=['SalePrice'])