# SURPRISE HOUSING PRICE PROJECT

In [None]:
# Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.stats.stats import pearsonr
from sklearn import ensemble, tree, linear_model,cross_validation
from sklearn.model_selection import GridSearchCV
import xgboost as xgboost
import warnings
warnings.filterwarnings('ignore')


%matplotlib inline


In [None]:
# Load the raw training and test tables, then report their dimensions.
train_predictors = pd.read_csv('train.csv')
test_predictors = pd.read_csv('test.csv')

for label, frame in (("train", train_predictors), ("test", test_predictors)):
    print("{} : {}".format(label, frame.shape))

In [None]:
# Correlation of every numeric column with SalePrice, strongest positive first.
# The numeric columns are selected explicitly: on pandas >= 2.0 DataFrame.corr()
# no longer drops string columns silently and raises on them instead.
numeric_columns = train_predictors.select_dtypes(include=[np.number])
correlations = numeric_columns.corr()["SalePrice"].sort_values(ascending=False)
correlations

In [None]:
# Take a reference to the target column. SalePrice is NOT removed from
# train_predictors here; this cell only reads it.
train_target = train_predictors['SalePrice']

In [None]:
# Remember the row identifiers of both tables.
train_ID, test_ID = train_predictors['Id'], test_predictors['Id']

In [None]:
# Remove the Id column from both tables.
train_predictors = train_predictors.drop(columns="Id")
test_predictors = test_predictors.drop(columns="Id")

In [None]:
# Show the first 10 rows of train_predictors.
train_predictors.head(10)

In [None]:
# Scatter GrLivArea against SalePrice to look for outliers.
fig, ax = plt.subplots()
ax.scatter(train_predictors["GrLivArea"], train_predictors["SalePrice"], c='red')
ax.set_title("Outliers")
ax.set_xlabel("GrLivArea")
ax.set_ylabel("SalePrice")
plt.show()

In [None]:
# Keep only the training rows whose GrLivArea is below 4000.
living_area_ok = train_predictors["GrLivArea"] < 4000
train_predictors = train_predictors.loc[living_area_ok]

In [None]:
# Re-read the target from train_predictors as it stands now, then plot its distribution.
train_target = train_predictors['SalePrice']
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm the
# installed version still provides it.
sns.distplot(train_target)

In [None]:
# Distribution of the log-transformed target.
sns.distplot(np.log(train_target))
# Applying the log is intended to reduce the effect of very cheap and very
# expensive houses on the models.

In [None]:
# Natural log of the target; this is the series passed to train_test_split later,
# so model outputs are on the log scale.
train_target_final = np.log(train_target)

In [None]:
# Per-column NaN counts for both tables, placed side by side.
nan_counts = [pd.isna(frame).sum() for frame in (train_predictors, test_predictors)]
train_predictors_missing, test_missing = nan_counts
missing = pd.concat(nan_counts, axis=1, keys=["Train", "Test"])

# Keep only the columns that have at least one NaN in either table.
has_missing = missing.sum(axis=1) > 0
missing_values = missing[has_missing]
missing_values

In [None]:
# Categorical columns whose missing values are meaningful rather than unknown;
# which columns qualify is determined from the data description.
meaningful_missing = [
    "Alley",
    "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "FireplaceQu",
    "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    "Fence", "PoolQC", "MiscFeature",
]

In [None]:
# Replace NaN in the meaningful-missing columns with the literal label "None".
# The frames are rebound rather than calling fillna(..., inplace=True) on a column
# selection: under pandas Copy-on-Write that chained in-place call writes to a
# temporary object and leaves the frame unchanged.
none_fill = {column: "None" for column in meaningful_missing}
train_predictors = train_predictors.fillna(value=none_fill)
test_predictors = test_predictors.fillna(value=none_fill)

In [None]:
# Show the first 10 rows of train_predictors.
train_predictors.head(10)

In [None]:
# Recount the NaNs per column in both tables.
train_missing1, test_missing1 = (
    pd.isna(frame).sum() for frame in (train_predictors, test_predictors)
)
missing1 = pd.concat([train_missing1, test_missing1], axis=1, keys=["Train", "Test"])

# Columns that still have at least one NaN in either table.
still_missing = missing1.sum(axis=1) > 0
missing1_values = missing1[still_missing]
missing1_values

In [None]:
# LotFrontage is numeric but has many missing values, so the column is removed
# from both tables.
train_predictors = train_predictors.drop(columns="LotFrontage")
test_predictors = test_predictors.drop(columns="LotFrontage")

In [None]:
# List the columns currently in train_predictors.
train_predictors.columns

In [None]:
# (rows, columns) of test_predictors.
test_predictors.shape

In [None]:
# (rows, columns) of train_predictors.
train_predictors.shape

In [None]:
# Numeric-dtype columns of each table.
numeric_features_train, numeric_features_test = (
    frame.select_dtypes(include=[np.number])
    for frame in (train_predictors, test_predictors)
)

In [None]:
# Names of the numeric columns found in the training table.
numeric_features_train.columns

In [None]:
# Names of the numeric columns found in the test table.
numeric_features_test.columns

In [None]:
# MSSubClass and MoSold (month sold) are stored as numbers but are really
# categories, so their codes are replaced by string labels. One mapping is built
# and applied to both tables.
subclass_codes = [20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 150, 160, 180, 190]
month_names = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
               "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
numeric_to_label = {
    "MSSubClass": {code: "SC" + str(code) for code in subclass_codes},
    "MoSold": {month: name for month, name in enumerate(month_names, start=1)},
}

train_predictors = train_predictors.replace(numeric_to_label)
test_predictors = test_predictors.replace(numeric_to_label)

In [None]:
# Remove the target column from the training features.
train_predictors = train_predictors.drop(columns='SalePrice')

In [None]:
# Re-select the numeric columns of both tables.
numeric_dtypes = [np.number]
numeric_features_train = train_predictors.select_dtypes(include=numeric_dtypes)
numeric_features_test = test_predictors.select_dtypes(include=numeric_dtypes)

In [None]:
# Object-dtype (string) columns of both tables.
categorical_features_train, categorical_features_test = (
    frame.select_dtypes(include=[object])
    for frame in (train_predictors, test_predictors)
)

In [None]:
# Names of the numeric training columns.
numeric_features_train.columns

In [None]:
# Mean-impute the numeric gaps; each table is filled with its own column means.
# NOTE(review): the test table uses test-set means rather than the training means —
# confirm this is intended.
numeric_features_train = numeric_features_train.fillna(numeric_features_train.mean())
numeric_features_test = numeric_features_test.fillna(numeric_features_test.mean())

In [None]:
# (rows, columns) of the numeric training features.
numeric_features_train.shape

In [None]:
# (rows, columns) of the numeric test features.
numeric_features_test.shape

In [None]:
# Impute missing categoricals with each column's most frequent value.
# DataFrame.mode() returns a DataFrame (one row per tied mode). Passing a DataFrame
# to fillna aligns on row labels, so it would only fill rows whose index label
# matches a row of the mode table (0, 1, ...). Taking the first row gives a Series
# keyed by column name, which fillna applies down every column.
train_modes = categorical_features_train.mode().iloc[0]
test_modes = categorical_features_test.mode().iloc[0]
categorical_features_train = categorical_features_train.fillna(train_modes)
categorical_features_test = categorical_features_test.fillna(test_modes)

In [None]:
# (rows, columns) of the categorical training features.
categorical_features_train.shape

In [None]:
# (rows, columns) of the categorical test features.
categorical_features_test.shape

In [None]:
# Stack the training rows on top of the test rows so both are encoded together.
categorical_features_traintest = pd.concat(
    (categorical_features_train, categorical_features_test), axis=0
)

In [None]:
# Number of training rows.
ntrain = len(categorical_features_train)
ntrain

In [None]:
# Column names of the stacked categorical frame.
categorical_features_traintest.columns

In [None]:
# Backward-difference encode the listed categorical columns of the stacked frame.
import category_encoders as ce

columns_to_encode = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
    'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
    'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond',
    'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
    'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
    'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType',
    'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC',
    'Fence', 'MiscFeature', 'MoSold', 'SaleType', 'SaleCondition',
]
encoder = ce.BackwardDifferenceEncoder(cols=columns_to_encode)
categorical_encoded = encoder.fit_transform(categorical_features_traintest)

categorical_encoded.head()

In [None]:
# Split the encoded frame back apart by position: the first ntrain rows are the
# training rows, the remainder are the test rows.
categorical_train_encoded = categorical_encoded.iloc[:ntrain]
categorical_test_encoded = categorical_encoded.iloc[ntrain:]

In [None]:
# Put the numeric and encoded categorical columns side by side (aligned on index).
final_train = pd.concat(
    (numeric_features_train, categorical_train_encoded), axis="columns"
)
final_test = pd.concat(
    (numeric_features_test, categorical_test_encoded), axis="columns"
)

In [None]:
# Show the first rows of the assembled training frame.
final_train.head()

In [None]:
# Column names of the assembled training frame.
final_train.columns

In [None]:
# Column names of the assembled test frame.
final_test.columns

# FEATURE ENGINEERING

In [None]:
def _add_combined_features(frame):
    """Add the TotalBath, AllSF, AllFlrsSF and AllPorchSF columns to ``frame`` in place."""
    # Total number of bathrooms; each half bath counts as 0.5.
    frame["TotalBath"] = (
        frame["BsmtFullBath"]
        + (0.5 * frame["BsmtHalfBath"])
        + frame["FullBath"]
        + (0.5 * frame["HalfBath"])
    )
    # Total SF for the house (incl. basement).
    frame["AllSF"] = frame["GrLivArea"] + frame["TotalBsmtSF"]
    # Total SF for 1st + 2nd floors.
    frame["AllFlrsSF"] = frame["1stFlrSF"] + frame["2ndFlrSF"]
    # Total SF for porches.
    frame["AllPorchSF"] = (
        frame["OpenPorchSF"]
        + frame["EnclosedPorch"]
        + frame["3SsnPorch"]
        + frame["ScreenPorch"]
    )


# Same derived features for the training and the test frame.
_add_combined_features(final_train)
_add_combined_features(final_test)

In [None]:
# (rows, columns) of the training frame after the new features were added.
final_train.shape

In [None]:
# (rows, columns) of the test frame after the new features were added.
final_test.shape

# MODELING

In [None]:
# Hold out 30% of the training rows as a validation set; the seed is fixed so
# the split is repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    final_train, train_target_final, test_size=0.3, random_state=0
)
for label, part in (("X_train", X_train), ("X_test", X_test),
                    ("y_train", y_train), ("y_test", y_test)):
    print(label + " : " + str(part.shape))

In [None]:
# Baseline model: ordinary least squares fitted on the training split.
from sklearn.linear_model import LinearRegression

lm = LinearRegression().fit(X_train, y_train)
lm

In [None]:
# Linear-regression predictions for the training and held-out splits.
y_train_pred, y_test_pred = (lm.predict(part) for part in (X_train, X_test))

In [None]:
# Mean squared error of the linear model on both splits, followed by the model's
# score() on the held-out split.
train_residuals = y_train_pred - y_train
mse_train = np.mean(train_residuals ** 2)
print("MSE on Training set : ", mse_train)

test_residuals = y_test_pred - y_test
mse_test = np.mean(test_residuals ** 2)
print("MSE on Test set : ", mse_test)

print(lm.score(X_test, y_test))

In [None]:
# Both figures draw the same two series; only the y-values differ.
lo, hi = 10.5, 13.5  # horizontal extent of the red reference lines
series = (
    (y_train_pred, y_train, "blue", "Training data"),
    (y_test_pred, y_test, "lightgreen", "Validation data"),
)

# Residuals (prediction minus actual) against the predictions.
for preds, actual, colour, label in series:
    plt.scatter(preds, preds - actual, c=colour, marker="s", label=label)
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc="upper left")
plt.hlines(y=0, xmin=lo, xmax=hi, color="red")
plt.show()

# Actual values against the predictions, with the y = x line for reference.
for preds, actual, colour, label in series:
    plt.scatter(preds, actual, c=colour, marker="s", label=label)
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc="upper left")
plt.plot([lo, hi], [lo, hi], c="red")
plt.show()

In [None]:
# Ridge regression; RidgeCV picks the regularisation strength from the candidates.
from sklearn.linear_model import RidgeCV

candidate_alphas = [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20, 30, 40, 50, 60]
ridge = RidgeCV(alphas=candidate_alphas)
ridge.fit(X_train, y_train)
alpha = ridge.alpha_
print("Best alpha :", alpha)

In [None]:
# NOTE(review): this fits the same RidgeCV object on the same data a second time;
# it looks redundant with the fit in the previous cell — confirm before removing.
ridge.fit(X_train, y_train)

In [None]:
# Ridge predictions for the training and held-out splits.
y_train_ridge_pred, y_test_ridge_pred = (
    ridge.predict(part) for part in (X_train, X_test)
)

In [None]:
# Mean squared error of the ridge model on both splits, followed by the model's
# score() on the held-out split.
train_residuals = y_train_ridge_pred - y_train
mse_train = np.mean(train_residuals ** 2)
print("MSE on Training set : ", mse_train)

test_residuals = y_test_ridge_pred - y_test
mse_test = np.mean(test_residuals ** 2)
print("MSE on Test set : ", mse_test)

print(ridge.score(X_test, y_test))

# Author's note: the MSE on the test set was slightly reduced — re-check against
# the values printed above.

In [None]:
# Diagnostic plots for the ridge model. The titles name the ridge model; they
# previously read "Linear regression", copied from the earlier plotting cell.

# Residuals (prediction minus actual) against the predictions.
plt.scatter(y_train_ridge_pred, y_train_ridge_pred - y_train, c = "blue", marker = "s", label = "Training data")
plt.scatter(y_test_ridge_pred, y_test_ridge_pred - y_test, c = "lightgreen", marker = "s", label = "Validation data")
plt.title("Ridge regression")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc = "upper left")
plt.hlines(y = 0, xmin = 10.5, xmax = 13.5, color = "red")
plt.show()

# Actual values against the predictions, with the y = x line for reference.
plt.scatter(y_train_ridge_pred, y_train, c = "blue", marker = "s", label = "Training data")
plt.scatter(y_test_ridge_pred, y_test, c = "lightgreen", marker = "s", label = "Validation data")
plt.title("Ridge regression")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc = "upper left")
plt.plot([10.5, 13.5], [10.5, 13.5], c = "red")
plt.show()

In [None]:
# Bar chart of the 10 most negative and 10 most positive ridge coefficients.
coefs = pd.Series(ridge.coef_, index=X_train.columns)
n_nonzero = (coefs != 0).sum()
n_zero = (coefs == 0).sum()
print("Ridge picked " + str(n_nonzero) + " features and eliminated the other " +
      str(n_zero) + " features")

ordered = coefs.sort_values()
imp_coefs = pd.concat([ordered.head(10), ordered.tail(10)])
imp_coefs.plot(kind="barh")
plt.title("Coefficients in the Ridge Model")
plt.show()

In [None]:
# Lasso regression; LassoCV picks alpha from the candidates with 10-fold CV.
from sklearn.linear_model import LassoCV

lasso_alphas = [
    0.0001, 0.0002, 0.0004, 0.0006,
    0.001, 0.002, 0.004, 0.006,
    0.01, 0.02, 0.04, 0.06,
    0.1, 0.2, 0.4, 0.6,
    1,
]
lasso = LassoCV(alphas=lasso_alphas, max_iter=50000, cv=10)
lasso.fit(X_train, y_train)
alpha = lasso.alpha_
print("Best alpha :", alpha)

In [None]:
# NOTE(review): this fits the same LassoCV object on the same data a second time;
# it looks redundant with the fit in the previous cell — confirm before removing.
lasso.fit(X_train, y_train)

In [None]:
# Lasso predictions for the training and held-out splits.
y_train_lasso_pred, y_test_lasso_pred = (
    lasso.predict(part) for part in (X_train, X_test)
)

In [None]:
# Mean squared error of the lasso model on both splits, followed by the model's
# score() on the held-out split.
train_residuals = y_train_lasso_pred - y_train
mse_train = np.mean(train_residuals ** 2)
print("MSE on Training set : ", mse_train)

test_residuals = y_test_lasso_pred - y_test
mse_test = np.mean(test_residuals ** 2)
print("MSE on Test set : ", mse_test)

print(lasso.score(X_test, y_test))

# Author's note: the MSE on the test set was slightly reduced — re-check against
# the values printed above.

In [None]:
# Bar chart of the 10 most negative and 10 most positive lasso coefficients,
# plus a count of how many coefficients are exactly zero.
coefs = pd.Series(lasso.coef_, index=X_train.columns)
n_kept = (coefs != 0).sum()
n_dropped = (coefs == 0).sum()
print("Lasso picked " + str(n_kept) + " features and eliminated the other " +
      str(n_dropped) + " features")

ordered = coefs.sort_values()
imp_coefs = pd.concat([ordered.head(10), ordered.tail(10)])
imp_coefs.plot(kind="barh")
plt.title("Coefficients in the Lasso Model")
plt.show()

In [None]:
from xgboost import XGBRegressor

In [None]:
# Hyper-parameter grid searched for the XGBoost regressor.
params = {
    'min_child_weight': [4, 5],
    'gamma': [0.3, 0.4, 0.5],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'max_depth': [2, 3, 4],
}

In [None]:
# Exhaustive grid search over `params`, using GridSearchCV's default
# cross-validation and scoring.
xgb = XGBRegressor(nthread=-1)

grid = GridSearchCV(estimator=xgb, param_grid=params)
grid.fit(X_train, y_train)

In [None]:
# Predictions from the fitted grid search for the training and held-out splits.
y_train_xgb_pred, y_test_xgb_pred = (
    grid.predict(part) for part in (X_train, X_test)
)

In [None]:
# Mean squared error of the tuned XGBoost model on both splits, followed by the
# grid search's score() on the held-out split.
train_residuals = y_train_xgb_pred - y_train
mse_train = np.mean(train_residuals ** 2)
print("MSE on Training set : ", mse_train)

test_residuals = y_test_xgb_pred - y_test
mse_test = np.mean(test_residuals ** 2)
print("MSE on Test set : ", mse_test)

print(grid.score(X_test, y_test))

# Author's note: the MSE on the test set was slightly reduced — re-check against
# the values printed above.