[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/luandaoduy96/HousePricePrediction-/blob/master/predicthouseprice.ipynb)

In [0]:
#@title!apt-get update -qq 2>&1 > /dev/null
# NOTE(review): the line above starts with '#', so the `apt-get update` text on it is
# part of a comment and is not executed; the only update that runs is the one below.
# Install helper packages, add the alessandro-strada PPA, refresh the package index,
# then install google-drive-ocamlfuse and fuse.
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse 
# Mount Google Drive at /content/drive through the Colab helper module.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0)
import seaborn as sns
from scipy import stats
from scipy.stats import norm


In [0]:
# Read the train and test CSV files from the mounted Drive folder.
train = pd.read_csv("drive/My Drive/train.csv")
test = pd.read_csv("drive/My Drive/test.csv")

# Report the (rows, columns) shape of each frame.
print ('The train data has {0} rows and {1} columns'.format(*train.shape))
print ('----------------------------')
print ('The test data has {0} rows and {1} columns'.format(*test.shape))



In [0]:
# Missing-value report for the training frame: overall, numeric-only, categorical-only.
def _missing_counts(frame):
    """Return per-column null counts of `frame`, keeping only columns with nulls, ascending."""
    counts = frame.isnull().sum()
    return counts[counts > 0].sort_values()


miss = _missing_counts(train)
print(miss)
print('------------------------------------------------------------')

# Split the columns by dtype: everything non-object is treated as numeric.
numeric_data = train.select_dtypes(exclude='object')
cat_data = train.select_dtypes(include='object')
print ("There are {} numeric and {} categorical columns in train data".format(numeric_data.shape[1],cat_data.shape[1]))
print('------------------------------------------------------------')

miss = _missing_counts(numeric_data)
print(miss)
print('------------------------------------------------------------')

miss = _missing_counts(cat_data)
print(miss)


In [0]:
# Skewness and distribution plot of the raw target.
sale_price = train['SalePrice']
print(sale_price.skew())
sns.distplot(sale_price)
plt.show()

In [0]:
# Natural-log transform of the target, then re-check skewness/kurtosis and re-plot.
target = np.log(train['SalePrice'])
target_skew = target.skew()
target_kurt = target.kurt()
print ('Skewness is {} and Kurtosis is {}'.format(target_skew, target_kurt))
sns.distplot(target)
plt.show()

In [0]:
# Heat-map of the pairwise correlations between the numeric columns.
corr = numeric_data.corr()
sns.heatmap(corr)
plt.show()

# Discard the numeric columns whose correlation with SalePrice is zero or negative.
saleprice_corr = corr['SalePrice']
non_positive = saleprice_corr[saleprice_corr <= 0].index
numeric_data = numeric_data.drop(columns=non_positive)
print(numeric_data.columns)
print ('----------------------------------------------------------------------')
# Full ranking of the correlations with SalePrice, strongest first.
print (saleprice_corr.sort_values(ascending=False)[:], '\n')

In [0]:
# Names of the object-dtype (categorical) columns of the training frame.
cat = [col for col, dtype in train.dtypes.items() if dtype == 'object']
def anova(frame, features=None):
    """Rank categorical features by a one-way ANOVA of SalePrice across their levels.

    Args:
        frame: DataFrame holding a 'SalePrice' column plus the categorical columns.
        features: optional list of column names to test. Defaults to every
            non-numeric column of `frame` other than 'SalePrice'.

    Returns:
        DataFrame with columns 'features' and 'pval', sorted by ascending p-value
        (NaN p-values sort last).
    """
    if features is None:
        candidates = frame.drop(columns=['SalePrice']).select_dtypes(exclude='number')
        features = list(candidates.columns)

    pvals = []
    for feature in features:
        # groupby skips rows whose label is missing. Comparing against NaN with
        # `==` selects nothing, which would hand f_oneway an empty sample and
        # yield a NaN p-value for every column containing missing values.
        samples = [
            prices.values
            for _, prices in frame.groupby(feature)['SalePrice']
            if len(prices) > 0
        ]
        if len(samples) < 2:
            # The F-test needs at least two groups to compare.
            pvals.append(np.nan)
            continue
        pvals.append(stats.f_oneway(*samples)[1])

    anv = pd.DataFrame({'features': list(features), 'pval': pvals})
    return anv.sort_values('pval')

# Attach the target to the categorical frame so anova() can group prices by level.
cat_data['SalePrice'] = train['SalePrice'].values
k = anova(cat_data)

# "Disparity" is log(1/p): the smaller the p-value, the taller the bar.
pvalues = k['pval'].values
k['disparity'] = np.log(1. / pvalues)
sns.barplot(data=k, x='features', y='disparity')
plt.xticks(rotation=90)


In [0]:
print('------------------------------------------------------------')
# Stack train on top of test so both are imputed together. DataFrame.append was
# removed in pandas 2.0; pd.concat gives the same row order and, like append, keeps
# each frame's own index labels (so labels repeat across the two halves).
alldata = pd.concat([train, test])
all_numeric_data = alldata.select_dtypes(exclude = 'object')
all_cat_data = alldata.select_dtypes(include = 'object')
print(alldata.shape)
print('------------------------------------------------------------')
missnumeric = all_numeric_data.isnull().sum()
missnumeric = missnumeric[missnumeric > 0].sort_values()
print(missnumeric)
print('------------------------------------------------------------')
misscat = all_cat_data.isnull().sum()
misscat = misscat[misscat > 0].sort_values()
print(misscat)

# Numeric imputation. Columns in `mean_filled` get the column mean over train+test;
# the rest get fixed constants.
# NOTE(review): BsmtFullBath is filled with 2 while BsmtHalfBath is filled with 0 -
# confirm the 2 is intentional.
mean_filled = ['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
               'BsmtUnfSF', 'GarageArea', 'TotalBsmtSF']
fill_values = {col: all_numeric_data[col].mean() for col in mean_filled}
fill_values.update({'GarageYrBlt': 1980, 'GarageCars': 2,
                    'BsmtFullBath': 2, 'BsmtHalfBath': 0})
# One DataFrame-level fillna instead of `alldata[col].fillna(..., inplace=True)`:
# the chained in-place form acts on a temporary Series and is not guaranteed to
# write back to `alldata` under pandas copy-on-write.
alldata = alldata.fillna(value=fill_values)

all_numeric_data = alldata.select_dtypes(exclude = 'object')
miss = all_numeric_data.isnull().sum()
print(miss)
print('------------------------------------------------------------')
# Drop these five categorical columns entirely instead of imputing them.
dropvalue=['PoolQC','MiscFeature','Fence','FireplaceQu','Alley']
alldata = alldata.drop(columns=dropvalue)
# Refresh the categorical view so the printed column list reflects the drop.
all_cat_data = alldata.select_dtypes(include = 'object')
print(all_cat_data.columns)
print('------------------------------------------------------------')
misscat = alldata.isnull().sum()
misscat = misscat[misscat > 0].sort_values()
print(misscat)
print('------------------------------------------------------------')

In [0]:
# SalePrice against the overall quality rating.
var = 'OverallQual'
data = all_numeric_data[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000))



In [0]:
# SalePrice against above-ground living area.
var = 'GrLivArea'
data = all_numeric_data[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));

In [0]:
# Remove the rows with GrLivArea > 4000 before re-plotting.
# A boolean mask is used instead of drop(<index labels>): all_numeric_data comes from
# train stacked on test without resetting the index, so the same label can occur twice
# and a label-based drop would also delete unrelated rows that share it.
is_outlier = all_numeric_data['GrLivArea'] > 4000
all_numeric_data = all_numeric_data[~is_outlier]
var = 'GrLivArea'
data = pd.concat([all_numeric_data['SalePrice'], all_numeric_data[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0,800000),xlim=(0,6000));

In [0]:
# SalePrice against garage capacity (number of cars).
var = 'GarageCars'
data = all_numeric_data[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));

In [0]:
# SalePrice against garage area.
var = 'GarageArea'
data = all_numeric_data[['SalePrice', var]]
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000))

In [0]:
# Remove the rows with GarageArea > 1200 before re-plotting.
# A boolean mask is used instead of drop(<index labels>): all_numeric_data comes from
# train stacked on test without resetting the index, so the same label can occur twice
# and a label-based drop would also delete unrelated rows that share it.
is_outlier = all_numeric_data['GarageArea'] > 1200
all_numeric_data = all_numeric_data[~is_outlier]
var='GarageArea'
data = pd.concat([all_numeric_data['SalePrice'],all_numeric_data[var]],axis=1)
data.plot.scatter(y='SalePrice',x=var,ylim=(0,800000))

In [0]:
# Label encoder used by factorize() in this cell.
from sklearn.preprocessing import LabelEncoder

# Encode on a copy so that `alldata` itself is left as imputed.
alldata1 = alldata.copy()
all_numeric_data1 = alldata1.select_dtypes(exclude='object')
all_cat_data1 = alldata1.select_dtypes(include='object')

le = LabelEncoder()
def factorize(data, var, fill_na = None):
    """Label-encode column `var` of `data` with the module-level encoder `le`.

    Args:
        data: DataFrame to modify; the column is overwritten in this same object.
        var: name of the column to encode.
        fill_na: optional value substituted for missing entries before encoding.

    Returns:
        The same `data` object, with `var` replaced by integer codes.
    """
    if fill_na is not None:
        # Assign the filled column back explicitly; `data[var].fillna(..., inplace=True)`
        # works on a temporary Series and may not reach `data` under copy-on-write.
        data[var] = data[var].fillna(fill_na)
    data[var] = le.fit_transform(data[var])
    return data


# Ordinal quality scale shared by the quality/condition columns; missing maps to 0.
qual_dict = {np.nan: 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
name = np.array(['ExterQual','ExterCond','BsmtQual','BsmtCond','HeatingQC','KitchenQual', 'GarageQual','GarageCond'])
for quality_col in name:
    alldata1[quality_col] = alldata1[quality_col].map(qual_dict).astype(int)

# Remaining hand-written mappings, one per column; missing maps to 0 in each.
bsmt_fin_dict = {np.nan: 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6}
column_maps = [
    ("GarageFinish", {np.nan: 0, "Unf": 1, "RFn": 2, "Fin": 3}),
    ("GarageType", {np.nan: 0, "Detchd": 1, "CarPort": 2, "BuiltIn": 3, "Basment": 4, "Attchd": 5, "2Types": 6}),
    ("BsmtExposure", {np.nan: 0, "No": 1, "Mn": 2, "Av": 3, "Gd": 4}),
    ("BsmtFinType1", bsmt_fin_dict),
    ("BsmtFinType2", bsmt_fin_dict),
]
for mapped_col, mapping in column_maps:
    alldata1[mapped_col] = alldata1[mapped_col].map(mapping).astype(int)

# Binary flag: 1.0 where CentralAir is "Y", otherwise 0.0.
alldata1["CentralAir"] = (alldata1["CentralAir"] == "Y") * 1.0

# Columns that are label-encoded as they are, with no fill value.
varst = np.array(['LotConfig','Neighborhood','Condition1','BldgType','HouseStyle','RoofStyle','Foundation','SaleCondition','Condition2','Heating','LandContour','LandSlope','PavedDrive','RoofMatl','Street','LotShape'])
for plain_col in varst:
    factorize(alldata1, plain_col)

# Columns whose missing entries are replaced by the given value before label encoding.
fill_then_encode = [
    ("MSZoning", "RL"),
    ("Exterior1st", "Other"),
    ("Exterior2nd", "Other"),
    ("MasVnrType", "None"),
    ("SaleType", "Oth"),
    ("Utilities", "ELO"),
    ("Functional", "Sal"),
    ("Electrical", "Mix"),
]
for filled_col, filler in fill_then_encode:
    alldata1 = factorize(alldata1, filled_col, filler)

# Re-derive the dtype views and report any object columns / nulls that remain.
all_numeric_data1 = alldata1.select_dtypes(exclude='object')
all_cat_data1 = alldata1.select_dtypes(include='object')
print(all_cat_data1.columns)
misscat = all_cat_data1.isnull().sum()
misscat = misscat[misscat > 0].sort_values()
print(misscat)
print('------------------------------------------------------------')
print(alldata1.shape)

In [0]:
# Split the combined frame back into train (SalePrice present) and test (SalePrice
# missing). .copy() makes both independent frames, so the column assignments below
# do not write into slices of alldata1.
train_new = alldata1[alldata1['SalePrice'].notnull()].copy()
miss = train_new.isnull().sum()
miss=miss[miss>0]
print(miss)

test_new = alldata1[alldata1['SalePrice'].isnull()].copy()
print(train_new.shape,test_new.shape)
print('------------------------------------------------------------')
columns = [f for f in train_new.columns if train_new[f].dtype != object]
print(columns)
#transform the numeric features using log(x + 1)
from scipy.stats import skew
# Skewness is measured on the training rows only; columns above 0.75 are logged in
# both frames.
skewed = train_new[columns].apply(lambda x: skew(x.dropna().astype(float)))
skewed = skewed[skewed > 0.75]
skewed = skewed.index
train_new[skewed] = np.log1p(train_new[skewed])
test_new[skewed] = np.log1p(test_new[skewed])
test_new = test_new.drop(columns='SalePrice')
print(train_new.columns)

from sklearn.preprocessing import StandardScaler
# Standardise the feature columns with statistics learned from the training rows
# only, and apply that same fitted scaler to the test rows. Re-fitting on the test
# set would give each set its own mean/std, so identical raw values would map to
# different inputs at prediction time.
# SalePrice is left out of the scaler because test has no such column.
columns.remove('SalePrice')
scaler = StandardScaler()
scaler.fit(train_new[columns])

scaled = scaler.transform(train_new[columns])
for i, col in enumerate(columns):
      train_new[col] = scaled[:,i]

scaled = scaler.transform(test_new[columns])
for i, col in enumerate(columns):
      test_new[col] = scaled[:,i]
    


In [0]:
def onehot(onehot_df, df, column_name, fill_na):
       onehot_df[column_name] = df[column_name]
       if fill_na is not None:
            onehot_df[column_name].fillna(fill_na, inplace=True)

       dummies = pd.get_dummies(onehot_df[column_name], prefix="_"+column_name)
       onehot_df = onehot_df.join(dummies)
       onehot_df = onehot_df.drop([column_name], axis=1)
       return onehot_df

def munge_onehot(df):
    """Build the one-hot feature frame for `df` (a raw train or test frame).

    Each listed column is expanded with onehot(), in the listed order, and the three
    year columns are bucketed into 20-year bins before being expanded.

    Args:
        df: DataFrame containing the raw categorical columns named below plus
            GarageYrBlt, YearBuilt and YearRemodAdd.

    Returns:
        DataFrame indexed like `df`, holding only indicator columns.
    """
    # (column, fill value) pairs; None leaves missing entries unfilled. The order
    # determines the order of the resulting indicator columns.
    column_specs = [
        ("MSSubClass", None),
        ("MSZoning", "RL"),
        ("LotConfig", None),
        ("Neighborhood", None),
        ("Condition1", None),
        ("BldgType", None),
        ("HouseStyle", None),
        ("RoofStyle", None),
        ("Exterior1st", "VinylSd"),
        ("Exterior2nd", "VinylSd"),
        ("Foundation", None),
        ("SaleType", "WD"),
        ("SaleCondition", "Normal"),
        ("MasVnrType", "None"),

        ("LotShape", None),
        ("LandContour", None),
        ("LandSlope", None),
        ("Electrical", "SBrkr"),
        ("GarageType", "None"),
        ("PavedDrive", None),
        ("Street", None),
        ("Condition2", None),
        ("RoofMatl", None),
        ("Heating", None),

        # we'll have these as numerical variables too
        ("ExterQual", "None"),
        ("ExterCond", "None"),
        ("BsmtQual", "None"),
        ("BsmtCond", "None"),
        ("HeatingQC", "None"),
        ("KitchenQual", "TA"),
        ("GarageQual", "None"),
        ("GarageCond", "None"),
        ("BsmtExposure", "None"),
        ("BsmtFinType1", "None"),
        ("BsmtFinType2", "None"),
        ("Functional", "Typ"),
        ("GarageFinish", "None"),
        ("MoSold", None),
    ]

    onehot_df = pd.DataFrame(index = df.index)
    for column_name, fill_na in column_specs:
        onehot_df = onehot(onehot_df, df, column_name, fill_na)

    # Divide the years between 1871 and 2010 into slices of 20 years:
    # YearBin1 = 1871-1890, ..., YearBin7 = 1991-2010.
    year_map = pd.concat([
        pd.Series("YearBin" + str(i+1), index=range(1871+i*20, 1891+i*20))
        for i in range(0, 7)
    ])
    yearbin_df = pd.DataFrame(index = df.index)
    # Garage years that are missing or outside the binned range become "NoGarage".
    # The fill is chained onto the mapped Series rather than done in place on a
    # column selection, which may not write back under copy-on-write.
    yearbin_df["GarageYrBltBin"] = df.GarageYrBlt.map(year_map).fillna("NoGarage")
    yearbin_df["YearBuiltBin"] = df.YearBuilt.map(year_map)
    yearbin_df["YearRemodAddBin"] = df.YearRemodAdd.map(year_map)

    for bin_column in ("GarageYrBltBin", "YearBuiltBin", "YearRemodAddBin"):
        onehot_df = onehot(onehot_df, yearbin_df, bin_column, None)
    return onehot_df

#create one-hot features
# Build the one-hot features from the raw frames and join them on the row index.
onehot_df = munge_onehot(train)
train_new = train_new.join(onehot_df)

onehot_df = munge_onehot(test)
test_new = test_new.join(onehot_df)
print(train_new.shape,test_new.shape)

# Training label: natural log of the raw SalePrice, aligned to train_new's index.
label_df = pd.DataFrame({'SalePrice': np.log(train['SalePrice'])}, index=train_new.index)
train_new = train_new.drop(columns='SalePrice')

# Indicator columns removed from one frame or the other.
# presumably these are the levels that occur in only one of train/test - verify.
train_new_drop = [
    '_HouseStyle_2.5Fin',
    '_Exterior1st_ImStucc',
    '_Exterior1st_Stone',
    '_Exterior2nd_Other',
    '_Electrical_Mix',
    '_Condition2_RRAn',
    '_Condition2_RRAe',
    "_Condition2_RRNn",
    "_RoofMatl_Membran",
    "_RoofMatl_Metal",
    "_RoofMatl_Roll",
    '_RoofMatl_ClyTile',
    "_Heating_Floor",
    "_Heating_OthW",
    '_GarageQual_Ex',
]
test_new_drop = ['_MSSubClass_150']
train_new = train_new.drop(columns=train_new_drop)
test_new = test_new.drop(columns=test_new_drop)

print(train_new.columns[300:344])
print(test_new.columns[300:344])
print("Training set size:", train_new.shape)
print("Test set size:", test_new.shape)


In [0]:
import xgboost as xgb
# Hyper-parameters of the gradient-boosted regressor, gathered in one place.
xgb_params = dict(
    colsample_bytree=0.2,
    gamma=0.0,
    learning_rate=0.05,
    max_depth=6,
    min_child_weight=1.5,
    n_estimators=7200,
    reg_alpha=0.9,
    reg_lambda=0.6,
    subsample=0.2,
    seed=42,
    silent=1,
)
regr = xgb.XGBRegressor(**xgb_params)

# Fit on the full training matrix against the log-price label.
regr.fit(train_new, label_df)


In [0]:
from sklearn.metrics import mean_squared_error
def rmse(y_test,y_pred):
    """Return the square root of the mean squared error between y_test and y_pred."""
    mse = mean_squared_error(y_test, y_pred)
    return np.sqrt(mse)
# Training-set RMSE of the model currently bound to `regr`.
y_test = label_df
y_pred = regr.predict(train_new)
print("XGBoost score on training set: ", rmse(y_test, y_pred))

from sklearn.linear_model import Lasso

# Regularisation strength; per the original author it was chosen by cross-validation.
best_alpha = 0.00099

regr = Lasso(alpha=best_alpha, max_iter=50000)
regr.fit(train_new, label_df)

# Training-set RMSE again, as a rough sanity check of the Lasso fit.
y_test = label_df
y_pred = regr.predict(train_new)
print("Lasso score on training set: ", rmse(y_test, y_pred))

# Predict log-prices for the test set, undo the log and write the submission file.
y_pred_lasso = regr.predict(test_new)
lasso_ex = np.exp(y_pred_lasso)
pred1 = pd.DataFrame({'Id': test['Id'], 'SalePrice': lasso_ex})
pred1.to_csv('lasso_model.csv', header=True, index=False)

In [0]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global random generator.
# NOTE(review): the seed is set again (to 7) further down in this cell, after
# base_model is defined.
np.random.seed(10)

#create Model
#define base model
def base_model(input_dim=344):
    """Build and compile the feed-forward regression network.

    Architecture: Dense(20, relu) -> Dense(10, relu) -> Dense(1), trained with mean
    squared error and the Adam optimizer.

    Args:
        input_dim: number of input features. Defaults to 344, the value that was
            previously hard-coded, so calling with no arguments is unchanged.

    Returns:
        The compiled Sequential model.
    """
    # `kernel_initializer` is the Keras 2 name of the weight-initialiser argument;
    # the Keras 1 spelling `init` is not accepted by current Dense layers.
    model = Sequential()
    model.add(Dense(20, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
    model.add(Dense(10, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer = 'adam')
    return model

seed = 7
np.random.seed(seed)

# Fit the scaler on the training features only and reuse it for the test features.
# Re-fitting on the test set would standardise it with different statistics than
# the ones the network was trained on.
scale = StandardScaler()
X_train = scale.fit_transform(train_new)
X_test = scale.transform(test_new)

# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
keras_label = label_df.values
# `epochs` is the Keras 2 argument name. NOTE(review): the Keras 2 scikit-learn
# wrapper appears to forward only arguments named in Sequential.fit's signature, so
# the legacy `nb_epoch` would leave training at the default epoch count - confirm
# against the installed Keras version.
clf = KerasRegressor(build_fn=base_model, epochs=1000, batch_size=5,verbose=0)
clf.fit(X_train,keras_label)

#make predictions and create the submission file 
kpred = clf.predict(X_test) 
# The network was trained on log prices, so exponentiate to get prices back.
kpred = np.exp(kpred)
pred_df = pd.DataFrame(kpred, index=test["Id"], columns=["SalePrice"]) 
pred_df.to_csv('keras1.csv', header=True, index_label='Id')