# House Price Prediction Model

In [1]:
# import libraries
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [2]:
TargetColumn = 'SalePrice'
TrainDataPath = 'data/train.csv'
TestDataPath = 'data/test.csv'
DropColumns = ['Id']
KeyColumn = 'Id'
Categorical_Columns = []

In [3]:
# Read Data
train = pd.read_csv(TrainDataPath) 
test  = pd.read_csv(TestDataPath)

In [4]:
train.shape

(1460, 81)

In [5]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
test.shape

(1459, 80)

In [7]:
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [8]:
Categorical_Columns = [name for name in train.columns if train[name].dtype == "object"]
Categorical_Columns

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [9]:
train.dropna(axis=0, subset=[TargetColumn], inplace=True)

In [10]:
trainLen = train.shape[0]

In [11]:
data=pd.concat([train.iloc[:,:-1],test],axis=0)

In [12]:
data = data.drop(columns=DropColumns,axis=1)

In [13]:
def HandleMissingValues(df):
    # for Object columns fill using 'UNKOWN'
    # for Object columns fill using Mode()
    # for Numeric columns fill using median
    num_cols = [cname for cname in df.columns if df[cname].dtype in ['int64', 'float64']]
    cat_cols = [cname for cname in df.columns if df[cname].dtype == "object"]
    values = {}
    for a in cat_cols:
        #values[a] = 'UNKOWN'
        values[a] = df[a].mode()

    for a in num_cols:
        values[a] = df[a].median()
        
    df.fillna(value=values,inplace=True)

In [14]:
HandleMissingValues(data)

In [31]:
data.isnull().sum()

MSSubClass               0
LotFrontage              0
LotArea                  0
OverallQual              0
OverallCond              0
YearBuilt                0
YearRemodAdd             0
MasVnrArea               0
BsmtFinSF1               0
BsmtFinSF2               0
BsmtUnfSF                0
TotalBsmtSF              0
1stFlrSF                 0
2ndFlrSF                 0
LowQualFinSF             0
GrLivArea                0
BsmtFullBath             0
BsmtHalfBath             0
FullBath                 0
HalfBath                 0
BedroomAbvGr             0
KitchenAbvGr             0
TotRmsAbvGrd             0
Fireplaces               0
GarageYrBlt              0
GarageCars               0
GarageArea               0
WoodDeckSF               0
OpenPorchSF              0
EnclosedPorch            0
                        ..
GarageCond_TA            0
PavedDrive_N             0
PavedDrive_P             0
PavedDrive_Y             0
PoolQC_Ex                0
PoolQC_Fa                0
P

In [16]:
data = pd.get_dummies(data,columns = Categorical_Columns)

In [17]:
data.shape

(2919, 288)

In [18]:
train_data = data.iloc[:train.shape[0],:]
test_data  = data.iloc[train.shape[0]:,:]

In [19]:
train_data.shape

(1460, 288)

In [20]:
test_data.shape

(1459, 288)

In [21]:
X=train_data
y=train.loc[:,TargetColumn]

In [22]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25)

In [23]:
import xgboost as xgb
model_xgb = xgb.XGBRegressor(n_estimators=340, max_depth=2, learning_rate=0.2)
model_xgb.fit(X, y)
predictions = model_xgb.predict(X)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




In [24]:
print('\t\tMAE:', metrics.mean_absolute_error(y, predictions))
print('\t\tMSE:', metrics.mean_squared_error(y, predictions))
print('\t\tRMSE:', np.sqrt(metrics.mean_squared_error(y, predictions)))

		MAE: 8374.561932791095
		MSE: 128682497.47525106
		RMSE: 11343.83081129347


In [25]:
Final_predictions = model_xgb.predict(test_data)

In [26]:
Final_predictions

array([118512.21, 178529.52, 183302.73, ..., 154204.34, 115341.61,
       233041.98], dtype=float32)

In [27]:
submission = {
    KeyColumn: test[KeyColumn].values,
    TargetColumn: Final_predictions
}

In [28]:
submission

{'Id': array([1461, 1462, 1463, ..., 2917, 2918, 2919], dtype=int64),
 'SalePrice': array([118512.21, 178529.52, 183302.73, ..., 154204.34, 115341.61,
        233041.98], dtype=float32)}

In [29]:
solution = pd.DataFrame(submission)

In [30]:
solution.to_csv('data/submission.csv',index=False)