In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt

import warnings
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression,Lasso
from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_csv('train.csv')
df.shape
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
df.shape

(1460, 81)

In [4]:
df.isna().sum().sort_values(ascending=False)

PoolQC         1453
MiscFeature    1406
Alley          1369
Fence          1179
MasVnrType      872
               ... 
ExterQual         0
Exterior2nd       0
Exterior1st       0
RoofMatl          0
SalePrice         0
Length: 81, dtype: int64

In [5]:
df = df.drop(['PoolQC','MiscFeature','Alley','Fence','MasVnrType'],axis=1)
df.isnull().sum().sort_values(ascending=False)

FireplaceQu     690
LotFrontage     259
GarageType       81
GarageYrBlt      81
GarageFinish     81
               ... 
BsmtUnfSF         0
TotalBsmtSF       0
Heating           0
MSSubClass        0
SalePrice         0
Length: 76, dtype: int64

In [6]:
df = df.drop(['FireplaceQu'],axis=1)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 75 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [8]:
df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].mean())
df['GarageCond'] = df['GarageCond'].fillna(df['GarageCond'].mode()[0])
df['GarageType'] = df['GarageType'].fillna(df['GarageType'].mode()[0])
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['GarageYrBlt'].mode()[0])
df['GarageFinish'] = df['GarageFinish'].fillna(df['GarageFinish'].mode()[0])
df['GarageQual'] = df['GarageQual'].fillna(df['GarageQual'].mode()[0])
df['BsmtExposure'] = df['BsmtExposure'].fillna(df['BsmtExposure'].mode()[0])


In [9]:
df.isnull().sum().sort_values(ascending=False)

BsmtFinType2    38
BsmtQual        37
BsmtFinType1    37
BsmtCond        37
MasVnrArea       8
                ..
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
MSSubClass       0
SalePrice        0
Length: 75, dtype: int64

In [10]:
df['BsmtFinType2'] = df['BsmtFinType2'].fillna(df['BsmtFinType2'].mode()[0])
df['BsmtFinType1'] = df['BsmtFinType1'].fillna(df['BsmtFinType1'].mode()[0])
df['BsmtCond'] = df['BsmtCond'].fillna(df['BsmtCond'].mode()[0])
df['BsmtQual'] = df['BsmtQual'].fillna(df['BsmtQual'].mode()[0])
df.isnull().sum().sort_values(ascending=False)


MasVnrArea     8
Electrical     1
Id             0
HalfBath       0
Fireplaces     0
              ..
ExterQual      0
Exterior2nd    0
Exterior1st    0
RoofMatl       0
SalePrice      0
Length: 75, dtype: int64

In [12]:
df.MasVnrArea = df.MasVnrArea.fillna(df.MasVnrArea.mean())
df.Electrical = df.Electrical.fillna(df.Electrical.mode()[0])

In [13]:
# one hot encoding
df = pd.get_dummies(df)

In [14]:
df.shape

(1460, 268)

In [15]:
df.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,False,False,False,True,False,False,False,False,True,False
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,False,False,False,True,False,False,False,False,True,False
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,False,False,False,True,False,False,False,False,True,False
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,False,False,False,True,True,False,False,False,False,False
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,False,False,False,True,False,False,False,False,True,False


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Columns: 268 entries, Id to SaleCondition_Partial
dtypes: bool(230), float64(3), int64(35)
memory usage: 761.5 KB


In [18]:
bool_columns = df.select_dtypes(include='bool').columns
df[bool_columns] = df[bool_columns].astype(int)
df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,0.003425,0.083562,0.002055,0.867808,0.069178,0.00274,0.008219,0.013699,0.820548,0.085616
std,421.610009,42.300571,22.024023,9981.264932,1.382997,1.112799,30.202904,20.645407,180.569112,456.098091,...,0.05844,0.276824,0.045299,0.338815,0.253844,0.052289,0.090317,0.116277,0.383862,0.279893
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,365.75,20.0,60.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,730.5,50.0,70.049958,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,1095.25,70.0,79.0,11601.5,7.0,6.0,2000.0,2004.0,164.25,712.25,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
X = df.drop(['SalePrice'],axis=1)
y = df['SalePrice']
X_train, x_val, y_train, y_val = train_test_split(X,y,test_size=0.23,random_state=42)
xgb = XGBRegressor()
xgb.fit(X_train,y_train)



In [20]:
y_pred = xgb.predict(x_val)
# see id xfb is getting good score on validation set
print('MAE:',mean_absolute_error(y_val,y_pred))
print('RMSE:',np.sqrt(mean_squared_error(y_val,y_pred)))
print('R2 Score:',r2_score(y_val,y_pred))




MAE: 16922.626883370536
RMSE: 26068.412628368787
R2 Score: 0.9022982201923936


In [21]:
rf = RandomForestRegressor()
rf.fit(X_train,y_train)
y_pred = rf.predict(x_val)
print('MAE for random forest :',mean_absolute_error(y_val,y_pred))
print('RMSE for random forest :',np.sqrt(mean_squared_error(y_val,y_pred)))
print('R2 Score for random forest:',r2_score(y_val,y_pred))

MAE for random forest : 17105.057142857142
RMSE for random forest : 28319.72172119851
R2 Score for random forest: 0.8846941703725855


In [40]:
lasso = Lasso()
lasso.fit(X_train,y_train)
y_pred = lasso.predict(x_val)
print('MAE for lasso :',mean_absolute_error(y_val,y_pred))
print('RMSE for lasso :',np.sqrt(mean_squared_error(y_val,y_pred,squared=False)))
print('R2 Score for lasso:',r2_score(y_val,y_pred))


MAE for lasso : 19437.93183255132
RMSE for lasso : 176.40676585369738
R2 Score for lasso: 0.880716167181792


  model = cd_fast.enet_coordinate_descent(


In [23]:
linear = LinearRegression()
linear.fit(X_train,y_train)
y_pred = linear.predict(x_val)
print('MAE for linear :',mean_absolute_error(y_val,y_pred))
print('RMSE for linear :',np.sqrt(mean_squared_error(y_val,y_pred)))
print('R2 Score for linear:',r2_score(y_val,y_pred))


MAE for linear : 17459.19199391435
RMSE for linear : 28224.96589527476
R2 Score for linear: 0.8854644901219801


In [27]:
df_test = pd.read_csv('test.csv')
# same preprocessing as train data
df_test = pd.read_csv('test.csv')
df_test.drop(['PoolQC','MiscFeature','Alley','Fence','MasVnrType'],axis=1,inplace=True)
df_test.drop(['FireplaceQu'],axis=1,inplace=True)
df_test['LotFrontage'] = df_test['LotFrontage'].fillna(df_test['LotFrontage'].mean())
df_test['GarageCond'] = df_test['GarageCond'].fillna(df_test['GarageCond'].mode()[0])
df_test['GarageType'] = df_test['GarageType'].fillna(df_test['GarageType'].mode()[0])
df_test['GarageYrBlt'] = df_test['GarageYrBlt'].fillna(df_test['GarageYrBlt'].mode()[0])
df_test['GarageFinish'] = df_test['GarageFinish'].fillna(df_test['GarageFinish'].mode()[0])
df_test['GarageQual'] = df_test['GarageQual'].fillna(df_test['GarageQual'].mode()[0])
df_test['BsmtExposure'] = df_test['BsmtExposure'].fillna(df_test['BsmtExposure'].mode()[0])
df_test['BsmtFinType2'] = df_test['BsmtFinType2'].fillna(df_test['BsmtFinType2'].mode()[0])
df_test['BsmtFinType1'] = df_test['BsmtFinType1'].fillna(df_test['BsmtFinType1'].mode()[0])
df_test['BsmtCond'] = df_test['BsmtCond'].fillna(df_test['BsmtCond'].mode()[0])
df_test['BsmtQual'] = df_test['BsmtQual'].fillna(df_test['BsmtQual'].mode()[0])
df_test.MasVnrArea = df_test.MasVnrArea.fillna(df_test.MasVnrArea.mean())
df_test.Electrical = df_test.Electrical.fillna(df_test.Electrical.mode()[0])
df_test = pd.get_dummies(df_test)
df_test.shape




(1459, 251)

In [30]:
# check if any column is missing in test data
missing_cols = set(X.columns) - set(df_test.columns)
missing_cols


{'Condition2_RRAe',
 'Condition2_RRAn',
 'Condition2_RRNn',
 'Electrical_Mix',
 'Exterior1st_ImStucc',
 'Exterior1st_Stone',
 'Exterior2nd_Other',
 'GarageQual_Ex',
 'Heating_Floor',
 'Heating_OthW',
 'HouseStyle_2.5Fin',
 'RoofMatl_ClyTile',
 'RoofMatl_Membran',
 'RoofMatl_Metal',
 'RoofMatl_Roll',
 'Utilities_NoSeWa'}

In [None]:
X.drop(missing_cols,axis=1,inplace=True)


In [35]:
X_train, x_val, y_train, y_val = train_test_split(X,y,test_size=0.15,random_state=42)
xgb.fit(X_train,y_train)
y_pred = xgb.predict(x_val)
print ('R2 Score for xgb:',r2_score(y_val,y_pred))

R2 Score for xgb: 0.9064141923008606


In [36]:
# predict on test data
y_pred = xgb.predict(df_test)
y_pred

array([135043.08 , 173605.84 , 189692.06 , ..., 169429.   , 113478.555,
       219458.47 ], dtype=float32)

In [37]:
y_pred = pd.DataFrame(y_pred)
y_pred

Unnamed: 0,0
0,135043.078125
1,173605.843750
2,189692.062500
3,184203.046875
4,201870.140625
...,...
1454,84850.992188
1455,76925.429688
1456,169429.000000
1457,113478.554688


In [39]:
# save the predictions in csv file Id,SalePrice
y_pred = pd.concat([df_test['Id'],y_pred],axis=1)
y_pred.to_csv('submission.csv',index=False)