In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

Loading train and test data

In [None]:
import os
import pandas as pd
# Switch the working directory to the competition's input folder so the CSV files
# can be read by bare file name. A later cell re-reads 'test.csv' with a relative
# path and therefore depends on this chdir still being in effect.
os.chdir(r"/kaggle/input/house-prices-advanced-regression-techniques/")
train=pd.read_csv("train.csv")
test=pd.read_csv("test.csv")

Understanding train and test data

In [None]:
print (f"Train has {train.shape[0]} rows and {train.shape[1]} columns")
print (f"Test has {test.shape[0]} rows and {test.shape[1]} columns")

Checking Null Values

In [None]:
def total_NAs(data) :
    """Print the total number of missing (NA) cells found in ``data``."""
    na_per_column = data.isna().sum()
    print ("Total NAs:",na_per_column.sum())

In [None]:
total_NAs(train)

In [None]:
total_NAs(test)

In [None]:
def missing_percentage(df):
    """Summarise the missing values of a DataFrame, column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with 'TotalMissing' (number of NA cells) and
        'PercentMissing' (that count as a percentage of ``len(df)``, rounded to
        two decimals). Only columns with at least one missing value are listed,
        ordered by missing count, largest first.
    """
    # Count the NAs once and reuse the result.
    missing = df.isna().sum()
    total = missing[missing != 0].sort_values(ascending=False)
    # Derive the percentage from the already-filtered counts so both output
    # columns always describe the same rows. Filtering on the rounded percentage
    # instead would drop columns whose share rounds to 0.00 and leave NaN behind
    # after the concat.
    percent = (total / len(df) * 100).round(2)
    return pd.concat([total, percent], axis=1, keys=['TotalMissing','PercentMissing'])

In [None]:
missing_percentage(train)

In [None]:
missing_percentage(test)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Checking the distribution of the target variable.
# A plain 40-bin histogram drawn through pandas/matplotlib: sns.distplot is
# deprecated in seaborn, and with kde=False it only drew a histogram anyway.
ax = train["SalePrice"].plot.hist(bins=40)
ax.set_xlabel("SalePrice");

In [None]:
#skewness and kurtosis
print("Skewness: " + str(train['SalePrice'].skew()))
print("Kurtosis: " + str(train['SalePrice'].kurt()))

SalePrice is right-skewed and has high kurtosis (heavy tails). A transformation should be applied.

In [None]:
# Check for any outliers in the target variable.
# The column is passed as x= explicitly: recent seaborn releases no longer accept
# the variable as a positional argument, while the keyword form works everywhere.
sns.boxplot(x="SalePrice", data = train)

There are a few outliers.

In [None]:
# Getting the squared correlation of every numeric feature with the target variable.
# Non-numeric columns are excluded explicitly (recent pandas raises on them instead
# of silently skipping), and the target's own entry is dropped by label rather
# than by assuming it sorts first.
(train.select_dtypes(include='number').corr()**2)["SalePrice"].drop("SalePrice").sort_values(ascending = False)

Feature Engineering

In [None]:
# Function to convert all object columns to categorical
def convert_obj_categ(data) :
    """Convert every object-dtype column of ``data`` to the 'category' dtype, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify; its object columns are replaced by categorical ones.

    Returns
    -------
    None
        The frame is mutated and the names of all categorical columns are printed.
    """
    categ_cols=data.select_dtypes(include='object').columns
    # Convert each column exactly once. The earlier loop ignored its loop variable
    # and re-converted the whole column set on every iteration.
    for col in categ_cols :
        data[col]=data[col].astype('category')
    print (data.select_dtypes(include='category').columns, "columns are converted to categorical")

In [None]:
convert_obj_categ(train)

In [None]:
convert_obj_categ(test)

In [None]:
def convert_other_categ(data,other_categ_cols) :
    """Convert the named columns of ``data`` to the 'category' dtype, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify.
    other_categ_cols : list of str
        Names of the columns to convert (e.g. numeric codes that are really labels).

    Returns
    -------
    None
        The frame is mutated and the names of all categorical columns are printed.
    """
    # Convert each requested column exactly once. The earlier loop ignored its loop
    # variable and re-converted the whole list on every iteration.
    for col in other_categ_cols :
        data[col]=data[col].astype('category')
    print (data.select_dtypes(include='category').columns, "columns are converted to categorical")

In [None]:
# Numeric-coded columns that are treated as categorical in both frames.
_other_categ_cols = [
    'MSSubClass', 'YearBuilt', 'YearRemodAdd',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageYrBlt', 'GarageCars', 'MoSold', 'YrSold',
    'OverallQual', 'OverallCond',
]
for _frame in (train, test):
    convert_other_categ(_frame, _other_categ_cols)

In [None]:
test.dtypes

Handling Null Values

In [None]:
num_cols=train.select_dtypes(include=['int64','float64']).columns
num_cols=num_cols.drop(['SalePrice','Id'])
cat_cols=train.select_dtypes(include='category').columns
print(num_cols);print(cat_cols)

In [None]:
from sklearn.impute import SimpleImputer
num_imputer=SimpleImputer()
cat_imputer=SimpleImputer(strategy='most_frequent')
num_imputer.fit(train[num_cols])
cat_imputer.fit(train[cat_cols])

In [None]:
# Impute values for NAs in train and test
train[num_cols]=num_imputer.transform(train[num_cols])
test[num_cols]=num_imputer.transform(test[num_cols])
train[cat_cols]=cat_imputer.transform(train[cat_cols])
test[cat_cols]=cat_imputer.transform(test[cat_cols])

In [None]:
convert_obj_categ(train)
convert_obj_categ(test)

In [None]:
# Re-apply the categorical conversion to the numeric-coded columns of both frames
# after the imputation step.
_recast_categ_cols = [
    'MSSubClass', 'YearBuilt', 'YearRemodAdd',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageYrBlt', 'GarageCars', 'MoSold', 'YrSold',
    'OverallQual', 'OverallCond',
]
for _frame in (train, test):
    convert_other_categ(_frame, _recast_categ_cols)

In [None]:
total_NAs(train)

In [None]:
total_NAs(test)

In [None]:
test.dtypes

In [None]:
def mean_encode(df1,df2,columns,target,alpha):
    """Smoothed target (mean) encoding, applied in place to both frames.

    For each column in ``columns`` every category is replaced by

        (category_mean * category_count + global_mean * alpha) / (category_count + alpha)

    where the statistics come from ``df1`` only.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame that supplies the statistics; must contain ``target``. Encoded in place.
    df2 : pandas.DataFrame
        Second frame, encoded in place with the mapping learned from ``df1``.
    columns : list of str
        Categorical columns to encode; must exist in both frames.
    target : str
        Name of the target column in ``df1``.
    alpha : number
        Smoothing weight given to the global mean.

    Returns
    -------
    None

    Notes
    -----
    Encoded columns are always stored as float64. A value with no statistic
    (a category that never occurs in ``df1``, or a missing value) is encoded as
    the global mean, i.e. the smoothed estimate for a category with zero
    observations, instead of being left as NaN.
    NOTE(review): the statistics use every row of ``df1``, so a validation split
    taken from ``df1`` afterwards has already seen its own targets. Confirm this
    leakage is acceptable.
    """
    # global mean
    global_mean=df1[target].mean()

    for feature in columns:
        # mean and count
        agg_data=df1.groupby([feature])[target].agg(['count','mean'])
        count=agg_data['count']
        mean=agg_data['mean']

        # Smoothed mean
        smoothed_labels=(mean*count+global_mean*alpha)/(count+alpha)
        for frame in (df1, df2):
            # Cast to float before filling: a categorical result cannot be filled
            # with a value that is not among its categories.
            encoded = frame[feature].map(smoothed_labels).astype('float64')
            frame[feature] = encoded.fillna(global_mean)

In [None]:
categ_columns=['MSSubClass','MSZoning','Street','Alley','LotShape','LandContour','Utilities','LotConfig','LandSlope','Neighborhood','Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','MasVnrType','Foundation','BsmtExposure','BsmtFinType1','BsmtFinType2','Heating','CentralAir','Electrical','GarageType','GarageFinish','PavedDrive','Fence','MiscFeature','SaleType','SaleCondition','Functional']

In [None]:
mean_encode(train,test,categ_columns,'SalePrice',10)

In [None]:
# Cast the mean-encoded columns to float in BOTH frames. Casting only train left
# test with a different dtype for the same features.
for frame in (train, test):
    for i in categ_columns:
        frame[i]=frame[i].astype('float64')

In [None]:
total_NAs(train)

In [None]:
total_NAs(test)

In [None]:
train.head()

In [None]:
# Ordinal quality scale, worst to best: Fa (fair) < TA (typical/average) < Gd < Ex.
# TA was previously ranked below Fa, which inverted the two lowest levels.
label_map1={'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}
train['ExterQual']=train['ExterQual'].map(label_map1)
test['ExterQual'] = test['ExterQual'].map(label_map1)
train['ExterQual']=train['ExterQual'].astype('int32')
test['ExterQual'] = test['ExterQual'].astype('int32')

In [None]:
# Ordinal condition scale, worst to best: Po (poor) < Fa < TA < Gd < Ex.
# TA was previously ranked below Fa, which inverted those two levels.
label_map2={'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
train['ExterCond']=train['ExterCond'].map(label_map2)
test['ExterCond'] = test['ExterCond'].map(label_map2)
train['ExterCond']=train['ExterCond'].astype('int32')
test['ExterCond'] = test['ExterCond'].astype('int32')

In [None]:
# Encode BsmtQual with label_map1 and store it as int32 in both frames.
for _frame in (train, test):
    _frame['BsmtQual'] = _frame['BsmtQual'].map(label_map1).astype('int32')

In [None]:
# Ordinal condition scale, worst to best: Po (poor) < Fa < TA < Gd.
# TA was previously ranked below Fa, which inverted those two levels.
label_map3={'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4}
train['BsmtCond']=train['BsmtCond'].map(label_map3)
test['BsmtCond'] = test['BsmtCond'].map(label_map3)
train['BsmtCond']=train['BsmtCond'].astype('int32')
test['BsmtCond'] = test['BsmtCond'].astype('int32')

In [None]:
# Encode HeatingQC with label_map2 and store it as int32 in both frames.
for _frame in (train, test):
    _frame['HeatingQC'] = _frame['HeatingQC'].map(label_map2).astype('int32')

In [None]:
# Encode KitchenQual with label_map1 and store it as int32 in both frames.
for _frame in (train, test):
    _frame['KitchenQual'] = _frame['KitchenQual'].map(label_map1).astype('int32')

In [None]:
# Encode FireplaceQu with label_map2 and store it as int32 in both frames.
for _frame in (train, test):
    _frame['FireplaceQu'] = _frame['FireplaceQu'].map(label_map2).astype('int32')

In [None]:
# Encode GarageQual with label_map2 and store it as int32 in both frames.
for _frame in (train, test):
    _frame['GarageQual'] = _frame['GarageQual'].map(label_map2).astype('int32')

In [None]:
# Encode GarageCond with label_map2 and store it as int32 in both frames.
for _frame in (train, test):
    _frame['GarageCond'] = _frame['GarageCond'].map(label_map2).astype('int32')

In [None]:
# Pool quality codes mapped to integers, then stored as int32 in both frames.
label_map4 = dict(Fa=1, Gd=2, Ex=3)
for _frame in (train, test):
    _frame['PoolQC'] = _frame['PoolQC'].map(label_map4).astype('int32')

In [None]:
ord_columns=['OverallQual','OverallCond','ExterQual','ExterCond','BsmtQual','BsmtCond','HeatingQC','KitchenQual','FireplaceQu','GarageCars','GarageQual','GarageCond','PoolQC','BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces']

In [None]:
# Store the ordinal/count features as int32 in BOTH frames. Casting only train
# left the same features in test with a different dtype.
for frame in (train, test):
    for i in ord_columns:
        frame[i]=frame[i].astype('int32')

In [None]:
def drop_unimp_cols(data,unimp_cols) :
    """Announce, then remove, the columns ``unimp_cols`` from ``data`` in place."""
    print("Deleting unimportant columns", unimp_cols)
    data.drop(columns=unimp_cols,inplace=True)

In [None]:
drop_unimp_cols(train,['Id'])
drop_unimp_cols(test,['Id'])

In [None]:
def ageOfHouse(dataset, sold_pos=76, ref_pos=19, age_pos=80):
    """Fill the column at ``age_pos`` with column[sold_pos] - column[ref_pos].

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to modify in place; the destination column must already exist.
    sold_pos, ref_pos : int
        Positional indices of the minuend and subtrahend columns. The defaults
        are the positions this function has always used.
        NOTE(review): presumably the sale-year and a build/remodel-year column
        in the training frame. Confirm against train.columns.
    age_pos : int
        Positional index of the destination column (default 80).

    Returns
    -------
    None
        Prints the number of rows and writes the difference as a float column.
    """
    print(len(dataset))
    # One vectorised subtraction instead of a per-row iloc loop. Casting to float
    # first lets categorical or object year columns be subtracted and gives the
    # result a numeric dtype.
    sold = dataset.iloc[:, sold_pos].astype('float64')
    ref = dataset.iloc[:, ref_pos].astype('float64')
    dataset[dataset.columns[age_pos]] = sold - ref

In [None]:
# Create the destination column as float NaN rather than '' so it holds numbers
# in a numeric dtype instead of Python objects.
train['age']=np.nan
ageOfHouse(train)

In [None]:
train.age.unique()

In [None]:
def ageOfHouseTest(dataset, sold_pos=76, ref_pos=19, age_pos=79):
    """Fill the column at ``age_pos`` with column[sold_pos] - column[ref_pos].

    Same computation as the training-frame variant; only the default destination
    position differs (79 instead of 80).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to modify in place; the destination column must already exist.
    sold_pos, ref_pos : int
        Positional indices of the minuend and subtrahend columns. The defaults
        are the positions this function has always used.
        NOTE(review): presumably the sale-year and a build/remodel-year column
        in the test frame. Confirm against test.columns.
    age_pos : int
        Positional index of the destination column (default 79).

    Returns
    -------
    None
        Prints the number of rows and writes the difference as a float column.
    """
    print(len(dataset))
    # One vectorised subtraction instead of a per-row iloc loop. Casting to float
    # first lets categorical or object year columns be subtracted and gives the
    # result a numeric dtype.
    sold = dataset.iloc[:, sold_pos].astype('float64')
    ref = dataset.iloc[:, ref_pos].astype('float64')
    dataset[dataset.columns[age_pos]] = sold - ref

In [None]:
# Create the destination column as float NaN rather than '' so it holds numbers
# in a numeric dtype instead of Python objects.
test['age']=np.nan
ageOfHouseTest(test)

In [None]:
test.age.unique()

In [None]:
total_NAs(train)

In [None]:
total_NAs(test)

In [None]:
test[num_cols]=num_imputer.transform(test[num_cols])
# cat_imputer was fitted on the raw category labels, so applying it to the
# already-encoded test columns would write raw labels into numeric features and
# turn them into object columns. Fill any gaps left by the mean encoding
# (categories absent from train) with the train column's mean and keep the
# features as floats.
for col in categ_columns:
    encoded = test[col].astype('float64')
    test[col] = encoded.fillna(train[col].astype('float64').mean())

In [None]:
train.columns

In [None]:
def drop_unimp_cols(data,unimp_cols) :
    """Print the columns about to be removed, then drop them from ``data`` in place.

    NOTE(review): this re-defines a function with the same name and behavior that
    an earlier cell already defines.
    """
    print("Deleting unimportant columns", unimp_cols)
    data.drop(labels=unimp_cols,axis='columns',inplace=True)

In [None]:
# Remove the year/month columns from both frames.
for _frame in (train, test):
    drop_unimp_cols(_frame,['YearBuilt','YearRemodAdd','GarageYrBlt','MoSold','YrSold'])

In [None]:
# split predictors and target
traindata_x=train.drop(['SalePrice'],axis=1)
traindata_y=pd.DataFrame(train['SalePrice'])
print(traindata_x.columns);print(traindata_y.columns)

In [None]:
# Split train and validation data
from sklearn.model_selection import train_test_split
train_x,val_x,train_y,val_y=train_test_split(traindata_x,traindata_y,test_size=0.2,random_state=42)
print(train_x.shape)
print(val_x.shape)
print(train_y.shape)
print(val_y.shape)

In [None]:
print("======verify that the no of columns are the same in train,val and test before model building======")
print(train_x.shape[1])
print(val_x.shape[1])
print(test.shape[1])

Linear Regression

In [None]:
from sklearn.metrics import mean_absolute_error,mean_squared_error
def model_building(model):
    """Fit ``model`` on the training split and print train/validation error metrics.

    Reads ``train_x``, ``train_y``, ``val_x`` and ``val_y`` from the notebook
    namespace.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    estimator
        The same object after fitting.
    """
    m=model
    m.fit(train_x,train_y)
    # Flatten targets and predictions to 1-D so the MAPE arithmetic is
    # element-wise whatever shape the estimator returns, and so each MAPE prints
    # as a single number.
    y_train = np.ravel(train_y)
    y_val = np.ravel(val_y)
    train_pred = np.ravel(m.predict(train_x))
    val_pred = np.ravel(m.predict(val_x))
    print('=======Train======')
    print('MSE: ', mean_squared_error(y_train, train_pred))
    print('RMSE: ', np.sqrt(mean_squared_error(y_train, train_pred)))
    print('MAE: ', mean_absolute_error(y_train,train_pred))
    print('MAPE: ', np.mean(np.abs((y_train - train_pred)/y_train))*100)
    # The figures below are computed on the validation split (val_x / val_y).
    print('======Test======')
    print('MSE: ', mean_squared_error(y_val, val_pred))
    print('RMSE: ', np.sqrt(mean_squared_error(y_val, val_pred)))
    print('MAE: ', mean_absolute_error(y_val,val_pred))
    print ('MAPE: ',np.mean(np.abs((y_val - val_pred)/y_val))*100)
    return m

In [None]:
from sklearn.linear_model import LinearRegression
reg = model_building(LinearRegression())

Linear Regression using OLS

In [None]:
# Give the three digit-leading column names alphabetic equivalents
# (presumably so they can be used as terms in the model formula built later).
train_x.columns=[
    name.replace('1stFlrSF','FirstFlrSF')
        .replace('2ndFlrSF','SecondFlrSF')
        .replace('3SsnPorch','ThreeSsnPorch')
    for name in train_x.columns
]

In [None]:
# Apply the same three column renames to the validation features.
val_x.columns=[
    name.replace('1stFlrSF','FirstFlrSF')
        .replace('2ndFlrSF','SecondFlrSF')
        .replace('3SsnPorch','ThreeSsnPorch')
    for name in val_x.columns
]

In [None]:
# Apply the same three column renames to the test frame.
test.columns=[
    name.replace('1stFlrSF','FirstFlrSF')
        .replace('2ndFlrSF','SecondFlrSF')
        .replace('3SsnPorch','ThreeSsnPorch')
    for name in test.columns
]

In [None]:
train_x.columns

In [None]:
x='+'.join(train_x.columns.values)
x

In [None]:
formula = " ~ ".join((train_y.columns[-1],x))
print (formula)

In [None]:
#Importing the necessary modules
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [None]:
train_xy=pd.concat([train_x,train_y],axis=1)
val_xy=pd.concat([val_x,val_y],axis=1)

In [None]:
# defining and fitting the model
lm_mod1 = ols(formula=formula, data=train_xy) # Describe model

result1 = lm_mod1.fit()

In [None]:
# Model summary
print(result1.summary2())

In [None]:
train_pred1=result1.predict(train_xy)

In [None]:
from sklearn.metrics import r2_score

In [None]:
# Checking the R^2 score on the training data.
# Note: r2_score returns the plain coefficient of determination, not the adjusted R^2.
r2_score(train_xy['SalePrice'],train_pred1)

In [None]:
val_pred1 = result1.predict(val_xy)

In [None]:
def ErrorMetrics(train_y,train_pred,val_y,val_pred):
    """Print MSE, RMSE, MAE and MAPE for the training and the validation split.

    Parameters
    ----------
    train_y, val_y : array-like
        True target values.
    train_pred, val_pred : array-like
        Predictions aligned position-by-position with the matching targets.

    Returns
    -------
    None
    """
    # Flatten everything to 1-D first. Mixing an (n, 1) array with an (n,) array
    # would broadcast the MAPE arithmetic to an (n, n) matrix and report a wrong
    # value, and a DataFrame operand would make np.mean print a Series.
    train_y = np.ravel(train_y)
    train_pred = np.ravel(train_pred)
    val_y = np.ravel(val_y)
    val_pred = np.ravel(val_pred)

    print("-----Train Error------")
    print("MSE:", mean_squared_error(train_y, train_pred))
    print("RMSE:", np.sqrt(mean_squared_error(train_y, train_pred)))
    print("MAE:", mean_absolute_error(train_y, train_pred))
    print ('MAPE: ',np.mean(np.abs((train_y - train_pred)/train_y))*100)

    print("-----Validation Error------")
    print("MSE:", mean_squared_error(val_y, val_pred))
    print("RMSE:", np.sqrt(mean_squared_error(val_y, val_pred)))
    print("MAE:", mean_absolute_error(val_y, val_pred))
    print ('MAPE: ',np.mean(np.abs((val_y - val_pred)/val_y))*100)

In [None]:
train_pred1=pd.DataFrame(train_pred1)
val_pred1=pd.DataFrame(val_pred1)
ErrorMetrics(train_y.values,train_pred1,val_y.values,val_pred1)

Residual Analysis

In [None]:
# Checking the assumption of linearity
preds_tr=result1.predict(exog=train_xy)
# Plot the residuals after fitting a linear model.
# x and y are passed by keyword: recent seaborn releases no longer accept them
# positionally, while the keyword form works everywhere.
sns.residplot(x=preds_tr, y=result1.resid, lowess=True)
sns.set(style="whitegrid")
plt.xlabel("Fitted values")
plt.ylabel("Residuals")
# From the below plot, we can say that assumption of linearity is violated. Residuals are linear.

In [None]:
# Checking the assumption of normality.
inf=result1.get_influence()
# Q-Q plot of the internally studentized residuals against a 45-degree reference line.
sm.qqplot(inf.resid_studentized_internal,line='45')
plt.show(block=True)
# From the below plot, we can say that residuals are not normally distributed.

In [None]:
leverage=inf.hat_matrix_diag
plt.figure()
plt.plot(leverage,inf.resid_studentized_internal,'bo')
plt.show(block=True)

In [None]:
# plt.subplots already creates the figure; a separate plt.figure() call beforehand
# only produced an extra empty figure.
fig, ax = plt.subplots(figsize=(12,8))
fig = sm.graphics.influence_plot(result1, alpha  = 0.05, ax = ax, criterion="cooks")
plt.show(block=True)

Ridge Regularization

In [None]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV
# The value of alpha determines the extent of penalization.
# But we also need to check which value of alpha gives best predictions on test data.
# For this we experiment with several values of alpha and pick the best. 
# We do this by performing grid search over several values of alpha - Cross Validation
alphas = np.array([1,0.1,0.01,0.001,0.0001,0,1.5,2])

In [None]:
# create and fit a ridge regression model
model_ridge = Ridge()
grid = GridSearchCV(estimator=model_ridge, param_grid=dict(alpha=alphas),cv=10)
grid.fit(train_x,train_y)
print(grid)

In [None]:
# Print the best parameters
print(grid.best_score_)
print(grid.best_estimator_.alpha)

In [None]:
# `normalize` is no longer passed: False was its default, and newer scikit-learn
# releases removed the keyword, so passing it raises a TypeError there.
Ridge_model= Ridge(alpha=2)
Ridge_model.fit(train_x,train_y)
train_pred_ridge=Ridge_model.predict(train_x)
val_pred_ridge=Ridge_model.predict(val_x)

In [None]:
train_pred_ridge=pd.DataFrame(train_pred_ridge)
val_pred_ridge=pd.DataFrame(val_pred_ridge)
ErrorMetrics(train_y.values,train_pred_ridge,val_y.values,val_pred_ridge)

Lasso Regularization

In [None]:
# create and fit a lasso regression model
model_lasso = Lasso()
grid = GridSearchCV(estimator=model_lasso, param_grid=dict(alpha=alphas),cv=10)
grid.fit(train_x,train_y)
print(grid)

In [None]:
# Print the best parameters
print(grid.best_score_)
print(grid.best_estimator_.alpha)

In [None]:
# `normalize` is no longer passed: False was its default, and newer scikit-learn
# releases removed the keyword, so passing it raises a TypeError there.
Lasso_model= Lasso(alpha=0.01)
Lasso_model.fit(train_x,train_y)
train_pred_lasso=Lasso_model.predict(train_x)
val_pred_lasso=Lasso_model.predict(val_x)

In [None]:
# Wrap the LASSO predictions. This cell previously wrapped the ridge predictions,
# so the metrics printed under "Lasso" were really the ridge model's.
train_pred_lasso=pd.DataFrame(train_pred_lasso)
val_pred_lasso=pd.DataFrame(val_pred_lasso)
ErrorMetrics(train_y.values,train_pred_lasso,val_y.values,val_pred_lasso)

Decision Tree

In [None]:
# Build DT Model
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
dt_reg1 = DecisionTreeRegressor(max_depth=7)

# Fit the model on train data
%time dt_reg1.fit(train_x,train_y)
print(dt_reg1.score)

# Predict target on train and val data
train_pred = dt_reg1.predict(train_x)
val_pred = dt_reg1.predict(val_x)

# Evaluate the model on train and val
train_pred=pd.DataFrame(train_pred)
val_pred=pd.DataFrame(val_pred)
ErrorMetrics(train_y.values,train_pred,val_y.values,val_pred)

In [None]:
features = pd.DataFrame(dt_reg1.feature_importances_, columns = ['FI'], index = train_x.columns).sort_values('FI', ascending = True)
features.plot(kind = 'barh', figsize = (15,10))
del features

Decision Tree with Grid

In [None]:
# Build Model
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
dt_grid=DecisionTreeRegressor(random_state=2)
dt_param_grid = {"min_samples_leaf": [2,3,4,5,6,7],
                  "max_depth": [3,4,5,6,7,8]}
# The grid holds only 36 combinations, so search it exhaustively and
# deterministically. A randomized search asked for 1000 iterations on this grid
# falls back to trying every combination anyway, after emitting a warning.
dt_reg2 = GridSearchCV(dt_grid,dt_param_grid,cv=10,n_jobs=-1)

# Fit model on train data
%time dt_reg2.fit(train_x,train_y)
print(dt_reg2.best_score_,dt_reg2.best_params_)

# Predict target on train and val data
train_pred = dt_reg2.predict(train_x)
val_pred = dt_reg2.predict(val_x)

# Evaluate the model on train and val
train_pred=pd.DataFrame(train_pred)
val_pred=pd.DataFrame(val_pred)
ErrorMetrics(train_y.values,train_pred,val_y.values,val_pred)

Random Forest

In [None]:
# Build Model
from sklearn.ensemble import RandomForestRegressor
rf1 = RandomForestRegressor(random_state=3,max_depth=8,min_samples_leaf=7)

# Fit model on train data
%time rf1.fit(train_x,train_y)
print(rf1.score)

# Predict target on train and val data
train_pred = rf1.predict(train_x)
val_pred = rf1.predict(val_x)

# Evaluate the model on train and val
train_pred=pd.DataFrame(train_pred)
val_pred=pd.DataFrame(val_pred)
ErrorMetrics(train_y.values,train_pred,val_y.values,val_pred)

Random Forest with Grid

In [None]:
# Build Model
rf_grid = RandomForestRegressor(random_state=4)
rf_param_grid = {"n_estimators" : [2,4,6,8,10,12,14,16,20,25],
                 "max_depth" : [5,7,9,11,13,15],
                 "min_samples_leaf" : [2,3,5,7,10]}
# The grid holds 300 combinations, so search it exhaustively and
# deterministically. A randomized search asked for 3000 iterations on this grid
# falls back to trying every combination anyway, after emitting a warning.
rf2=GridSearchCV(rf_grid,rf_param_grid,cv=10,n_jobs=-1)

# Fit model on train data
%time rf2.fit(train_x,train_y)
print(rf2.best_score_,rf2.best_params_)

# Predict target on train and val data
train_pred = rf2.predict(train_x)
val_pred = rf2.predict(val_x)

# Evaluate the model on train and val
train_pred=pd.DataFrame(train_pred)
val_pred=pd.DataFrame(val_pred)
ErrorMetrics(train_y.values,train_pred,val_y.values,val_pred)

Ada Boost

In [None]:
# Build Model
from sklearn.ensemble import AdaBoostRegressor
ada1 = AdaBoostRegressor(random_state=5,n_estimators=200,learning_rate=0.05)

# Fit model on train data
%time ada1.fit(train_x,train_y)
print(ada1.score)

# Predict target on train and val data
train_pred = ada1.predict(train_x)
val_pred = ada1.predict(val_x)

# Evaluate the model on train and val
train_pred=pd.DataFrame(train_pred)
val_pred=pd.DataFrame(val_pred)
ErrorMetrics(train_y.values,train_pred,val_y.values,val_pred)

Ada Boost with Grid

In [None]:
# Build Model
from sklearn.ensemble import AdaBoostRegressor
ada_grid = AdaBoostRegressor(random_state=5)
ada_param_grid = {'n_estimators':[100,200,300,400],
                  'learning_rate':[0.02,0.04,0.06,0.08]}
# The grid holds only 16 combinations, so search it exhaustively and
# deterministically. A randomized search asked for 3000 iterations on this grid
# falls back to trying every combination anyway, after emitting a warning.
ada2=GridSearchCV(ada_grid,ada_param_grid,cv=10,n_jobs=-1)

# Fit model on train data
%time ada2.fit(train_x,train_y)
print(ada2.best_score_,ada2.best_params_)                
        
# Predict target on train and val data
train_pred = ada2.predict(train_x)
val_pred = ada2.predict(val_x)

# Evaluate the model on train and val
train_pred=pd.DataFrame(train_pred)
val_pred=pd.DataFrame(val_pred)
ErrorMetrics(train_y.values,train_pred,val_y.values,val_pred)

In [None]:
# Build Model
from sklearn.ensemble import GradientBoostingRegressor
gb1 = GradientBoostingRegressor(n_estimators=200,learning_rate=0.002,max_depth=8)

# Fit model on train data
%time gb1.fit(train_x,train_y)
print(gb1.score)

# Predict target on train and val data
train_pred = gb1.predict(train_x)
val_pred = gb1.predict(val_x)

# Evaluate the model on train and val
train_pred=pd.DataFrame(train_pred)
val_pred=pd.DataFrame(val_pred)
ErrorMetrics(train_y.values,train_pred,val_y.values,val_pred)

Gradient Boosting with Grid

In [None]:
# Build Model
from sklearn.ensemble import GradientBoostingRegressor
gb_grid = GradientBoostingRegressor(random_state=6)
gb_param_grid = {"n_estimators" : [100,150,200],
                  "max_depth" : [2,3,5,7,9],
                  "learning_rate" : [0.002,0.005]}
# The grid holds only 30 combinations, so search it exhaustively and
# deterministically. A randomized search asked for 3000 iterations on this grid
# falls back to trying every combination anyway, after emitting a warning.
gb2=GridSearchCV(gb_grid,gb_param_grid,cv=10,n_jobs=-1)

# Fit model on train data
%time gb2.fit(train_x,train_y)
print(gb2.best_score_,gb2.best_params_)

# Predict target on train and val data
train_pred = gb2.predict(train_x)
val_pred = gb2.predict(val_x)

# Evaluate the model on train and val
train_pred=pd.DataFrame(train_pred)
val_pred=pd.DataFrame(val_pred)
ErrorMetrics(train_y.values,train_pred,val_y.values,val_pred)

XG Boost

In [None]:
# Build Model
from xgboost.sklearn import XGBRegressor
xgb1 = XGBRegressor(n_estimators= 70,max_depth=2, learning_rate=0.1, reg_lambda= 2)

# Fit model on train data
%time xgb1.fit(train_x,train_y)
print(xgb1.score)

# Predict target on train and val data
train_pred = xgb1.predict(train_x)
val_pred = xgb1.predict(val_x)

# Evaluate the model on train and val
train_pred=pd.DataFrame(train_pred)
val_pred=pd.DataFrame(val_pred)
ErrorMetrics(train_y.values,train_pred,val_y.values,val_pred)

Making predictions

In [None]:
# Re-read the raw test file to recover the Id column, which was dropped from
# `test` earlier. The relative path relies on the working directory set by the
# earlier os.chdir call into the input folder.
test_backup=pd.read_csv('test.csv')
Id=test_backup.Id

In [None]:
pd.set_option('display.max_rows', 100)

In [None]:
test.dtypes

In [None]:
Target = pd.DataFrame(xgb1.predict(test),columns=['SalePrice'])
pred=pd.concat([Id,Target],axis=1)
pred

In [None]:
os.chdir(r"/kaggle/working")
pred.to_csv('Submission.csv',index=False)