In [None]:
## Importing packages

# This R environment comes with all of CRAN and many other helpful packages preinstalled.
# You can see which packages are installed by checking out the kaggle/rstats docker image: 
# https://github.com/kaggle/docker-rstats

library(tidyverse) # metapackage with lots of helpful functions

## Running code

# In a notebook, you can run a single code cell by clicking in the cell and then hitting 
# the blue arrow to the left, or by clicking in the cell and pressing Shift+Enter. In a script, 
# you can run code by highlighting the code you want to run and then clicking the blue arrow
# at the bottom of this window.

## Reading in files

# You can access files from datasets you've added to this kernel in the "../input/" directory.
# You can see the files added to this kernel by running the code below. 

# List the competition files, then load the train and test sets.
# stringsAsFactors is set explicitly because its default changed from TRUE to
# FALSE in R 4.0.0; the factor-based steps later in this notebook (factor
# counts, as.character()/as.factor() round trips) expect text columns to
# arrive as factors.
list.files(path = "../input")
train <- read.csv("../input/train.csv", stringsAsFactors = TRUE)
test <- read.csv("../input/test.csv", stringsAsFactors = TRUE)
require("DataExplorer")
require("corrplot")
library(knitr)
library(ggplot2)
library(plyr)
library(dplyr)
library(corrplot)
library(caret)
library(gridExtra)
library(scales)
library(Rmisc)
library(ggrepel)
library(randomForest)
library(psych)
library(xgboost)

## Saving data

# If you save any files or images, these will be put in the "output" directory. You 
# can see the output directory by committing and running your kernel (using the 
# Commit & Run button) and then checking out the compiled version of your kernel.

In [None]:
head(train)
head(test)
str(train)
summary(train)



In [None]:
#data explorer

plot_missing(train)
plot_missing(test)
plot_density(train)
plot_correlation(train)
plot_bar(train)
plot_correlation(test)
plot_bar(test)


In [None]:
# Stack train and test into one frame so cleaning and feature engineering are
# applied to both; test rows carry NA in SalePrice so they can be told apart.
test[["SalePrice"]] <- NA
combi <- rbind(train, test)
dim(combi)

In [None]:
# Histogram of SalePrice over the rows that have a price, using 10,000-wide
# bins and comma-formatted axis labels.
ggplot(data=combi[!is.na(combi$SalePrice),], aes(x=SalePrice)) +
        geom_histogram(fill="blue", binwidth = 10000) +
          scale_x_continuous(breaks= seq(0, 800000, by=100000), labels = comma)

Discovering numeric variables that correlate highly with SalePrice


In [None]:
# Indices (named by column) of every numeric column in the combined data.
numericVars <- which(vapply(combi, is.numeric, logical(1)))
# The column names are kept separately for the numeric / factor split below.
numericVarNames <- names(numericVars)
combi_numVar <- combi[, numericVars]

# Pairwise-complete correlations: rows without a SalePrice are simply left out
# of the pairs that involve SalePrice.
cor_numVar <- cor(combi_numVar, use = "pairwise.complete.obs")

# Rank variables by their correlation with SalePrice and keep those whose
# absolute correlation exceeds 0.5.
cor_sorted <- as.matrix(sort(cor_numVar[, "SalePrice"], decreasing = TRUE))
CorHigh <- rownames(cor_sorted)[abs(cor_sorted[, 1]) > 0.5]
cor_numVar <- cor_numVar[CorHigh, CorHigh]

corrplot.mixed(cor_numVar, tl.col = "black", tl.pos = "lt")



In [None]:
ggplot (data=combi[!is.na(combi$SalePrice),],aes(x=as.factor(KitchenQual),y= SalePrice))+
geom_boxplot(col='blue')
table(combi$KitchenQual)

Analyzing high-correlation features

In [None]:
#Overal Quality
ggplot(data=combi[!is.na(combi$SalePrice),], aes(x=factor(OverallQual), y=SalePrice))+
        geom_boxplot(col='blue') + labs(x='Overall Quality') +
        scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma)

#living area
ggplot(data=combi[!is.na(combi$SalePrice),], aes(x=GrLivArea, y=SalePrice))+
        geom_point(col='blue') + geom_smooth(method = "lm", se=FALSE, color="black", aes(group=1)) +
  scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma)


#garage cars
ggplot(data=combi[!is.na(combi$SalePrice),], aes(x=factor(GarageCars), y=SalePrice))+
        geom_boxplot(col='blue') + labs(x='Garage Cars') +
        scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma)

#garage area
ggplot(data=combi[!is.na(combi$SalePrice),], aes(x=GarageArea, y=SalePrice))+
        geom_point(col='blue') + geom_smooth(method = "lm", se=FALSE, color="black", aes(group=1)) +
  scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma)



In [None]:
#garage Area vs garage cars
ggplot(data=combi[!is.na(combi$SalePrice),], aes(x=factor(GarageCars), y=SalePrice))+
        geom_boxplot(col='blue') + labs(x='Garage Cars') +
        scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma)

#garage area
ggplot(data=combi[!is.na(combi$SalePrice),], aes(x=GarageArea, y=SalePrice))+
        geom_point(col='blue') + geom_smooth(method = "lm", se=FALSE, color="black", aes(group=1)) +
  scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma)


**Data Cleansing - missing data**

In [None]:
Qualities <- c('None' = 0, 'Po' = 1, 'Fa' = 2, 'TA' = 3, 'Gd' = 4, 'Ex' = 5)

Dealing with missing values

PoolQC levels:
   Ex   Excellent
   Gd   Good
   TA   Average/Typical
   Fa   Fair
   NA   No Pool
   

In [None]:
# Encode pool quality on the ordinal Qualities scale; missing values are
# recoded as 'None' first so they map to 0.
combi$PoolQC <- as.character(combi$PoolQC)
table(combi$PoolQC)
combi$PoolQC[is.na(combi$PoolQC)] <- 'None'
combi$PoolQC<-as.integer(revalue(combi$PoolQC,Qualities))

In [None]:
# Find houses that have a pool (PoolArea > 0) but no recorded pool quality.
# PoolQC has already been recoded to integers on the Qualities scale, so a
# missing quality is 0 here, not the string 'None'. Base subsetting is used
# because it keeps the original row names, which identify the rows to impute.
combi[combi$PoolArea > 0 & combi$PoolQC == 0, c('PoolArea', 'PoolQC', 'OverallQual')]



In [None]:
# Manually set pool quality (Qualities scale: 2 = Fa, 3 = TA) for three rows
# selected by position.
# NOTE(review): presumably these are the rows that have a pool area but no
# recorded quality, with values chosen from OverallQual - confirm the indices.
combi$PoolQC[2421] <- 2
combi$PoolQC[2504] <- 3
combi$PoolQC[2600] <- 2

In [None]:

table(combi$PoolQC)

Miscellaneous feature (MiscFeature)

In [None]:
combi$MiscFeature <- as.character(combi$MiscFeature)
combi$MiscFeature[is.na(combi$MiscFeature)] <- 'None'
combi$MiscFeature <- as.factor(combi$MiscFeature)

In [None]:
ggplot(combi[!is.na(combi$SalePrice),],aes(x=MiscFeature,y=SalePrice))+
geom_bar(stat='summary',fun.y='median',fill='blue')+
 scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma)+
 geom_label(stat = "count", aes(label = ..count.., y = ..count..))

Alley

In [None]:
combi$Alley <- as.character(combi$Alley)
combi$Alley[is.na(combi$Alley)] <- 'None'
combi$Alley <- as.factor(combi$Alley)

In [None]:
ggplot(combi[!is.na(combi$SalePrice),], aes(x=Alley, y=SalePrice)) +
        geom_bar(stat='summary', fun.y = "median", fill='blue')+
        scale_y_continuous(breaks= seq(0, 200000, by=50000), labels = comma)

Fence

In [None]:
# Preview the Fence column and its level counts. head() is used instead of
# printing the whole column, which would dump one value per row of combi.
head(combi$Fence)
table(combi$Fence)

In [None]:
table(combi$Fence)

In [None]:
combi$Fence<-as.character(combi$Fence)
combi$Fence[is.na(combi$Fence)] <- 'None'
combi$Fence<-as.factor(combi$Fence)

In [None]:
# Check the recoded Fence levels, then preview the rows that have a SalePrice.
# head() keeps the output to a few rows instead of printing the full frame.
table(combi$Fence)
head(combi[!is.na(combi$SalePrice),])

In [None]:
# Mean SalePrice per Fence category over the rows that have a price.
# The column is referenced by bare name so the mean is taken within each group
# (mean(!is.na(combi$SalePrice)) was the global share of non-missing prices).
# dplyr:: is explicit because plyr, also attached, exports summarise() too.
combi[!is.na(combi$SalePrice),] %>%
        dplyr::group_by(Fence) %>%
        dplyr::summarise(avg = mean(SalePrice))

Fireplace

In [None]:
combi$FireplaceQu<-as.character(combi$FireplaceQu)
combi$FireplaceQu[is.na(combi$FireplaceQu)] <- 'None'
combi$FireplaceQu<-as.integer(revalue(combi$FireplaceQu, Qualities))
table(combi$FireplaceQu)

Garage

There are 7 Garage variables.


In [None]:
plot_missing(combi)

In [None]:
# Where the garage build year is missing, fall back to the house's build year.
garage_year_missing <- is.na(combi$GarageYrBlt)
combi$GarageYrBlt[garage_year_missing] <- combi$YearBuilt[garage_year_missing]

In [None]:
# Count the rows where GarageType, GarageFinish, GarageCond and GarageQual are
# all missing at once.
length((which(is.na(combi$GarageType) & is.na(combi$GarageFinish) & is.na(combi$GarageCond) & is.na(combi$GarageQual))))
## There are 157 records missing all of these variables, and 2 extra records.

In [None]:
#finding extras
kable(combi[!is.na(combi$GarageType) & is.na(combi$GarageFinish), c('GarageCars', 'GarageArea', 'GarageType', 'GarageCond', 'GarageQual', 'GarageFinish')])

In [None]:
table(combi$GarageCond)
table(combi$GarageQual)
table(combi$GarageFinish)


In [None]:
# Row 2127: use the most common values.
combi$GarageCond[2127]<-'TA'
combi$GarageQual[2127]<-'TA'
combi$GarageFinish[2127]<-'Unf'

# Row 2577 looks like it doesn't have a garage at all.
combi$GarageCars[2577]<-0
combi$GarageArea[2577]<-0
combi$GarageType[2577]<-NA


In [None]:
combi$GarageType<-as.character(combi$GarageType)
combi$GarageType[is.na(combi$GarageType)] <- 'No Garage'
combi$GarageType <- as.factor(combi$GarageType)
table(combi$GarageType)

In [None]:
# Collect the rows whose GarageType is NA for inspection.
# NOTE(review): missing GarageType values were recoded to 'No Garage' in the
# cell above, so this filter returns zero rows - confirm whether the intent
# was to check that no NA remains or to select the 'No Garage' rows.
temp<-filter(combi,is.na(combi$GarageType))

In [None]:
select(temp,c(GarageCond,GarageType,GarageCars,GarageArea,GarageQual,GarageFinish ))

In [None]:
combi$GarageQual<-as.character(combi$GarageQual)
combi$GarageQual[is.na(combi$GarageQual)] <- 'None'
combi$GarageQual<-as.integer(revalue(combi$GarageQual, Qualities))
table(combi$GarageQual)

Kitchen Quality variable


In [None]:
table(combi$KitchenQual)

In [None]:
sum(table(combi$KitchenQual))
combi$KitchenQual[is.na(combi$KitchenQual)] <- 'TA'
combi$KitchenQual<-as.character(combi$KitchenQual)

table(combi$KitchenQual)
combi$KitchenQual<-as.integer(revalue(combi$KitchenQual, Qualities))


table(combi$KitchenQual)
str(combi$KitchenQual)

Basement Quality

In [None]:
combi$BsmtQual<-as.character(combi$BsmtQual)
combi$BsmtQual[is.na(combi$BsmtQual)] <- 'None'
table(combi$BsmtQual)
#combi$BsmtQual<-as.factor(combi$BsmtQual)
combi$BsmtQual<-as.integer(revalue(combi$BsmtQual,Qualities))
table(combi$BsmtQual)


In [None]:
combi[!is.na(combi$BsmtFinType1) & (is.na(combi$BsmtCond)|is.na(combi$BsmtQual)|is.na(combi$BsmtExposure)|is.na(combi$BsmtFinType2)), c('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2')]

In [None]:
combi$BsmtFinSF1[is.na(combi$BsmtFinSF1)] <-0
combi$BsmtFinSF2[is.na(combi$BsmtFinSF2)] <-0
combi$BsmtUnfSF[is.na(combi$BsmtUnfSF)] <-0
combi$TotalBsmtSF[is.na(combi$TotalBsmtSF)] <-0


ExterQual Variable


In [None]:
table(combi$ExterQual)
sum(table(combi$ExterQual))
combi$ExterQual<-as.character(combi$ExterQual)


In [None]:
combi$ExterQual<-as.integer(revalue(combi$ExterQual, Qualities))
table(combi$ExterQual)

**Bathrooms**

In [None]:
combi[(is.na(combi$BsmtFullBath)|is.na(combi$BsmtHalfBath)|is.na(combi$BsmtFinSF1)|is.na(combi$BsmtFinSF2)|is.na(combi$BsmtUnfSF)|is.na(combi$TotalBsmtSF)), c('BsmtQual', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF')]

In [None]:
combi$BsmtFullBath[is.na(combi$BsmtFullBath)] 
combi$BsmtFullBath[is.na(combi$BsmtFullBath)] <-0

combi$BsmtHalfBath[is.na(combi$BsmtHalfBath)]<-0

**Coding variables as factors**

Coding HeatingQC as Quality

MSSubClass coding


In [None]:
str(combi$MSSubClass)

In [None]:
combi$MSSubClass<-as.factor(combi$MSSubClass)

In [None]:
Charcol <- names(combi[,sapply(combi, is.character)])
Charcol

In [None]:
str(combi$YrSold)
str(combi$MoSold)
combi$MoSold<-as.factor(combi$MoSold)
combi$YrSold<-as.factor(combi$YrSold)

Air condition

In [None]:
table(combi$CentralAir)
combi$CentralAir<-as.character(combi$CentralAir)
combi$CentralAir<-as.integer(revalue(combi$CentralAir,c('N'=0,'Y'=1)))
table(combi$CentralAir)

Heating

In [None]:
str(combi$HeatingQC)
table(combi$HeatingQC)
combi$HeatingQC<-as.character(combi$HeatingQC)

combi$HeatingQC<-as.integer(revalue(combi$HeatingQC,Qualities))
table(combi$HeatingQC)


In [None]:
 ggplot(combi[!is.na(combi$SalePrice),], aes(x=as.factor(YrSold), y=SalePrice)) +
        geom_bar(stat='summary', fun.y = "median", fill='blue')+
        scale_y_continuous(breaks= seq(0, 800000, by=25000), labels = comma) +
        geom_label(stat = "count", aes(label = ..count.., y = ..count..)) +
        coord_cartesian(ylim = c(0, 200000)) +
        geom_hline(yintercept=163000, linetype="dashed", color = "red")

ggplot(combi[!is.na(combi$SalePrice),], aes(x=MoSold, y=SalePrice)) +
        geom_bar(stat='summary', fun.y = "median", fill='blue')+
        scale_y_continuous(breaks= seq(0, 800000, by=25000), labels = comma) +
        geom_label(stat = "count", aes(label = ..count.., y = ..count..)) +
        coord_cartesian(ylim = c(0, 200000)) +
        geom_hline(yintercept=163000, linetype="dashed", color = "red")

Some visuals of important variables

In [None]:
numericVars <- which(sapply(combi, is.numeric)) #index vector numeric variables
factorVars <- which(sapply(combi, is.factor)) #index vector factor variables
cat('There are', length(numericVars), 'numeric variables, and', length(factorVars), 'categoric variables')

In [None]:
combi_numVar <- combi[, numericVars]
cor_numVar <- cor(combi_numVar, use="pairwise.complete.obs")

In [None]:
cor_sorted <- as.matrix(sort(cor_numVar[,'SalePrice'], decreasing = TRUE))
 #select only high corelations
CorHigh <- names(which(apply(cor_sorted, 1, function(x) abs(x)>0.5)))
cor_numVar <- cor_numVar[CorHigh, CorHigh]

corrplot.mixed(cor_numVar, tl.col="black", tl.pos = "lt", tl.cex = 0.7,cl.cex = .7, number.cex=.7)

In [None]:

# Quick random forest on a hand-picked set of predictors, fitted only on the
# rows that have a SalePrice, to get a first ranking of variable importance.
# The seed makes the forest (and so the importance ranking) reproducible.
# Rows are selected by the presence of SalePrice rather than by a hard-coded
# 1:1460 range.
set.seed(2018)
fit <- randomForest(SalePrice ~ OverallQual + PoolQC + MiscFeature + GrLivArea + GarageCars + Alley +
                            KitchenQual + ExterQual + X1stFlrSF + YearBuilt + TotRmsAbvGrd +
                            MSSubClass + LotArea + OverallCond + Neighborhood,
                    data = combi[!is.na(combi$SalePrice), ],
                    importance = TRUE,
                    ntree = 2000)


In [None]:

# Rank the predictors by the first column of the importance matrix (plotted
# below as the % increase in MSE when the variable is randomly permuted).
imp_RF <- importance(fit)
imp_DF <- data.frame(Variables = row.names(imp_RF), MSE = imp_RF[,1])
imp_DF <- imp_DF[order(imp_DF$MSE, decreasing = TRUE),]

# head() caps the plot at 20 variables without padding: indexing 1:20 on a
# shorter table adds all-NA rows, which show up as an NA bar in the chart.
ggplot(head(imp_DF, 20), aes(x=reorder(Variables, MSE), y=MSE, fill=MSE)) +
        geom_bar(stat = 'identity') +
        labs(x = 'Variables', y= '% increase MSE if variable is randomly permuted') +
        coord_flip() +
        theme(legend.position="none")

Neighborhood


In [None]:
ggplot(combi[!is.na(combi$SalePrice),], aes(x=Neighborhood, y=SalePrice))+
 geom_bar(stat='summary', fun.y = "median", fill='blue') +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_y_continuous(breaks= seq(0, 800000, by=50000), labels = comma) +
        geom_label(stat = "count", aes(label = ..count.., y = ..count..), size=3) +
        geom_hline(yintercept=163000, linetype="dashed", color = "red") 

In [None]:
# Number of houses per neighborhood across the whole combined data set.
# geom_bar() counts rows per category by default; geom_histogram(stat='count')
# draws the same bars but is intended for binned continuous data.
ggplot(data=combi, aes(x=Neighborhood)) +
        geom_bar() +
        geom_label(stat = "count", aes(label = ..count.., y = ..count..), size=3)+
        theme(axis.text.x = element_text(angle = 45, hjust = 1))

**Feature Engineering**

Total no of bathrooms

In [None]:
combi$TotBath<-combi$FullBath+(combi$HalfBath*0.5) + combi$BsmtFullBath+(combi$BsmtHalfBath*0.5)


In [None]:
# Number of houses per total-bathroom count. geom_bar() is the counting geom
# for a discrete x; geom_histogram(stat='count') is meant for binned data.
ggplot(combi, aes(x=as.factor(TotBath)))+
        geom_bar()

In [None]:
ggplot(combi [!is.na(combi$SalePrice),], aes(x=as.factor(TotBath), y=SalePrice))+
geom_point(col='blue')+
scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma)

 Total Square Feet

In [None]:
str(combi)

In [None]:
combi$TotalSqFeet <- combi$GrLivArea + combi$TotalBsmtSF

In [None]:
plot_missing(select(combi,TotalBsmtSF))

In [None]:
# SalePrice against TotalSqFeet for the rows that have a price, with a linear
# fit. Houses with GrLivArea above 4500 are labelled with their row name.
# The labels are built from the same filtered frame that is plotted, so they
# stay aligned with the points wherever the priced rows sit in combi (the
# earlier form recycled the full-length rownames(combi) against a filtered
# condition).
priced_rows <- combi[!is.na(combi$SalePrice),]
priced_rows$OutlierLabel <- ifelse(priced_rows$GrLivArea > 4500, rownames(priced_rows), '')

ggplot(data=priced_rows, aes(x=TotalSqFeet, y=SalePrice))+
        geom_point(col='blue') + geom_smooth(method = "lm", se=FALSE, color="black", aes(group=1)) +
        scale_y_continuous(breaks= seq(0, 800000, by=100000), labels = comma) +
        geom_text_repel(aes(label = OutlierLabel))

House age,
Remod,
IsNew

In [None]:
combi$Remod <- ifelse(combi$YearBuilt==combi$YearRemodAdd, 0, 1) #0=No Remodeling, 1=Remodeling

In [None]:
ggplot(combi[!is.na(combi$SalePrice),], aes(x=as.factor(Remod), y=SalePrice)) +
        geom_bar(stat='summary', fun.y = "median", fill='blue') +
        geom_label(stat = "count", aes(label = ..count.., y = ..count..), size=6) +
        scale_y_continuous(breaks= seq(0, 800000, by=50000), labels = comma) +
        theme_grey(base_size = 18) +
        geom_hline(yintercept=163000, linetype="dashed") #dashed line is median SalePrice

In [None]:
# Flag houses sold in the year they were built (1 = new, 0 = not new).
# YrSold was converted to a factor earlier, so it is turned back into its
# numeric year via as.character() before the comparison, matching how Age is
# computed, instead of relying on implicit factor-to-character coercion.
combi$IsNew <- ifelse(as.numeric(as.character(combi$YrSold)) == combi$YearBuilt, 1, 0)
table(combi$IsNew)

In [None]:
ggplot(combi[!is.na(combi$SalePrice),], aes(x=as.factor(IsNew), y=SalePrice)) +
        geom_bar(stat='summary', fun.y = "median", fill='blue') +
        geom_label(stat = "count", aes(label = ..count.., y = ..count..), size=6) +
        scale_y_continuous(breaks= seq(0, 800000, by=50000), labels = comma) +
        theme_grey(base_size = 18) +
        geom_hline(yintercept=163000, linetype="dashed") #dashed line is median SalePrice

In [None]:
combi$Age <- as.numeric(as.character(combi$YrSold))-combi$YearRemodAdd

In [None]:
ggplot(combi[!is.na(combi$SalePrice),],aes(x=Age,y=SalePrice))+
geom_point(col='blue')+
geom_smooth(method='lm', color='red')+
scale_y_continuous(breaks= seq(0, 800000, by=100000),labels=comma)

PreModelling


Dropping highly correlated variables

In [None]:
# Drop variables that are highly correlated with others kept in the data.
# The rooms column is named 'TotRmsAbvGrd' (as used in the random forest
# formula); the earlier spelling 'TotalRmsAbvGrd' matched no column, so that
# variable was silently kept.
dropVars <- c('YearRemodAdd', 'GarageYrBlt', 'GarageArea', 'GarageCond', 'TotalBsmtSF', 'TotRmsAbvGrd', 'BsmtFinSF1')

combi <- combi[,!(names(combi) %in% dropVars)]

Drop for now


In [None]:
plot_missing(combi)
NAcol <- which(colSums(is.na(combi)) > 0)
a<-sort(colSums(sapply(combi[NAcol], is.na)), decreasing = TRUE)


In [None]:
# Drop the columns listed below for now instead of cleaning them.
# NOTE(review): presumably these are the columns that still contain missing
# values; 'BsmtFinType' is not referenced anywhere else in this notebook (only
# BsmtFinType1 / BsmtFinType2 are) - confirm it is a real column name.
droptemp <- c('LotFrontage','GarageFinish','BsmtCond','BsmtExposure','BsmtFinType','BsmtFinType1','MasVnrType','MasVnrArea','MSZoning','Utilities',
'Functional',
'Exterior1st',
'Exterior2nd',
'Electrical',
'SaleType','BsmtFinType2')

# Names that match no column are ignored by %in%.
combi <- combi[,!(names(combi) %in% droptemp)]

In [None]:
plot_missing(combi)

In [None]:
dim(combi)

In [None]:
# Refine the list of columns treated as continuous numeric predictors: remove
# the target and the variables handled as categories, then add the engineered
# numeric features.
# setdiff()/union() preserve order and never add a name twice, so re-running
# this cell leaves numericVarNames unchanged instead of appending duplicates.
numericVarNames <- setdiff(numericVarNames, c('MSSubClass', 'MoSold', 'YrSold', 'SalePrice', 'OverallQual', 'OverallCond'))
numericVarNames <- union(numericVarNames, c('TotalSqFeet','TotBath','Age'))


In [None]:
numericVarNames

In [None]:
DFnumeric <- combi[, names(combi) %in% numericVarNames]
dim(DFnumeric)

In [None]:
DFfactors <- combi[, !(names(combi) %in% numericVarNames)]
DFfactors <- DFfactors[, names(DFfactors) != 'SalePrice']
DFfactors$MSSubClass<-as.factor(DFfactors$MSSubClass)
DFfactors$OverallQual<-as.factor(DFfactors$OverallQual)
DFfactors$OverallCond<-as.factor(DFfactors$OverallCond)
DFfactors$MoSold<-as.factor(DFfactors$MoSold)     
DFfactors$YrSold<-as.factor(DFfactors$YrSold)

dim(DFfactors)



In [None]:

cat('There are', length(DFnumeric), 'numeric variables, and', length(DFfactors), 'factor variables')

Skew

In [None]:
# Log-transform (log(x + 1)) every numeric predictor whose absolute skew
# exceeds 0.8.
# Iterating over column names avoids the 1:ncol() trap on a zero-column frame,
# and the NA check skips columns whose skew is undefined (e.g. constant
# columns) instead of failing inside if().
for (col_name in names(DFnumeric)) {
        col_skew <- skew(DFnumeric[[col_name]])
        if (!is.na(col_skew) && abs(col_skew) > 0.8) {
                DFnumeric[[col_name]] <- log(DFnumeric[[col_name]] + 1)
        }
}

Normalizing

In [None]:
PreNum <- preProcess(DFnumeric, method=c("center", "scale"))
print(PreNum)

In [None]:
DFnorm<-predict(PreNum, DFnumeric)
dim(DFnorm)
dim(DFfactors)

In [None]:
plot_missing(DFfactors)
plot_missing(DFnumeric)

One-hot encoding

In [None]:
DFdummies <- as.data.frame(model.matrix(~.-1, DFfactors))
dim(DFdummies)

In [None]:
# Dummy columns that are all zero across the rows that follow the priced rows
# (i.e. the rows without a SalePrice).
n_priced <- sum(!is.na(combi$SalePrice))
unpriced_rows <- seq(n_priced + 1, nrow(combi))
ZerocolTest <- which(colSums(DFdummies[unpriced_rows, ]) == 0)
colnames(DFdummies[ZerocolTest])

In [None]:
# Remove the dummy columns flagged in ZerocolTest. The length guard matters:
# with an empty index, DFdummies[, -ZerocolTest] selects zero columns and
# wipes the whole frame. drop = FALSE keeps a data frame even if only one
# column remains.
if (length(ZerocolTest) > 0) {
        DFdummies <- DFdummies[, -ZerocolTest, drop = FALSE]
}

In [None]:
# Dummy columns that are all zero across the leading rows, one per row that
# has a SalePrice.
priced_row_idx <- seq_len(sum(!is.na(combi$SalePrice)))
ZerocolTrain <- which(colSums(DFdummies[priced_row_idx, ]) == 0)
colnames(DFdummies[ZerocolTrain])

In [None]:
# Remove the dummy columns flagged in ZerocolTrain. Guarded because a negative
# empty index would select zero columns and wipe the frame; drop = FALSE keeps
# a data frame even if only one column remains.
if (length(ZerocolTrain) > 0) {
        DFdummies <- DFdummies[, -ZerocolTrain, drop = FALSE]
}

In [None]:
# Dummy columns whose sum over the leading priced rows is below 10, i.e.
# levels that occur in fewer than 10 of those rows.
leading_priced_rows <- seq_len(sum(!is.na(combi$SalePrice)))
fewOnes <- which(colSums(DFdummies[leading_priced_rows, ]) < 10)
colnames(DFdummies[fewOnes])

In [None]:
# Remove the rarely-occurring dummy predictors flagged in fewOnes. Guarded
# because a negative empty index would select zero columns and wipe the frame;
# drop = FALSE keeps a data frame even if only one column remains.
if (length(fewOnes) > 0) {
        DFdummies <- DFdummies[, -fewOnes, drop = FALSE]
}
dim(DFdummies)

SKEW


In [None]:
skew(combi$SalePrice)
qqnorm(combi$SalePrice)
qqline(combi$SalePrice)

In [None]:
# Replace SalePrice with its natural log, in place (NA rows stay NA).
# NOTE(review): not idempotent - re-running this cell applies the log a second
# time; restart the kernel and run all cells after changing anything above.
combi$SalePrice<-log(combi$SalePrice)


In [None]:
qqnorm(combi$SalePrice)
qqline(combi$SalePrice)
skew(combi$SalePrice)

In [None]:
# Assemble the model matrix from the normalised numeric predictors and the
# one-hot encoded factors, then split it by the presence of SalePrice.
# Id is only a row identifier; it reaches this point as one of the numeric
# columns, so it is removed here to keep it out of the models.
combined <- cbind(DFnorm, DFdummies)
combined <- combined[, names(combined) != 'Id', drop = FALSE]
train1 <- combined[!is.na(combi$SalePrice),]
test1 <- combined[is.na(combi$SalePrice),]
plot_missing(combi)
plot_missing(train1)

LASSO


In [None]:

# Lasso regression (glmnet with alpha = 1), tuning lambda by 5-fold
# cross-validation on the priced rows.
# The seed fixes the fold assignment so bestTune and the CV RMSE can be
# reproduced from run to run.
set.seed(27042018)
my_control <- trainControl(method="cv", number=5)
lassoGrid <- expand.grid(alpha = 1, lambda = seq(0.001,0.1,by = 0.0005))

lasso_mod <- train(x=train1, y=combi$SalePrice[!is.na(combi$SalePrice)], method='glmnet', trControl= my_control, tuneGrid=lassoGrid)
lasso_mod$bestTune

In [None]:
min(lasso_mod$results$RMSE)

In [None]:
# Count the predictors with non-zero and with zero unscaled importance in the
# fitted lasso model.
lassoVarImp <- varImp(lasso_mod, scale = FALSE)
lassoImportance <- lassoVarImp$importance
has_nonzero_importance <- lassoImportance$Overall != 0
varsSelected <- sum(has_nonzero_importance, na.rm = TRUE)
varsNotSelected <- sum(!has_nonzero_importance, na.rm = TRUE)

In [None]:
cat('Lasso uses', varsSelected, 'variables in its model, and did not select', varsNotSelected, 'variables.')

In [None]:
# Predict on the test matrix, then exponentiate to undo the log of the target.
LassoPred <- predict(lasso_mod, newdata = test1)
predictions_lasso <- exp(LassoPred)
head(predictions_lasso)

> XGBOOST


In [None]:
# Hyper-parameter grid: 3 eta values x 5 max_depth values x 5 min_child_weight
# values, with the remaining parameters held fixed.
# NOTE(review): the only visible use of this grid is the commented-out caret
# train() call in the next cell.
xgb_grid = expand.grid(
nrounds = 1000,
eta = c(0.1, 0.05, 0.01),
max_depth = c(2, 3, 4, 5, 6),
gamma = 0,
colsample_bytree=1,
min_child_weight=c(1, 2, 3, 4 ,5),
subsample=1
)

In [None]:
#xgb_caret <- train(x=train1, y=combi$SalePrice[!is.na(combi$SalePrice)], method='xgbTree', trControl= my_control, tuneGrid=xgb_grid) 
#xgb_caret$bestTune

In [None]:
# Wrap the design matrices in xgboost's DMatrix format; only the training
# matrix carries labels (SalePrice of the rows that have one).
has_price <- !is.na(combi$SalePrice)
label_train <- combi$SalePrice[has_price]
dtrain <- xgb.DMatrix(data = as.matrix(train1), label = label_train)
dtest <- xgb.DMatrix(data = as.matrix(test1))

In [None]:
# Booster parameters passed to xgb.cv() and xgb.train() below.
# NOTE(review): "reg:linear" is the older name of the squared-error regression
# objective; newer xgboost releases call it "reg:squarederror" - check the
# installed version before renaming it.
default_param<-list(
        objective = "reg:linear",
        booster = "gbtree",
        eta=0.05, #default = 0.3
        gamma=0,
        max_depth=3, #default=6
        min_child_weight=4, #default=1
        subsample=1,
        colsample_bytree=1
)

In [None]:
# 5-fold cross-validation over up to 500 boosting rounds; training stops early
# once the CV metric has not improved for 10 rounds.
# The seed makes the fold split, and so the reported best iteration,
# reproducible. TRUE/FALSE are spelled out because T and F are ordinary
# variables that can be reassigned.
set.seed(27042018)
xgbcv <- xgb.cv(params = default_param, data = dtrain, nrounds = 500, nfold = 5,
                showsd = TRUE, stratified = TRUE, print_every_n = 40,
                early_stopping_rounds = 10, maximize = FALSE)

In [None]:
# Fit the final booster on the full training DMatrix.
# NOTE(review): 454 is hard-coded - presumably the best iteration reported by
# the xgb.cv() run above; re-check it whenever the data or parameters change.
xgb_mod <- xgb.train(data = dtrain, params=default_param, nrounds = 454)

In [None]:
# Predict on the test DMatrix, then exponentiate to undo the log of the target.
XGBpred <- predict(xgb_mod, newdata = dtest)
predictions_XGB <- exp(XGBpred)
head(predictions_XGB)

In [None]:
library(Ckmeans.1d.dp) #required for ggplot clustering
# Feature importance of the fitted booster, plotted relative to the top feature.
mat <- xgb.importance(feature_names = colnames(train1), model = xgb_mod)
# head() caps the plot at 20 features without creating NA rows when the model
# ended up using fewer than 20 (mat[1:20] would pad the table with NAs).
xgb.ggplot.importance(importance_matrix = head(mat, 20), rel_to_first = TRUE)

In [None]:
# Blend the two models: the lasso prediction counts twice as much as the
# XGBoost prediction in the weighted average.
blended_price <- (2*predictions_lasso+predictions_XGB)/3
sub_avg <- data.frame(Id = test$Id, SalePrice = blended_price)
head(sub_avg)
head(predictions_XGB)

##### 

In [None]:
# Write the blended predictions to the submission file, without a row-name column.
write.csv(sub_avg, file = 'average.csv', row.names = FALSE)