# makeSubmission.R

# Kaggle Walmart recruiting competition 2014-02-20 to 2014-05-05.
# Michael Kim (mikeskim  AT  g m a i l  DOT c o m)
# https://www.kaggle.com/users/64626/mike-kim
# Forked merge code from Kakuda
# http://www.kaggle.com/c/walmart-recruiting-store-sales-forecasting/forums/t/7214/merging-into-one-file-using-r


#import libraries
library(timeDate)
library(randomForest)


# Turn off scientific notation so that numbers are written out in full when
# files are saved.
options(scipen = 500)


# Load the input tables: stores, train, test, features and the sample
# submission (whose text columns are kept as character via as.is).
dfStore <- read.csv('/home/mikeskim/Desktop/walmart/stores.csv')
dfTrain <- read.csv('/home/mikeskim/Desktop/walmart/train.csv')
dfTest <- read.csv('/home/mikeskim/Desktop/walmart/test.csv')
dfFeatures <- read.csv('/home/mikeskim/Desktop/walmart/features.csv')
submission <- read.csv('/home/mikeskim/Desktop/walmart/sampleSubmission.csv',
                       header = TRUE, as.is = TRUE)


# Attach the store-level columns (Type and Size) to every train and test row.
# No `by` is given, so the join uses the column names the two tables share;
# all.x = TRUE keeps every row of the left-hand table.
dfTrainTmp <- merge(dfTrain, dfStore, all.x = TRUE)
dfTestTmp <- merge(dfTest, dfStore, all.x = TRUE)


# Attach the columns of the features table in the same way.
train <- merge(dfTrainTmp, dfFeatures, all.x = TRUE)
test <- merge(dfTestTmp, dfFeatures, all.x = TRUE)


# Make features for train and test
#
# addFeatures() holds the feature engineering that the train and test sets
# share, so both are guaranteed to be encoded identically.
#
# Columns added or replaced:
#   year, month, day  numeric parts of Date, read by character position
#                     (so Date is expected to look like "YYYY-MM-DD")
#   days              whole days elapsed since 1 January of the same year
#   Type              store type "A"/"B"/"C" recoded to 1/2/3
#   IsHoliday         holiday flag recoded to 1 (TRUE) / 0 (FALSE)
#   dayHoliday        IsHoliday * days
#   tDays             day counter from 2010 on a 360-day-year, 30-day-month
#                     calendar
#   days30            day of year on the same 30-day-month calendar
# tDays and days30 deliberately overlap with the other date features (weighting
# them by duplication; the original author was unsure whether this helps).
#
# df: data frame with Date, Type and IsHoliday columns.
# Returns df with the columns listed above.
addFeatures <- function(df) {
  dateStr <- as.character(df$Date)
  df$year <- as.numeric(substr(dateStr, 1, 4))
  df$month <- as.numeric(substr(dateStr, 6, 7))
  df$day <- as.numeric(substr(dateStr, 9, 10))

  # Vectorised Date arithmetic replaces the per-row timeDate calls that used to
  # be the speed bottleneck here; the result is the same count of whole days.
  yearStart <- as.Date(paste0(substr(dateStr, 1, 4), "-01-01"),
                       format = "%Y-%m-%d")
  df$days <- as.numeric(as.Date(dateStr, format = "%Y-%m-%d") - yearStart)

  # Store numeric codes instead of the strings "1"/"2"/"3": a character
  # predictor is re-encoded from whatever values are present when it is coerced
  # to a matrix, so its codes could differ between training and prediction
  # subsets. Values other than A/B/C become NA.
  df$Type <- match(as.character(df$Type), c("A", "B", "C"))

  # Explicit 1/0 recode; works whether the flag was read as logical or as text.
  df$IsHoliday <- as.numeric(as.logical(as.character(df$IsHoliday)))
  df$dayHoliday <- df$IsHoliday * df$days

  df$tDays <- 360 * (df$year - 2010) + (df$month - 1) * 30 + df$day
  df$days30 <- (df$month - 1) * 30 + df$day
  df
}

train <- addFeatures(train)
# Model target: log of the weekly sales shifted by 4990 (presumably so that the
# argument of log stays positive); predictions are mapped back with
# exp(.) - 4990 in the modelling loop.
train$logsales <- log(4990 + train$Weekly_Sales)

test <- addFeatures(test)


#Run model
#
# Walk through the submission one (store, department) group at a time. The
# store and department are read from the first two "_"-separated fields of the
# Id at row j. A random forest is fitted for that pair, and its predictions fill
# the next `testRows` rows of the submission. This relies on the submission
# rows being grouped by store and department in the same order as the matching
# rows of `test`.
tmpR0 <- nrow(submission)
j <- 1
# `<=` (not `<`) so that a final group consisting of a single row, i.e.
# j == tmpR0, is still predicted instead of being silently skipped.
while (j <= tmpR0) {
  print(j / tmpR0) # keep track of progress
  # select only relevant data for the store and department tuple
  tmpId <- submission$Id[j]
  tmpStr <- unlist(strsplit(tmpId, "_"))
  tmpStore <- tmpStr[1]
  tmpDept <- tmpStr[2]
  dataF1 <- train[train$Dept == tmpDept, ]
  # number of real training rows for this store + dept, counted before the
  # holiday rows are duplicated below
  tmpL <- nrow(dataF1[dataF1$Store == tmpStore, ])
  # since MAE is weighted, increase weights of holiday data by 5x: keep every
  # row once and append four extra copies of the holiday rows
  holidayRows <- which(dataF1$IsHoliday == 1)
  dataF1 <- dataF1[c(seq_len(nrow(dataF1)), rep(holidayRows, times = 4)), ]
  dataF2 <- dataF1[dataF1$Store == tmpStore, ]
  testF1 <- test[test$Dept == tmpDept & test$Store == tmpStore, ]
  testRows <- nrow(testF1)
  if (testRows == 0) {
    # Without test rows there is nothing to predict and j could not advance.
    stop("No test rows match submission Id '", tmpId, "' (row ", j, ")",
         call. = FALSE)
  }
  if (tmpL < 10) { # sample size restrictions since rf can fail if there isn't enough data
    # too few (possibly zero) training rows for this store + dept pair: train
    # on the data of every store for this department and let Size and Type
    # describe the store
    tmpModel <- randomForest(logsales ~ Size + Type + year + month + day + days + dayHoliday + tDays + days30,
                             ntree = 4800, replace = TRUE, mtry = 4, data = dataF1)
  } else {
    # this model is trained on store+dept filtered data
    tmpModel <- randomForest(logsales ~ year + month + day + days + dayHoliday + tDays + days30,
                             ntree = 4800, replace = TRUE, mtry = 3, data = dataF2)
  }
  # undo the log(4990 + sales) transform applied to the training target
  tmpP <- exp(predict(tmpModel, testF1)) - 4990
  k <- j + testRows - 1
  submission$Weekly_Sales[j:k] <- tmpP
  j <- k + 1
}


# Save the completed submission as a comma-separated file for upload to Kaggle:
# header row, no row names, no quoting.
write.csv(submission,
          file = '/home/mikeskim/Desktop/walmart/outputFinal.csv',
          row.names = FALSE, quote = FALSE)