Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
115 lines (96 sloc) 4.5 KB
# Kaggle Walmart recruiting competition 2014-02-20 to 2014-05-05.
# Michael Kim (mikeskim AT g m a i l DOT c o m)
# https://www.kaggle.com/users/64626/mike-kim
# Forked merge code from Kakuda
# http://www.kaggle.com/c/walmart-recruiting-store-sales-forecasting/forums/t/7214/merging-into-one-file-using-r
#import libraries
library(timeDate)
library(randomForest)
#set options to make sure scientific notation is disabled when writing files
options(scipen=500)
#read in data
dfStore <- read.csv(file='/home/mikeskim/Desktop/walmart/stores.csv')
dfTrain <- read.csv(file='/home/mikeskim/Desktop/walmart/train.csv')
dfTest <- read.csv(file='/home/mikeskim/Desktop/walmart/test.csv')
dfFeatures <- read.csv(file='/home/mikeskim/Desktop/walmart/features.csv')
submission = read.csv(file='/home/mikeskim/Desktop/walmart/sampleSubmission.csv',header=TRUE,as.is=TRUE)
# Merge Type and Size
dfTrainTmp <- merge(x=dfTrain, y=dfStore, all.x=TRUE)
dfTestTmp <- merge(x=dfTest, y=dfStore, all.x=TRUE)
# Merge all the features
train <- merge(x=dfTrainTmp, y=dfFeatures, all.x=TRUE)
test <- merge(x=dfTestTmp, y=dfFeatures, all.x=TRUE)
# Make features for train
train$year = as.numeric(substr(train$Date,1,4))
train$month = as.numeric(substr(train$Date,6,7))
train$day = as.numeric(substr(train$Date,9,10))
#this function is a huge bottleneck in terms of speed
train$days =sapply(train$Date,
function(x) as.numeric(difftimeDate(timeDate(x),timeDate(paste(substr(x,1,4),"-01-01",sep="")),"days")))
#you can try the faster alternative if the above is too slow, comment out above, uncomment next line.
#train$days = (train$month-1)*30 + train$day
train$Type = as.character(train$Type)
train$Type[train$Type=="A"]=1
train$Type[train$Type=="B"]=2
train$Type[train$Type=="C"]=3
train$IsHoliday[train$IsHoliday=="TRUE"]=1
train$IsHoliday[train$IsHoliday=="FALSE"]=0
train$dayHoliday = train$IsHoliday*train$days
train$logsales = log(4990+train$Weekly_Sales)
#weight certain features more by duplication, not sure if helpful?
train$tDays = 360*(train$year-2010) + (train$month-1)*30 + train$day
train$days30 = (train$month-1)*30 + train$day
#Make features for test
test$year = as.numeric(substr(test$Date,1,4))
test$month = as.numeric(substr(test$Date,6,7))
test$day = as.numeric(substr(test$Date,9,10))
#bottleneck
test$days = sapply(test$Date,
function(x) as.numeric(difftimeDate(timeDate(x),timeDate(paste(substr(x,1,4),"-01-01",sep="")),"days")))
#you can try the faster alternative if the above is too slow, comment out above, uncomment next line.
#test$days = (test$month-1)*30 + test$day
test$Type = as.character(test$Type)
test$Type[test$Type=="A"]=1
test$Type[test$Type=="B"]=2
test$Type[test$Type=="C"]=3
test$IsHoliday[test$IsHoliday=="TRUE"]=1
test$IsHoliday[test$IsHoliday=="FALSE"]=0
test$dayHoliday = test$IsHoliday*test$days
test$tDays = 360*(test$year-2010) + (test$month-1)*30 + test$day
test$days30 = (test$month-1)*30 + test$day
#Run model
tmpR0 = nrow(submission)
j=1
while (j < tmpR0) {
print(j/tmpR0)#keep track of progress
#select only relevant data for the store and department tuple
tmpId = submission$Id[j]
tmpStr = unlist(strsplit(tmpId,"_"))
tmpStore = tmpStr[1]
tmpDept = tmpStr[2]
dataF1 = train[train$Dept==tmpDept,]
tmpL = nrow(dataF1[dataF1$Store==tmpStore,])
#since MAE is weighted, increase weights of holiday data by 5x
tmpF = dataF1[dataF1$IsHoliday==1,]
dataF1 = rbind(dataF1,do.call("rbind", replicate(4, tmpF, simplify = FALSE)))
dataF2 = dataF1[dataF1$Store==tmpStore,]
testF1 = test[test$Dept==tmpDept,]
testF1 = testF1[testF1$Store==tmpStore,]
testRows = nrow(testF1)
if (tmpL<10) {#sample size restrictions since rf can fail if there isn't enough data
#this model uses all dept data (since that store + dept pair does not exist in the training set)
tmpModel = randomForest(logsales~Size+Type+ year + month + day + days + dayHoliday + tDays + days30,
ntree=4800, replace=TRUE, mtry=4, data=dataF1)}
else {
#this model is trained on store+dept filtered data
tmpModel = randomForest(logsales ~ year + month + day + days + dayHoliday + tDays + days30,
ntree=4800, replace=TRUE, mtry=3, data=dataF2)}
tmpP = exp(predict(tmpModel,testF1))-4990
k = j + testRows - 1
submission$Weekly_Sales[j:k] = tmpP
j = k+1
}
#write the submission to csv for Kaggle submission
write.table(x=submission,
file='/home/mikeskim/Desktop/walmart/outputFinal.csv',
sep=',', row.names=FALSE, quote=FALSE)