# Titanic challenge from Kaggle

## This is the dataset from [Titanic](https://www.kaggle.com/c/titanic) Kaggle competition

### Let's import some libs

In [1]:
library(rpart)
library(zoo)
library(randomForest)


Attaching package: ‘zoo’

The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric

randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.


In [2]:
## Now, we'll load the datasets

In [2]:
# Read the Kaggle CSVs with string columns converted to factors.
# Since R 4.0.0 read.csv() defaults to stringsAsFactors = FALSE, but the rest
# of this notebook assumes factor columns (Name is converted back with
# as.character(), and Sex / Embarked are used as categorical predictors), so
# the conversion is requested explicitly to behave the same on every R version.
train <- read.csv("data/train.csv", stringsAsFactors = TRUE)
test <- read.csv("data/test.csv", stringsAsFactors = TRUE)

The `test` dataset has no `Survived` column, so we'll create it (filled with `NA`):

In [3]:
# Give the test set an (unknown) Survived column so its columns match train's.
test[["Survived"]] <- NA

Now we will create a simple function that splits the data. R already offers ways to do this, but, as you will see below, we can do it ourselves in a few lines of code.

In [4]:
# Split a data frame into disjoint train / validation / test subsets.
#
# 80% of the rows (rounded down) are sampled without replacement; the remaining
# rows become the test set. Of the sampled rows, 80% (rounded down) are kept
# for training and the rest form the validation set.
#
# Arguments:
#   dataframe  data frame to split (rows are the sampling units).
#   seed       optional RNG seed; when not NULL, set.seed(seed) is called
#              first so the split is reproducible.
#
# Returns a list with elements `train`, `test` and `validation`, each a
# data frame holding a subset of the rows of `dataframe`.
splitData <- function(dataframe, seed=NULL) {
  if (!is.null(seed)) set.seed(seed)

  # seq_len() (unlike 1:n) yields an empty index for a zero-row data frame.
  index <- seq_len(nrow(dataframe))
  trainindex <- sample(index, trunc(length(index) * .8))

  # Rows sampled for training + validation.
  sampled <- dataframe[trainindex, ]

  subindex <- seq_len(nrow(sampled))
  keepindex <- sample(subindex, trunc(length(subindex) * .8))

  # The validation rows must come from the sampled rows, not from the full
  # data frame: `keepindex` indexes `sampled`. setdiff() is used instead of
  # negative indexing because x[-integer(0), ] selects no rows at all.
  validationset <- sampled[setdiff(subindex, keepindex), ]
  trainset <- sampled[keepindex, ]

  testset <- dataframe[setdiff(index, trainindex), ]
  list(train=trainset, test=testset, validation=validationset)
}

It's important to note that every transformation we apply to the `train` dataset must also be applied to the `test` dataset. For this reason we merge the two datasets here; don't worry, we'll split them again a little later!

In [5]:
# Stack train and test so every derived feature is computed identically on both.
combined <- rbind(train, test)

# Names look like "Surname, Title. Given names"; split each one once on
# ',' or '.' and reuse the pieces for both Surname and Title.
combined$Name <- as.character(combined$Name)
name_parts <- strsplit(combined$Name, split = "[,.]")

# vapply() guarantees a character vector (NA when a piece is missing),
# whereas sapply() may silently change its return type.
combined$Title <- vapply(name_parts, function(parts) parts[2], character(1))
# Drop the single space that follows the comma.
combined$Title <- sub(" ", "", combined$Title, fixed = TRUE)

# Collapse rare titles into broader groups.
combined$Title[combined$Title %in% c("Mme", "Mlle")] <- "Mlle"
combined$Title[combined$Title %in% c("Capt", "Don", "Major", "Sir")] <- "Sir"
combined$Title[combined$Title %in% c("Dona", "Lady", "the Countess", "Jonkheer")] <- "Lady"
combined$Title <- factor(combined$Title)

# Family size counts siblings/spouses, parents/children and the passenger.
combined$FamilySize <- combined$SibSp + combined$Parch + 1
combined$Surname <- vapply(name_parts, function(parts) parts[1], character(1))
# Family identifier: family size immediately followed by the surname.
combined$FamilyID <- factor(paste0(combined$FamilySize, combined$Surname))

In [6]:
# Sort rows by surname (missing surnames first), then fill the missing Age and
# Fare values by spline interpolation along that row order (zoo::na.spline).
surname_order <- order(combined$Surname, na.last = FALSE)
combined <- combined[surname_order, ]

for (column in c("Age", "Fare")) {
  combined[[column]] <- na.spline(combined[[column]])
}

Now, we will undo the train/test merge:

In [7]:
# Sort by Survived with NA last, so the unlabelled (test) rows end up at the
# bottom of the data frame.
survived_order <- order(combined$Survived, na.last = TRUE)
combined <- combined[survived_order, ]

In [8]:
# Labelled rows are the original training data; rows whose Survived is NA come
# from the Kaggle test file. Selecting on is.na() avoids hard-coding row counts.
is_labelled <- !is.na(combined$Survived)
full_data_combined <- combined[is_labelled, ]
split_data <- splitData(full_data_combined)
split_data$train

model_formula <- as.factor(Survived) ~ Pclass + Sex + Age + Fare + Title +
  FamilySize + Embarked

# Hold-out check: fit on the training split only, so the rows in
# split_data$test are unseen by the model that is scored on them.
eval_fit <- randomForest(model_formula, data = split_data$train, ntree = 2500)

prediction_test <- predict(eval_fit, split_data$test, type = "class")
dt_test <- data.frame(PassengerId = split_data$test$PassengerId,
                      Survived_P = prediction_test,
                      Survived = split_data$test$Survived,
                      Correct = (split_data$test$Survived == prediction_test))

# Final model for the submission: fitted on every labelled row.
fit <- randomForest(model_formula, data = full_data_combined,
                    importance = TRUE, ntree = 2500)

clean_test <- combined[!is_labelled, ]
prediction <- predict(fit, clean_test, type = "class")
length(prediction)

result <- data.frame(PassengerId = clean_test$PassengerId, Survived = prediction)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamilySize,Surname,FamilyID
49,49,0,3,"Samaan, Mr. Youssef",male,28.67030,2,0,2662,21.6792,,C,Mr,3,Samaan,3Samaan
622,622,1,1,"Kimball, Mr. Edwin Nelson Jr",male,42.00000,1,0,11753,52.5542,D19,S,Mr,2,Kimball,2Kimball
769,769,0,3,"Moran, Mr. Daniel J",male,65.76899,1,0,371110,24.1500,,Q,Mr,2,Moran,2Moran
775,775,1,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54.00000,1,3,29105,23.0000,,S,Mrs,5,Hocking,5Hocking
354,354,0,3,"Arnold-Franchi, Mr. Josef",male,25.00000,1,0,349237,17.8000,,S,Mr,2,Arnold-Franchi,2Arnold-Franchi
727,727,1,2,"Renouf, Mrs. Peter Henry (Lillian Jefferys)",female,30.00000,3,0,31027,21.0000,,S,Mrs,4,Renouf,4Renouf
464,464,0,2,"Milling, Mr. Jacob Christian",male,48.00000,0,0,234360,13.0000,,S,Mr,1,Milling,1Milling
873,873,0,1,"Carlsson, Mr. Frans Olof",male,33.00000,0,0,695,5.0000,B51 B53 B55,S,Mr,1,Carlsson,1Carlsson
488,488,0,1,"Kent, Mr. Edward Austin",male,58.00000,0,0,11771,29.7000,B37,C,Mr,1,Kent,1Kent
428,428,1,2,"Phillips, Miss. Kate Florence (""Mrs Kate Louise Phillips Marshall"")",female,19.00000,0,0,250655,26.0000,,S,Miss,1,Phillips,1Phillips


In [9]:
# Save the predictions (PassengerId, Survived) without a row-name column.
utils::write.csv(result, file = "result.csv", row.names = FALSE)