# Spaceship Titanic: Data Imputation
## Imports

In [1]:
suppressPackageStartupMessages(library(here))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(missForest))

## Read data

In [2]:
# Path to data directory
# Built with `here()` from the "input"/"spaceship-titanic" components. Abort
# early, naming the offending path, when the directory does not exist.
data_dir <- here("input", "spaceship-titanic")
if (!dir.exists(data_dir)) {
  stop("Data directory not found: ", data_dir, call. = FALSE)
}

In [3]:
# Path to training data
# The preprocessed training set lives directly inside `data_dir`. Fail fast
# with the full path in the message so a missing file is easy to diagnose.
train_path <- file.path(data_dir, "train_prep.csv")
if (!file.exists(train_path)) {
  stop("Training data file not found: ", train_path, call. = FALSE)
}

In [4]:
# Read training data
# Every column gets an explicit parser: factors carry fixed level sets
# (unordered unless `ordered = TRUE` is given), `PTTotalSpent` is numeric and
# `Transported` is logical. `PassengerId` is moved into the row names and the
# result is a plain data.frame, which is what is later handed to missForest.
df_train <- train_path %>%
  read_csv(
    col_types = cols(
      PassengerId = col_character(),
      Alone = col_factor(levels = c("True", "False")),
      CompCntReduced = col_factor(
        levels = c("0", "1", "2", "3+"),
        ordered = TRUE
      ),
      HomePlanetOrd = col_factor(levels = c("0.0", "1.0", "2.0")),
      CryoSleep = col_factor(levels = c("True", "False")),
      CabinDeckOrd = col_factor(
        levels = c("0.0", "1.0", "2.0", "3.0", "4.0", "5.0")
      ),
      CabinPort = col_factor(levels = c("True", "False")),
      DestinationOrd = col_factor(levels = c("0.0", "1.0", "2.0")),
      DiscretizedAge4 = col_factor(
        levels = c("0.0", "1.0", "2.0", "3.0"),
        ordered = TRUE
      ),
      DiscretizedAge5 = col_factor(
        levels = c("0.0", "1.0", "2.0", "3.0", "4.0"),
        ordered = TRUE
      ),
      VIP = col_factor(levels = c("True", "False")),
      PosRoomService = col_factor(levels = c("True", "False")),
      PosFoodCourt = col_factor(levels = c("True", "False")),
      PosShoppingMall = col_factor(levels = c("True", "False")),
      PosSpa = col_factor(levels = c("True", "False")),
      PosVRDeck = col_factor(levels = c("True", "False")),
      PTTotalSpent = col_double(),
      Transported = col_logical()
    ),
    show_col_types = FALSE
  ) %>%
  column_to_rownames(var = "PassengerId") %>%
  as.data.frame()

In [5]:
# Set the label aside and drop it from the feature frame, so the frame
# inspected below (and imputed later) holds predictors only.
target <- df_train[["Transported"]]
df_train[["Transported"]] <- NULL
str(df_train)

'data.frame':	8693 obs. of  16 variables:
 $ Alone          : Factor w/ 2 levels "True","False": 1 1 2 2 1 1 2 2 1 2 ...
 $ CompCntReduced : Ord.factor w/ 4 levels "0"<"1"<"2"<"3+": 1 1 2 2 1 1 2 2 1 3 ...
 $ HomePlanetOrd  : Factor w/ 3 levels "0.0","1.0","2.0": 2 1 2 2 1 1 1 1 1 2 ...
 $ CryoSleep      : Factor w/ 2 levels "True","False": 2 2 2 2 2 2 2 1 2 1 ...
 $ CabinDeckOrd   : Factor w/ 6 levels "0.0","1.0","2.0",..: 1 4 6 6 4 4 4 5 4 1 ...
 $ CabinPort      : Factor w/ 2 levels "True","False": 1 2 2 2 2 1 2 2 2 1 ...
 $ DestinationOrd : Factor w/ 3 levels "0.0","1.0","2.0": 3 3 3 3 3 2 3 3 3 1 ...
 $ DiscretizedAge4: Ord.factor w/ 4 levels "0.0"<"1.0"<"2.0"<..: 4 2 4 3 1 4 2 3 3 1 ...
 $ DiscretizedAge5: Ord.factor w/ 5 levels "0.0"<"1.0"<"2.0"<..: 4 3 5 4 1 5 3 3 4 1 ...
 $ VIP            : Factor w/ 2 levels "True","False": 2 2 1 2 2 2 2 2 2 2 ...
 $ PosRoomService : Factor w/ 2 levels "True","False": 2 1 1 2 1 2 1 2 2 2 ...
 $ PosFoodCourt   : Factor w/ 2 levels "True","Fals

In [6]:
# Preview the first ten training rows
df_train %>% head(n = 10L)

Unnamed: 0_level_0,Alone,CompCntReduced,HomePlanetOrd,CryoSleep,CabinDeckOrd,CabinPort,DestinationOrd,DiscretizedAge4,DiscretizedAge5,VIP,PosRoomService,PosFoodCourt,PosShoppingMall,PosSpa,PosVRDeck,PTTotalSpent
Unnamed: 0_level_1,<fct>,<ord>,<fct>,<fct>,<fct>,<fct>,<fct>,<ord>,<ord>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<dbl>
0001_01,True,0,1.0,False,0.0,True,2.0,3.0,3.0,False,False,False,False,False,False,-1.1403312
0002_01,True,0,0.0,False,3.0,False,2.0,1.0,2.0,False,True,True,True,True,True,0.5764973
0003_01,False,1,1.0,False,5.0,False,2.0,3.0,4.0,True,True,True,False,True,True,1.5016107
0003_02,False,1,1.0,False,5.0,False,2.0,2.0,3.0,False,False,True,True,True,True,1.2424611
0004_01,True,0,0.0,False,3.0,False,2.0,0.0,0.0,False,True,True,True,True,True,0.7040607
0005_01,True,0,0.0,False,3.0,True,1.0,3.0,4.0,False,False,True,False,True,False,0.5926261
0006_01,False,1,0.0,False,3.0,False,2.0,1.0,2.0,False,True,True,True,False,False,0.8280081
0006_02,False,1,0.0,True,4.0,False,2.0,2.0,2.0,False,False,False,False,False,False,-1.1403312
0007_01,True,0,0.0,False,3.0,False,2.0,2.0,3.0,False,False,True,True,True,False,0.681374
0008_01,False,2,1.0,True,0.0,True,0.0,0.0,0.0,False,False,False,False,False,False,-1.1403312


In [7]:
# Path to test data
# The preprocessed test set lives directly inside `data_dir`. Fail fast with
# the full path in the message so a missing file is easy to diagnose.
test_path <- file.path(data_dir, "test_prep.csv")
if (!file.exists(test_path)) {
  stop("Test data file not found: ", test_path, call. = FALSE)
}

In [8]:
# Read test data
# Same column parsers as for the training file, minus the `Transported`
# label. Factors carry fixed level sets (unordered unless `ordered = TRUE` is
# given). `PassengerId` is moved into the row names and the result is a plain
# data.frame, which is what is later handed to missForest.
df_test <- test_path %>%
  read_csv(
    col_types = cols(
      PassengerId = col_character(),
      Alone = col_factor(levels = c("True", "False")),
      CompCntReduced = col_factor(
        levels = c("0", "1", "2", "3+"),
        ordered = TRUE
      ),
      HomePlanetOrd = col_factor(levels = c("0.0", "1.0", "2.0")),
      CryoSleep = col_factor(levels = c("True", "False")),
      CabinDeckOrd = col_factor(
        levels = c("0.0", "1.0", "2.0", "3.0", "4.0", "5.0")
      ),
      CabinPort = col_factor(levels = c("True", "False")),
      DestinationOrd = col_factor(levels = c("0.0", "1.0", "2.0")),
      DiscretizedAge4 = col_factor(
        levels = c("0.0", "1.0", "2.0", "3.0"),
        ordered = TRUE
      ),
      DiscretizedAge5 = col_factor(
        levels = c("0.0", "1.0", "2.0", "3.0", "4.0"),
        ordered = TRUE
      ),
      VIP = col_factor(levels = c("True", "False")),
      PosRoomService = col_factor(levels = c("True", "False")),
      PosFoodCourt = col_factor(levels = c("True", "False")),
      PosShoppingMall = col_factor(levels = c("True", "False")),
      PosSpa = col_factor(levels = c("True", "False")),
      PosVRDeck = col_factor(levels = c("True", "False")),
      PTTotalSpent = col_double()
    ),
    show_col_types = FALSE
  ) %>%
  column_to_rownames(var = "PassengerId") %>%
  as.data.frame()
str(df_test)

'data.frame':	4277 obs. of  16 variables:
 $ Alone          : Factor w/ 2 levels "True","False": 1 1 1 1 1 1 1 2 2 1 ...
 $ CompCntReduced : Ord.factor w/ 4 levels "0"<"1"<"2"<"3+": 1 1 1 1 1 1 1 2 2 1 ...
 $ HomePlanetOrd  : Factor w/ 3 levels "0.0","1.0","2.0": 1 1 2 2 1 1 2 2 2 1 ...
 $ CryoSleep      : Factor w/ 2 levels "True","False": 1 2 1 2 2 2 1 1 1 2 ...
 $ CabinDeckOrd   : Factor w/ 6 levels "0.0","1.0","2.0",..: 5 4 2 2 4 4 1 6 6 4 ...
 $ CabinPort      : Factor w/ 2 levels "True","False": 2 2 2 2 2 1 1 2 2 2 ...
 $ DestinationOrd : Factor w/ 3 levels "0.0","1.0","2.0": 3 3 1 3 3 3 1 3 1 1 ...
 $ DiscretizedAge4: Ord.factor w/ 4 levels "0.0"<"1.0"<"2.0"<..: 3 2 3 4 2 3 2 2 2 2 ...
 $ DiscretizedAge5: Ord.factor w/ 5 levels "0.0"<"1.0"<"2.0"<..: 3 2 4 4 2 4 2 2 2 3 ...
 $ VIP            : Factor w/ 2 levels "True","False": 2 2 2 2 2 2 2 2 2 2 ...
 $ PosRoomService : Factor w/ 2 levels "True","False": 2 2 2 2 1 2 2 2 2 2 ...
 $ PosFoodCourt   : Factor w/ 2 levels "True","Fals

In [9]:
# Preview the first ten test rows
df_test %>% head(n = 10L)

Unnamed: 0_level_0,Alone,CompCntReduced,HomePlanetOrd,CryoSleep,CabinDeckOrd,CabinPort,DestinationOrd,DiscretizedAge4,DiscretizedAge5,VIP,PosRoomService,PosFoodCourt,PosShoppingMall,PosSpa,PosVRDeck,PTTotalSpent
Unnamed: 0_level_1,<fct>,<ord>,<fct>,<fct>,<fct>,<fct>,<fct>,<ord>,<ord>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<dbl>
0013_01,True,0,0.0,True,4.0,False,2.0,2.0,2.0,False,False,False,False,False,False,-1.1403312
0018_01,True,0,0.0,False,3.0,False,2.0,1.0,1.0,False,False,True,False,True,False,1.0273457
0019_01,True,0,1.0,True,1.0,False,0.0,2.0,3.0,False,False,False,False,False,False,-1.1403312
0021_01,True,0,1.0,False,1.0,False,2.0,3.0,3.0,False,False,True,False,True,True,1.3749534
0023_01,True,0,0.0,False,3.0,False,2.0,1.0,1.0,False,True,False,True,False,False,0.5344694
0027_01,True,0,0.0,False,3.0,True,2.0,2.0,3.0,False,False,True,True,True,True,0.9157079
0029_01,True,0,1.0,True,0.0,True,0.0,1.0,1.0,False,False,False,False,False,False,-1.1403312
0032_01,False,1,1.0,True,5.0,False,2.0,1.0,1.0,False,False,False,False,False,False,-1.1403312
0032_02,False,1,1.0,True,5.0,False,0.0,1.0,1.0,False,False,False,False,False,False,-1.1403312
0033_01,True,0,0.0,False,3.0,False,0.0,1.0,2.0,False,False,True,False,False,False,0.5315074


## Imputation with `missForest`

In [10]:
# Settings shared by both missForest runs below. Both are counts, so they are
# written as integer literals (matching the `10L` style used elsewhere here).
maxiter <- 10L # upper bound on the number of imputation iterations
ntree <- 100L # number of trees grown in each forest

In [11]:
# Training data
# Impute the missing entries of the training predictors with missForest.
# Random forests draw bootstrap samples and candidate split variables at
# random, so the RNG is seeded first; otherwise the imputed values and the
# reported errors differ on every run.
#   variablewise = TRUE  -> out-of-bag error is reported per column
#   verbose = TRUE       -> per-iteration error, difference and timing are
#                           printed
set.seed(42)
train_imp <- missForest(
  df_train,
  maxiter = maxiter,
  ntree = ntree,
  variablewise = TRUE,
  verbose = TRUE
)

  missForest iteration 1 in progress...done!
    estimated error(s): 0 0 0.066467 0.0602676 0.3407111 0.4976728 0.4220421 0.1887479 0.303735 0.04902986 0 0 0 0 0 0 
    difference(s): 0 0.003282334 
    time: 9.237 seconds

  missForest iteration 2 in progress...done!
    estimated error(s): 0 0 0.05310448 0.0602676 0.3409466 0.4988364 0.4145224 0.1880432 0.3039699 0.05007552 0 0 0 0 0 0 
    difference(s): 0 0.0008742667 
    time: 8.361 seconds

  missForest iteration 3 in progress...done!
    estimated error(s): 0 0 0.0526437 0.0602676 0.3318813 0.5008145 0.4188697 0.1832276 0.3077284 0.05088881 0 0 0 0 0 0 
    difference(s): 0 0.0004678093 
    time: 8.633 seconds

  missForest iteration 4 in progress...done!
    estimated error(s): 0 0 0.0526437 0.0602676 0.3377678 0.4956947 0.4208671 0.1844022 0.3026779 0.0501917 0 0 0 0 0 0 
    difference(s): 0 0.0004448023 
    time: 8.237 seconds

  missForest iteration 5 in progress...done!
    estimated error(s): 0 0 0.05298929 0.0602676 0

In [12]:
# Preview the first ten rows of the imputed training frame
train_imp[["ximp"]] %>% head(n = 10L)

Unnamed: 0_level_0,Alone,CompCntReduced,HomePlanetOrd,CryoSleep,CabinDeckOrd,CabinPort,DestinationOrd,DiscretizedAge4,DiscretizedAge5,VIP,PosRoomService,PosFoodCourt,PosShoppingMall,PosSpa,PosVRDeck,PTTotalSpent
Unnamed: 0_level_1,<fct>,<ord>,<fct>,<fct>,<fct>,<fct>,<fct>,<ord>,<ord>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<dbl>
0001_01,True,0,1.0,False,0.0,True,2.0,3.0,3.0,False,False,False,False,False,False,-1.1403312
0002_01,True,0,0.0,False,3.0,False,2.0,1.0,2.0,False,True,True,True,True,True,0.5764973
0003_01,False,1,1.0,False,5.0,False,2.0,3.0,4.0,True,True,True,False,True,True,1.5016107
0003_02,False,1,1.0,False,5.0,False,2.0,2.0,3.0,False,False,True,True,True,True,1.2424611
0004_01,True,0,0.0,False,3.0,False,2.0,0.0,0.0,False,True,True,True,True,True,0.7040607
0005_01,True,0,0.0,False,3.0,True,1.0,3.0,4.0,False,False,True,False,True,False,0.5926261
0006_01,False,1,0.0,False,3.0,False,2.0,1.0,2.0,False,True,True,True,False,False,0.8280081
0006_02,False,1,0.0,True,4.0,False,2.0,2.0,2.0,False,False,False,False,False,False,-1.1403312
0007_01,True,0,0.0,False,3.0,False,2.0,2.0,3.0,False,False,True,True,True,False,0.681374
0008_01,False,2,1.0,True,0.0,True,0.0,0.0,0.0,False,False,False,False,False,False,-1.1403312


In [13]:
# Summarise imputation quality on the training data: one row per feature that
# had at least one missing value, with its missing count and the per-variable
# out-of-bag error reported by missForest.
num_miss <- colSums(is.na(df_train))
mask <- num_miss > 0
err <- train_imp$OOBerror

tibble(
  Feature = names(which(mask)),
  MissingValues = num_miss[mask],
  # PFC = Proportion of falsely classified
  PFC = err[mask]
) %>%
  column_to_rownames(var = "Feature")

Unnamed: 0_level_0,MissingValues,PFC
Unnamed: 0_level_1,<dbl>,<dbl>
HomePlanetOrd,12,0.0527589
CryoSleep,98,0.06038394
CabinDeckOrd,199,0.3364728
CabinPort,99,0.49232022
DestinationOrd,182,0.41828222
DiscretizedAge4,179,0.18522434
DiscretizedAge5,179,0.30772845
VIP,86,0.04984315


In [14]:
# Test data
# Impute the missing entries of the test predictors with missForest.
# Random forests draw bootstrap samples and candidate split variables at
# random, so the RNG is seeded first; otherwise the imputed values and the
# reported errors differ on every run.
#   variablewise = TRUE  -> out-of-bag error is reported per column
#   verbose = TRUE       -> per-iteration error, difference and timing are
#                           printed
set.seed(42)
test_imp <- missForest(
  df_test,
  maxiter = maxiter,
  ntree = ntree,
  variablewise = TRUE,
  verbose = TRUE
)

  missForest iteration 1 in progress...done!
    estimated error(s): 0 0 0.06624532 0.05260675 0.3696433 0.4909824 0.4107527 0.2001911 0.3143813 0.02483444 0 0 0 0 0 0 
    difference(s): 0 0.003382433 
    time: 3.825 seconds

  missForest iteration 2 in progress...done!
    estimated error(s): 0 0 0.06062734 0.05260675 0.3658128 0.4838633 0.4062127 0.2009078 0.3122312 0.02554399 0 0 0 0 0 0 
    difference(s): 0 0.0009664095 
    time: 3.736 seconds

  missForest iteration 3 in progress...done!
    estimated error(s): 0 0 0.05898876 0.05260675 0.3706009 0.4819649 0.404779 0.2021022 0.3076923 0.02743614 0 0 0 0 0 0 
    difference(s): 0 0.000685839 
    time: 3.738 seconds

  missForest iteration 4 in progress...done!
    estimated error(s): 0 0 0.06015918 0.05260675 0.381853 0.4852871 0.40454 0.2009078 0.3057812 0.0281457 0 0 0 0 0 0 
    difference(s): 0 0.0005143792 
    time: 4.234 seconds

  missForest iteration 5 in progress...done!
    estimated error(s): 0 0 0.06039326 0.05260

In [15]:
# Preview the first ten rows of the imputed test frame
test_imp[["ximp"]] %>% head(n = 10L)

Unnamed: 0_level_0,Alone,CompCntReduced,HomePlanetOrd,CryoSleep,CabinDeckOrd,CabinPort,DestinationOrd,DiscretizedAge4,DiscretizedAge5,VIP,PosRoomService,PosFoodCourt,PosShoppingMall,PosSpa,PosVRDeck,PTTotalSpent
Unnamed: 0_level_1,<fct>,<ord>,<fct>,<fct>,<fct>,<fct>,<fct>,<ord>,<ord>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<dbl>
0013_01,True,0,0.0,True,4.0,False,2.0,2.0,2.0,False,False,False,False,False,False,-1.1403312
0018_01,True,0,0.0,False,3.0,False,2.0,1.0,1.0,False,False,True,False,True,False,1.0273457
0019_01,True,0,1.0,True,1.0,False,0.0,2.0,3.0,False,False,False,False,False,False,-1.1403312
0021_01,True,0,1.0,False,1.0,False,2.0,3.0,3.0,False,False,True,False,True,True,1.3749534
0023_01,True,0,0.0,False,3.0,False,2.0,1.0,1.0,False,True,False,True,False,False,0.5344694
0027_01,True,0,0.0,False,3.0,True,2.0,2.0,3.0,False,False,True,True,True,True,0.9157079
0029_01,True,0,1.0,True,0.0,True,0.0,1.0,1.0,False,False,False,False,False,False,-1.1403312
0032_01,False,1,1.0,True,5.0,False,2.0,1.0,1.0,False,False,False,False,False,False,-1.1403312
0032_02,False,1,1.0,True,5.0,False,0.0,1.0,1.0,False,False,False,False,False,False,-1.1403312
0033_01,True,0,0.0,False,3.0,False,0.0,1.0,2.0,False,False,True,False,False,False,0.5315074


In [16]:
# Summarise imputation quality on the test data: one row per feature that had
# at least one missing value, with its missing count and the per-variable
# out-of-bag error reported by missForest.
num_miss <- colSums(is.na(df_test))
mask <- num_miss > 0
err <- test_imp$OOBerror

tibble(
  Feature = names(which(mask)),
  MissingValues = num_miss[mask],
  # PFC = Proportion of falsely classified
  PFC = err[mask]
) %>%
  column_to_rownames(var = "Feature")

Unnamed: 0_level_0,MissingValues,PFC
Unnamed: 0_level_1,<dbl>,<dbl>
HomePlanetOrd,5,0.06039326
CryoSleep,38,0.05260675
CabinDeckOrd,100,0.36820685
CabinPort,63,0.48718557
DestinationOrd,92,0.40477897
DiscretizedAge4,91,0.19827998
DiscretizedAge5,91,0.30697563
VIP,49,0.02459792


## Save imputed datasets

In [17]:
# Training data
# Turn the imputed frame back into plain column types for export:
#   * row names become the `PassengerId` column again,
#   * "True"/"False" factors become logicals,
#   * "0.0"-style factor codes become integers,
#   * `CompCntReduced` becomes character (its levels include "3+"),
#   * the label set aside earlier is appended as `Transported`.
tbl_train <- train_imp$ximp %>%
  as_tibble(rownames = NA) %>%
  rownames_to_column(var = "PassengerId") %>%
  mutate(
    across(
      c(
        Alone, CryoSleep, CabinPort, VIP,
        PosRoomService, PosFoodCourt, PosShoppingMall, PosSpa, PosVRDeck
      ),
      function(flag) flag == "True"
    ),
    across(
      c(
        HomePlanetOrd, CabinDeckOrd, DestinationOrd,
        DiscretizedAge4, DiscretizedAge5
      ),
      # go through the label text, not the internal factor code
      function(code) as.integer(as.character(code))
    ),
    CompCntReduced = as.character(CompCntReduced)
  ) %>%
  add_column(Transported = target)
str(tbl_train)

tibble [8,693 × 18] (S3: tbl_df/tbl/data.frame)
 $ PassengerId    : chr [1:8693] "0001_01" "0002_01" "0003_01" "0003_02" ...
 $ Alone          : logi [1:8693] TRUE TRUE FALSE FALSE TRUE TRUE ...
 $ CompCntReduced : chr [1:8693] "0" "0" "1" "1" ...
 $ HomePlanetOrd  : int [1:8693] 1 0 1 1 0 0 0 0 0 1 ...
 $ CryoSleep      : logi [1:8693] FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ CabinDeckOrd   : int [1:8693] 0 3 5 5 3 3 3 4 3 0 ...
 $ CabinPort      : logi [1:8693] TRUE FALSE FALSE FALSE FALSE TRUE ...
 $ DestinationOrd : int [1:8693] 2 2 2 2 2 1 2 2 2 0 ...
 $ DiscretizedAge4: int [1:8693] 3 1 3 2 0 3 1 2 2 0 ...
 $ DiscretizedAge5: int [1:8693] 3 2 4 3 0 4 2 2 3 0 ...
 $ VIP            : logi [1:8693] FALSE FALSE TRUE FALSE FALSE FALSE ...
 $ PosRoomService : logi [1:8693] FALSE TRUE TRUE FALSE TRUE FALSE ...
 $ PosFoodCourt   : logi [1:8693] FALSE TRUE TRUE TRUE TRUE TRUE ...
 $ PosShoppingMall: logi [1:8693] FALSE TRUE FALSE TRUE TRUE FALSE ...
 $ PosSpa         : logi [1:8693] FAL

In [18]:
# Preview the first ten rows of the export-ready training table
tbl_train %>% head(n = 10L)

PassengerId,Alone,CompCntReduced,HomePlanetOrd,CryoSleep,CabinDeckOrd,CabinPort,DestinationOrd,DiscretizedAge4,DiscretizedAge5,VIP,PosRoomService,PosFoodCourt,PosShoppingMall,PosSpa,PosVRDeck,PTTotalSpent,Transported
<chr>,<lgl>,<chr>,<int>,<lgl>,<int>,<lgl>,<int>,<int>,<int>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<dbl>,<lgl>
0001_01,True,0,1,False,0,True,2,3,3,False,False,False,False,False,False,-1.1403312,False
0002_01,True,0,0,False,3,False,2,1,2,False,True,True,True,True,True,0.5764973,True
0003_01,False,1,1,False,5,False,2,3,4,True,True,True,False,True,True,1.5016107,False
0003_02,False,1,1,False,5,False,2,2,3,False,False,True,True,True,True,1.2424611,False
0004_01,True,0,0,False,3,False,2,0,0,False,True,True,True,True,True,0.7040607,True
0005_01,True,0,0,False,3,True,1,3,4,False,False,True,False,True,False,0.5926261,True
0006_01,False,1,0,False,3,False,2,1,2,False,True,True,True,False,False,0.8280081,True
0006_02,False,1,0,True,4,False,2,2,2,False,False,False,False,False,False,-1.1403312,True
0007_01,True,0,0,False,3,False,2,2,3,False,False,True,True,True,False,0.681374,True
0008_01,False,2,1,True,0,True,0,0,0,False,False,False,False,False,False,-1.1403312,True


In [19]:
# Write the imputed training table into the data directory. Fields are quoted
# only when needed and escaping uses backslashes.
write_csv(
  x = tbl_train,
  file = file.path(data_dir, "train_imputed.csv"),
  progress = FALSE,
  quote = "needed",
  escape = "backslash"
)

In [20]:
# Test data
# Turn the imputed frame back into plain column types for export:
#   * row names become the `PassengerId` column again,
#   * "True"/"False" factors become logicals,
#   * "0.0"-style factor codes become integers,
#   * `CompCntReduced` becomes character (its levels include "3+").
tbl_test <- test_imp$ximp %>%
  as_tibble(rownames = NA) %>%
  rownames_to_column(var = "PassengerId") %>%
  mutate(
    across(
      c(
        Alone, CryoSleep, CabinPort, VIP,
        PosRoomService, PosFoodCourt, PosShoppingMall, PosSpa, PosVRDeck
      ),
      function(flag) flag == "True"
    ),
    across(
      c(
        HomePlanetOrd, CabinDeckOrd, DestinationOrd,
        DiscretizedAge4, DiscretizedAge5
      ),
      # go through the label text, not the internal factor code
      function(code) as.integer(as.character(code))
    ),
    CompCntReduced = as.character(CompCntReduced)
  )
str(tbl_test)

tibble [4,277 × 17] (S3: tbl_df/tbl/data.frame)
 $ PassengerId    : chr [1:4277] "0013_01" "0018_01" "0019_01" "0021_01" ...
 $ Alone          : logi [1:4277] TRUE TRUE TRUE TRUE TRUE TRUE ...
 $ CompCntReduced : chr [1:4277] "0" "0" "0" "0" ...
 $ HomePlanetOrd  : int [1:4277] 0 0 1 1 0 0 1 1 1 0 ...
 $ CryoSleep      : logi [1:4277] TRUE FALSE TRUE FALSE FALSE FALSE ...
 $ CabinDeckOrd   : int [1:4277] 4 3 1 1 3 3 0 5 5 3 ...
 $ CabinPort      : logi [1:4277] FALSE FALSE FALSE FALSE FALSE TRUE ...
 $ DestinationOrd : int [1:4277] 2 2 0 2 2 2 0 2 0 0 ...
 $ DiscretizedAge4: int [1:4277] 2 1 2 3 1 2 1 1 1 1 ...
 $ DiscretizedAge5: int [1:4277] 2 1 3 3 1 3 1 1 1 2 ...
 $ VIP            : logi [1:4277] FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ PosRoomService : logi [1:4277] FALSE FALSE FALSE FALSE TRUE FALSE ...
 $ PosFoodCourt   : logi [1:4277] FALSE TRUE FALSE TRUE FALSE TRUE ...
 $ PosShoppingMall: logi [1:4277] FALSE FALSE FALSE FALSE TRUE TRUE ...
 $ PosSpa         : logi [1:4277] 

In [21]:
# Preview the first ten rows of the export-ready test table
tbl_test %>% head(n = 10L)

PassengerId,Alone,CompCntReduced,HomePlanetOrd,CryoSleep,CabinDeckOrd,CabinPort,DestinationOrd,DiscretizedAge4,DiscretizedAge5,VIP,PosRoomService,PosFoodCourt,PosShoppingMall,PosSpa,PosVRDeck,PTTotalSpent
<chr>,<lgl>,<chr>,<int>,<lgl>,<int>,<lgl>,<int>,<int>,<int>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<dbl>
0013_01,True,0,0,True,4,False,2,2,2,False,False,False,False,False,False,-1.1403312
0018_01,True,0,0,False,3,False,2,1,1,False,False,True,False,True,False,1.0273457
0019_01,True,0,1,True,1,False,0,2,3,False,False,False,False,False,False,-1.1403312
0021_01,True,0,1,False,1,False,2,3,3,False,False,True,False,True,True,1.3749534
0023_01,True,0,0,False,3,False,2,1,1,False,True,False,True,False,False,0.5344694
0027_01,True,0,0,False,3,True,2,2,3,False,False,True,True,True,True,0.9157079
0029_01,True,0,1,True,0,True,0,1,1,False,False,False,False,False,False,-1.1403312
0032_01,False,1,1,True,5,False,2,1,1,False,False,False,False,False,False,-1.1403312
0032_02,False,1,1,True,5,False,0,1,1,False,False,False,False,False,False,-1.1403312
0033_01,True,0,0,False,3,False,0,1,2,False,False,True,False,False,False,0.5315074


In [22]:
# Write the imputed test table into the data directory. Fields are quoted
# only when needed and escaping uses backslashes.
write_csv(
  x = tbl_test,
  file = file.path(data_dir, "test_imputed.csv"),
  progress = FALSE,
  quote = "needed",
  escape = "backslash"
)