In [None]:
# https://www.hackerrank.com/challenges/temperature-predictions/problem
# Copy the data below into your clipboard:

20
yyyy    month   tmax    tmin
1908    January 5.0 -1.4
1908    February    7.3 1.9
1908    March   6.2 0.3
1908    April   Missing_1   2.1
1908    May Missing_2   7.7
1908    June    17.7    8.7
1908    July    Missing_3   11.0
1908    August  17.5    9.7
1908    September   16.3    8.4
1908    October 14.6    8.0
1908    November    9.6 3.4
1908    December    5.8 Missing_4
1909    January 5.0 0.1
1909    February    5.5 -0.3
1909    March   5.6 -0.3
1909    April   12.2    3.3
1909    May 14.7    4.8
1909    June    15.0    7.5
1909    July    17.3    10.8
1909    August  18.8    10.7

In [34]:
# Parse the challenge table that was copied to the clipboard.
# skip = 1 drops the first copied line, header = TRUE takes the column names
# from the next line, and sep = '' splits fields on any run of whitespace.
STDIN <- read.table(
  file = "clipboard",
  header = TRUE,
  skip = 1,
  sep = '',
  strip.white = TRUE
)
STDIN

yyyy,month,tmax,tmin
1908,January,5.0,-1.4
1908,February,7.3,1.9
1908,March,6.2,0.3
1908,April,Missing_1,2.1
1908,May,Missing_2,7.7
1908,June,17.7,8.7
1908,July,Missing_3,11.0
1908,August,17.5,9.7
1908,September,16.3,8.4
1908,October,14.6,8.0


In [35]:
#############################################################################################################
########################################## Model 1: StructTS ################################################
#############################################################################################################
shhh <- suppressPackageStartupMessages
shhh(library(imputeTS))

# Begin Model 1: Kalman Smoothing on Structural Time Series for missing value imputation

# Impute the "Missing_k" placeholders of the tmax and tmin columns with
# na_kalman() (Kalman smoothing on a structural time-series model).
#
# Args:
#   data: data frame with columns `tmax` and `tmin`; a missing reading is
#         stored as a string containing "Missing" (e.g. "Missing_1").
#
# Returns:
#   Numeric vector holding the imputed tmax values (in row order) followed by
#   the imputed tmin values (in row order), each rounded to one decimal.
#   NOTE(review): the values are grouped by column, not sorted by the
#   Missing_k index - confirm this is the order the consumer expects.
KalmanSmoothing <- function(data){

  # Convert a raw column to numeric, turning every "Missing_k" marker into NA.
  # The previous sapply()-over-the-data-frame approach produced a character
  # matrix, so na_kalman() was handed non-numeric input. Going through
  # as.character() also keeps this correct when the column arrives as a factor.
  ToNumericWithNA <- function(column){
    column <- as.character(column)
    column[grepl("Missing", column)] <- NA
    as.numeric(column)
  }

  tmax <- ToNumericWithNA(data[["tmax"]])
  tmin <- ToNumericWithNA(data[["tmin"]])

  # Smooth each series independently and keep one decimal place.
  tmax_smoothed <- round(
    na_kalman(tmax, model = "StructTS", smooth = TRUE, type = "trend"), 1
  )
  tmin_smoothed <- round(
    na_kalman(tmin, model = "StructTS", smooth = TRUE, type = "trend"), 1
  )

  # Report only the positions that were missing in the input.
  STDOUT <- c(
    as.numeric(tmax_smoothed)[is.na(tmax)],
    as.numeric(tmin_smoothed)[is.na(tmin)]
  )

  return(STDOUT)
}

# Print a label, then display the values imputed by KalmanSmoothing() for the
# table read into STDIN.
print("The missing values using are for the Structural Smoothing:")
KalmanSmoothing(STDIN)

[1] "The missing values using are for the Structural Smoothing:"


In [36]:
#############################################################################################################
########################################## Model 5: RandomForest ############################################
#############################################################################################################

# No implementation of XGBoost on HackerRank, so I developed a randomForest model
library(randomForest)

data <- STDIN

# Estimate the "Missing_k" placeholders in tmax and tmin with two stacked
# rounds of randomForest::rfImpute().
#
# Args:
#   data: data frame with columns yyyy, month (full English month name),
#         tmax and tmin; missing readings are strings containing "Missing".
#
# Returns:
#   Vector with the estimates for the rows whose tmax was missing, followed by
#   the estimates for the rows whose tmin was missing. Estimates are rounded
#   to 0 decimal places.
#
# NOTE(review): the body uses %>%, as_tibble(), mutate(), rename(),
# full_join(), arrange(), coalesce() and select(); only library(randomForest)
# is visible near this function - presumably dplyr/tidyverse is attached
# elsewhere, confirm.
# NOTE(review): no seed is set here, so the rfImpute() results presumably
# differ from run to run - confirm whether that is intended.
RandomForestStackedImputation <- function(data){
  # Encode the month as its position in month.abb by matching the first three
  # letters of the name.
  data$month <- substr(data$month, start = 1, stop = 3)
  data$month <- match(data$month, month.abb)
  
  # Replace every "Missing..." marker with NA, column by column.
  data <- sapply(data, function(x){
    replace(x, grepl("Missing", x), NA)
    })
  
  data <- as.data.frame(data)
  
  # Force all four columns to numeric after the sapply()/as.data.frame() round
  # trip above.
  data <- data %>%
    as_tibble() %>%
    mutate(yyyy = as.numeric(yyyy),
           month = as.numeric(month),
           tmax = as.numeric(tmax),
           tmin = as.numeric(tmin))

################### Instructions #####################
# Step A1: impute the missing values in tmin
# Step A2: estimate the missing values in tmax using the imputed tmin values (and time variables)

# Step B1: impute the missing values in tmax
# Step B2: estimate the missing values in tmin using the imputed tmax values (and time variables)
######################################################

# Step A1:
# Work on the rows whose tmax is known, with tmax as the rfImpute() response.
# The result is joined back onto `data` by (yyyy, month) and sorted, the
# imputed tmin is rounded to 0 decimals, and coalesce() prefers it over the
# original tmin wherever it is available.
# NOTE(review): rows where tmax and tmin are both missing would keep an NA
# tmin here - confirm such rows cannot occur.
  
  tmin_imputed_data <- subset(data, !is.na(tmax)) %>%
    rfImpute(tmax ~ ., data = .) %>%
    rename(tmin_imputed_values = tmin) %>%
    rename(tmax_to_predict = tmax) %>%
    full_join(., data, by = c("yyyy", "month")) %>%
    arrange(yyyy, month) %>%
    mutate(tmin_imputed_values = round(tmin_imputed_values, 0),
           tmin = coalesce(tmin_imputed_values, tmin)) %>%
    select(yyyy, month, tmax_to_predict, tmin)

# Step A2:
# Use the imputed tmin values to predict the tmax values
# (tmin is now the rfImpute() response; the result is rounded to 0 decimals).

  tmax_predictions <- tmin_imputed_data %>%
    rfImpute(tmin ~ ., data = .) %>%
    rename(tmax_imputed_values_after_tmin_imputation = tmax_to_predict) %>%
    mutate(tmax_imputed_values_after_tmin_imputation = round(tmax_imputed_values_after_tmin_imputation, 0))


# Step B1:
# Mirror image of Step A1 with the roles of tmax and tmin swapped.

  tmax_imputed_data <- subset(data, !is.na(tmin)) %>%
    rfImpute(tmin ~ ., data = .) %>%
    rename(tmax_imputed_values = tmax) %>%
    rename(tmin_to_predict = tmin) %>%
    full_join(., data, by = c("yyyy", "month")) %>%
    arrange(yyyy, month) %>%
    mutate(tmax_imputed_values = round(tmax_imputed_values, 0),
           tmax = coalesce(tmax_imputed_values, tmax)) %>%
    select(yyyy, month, tmin_to_predict, tmax)


# Step B2:
# Use the imputed tmax values to predict the tmin values
# (tmax is now the rfImpute() response; the result is rounded to 0 decimals).

  tmin_predictions <- tmax_imputed_data %>%
    rfImpute(tmax ~ ., data = .) %>%
    rename(tmin_imputed_values_after_tmax_imputation = tmin_to_predict) %>%
    mutate(tmin_imputed_values_after_tmax_imputation = round(tmin_imputed_values_after_tmax_imputation, 0))


# Put the original columns and both prediction frames side by side.
# NOTE(review): cbind() pairs rows by position and the prediction frames are
# sorted by (yyyy, month), so this assumes `data` is already in that order -
# TODO confirm.
# NOTE(review): after cbind() the names yyyy, month, tmax and tmin occur more
# than once; is.na(tmax) / is.na(tmin) below presumably resolve to the first
# (original) columns - confirm.
  Stacked_RandomForest_models <- cbind(data, tmax_predictions, tmin_predictions)

# Keep only the predictions for the cells that were missing in the input.
  RandomForest_tmax <- subset(Stacked_RandomForest_models, is.na(tmax), select = c("tmax_imputed_values_after_tmin_imputation"))
  RandomForest_tmin <- subset(Stacked_RandomForest_models, is.na(tmin), select = c("tmin_imputed_values_after_tmax_imputation"))

  RandomForest_tmax <- unlist(RandomForest_tmax, use.names = FALSE)
  RandomForest_tmin <- unlist(RandomForest_tmin, use.names = FALSE)

# tmax estimates first, then tmin estimates.
  STDOUT <- c(RandomForest_tmax, RandomForest_tmin)

  return(STDOUT)
}


# Run the stacked random-forest imputation on the table read into STDIN.
# The "Out-of-bag" tables that follow in the cell output are progress output
# emitted during the run, not part of the returned vector.
RandomForestStackedImputation(STDIN)

     |      Out-of-bag   |
Tree |      MSE  %Var(y) |
 300 |    14.16    70.35 |
     |      Out-of-bag   |
Tree |      MSE  %Var(y) |
 300 |    13.99    69.51 |
     |      Out-of-bag   |
Tree |      MSE  %Var(y) |
 300 |    13.51    67.16 |
     |      Out-of-bag   |
Tree |      MSE  %Var(y) |
 300 |    14.36    71.36 |
     |      Out-of-bag   |
Tree |      MSE  %Var(y) |
 300 |    14.41    71.61 |
     |      Out-of-bag   |
Tree |      MSE  %Var(y) |
 300 |    9.947    35.45 |
     |      Out-of-bag   |
Tree |      MSE  %Var(y) |
 300 |    10.76    38.34 |
     |      Out-of-bag   |
Tree |      MSE  %Var(y) |
 300 |    10.58    37.70 |
     |      Out-of-bag   |
Tree |      MSE  %Var(y) |
 300 |    9.808    34.95 |
     |      Out-of-bag   |
Tree |      MSE  %Var(y) |
 300 |    9.889    35.24 |
     |      Out-of-bag   |
Tree |      MSE  %Var(y) |
 300 |    11.39    39.12 |
     |      Out-of-bag   |
Tree |      MSE  %Var(y) |
 300 |    12.95    44.51 |
     |      Out-of-bag   |
T