# Loading Packages & Initialization

Before running the notebook, be sure to set:
* nl.model
* model.type
* output.folder
* iteration_budget
* selected_ins

In [2]:
rm(list = ls())

library(data.table)
library(tidyverse)
library(rJava)
library(RNetLogo)

library(lhs)  # For maximin Latin hypercube sampling
library(ggplot2)
library(plotly)  # For beautiful plotting
library(caret)
library(randomForest)
library(factoextra)
library(e1071)
library(TSrepr)  # for evaluating predictive power

require(gridExtra)

options(warn = -1)

In [3]:
# NetLogo session setup.
# Is_Headless: 0 opens the NetLogo GUI, any other value starts a headless instance.
Is_Headless <- 1
nl.model <- "Segregation_Dummy"

nl.path <- "C:/Program Files/NetLogo 6.0.4/app"
folder.path <- "C:/Users/paslanpatir/Desktop/TEZ_v2/"
model.path <- paste0(folder.path, nl.model, ".nlogo")

# The instance is registered under the name stored in `nl.model` in BOTH modes:
# run_model() always talks to NetLogo with nl.obj = nl.model, so a GUI session
# started without that name could not be reached by the rest of the script.
NLStart(nl.path, gui = (Is_Headless == 0), nl.jarname = "netlogo-6.0.4.jar", nl.obj = nl.model)
NLLoadModel(model.path, nl.obj = nl.model)

In [4]:
# "Segregation" is the basic model; any other model name is handled as the dummy variant
model.type <- if (nl.model == "Segregation") "basic" else "dummy"

# folder that holds the input data sets
data.path <- paste0(folder.path, "data/")

# dated folder that receives the outputs of this run (created when missing)
output.folder <- paste0("outputs_FeatureSelect_DENEME_", Sys.Date())
outputs.path <- paste0(folder.path, output.folder, "/")
dir.create(file.path(folder.path, output.folder), showWarnings = FALSE)

In [5]:
# Path of the ReadMe file that keeps the run settings next to the outputs
ReadMe <- paste0(outputs.path, paste0("ReadMe_", model.type, ".txt"))

# Model Parameters & Functions

## Set model parameters

In [6]:
# Number of NetLogo replications that are averaged for each instance
nofrep <- 10

# Feature names, listed in the order in which run_model sets them in NetLogo
if (model.type == "basic") {
    feature_names <- c("density", "%-similar-wanted")
} else if (model.type == "dummy") {
    feature_names <- c(
        "density", "%-similar-wanted", "budget-multiplier-dummy",
        "density-multiplier-dummy", "noise-dummy", "tick-limit"
    )
}

# NetLogo reporter that is read as the model output
output_name <- "percent-similar"

# Number of input parameters of the agent-based model
nofparams <- length(feature_names)

# Random forest settings
ntree <- 300
mtry <- 2

## Set user parameters

In [7]:
# Per-instance error measure added by get_test_predictions: "RMSE", "MAPE" or "BIAS"
error_type <- "RMSE"

# Uncertainty measure read by sample_selection: "sd", "range" or "coefvar"
selection_metric <- "sd"

# Number of iterations and replication ids
iteration_budget <- 11
metarep <- 1:10

# Number of instances
unlabeled_ins <- 200
test_ins <- 200
train_ins_oneshot <- 200
train_ins_Ad <- 100

# Number of instances selected from the pool in each iteration
selected_ins <- 5

# Number of variables eliminated in each elimination step
h <- 1

# Seeds identifying the training-set files that are loaded
seed.oneshot <- 0
seed.Ad <- 0

# Sizes of the test sets that are loaded, and the pool handling type
size.test <- 100
unlabeled.type <- "constant"

In [8]:
#unlabeled_ins = 100
#test_ins = 100
#train_ins_oneshot = 50
#train_ins_Ad = 50
#selected_ins = 1
#nofrep = 1
#metarep = c(1:2)
#iteration_budget = 6

In [9]:
# Append the settings of this run to the ReadMe file, one setting per line.
# Every vector-valued setting is collapsed into a single string first: paste0()
# recycles its arguments, so pasting the raw `metarep` vector made one complete
# copy of the settings text per element and wrote all of them to the file.
write(
    c(
        paste0("model =", nl.model),
        paste0("nofrep =", nofrep),
        paste0("metarep =", paste(metarep, collapse = " ")),
        paste0("ntree =", ntree),
        paste0("mtry =", mtry),
        paste0("iteration_budget =", iteration_budget),
        paste0("unlabeled_ins =", unlabeled_ins),
        paste0("unlabeled.type =", unlabeled.type),
        paste0("test_ins =", test_ins),
        paste0("train_ins_Ad =", train_ins_Ad),
        paste0("selected_ins =", selected_ins),
        paste0("h =", h),
        paste0(c("seed.oneshot =", seed.oneshot), collapse = " "),
        paste0(c("seed.Ad =", seed.Ad), collapse = " "),
        paste0(c("size.test =", size.test), collapse = " "),
        paste0("error_type =", error_type),
        paste0("selection_metric =", selection_metric),
        paste0("Date =", Sys.Date())
    ),
    ReadMe, append = TRUE, sep = "\n"
)

## Define functions

### run_model

In [10]:
# Run the NetLogo model once for one parameter combination.
#
# feature_values: values for the global `feature_names`, in the same order;
#                 only the first length(feature_names) entries are used.
# ticks:          number of times the NetLogo "go" procedure is executed
#                 (defaults to the previously hard-coded 100).
# Returns the value of the `output_name` reporter after the run.
# Reads the globals feature_names, output_name and nl.model.
run_model <- function(feature_values, ticks = 100) {
    # seq_along() keeps the loop empty when there are no features (1:0 would not)
    for (i in seq_along(feature_names)) {
        NLCommand(paste0("set ", feature_names[i], " ", feature_values[i]), nl.obj = nl.model)
    }
    NLCommand("setup", nl.obj = nl.model)
    NLDoCommand(ticks, "go", nl.obj = nl.model)
    NLReport(output_name, nl.obj = nl.model)
}

### run_replicas

In [11]:
# Run the model `nofrep` times with the same feature values and return the
# mean of the replication outputs.
run_replicas <- function(nofrep, feature_values) {
    outputs <- rep(NA_real_, nofrep)  # one slot per replication
    for (rep_no in 1:nofrep) {
        outputs[rep_no] <- run_model(feature_values)
    }
    mean(outputs)
}

### run_ABM

In [12]:
# Label instances by running the agent-based model.
#
# nofrep:       replications averaged per instance (passed to run_replicas)
# nofinstances: number of leading rows of `unlabeledset` to label
# unlabeledset: data.table containing at least the columns in the global `feature_names`
# Returns `unlabeledset` with the feature columns moved to the front and the
# averaged model result stored in column `output`.
# Note: setcolorder() and `:=` work by reference, so the table handed in by the
# caller is modified as well.
run_ABM <- function(nofrep, nofinstances, unlabeledset) {
    setcolorder(unlabeledset, feature_names)
    # seq_len() keeps the loop empty when nofinstances is 0 (1:0 would visit rows 1 and 0)
    for (i in seq_len(nofinstances)) {
        # hand over the feature columns only; bookkeeping columns such as `idx`
        # or `output` are not model inputs and must not influence the conversion
        feature_values <- as.matrix(unlabeledset[i, feature_names, with = FALSE])
        unlabeledset[i, output := run_replicas(nofrep, feature_values)]
    }
    return(unlabeledset)
}

### error functions

In [13]:
# error functions on test data
# Root mean squared error between predicted and actual values
rmse_func <- function(actual, predicted) {
    sqrt(mean((predicted - actual)^2))
}

# Absolute percentage error of each element (not averaged)
mape_func <- function(actual, predicted) {
    relative_error <- abs(actual - predicted) / actual
    relative_error * 100
}

# Signed relative error of each element: (actual - predicted) / actual
bias_func <- function(actual, predicted) {
    residual <- actual - predicted
    residual / actual
}

# error functions on train data
# Out-of-bag error of a fitted forest.
#
# model: object with the fields `type`, `ntree` and `mse` (regression) or
#        `err.rate` (classification).
# Returns model$mse[model$ntree] for regression and model$err.rate for
# classification. Any other type stops with an explicit message instead of the
# former "object 'oob_error' not found".
obb_error_func <- function(model) {
    if (model$type == "regression") {
        return(model$mse[model$ntree])
    }
    if (model$type == "classification") {
        return(model$err.rate)
    }
    stop("obb_error_func: unsupported model type '", model$type, "'", call. = FALSE)
}

### get_test_predictions

In [14]:
# prediction functions
# Predict a test set with a fitted model and evaluate the predictions.
#
# model:     fitted model accepted by predict()
# testset:   data.table with the feature columns and the observed output
# errortype: "MAPE", "RMSE" or "BIAS"; a column of that name holding the
#            per-row error is added. Any other value adds no error column.
#            (The argument is now honoured; previously the global `error_type`
#            was read instead.)
# Returns list(predictions table, 1 x 3 matrix of mae / rmse / mape over the
# whole test set, names of the output columns).
get_test_predictions <- function(model, testset, errortype) {
    
    predictedLabels <- predict(model, testset)
    predictedLabels <- cbind(testset, predictedLabels)
    setnames(predictedLabels, "predictedLabels", "pred_output")
    
    # columns whose name contains "output"; the code below treats the first as
    # the observed output and the second as the prediction ("pred_output" is
    # appended last, so it follows the columns of `testset`)
    output_variables <- colnames(select(predictedLabels, contains("output")))
    
    # per-row error with the function that matches `errortype`
    error_fun <- switch(errortype, MAPE = mape_func, RMSE = rmse_func, BIAS = bias_func)
    if (!is.null(error_fun)) {
        predictedLabels[, (errortype) := mapply(error_fun, get(output_variables[1]), get(output_variables[2]))]
    }
    
    actual_output <- predictedLabels[[output_variables[1]]]
    predicted_output <- predictedLabels[[output_variables[2]]]
    
    # aggregate performance over the whole test set
    performance_temp <- matrix(
        c(
            mae(actual_output, predicted_output),
            rmse(actual_output, predicted_output),
            mape(actual_output, predicted_output)
        ),
        nrow = 1, ncol = 3
    )
    
    return(list(predictedLabels, performance_temp, output_variables))
    
}

### sample_selection

In [15]:
# Adaptive sample selection: pick the pool instances on which the trees of the
# forest disagree most.
#
# selected_ins:  number of instances to select
# unlabeled_set: data.table with the candidate instances; a column `idx`
#                (row number) is added to it by reference
# model:         fitted randomForest
# The disagreement measure is chosen by the global `selection_metric`:
#   "sd"      standard deviation of the tree predictions
#   "range"   max - min of the tree predictions
#   "coefvar" coefficient of variation (sd / mean * 100)
# Returns the selected rows of `unlabeled_set` (including `idx`), most
# uncertain first; at most nrow(unlabeled_set) rows are returned.
sample_selection <- function(selected_ins, unlabeled_set, model) {
    # one row per pool instance, one column per tree of the forest
    tree_pred <- predict(model, unlabeled_set, predict.all = TRUE)$individual
    
    uncertainty <- switch(selection_metric,
        sd = apply(tree_pred, 1, sd),
        range = apply(tree_pred, 1, function(p) diff(range(p))),
        coefvar = apply(tree_pred, 1, function(p) sd(p) / mean(p) * 100),
        stop("sample_selection: unknown selection_metric '", selection_metric, "'", call. = FALSE)
    )
    
    # head() instead of [1:selected_ins] so that a pool smaller than
    # `selected_ins` does not produce NA row indices
    chosen <- head(order(-uncertainty), selected_ins)
    
    unlabeled_set[, idx := seq_len(.N)]
    train_candidates <- unlabeled_set[chosen]
    
    return(train_candidates)
}

### random_sample_selection

In [16]:
# Random sample selection: draw `selected_ins` rows of the pool without
# replacement. A row-number column `idx` is added to `unlabeled_set` by
# reference; the drawn rows are returned in pool order.
random_sample_selection <- function(selected_ins, unlabeled_set) {
    unlabeled_set[, idx := 1:.N]
    
    drawn_idx <- sample(unlabeled_set[["idx"]], size = selected_ins, replace = FALSE, prob = NULL)
    
    unlabeled_set[is.element(idx, drawn_idx)]
}

### get_variable_importance

In [17]:
# Rank the predictors of a fitted forest by importance (type = 1, unscaled).
#
# model: fitted randomForest trained with importance = TRUE
# Returns the predictor names, most important first.
# The names are taken from the importance matrix itself, so the ranking stays
# correct when the model was trained on a subset or a different ordering of
# the global `feature_names`; `feature_names` is only a fallback when the
# matrix carries no row names.
get_variable_importance <- function(model) {
    importances <- importance(model, type = 1, scale = FALSE)
    
    predictor_names <- rownames(importances)
    if (is.null(predictor_names)) {
        predictor_names <- feature_names
    }
    
    ranked_features <- predictor_names[order(importances, decreasing = TRUE)]
    return(ranked_features)
}

### feature_elimination

In [18]:
# Drop the `h` lowest-ranked features that are still in use.
#
# h:                           number of features to eliminate in this step
# total_numof_eliminated_vars: number of features already eliminated from the
#                              tail of `ranked_features`
# ranked_features:             feature names, most important first
# Returns list(columns_left, updated total_numof_eliminated_vars, h,
# eliminated_columns).
# Fixes over the earlier version: all `h` eliminated names are reported (only
# one was returned for h > 1), and an empty remainder yields character(0)
# instead of the first feature (1:0 indexing).
feature_elimination <- function(h, total_numof_eliminated_vars, ranked_features) {
    n_ranked <- length(ranked_features)
    n_in_use <- max(0, n_ranked - total_numof_eliminated_vars)
    n_left <- max(0, n_in_use - h)
    
    columns_left <- ranked_features[seq_len(n_left)]
    
    # positions that were in use before this step but are not kept
    eliminated_columns <- ranked_features[setdiff(seq_len(n_in_use), seq_len(n_left))]
    
    # update total_numof_eliminated_vars
    total_numof_eliminated_vars <- n_ranked - length(columns_left)
    
    return(list(columns_left, total_numof_eliminated_vars, h, eliminated_columns))
}

# Unlabeled Data Pool

*Latin hypercube sampling*

In [19]:
# Read the pool of unlabeled instances for this model type and pool size
unlabeled_pool.name <- paste0(data.path, "unlabeled_pool", "_", model.type, "_", unlabeled_ins, ".csv")
unlabeled_pool <- fread(unlabeled_pool.name)

# untouched copy of the pool, used to reset it later
data_candidates <- copy(unlabeled_pool)

# Test Set

In [20]:
#### Test Sets ####
# Read one test set per requested size and keep it as `test_set_<size>`
for (t in size.test) {
    test_set.name <- paste0(data.path, "test_set", "_", model.type, "_", t, ".csv")
    test_set <- fread(test_set.name)
    
    assign(paste0("test_set_", t), test_set)
}

# One-Shot Training Set

In [21]:
# Read the one-shot training set of every seed; each set is kept as
# `training_set_<seed>` and also stacked into `training_set_all` with a `seed` column
training_set_all <- data.table()

for (i in seed.oneshot) {
    
    training_set.name <- paste0(data.path, "training_set", "_", model.type, "_", train_ins_oneshot, "_seed", i, ".csv")
    training_set <- fread(training_set.name)
    
    assign(paste0("training_set_", i), training_set)
    
    training_set_all <- rbind(training_set_all, data.table(training_set, "seed" = i))
    rm(training_set, training_set.name)
}

In [22]:
# independent copy of the stacked one-shot training data
one_shot_data <- copy(training_set_all)

## One-shot Train & Test Metamodel

In [23]:
#### OneShot Metamodel ####
# Empty record tables filled by the one-shot training loop
performance_table_oneshot <- data.table(
    iter = numeric(), seed = numeric(), rep = numeric(), size = numeric(),
    mae = numeric(), rmse = numeric(), mape = numeric()
)
predictedLabels_oneshot_all <- data.table()
obb_error_oneshot_all <- data.table(obb_error = numeric(), seed = numeric(), rep = numeric())

In [24]:
# One-shot metamodel: for every seed and replication, train a forest on the
# full one-shot training set, save it, and record its OOB error and its
# performance on every test set.
for (i in seed.oneshot) {
    
    # training data of this seed without the bookkeeping column
    training_set <- copy(one_shot_data[seed == i, .SD, .SDcols = -c("seed")])
    
    for (r in metarep) {
        
        #tuning = tune.randomForest(x = training_set[, -c("output")], y = training_set$output, ntree = c(100,200,300,400), mtry = c(1, 2, 3), tunecontrol = tune.control(sampling = "cross", cross = length(training_set$output)))
        #model_oneshot <- randomForest(x = training_set[, -c("output")], y = training_set$output, importance = TRUE,ntree = as.numeric(tuning$best.parameters$ntree), mtry = as.numeric(tuning$best.parameters$mtry))
        model_oneshot <- randomForest(x = training_set[, -c("output")], y = training_set$output, importance = TRUE, ntree = ntree, mtry = mtry)
        model_Sub.path <- paste0(outputs.path, "model_oneshot_seed_", i, "_rep_", r, "_size_", train_ins_oneshot, ".rds")
        saveRDS(model_oneshot, model_Sub.path)
        
        # out-of-bag error of this model
        obb_error_oneshot <- obb_error_func(model_oneshot)
        obb_error_oneshot_all <- rbind(
            obb_error_oneshot_all,
            data.table(obb_error_oneshot, "seed" = i, "rep" = r),
            use.names = FALSE
        )
        
        # predictions on each test set
        for (t in size.test) {
            test_set <- get(paste0("test_set_", t))
            
            test_prediction_oneshot <- get_test_predictions(model_oneshot, test_set, error_type)
            predictedLabels_oneshot <- test_prediction_oneshot[[1]]
            
            predictedLabels_oneshot_all <- rbind(
                predictedLabels_oneshot_all,
                data.table(predictedLabels_oneshot, "seed" = i, "rep" = r, "size" = t)
            )
            performance_table_oneshot <- rbind(
                performance_table_oneshot,
                data.table(1, i, r, t, test_prediction_oneshot[[2]]),
                use.names = FALSE
            )
            output_variables <- test_prediction_oneshot[[3]]
        }
        rm(model_oneshot, obb_error_oneshot, test_prediction_oneshot, predictedLabels_oneshot)
    }
}

In [25]:
# Write every one-shot record table to "<outputs.path><model.type>_<table name>.csv"
invisible(lapply(
    c("predictedLabels_oneshot_all", "performance_table_oneshot", "obb_error_oneshot_all"),
    function(table_name) {
        fwrite(get(table_name), paste0(outputs.path, model.type, "_", table_name, ".csv"))
    }
))

In [26]:
# the one-shot record tables are on disk; drop them from the workspace
rm(list = c("predictedLabels_oneshot_all", "performance_table_oneshot", "obb_error_oneshot_all"))

# Adaptive Training Set

In [27]:
#### Adaptive Set ####
# Read the initial training set of every seed; each set is kept as
# `training_set_Ad_<seed>` and also stacked into `training_set_Ad_all` with a `seed` column
training_set_Ad_all <- data.table()
for (i in seed.Ad) {
    
    training_set.name <- paste0(data.path, "training_set", "_", model.type, "_", train_ins_Ad, "_seed", i, ".csv")
    training_set <- fread(training_set.name)
    
    assign(paste0("training_set_Ad_", i), training_set)
    
    training_set_Ad_all <- rbind(training_set_Ad_all, data.table(training_set, "seed" = i))
    rm(training_set, training_set.name)
}

# independent copy used as the starting point of every replication
adaptive_initial_data <- copy(training_set_Ad_all)

## Random Sampling Train & Test Metamodel

In [28]:
#### Random Sampling ####

# Test-set performance of every trained model
performance_table <- data.table(
    iter = numeric(), seed = numeric(), rep = numeric(), size = numeric(),
    mae = numeric(), rmse = numeric(), mape = numeric()
)
# Out-of-bag error of every trained model
obb_error <- data.table(iter = numeric(), obb_error = numeric(), seed = numeric(), rep = numeric())
# Remaining record tables start empty
predictedLabels_all <- data.table()
train_candidates_all <- data.table()
training_set_Ad_final <- data.table()

In [29]:
# Random-sampling experiment. For every seed and replication the metamodel is
# retrained `iteration_budget` times; between two trainings `selected_ins`
# randomly drawn pool instances are labelled with the ABM and added to the
# training set. Every intermediate result is written to the output folder,
# both to per-iteration files and to "_Append_" files that grow with the run.
for (i in seed.Ad) {
    print(paste0("seed : ", i, "  Random Sampling section start time : ", Sys.time()))
    
    for (r in metarep) {
        # replications; seeding with seed + rep makes the draws reproducible
        set.seed(i + r)
        print(paste0("seed : ", i, "   rep : ", r, "  Random Sampling section start time : ", Sys.time()))
        
        training_set_Ad <- copy(adaptive_initial_data[seed == i, .SD, .SDcols = -c("seed")])
        train_candidates_table <- data.table()
        
        # every replication starts from the same, complete pool
        unlabeled_pool <- copy(data_candidates)
        
        iter <- 1
        while (iter <= iteration_budget) {
            print(iter)
            
            trainx <- training_set_Ad[, .SD, .SDcols = feature_names]
            trainy <- training_set_Ad$output
            
            # Train the model; it is kept in the workspace under its name and saved to disk
            model_Sub <- randomForest(x = trainx, y = trainy, importance = TRUE, ntree = ntree, mtry = mtry)
            model_Sub.name <- paste0("model_Rd_", iter, "_seed_", i, "_rep_", r)
            assign(model_Sub.name, model_Sub)
            model_Sub.path <- paste0(outputs.path, paste0(model_Sub.name, "_size_", train_ins_Ad, ".rds"))
            saveRDS(model_Sub, model_Sub.path)
            
            # OOB error: one typed row, appended to the file and to the in-memory table
            obb_err <- rbind(
                data.table(iter = numeric(), obb_error = numeric(), seed = numeric(), rep = numeric()),
                data.table(iter, obb_error_func(model_Sub), i, r),
                use.names = FALSE
            )
            fwrite(obb_err, paste0(outputs.path, model.type, "_Append_", "obb_error_Rd", ".csv"), append = TRUE)
            obb_error <- rbind(obb_error, obb_err, use.names = FALSE)
            
            # test the model on every test set
            predictedLabels_Comb <- data.table()
            for (t in size.test) {
                test_set <- get(paste0("test_set_", t))
                
                test_predictions_Sub <- get_test_predictions(model_Sub, test_set, error_type)
                predictedLabels_Sub <- test_predictions_Sub[[1]]
                # tag prediction and error columns with the iteration number
                setnames(
                    predictedLabels_Sub,
                    c("pred_output", error_type),
                    c(paste0("pred_output_", iter), paste0(error_type, "_", iter))
                )
                
                fwrite(
                    predictedLabels_Sub,
                    paste0(outputs.path, model.type, "_", "predictedLabels.Rd_seed_", i, "_iter_", iter, "_rep_", r, "_size_", t, ".csv")
                )
                
                predictedLabels_Comb <- rbind(predictedLabels_Comb, data.table("size" = t, predictedLabels_Sub))
                
                # test-set error record: one typed row, appended to file and table
                perf_table <- rbind(
                    data.table(iter = numeric(), seed = numeric(), rep = numeric(), size = numeric(), mae = numeric(), rmse = numeric(), mape = numeric()),
                    data.table(iter, i, r, t, test_predictions_Sub[[2]]),
                    use.names = FALSE
                )
                fwrite(perf_table, paste0(outputs.path, model.type, "_Append_", "performance_table_Rd", ".csv"), append = TRUE)
                performance_table <- rbind(performance_table, perf_table, use.names = FALSE)
            }
            if (iter == 1) {
                # NOTE(review): keeps only the first 4 columns (size + 3 test-set
                # columns) as identifying columns - confirm this is intended for
                # models with more than two features
                predictedLabels_table <- copy(predictedLabels_Comb[, 1:4])
            }
            predictedLabels_table <- cbind(
                predictedLabels_table,
                predictedLabels_Comb[, .SD, .SDcols = c(paste0("pred_output_", iter), paste0(error_type, "_", iter))]
            )
            
            if (iter != iteration_budget) {
                # below efforts are unnecessary when the budget is reached.
                
                # select candidates from a copy of the pool (selection adds `idx` by reference)
                unlabeled_set <- copy(unlabeled_pool)
                train_candidates <- random_sample_selection(selected_ins, unlabeled_set)
                
                # Eliminate train candidates from the unlabeled pool
                unlabeled_pool <- unlabeled_pool[-train_candidates$idx]
                rm(unlabeled_set)
                
                # run ABM to find outputs of train candidates
                print(paste0("ABM train_candidate run start time : ", Sys.time()))
                train_candidates <- run_ABM(nofrep, selected_ins, train_candidates)
                print(paste0("ABM train_candidate run end time : ", Sys.time()))
                # the file name carries the iteration number; it used to repeat the
                # seed after "_iter_", so each iteration overwrote the previous file
                fwrite(
                    train_candidates,
                    paste0(outputs.path, model.type, "_", "train_candidates.Rd_seed_", i, "_iter_", iter, "_rep_", r, ".csv")
                )
                
                train_candidates_table <- rbind(train_candidates_table, data.table(train_candidates, "iter" = iter))
                
                # Add new data to train data
                training_set_Ad <- rbind(training_set_Ad, train_candidates[, -c("idx")])
            }
            iter <- iter + 1
        }
        
        # final training data of this replication: append to file and to the record table
        fwrite(
            data.table(training_set_Ad, "seed" = i, "rep" = r),
            paste0(outputs.path, model.type, "_Append_", "FinalTrainData_Rd", ".csv"), append = TRUE
        )
        training_set_Ad_final <- rbind(training_set_Ad_final, data.table(training_set_Ad, "seed" = i, "rep" = r))
        
        # predictions of all iterations of this replication
        fwrite(
            data.table(predictedLabels_table, seed = i, rep = r),
            paste0(outputs.path, model.type, "_Append_", "predictedLabels_table_Rd", ".csv"), append = TRUE
        )
        predictedLabels_all <- rbind(predictedLabels_all, data.table(predictedLabels_table, "seed" = i, "rep" = r))
        
        # instances selected in this replication
        fwrite(
            data.table(train_candidates_table, "seed" = i, "rep" = r),
            paste0(outputs.path, model.type, "_Append_", "train_candidates_table_Rd", ".csv"), append = TRUE
        )
        train_candidates_all <- rbind(train_candidates_all, data.table(train_candidates_table, "seed" = i, "rep" = r))
        
        print(paste0("seed : ", i, "   rep : ", r, "  Random Sampling section end time : ", Sys.time()))
    }
    
    print(paste0("seed : ", i, "  Random Sampling section end time : ", Sys.time()))
    #rm(training_set_Ad, predictedLabels_table, train_candidates_table)
}

[1] "seed : 0  Random Sampling section start time : 2020-03-02 23:08:43"
[1] "seed : 0   rep : 1  Random Sampling section start time : 2020-03-02 23:08:43"
[1] 1
[1] "ABM train_candidate run start time : 2020-03-02 23:08:44"
[1] "ABM train_candidate run end time : 2020-03-02 23:08:45"
[1] 2
[1] "ABM train_candidate run start time : 2020-03-02 23:08:45"
[1] "ABM train_candidate run end time : 2020-03-02 23:08:47"
[1] 3
[1] "ABM train_candidate run start time : 2020-03-02 23:08:47"
[1] "ABM train_candidate run end time : 2020-03-02 23:08:47"
[1] 4
[1] "ABM train_candidate run start time : 2020-03-02 23:08:47"
[1] "ABM train_candidate run end time : 2020-03-02 23:08:50"
[1] 5
[1] "ABM train_candidate run start time : 2020-03-02 23:08:50"
[1] "ABM train_candidate run end time : 2020-03-02 23:08:51"
[1] 6
[1] "seed : 0   rep : 1  Random Sampling section end time : 2020-03-02 23:08:51"
[1] "seed : 0   rep : 2  Random Sampling section start time : 2020-03-02 23:08:51"
[1] 1
[1] "ABM train_can

In [30]:
# Final records of the random-sampling run, kept under "_Rd" names
FinalTrainData_Rd <- copy(training_set_Ad_final)
obb_error_Rd <- copy(obb_error)
performance_table_Rd <- copy(performance_table)
predictedLabels_table_Rd <- copy(predictedLabels_all)
train_candidates_table_Rd <- copy(train_candidates_all)

In [31]:
# drop the working tables; their contents live on in the "_Rd" copies
rm(list = c("training_set_Ad_final", "obb_error", "performance_table", "predictedLabels_all", "train_candidates_all"))

In [32]:
# Write every random-sampling record table to "<outputs.path><model.type>_<table name>.csv"
invisible(lapply(
    c(
        "FinalTrainData_Rd", "performance_table_Rd", "train_candidates_table_Rd",
        "predictedLabels_table_Rd", "obb_error_Rd"
    ),
    function(table_name) {
        fwrite(get(table_name), paste0(outputs.path, model.type, "_", table_name, ".csv"))
    }
))

In [33]:
# the random-sampling records are on disk; drop them from the workspace
rm(list = c("FinalTrainData_Rd", "performance_table_Rd", "train_candidates_table_Rd", "predictedLabels_table_Rd", "obb_error_Rd"))

## Adaptive Train & Test Metamodel

In [34]:
#### Adaptive Training ####
# Test-set performance of every trained model
performance_table <- data.table(
    iter = numeric(), seed = numeric(), rep = numeric(), size = numeric(),
    mae = numeric(), rmse = numeric(), mape = numeric()
)
# Out-of-bag error of every trained model
obb_error <- data.table(iter = numeric(), obb_error = numeric(), seed = numeric(), rep = numeric())

## remaining record tables start empty
predictedLabels_all <- data.table()
train_candidates_all <- data.table()
training_set_Ad_final <- data.table()
importance_table_Ad <- data.table()

In [35]:
# Adaptive-sampling experiment. For every seed and replication the metamodel
# is retrained `iteration_budget` times; between two trainings the
# `selected_ins` pool instances chosen by sample_selection() are labelled with
# the ABM and added to the training set. Every intermediate result is written
# to the output folder, both to per-iteration files and to "_Append_" files.
for (i in seed.Ad) {
    print(paste0("seed : ", i, "  Adaptive Sampling section start time : ", Sys.time()))
    
    for (r in metarep) {
        # replications; seeding with seed + rep makes the run reproducible
        set.seed(i + r)
        print(paste0("seed : ", i, "   rep : ", r, "  Adaptive Sampling section start time : ", Sys.time()))
        
        # every replication starts from the complete pool and the initial training set
        unlabeled_pool <- copy(data_candidates)
        training_set_Ad <- copy(adaptive_initial_data[seed == i, .SD, .SDcols = -c("seed")])
        train_candidates_table <- data.table()
        
        iter <- 1
        while (iter <= iteration_budget) {
            print(iter)
            
            trainx <- training_set_Ad[, .SD, .SDcols = feature_names]
            trainy <- training_set_Ad$output
            
            # Train the model; it is kept in the workspace under its name and saved to disk
            model_Sub <- randomForest(x = trainx, y = trainy, importance = TRUE, ntree = ntree, mtry = mtry)
            model_Sub.name <- paste0("model_Ad_", selection_metric, "_", iter, "_seed_", i, "_rep_", r)
            assign(model_Sub.name, model_Sub)
            model_Sub.path <- paste0(outputs.path, paste0(model_Sub.name, "_size_", train_ins_Ad, ".rds"))
            saveRDS(model_Sub, model_Sub.path)
            
            # OOB error: one typed row, appended to the file and to the in-memory table
            obb_err <- rbind(
                data.table(iter = numeric(), obb_error = numeric(), seed = numeric(), rep = numeric()),
                data.table(iter, obb_error_func(model_Sub), i, r),
                use.names = FALSE
            )
            fwrite(obb_err, paste0(outputs.path, model.type, "_Append_", "obb_error_Ad_", selection_metric, ".csv"), append = TRUE)
            obb_error <- rbind(obb_error, obb_err, use.names = FALSE)
            
            # test the model on every test set
            predictedLabels_Comb <- data.table()
            for (t in size.test) {
                test_set <- get(paste0("test_set_", t))
                
                test_predictions_Sub <- get_test_predictions(model_Sub, test_set, error_type)
                predictedLabels_Sub <- test_predictions_Sub[[1]]
                # tag prediction and error columns with the iteration number
                setnames(
                    predictedLabels_Sub,
                    c("pred_output", error_type),
                    c(paste0("pred_output_", iter), paste0(error_type, "_", iter))
                )
                
                fwrite(
                    predictedLabels_Sub,
                    paste0(outputs.path, model.type, "_", "predictedLabels.Ad_", selection_metric, "_seed_", i, "_iter_", iter, "_rep_", r, "_size_", t, ".csv")
                )
                
                predictedLabels_Comb <- rbind(predictedLabels_Comb, data.table("size" = t, predictedLabels_Sub))
                
                # test-set error record: one typed row, appended to file and table.
                # The append file carries the selection metric like every other
                # adaptive output, so runs with different metrics are not mixed.
                perf_table <- rbind(
                    data.table(iter = numeric(), seed = numeric(), rep = numeric(), size = numeric(), mae = numeric(), rmse = numeric(), mape = numeric()),
                    data.table(iter, i, r, t, test_predictions_Sub[[2]]),
                    use.names = FALSE
                )
                fwrite(perf_table, paste0(outputs.path, model.type, "_Append_", "performance_table_Ad_", selection_metric, ".csv"), append = TRUE)
                performance_table <- rbind(performance_table, perf_table, use.names = FALSE)
            }
            if (iter == 1) {
                # NOTE(review): keeps only the first 4 columns (size + 3 test-set
                # columns) as identifying columns - confirm this is intended for
                # models with more than two features
                predictedLabels_table <- copy(predictedLabels_Comb[, 1:4])
            }
            
            predictedLabels_table <- cbind(
                predictedLabels_table,
                predictedLabels_Comb[, .SD, .SDcols = c(paste0("pred_output_", iter), paste0(error_type, "_", iter))]
            )
            
            if (iter != iteration_budget) {
                # below efforts are unnecessary when the budget is reached.
                
                # select candidates from a copy of the pool (selection adds `idx` by reference)
                unlabeled_set <- copy(unlabeled_pool)
                train_candidates <- sample_selection(selected_ins, unlabeled_set, model_Sub)
                
                # eliminate candidates from the unlabeled pool
                unlabeled_pool <- unlabeled_pool[-train_candidates$idx]
                rm(unlabeled_set)
                
                # run ABM to find outputs of train candidates
                print(paste0("ABM train_candidate run start time : ", Sys.time()))
                train_candidates <- run_ABM(nofrep, selected_ins, train_candidates)
                print(paste0("ABM train_candidate run end time : ", Sys.time()))
                # NOTE(review): `t` here is the value left over from the test-set
                # loop above (the last entry of size.test); kept so the file names
                # stay unchanged - confirm whether "_size_" belongs in this name
                fwrite(
                    train_candidates,
                    paste0(outputs.path, model.type, "_", "train_candidates.Ad_", selection_metric, "_seed_", i, "_iter_", iter, "_rep_", r, "_size_", t, ".csv")
                )
                train_candidates_table <- rbind(train_candidates_table, data.table(train_candidates, "iter" = iter))
                
                # add labeled candidates to the train data
                training_set_Ad <- rbind(training_set_Ad, train_candidates[, -c("idx")])
            }
            
            # variable importances of the model trained in this iteration
            importance_table_Ad <- rbind(
                importance_table_Ad,
                data.table("seed" = i, "rep" = r, "iter_no" = iter, t(importance(model_Sub, type = 1, scale = FALSE)))
            )
            
            iter <- iter + 1
        }
        
        # final training data of this replication: append to file and to the record table
        fwrite(
            data.table(training_set_Ad, "seed" = i, "rep" = r),
            paste0(outputs.path, model.type, "_Append_", "FinalTrainData_Ad_", selection_metric, ".csv"), append = TRUE
        )
        training_set_Ad_final <- rbind(training_set_Ad_final, data.table(training_set_Ad, "seed" = i, "rep" = r))
        
        # predictions of all iterations of this replication
        fwrite(
            data.table(predictedLabels_table, "seed" = i, "rep" = r),
            paste0(outputs.path, model.type, "_Append_", "predictedLabels_table_Ad_", selection_metric, ".csv"), append = TRUE
        )
        predictedLabels_all <- rbind(predictedLabels_all, data.table(predictedLabels_table, "seed" = i, "rep" = r))
        
        # instances selected in this replication
        fwrite(
            data.table(train_candidates_table, seed = i, rep = r),
            paste0(outputs.path, model.type, "_Append_", "train_candidates_table_Ad_", selection_metric, ".csv"), append = TRUE
        )
        train_candidates_all <- rbind(train_candidates_all, data.table(train_candidates_table, "seed" = i, "rep" = r))
        
        print(paste0("seed : ", i, "   rep : ", r, "  Adaptive Sampling section end time : ", Sys.time()))
    }
    
    print(paste0("seed : ", i, "  Adaptive Sampling section end time : ", Sys.time()))
    # rm(training_set_Ad,predictedLabels_table,train_candidates_table)
}

[1] "seed : 0  Adaptive Sampling section start time : 2020-03-02 23:09:11"
[1] "seed : 0   rep : 1  Adaptive Sampling section start time : 2020-03-02 23:09:11"
[1] 1
[1] "ABM train_candidate run start time : 2020-03-02 23:09:11"
[1] "ABM train_candidate run end time : 2020-03-02 23:09:13"
[1] 2
[1] "ABM train_candidate run start time : 2020-03-02 23:09:14"
[1] "ABM train_candidate run end time : 2020-03-02 23:09:15"
[1] 3
[1] "ABM train_candidate run start time : 2020-03-02 23:09:15"
[1] "ABM train_candidate run end time : 2020-03-02 23:09:15"
[1] 4
[1] "ABM train_candidate run start time : 2020-03-02 23:09:15"
[1] "ABM train_candidate run end time : 2020-03-02 23:09:17"
[1] 5
[1] "ABM train_candidate run start time : 2020-03-02 23:09:18"
[1] "ABM train_candidate run end time : 2020-03-02 23:09:19"
[1] 6
[1] "seed : 0   rep : 1  Adaptive Sampling section end time : 2020-03-02 23:09:19"
[1] "seed : 0   rep : 2  Adaptive Sampling section start time : 2020-03-02 23:09:19"
[1] 1
[1] "ABM t

In [36]:
# Final records of the adaptive run, kept under "_Ad" names
FinalTrainData_Ad <- copy(training_set_Ad_final)
obb_error_Ad <- copy(obb_error)
performance_table_Ad <- copy(performance_table)
predictedLabels_table_Ad <- copy(predictedLabels_all)
train_candidates_table_Ad <- copy(train_candidates_all)

In [37]:
# drop the working tables; their contents live on in the "_Ad" copies
rm(list = c("training_set_Ad_final", "obb_error", "performance_table", "predictedLabels_all", "train_candidates_all"))

In [38]:
# Write every adaptive record table to
# "<outputs.path><model.type>_<table name>_<selection_metric>.csv"
invisible(lapply(
    c(
        "importance_table_Ad", "FinalTrainData_Ad", "performance_table_Ad",
        "train_candidates_table_Ad", "predictedLabels_table_Ad", "obb_error_Ad"
    ),
    function(table_name) {
        fwrite(get(table_name), paste0(outputs.path, model.type, "_", table_name, "_", selection_metric, ".csv"))
    }
))

In [39]:
# Remove the *_Ad result tables from the workspace.
rm(list = c(
    "importance_table_Ad",
    "FinalTrainData_Ad",
    "performance_table_Ad",
    "train_candidates_table_Ad",
    "predictedLabels_table_Ad",
    "obb_error_Ad"
))

## Adaptive & Feature Elimination Train & Test Metamodel

In [71]:
# Decide on strategy: which iterations label new samples and which ones
# eliminate features.
#
# New samples are selected on every iteration except the last one, for both
# model types. seq_len() yields an empty schedule when iteration_budget is 1
# (1:0 would yield c(1, 0)); max(., 0) keeps a budget of 0 from raising an error.
sample_selection_iteration_order <- seq_len(max(iteration_budget - 1, 0))

if (model.type == "basic") {
    # a single elimination step, right before the final iteration
    feature_elimination_iteration_order <- c(iteration_budget - 1)
} else if (model.type == "dummy") {
    # elimination steps on the four iterations preceding the final one
    feature_elimination_iteration_order <- (iteration_budget - 4):(iteration_budget - 1)
    # keep only valid iteration numbers (drops zero and negative values)
    feature_elimination_iteration_order <- feature_elimination_iteration_order[feature_elimination_iteration_order > 0]
}

In [72]:
#### Adaptive Feature Selection Training ####

# Empty, typed accumulator for the test-set performance of every fitted model
performance_table <- data.table(
    iter = numeric(), seed = numeric(), rep = numeric(), size = numeric(),
    mae = numeric(), rmse = numeric(), mape = numeric()
)
# Empty, typed accumulator for the out-of-bag error of every fitted model
obb_error <- data.table(
    iter = numeric(), obb_error = numeric(), seed = numeric(), rep = numeric()
)

## Accumulators filled once per (seed, replication)
predictedLabels_all    <- data.table()
train_candidates_all   <- data.table()
training_set_Ad_final  <- data.table()
importance_table_AdFe  <- data.table()
iteration_history_AdFe <- data.table()

# Start with every feature in play and nothing eliminated
columns_left <- feature_names
total_numof_eliminated_vars <- 0

In [73]:
# Adaptive sampling with feature elimination ("AdFe"): main training loop.
#
# For every seed in `seed.Ad` and every replication in `metarep`:
#   1. start from that seed's initial training data and the full candidate pool;
#   2. for `iteration_budget` iterations, fit a random forest on the columns
#      still in play, record its out-of-bag and test-set errors and, except on
#      the last iteration, label new candidates through the ABM and/or eliminate
#      features as scheduled by `sample_selection_iteration_order` and
#      `feature_elimination_iteration_order`;
#   3. append the replication's results to the "_Append_" CSV files and to the
#      in-memory accumulator tables.
for (i in seed.Ad) {

    print(paste0("seed : ",i,"  Adaptive Sampling with Feature Selection section start time : ",Sys.time()))

    for (r in metarep) { # replications
        # NOTE(review): (seed, rep) pairs with equal sums share the same RNG seed.
        set.seed(i + r)
        print(paste0("seed : ", i,"   rep : ", r, "  Adaptive Sampling with Feature Selection section start time : ", Sys.time()))

        unlabeled_pool <- copy(data_candidates)
        training_set_Ad <- copy(adaptive_initial_data[seed == i, .SD, .SDcols = -c("seed")])
        train_candidates_table <- data.table()

        # reset at the beginning of each replication
        columns_left <- feature_names
        total_numof_eliminated_vars <- 0

        # one row per iteration (except the last); row k describes iteration k
        iteration_history <- data.table("seed" = integer(),"rep" = integer(),"iter_no" = integer(), "IsFeatureEliminated" = logical(), "IsDataSelected" = logical())

        iter <- 1
        while (iter <= iteration_budget) {
            print(iter)

            trainx <- training_set_Ad[, .SD, .SDcols = columns_left]
            trainy <- training_set_Ad$output

            # Train the model, keep it in the workspace under a unique name and save it to disk
            model_Sub <- randomForest(x = trainx, y = trainy, importance = TRUE, ntree = ntree, mtry = mtry)
            model_Sub.name <- paste0("model_AdFe_",selection_metric,"_", iter, "_seed_", i, "_rep_",r)
            assign(model_Sub.name, model_Sub)
            model_Sub.path <- paste0(outputs.path, paste0(model_Sub.name,"_size_",train_ins_Ad,".rds"))
            saveRDS(model_Sub, model_Sub.path)

            # Features are ranked only while no feature has been eliminated yet
            if (length(columns_left) == length(feature_names)) {
                ranked_features <- get_variable_importance(model_Sub)
            }

            # OOB error of this model: computed once, then written to the append
            # file (the empty typed table supplies the column names) and to the
            # in-memory table.
            obb_row <- data.table(iter, obb_error_func(model_Sub), i, r)
            obb_err <- rbind(data.table(iter = numeric(), obb_error = numeric(), seed = numeric(), rep = numeric()),
                             obb_row, use.names = FALSE)
            fwrite(obb_err, paste0(outputs.path,model.type,"_Append_","obb_error_AdFe_",selection_metric,".csv"), append = TRUE)
            obb_error <- rbind(obb_error, obb_row, use.names = FALSE)

            # test the model on every test set
            predictedLabels_Comb <- data.table()
            for (t in size.test) {
                test_set <- get(paste0("test_set_", t))

                test_predictions_Sub <- get_test_predictions(model_Sub, test_set, error_type)
                predictedLabels_Sub <- test_predictions_Sub[[1]]
                setnames(predictedLabels_Sub, c("pred_output", error_type), c(paste0("pred_output_", iter), paste0(error_type, "_", iter)))

                # Tagged "AdFe" like every other artefact of this loop; the former
                # "predictedLabels.Ad_" prefix looked like a copy-paste slip and
                # risked colliding with plain adaptive-sampling files of that name.
                fwrite(predictedLabels_Sub,
                       paste0(outputs.path,model.type,"_","predictedLabels.AdFe_",selection_metric,"_seed_",i,"_iter_",iter,"_rep_",r,"_size_",t,".csv"))

                predictedLabels_Comb <- rbind(predictedLabels_Comb, data.table(size = t, predictedLabels_Sub))

                # Keep test set error records (append file and in-memory table)
                # NOTE(review): unlike the other append files, this file name does
                # not include selection_metric - confirm whether that is intended.
                perf_row <- data.table(iter, i, r, t, test_predictions_Sub[[2]])
                perf_table <- rbind(data.table(iter = numeric(), seed = numeric(), rep = numeric(), size = numeric(), mae = numeric(), rmse = numeric(), mape = numeric()),
                                    perf_row, use.names = FALSE)
                fwrite(perf_table, paste0(outputs.path,model.type,"_Append_","performance_table_AdFe",".csv"), append = TRUE)
                performance_table <- rbind(performance_table, perf_row, use.names = FALSE)
            }

            # The first iteration seeds the wide table with the first four columns;
            # every iteration then adds its own prediction and error columns.
            if (iter == 1) {
                predictedLabels_table <- copy(predictedLabels_Comb[, 1:4])
            }
            predictedLabels_table <- cbind(predictedLabels_table, predictedLabels_Comb[, .SD, .SDcols = c(paste0("pred_output_", iter), paste0(error_type, "_", iter))])

            if (iter != iteration_budget) { # below efforts are unnecessary when the budget is reached.
                iteration_history <- rbind(iteration_history, data.table(i, r, iter, 0, 0), use.names = FALSE)

                if (iter %in% sample_selection_iteration_order) { # select new data candidates before elimination
                    ## sample selection from unlabeled data select candidates
                    unlabeled_set <- copy(unlabeled_pool)
                    train_candidates <- sample_selection(selected_ins, unlabeled_set, model_Sub)

                    # eliminate candidates from the unlabeled pool
                    unlabeled_pool <- unlabeled_pool[-train_candidates$idx]
                    rm(unlabeled_set)

                    # run ABM to find outputs of train candidates
                    print(paste0("ABM train_candidate run start time : ",Sys.time()))
                    train_candidates <- run_ABM(nofrep, selected_ins, train_candidates)
                    print(paste0("ABM train_candidate run end time : ",Sys.time()))

                    # NOTE(review): `t` here is the value left over from the test-set
                    # loop above (the last element of size.test), not a training size.
                    fwrite(train_candidates,
                           paste0(outputs.path,model.type,"_","train_candidates.AdFe_",selection_metric,"_seed_",i,"_iter_",iter,"_rep_",r,"_size_",t,".csv"))
                    train_candidates_table <- rbind(train_candidates_table, data.table(train_candidates, "iter" = iter))

                    # add labeled candidates to the train data
                    training_set_Ad <- rbind(training_set_Ad, train_candidates[, -c("idx")])

                    # update iteration_history (row `iter` is the row appended above)
                    iteration_history[iter, IsDataSelected := 1]
                }

                if (iter %in% feature_elimination_iteration_order) { # feature elimination module

                    feature_elimination_result <- feature_elimination(h, total_numof_eliminated_vars, ranked_features)

                    columns_left <- feature_elimination_result[[1]]
                    eliminated_columns <- feature_elimination_result[[4]]  # not necessary
                    total_numof_eliminated_vars <- as.numeric(feature_elimination_result[[2]])
                    numof_eliminated_vars <- as.numeric(feature_elimination_result[[3]])  # not necessary

                    # update iteration_history
                    iteration_history[iter, IsFeatureEliminated := 1]
                }
            }

            # fill = TRUE: columns of features missing from this model become NA
            importance_table_AdFe <- rbind(importance_table_AdFe,
                                           data.table("seed" = i, "rep" = r, "iter_no" = iter,
                                                      t(importance(model_Sub, type = 1, scale = FALSE))),
                                           use.names = TRUE, fill = TRUE)

            iter <- iter + 1
        }

        # Final training set of this replication (append file and in-memory table)
        fwrite(data.table(training_set_Ad, "seed" = i,"rep" = r), paste0(outputs.path,model.type,"_Append_","FinalTrainData_AdFe_",selection_metric,".csv"), append = TRUE)
        training_set_Ad_final <- rbind(training_set_Ad_final, data.table(training_set_Ad, "seed" = i,"rep" = r))
        #assign(paste0("train_candidates_table_",i,"_rep_", r),train_candidates_table)

        # Test-set predictions of every iteration
        fwrite(data.table(predictedLabels_table, "seed" = i, "rep" = r), paste0(outputs.path,model.type,"_Append_","predictedLabels_table_AdFe_",selection_metric,".csv"), append = TRUE)
        predictedLabels_all <- rbind(predictedLabels_all, data.table(predictedLabels_table,"seed" = i,"rep" = r))

        # Candidates labeled during this replication
        fwrite(data.table(train_candidates_table, "seed" = i, "rep" = r), paste0(outputs.path,model.type,"_Append_","train_candidates_table_AdFe_",selection_metric,".csv"), append = TRUE)
        train_candidates_all <- rbind(train_candidates_all, data.table(train_candidates_table,"seed" = i,"rep" = r))

        # Which iterations selected data / eliminated features
        fwrite(iteration_history, paste0(outputs.path,model.type,"_Append_","iteration_history_AdFe_",selection_metric,".csv"), append = TRUE)
        iteration_history_AdFe <- rbind(iteration_history_AdFe, iteration_history)

        print(paste0("seed : ",i,"   rep : ", r,"  Adaptive Sampling with Feature Elimination section end time : ",Sys.time()))
    }
    print(paste0("seed : ",i,"  Adaptive Sampling with Feature Elimination section end time : ",Sys.time()))
    #rm(training_set_Ad,predictedLabels_table,train_candidates_table)
}

[1] "seed : 0  Adaptive Sampling with Feature Selection section start time : 2020-03-02 23:27:03"
[1] "seed : 0   rep : 1  Adaptive Sampling with Feature Selection section start time : 2020-03-02 23:27:03"
[1] 1
[1] "ABM train_candidate run start time : 2020-03-02 23:27:03"
[1] "ABM train_candidate run end time : 2020-03-02 23:27:05"
[1] 2
[1] "ABM train_candidate run start time : 2020-03-02 23:27:05"
[1] "ABM train_candidate run end time : 2020-03-02 23:27:06"
[1] 3
[1] "ABM train_candidate run start time : 2020-03-02 23:27:06"
[1] "ABM train_candidate run end time : 2020-03-02 23:27:10"
[1] 4
[1] "ABM train_candidate run start time : 2020-03-02 23:27:10"
[1] "ABM train_candidate run end time : 2020-03-02 23:27:12"
[1] 5
[1] "ABM train_candidate run start time : 2020-03-02 23:27:12"
[1] "ABM train_candidate run end time : 2020-03-02 23:27:17"
[1] 6
[1] "seed : 0   rep : 1  Adaptive Sampling with Feature Elimination section end time : 2020-03-02 23:27:17"
[1] "seed : 0   rep : 2  Adapt

In [74]:
# Final records of the adaptive-sampling-with-feature-elimination ("AdFe") run.
# Each generic accumulator is deep-copied under a method-specific name so the
# generic names can be cleared afterwards.
FinalTrainData_AdFe         <- copy(training_set_Ad_final)
obb_error_AdFe              <- copy(obb_error)
performance_table_AdFe      <- copy(performance_table)
predictedLabels_table_AdFe  <- copy(predictedLabels_all)
train_candidates_table_AdFe <- copy(train_candidates_all)

In [75]:
# Drop the generic accumulators; their contents were copied to the *_AdFe tables.
rm(list = c(
    "training_set_Ad_final",
    "obb_error",
    "performance_table",
    "predictedLabels_all",
    "train_candidates_all"
))

In [76]:
# Write every *_AdFe result table to
# <outputs.path><model.type>_<table name>_<selection_metric>.csv
invisible(lapply(
    c("importance_table_AdFe",
      "FinalTrainData_AdFe",
      "performance_table_AdFe",
      "train_candidates_table_AdFe",
      "predictedLabels_table_AdFe",
      "obb_error_AdFe",
      "iteration_history_AdFe"),
    function(table_name) {
        fwrite(get(table_name),
               paste0(outputs.path, model.type, "_", table_name, "_", selection_metric, ".csv"))
    }
))

In [79]:
# Plot the test-set error metrics of the AdFe run per iteration, one panel per
# replication, with a dashed vertical line after each iteration in which
# features were eliminated.
# This cell needs performance_table_AdFe and iteration_history_AdFe, so it must
# run before the clean-up cell that removes them.
performance_molten_AdFe <- melt(data = performance_table_AdFe,
                                id.vars = c("iter", "seed", "rep", "size"))
setnames(performance_molten_AdFe, c("variable", "value"), c("errortype", "errorvalue"))

p_AdFe <- ggplot(performance_molten_AdFe, aes(x = iter, y = errorvalue, group = errortype, col = errortype)) +
            geom_line(lwd = 1) +
            #geom_hline(data = performance_molten_oneshot, aes(yintercept = errorvalue, group=errortype, col=errortype),stat = "hline", linetype = "dashed") +
            facet_wrap(~ rep) +
            # Elimination marks come from the accumulated history of all seeds and
            # replications (not from `iteration_history`, which only holds the last
            # seed/rep processed by the loop) and are drawn in their own `rep` panel.
            geom_vline(data = unique(iteration_history_AdFe[IsFeatureEliminated == 1, .(rep, iter_no)]),
                       aes(xintercept = iter_no + 1), linetype = "dashed") +
            #geom_vline(xintercept = iteration_history[IsDataSelected==1]$iter_no + 1, linetype = "dotdash",color = "yellow") +
            ggtitle(paste0("Performances with AdFe for model_ ", model.type))
p_AdFe
#ggsave(paste0(outputs.path,"performance_table_AdFe_", model.type,".png"))

ERROR: Error in is.data.table(data): object 'performance_table_AdFe' not found


In [78]:
# Remove the *_AdFe result tables from the workspace.
rm(list = c(
    "importance_table_AdFe",
    "FinalTrainData_AdFe",
    "iteration_history_AdFe",
    "performance_table_AdFe",
    "train_candidates_table_AdFe",
    "predictedLabels_table_AdFe",
    "obb_error_AdFe"
))

# Quit NL

In [None]:
# Close the NetLogo instance started at the top of the notebook.
# The headless instance was registered under the name held in `nl.model`; the
# GUI instance was started without `nl.obj`, so it must be quit without one.
if (Is_Headless == 0) {
    NLQuit()
} else {
    NLQuit(nl.obj = nl.model)
}
#NLQuit(all=TRUE)