### Load package

In [1]:
library(dplyr)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



In [None]:
# Attach the H2O R client, then start (or connect to) an H2O instance.
library(h2o)
# nthreads = -1 requests all available cores (per the h2o.init documentation);
# max_mem_size asks for a 100G memory cap for the instance.
h2o.init(nthreads = -1, max_mem_size = '100G')

In [None]:
h2o.clusterInfo()

### Read in data

In [None]:
# Import the cleaned dataset from S3 into H2O and register it under the
# key "data.hex".
data.hex <- h2o.importFile(path = "https://s3-us-west-2.amazonaws.com/data516project/data/allDataCleaned.csv"
                           , destination_frame = "data.hex")

# Drop the column named C1.
# NOTE(review): presumably a row-index column written out with the CSV -- confirm.
data.hex$C1 <- NULL

In [None]:
orig_rframe = readRDS("/home/capsops/mandy/allDataCleaned.RDS")

In [None]:
# Import the previously exported reconstruction from S3 into H2O and register
# it under the key "data.hex.recon".
#
# SECURITY: a commented-out alternative `s3://` path that embedded an AWS
# access key id and secret directly in the URL has been removed from this
# cell. Credentials must never be stored in source or notebooks; the key pair
# that was committed here should be treated as compromised and rotated. If
# authenticated S3 access is needed, supply credentials through the
# environment / H2O's S3 credential configuration instead.
data.hex.recon <- h2o.importFile(
    path = "https://s3-us-west-2.amazonaws.com/data516project/data/reconstr_data.csv.gz"
    , destination_frame = "data.hex.recon")

data.hex.recon$C1 <- NULL

#### testing code

# Ad-hoc sanity checks on the two imported frames.

# Size, in Mb, of the R-side objects.
# NOTE(review): for an H2O frame this presumably measures only the R handle,
# not the data held by the H2O cluster -- confirm.
print(object.size(data.hex.recon), units="Mb")

print(object.size(data.hex), units="Mb")

# Row/column counts of the reconstructed and original frames.
dim(data.hex.recon)

dim(data.hex)

# Export the first 6000 rows of the reconstructed frame as a small test file.
h2o.exportFile(data.hex.recon[1:6000, ], path = "/mnt/UW/outputDataset/test.csv.gz")

### Change Boolean and Categorical variables into factor

In [None]:
# Re-encode columns 2 through 8 of data.hex as factors, going through a
# character conversion first.
for (col in seq(2, 8)) {
    data.hex[, col] <- as.factor(as.character(data.hex[, col]))
}

### Train GLRM Model 

In [None]:
# Fit a generalized low rank model (GLRM) on data.hex and record the wall-clock
# time just before (t0) and just after (t1) the fit.
t0 = Sys.time()
data.glrm <- h2o.glrm(training_frame = data.hex, 
#                       Use every column except the first
#                       (original note: the first column is the policy number).
                      cols = c(2:ncol(data.hex)), 
#                       Rank-50 model, fixed seed, SVD initialisation.
                      k = 50, seed = 1234, init = "SVD", svd_method = "GramSVD", 
#                       Quadratic loss by default, categorical loss for
#                       multi-level columns.
                      loss = "Quadratic", 
                      multi_loss = "Categorical",
                      transform = "NORMALIZE",
                      impute_original = TRUE,
#                       Quadratic regularisation on both the X and Y factors.
                      regularization_x = "Quadratic", regularization_y = "Quadratic", 
                      max_iterations = 200, min_step_size = 1e-6)
t1 = Sys.time()

### Output Data -- full set

In [None]:
class(data.glrm)

In [None]:
# Fetch the H2O frame whose key is stored in the fitted model's
# `representation_name` field (the low-rank representation) and export it.
# NOTE: the variable name `rep` is also the name of base::rep(). R still
# resolves calls such as rep(0, n) to the function, but the shadowing is easy
# to misread.
rep <- h2o.getFrame(data.glrm@model$representation_name)

h2o.exportFile(rep, path = "/mnt/UW/outputDataset/lowrank_rep.csv.gz")

In [None]:
# Project the GLRM archetypes back onto the columns of data.hex
# (reverse_transform = TRUE is passed to h2o.proj_archetypes) and export them.
archetypes <- h2o.proj_archetypes(data.glrm, data.hex, reverse_transform = TRUE) 

h2o.exportFile(archetypes, path = "/mnt/UW/outputDataset/lowrank_archetypes.csv.gz")

In [None]:
# Reconstruct the original matrix by running the GLRM model over data.hex.
data.pred <- predict(data.glrm, data.hex)

# Exporting the full reconstruction directly failed at 38% (file size ~50G),
# so the call is left disabled:
# h2o.exportFile(data.pred, path = "/mnt/UW/outputDataset/reconstr_data.csv.gz")

# Saving as an RDS file was tried instead. It is unclear whether this works
# once the H2O instance is shut down and restarted: the H2O data frame can be
# reloaded, but slicing and as.data.frame() on it do not work.
# saveRDS(data.pred, 'recontr_data.RDS')

### Modify the reconstructed dataframe and impute missing values in the original dataframe

### Approach 3 as.data.frame by blocks

In [None]:
rm(data.pred)

In [None]:
data.hex.recon = predict(data.glrm, data.hex) 

In [None]:
# Seed reconstr_rframe with a single placeholder column of zeros, one per row
# of data.hex.recon, so that columns can be cbind-ed onto it. The placeholder
# is the first column and is removed later with `reconstr_rframe[1] <- NULL`.
reconstr_rframe = as.data.frame(rep(0, dim(data.hex.recon)[1]))

In [None]:
# Pull the reconstructed H2O frame into R one column at a time and append each
# column to reconstr_rframe. Appending inside the loop keeps the columns
# fetched so far if the cell is interrupted.
# seq_len() is used instead of `1:n` so that a zero-column frame yields no
# iterations rather than iterating over c(1, 0).
for (i in seq_len(dim(data.hex.recon)[2])) {
    t0 <- Sys.time()
    reconstr_rframe <- cbind(reconstr_rframe, as.data.frame(data.hex.recon[i:i]))
    t1 <- Sys.time()
#     print(t1-t0)
    # Progress indicator: index of the column just fetched.
    print(i)
}

In [None]:
dim(reconstr_rframe)

In [None]:
# Same column-by-column pull as the full loop, but starting at column 502.
# NOTE(review): this looks like a manual resume after the full loop stopped
# early; running it after a complete pass would append columns 502 onwards a
# second time -- confirm before re-running.
for (i in 502:dim(data.hex.recon)[2]) {
    t0 = Sys.time()
    reconstr_rframe = cbind(reconstr_rframe, as.data.frame(data.hex.recon[i:i]))
    t1 = Sys.time()
#     print(t1-t0)
    print(i)
}

In [None]:
# Remove the first column, i.e. the zero-filled placeholder the frame was
# seeded with.
reconstr_rframe[1] <- NULL

In [None]:
# orig_rframe = readRDS("/home/capsops/mandy/allDataCleaned.RDS")

## Impute 17 key variables

In [2]:
# Names of the 17 columns imputed in this section. The order matters: later
# cells address the columns by position (1:11 are clipped to the observed
# range, 12:14 are rounded to non-negative whole numbers, 15:17 are snapped to
# the closest value observed in the original data). The trailing comment on
# each entry records the intended treatment.
interpretable_vars = c("score_C4", # Auto Score # flatten top and bottom
                     "EstMarketValue_C1", # flatten top and bottom
                     "finscr_C4", # Credit Score # flatten top and bottom
                     "CEN_tr_pctOwnOccSecondMort", # flatten top and bottom
                     "CEN_bg_pctSeasonalHousingUnits", # flatten top and bottom
                     "CEN_bg_pctHHincomeLT15K", # flatten top and bottom
                     "CEN_tr_pctHHInvestIncome", # flatten top and bottom
                     "CEN_tr_pctHHSocialSecurityIncome",  # flatten top and bottom
                     "CEN_bg_pctLiveAloneHH", # flatten top and bottom
                     "CEN_bg_pctConstructionIndustry", # flatten top and bottom
                     "CEN_tr_pctHSGrad", # flatten top and bottom
                     "iat89_C4", # Highest delinquency on a trade # int, round to nearest
                     "imt01_C4", # Number of mortgages # int, round to nearest
                     "IssAgeALB", # Issue Age # int, round to nearest
                     "HealthScore_C5", # Health Score # factor, round to nearest
                     "Length.of.Residence_num", # factor, round to nearest
                     "Target.Narrow.Band.Income_num" # factor, round to nearest
                    )

In [3]:
# Column names of the reconstructed counterparts: each interpretable variable
# name prefixed with "reconstr_".
vars = paste0('reconstr_', interpretable_vars)

In [4]:
# Original (un-imputed) data, read from a local CSV.
orig_rframe = read.csv('./LASSO_75_allData.csv')

In [5]:
# Reconstructed values for the 17 variables, read from a local CSV.
reconstr_rframe = read.csv('./reconstr_17_columns.csv')

In [6]:
# Drop the column named X.
# NOTE(review): presumably the row-name column that write.csv() emits by
# default -- confirm.
reconstr_rframe$X <- NULL

In [7]:
# Restrict both frames to the 17 variables, in the same order, so that column
# k of one frame corresponds to column k of the other.
orig_rframe_17 = orig_rframe[, interpretable_vars]
reconstr_rframe_17 = reconstr_rframe[, vars]

In [8]:
# Fill every missing cell of the original 17 columns with the value at the same
# position in the reconstructed frame; observed values are kept as they are.
#
# This is done column by column in base R rather than by calling
# dplyr::coalesce() on two whole data frames. coalesce() is documented for
# vectors; given data frames its result depends on the installed dplyr version
# (newer releases are vctrs-based, and the two frames here also have different
# column names), so a cell-wise fill is not guaranteed.
imputed_rframe <- orig_rframe_17
imputed_rframe[] <- Map(
    function(observed, reconstructed) {
        is_missing <- is.na(observed)
        observed[is_missing] <- reconstructed[is_missing]
        observed
    },
    orig_rframe_17,
    reconstr_rframe_17
)

#### Flatten top and bottom

In [9]:
# Clamp the values of `x` to the range [min, max].
#
# x:   numeric vector to clamp.
# max: upper bound; values above it are replaced by `max`.
# min: lower bound; values below it are replaced by `min`.
# Returns a vector the same length as `x`; NA entries stay NA.
flattern_top_and_bottom <- function(x, max, min) {
    # Raise values below the lower bound first ...
    raised <- ifelse(x < min, min, x)
    # ... then cap anything above the upper bound (the cap takes precedence).
    ifelse(x > max, max, raised)
}

In [10]:
# Clamp each of the first 11 imputed columns to the minimum/maximum observed
# (ignoring NAs) in the matching column of the original data.
for (i in 1:11) {
    observed_column <- orig_rframe_17[, i]
    minimum <- min(observed_column, na.rm = TRUE)
    maximum <- max(observed_column, na.rm = TRUE)
    imputed_rframe[[i]] <- flattern_top_and_bottom(
        imputed_rframe[[i]],
        max = maximum,
        min = minimum
    )
}

#### round to the nearest integer

In [11]:
# Round `x` to the nearest whole number and replace negative results with 0.
#
# x: numeric scalar or vector.
# Returns a numeric vector the same length as `x`; NA stays NA.
#
# pmax() is used instead of max() so the function is element-wise: with max(),
# a vector input collapsed to a single number. For scalar inputs (which is how
# apply(..., c(1, 2), ...) calls it) the result is unchanged.
change_to_positive_integer <- function(x) {
    pmax(round(x), 0)
}

# Apply change_to_positive_integer to every cell of columns 12:14.
# MARGIN = c(1, 2) makes apply() call the helper once per cell.
imputed_rframe[12:14] <- apply(imputed_rframe[12:14], c(1,2), change_to_positive_integer)

#### convert to the closest element

#### health_score

In [None]:
# imputed_rframe = coalesce(orig_rframe_17, reconstr_rframe_17)

In [None]:
# imputed_rframe[15] = coalesce(orig_rframe_17[15], reconstr_rframe_17[15])

In [22]:
# Distinct non-missing HealthScore_C5 values observed in the original data;
# these are the only values an imputed health score may take.
unique_health_score = unique(orig_rframe$HealthScore_C5)
unique_health_score = unique_health_score[!is.na(unique_health_score)]

# Snap a single health score to the nearest allowed value.
#
# x:          a single numeric value.
# candidates: allowed values; defaults to the global `unique_health_score`.
# Returns `x` unchanged when it is NA or already one of `candidates`;
# otherwise the candidate with the smallest absolute distance to `x` (the
# first such candidate on ties).
#
# The NA guard matters: without it an NA input fell through to which.min(),
# which returns integer(0) for all-NA distances, so the function returned a
# zero-length vector instead of a value.
closest_health_score <- function(x, candidates = unique_health_score) {
    if (is.na(x) || x %in% candidates) {
        return(x)
    }
    candidates[which.min(abs(x - candidates))]
}

In [28]:
# Snap column 15 to the allowed health scores; MARGIN = 1 makes apply() call
# closest_health_score once per row of the one-column frame.
imputed_rframe[15] = apply(imputed_rframe[15], 1, closest_health_score)

#### Length.of.Residence_num

In [32]:
# Distinct non-missing Length.of.Residence_num values observed in the original
# data; these are the only values an imputed residence length may take.
unique_residence_length = unique(orig_rframe$"Length.of.Residence_num")
unique_residence_length = unique_residence_length[!is.na(unique_residence_length)]

# Snap a single residence length to the nearest allowed value.
#
# x:          a single numeric value.
# candidates: allowed values; defaults to the global `unique_residence_length`.
# Returns `x` unchanged when it is NA or already one of `candidates`;
# otherwise the candidate with the smallest absolute distance to `x` (the
# first such candidate on ties).
#
# The NA guard matters: without it an NA input fell through to which.min(),
# which returns integer(0) for all-NA distances, so the function returned a
# zero-length vector instead of a value.
closest_residence_length <- function(x, candidates = unique_residence_length) {
    if (is.na(x) || x %in% candidates) {
        return(x)
    }
    candidates[which.min(abs(x - candidates))]
}

In [34]:
# Snap column 16 to the allowed residence lengths; MARGIN = 1 makes apply()
# call closest_residence_length once per row of the one-column frame.
imputed_rframe[16] = apply(imputed_rframe[16], 1, closest_residence_length)

#### Target.Narrow.Band.Income_num

In [35]:
# Distinct non-missing Target.Narrow.Band.Income_num values observed in the
# original data; these are the only values an imputed income band may take.
unique_income_band = unique(orig_rframe$"Target.Narrow.Band.Income_num")
unique_income_band = unique_income_band[!is.na(unique_income_band)]

# Snap a single income-band value to the nearest allowed value.
#
# x:          a single numeric value.
# candidates: allowed values; defaults to the global `unique_income_band`.
# Returns `x` unchanged when it is NA or already one of `candidates`;
# otherwise the candidate with the smallest absolute distance to `x` (the
# first such candidate on ties).
#
# The NA guard matters: without it an NA input fell through to which.min(),
# which returns integer(0) for all-NA distances, so the function returned a
# zero-length vector instead of a value.
closest_income_band <- function(x, candidates = unique_income_band) {
    if (is.na(x) || x %in% candidates) {
        return(x)
    }
    candidates[which.min(abs(x - candidates))]
}

In [36]:
# Snap column 17 to the allowed income bands; MARGIN = 1 makes apply() call
# closest_income_band once per row of the one-column frame.
imputed_rframe[17] = apply(imputed_rframe[17], 1, closest_income_band)

In [38]:
# Write the 17 imputed columns to a CSV in the working directory. row.names
# is left at its default, so the row names are written as an extra column.
write.csv(imputed_rframe, "imputed_17_columns.csv")

In [41]:
# Frequency table of column 15 (the health score after snapping).
table(imputed_rframe[15])


    0.5   1.125   1.375   1.625   1.875   2.125   2.375     3.5 
2710450 1355294  245738  136176   80363   49131   33022  122524 

## Impute all variables (full dataset)

In [None]:
# Make sure orig_rframe is a plain data.frame before positional indexing.
orig_rframe = as.data.frame(orig_rframe)

In [None]:
# In-memory size of the reconstructed frame, in Mb.
print(object.size(reconstr_rframe), units="Mb")

In [None]:
# In-memory size of the original frame, in Mb.
print(object.size(orig_rframe), units="Mb")

In [None]:
# Turn any factor column into a character column, leaving other columns
# untouched. Applied to columns 2:20 of the original frame and to the
# corresponding columns 1:19 of the reconstructed frame.
factor_to_character <- function(column) {
    if (is.factor(column)) as.character(column) else column
}
orig_rframe[2:20] <- lapply(orig_rframe[2:20], factor_to_character)
reconstr_rframe[1:19] <- lapply(reconstr_rframe[1:19], factor_to_character)

In [None]:
# change the "" elements from categorical columns into NA
# Replace empty-string entries of `x` with NA.
#
# x: a vector (character, factor, numeric, ...).
# Returns `x` with every element whose character form is "" set to NA; all
# other elements, and the type/class of `x`, are left unchanged.
#
# Direct NA assignment is used instead of ifelse(): ifelse() drops the factor
# class (returning the integer codes) and returns a logical vector when every
# element is empty.
empty_as_na <- function(x) {
    is_empty <- !is.na(x) & as.character(x) == ""
    x[is_empty] <- NA
    x
}

# Run empty_as_na over each of columns 9:20 of the original frame.
# lapply() replaces mutate_each(funs(...)), which dplyr has deprecated; the
# column names and order are preserved by the `[<-` assignment.
orig_rframe[9:20] <- lapply(orig_rframe[9:20], empty_as_na)

In [None]:
# Impute missing values.
# Column 1 of the original frame is kept as is. Columns 2:510 are filled, cell
# by cell, from columns 1:509 of the reconstructed frame wherever the original
# value is NA (original column k pairs with reconstructed column k - 1).
#
# The fill is done column by column in base R rather than by calling
# dplyr::coalesce() on whole data frames: coalesce() is documented for
# vectors, and its result for data-frame inputs depends on the installed
# dplyr version, so a cell-wise fill is not guaranteed.
imputed_rframe <- orig_rframe[1:510]
imputed_rframe[2:510] <- Map(
    function(observed, reconstructed) {
        is_missing <- is.na(observed)
        observed[is_missing] <- reconstructed[is_missing]
        observed
    },
    orig_rframe[2:510],
    reconstr_rframe[1:509]
)

In [None]:
## reconstructed dataframe

# Boolean: 1:7
# Categorical: 8:19
# Positive Integer: 20:134
# Positive Numeric: 135:186
# Percentage Numeric: 187:498
# Real Value Numeric: 499:509


## imputed dataframe
# 21:135 -- Convert to nearest positive integer
# 136:187 -- positive
# 188:499 -- positive percent

In [None]:
# Round `x` to the nearest whole number and replace negative results with 0.
#
# x: numeric scalar or vector.
# Returns a numeric vector the same length as `x`; NA stays NA.
#
# pmax() is used instead of max() so the function is element-wise: with max(),
# a vector input collapsed to a single number. For scalar inputs (which is how
# apply(..., c(1, 2), ...) calls it) the result is unchanged.
change_to_positive_integer <- function(x) {
    pmax(round(x), 0)
}

# Apply change_to_positive_integer to every cell of columns 21:135.
# MARGIN = c(1, 2) makes apply() call the helper once per cell.
imputed_rframe[21:135] <- apply(imputed_rframe[21:135], c(1,2), change_to_positive_integer)

In [None]:
# Replace negative values of `x` with 0, leaving other values unchanged.
#
# x: numeric scalar or vector.
# Returns a numeric vector the same length as `x`; NA stays NA.
#
# pmax() is used instead of max() so the function is element-wise: with max(),
# a vector input collapsed to a single number. For scalar inputs (which is how
# apply(..., c(1, 2), ...) calls it) the result is unchanged.
change_to_positive <- function(x) {
    pmax(x, 0)
}

# Apply change_to_positive to every cell of columns 136:499.
# MARGIN = c(1, 2) makes apply() call the helper once per cell.
imputed_rframe[136:499] <- apply(imputed_rframe[136:499], c(1,2), change_to_positive)

### Add an ID column to both datasets

In [None]:
imputed_rframe$ID <- seq.int(nrow(imputed_rframe))

In [None]:
# Move the ID column to the front, keeping every other column in its current
# order.
#
# The previous index expression `1:length(x)-1` parses as `(1:length(x)) - 1`,
# i.e. 0:(n - 1); it only produced "all but the last name" because R silently
# drops a 0 index. Selecting the non-ID names explicitly states the intent
# and stays correct if this cell is re-run after ID is already in front.
non_id_columns <- names(imputed_rframe)[names(imputed_rframe) != 'ID']
columnMask <- c('ID', non_id_columns)
imputed_rframe <- imputed_rframe[, columnMask]

### Output Data -- sampled set of 60000 rows

In [None]:
# Draw 60000 ID values at random without replacement. No seed is set in this
# cell, so the sample differs from run to run.
sampleRowsMask = sample(imputed_rframe$ID, 60000)

# The sampled IDs are used as row indices; this relies on ID having been
# assigned as seq.int(nrow(imputed_rframe)), i.e. ID equals the row position.
saveRDS(imputed_rframe[sampleRowsMask, ], '/mnt/UW/outputDataset/allDataImputed_sample.RDS')

In [None]:
# Save the full imputed frame as an RDS file.
saveRDS(imputed_rframe, '/mnt/UW/outputDataset/allDataImputed.RDS')

In [None]:
# Save the full imputed frame as CSV. row.names is left at its default, so
# the row names are written as an extra column.
write.csv(imputed_rframe, '/mnt/UW/outputDataset/allDataImputed.csv')

In [None]:
# Shut down the H2O instance without asking for confirmation.
h2o.shutdown(prompt = FALSE)

In [None]:
lassoVariables = read.csv('./lassoFeatures1.csv')

In [None]:
lassoVariables$Variable