### Data munging to get response variables for lapse and withdrawal

In [1]:
# Load the serialized source data set into memory.
raw <- readRDS(
  file = "/data/capstone_data/ClusterData_allPHB_allvendorvars_editedcolumns.rds"
)

In [2]:
# Keep only the three response columns used for the lapse and withdrawal
# models; every row of `raw` is retained.
response_df <- raw[, c(
  "Surr",
  "Efficiency_Category_Sub",
  "WDResponse"
)]

In [14]:
# Frequency of each value of the Surr response (table() omits NA by default).
table(response_df[["Surr"]])


      0       1 
4685495   47203 

In [15]:
# Frequency of each efficiency sub-category (table() omits NA by default).
table(response_df[["Efficiency_Category_Sub"]])


     E      I     O1     O2     O3     U1     U2     U3 
519840 137400  71663  59541  84352  97670  82640  89552 

In [16]:
# Frequency of each value of the WDResponse response (table() omits NA by
# default).
table(response_df[["WDResponse"]])


      0       1 
3990289   50387 

In [17]:
# Persist the response columns as CSV; this file is later imported into H2O.
# write.csv() keeps its default row.names = TRUE here, so the file starts with
# an unnamed row-index column.
write.csv(
  x = response_df,
  file = "/mnt/UW/outputDataset/response_3col.csv"
)

### Initiate h2o cluster to run glm with LASSO regularization

In [18]:
# Attach the H2O client, then start (or connect to) a local H2O cluster.
library("h2o")
h2o.init(
  nthreads = -1,         # -1: use all available cores
  max_mem_size = "400G"  # memory requested when a new cluster is launched
)


----------------------------------------------------------------------

Your next step is to start H2O:
    > h2o.init()

For H2O package documentation, ask for help:
    > ??h2o

After starting H2O, you can use the Web UI at http://localhost:54321
For more information visit http://docs.h2o.ai

----------------------------------------------------------------------


Attaching package: ‘h2o’

The following object is masked from ‘package:RevoScaleR’:

    colnames

The following objects are masked from ‘package:stats’:

    cor, sd, var

The following objects are masked from ‘package:base’:

    &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
    colnames<-, ifelse, is.character, is.factor, is.numeric, log,
    log10, log1p, log2, round, signif, trunc



 Connection successful!

R is connected to the H2O cluster: 
    H2O cluster uptime:         1 days 9 hours 
    H2O cluster version:        3.16.0.2 
    H2O cluster version age:    2 months and 23 days  
    H2O cluster name:           H2O_started_from_R_capsops_osa286 
    H2O cluster total nodes:    1 
    H2O cluster total memory:   325.59 GB 
    H2O cluster total cores:    64 
    H2O cluster allowed cores:  64 
    H2O cluster healthy:        TRUE 
    H2O Connection ip:          localhost 
    H2O Connection port:        54321 
    H2O Connection proxy:       NA 
    H2O Internal Security:      FALSE 
    H2O API Extensions:         XGBoost, Algos, AutoML, Core V3, Core V4 
    R Version:                  R version 3.4.1 (2017-06-30) 



#### 1. Read in predictors from the cleaned data set

In [20]:
# Import the cleaned predictor table into the H2O cluster under the key
# "predictors".
predictors <- h2o.importFile(
  path = normalizePath("/mnt/UW/outputDataset/allDataCleaned.csv"),
  destination_frame = "predictors"
)

# Drop column C1 (presumably the row-index column written by write.csv --
# confirm against the CSV header).
predictors$C1 <- NULL



In [21]:
# Import the three-column response table into the H2O cluster under the key
# "response".
response <- h2o.importFile(
  path = normalizePath("/mnt/UW/outputDataset/response_3col.csv"),
  destination_frame = "response"
)

# Drop column C1 (presumably the row-index column written by write.csv --
# confirm against the CSV header).
response$C1 <- NULL



In [22]:
# Column-bind responses and predictors into one modelling frame.
# NOTE(review): this pairs rows purely by position, so it assumes both CSV
# files list the same policies in the same order -- confirm.
full <- h2o.cbind(response, predictors)

In [32]:
# Treat the Surr response as categorical so the GLM models it as a class label.
full$Surr <- h2o.asfactor(full$Surr)

In [33]:
# Convert the two withdrawal-related responses to categorical columns.
full$Efficiency_Category_Sub <- h2o.asfactor(full$Efficiency_Category_Sub)
full$WDResponse              <- h2o.asfactor(full$WDResponse)

In [34]:
# Randomly partition the combined frame into three pieces of roughly 50%, 20%
# and the remaining ~30% of the rows (h2o.splitFrame returns a list of frames,
# one more than the number of ratios given).
# A fixed seed replaces seed = -1, which lets H2O pick a random seed on every
# run; with it the split -- and every model fitted on it -- is reproducible.
full.split <- h2o.splitFrame(full, ratios = c(0.5, 0.2), seed = 1234)

In [99]:
# First split piece is used for fitting, the second for validation.
# The third piece of `full.split` is not assigned here.
training   <- full.split[[1L]]
validation <- full.split[[2L]]

In [36]:
# Peek at the first six rows of the validation frame.
head(validation, n = 6L)

Surr,Efficiency_Category_Sub,WDResponse,PolNum_UW,JointInd,GMDBInd,OriginalOwner_C1,Qual,EligibleInd,FirstEligQInd,⋯,i12ccsb1_C4,i12ccsu1_C4,i12ccsv1_C4,i12ccsc1_C4,i12ccsd1_C4,i12ccsc2_C4,i03cctl1_C4,i12cctl1_C4,i12mtsq1_C4,i06txsb1_C4
0,,,86766,1,-1,-1.0,-1,,,⋯,0.2,0.0,0.0,0.0,0.0,-0.5,13.1,100.3,0.0,0.0
0,,,476073,1,-1,-1.0,-1,,,⋯,19.6,1.2,-4.1,43.27141,0.0,0.1,-30.9,-22.2,0.0,0.2
0,,,179630,1,-1,,-1,,,⋯,,,,,,,,,,
0,,,500356,-1,-1,-1.0,-1,,,⋯,-0.7,-0.1,0.0,0.0,0.0,0.0,0.0,2.8,0.0,0.0
0,,,204012,-1,-1,-1.0,-1,,,⋯,-34.1,-0.7,-1.0,29.9,0.0,-1.303578,178.3204,318.139,0.0,0.0
0,,,481439,1,-1,-1.0,-1,,,⋯,0.0,0.0,-0.9,33.1,0.0,1.1,0.0,8.9,0.1,0.0


h2o.glm(x, y, training_frame = , model_id, validation_frame = NULL,
  ignore_const_cols = TRUE, max_iterations = 50, beta_epsilon = 0,
  solver = c("IRLSM", "L_BFGS"), standardize = TRUE,
  family = c("gaussian", "binomial", "poisson", "gamma", "tweedie",
  "multinomial"), link = c("family_default", "identity", "logit", "log",
  "inverse", "tweedie"), tweedie_variance_power = NaN,
  tweedie_link_power = NaN, alpha = 0.5, prior = NULL, lambda = 1e-05,
  lambda_search = FALSE, nlambdas = -1, lambda_min_ratio = -1,
  nfolds = 0, fold_column = NULL, fold_assignment = c("AUTO", "Random",
  "Modulo"), keep_cross_validation_predictions = FALSE,
  beta_constraints = NULL, offset_column = NULL, weights_column = NULL,
  intercept = TRUE, max_active_predictors = -1, objective_epsilon = -1,
  gradient_epsilon = -1, non_negative = FALSE, compute_p_values = FALSE,
  remove_collinear_columns = FALSE, max_runtime_secs = 0,
  missing_values_handling = c("MeanImputation", "Skip"))

In [90]:
# Predictor names: every column of `full` except the three responses, the
# policy number, and a hand-picked list of columns kept out of the model.
x <- setdiff(
  names(full),
  c(
    "Surr", "Efficiency_Category_Sub", "WDResponse", "PolNum_UW", "AVPctEq",
    "JointInd", "GMDBInd", "Qual", "EligibleInd", "FirstEligQInd",
    "UtilizationInd", "Dist", "Comm", "Dur", "SCPeriod", "WDCount", "AV",
    "WDtoDate"
  )
)

#### 2. Lapse glm model with LASSO regularization

In [91]:
# Lapse model: binomial GLM on `Surr` with a pure L1 (LASSO) penalty.
# lambda_search fits a path of up to 100 lambda values; lambda_min_ratio sets
# the smallest lambda relative to the largest. No cross-validation is used.
surr_fit <- h2o.glm(
  x = x,
  y = "Surr",
  training_frame = training,
  validation_frame = validation,
  family = "binomial",
  alpha = 1,  # LASSO
  standardize = TRUE,
  lambda_search = TRUE,
  nlambdas = 100,
  lambda_min_ratio = 1e-04,
  nfolds = 0
)



In [92]:
# Show the one-row summary of the fitted lapse model (family, chosen lambda,
# number of active predictors, iterations).
surr_fit@model[["model_summary"]]

family,link,regularization,lambda_search,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
binomial,logit,Lasso (lambda = 2.259E-5 ),"nlambda = 100, lambda.max = 0.00454, lambda.min = 2.259E-5, lambda.1se = -1.0",514,313,67,RTMP_sid_9344_14


In [94]:
# Names from the first 50 rows of the standardized-coefficient-magnitude table
# (presumably ordered by decreasing magnitude, i.e. the 50 strongest
# predictors -- confirm against the H2O documentation).
# head() is used instead of [1:50, ] so that a table with fewer than 50 rows
# yields a shorter vector rather than NA entries.
surr_var <- head(surr_fit@model$standardized_coefficient_magnitudes$names, 50)

In [95]:
# Display the variable names selected for the lapse model.
surr_var

#### 3. Withdrawal efficiency group glm model with LASSO regularization

In [103]:
# Remove rows that are NA for the 'Efficiency_Category_Sub' variable, because
# they are likely to be policies that haven't started utilization.
# Missingness is tested with is.na(): comparing the column against the string
# "NA" only checks for a level literally named "NA" and is not a reliable
# missing-value filter.
# NOTE: this overwrites `training` / `validation` in place, so any model fitted
# after this cell sees only the filtered rows.
training <- training[!is.na(training$Efficiency_Category_Sub), ]
validation <- validation[!is.na(validation$Efficiency_Category_Sub), ]
#h2o.table(validation$Efficiency_Category_Sub)

  Efficiency_Category_Sub  Count
1                       E 103636
2                       I  27432
3                      O1  14336
4                      O2  11908
5                      O3  16661
6                      U1  19662

[8 rows x 2 columns] 

In [104]:
# Withdrawal-efficiency model: multinomial GLM on `Efficiency_Category_Sub`
# with a pure L1 (LASSO) penalty, using the same lambda-search settings as the
# lapse model. No cross-validation is used.
eff_group_fit <- h2o.glm(
  x = x,
  y = "Efficiency_Category_Sub",
  training_frame = training,
  validation_frame = validation,
  family = "multinomial",
  alpha = 1,  # LASSO
  standardize = TRUE,
  lambda_search = TRUE,
  nlambdas = 100,
  lambda_min_ratio = 1e-04,
  nfolds = 0
)



In [105]:
# Show the one-row summary of the fitted efficiency-group model.
eff_group_fit@model[["model_summary"]]

family,link,regularization,lambda_search,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
multinomial,multinomial,Lasso (lambda = 6.531E-5 ),"nlambda = 100, lambda.max = 0.04827, lambda.min = 6.531E-5, lambda.1se = -1.0",4635,3022,78,RTMP_sid_9344_26


In [106]:
# Names from the first 50 rows of the standardized-coefficient-magnitude table
# (presumably ordered by decreasing magnitude -- confirm against the H2O
# documentation).
# head() is used instead of [1:50, ] so that a table with fewer than 50 rows
# yields a shorter vector rather than NA entries.
eff_group_var <- head(
  eff_group_fit@model$standardized_coefficient_magnitudes$names,
  50
)

In [107]:
# Display the variable names selected for the efficiency-group model.
eff_group_var

#### 4. What are the variables selected?

In [109]:
# Variable dictionary: a lookup table with (at least) a `Variable` column.
# `encoding` marks the strings read from the file as Latin-1.
df_vars <- read.csv(
  file = "/home/capsops/mandy/selectedVariables.csv",
  encoding = "latin1"
)

In [132]:
# Variables that appear in both models' selected lists.
intersect(eff_group_var, surr_var)

In [133]:
# Dictionary rows for variables selected by EITHER model.
# NOTE(review): matching is by exact name against df_vars$Variable; selected
# names with no dictionary entry are silently dropped -- confirm that is
# intended.
lasso_features1 <- df_vars[
  is.element(df_vars$Variable, c(surr_var, eff_group_var)),
]

In [134]:
# Dictionary rows for variables selected by BOTH models.
lasso_features2 <- df_vars[
  is.element(df_vars$Variable, intersect(surr_var, eff_group_var)),
]

In [138]:
# Save the union feature list twice: once to the shared output directory and
# once to the current working directory.
write.csv(
  x = lasso_features1,
  file = "/mnt/UW/outputDataset/lassoFeatures1.csv"
)
write.csv(x = lasso_features1, file = "lassoFeatures1.csv")

In [137]:
# Save the intersection feature list twice: once to the shared output
# directory and once to the current working directory.
write.csv(
  x = lasso_features2,
  file = "/mnt/UW/outputDataset/lassoFeatures2.csv"
)
write.csv(x = lasso_features2, file = "lassoFeatures2.csv")