In [None]:
# Attach the packages used below. The order is kept as-is because packages
# attached later can mask names exported by packages attached earlier.
library(tidyverse) 
# List the contents of the input directory the data files are read from.
list.files(path = "../input")
# brms supplies brm() and pp_check(), used for the model below.
library(brms)
# ROCR supplies prediction() and performance(), used for the AUC below.
library(ROCR)
# ggplot2 supplies ggplot() and geom_histogram(), used for the plots below.
library(ggplot2)

In [None]:
# Training data: rows used to fit the contraband model.
# read.csv() already returns a data.frame, so no extra wrapping is needed.
path <- "../input/impute_train.csv"
train <- read.csv(file = path, header = TRUE)

# Rows whose contraband value is missing and will be imputed.
path <- "../input/missing_contraband.csv"
missing_contraband <- read.csv(file = path, header = TRUE)

# Probabilities imputed by an earlier model. The file name has no extension;
# the comma separator is read.csv()'s default.
path <- "../input/imputed probabilities"
est_probs <- read.csv(file = path, header = TRUE)

In [None]:
# Preview the first six rows (head()'s default) of the training data.
head(train, n = 6L)

In [None]:
# Preview the first six rows (head()'s default) of the rows to be imputed.
head(missing_contraband, n = 6L)

In [None]:
# Set the repr plot-size options before drawing.
options(repr.plot.width = 5, repr.plot.height = 4)

# Histogram of the previously imputed probabilities (second column of
# est_probs). The mapping is declared once on the plot and inherited by the
# histogram layer.
ggplot(data = est_probs, mapping = aes(x = est_probs[, 2])) +
  geom_histogram(alpha = 0.65, fill = "blue") +
  labs(x = "Propensity To Contraband")

In [None]:
# Show the first six rows of the training data again before recoding.
head(train, n = 6L)

In [None]:
# Print the column types of the training data before recoding.
str(object = train)

In [None]:
# Training data: values equal to the string 'True' become 1 and every other
# non-missing value becomes 0; month is treated as categorical. The reference
# levels for race and violation are set for model interpretability.
train <- transform(
  train,
  contraband_found = (contraband_found == "True") * 1,
  driver_gender = (driver_gender == "True") * 1,
  is_arrested = (is_arrested == "True") * 1,
  month = factor(month),
  driver_race = relevel(factor(driver_race), ref = "White"),
  violation_raw = relevel(factor(violation_raw), ref = "Moving Violation (VC)")
)

# Rows to impute: same recoding of the predictors. There is no contraband
# column to recode here, and no reference levels are set on this data.
missing_contraband <- transform(
  missing_contraband,
  driver_gender = (driver_gender == "True") * 1,
  is_arrested = (is_arrested == "True") * 1,
  month = factor(month)
)

In [None]:
# Model specification, held as a string and passed to brm() as its formula:
# main effects, three interactions, and a per-county random intercept and
# random driver_race slope.
formula <- 'contraband_found ~ driver_gender + driver_age_raw + driver_race + 
           violation_raw + stop_outcome + is_arrested + year + month + 
           driver_gender:driver_race + driver_race:violation_raw +
           year:month + (1 + driver_race|county_name)'

# Fit a Bernoulli model with the 'meanfield' algorithm; the seed is fixed.
model <- brm(
  formula = formula,
  data = train,
  family = bernoulli(),
  algorithm = "meanfield",
  iter = 50000,
  chains = 4,
  cores = 4,
  seed = 1,
  silent = FALSE
)

# Display the fitted model's summary.
summary(model)

In [None]:
# In-sample probability of contraband for every row the model was fitted on.
# fitted() summarises the posterior expected value of the response, which for
# a Bernoulli model is the probability itself. predict() instead averages
# simulated 0/1 outcomes, which adds sampling noise to the same quantity.
contraband_probs <- fitted(model)[, "Estimate"]

# Take the labels from the data stored in the fitted model, not from `train`,
# so they stay aligned with the fitted values even if rows were dropped
# during fitting.
pred <- prediction(contraband_probs, model$data$contraband_found)

# Area under the ROC curve. performance() stores it as a one-element list in
# the y.values slot.
auc.tmp <- performance(pred, "auc")
auc <- as.numeric(auc.tmp@y.values[[1]])
auc

In [None]:
options(repr.plot.width=4, repr.plot.height=4)
# Posterior predictive check of the fitted model, with nsamples set to 50.
pp_check(object = model, nsamples = 50)

In [None]:
# Impute contraband probabilities for the rows with a missing outcome,
# processing the data in fixed-size chunks of rows.
nContraband <- nrow(missing_contraband)
chunkSize <- 20000
nChunks <- ceiling(nContraband / chunkSize)

# One slot per chunk, filled in place instead of growing a vector on each pass.
chunkProbs <- vector("list", nChunks)

# seq_len() yields an empty sequence when there are no rows; `1:0` would have
# run the loop for i = 1 and i = 0.
for (i in seq_len(nChunks)) {
  rows <- ((i - 1) * chunkSize + 1):min(nContraband, i * chunkSize)
  short_contraband <- missing_contraband[rows, ]

  # fitted() returns the expected probability for each row. predict() would
  # average simulated 0/1 outcomes and add sampling noise to the estimate.
  chunkProbs[[i]] <- fitted(model, newdata = short_contraband)[, "Estimate"]

  print(sprintf("Done with %s chunk", i))
  flush.console()
}

# Concatenate the per-chunk vectors in row order (NULL when there are no rows,
# matching the original empty `c()`).
probs <- unlist(chunkProbs)

In [None]:
# Sanity check: one imputed probability per row of missing_contraband.
nrow(missing_contraband) == length(probs)

How do the histograms of the imputed probabilities from our two models compare?

In [None]:
# Set the repr plot-size options before drawing.
options(repr.plot.width = 5, repr.plot.height = 4)

# Overlay the two sets of imputed probabilities: the earlier estimates
# (second column of est_probs) in blue and this notebook's `probs` in orange.
ggplot() +
  geom_histogram(
    mapping = aes(x = est_probs[, 2]),
    alpha = 0.25,
    fill = "blue"
  ) +
  geom_histogram(
    mapping = aes(x = probs),
    alpha = 0.25,
    fill = "orange"
  ) +
  labs(x = "Propensity To Contraband")

In [None]:
# Save the imputed probabilities to disk. The file name is kept exactly as
# written (including its spelling) because other code may read it by name.
write.csv(x = probs, file = "probilities.csv")