In [None]:
# ggplot2 is attached explicitly: the code below calls ggplot(), aes(),
# theme_set(), ... directly, and cowplot releases from 1.0.0 onwards import
# ggplot2 instead of attaching it, so library(cowplot) alone is not enough.
library(ggplot2)
suppressMessages(library(cowplot))
library(reshape2)
theme_set(theme_minimal())

# Colour per data source; the names are matched against the `model` values
# mapped to colour in compare_model_divergences().
source_colors <- c(
  basic = "#fc8d62",
  count_match = "#66c2a5",
  OLGA.Q = "#8da0cb",
  data = "#A3A3A3",
  train = "#444444"
)

In [None]:
# Dataset switch: each assignment overwrites the previous one, so only the
# last line decides which output directory is analysed.
dir <- "_output_deneuter-2019-02-07"
dir <- "_output_2019-03-07-robins-ratio-MS"
dir <- "_output_seshadri-2019-02-09"

# Label for titles and file names: the directory name minus "_output_".
data_name <- gsub("_output_", "", dir, fixed = TRUE)
# From here on `dir` is the input path, including the trailing slash.
dir <- paste0("input/", dir, "/")

## Plotting likelihoods

In [None]:
# Column names used to build the positional colClasses vector below
# (assumed to match the column order of merged.agg.csv.bz2 -- TODO confirm).
cols <- c(
  "train_data", "beta", "model", "test_set", "Pgen", "Ppost", "acidity",
  "aliphatic_index", "amino_acid", "aromaticity", "basicity", "bulkiness",
  "cdr3_length", "charge", "gravy", "j_gene", "log_Ppost", "log_Pvae",
  "polarity", "q", "v_gene"
)
accept <- cols %in% c("beta", "model", "test_set", "log_Pvae")

# read.csv convention: "NULL" skips a column, NA lets the type be inferred.
colclasses <- rep("NULL", length(cols))
colclasses[accept] <- NA

df <- read.csv(
  bzfile(paste0(dir, "merged.agg.csv.bz2")),
  colClasses = colclasses,
  stringsAsFactors = FALSE
)

# Flag the beta == 0.75 rows; used as the linetype aesthetic when plotting.
df[["is_0.75"]] <- df$beta == 0.75

In [None]:
# Density of log_Pvae as a frequency polygon: one line per beta, one facet
# row per model, with beta == 0.75 drawn in its own linetype.
p <- ggplot(
  data = df,
  mapping = aes(x = log_Pvae, colour = beta, group = beta, linetype = is_0.75)
) +
  geom_freqpoly(mapping = aes(y = ..density..), bins = 40) +
  facet_grid(rows = vars(model))

# Show the plot with the x axis limited to [-40, -10].
p + xlim(-40, -10)

In [None]:
# Same plot `p`, with the x axis (log_Pvae) limited to [-25, -15].
p + xlim(-25, -15)

## Plotting a variety of betas

In [None]:
# Path of the summary CSV; `dir` already ends with a slash.
summarized_path <- paste0(dir, "summarized.agg.csv")

# Turn a divergence column name into a short facet title: remove the first
# "sumdiv_", replace every underscore with a space, and shorten the first
# "distance" to "dist". Vectorised over `s`.
facet_labeller <- function(s) {
  without_prefix <- sub("sumdiv_", "", s)
  spaced <- gsub("_", " ", without_prefix)
  sub("distance", "dist", spaced)
}

# Box plots of every "sumdiv_*" divergence for each (beta, model) pair.
df <- read.csv(summarized_path, stringsAsFactors = FALSE)
# Rows for the training data are excluded from this comparison.
df <- df[df$model != "train", ]
# One x-axis category per beta/model combination.
df$model_beta <- as.factor(paste(df$beta, df$model))
id_vars <- c("test_set", "model", "model_beta")
measure_vars <- grep("sumdiv_", colnames(df), value = TRUE)
df <- df[c(id_vars, measure_vars)]
p <- ggplot(
  # Long format: one row per (id_vars, divergence_name) with its value.
  melt(
    df,
    id.vars = id_vars,
    measure.vars = measure_vars,
    variable.name = "divergence_name",
    value.name = "divergence"
  ),
  # aes() with bare column names replaces the deprecated aes_string().
  aes(model_beta, divergence, color = model)
) + geom_boxplot() +
  facet_wrap(vars(divergence_name), scales='free', labeller=as_labeller(facet_labeller)) +
  theme(axis.text.x=element_blank()) +
  scale_y_log10() +
  ggtitle(data_name)
# The plot is passed explicitly so the saved file does not depend on
# whatever last_plot() happens to be.
# NOTE(review): no '-' between data_name and 'sumdiv-beta.png', unlike the
# '-sumdiv.png' file written further down; name kept as-is in case other
# tooling refers to it -- confirm before renaming.
ggsave(paste0('output/',data_name,'sumdiv-beta.png'), plot = p, width = 8, height = 8)
p

## Plotting one beta in detail

In [None]:
# Re-read the full summary table (including the 'train' rows) for the
# single-beta comparison below.
summarized_path <- paste0(dir, "summarized.agg.csv")
df <- read.csv(file = summarized_path, stringsAsFactors = FALSE)

# Facet title from a divergence column name. Applied inside-out: drop the
# first "sumdiv_", turn all underscores into spaces, abbreviate the first
# "distance" to "dist".
facet_labeller <- function(s) {
    sub("distance", "dist", gsub("_", " ", sub("sumdiv_", "", s)))
}

# Plot every "sumdiv_*" divergence for a single beta, one jittered point per
# row, grouped and coloured by model.
#
# Args:
#   df:   data frame with columns `model`, `beta`, `test_set` and at least
#         one column whose name contains "sumdiv_".
#   beta: value of `df$beta` to keep (rows are selected with `==`).
#
# Returns:
#   A ggplot object. Reads the globals `source_colors` (colour per model),
#   `data_name` (plot title) and `facet_labeller` (facet titles).
compare_model_divergences <- function(df, beta) {
    # Every row except the 'train' ones counts as synthetic. Computed as a
    # whole column so that data without any 'train' row no longer errors
    # (assigning into a zero-row subset fails).
    df$synthetic <- df$model != "train"
    df <- df[df$beta == beta, ]
    # Rename so the label matches the "OLGA.Q" entry of source_colors.
    df[df$model == "olga", "model"] <- "OLGA.Q"
    id_vars <- c("test_set", "model", "synthetic")
    measure_vars <- grep("sumdiv_", colnames(df), value = TRUE)
    df <- df[c(id_vars, measure_vars)]
    ggplot(
        melt(
            df,
            id.vars = id_vars,
            measure.vars = measure_vars,
            variable.name = "divergence_name",
            value.name = "divergence"
        ),
        # aes() with bare column names replaces the deprecated aes_string().
        aes(model, divergence, color = model, shape = synthetic)
    ) + geom_point(position = position_jitterdodge(dodge.width = 0.5, jitter.width = 0.5)) +
        facet_wrap(vars(divergence_name), nrow = 3, scales = "free", labeller = as_labeller(facet_labeller)) +
        scale_y_log10() +
        # Named values pin the shapes (cross for non-synthetic, filled dot
        # for synthetic) even when only one of the two groups is present.
        scale_shape_manual(values = c("FALSE" = 3, "TRUE" = 16)) +
        # Theme applied to this plot only, instead of resetting the global
        # ggplot2 theme from inside the function.
        theme_minimal() +
        theme(axis.text.x = element_blank(), axis.title.x = element_blank()) +
        scale_color_manual(values = source_colors) +
        labs(color = "data source") +
        ggtitle(data_name)
}

# Build the beta = 0.75 comparison once, save exactly that plot object
# (rather than relying on last_plot()), then display it.
sumdiv_plot <- compare_model_divergences(df, 0.75)
ggsave(paste0('output/',data_name,'-sumdiv.png'), plot = sumdiv_plot, width = 8, height = 4.5)
sumdiv_plot