In [None]:
library(ggplot2)
library(reshape2)
library(dplyr)

# Make theme_minimal() the active ggplot2 theme for plots created after this point.
theme_set(theme_minimal())

In [None]:
# Read the paired CSV files `<prefix>.csv` and `<prefix>.pvae.csv` and combine
# them into one data frame.
#
# prefix: path prefix shared by the two files (everything before '.csv').
#
# Returns the contents of `<prefix>.csv` with three columns added:
#   log_Pvae      - the `log_p_x` column of `<prefix>.pvae.csv`
#   log_frequency - natural log of the `frequency` column
#   prefix        - basename(prefix), identifying which file a row came from
#
# Stops with an error if the pvae file has no `log_p_x` column or if the two
# files do not have the same number of rows.
read_freqs_probs <- function(prefix) {
    input_path <- paste0(prefix, '.csv')
    pvae_path <- paste0(prefix, '.pvae.csv')
    df <- read.csv(input_path)
    pvae <- read.csv(pvae_path)

    # Assigning NULL to a data-frame column is a silent no-op, so a missing
    # `log_p_x` would otherwise only surface later as a confusing plot error.
    if (!('log_p_x' %in% names(pvae))) {
        stop("'", pvae_path, "' has no 'log_p_x' column", call. = FALSE)
    }
    # `$<-` on a data frame recycles a shorter vector whenever the lengths
    # divide evenly, which would misalign probabilities and sequences.
    if (nrow(pvae) != nrow(df)) {
        stop("'", pvae_path, "' has ", nrow(pvae), " rows but '", input_path,
             "' has ", nrow(df), call. = FALSE)
    }

    df$log_Pvae <- pvae[['log_p_x']]
    df$log_frequency <- log(df$frequency)
    df$prefix <- basename(prefix)
    df
}

# Load the test and train tables and stack them; the `prefix` column added by
# read_freqs_probs() records which file each row came from.
df <- do.call(rbind, lapply(
    c('output/all-subjects.test.10K', 'output/all-subjects.train.10K'),
    read_freqs_probs
))

# Scatter of log_Pvae against log_frequency, one panel per input file,
# zoomed to log_Pvae in [-30, -10].
ggplot(df, aes(x = log_frequency, y = log_Pvae)) +
    geom_point(alpha = 0.05) +
    facet_grid(cols = vars(prefix), scales = 'free') +
    theme(aspect.ratio = 1) +
    coord_cartesian(ylim = c(-30, -10))

In [None]:
# Same scatter with both files overlaid in a single panel, coloured by source
# file and zoomed to a fixed window on both axes.
ggplot(df, aes(x = log_frequency, y = log_Pvae, color = prefix)) +
    geom_point(alpha = 0.08) +
    theme(aspect.ratio = 1) +
    coord_cartesian(xlim = c(-16, -9), ylim = c(-25, -10))

In [None]:
# Load the bzip2-compressed merged table, keeping strings as character vectors.
df <- read.csv(bzfile('output/merged.agg.csv.bz2'), stringsAsFactors = FALSE)

# Natural-log versions of the frequency and Pgen columns.
df[['log_frequency']] <- log(df[['frequency']])
df[['log_Pgen']] <- log(df[['Pgen']])

# Turn `column` into a factor with an explicit level order; any value not
# listed here becomes NA.
df[['column']] <- factor(
    df[['column']],
    levels = c('count_in_5', 'count_in_10', 'count_in_100', 'count')
)

In [None]:
# Back-transform the log-scale Pvae column to a plain probability.
df[['Pvae']] <- exp(df[['log_Pvae']])

Here we add "pseudocounts": missing Pgen and Ppost values are replaced with half of the smallest observed value, so that R doesn't choke on the Pgen/Ppost calculations.

In [None]:
# Replace missing Pgen / Ppost entries with half of the smallest value that
# was observed in the respective column.
df$Pgen[is.na(df$Pgen)] <- min(df$Pgen, na.rm = TRUE) / 2
df$Ppost[is.na(df$Ppost)] <- min(df$Ppost, na.rm = TRUE) / 2

# Recompute the log columns now that the gaps are filled.
df$log_Pgen <- log(df$Pgen)
df$log_Ppost <- log(df$Ppost)

In [None]:
id_vars <- c('column', 'split', 'log_frequency')
# Alternative comparison, currently switched off:
#   measure_vars <- c('log_Pgen', 'log_Pvae')
measure_vars <- c('log_Ppost', 'log_Pvae')

# Long format: one row per (observation, probability estimate) pair.
molten <- melt(
    df,
    id.vars = id_vars,
    measure.vars = measure_vars,
    variable.name = 'probability_name',
    value.name = 'log_probability'
)
# Put the rows in random order (presumably so that neither series is
# systematically drawn on top of the other).
molten <- molten[sample(nrow(molten)), ]

# Grid of scatter plots: rows are levels of `column`, columns are `split`.
ggplot(molten, aes(x = log_frequency, y = log_probability, color = probability_name)) +
    geom_point(alpha = 0.025) +
    facet_grid(rows = vars(column), cols = vars(split), scales = 'free') +
    coord_cartesian(ylim = c(-25, 0)) +
    theme(aspect.ratio = 1)
# File name is built from the compared variables, e.g. 'a_vs_b.png'.
ggsave(paste0(paste(measure_vars, collapse = '_vs_'), '.png'), width = 8, height = 8)

In [None]:
# Normalise each probability column so that it sums to 1 within every
# (column, split) group, then add the natural log of each normalised column.
# The grouping is dropped again before the result is stored, so later
# operations on `df` are not silently evaluated per group.
df <- df %>%
    group_by(column, split) %>%
    mutate(
        normed_Pgen = Pgen / sum(Pgen, na.rm = TRUE),
        normed_Ppost = Ppost / sum(Ppost, na.rm = TRUE),
        normed_Pvae = Pvae / sum(Pvae, na.rm = TRUE)
    ) %>%
    ungroup() %>%
    mutate(
        log_normed_Pgen = log(normed_Pgen),
        log_normed_Ppost = log(normed_Ppost),
        log_normed_Pvae = log(normed_Pvae)
    )

In [None]:
id_vars <- c('column', 'split', 'log_frequency')
# Alternative comparison, currently switched off:
#   measure_vars <- c('log_normed_Pgen', 'log_normed_Pvae')
measure_vars <- c('log_normed_Ppost', 'log_normed_Pvae')

# Long format: one row per (observation, normalised probability estimate) pair.
molten <- melt(
    df,
    id.vars = id_vars,
    measure.vars = measure_vars,
    variable.name = 'probability_name',
    value.name = 'log_probability'
)
# Put the rows in random order (presumably so that neither series is
# systematically drawn on top of the other).
molten <- molten[sample(nrow(molten)), ]

# Grid of scatter plots: rows are levels of `column`, columns are `split`.
ggplot(molten, aes(x = log_frequency, y = log_probability, color = probability_name)) +
    geom_point(alpha = 0.025) +
    facet_grid(rows = vars(column), cols = vars(split), scales = 'free') +
    coord_cartesian(ylim = c(-25, 0)) +
    theme(aspect.ratio = 1)
# File name is built from the compared variables, e.g. 'a_vs_b.png'.
ggsave(paste0(paste(measure_vars, collapse = '_vs_'), '.png'), width = 8, height = 8)

We should really just add the actual count, but here we fake the counts via frequency.

In [None]:
# Stand-in counts: frequency relative to the smallest frequency in the whole
# table, scaled by 5 and rounded down.
df[['count']] <- floor(5 * df[['frequency']] / min(df[['frequency']]))

In [None]:
# For every (column, split) group, the multinomial log-likelihood of the
# stand-in counts under each of the three normalised probability estimates.
df %>%
    group_by(column, split) %>%
    summarise(
        like_Pgen = dmultinom(x = count, prob = normed_Pgen, log = TRUE),
        like_Ppost = dmultinom(x = count, prob = normed_Ppost, log = TRUE),
        like_Pvae = dmultinom(x = count, prob = normed_Pvae, log = TRUE)
    )