In [None]:
library(ggplot2)
library(reshape2)
library(dplyr)
library(nlme)
library(tidyr)

# Make the minimal theme the default for every ggplot drawn below.
ggplot2::theme_set(ggplot2::theme_minimal())
# Reciprocal of the golden ratio (about 0.618); used below as a
# height/width aspect ratio for landscape figures.
phi <- 1 / ((1 + sqrt(5)) / 2)

In [None]:
# Load the merged, bzip2-compressed table and derive the log columns.
df <- read.csv(bzfile('output/merged.agg.csv.bz2'), stringsAsFactors = FALSE)
df$log_frequency <- log(df$frequency)
df$log_Pgen <- log(df$Pgen)

# Turn the `column` label into an integer: the 'count_in_' prefix is stripped,
# and any remaining 'count' text is replaced by '666'.
# NOTE(review): 666 looks like a sentinel for the bare 'count' label - confirm.
df$n_subjects <- as.integer(gsub('count', '666', gsub('count_in_', '', df$column)))

# The original label is no longer needed once n_subjects exists.
df$column <- NULL

_Make choices here: pick which OLGA probability to plot against the VAE, and which VAE model to keep._

In [None]:
# Which OLGA variant for scatterplot? (swap in the commented line for Ppost)
measure_vars <- c('log_normed_Pgen', 'log_normed_Pvae')
#measure_vars <- c('log_normed_Ppost', 'log_normed_Pvae')

# Which VAE? Keep only the rows belonging to the chosen model.
df <- df[df$model == 'basic', ]
#df <- df[df$model == 'count_match', ]

Here we add "pseudocounts": missing Pgen/Ppost values are replaced by half of the smallest observed value, so that R doesn't choke on the Pgen/Ppost calculations below.

In [None]:
# Recover the VAE probability from its log.
df$Pvae <- exp(df$log_Pvae)

# Fill every missing Pgen / Ppost with half of the smallest non-missing value
# in the same column. The right-hand side is evaluated before the assignment.
df$Pgen[is.na(df$Pgen)] <- min(df$Pgen, na.rm = TRUE) / 2
df$Ppost[is.na(df$Ppost)] <- min(df$Ppost, na.rm = TRUE) / 2

# Recompute the logs now that the gaps are filled.
df$log_Pgen <- log(df$Pgen)
df$log_Ppost <- log(df$Ppost)

In [None]:
# Rescale each probability column so that it sums to one within every
# (n_subjects, split) group, then take logs of the rescaled values.
# The explicit ungroup() stops the grouping from leaking into later cells:
# without it df stays a grouped tibble after this statement.
df <- df %>%
    group_by(n_subjects, split) %>%
    mutate(
        normed_Pgen = Pgen / sum(Pgen, na.rm = TRUE),
        normed_Ppost = Ppost / sum(Ppost, na.rm = TRUE),
        normed_Pvae = Pvae / sum(Pvae, na.rm = TRUE)
    ) %>%
    ungroup()
df$log_normed_Pgen <- log(df$normed_Pgen)
df$log_normed_Ppost <- log(df$normed_Ppost)
df$log_normed_Pvae <- log(df$normed_Pvae)

In [None]:
# Long format: one row per (observation, probability kind), keeping the
# grouping keys and log-frequency as identifiers.
id_vars <- c('n_subjects', 'split', 'log_frequency')
molten <- melt(
    df,
    id.vars = id_vars,
    measure.vars = measure_vars,
    variable.name = 'probability_name',
    value.name = 'log_probability'
)
# Put the rows in random order before plotting.
# NOTE(review): presumably so neither probability is always drawn on top - confirm.
molten <- molten[sample(nrow(molten)), ]

# Log-probability against log-frequency, one panel per (n_subjects, split).
ggplot(molten, aes(x = log_frequency, y = log_probability, colour = probability_name)) +
    facet_grid(rows = vars(n_subjects), cols = vars(split), scales = 'free') +
    geom_point(alpha = 0.015) +
    theme(aspect.ratio = 1) +
    coord_cartesian(ylim = c(-25, 0)) +
    # Points are nearly transparent; make the legend keys fully opaque.
    guides(colour = guide_legend(override.aes = list(alpha = 1)))
ggsave(
    filename = paste0(paste(measure_vars, collapse = '_vs_'), '.png'),
    width = 8,
    height = 8
)

In [None]:
# Multinomial log-likelihood of `count` under each set of normalized
# probabilities, computed separately for every (n_subjects, split) group.
# `count` is expected to be a column of df.
likes <- summarise(
    group_by(df, n_subjects, split),
    like_Pgen = dmultinom(x = count, prob = normed_Pgen, log = TRUE),
    like_Ppost = dmultinom(x = count, prob = normed_Ppost, log = TRUE),
    like_Pvae = dmultinom(x = count, prob = normed_Pvae, log = TRUE)
)
likes

In [None]:
# Long format: one row per (n_subjects, split, likelihood kind).
molten_likes <- melt(likes, id.vars = c('n_subjects', 'split'), value.name = 'likelihood')
# One line per (likelihood kind, split) combination.
molten_likes$group <- with(molten_likes, paste(variable, split))

ggplot(
    molten_likes,
    aes(x = n_subjects, y = likelihood, colour = variable, group = group, linetype = split)
) +
    geom_line() +
    scale_x_log10() +
    theme(aspect.ratio = phi) # golden ratio landscape

ggsave(filename = 'multinomial_likelihoods.png', width = 5, height = 5 * phi)

In [None]:
# Fit one linear regression of log-probability on log-frequency per
# (split, n_subjects) combination and collect the R^2 of every fit.
# This is a little tedious: lmList takes a single grouping variable, so the
# two keys are pasted into one space-separated label and unpacked again below.

df$group <- as.factor(paste(df$split, df$n_subjects, sep = ' '))
s_Pvae <- summary(lmList(log_Pvae ~ log_frequency | group, data = df[, c('log_Pvae', 'log_frequency', 'group')]))
s_Pgen <- summary(lmList(log_Pgen ~ log_frequency | group, data = df[, c('log_Pgen', 'log_frequency', 'group')]))
R2 <- data.frame(row.names = rownames(s_Pvae$df), R2_Pgen = s_Pgen$r.squared, R2_Pvae = s_Pvae$r.squared)

R2$names <- rownames(R2)
# Split on exactly the single space inserted by paste() above. The default
# separator of separate() is any run of non-alphanumeric characters, which
# would also break up a split label containing e.g. '_' or '-'.
R2 <- separate(R2, names, c('split', 'n_subjects'), sep = ' ')
R2$n_subjects <- as.integer(R2$n_subjects)
rownames(R2) <- NULL
R2

In [None]:
# Long format: one row per (n_subjects, split, R^2 kind).
molten_R2 <- melt(R2, id.vars = c('n_subjects', 'split'), value.name = 'R2')
# One line per (R^2 kind, split) combination.
molten_R2$group <- with(molten_R2, paste(variable, split))

ggplot(
    molten_R2,
    aes(x = n_subjects, y = R2, colour = variable, group = group, linetype = split)
) +
    geom_line() +
    scale_x_log10() +
    theme(aspect.ratio = phi) # golden ratio landscape

ggsave(filename = 'R2.png', width = 5, height = 5 * phi)