<a href="https://colab.research.google.com/github/pachterlab/GVFP_2021/blob/dev/gg220810_bf_calculation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bayes factor analysis

This short notebook imports the `.pickle` files containing PyMC3 fits to the $\Gamma$-OU and CIR models, computes the $\ln$ Bayes factor for each dataset and gene (the log of the ratio of the model-specific marginal likelihoods, i.e. $\ln Z_{\mathrm{CIR}} - \ln Z_{\Gamma\text{-OU}}$, given a uniform prior), then stores them to disk.

In [1]:
# Shallow-clone only the tip of the `dev` branch: the clone directory (including its
# .git history) is removed a few cells below, so fetching the full history is wasted work.
!git clone --depth 1 --branch dev https://github.com/pachterlab/GVFP_2021.git

Cloning into 'GVFP_2021'...
remote: Enumerating objects: 2265, done.[K
remote: Counting objects: 100% (2265/2265), done.[K
remote: Compressing objects: 100% (1199/1199), done.[K
remote: Total 2265 (delta 1051), reused 2265 (delta 1051), pack-reused 0[K
Receiving objects: 100% (2265/2265), 896.58 MiB | 30.67 MiB/s, done.
Resolving deltas: 100% (1051/1051), done.
Checking out files: 100% (767/767), done.


In [2]:
# Move the (non-hidden) contents of the clone into the current working directory,
# so that relative paths used later in this notebook (e.g. './smc_results/') are
# resolved from here. NOTE(review): presumably 'smc_results' ships with the repo — confirm.
!mv GVFP_2021/* .

In [3]:
# Delete what is left of the clone directory after its contents were moved out.
!rm -r GVFP_2021

# Helper functions

In [9]:
import numpy as np  # needed by LogLike.perform; imported here so this cell does not rely on a later one
import pymc3 as pm
import theano.tensor as tt


class LogLike(tt.Op):
    """Theano Op wrapping a plain-Python log-likelihood callable.

    The Op takes one vector of parameter values and produces one scalar,
    obtained by calling ``likelihood(phi, mx, data)`` with the ``mx`` and
    ``data`` objects supplied at construction time.
    """

    itypes = [tt.dvector]  # expects a vector of parameter values when called
    otypes = [tt.dscalar]  # outputs a single scalar value (the log likelihood)

    def __init__(self, mx, data, likelihood):
        """Store the fixed arguments that are forwarded to ``likelihood``.

        Parameters
        ----------
        mx
            Passed unchanged as the second argument of ``likelihood``.
        data
            Passed unchanged as the third argument of ``likelihood``.
        likelihood
            Callable invoked as ``likelihood(phi, mx, data)``.
        """
        # Keep the inputs as instance attributes so that perform() can reach them.
        self.mx = mx
        self.data = data
        self.likelihood = likelihood

    def perform(self, node, inputs, outputs):
        """Evaluate the wrapped likelihood and write it to the output storage.

        ``inputs`` must contain exactly one element (the parameter vector);
        the result is wrapped in a NumPy array and stored in ``outputs[0][0]``.
        """
        (phi,) = inputs  # single input: the parameter vector
        logl = self.likelihood(phi, self.mx, self.data)  # call the log-likelihood function
        outputs[0][0] = np.array(logl)  # hand the log-likelihood back to Theano
def ll_CIR(phi, mx, data):
    """Stub for the CIR log-likelihood: ignores its arguments and returns None.

    NOTE(review): presumably defined only so that objects pickled with a
    reference to ``ll_CIR`` can be resolved when loaded — confirm.
    """
    return None
def ll_GOU(phi, mx, data):
    """Stub for the Gamma-OU log-likelihood: ignores its arguments and returns None.

    NOTE(review): presumably defined only so that objects pickled with a
    reference to ``ll_GOU`` can be resolved when loaded — confirm.
    """
    return None

# BF import and calculation

In [10]:
import numpy as np
# Sample identifiers; each one maps to a dataset named 'allen_<sample>_Glutamatergic'.
sample_names = ['C01', 'B08', 'H12', 'F08']
dataset_names = [f'allen_{name}_Glutamatergic' for name in sample_names]
n_datasets = len(dataset_names)

In [11]:
import pickle
# Genes whose Gamma-OU (GOU) and CIR trace pickles are loaded below.
# Alternative gene sets left over from earlier runs (kept for reference):
#   ['Gabra4', 'Aplp2', 'Srpk1']
#   ['Birc6','Ube2k','Rbm25','Hprt','Cap1','Ywhaq','Pnisr','Pura','Ccdc39']
BF_genes = [
    'Birc6', 'Ube2k', 'Pum1', 'Nf1', 'Rbm25', 'Hprt',
    'Cap1', 'Ywhaq', 'Pnisr', 'Ywhah', 'Pura', 'Ccdc39',
]
n_BF_genes = len(BF_genes)

# One row per dataset, one column per gene; every entry starts out as NaN.
BF = np.full((n_datasets, n_BF_genes), np.nan)

trace_dir = './smc_results/'
for j, sample in enumerate(dataset_names):
    for i, gene in enumerate(BF_genes):
        # NOTE(review): pickle.load can execute arbitrary code; only load trusted trace files.
        with open(trace_dir + sample + "_" + gene + '_GOU_trace.pickle', 'rb') as f:
            trace_GOU = pickle.load(f)
        with open(trace_dir + sample + "_" + gene + '_CIR_trace.pickle', 'rb') as f:
            trace_CIR = pickle.load(f)
        # ln Bayes factor: CIR log marginal likelihood minus the GOU one
        # (positive values favour CIR, negative values favour GOU).
        BF[j, i] = (
            trace_CIR.report.log_marginal_likelihood
            - trace_GOU.report.log_marginal_likelihood
        )

# Persist the full (dataset x gene) table next to the traces.
np.save(trace_dir + 'bfs.npy', BF)

In [12]:
# Show the ln Bayes factors transposed: one row per gene (in BF_genes order),
# one column per dataset (in dataset_names order), rounded to two decimals.
BF.T.round(2)

array([[ -62.17,  -82.41,   -6.02,  -63.78],
       [ -67.56,  -98.53,   -3.47,  -49.31],
       [ -83.46,  -95.84,  -12.25,  -68.78],
       [ -95.22, -148.32,  -11.96, -125.92],
       [ -15.28,   36.63,   22.79,    2.89],
       [  -2.71,   39.36,   16.17,   28.26],
       [  37.  ,   33.22,   32.09,   50.85],
       [  92.93,  105.23,   62.93,   69.16],
       [ -16.04,   14.94,   22.76,  -21.44],
       [ -20.66,   62.72,   -0.25,   41.79],
       [ -20.63,    0.17,    4.71,  -18.86],
       [  -4.17,   -1.67,    0.23,   -0.8 ]])