In [16]:
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import scipy as sp
import inspect
import os
import sys
import time

import paragami

from copy import deepcopy

import bnpregcluster_runjingdev.regression_mixture_lib as gmm_lib

import emfathy

# Seed NumPy's legacy global random state so that draws made through
# np.random are repeatable from one run of the notebook to the next.
np.random.seed(42) # nothing special about this seed (we hope)!

In [19]:
# `reg_params` holds the transformed regression summaries read from the fit
# file; `orig_reg_params` holds the untransformed beta mean / info.
orig_reg_params, reg_params = {}, {}

# Alternative input file:
# 'fits/transformed_gene_regression_df7_degree3_genes10000.npz'
datafile = 'fits/transformed_gene_regression_df7_degree3_genes1000.npz'
with np.load(datafile) as infile:
    reg_params.update({
        'y_info': infile['y_info'],
        'beta_mean': infile['transformed_beta_mean'],
        'beta_info': infile['transformed_beta_info'],
    })
    orig_reg_params.update({
        'beta_mean': infile['beta_mean'],
        'beta_info': infile['beta_info'],
    })
    df, degree = infile['df'], infile['degree']
    x_train, y_train = infile['x'], infile['y_train']

# First axis of the transformed beta_mean -> num_genes, second -> obs_dim.
beta_mean_shape = reg_params['beta_mean'].shape
num_genes, obs_dim = beta_mean_shape[0], beta_mean_shape[1]

num_components = 100

# Name built from the fit settings stored in the file plus the sizes above.
analysis_name = (
    'transformed_gene_regression_df{}_degree{}_genes{}_num_components{}_fit'
    .format(df, degree, num_genes, num_components))
print(reg_params['beta_info'].shape, reg_params['beta_mean'].shape)

(700, 9, 9) (700, 9)


In [26]:
# Shrink the y_info with empirical Bayes.

# The stored y_info values are used directly as tau; log(tau) is their
# elementwise logarithm.
e_tau = reg_params['y_info']
e_log_tau = np.log(e_tau)

# Residuals of y_train about the fitted values sum_i x_train[t, i] * beta[n, i],
# computed with the untransformed beta means; the result is indexed [n, t].
orig_beta_mean = orig_reg_params['beta_mean']
resid = y_train - np.einsum('ti,ni->nt', x_train, orig_beta_mean)
print(x_train.shape, orig_beta_mean.shape, resid.shape, e_log_tau.shape)


def get_hierarchical_gamma_log_prob(gamma, log_gamma, gamma_shape, gamma_rate):
    """Return the summed log density of a gamma(shape, rate) distribution.

    Args:
        gamma: Value(s) at which the density is evaluated.
        log_gamma: Value(s) used for log(gamma); supplied by the caller
            rather than computed here.
        gamma_shape: Shape parameter of the gamma distribution.
        gamma_rate: Rate parameter of the gamma distribution.

    Returns:
        The sum, over all broadcast elements, of
        ``shape * log(rate) - gammaln(shape)
        + (shape - 1) * log_gamma - rate * gamma``.
    """
    # The terms are named separately because any of them can cause numerical
    # difficulties, and this makes them easy to inspect when debugging.
    log_normalizer = \
        gamma_shape * np.log(gamma_rate) - sp.special.gammaln(gamma_shape)
    log_power_term = (gamma_shape - 1) * log_gamma
    rate_term = gamma_rate * gamma
    return np.sum(log_normalizer + log_power_term - rate_term)


def get_regression_log_lik_by_nt(e_tau, e_log_tau, resid, prior_shape, prior_rate):
    """Return the residual log likelihood plus the gamma log prior on tau.

    Each residual contributes ``-0.5 * tau * resid**2 + 0.5 * log_tau``; no
    ``-0.5 * log(2 * pi)`` constant is included.  Despite the ``_by_nt``
    suffix, the per-entry terms are summed and a single total is returned.

    Args:
        e_tau: Array indexed along its first axis; ``e_tau[:, None]`` is
            broadcast against ``resid``.
        e_log_tau: Array of log-tau values with the same layout as ``e_tau``.
        resid: Residuals, broadcastable against ``e_tau[:, None]``.
        prior_shape: Shape parameter passed to the gamma log prior.
        prior_rate: Rate parameter passed to the gamma log prior.

    Returns:
        Sum of the per-entry log likelihood terms plus the value of
        ``get_hierarchical_gamma_log_prob(e_tau, e_log_tau, prior_shape,
        prior_rate)``.
    """
    tau_column = e_tau[:, None]
    log_tau_column = e_log_tau[:, None]

    quadratic_term = -0.5 * tau_column * (resid ** 2)
    log_tau_term = 0.5 * log_tau_column
    log_lik_by_nt = quadratic_term + log_tau_term

    log_prior = get_hierarchical_gamma_log_prob(
        e_tau, e_log_tau, prior_shape, prior_rate)
    return np.sum(log_lik_by_nt) + np.sum(log_prior)


# Shape and rate handed to the gamma log prior inside the objective.
prior_shape, prior_rate = 3.0, 4.0

# Evaluate the objective at the current e_tau / e_log_tau; as the last
# expression of the cell, its value is displayed.
get_regression_log_lik_by_nt(
    e_tau=e_tau,
    e_log_tau=e_log_tau,
    resid=resid,
    prior_shape=prior_shape,
    prior_rate=prior_rate)

(42, 10) (700, 10) (700, 42) (700,)


-2955.9225238187228

The Wishart-based empirical Bayes shrinkage below is misguided.

In [None]:
# Eigenvalues of every beta_info matrix, one row per matrix.
#
# eigvalsh is used rather than eigvals: eigvals returns the eigenvalues in no
# guaranteed order (and possibly as complex numbers), so a fixed column of the
# result would not correspond to a fixed eigenvalue.  eigvalsh returns real
# eigenvalues in ascending order, so the last column is the largest one.
# NOTE(review): eigvalsh assumes each beta_info[n] is symmetric and reads only
# one triangle of it -- confirm the stored information matrices are symmetric.
evs = np.linalg.eigvalsh(reg_params['beta_info'])

# The batched call must produce the same layout the explicit loop filled in.
assert evs.shape == (num_genes, obs_dim)

In [None]:
# We could justifiably shrink the information matrices.  Let's do empirical Bayes.
# Histogram (100 bins) of the last column of `evs`, which has obs_dim columns.
plt.hist(evs[:, -1], bins=100);

In [None]:
# Average of the beta_info matrices over their leading axis.
wishart_mean = reg_params['beta_info'].mean(axis=0)

# A starting degrees-of-freedom value (not referenced again in the cells
# visible here).
wishart_df = obs_dim + 2

# Scalar pattern for the degrees of freedom, with lb set to obs_dim.
wishart_df_pattern = paragami.NumericScalarPattern(lb=obs_dim)

def get_wishart_loss(wishart_df):
    """Return the negative summed Wishart log density of the beta_info matrices.

    The scale matrix is ``wishart_mean / df``, so the Wishart mean
    (``df * scale``) equals ``wishart_mean`` for every value of ``df``.

    Args:
        wishart_df: Degrees of freedom, as a scalar or an array whose first
            element is used.

    Returns:
        ``-sum(logpdf)`` over all matrices in ``reg_params['beta_info']``.
    """
    df_value = np.atleast_1d(wishart_df)[0]

    # scipy's Wishart logpdf wants the matrix index on the last axis, so move
    # the leading axis of beta_info to the end.
    info_matrices = np.moveaxis(reg_params['beta_info'], 0, 2)
    log_densities = sp.stats.wishart.logpdf(
        info_matrices, df_value, scale=wishart_mean / df_value)
    return -np.sum(log_densities)

In [None]:
# Wrap the loss so that it takes the flattened, free (free=True) value of the
# degrees-of-freedom pattern, then minimize it over that single scalar.
flat_wishart_loss = paragami.FlattenFunctionInput(
    get_wishart_loss, patterns=wishart_df_pattern, free=True)
wishart_opt = sp.optimize.minimize_scalar(flat_wishart_loss)

In [None]:
# Fold the optimizer's free-space solution back through the pattern and take
# its first element as the fitted degrees of freedom.
wishart_df_opt = wishart_df_pattern.fold(wishart_opt.x, free=True)[0]
wishart_scale_opt = wishart_mean / wishart_df_opt
print(wishart_df_opt)

# Loss on a grid of 20 df values from obs_dim + 0.01 to obs_dim + 30, with the
# fitted value marked by a red '+'.
df_vec = np.linspace(obs_dim + 0.01, obs_dim + 30, 20)
wish_lp = list(map(get_wishart_loss, df_vec))
plt.plot(df_vec, wish_lp)
opt_loss = get_wishart_loss(wishart_df_opt)
plt.plot(wishart_df_opt, opt_loss, 'r+')

In [None]:
# Sanity check that we match the sufficient statistics

# Simulate from the fitted Wishart and compare the simulated mean matrix with
# the observed mean (Frobenius norm of the difference).
wishart_draws = sp.stats.wishart.rvs(
    df=wishart_df_opt, scale=wishart_scale_opt,
    size=100000)
num_draws = wishart_draws.shape[0]
draws_mean = np.mean(wishart_draws, axis=0)
print(np.linalg.norm(draws_mean - wishart_mean))

# slogdet operates on stacks of matrices, so a single call per stack replaces
# one Python-level call per matrix; element [1] of the result is log|det|.
mle_logdet = np.linalg.slogdet(wishart_draws)[1]
obs_logdet = np.linalg.slogdet(reg_params['beta_info'])[1]

# The standard error reflects only the Monte Carlo noise in the simulated mean.
print('logdet difference {} (se {})'.format(
    (np.mean(mle_logdet) - np.mean(obs_logdet)),
    np.std(mle_logdet) / np.sqrt(num_draws)))

# Entrywise z-scores of the simulated mean against the observed mean, followed
# by the entrywise Monte Carlo standard errors themselves.
wishart_draw_se = np.std(wishart_draws, axis=0) / np.sqrt(num_draws)
wishart_err = (draws_mean - wishart_mean) / wishart_draw_se
plt.matshow(wishart_err); plt.colorbar()
plt.matshow(wishart_draw_se); plt.colorbar()


The above is not well thought out.  What is the noise in the observations ``beta_info``?  This could get complicated.  It is perhaps better to shrink ``y_info`` instead.

In [None]:
# Histogram (100 bins) of the y_info values.
plt.hist(reg_params['y_info'], bins=100);