In [1]:
import pickle
import numpy as np

from sklearn.mixture import GaussianMixture

In [2]:
def get_values_from_trace(model, trace, thin=1, burn=0):
    """
    Collect, for every variable listed in ``model.vars``, the values stored in ``trace``.

    :param model: pymc3 model; only its ``vars`` attribute (objects with a ``name``) is read
    :param trace: pymc3 trace object; only its ``get_values`` method is called
    :param thin: int, forwarded to ``trace.get_values`` as ``thin``
    :param burn: int, number of steps to exclude, forwarded to ``trace.get_values`` as ``burn``
    :return: dict: varname --> whatever ``trace.get_values`` returns for that name (ndarray)
    """
    # Resolve all names up front, then query the trace one variable at a time.
    names = [rv.name for rv in model.vars]
    values_by_name = {}
    for name in names:
        values_by_name[name] = trace.get_values(name, thin=thin, burn=burn)
    return values_by_name


In [3]:
class GaussMix(object):
    """
    Wrapper around sklearn's GaussianMixture that exchanges samples as dicts
    (variable name --> 1d array) instead of 2d arrays.

    The column order of the underlying 2d array is the key order of the dict
    passed to fit(); every later call looks variables up by those names.
    """
    def __init__(self, n_components, covariance_type="diag", random_state=None):
        """
        :param n_components: int, number of mixture components
        :param covariance_type: str, forwarded to GaussianMixture
        :param random_state: forwarded to GaussianMixture; set it to make fit() and
                             sample() reproducible (None keeps sklearn's default behaviour)
        """
        self._n_components = n_components
        self._vars = []
        self._gm = GaussianMixture(n_components=self._n_components,
                                   covariance_type=covariance_type,
                                   random_state=random_state)

    def fit(self, sample_dict):
        """
        :param sample_dict: dict, var --> 1d array
        :return: self
        """
        self._vars = list(sample_dict.keys())
        X_train = self._dict_to_array(sample_dict)
        self._gm.fit(X_train)
        return self

    def score_samples(self, sample_dict):
        """
        :param sample_dict: dict, var --> 1d array; must contain every fitted variable
        :return: logp of each sample, as returned by GaussianMixture.score_samples
        """
        X = self._dict_to_array(sample_dict)
        return self._gm.score_samples(X)

    def sample(self, n_samples=1):
        """
        :param n_samples: int, number of samples to draw
        :return: dict, var --> 1d array of length n_samples
        """
        # GaussianMixture.sample returns a tuple; only its first element (the samples) is used.
        X = self._gm.sample(n_samples=n_samples)[0]
        return {v: X[:, i] for i, v in enumerate(self._vars)}

    def get_vars(self):
        """:return: list of fitted variable names, in column order (a copy)"""
        # Return a copy so callers cannot reorder the columns by mutating the list.
        return list(self._vars)

    def get_model(self):
        """:return: the underlying GaussianMixture object"""
        return self._gm

    def get_gm_fited_params(self):
        """
        Per-variable view of the fitted mixture.

        "sigmas" are the marginal standard deviations of each component. For
        covariance types that carry off-diagonal terms ("full", "tied") those terms
        are dropped, so the result describes the mixture exactly only for "diag"
        and "spherical".

        :return: dict, var --> {"weights": ndarray (n_components,),
                                "means": list of n_components floats,
                                "sigmas": list of n_components floats}
        """
        # Copy so that editing the returned dict cannot alter the fitted model.
        weights = np.array(self._gm.weights_)
        means = np.asarray(self._gm.means_)
        sigmas = np.sqrt(self._marginal_variances())

        results = {}
        for i, v in enumerate(self._vars):
            results[v] = {
                "weights": weights,
                "means": [means[j][i] for j in range(self._n_components)],
                "sigmas": [sigmas[j][i] for j in range(self._n_components)],
            }
        return results

    def get_n_components(self):
        """:return: int, number of mixture components"""
        return self._n_components

    def _marginal_variances(self):
        """
        :return: ndarray of shape (n_components, n_features) holding the variance of
                 every feature in every component, whatever the covariance_type
        """
        cov = np.asarray(self._gm.covariances_)
        cov_type = self._gm.covariance_type
        n_features = len(self._vars)

        if cov_type == "diag":
            # already (n_components, n_features)
            return cov
        if cov_type == "full":
            # (n_components, n_features, n_features) --> diagonal of each matrix
            return np.diagonal(cov, axis1=1, axis2=2)
        if cov_type == "tied":
            # one (n_features, n_features) matrix shared by all components
            return np.tile(np.diag(cov), (self._n_components, 1))
        if cov_type == "spherical":
            # one variance per component, shared by all features
            return np.repeat(cov[:, np.newaxis], n_features, axis=1)
        raise ValueError("unsupported covariance_type: %r" % (cov_type,))

    def _dict_to_array(self, sample_dict):
        """
        :param sample_dict: dict, var --> 1d array
        :return: ndarray of shape (n_samples, n_vars), columns ordered as self._vars
        """
        if not self._vars:
            raise ValueError("no variables known; call fit() with a non-empty sample_dict first")
        return np.stack([sample_dict[v] for v in self._vars], axis=1)

In [4]:
def log_normal_pdf(mu, sigma, y):
    """
    Log-density of a univariate normal distribution.

    :param mu: mean
    :param sigma: standard deviation
    :param y: point at which the density is evaluated
    :return: log of the N(mu, sigma**2) density at y
    """
    sigma2 = sigma * sigma
    res = - 0.5 * np.log(2 * np.pi * sigma2) - (0.5 / sigma2) * (y - mu) ** 2
    return res


def log_mult_normal_pdf(mu_vec, sigma_vec, y_vec):
    """
    Log-density of a multivariate normal with independent coordinates,
    i.e. the sum of the per-coordinate univariate log-densities.

    :param mu_vec: sequence of means, one per feature
    :param sigma_vec: sequence of standard deviations, one per feature
    :param y_vec: sequence of coordinates, one per feature
    :return: float, summed log-density
    """
    logp = 0.
    for mu, sigma, y in zip(mu_vec, sigma_vec, y_vec):
        logp += log_normal_pdf(mu, sigma, y)
    return logp


def log_gm_pdf(weights, mu_mat, sigma_mat, y_vec):
    """
    Log-density of a Gaussian mixture whose components have independent coordinates.

    :param weights: ndarray of shape (n_components,)
    :param mu_mat: ndarray of shape (n_components, n_features)
    :param sigma_mat: ndarray of shape (n_components, n_features)
    :param y_vec: ndarray of shape (n_features,)
    :return: float, log(sum_k weights[k] * N(y_vec | mu_mat[k], sigma_mat[k]))
    """
    n_components = mu_mat.shape[0]
    assert n_components == len(weights), "wrong weight len"

    # A zero weight legitimately maps to -inf; silence the divide warning for it.
    with np.errstate(divide="ignore"):
        log_weights = np.log(np.asarray(weights, dtype=float))

    log_terms = np.array([
        log_w + log_mult_normal_pdf(mu_mat[i, :], sigma_mat[i, :], y_vec)
        for i, log_w in enumerate(log_weights)
    ])
    # Combine in log space (log-sum-exp): exponentiating each term first underflows
    # to 0 for points far from every component and would yield log(0) = -inf.
    return np.logaddexp.reduce(log_terms)


def make_param_mats(var_names, gm_fited_params):
    """
    Rearrange per-variable mixture parameters into component-by-feature matrices.

    :param var_names: list of variable names; fixes the column order of the matrices
    :param gm_fited_params: dict, var --> {"weights": ..., "means": ..., "sigmas": ...}
    :return: (weights, mean_mat, sigma_mat) where weights is the "weights" entry of the
             first variable and both matrices have shape (n_components, n_features)
    """
    n_features = len(var_names)
    # The weights are read from the first variable only; their length sets n_components.
    weights = gm_fited_params[var_names[0]]["weights"]
    n_components = len(weights)

    mean_mat = np.zeros((n_components, n_features))
    sigma_mat = np.zeros((n_components, n_features))

    # Walk every (feature, component) pair; the component index varies fastest.
    for col, row in np.ndindex(n_features, n_components):
        per_var = gm_fited_params[var_names[col]]
        mean_mat[row, col] = per_var["means"][row]
        sigma_mat[row, col] = per_var["sigmas"][row]
    return weights, mean_mat, sigma_mat


def logp_gm(sample_dict, gm_fited_params):
    """
    Evaluate the mixture log-density at every sample, using the per-variable
    parameter dict instead of the fitted model object.

    :param sample_dict: dict, var --> 1d array; its key order fixes the feature order
    :param gm_fited_params: dict, var --> {"weights": ..., "means": ..., "sigmas": ...}
    :return: ndarray with one logp per sample
    """
    names = list(sample_dict)
    weights, mu_mat, sigma_mat = make_param_mats(names, gm_fited_params)

    # The number of samples is taken from the first variable.
    n_draws = len(sample_dict[names[0]])

    return np.array([
        log_gm_pdf(weights, mu_mat, sigma_mat,
                   np.array([sample_dict[name][k] for name in names]))
        for k in range(n_draws)
    ])

In [5]:
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
# encoding="latin1" is kept as-is (presumably the pickles were written by Python 2 -- confirm).
# The context managers close each file handle as soon as its object has been loaded.
with open("data/pm_model.pickle", "rb") as model_file:
    model = pickle.load(model_file, encoding="latin1")
with open("data/trace_obj.pickle", "rb") as trace_file:
    trace = pickle.load(trace_file, encoding="latin1")

In [6]:
sample = get_values_from_trace(model, trace, thin=10, burn=1000)

In [7]:
sample.keys()

dict_keys(['P0_interval__', 'Ls_log__', 'rho_interval__', 'DeltaG1_interval__', 'DeltaDeltaG_interval__', 'DeltaH1_interval__', 'DeltaH2_interval__', 'DeltaH_0_interval__', 'log_sigma_interval__'])

In [8]:
# Keep only the three variables of interest from the full sample dict.
vars_redun = [
    "DeltaDeltaG_interval__",
    "DeltaH2_interval__",
    "rho_interval__",
]
sample_redun = dict((name, sample[name]) for name in vars_redun)

In [16]:
np.corrcoef(sample_redun["DeltaDeltaG_interval__"], sample_redun["DeltaH2_interval__"])

array([[ 1.        , -0.66418762],
       [-0.66418762,  1.        ]])

In [17]:
np.corrcoef(sample_redun["DeltaDeltaG_interval__"], sample_redun["rho_interval__"])

array([[ 1.        , -0.81507442],
       [-0.81507442,  1.        ]])

In [18]:
np.corrcoef(sample_redun["DeltaH2_interval__"], sample_redun["rho_interval__"])

array([[1.        , 0.63664573],
       [0.63664573, 1.        ]])

In [9]:
gm = GaussMix(n_components=2)
gm.fit(sample_redun)

<__main__.GaussMix at 0x1c1f726898>

In [11]:
var_names = gm.get_vars()
var_names

['DeltaDeltaG_interval__', 'DeltaH2_interval__', 'rho_interval__']

In [12]:
gm_params = gm.get_gm_fited_params()
gm_params 

{'DeltaDeltaG_interval__': {'weights': array([0.3841449, 0.6158551]),
  'means': [-2.0727776419495987, -2.0988126610978113],
  'sigmas': [0.012977175461882056, 0.01132260032867812]},
 'DeltaH2_interval__': {'weights': array([0.3841449, 0.6158551]),
  'means': [-0.04358964097384545, -0.03600000421669944],
  'sigmas': [0.005217660743037346, 0.0040884011102282885]},
 'rho_interval__': {'weights': array([0.3841449, 0.6158551]),
  'means': [-1.8077246982554414, -1.6876925899280435],
  'sigmas': [0.06248262122442569, 0.050620789169130875]}}

# Testing that it runs correctly: `logp_gm` should reproduce `gm.score_samples`

In [13]:
X_test = gm.sample(n_samples=10)
X_test

{'DeltaDeltaG_interval__': array([-2.05724518, -2.07888896, -2.07351354, -2.0741865 , -2.06958531,
        -2.07517282, -2.10353689, -2.08948616, -2.09619999, -2.08921003]),
 'DeltaH2_interval__': array([-0.04687381, -0.04662911, -0.04126353, -0.04252729, -0.03772795,
        -0.0349692 , -0.03706326, -0.03999489, -0.03443599, -0.03906427]),
 'rho_interval__': array([-1.76702403, -1.78057609, -1.84396141, -1.80886029, -1.80720863,
        -1.64473731, -1.69814697, -1.7735829 , -1.62355458, -1.73398754])}

In [14]:
gm.score_samples(X_test)

array([7.53313439, 8.29027972, 8.39159228, 8.63723898, 8.00946794,
       7.18812068, 9.58255798, 8.14840344, 8.82037984, 8.80592661])

In [15]:
logp_gm(X_test, gm_params)

array([7.53313439, 8.29027972, 8.39159228, 8.63723898, 8.00946794,
       7.18812068, 9.58255798, 8.14840344, 8.82037984, 8.80592661])

# Trying different `n_components` and `covariance_type`

## `n_components=2`, `covariance_type="full"`

In [19]:
gm = GaussMix(n_components=2, covariance_type="full")
gm.fit(sample_redun)

<__main__.GaussMix at 0x1c24256518>