# MSc DS: Learning a Gaussian with missing values via SGD

In [50]:
import tensorflow as tf
import numpy as np
import scipy.stats
import scipy.io
import scipy.sparse
from scipy.io import loadmat
import pandas as pd
import tensorflow_probability as tfp
tfd = tfp.distributions
tfk = tf.keras
tfkl = tf.keras.layers
from PIL import Image
import matplotlib.pyplot as plt

We load the Iris data set.

In [139]:
from sklearn.datasets import load_iris
# `return_X_y` is keyword-only in current scikit-learn releases, so it is passed by name.
# Element 0 of the returned pair is the feature matrix; the targets are discarded.
data = load_iris(return_X_y=True)[0]

We now standardise the data:

In [140]:
# Centre every column and divide it by its standard deviation (NumPy's default, ddof=0),
# then store the result in single precision.
xfull = ((data - data.mean(axis=0)) / data.std(axis=0)).astype(np.float32)
n, p = xfull.shape[:2]  # n: number of observations, p: number of features

We will remove 10% of the entries uniformly at random. This corresponds to a *missing completely at random (MCAR)* scenario.

In [141]:
perc_miss = 0.1 # fraction of entries to remove (10% of missing data)
xmiss = np.copy(xfull)
xmiss_flat = xmiss.flatten()
# The `np.int` alias has been removed from NumPy; the built-in int yields the same count.
n_missing = int(np.floor(n*p*perc_miss))
# Flat indices of the entries to delete, drawn without replacement.
miss_pattern = np.random.choice(n*p, n_missing, replace=False)
xmiss_flat[miss_pattern] = np.nan
xmiss = xmiss_flat.reshape([n,p]) # in xmiss, the missing values are represented by nans
mask = np.isfinite(xmiss) # boolean mask: True where a value is observed, False where it is missing

We want to learn a Gaussian distribution:
$$p(x) = \mathcal{N}(x|\mu,\Sigma), $$
where $\Sigma$ is a diagonal matrix, using maximum likelihood.

In [142]:
# Trainable parameters of the diagonal Gaussian, one entry per feature.
mu = tf.Variable(initial_value=tf.ones(shape=[p]), dtype=tf.float32)  # mean vector, initialised at 1
log_sigma_diag = tf.Variable(initial_value=tf.zeros(shape=[p]), dtype=tf.float32)  # log-sd of the Gaussian, initialised at 0

We first define a function that computes the log-likelihood of a complete data point.

In [143]:
@tf.function
def log_likelihood(x):
  """Return the log-density of fully observed input `x` under the current Gaussian.

  The Gaussian has mean `mu` and a diagonal scale obtained by exponentiating
  `log_sigma_diag`; both are read from the enclosing scope.
  """
  gaussian = tfd.MultivariateNormalDiag(
      loc=mu,
      scale_diag=tf.exp(log_sigma_diag),  # back from log-sd to sd
  )
  return gaussian.log_prob(x)

Then, a similar one that can compute $\log p(x^{obs})$, which is the relevant quantity to look at under MCAR.

In [133]:
@tf.function
def log_likelihood_incomplete(x,m): # log(p(x_obs))
  """Return the log-density of the entries of `x` selected by the mask `m`.

  Both arguments are squeezed first. The squeezed mask then indexes `x`, `mu`
  and `log_sigma_diag`, so the Gaussian is built over the selected coordinates only.
  """
  x_flat = tf.squeeze(x)
  keep = tf.squeeze(m)
  obs_mean = mu[keep]
  obs_scale = tf.exp(log_sigma_diag[keep])  # back from log-sd to sd
  marginal = tfd.MultivariateNormalDiag(loc=obs_mean, scale_diag=obs_scale)
  return marginal.log_prob(x_flat[keep])

Now we perform SGD, first on complete data.

In [144]:
# Variables the optimiser updates: the mean and the log standard deviations.
params = [mu, log_sigma_diag]

# Adam with a small step size.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

In [145]:
def train_step(data):
  """Run one Adam update of `params` on a batch of complete data points.

  Args:
    data: batch of observations passed to `log_likelihood`.
  """
  with tf.GradientTape() as tape: # the gradient tape records the operations needed for automatic differentiation
    # Average over the batch so the loss is the mean negative log-likelihood.
    # Without the mean the gradient would be summed over the batch and scale with its size;
    # for a batch of one observation both give the same value.
    loss = -tf.reduce_mean(log_likelihood(data))
  gradients = tape.gradient(loss, params)  # here, the gradient is automatically computed
  optimizer.apply_gradients(zip(gradients, params))  # Adam iteration

In [146]:
def train_step_incomplete(data,mask):
  """Run one Adam update of `params` using only the entries of `data` selected by `mask`.

  Args:
    data: observation passed to `log_likelihood_incomplete`.
    mask: mask passed alongside it, selecting the entries that enter the loss.
  """
  with tf.GradientTape() as recorder:  # keeps track of the operations to differentiate
    neg_log_lik = -log_likelihood_incomplete(data, mask)  # negative log p(x_obs)
  grads = recorder.gradient(neg_log_lik, params)  # automatic differentiation w.r.t. params
  optimizer.apply_gradients(zip(grads, params))  # Adam iteration

In [147]:
# The shuffle buffer must cover all n observations to give a uniform shuffle;
# a buffer of size p (the number of features) only reorders neighbouring rows.
train_data_complete = tf.data.Dataset.from_tensor_slices(xfull).shuffle(n).batch(1)

In [None]:
EPOCHS = 1000

for epoch in range(1,EPOCHS+1):
  # The loop variable is `batch`, not `data`, so the raw data array loaded earlier is not overwritten.
  for batch in train_data_complete:
    train_step(batch) # Adam iteration
  if (epoch % 100) == 1:  # report at epochs 1, 101, 201, ...
    ll_train = tf.reduce_mean(log_likelihood(xfull))
    print('Epoch  %g' %epoch)
    print('Training log-likelihood %g' %ll_train.numpy())
    print('Mean')  # plain label: the former '%g' had no argument and was printed literally
    tf.print(mu)
    print('-----------')

Epoch  1
Training log-likelihood -7.62358
Mean %g
[0.991388202 0.989105403 0.992225766 0.992002726]
-----------


And now on incomplete data.

In [None]:
# Each element pairs a row of xmiss with its row of the mask.
# The shuffle buffer must cover all n observations to give a uniform shuffle;
# a buffer of size p (the number of features) only reorders neighbouring rows.
train_data_incomplete = tf.data.Dataset.from_tensor_slices((xmiss,mask)).shuffle(n).batch(1)

In [None]:
EPOCHS = 1000

# NOTE(review): no re-initialisation of mu / log_sigma_diag / optimizer is visible before this
# loop, so it presumably continues from the state left by the previous cells - confirm this is intended.
for epoch in range(1,EPOCHS+1):
  # The loop variables are `batch` and `batch_mask`, so the raw data array named `data` is not overwritten.
  for batch,batch_mask in train_data_incomplete:
    train_step_incomplete(batch,batch_mask) # Adam iteration
  if (epoch % 100) == 1:  # report at epochs 1, 101, 201, ...
    # The reported log-likelihood is evaluated on the complete data xfull.
    ll_train = tf.reduce_mean(log_likelihood(xfull))
    print('Epoch  %g' %epoch)
    print('Training log-likelihood %g' %ll_train.numpy())
    print('Mean')  # plain label: the former '%g' had no argument and was printed literally
    tf.print(mu)
    print('-----------')