# 最大期望算法 Expectation Maximization Algorithm

## 代码实现

### 生成测试数据

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Fix the random seed so the generated samples are reproducible.
np.random.seed(20200503)

# Draw N boys' heights from a normal distribution with mean = 175, std = 10.
N = 10000
mu = 175
sigma = 10
height_of_boy = mu + sigma * np.random.randn(N)

# Draw N girls' heights from a normal distribution with mean = 160 and the
# same std = 10 (mu is re-bound; sigma is reused).
mu = 160
height_of_girl = mu + sigma * np.random.randn(N)

# Three stacked panels: boys, girls, and the mixed (unlabelled) sample.
fig, axs = plt.subplots(3, 1, constrained_layout=True, figsize=(8, 8), dpi=300)

# Titles containing mathtext are raw strings so that "\m" and "\s" are not
# parsed as (invalid) Python escape sequences; the resulting text is unchanged.
axs[0].hist(height_of_boy, bins=20, density=True)
axs[0].set_title(r'Boys Histogram of Height: $\mu$ = 175cm, $\sigma$ = 10')
axs[0].set_xlabel('Height')
axs[0].set_ylabel('Probability Density')

axs[1].hist(height_of_girl, bins=20, density=True)
axs[1].set_title(r'Girls Histogram of Height: $\mu$ = 160cm, $\sigma$ = 10')
axs[1].set_xlabel('Height')
axs[1].set_ylabel('Probability Density')

# Mix the data: stack both samples and flatten to one 1-D array of length
# 2 * N (all boys' samples first, then all girls' samples).
mixed_height = np.asarray([height_of_boy, height_of_girl]).reshape(2 * N)

axs[2].hist(mixed_height, color=(0, 0, 1), bins=40, density=True)
axs[2].set_title('Histogram of Height')
axs[2].set_xlabel('Height')
axs[2].set_ylabel('Probability Density')

plt.show()

### EM 算法求解男生、女生的身高

In [None]:
import pylab as pl
from IPython import display

# Normal-distribution density f(x) evaluated at the input x.
def gaussian_distribution(x, mu, std):
    """Return the Gaussian probability density at *x*.

    Args:
        x: Point(s) at which to evaluate the density (scalar or NumPy array).
        mu: Mean of the distribution.
        std: Standard deviation of the distribution.

    Returns:
        The density value(s), with the same shape as the broadcast inputs.
    """
    variance = std ** 2
    normaliser = np.sqrt(2 * np.pi * variance)
    exponent = -((x - mu) ** 2) / (2 * variance)
    return 1.0 / normaliser * np.exp(exponent)


# E step: estimate the prob. (latent variable) that each sample belongs to the
# boys' component, given the current distribution parameters.
def E_step(heights, mu_b, var_b, mu_g, var_g):
    """Return, per sample, the probability that it came from the boys' Gaussian.

    Both components are given equal weight, so the result is
    ``pdf_b / (pdf_b + pdf_g)``. The ratio is evaluated in log space: the
    direct form becomes 0/0 (NaN) when both densities underflow to zero for
    samples far from both means.

    Args:
        heights: Sample heights (scalar or array-like).
        mu_b: Mean of the boys' component.
        var_b: Standard deviation of the boys' component (despite the name,
            it is used as a standard deviation, not a variance).
        mu_g: Mean of the girls' component.
        var_g: Standard deviation of the girls' component (same naming caveat).

    Returns:
        Array of probabilities in [0, 1], one per sample.
    """
    heights = np.asarray(heights, dtype=float)

    # Log-densities up to the shared constant -0.5 * log(2 * pi), which
    # cancels in the ratio. abs() mirrors the sign-insensitivity of std**2.
    log_b = -0.5 * ((heights - mu_b) / var_b) ** 2 - np.log(np.abs(var_b))
    log_g = -0.5 * ((heights - mu_g) / var_g) ** 2 - np.log(np.abs(var_g))

    # b / (b + g) == 1 / (1 + exp(log_g - log_b)); logaddexp evaluates
    # log(1 + exp(d)) without overflow.
    return np.exp(-np.logaddexp(0.0, log_g - log_b))


# M step: re-estimate both components' parameters, treating the latent
# membership probabilities as known.
def M_step(heights, prob_of_boys):
    """Re-estimate the mean and standard deviation of both components.

    Args:
        heights: NumPy array of sample heights.
        prob_of_boys: NumPy array with, per sample, the probability that it
            belongs to the boys' component; the girls' weight is its
            complement.

    Returns:
        Tuple ``(mu_b, var_b, mu_g, var_g)``. The ``var_*`` entries are
        weighted standard deviations (square root of the weighted variance).
    """
    def weighted_mean_and_std(weights):
        # Weighted mean, then the square root of the weighted mean squared
        # deviation around it.
        total = weights.sum()
        mean = (heights * weights).sum() / total
        std = np.sqrt((weights * (heights - mean) ** 2).sum() / total)
        return mean, std

    boys = weighted_mean_and_std(prob_of_boys)
    girls = weighted_mean_and_std(1 - prob_of_boys)
    return (*boys, *girls)


# Whole procedure of the EM algorithm:
# 1. initialise the distribution parameters with fixed starting guesses
# 2. E step
# 3. M step
# 4. repeat steps 2-3 for a fixed number of iterations
def EM_iteration(heights, iters=5):
    """Run EM on *heights* and redraw the current fit at every iteration.

    Each iteration plots the current estimates first and then updates them
    with one E step and one M step, so the first frame shows the initial
    guesses.

    The background histogram is drawn from the module-level
    ``height_of_boy`` and ``height_of_girl`` arrays (the labelled data), not
    from *heights*; only the fitted curves depend on *heights*.

    Args:
        heights: 1-D NumPy array of mixed, unlabelled heights.
        iters: Number of EM iterations (and plot refreshes) to run.

    Returns:
        None. The function only produces notebook output.
    """
    # Fixed initial guesses. The var_* names hold standard deviations: they
    # are passed as the std argument of gaussian_distribution below.
    mu_b = 180
    var_b = 5
    mu_g = 150
    var_g = 5
    x_range = np.linspace(heights.min(), heights.max(), 100)

    fig = pl.figure(figsize=(12, 8), dpi=300)
    ax = fig.subplots()
    for i in range(iters):
        ax.cla()
        ax.hist([height_of_boy, height_of_girl], bins=50, density=True,
                color=['C0', 'C1'], label=['Data: Boys', 'Data: Girls'])
        ax.set_title('Histogram of Height, iter = %d' % (i+1))

        # Raw strings keep the mathtext backslashes from being parsed as
        # (invalid) Python escape sequences; the rendered label is unchanged.
        ax.plot(x_range, gaussian_distribution(x_range, mu_b, var_b), color='C0',
                label=r'$\mu_b$=%.2f, $\sigma_b$=%.2f' % (mu_b, var_b))

        ax.plot(x_range, gaussian_distribution(x_range, mu_g, var_g), color='C1',
                label=r'$\mu_g$=%.2f, $\sigma_g$=%.2f' % (mu_g, var_g))

        ax.set_ylabel('Probability density')
        ax.set_xlabel('Height')
        ax.legend(loc='upper right')
        # Replace the previous frame in the notebook output with the new one.
        display.clear_output(wait=True)
        display.display(fig)

        prob_of_boys = E_step(heights, mu_b, var_b, mu_g, var_g)
        mu_b, var_b, mu_g, var_g = M_step(heights, prob_of_boys)

    # Close this specific figure rather than whichever figure is current.
    pl.close(fig)

# Run 20 EM iterations on the mixed sample, redrawing the fit each time.
EM_iteration(mixed_height, iters=20)