# 数据集: 仿真两个高斯分布数据并混合
- 数据长度：1000
$$
\phi(x; \theta) = \frac{1}{\sqrt{2\pi}\,\sigma} \exp\left(-\frac{(x - \mu)^2}{2\sigma^2}\right)
$$

In [1]:
import numpy as np
import random
import math


In [2]:
# create data
def load_data(mu0, sigma0, mu1, sigma1, alpha0, alpha1, data_len=1000):
    """
    Simulate a shuffled sample drawn from a mixture of two Gaussians.

    Args:
        mu0, sigma0: mean and standard deviation of component 0
            (passed to np.random.normal as loc and scale).
        mu1, sigma1: mean and standard deviation of component 1.
        alpha0, alpha1: mixing weights; each component contributes
            round(data_len * alpha) samples.
        data_len: nominal total number of samples (default 1000).

    Returns:
        A Python list holding the samples of both components in random order.
    """
    # round() instead of int(): a product such as 0.29 * 100 evaluates to
    # 28.999999999999996, which int() would truncate to 28 and drop a sample.
    count0 = int(round(data_len * alpha0))
    count1 = int(round(data_len * alpha1))

    data0 = np.random.normal(mu0, sigma0, count0)
    data1 = np.random.normal(mu1, sigma1, count1)

    dataSet = []
    dataSet.extend(data0)
    dataSet.extend(data1)

    # Shuffle so the component of origin cannot be read off the position
    random.shuffle(dataSet)

    return dataSet

In [13]:
# EM Algorithm
def cacl_Gauss(datasetArr, mu, sigma):
    """
    Evaluate the Gaussian density with mean mu and standard deviation sigma
    at every element of datasetArr.
    """
    # Normalising constant 1 / (sqrt(2*pi) * sigma)
    norm_const = 1 / (math.sqrt(2 * math.pi) * sigma)

    # Exponent -(x - mu)^2 / (2 * sigma^2), computed element-wise
    squared_dev = (datasetArr - mu) ** 2
    exponent = -1 * squared_dev / (2 * sigma ** 2)

    return norm_const * np.exp(exponent)


def E_step(datasetArr, alpha0, mu0, sigma0, alpha1, mu1, sigma1):
    """
    E-step of the EM algorithm: for every sample, compute the share of the
    weighted density contributed by each of the two components.

    Returns the pair (gamma0, gamma1), which sums to 1 element-wise.
    """
    # Weighted densities alpha_k * phi(x; mu_k, sigma_k)
    weighted0 = alpha0 * cacl_Gauss(datasetArr, mu0, sigma0)
    weighted1 = alpha1 * cacl_Gauss(datasetArr, mu1, sigma1)

    # Mixture density at each sample, used as the normaliser
    mixture = weighted0 + weighted1

    return weighted0 / mixture, weighted1 / mixture

def M_step(mu0, mu1, gamma0, gamma1, datasetArr):
    """
    M-step of the EM algorithm: re-estimate the parameters of both Gaussian
    components from the responsibilities.

    Args:
        mu0, mu1: means from the previous iteration. They are no longer used
            (the variance is taken around the updated means) and are kept
            only so existing callers keep working.
        gamma0, gamma1: per-sample responsibilities of component 0 / 1.
        datasetArr: 1-D array of samples, same length as the gammas.

    Returns:
        (mu0_new, mu1_new, sigma0_new, sigma1_new, alpha0_new, alpha1_new)
    """
    # Total responsibility of each component, reused by every update below
    resp_sum0 = np.sum(gamma0)
    resp_sum1 = np.sum(gamma1)

    # Responsibility-weighted means
    mu0_new = np.dot(gamma0, datasetArr) / resp_sum0
    mu1_new = np.dot(gamma1, datasetArr) / resp_sum1

    # The maximum-likelihood update measures the spread around the UPDATED
    # mean; using the previous mean inflates sigma by the mean shift.
    sigma0_new = math.sqrt(np.dot(gamma0, (datasetArr - mu0_new) ** 2) / resp_sum0)
    sigma1_new = math.sqrt(np.dot(gamma1, (datasetArr - mu1_new) ** 2) / resp_sum1)

    # Mixing weights: average responsibility over all samples
    alpha0_new = resp_sum0 / len(gamma0)
    alpha1_new = resp_sum1 / len(gamma1)

    return mu0_new, mu1_new, sigma0_new, sigma1_new, alpha0_new, alpha1_new

def EM_Train(datasetList, max_iter = 200):
    """
    Estimate the parameters of a two-component Gaussian mixture with EM.

    Runs max_iter rounds of E-step followed by M-step from a fixed starting
    guess and returns (alpha0, mu0, sigma0, alpha1, mu1, sigma1).
    """
    samples = np.array(datasetList)

    # Fixed starting guess: equal weights, unit sigmas, means 0 and 1
    alpha0, mu0, sigma0 = 0.5, 0, 1
    alpha1, mu1, sigma1 = 0.5, 1, 1

    completed = 0
    while completed < max_iter:
        # E step: responsibilities under the current parameters
        resp0, resp1 = E_step(samples, alpha0, mu0, sigma0, alpha1, mu1, sigma1)
        # M step: parameters re-estimated from those responsibilities
        mu0, mu1, sigma0, sigma1, alpha0, alpha1 = M_step(mu0, mu1, resp0, resp1, samples)
        completed += 1

    return alpha0, mu0, sigma0, alpha1, mu1, sigma1



In [14]:
# Ground-truth parameters used to simulate the mixture
alpha0, mu0, sigma0 = 0.3, -2, 0.5
alpha1, mu1, sigma1 = 0.7, 0.5, 1

datasetList = load_data(mu0, sigma0, mu1, sigma1, alpha0, alpha1)

# Report the parameters the data was generated with
print("------------------------\n")
print("The parameter set is: \n")
print("alpha0: %.3f,  mu0: %.3f,    sigma0: %.3f,   alpha1: %.3f,   mu1:  %.3f,   sigma1: %.3f" % (alpha0, mu0, sigma0, alpha1, mu1, sigma1))

# Fit the mixture with EM; the estimates overwrite the ground-truth names
alpha0, mu0, sigma0, alpha1, mu1, sigma1 = EM_Train(datasetList, max_iter=500)

# Report the estimated parameters
print("------------------------\n")
print("Predict the parameter set is: \n")
print("alpha0: %.3f,   mu0: %.3f,   sigma0: %.3f,   alpha1: %.3f,    mu1:  %.3f,   sigma1: %.3f" % (alpha0, mu0, sigma0, alpha1, mu1, sigma1))

------------------------

The parameter set is: 

alpha0: 0.300,  mu0: -2.000,    sigma0: 0.500,   alpha1: 0.700,   mu1:  0.500,   sigma1: 1.000
------------------------

Predict the parameter set is: 

alpha0: 0.273,   mu0: -2.069,   sigma0: 0.434,   alpha1: 0.727,    mu1:  0.376,   sigma1: 1.042
