In [85]:
#loading library
import numpy as np
import scipy as sp
import pandas as pd
from sklearn.cluster import KMeans

In [108]:
#loading data
data = pd.read_table('EMGaussian.data', header = None, sep='\s+', names = ('x1', 'x2'))
test = pd.read_table('EMGaussian.test', header = None, sep='\s+', names = ('x1', 'x2'))
X = np.array(data)

In [109]:
X

array([[  0.91029 ,   0.016192],
       [ -2.531   ,   6.6981  ],
       [  4.3602  ,   8.3541  ],
       [ -0.70489 ,  -0.51343 ],
       [ -5.7383  ,  -6.2088  ],
       [ -4.4819  ,  -4.7127  ],
       [  0.90842 ,   0.38202 ],
       [  4.4273  ,   5.8406  ],
       [ -5.5724  ,   3.9576  ],
       [  4.256   ,  -0.71465 ],
       [ -1.8763  ,   4.3208  ],
       [  3.9002  ,   4.9458  ],
       [ -1.6512  ,   3.1684  ],
       [  3.4097  ,  -2.8458  ],
       [  2.7509  ,  -3.7186  ],
       [  4.0017  ,  -5.6322  ],
       [ -1.3762  ,   2.2893  ],
       [  3.1672  ,  -2.0876  ],
       [  2.5706  ,   7.835   ],
       [  4.1783  ,   0.46918 ],
       [ -1.4094  ,   1.6812  ],
       [  3.3032  ,  -0.23407 ],
       [ -0.6899  ,   5.6732  ],
       [  4.4551  ,   0.91978 ],
       [  0.43072 ,   6.1793  ],
       [  4.2493  ,   7.8125  ],
       [ -2.2102  ,   5.1359  ],
       [  4.5083  ,   6.8181  ],
       [ -1.5381  ,   4.4656  ],
       [  3.8053  ,  -0.85611 ],
       [ -

In [110]:
#initialization
K = 4
[N,d] = data.shape #dimension of x
kmean_res = KMeans(init = 'random', n_clusters=4).fit(data)
lmu = kmean_res.cluster_centers_
lsigma = [np.eye(2,2) for k in range(K)]
lpi = [1/K for k in range(K)]

In [111]:
lmu

array([[-2.24034753,  4.12744772],
       [ 3.80280826,  5.10467248],
       [ 3.33557966, -2.644529  ],
       [-3.81879354, -4.27423441]])

In [112]:
def compute_loglikelihood(X, Q, lpi, lmu, lsigma):
    (N,d)=X.shape
    lglklhd=0
    for i in range(N):
        for k in range(K):
            lglklhd += Q[i,k]*np.log(sp.stats.multivariate_normal.pdf(X[i,:],lmu[k,:], lsigma[k])) + Q[i,k]*np.log(lpi[k])
    return lglklhd

In [113]:
#lpi is 1d-array; lmu, X are 2d-arrays; lsig is a list

Q = np.zeros((N,K))
condition = False
count = 0
tol=0.001

lglklhd=compute_loglikelihood(X,Q,lpi,lmu,lsigma)

while(condition==False & count < 30):
    
    count += 1
#E-step
    for i in range(N):
        denom=0
        for k in range(K):
            denom=denom+lpi[k]*sp.stats.multivariate_normal.pdf(X[i,:],lmu[k,:], lsigma[k])
        for k in range(K):
            Q[i,k]=lpi[k]*sp.stats.multivariate_normal.pdf(X[i,:],lmu[k,:], lsigma[k])/denom

#M-step
    for k in range(K):
        #compute mu
        denom_mu=np.zeros(2)
        for i in range(N):
            denom_mu += X[i,:]*Q[i,k]
        lmu[k,:]= denom_mu/np.sum(Q[:,k])
        #compute sigma
        denom_sigma=np.matlib.zeros((2,2))
        for i in range(N):
            A=np.asmatrix(X[i,:]-lmu[k])
            denom_sigma += A.T.dot(A)
        lsigma[k]=np.asarray(denom_sigma/np.sum(Q[:,k]))
        #compute pi
        lpi[k]=np.sum(Q[:,k])/np.sum(Q)
        
        lg_past=lglklhd
        lglklhd=compute_loglikelihood(X,Q,lpi,lmu,lsigma)
        if (np.abs(lglklhd-lg_past<tol)):
            condition=True
        
    

In [114]:
lmu

array([[-2.22375874,  4.12517673],
       [ 3.78817488,  5.10711606],
       [ 3.35897274, -2.66609607],
       [-3.7969143 , -4.23540821]])

In [115]:
count

1

In [96]:
X[:,0]

array([ 0.91029 , -2.531   ,  4.3602  , -0.70489 , -5.7383  , -4.4819  ,
        0.90842 ,  4.4273  , -5.5724  ,  4.256   , -1.8763  ,  3.9002  ,
       -1.6512  ,  3.4097  ,  2.7509  ,  4.0017  , -1.3762  ,  3.1672  ,
        2.5706  ,  4.1783  , -1.4094  ,  3.3032  , -0.6899  ,  4.4551  ,
        0.43072 ,  4.2493  , -2.2102  ,  4.5083  , -1.5381  ,  3.8053  ,
       -3.6955  ,  4.0747  , -3.3814  ,  4.3867  , -3.2273  ,  3.5652  ,
        0.98228 ,  4.3048  , -5.6913  ,  4.3524  , -3.3028  ,  4.2079  ,
       -2.8765  ,  4.4197  , -4.3528  ,  3.7617  , -2.0124  ,  3.7219  ,
       -4.1637  ,  3.7035  , -1.5331  ,  4.4484  , -2.8353  ,  4.0611  ,
       -2.0325  ,  3.4695  , -0.75983 ,  3.8583  ,  3.4517  , -0.41852 ,
        3.7967  , -1.4054  ,  4.723   , -0.60948 ,  4.6102  , -2.4528  ,
        4.1376  , -1.4837  ,  3.9858  ,  0.29761 ,  5.0873  , -3.3855  ,
        3.8461  , -0.19813 ,  3.8222  ,  1.4774  ,  3.7977  , -1.8651  ,
        4.2754  , -0.45499 ,  4.4675  , -2.916   , 

In [106]:
#now let's plot 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from plotly.graph_objs import Scatter, Figure, Layout

trace = Scatter(
    x = X[:,0],
    y = X[:,1],
    mode = 'markers'
)

centroids = Scatter(
    x = lmu[:,0]
    y = lmu[:,1]
)

data = [trace]

# Plot and embed in ipython notebook!
iplot(data, filename='basic-scatter')

In [108]:
k=1
np.dot(np.transpose((X - lmu[k])),(X - lmu[k]))

array([[  6811.90148996,   2270.04417975],
       [  2270.04417975,  10389.73236583]])

In [101]:
lmu[1]

array([ 0.70680149,  0.70680149])

In [17]:
mu0
sigma0

array([[ 0.70746067,  0.        ],
       [ 0.        ,  0.70746067]])

In [105]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from plotly.graph_objs import Scatter, Figure, Layout

iplot([Scatter(x=[1, 2, 3], y=[3, 1, 6])])