In [48]:
import pandas as pd
import numpy as np
# import seaborn as sns
from operator import attrgetter
import math
from matplotlib import pyplot as plt

In [17]:
# importing R dependencies
import rpy2
import readline
import rpy2.robjects as robjects

In [18]:
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [19]:
robjects.r('pi')

array([ 3.14159265])

In [22]:
%R require(ggplot2)




array([0], dtype=int32)

In [26]:
robjects.r.array([1])

R object with classes: ('array',) mapped to:
<Array - Python:0x7fa50d6e4dc8 / R:0x30a3b38>
[IntVector]

We begin with the following mixture model $f$
$$f(x) = \pi \phi(x|\mu_{1},\sigma_{1}^{2}) + (1-\pi)\phi(x|\mu_{2},\sigma_{2}^{2})$$

where $\pi \in [0,1]$.


Given a dataset of $n$ observations we have that the likelihood function is

$$
L(\mu_{1},\sigma_{1}^{2},\mu_{2},\sigma_{2}^{2}|x) = 
\prod_{i=1}^{n}[\pi\phi(x_{i}|\mu_{1},\sigma_{1}^{2}) + (1-\pi)\phi(x_{i}|\mu_{2},\sigma_{2}^{2})]
$$

# Algorithm

In [40]:
class Particle:
    def __init__(self,position,velocity):
        self.position = position
        self.velocity = velocity
        self.pbest = position
        self.current_fitness = 0
        self.best_fitness = 0
        self.num_parameters = len(self.position)
    def update_position(self,gbest,
                        use_boundary=False,
                        lower_bound=None,
                        upper_bound=None):              
        c1 = 2
        c2 = 2
        r1 = np.random.rand(1)
        r2 = np.random.rand(1)
        delta = 2*r1*(self.pbest - self.position) + 2*r2*(gbest - self.position)
        
        upper_bound=np.zeros(self.num_parameters),
        lower_bound=np.zeros(self.num_parameters)
        
        cond1 = delta <= upper_bound
        cond2 = delta >= lower_bound
        
        if (use_boundary and cond1.all() and cond2.all()) or use_boundary==False:
            self.velocity = self.velocity + delta
            self.position = self.position + self.velocity
    def _phi(self,x,mean,var):
        k = math.sqrt(2*math.pi*var)
        p = math.exp((x-mean)*(x-mean)/(-2*var))
        return p/k
    def _mix_dist(self,x,pi,mean1,mean2,var1,var2):
        comp1 = pi*self._phi(x,mean1,var1)
        comp2 = (1-pi)*self._phi(x,mean2,var2)
        return comp1+comp2
    def _gaussian_likelihood(self,x,pi,mean1,mean2,var1,var2):
        # x is a vector of data
        n = len(x)
        p = 1
        for i in range(0,n):
            p = p * self._mix_dist(x[i],pi,mean1,mean2,var1,var2)
        return p
    def calculate_fitness(self,x,pi,mean1,mean2,var1,var2):
        self.current_fitness = self._gaussian_likelihood(x,pi,mean1,mean2,var1,var2)
        if self.current_fitness > self.best_fitness:
            self.pbest = self.position
            self.best_fitness = self.current_fitness

In [78]:
# Algorithm Parameters

N = 100
iterations = 50

a = np.random.normal(0,1,700)
b = np.random.normal(5,1,300)

x = np.concatenate((a,b),axis=0)

plt.hist(s)
plt.show()

data_min = min(x)
data_max = max(x)
lower_boundary = np.array([0,data_min,data_min,0,0])
upper_boundary = np.array([1,data_max,data_max,10,10])    
particles = [0] * N
tolerance = 0.001

In [79]:
# random intialization of particles

np.random.seed(69)

# estimating a weight parameter and mu parameter for each of the two gaussian components
# posn[0] = weight
# posn[1] = mu1
# posn[2] = mu2

for p in range(N):    
    rand_pi = np.random.uniform(0,1)
    rand_mean1 = np.random.uniform(data_min,data_max)
    rand_mean2 = np.random.uniform(data_min,data_max)
    sigma1 = 1
    sigma2 = 1
    rand_posn = np.array([rand_pi,rand_mean1,rand_mean2])
    rand_velocity = np.array([np.random.uniform(0,1),
                             np.random.uniform(0,1),
                             np.random.uniform(0,1)])
    
    particles[p] = Particle(rand_posn,rand_velocity)
    particles[p].calculate_fitness(x,rand_pi,rand_mu1,rand_mu2,sigma1,sigma2)

gbest = max(particles,key=attrgetter('best_fitness'))    

# repeat until convergence
for i in range(iterations):
    if i%100 == 0:
        print("iteration " + str(i))
    for p in particles:
        p.update_position(gbest.position,True,lower_boundary,upper_boundary)

        params = p.position
        pi = params[0]
        mean1 = params[1]
        mean2 = params[2]
        sigma1 = 1
        sigma2 = 1
        p.calculate_fitness(x,pi,mean1,mean2,sigma1,sigma2)
        gbest = max(particles,key=attrgetter('best_fitness'))    

print(gbest.pbest)
print(gbest.best_fitness)

iteration 0
[ 0.29624916  5.98658273  0.75084666]
0


In [None]:
# next steps: 
# improve efficiency of likelihood calculation (mapreduce?
# combine R and python scripts
# add sigma parameters
# plot the estimated mixture densities vs kde for EM and PSO
# port EM to python

# monte carlo: run algorithm multiple times 
# R generates data, passes to python