In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from sklearn.datasets import load_iris

In [2]:
class GMM:
    def __init__(self, n_components, max_iter = 100, comp_names=None):
        
        self.n_componets = n_components
        self.max_iter = max_iter
        if comp_names == None:
            self.comp_names = [f"comp{index}" for index in range(self.n_componets)]
        else:
            self.comp_names = comp_names
        self.pi = [1/self.n_componets for comp in range(self.n_componets)]

    def multivariate_normal(self, X, mean_vector, covariance_matrix):
        return (2*np.pi)**(-len(X)/2)*np.linalg.det(covariance_matrix)**(-1/2)*np.exp(-np.dot(np.dot((X-mean_vector).T, np.linalg.inv(covariance_matrix)), (X-mean_vector))/2)

    def fit(self, X):
        new_X = np.array_split(X, self.n_componets)
        self.mean_vector = [np.mean(x, axis=0) for x in new_X]
        self.covariance_matrixes = [np.cov(x.T) for x in new_X]
        del new_X
        for iteration in range(self.max_iter):
            
            self.r = np.zeros((len(X), self.n_componets))
            for n in range(len(X)):
                for k in range(self.n_componets):
                    self.r[n][k] = self.pi[k] * self.multivariate_normal(X[n], self.mean_vector[k], self.covariance_matrixes[k])
                    self.r[n][k] /= sum([self.pi[j]*self.multivariate_normal(X[n], self.mean_vector[j], self.covariance_matrixes[j]) for j in range(self.n_componets)])
            N = np.sum(self.r, axis=0)
            self.mean_vector = np.zeros((self.n_componets, len(X[0])))
            for k in range(self.n_componets):
                for n in range(len(X)):
                    self.mean_vector[k] += self.r[n][k] * X[n]
            self.mean_vector = [1/N[k]*self.mean_vector[k] for k in range(self.n_componets)]
            self.covariance_matrixes = [np.zeros((len(X[0]), len(X[0]))) for k in range(self.n_componets)]
            for k in range(self.n_componets):
                self.covariance_matrixes[k] = np.cov(X.T, aweights=(self.r[:, k]), ddof=0)
            self.covariance_matrixes = [1/N[k]*self.covariance_matrixes[k] for k in range(self.n_componets)]
            self.pi = [N[k]/len(X) for k in range(self.n_componets)]
    def predict(self, X):
        
        probas = []
        for n in range(len(X)):
            probas.append([self.multivariate_normal(X[n], self.mean_vector[k], self.covariance_matrixes[k])
                           for k in range(self.n_componets)])
        cluster = []
        for proba in probas:
            cluster.append(self.comp_names[proba.index(max(proba))])
        return cluster

In [3]:
data = pd.read_csv('iris.csv')
x_train = data.iloc[:,[0,1,2,3]].values
y = data.iloc[:,[5]].values
data.drop(columns=['Id', 'Species','PetalLengthCm','PetalWidthCm'], axis=1, inplace=True)
data = pd.DataFrame(data)
dta = data
data = data.to_numpy()
type(data)

numpy.ndarray

In [4]:
gmm = GMM(n_components = 3)
gmm.fit(data)
labels = gmm.predict(data)
print(len(labels))

150


In [5]:
def fit_y_data(y):
    a_y = []
    for i in y:
        if(i == 'comp0'):
            a_y.append(1)
        if(i == 'comp1'):
            a_y.append(2)
        if(i == 'comp2'):
            a_y.append(3)
    y = np.array(a_y)
    return y

In [6]:
def fit_y_data2(y):
    a_y = []
    for i in y:
        if(i == 'Iris-setosa'):
            a_y.append(1)
        if(i == 'Iris-versicolor'):
            a_y.append(2)
        if(i == 'Iris-virginica'):
            a_y.append(3)
    y = np.array(a_y)
    return y

In [7]:
y = fit_y_data2(y)

In [8]:
cluster = fit_y_data(labels)
print(len(cluster))

150


In [9]:
print("accurancy = " ,np.mean(y == cluster))

accurancy =  0.8066666666666666


In [10]:
#s = np.linalg.norm(p - d)

In [47]:
data = pd.read_csv('iris.csv')
data['cluster'] = cluster
data.drop(columns=['Id', 'Species'], axis=1, inplace=True)


In [48]:
X, _ = load_iris(return_X_y=True)
X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [None]:
# for p in data

In [72]:
def distance_cal(s,p):
    a = np.delete(s, len(s)-1)
    b = np.delete(p, len(p)-1)
    return np.linalg.norm(a - b)

In [84]:
def calcu_sil_score(data):
    sil = []
    for i in range(len(data)):
        s = data[i] 
        a = []
        b = []
        for p in data:
            if(p[4] == s[4]):
                a.append(distance_cal(s,p))
            else:
                b.append(distance_cal(s,p))

        a_n = np.asarray(a)
        b_n = np.asarray(b)
        # print("b_n",b_n)
        # print("a_n",a_n)
        a_i = np.average(a_n)
        b_i = np.average(b_n)
        # print("b_i ",b_i)
        # print("a_i",a_i)
        s_i = (b_i - a_i)/max(b_i,a_i)
        # print(s_i)
        sil.append(s_i)
        # break

    print(sil)
    sil_n = np.asarray(sil)
    sil_score = np.mean(sil_n)
    return sil_score
            
        


In [85]:
a = calcu_sil_score(data.to_numpy())
print(a)

[0.8798957724779872, 0.8495963124537335, 0.8591795930032032, 0.840143613801283, 0.8758630257692814, 0.7937829412711846, 0.8525846511493408, 0.8817187734573061, 0.7932938276748445, 0.8598356369981468, 0.8386617782960565, 0.86683776345261, 0.8451150423826004, 0.7850582537974221, 0.7484372252939795, 0.6980043088297391, 0.8131445829706637, 0.8784573450614451, 0.7592340612135672, 0.8522088126715, 0.8261922930687225, 0.8575585259933788, 0.8247272099575004, 0.8349037910244457, 0.8190703973694259, 0.8378806653844986, 0.8657224608929307, 0.8718467680596345, 0.87282899087879, 0.8522167720541183, 0.8508166372861865, 0.8368855803115524, 0.8017832809944953, 0.7662035529568983, 0.8598356369981468, 0.8617477878990786, 0.8302079484723186, 0.8598356369981468, 0.8063801176977703, 0.8790316458136787, 0.8760176878007648, -0.0377641762149475, 0.8216678364372523, 0.8378765024992076, 0.7950859309064836, 0.8441557484526193, 0.8469785087788914, 0.8508966782288248, 0.8512314057354744, 0.8799410513885101, 0.5861

In [87]:
a = silhouette_score(X,cluster)
print(a)

0.4713825156872283
