In [2]:
import numpy as np
import pandas as pd
from sklearn.mixture import GaussianMixture

# Load your dataset
data = pd.read_csv('data.csv')

# Extract the relevant features (age, income, etc.) as NumPy array
X = data[['Gender', 'Marital status', 'Age', 'Income', 'Education', 'Occupation', 'Settlement size', ]].values

# Instantiate the GMM model
n_components = 10  # You can vary the number of components
n_clusters = 10
n_samples, n_features = X.shape
np.random.seed(42)
gmm = GaussianMixture(n_components=n_components, max_iter=100, tol=1e-4)
gmm.means_init = X[np.random.choice(n_samples, n_clusters, replace=False)]
gmm.covariances_init = [np.cov(X.T)] * n_clusters
gmm.weights_init = np.ones(n_clusters) / n_clusters
# Fit the data
print(gmm.means_init)
gmm.fit(X)

# Get the parameters of the GMM
# weights = gmm.weights_
# means = gmm.means_
# covariances = gmm.covariances_

# Predict cluster labels for each data point
labels = gmm.predict(X)

# Predict posterior probabilities of each component for each data point
probs = gmm.predict_proba(X)
weights = gmm.weights_
means = gmm.means_
covariances = gmm.covariances_
print(weights)
print(means)


[[     1      1     43  48632      1      0      0]
 [     0      0     28 141847      0      1      1]
 [     1      1     48 116235      2      0      0]
 [     0      0     20 116582      0      2      2]
 [     0      0     49 118571      1      2      2]
 [     0      1     21 101471      1      2      1]
 [     0      0     42 162869      1      1      1]
 [     1      1     32 124975      1      1      0]
 [     0      0     48 148115      1      1      1]
 [     1      1     25  94075      1      1      0]]
[0.15849547 0.00549999 0.083503   0.0625     0.21199204 0.17573189
 0.03577761 0.10192246 0.13257753 0.032     ]
[[6.59296382e-01 7.19239377e-01 2.77758898e+01 8.26584593e+04
  7.19239377e-01 0.00000000e+00 0.00000000e+00]
 [6.36364682e-01 4.54546201e-01 5.39999837e+01 1.42907925e+05
  1.72727228e+00 3.63636961e-01 1.18181703e+00]
 [8.74274109e-01 3.77259306e-01 4.48315363e+01 9.56326663e+04
  1.58079077e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 4.03

In [118]:
import numpy as np
import pandas as pd

import numpy as np
from scipy.stats import multivariate_normal
def multivariate_normal_pdf(x, mean, cov, allow_singular=True):
    """
    Calculate the PDF of a multivariate normal distribution.

    Parameters:
        x (numpy.ndarray): The input vector at which to evaluate the PDF.
        mean (numpy.ndarray): The mean vector of the distribution.
        cov (numpy.ndarray): The covariance matrix of the distribution.
        allow_singular (bool): Whether to allow singular covariance matrices.

    Returns:
        float: The PDF value at the given point x.
    """
    # Check that the dimensions of x, mean, and cov are compatible
    if len(x) != len(mean) or len(x) != cov.shape[0] or cov.shape[0] != cov.shape[1]:
        raise ValueError("Input dimensions are not compatible.")

    try:
        # Calculate the normalization factor
        n = len(x)
        normalization = 1.0 / ((2 * np.pi) ** (n / 2) * np.sqrt(np.linalg.det(cov)))

        # Calculate the exponent in the PDF
        exponent = -0.5 * np.dot(np.dot((x - mean).T, np.linalg.inv(cov)), (x - mean))

        # Calculate the PDF value
        pdf = normalization * np.exp(exponent)

        return pdf
    except LinAlgError as e:
        if allow_singular:
            # If allow_singular is True, return 0 for singular covariance matrix
            return 0.0
        else:
            raise e
class GaussianMixtureModel:
    def __init__(self, n_clusters=2, max_iter=100, tol=1e-4, random_state=42):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.random_state = random_state
        self.pi = None  # Cluster weights
        self.mu = None  # Cluster means
        self.sigma = None  # Cluster covariances

    def fit(self, X):
        np.random.seed(self.random_state)
        n_samples, n_features = X.shape
        self.pi = np.ones(self.n_clusters) / self.n_clusters
        self.mu = X[np.random.choice(n_samples, self.n_clusters, replace=False)]
        self.sigma = [np.cov(X.T)] * self.n_clusters
        for _ in range(self.max_iter):
            # Expectation step
            gamma = self._expectation(X)

            # Maximization step
            self._maximization(X, gamma)

            # Check for convergence
            diff = np.mean((np.abs(self.mu - self.mu_prev)))
            print(diff)
            if np.all(np.abs(self.mu - self.mu_prev) < self.tol):
                break

    def _expectation(self, X):
        gamma = np.zeros((X.shape[0], self.n_clusters))
    
        for k in range(self.n_clusters):
            try:
                # print(X.shape)
                # print(self.mu[k].shape)
                # print(self.pi[k])
                self.sigma[k] += 1e-6 * np.identity(X.shape[1])
                gamma[:, k] = self.pi[k] * multivariate_normal.pdf(X, self.mu[k], self.sigma[k], allow_singular = True)
                # print(gamma[:, k].shape)
            except np.linalg.LinAlgError:
            # Handle the singular covariance matrix (e.g., add regularization)
                self.sigma[k] += 1e-6 * np.identity(X.shape[1])
                gamma[:, k] = self.pi[k] * multivariate_normal_pdf(X, self.mu[k], self.sigma[k], allow_singular=True)
    
        gamma /= np.sum(gamma, axis=1, keepdims=True)
        return gamma


    def _maximization(self, X, gamma):
        n_samples, _ = X.shape
        self.mu_prev = self.mu.copy()

        for k in range(self.n_clusters):
            Nk = np.sum(gamma[:, k])
            self.pi[k] = Nk / n_samples
            self.mu[k] = np.sum(X * gamma[:, k][:, np.newaxis], axis=0) / Nk
            diff = X - self.mu[k]
            self.sigma[k] = np.dot((diff * gamma[:, k][:, np.newaxis]).T, diff) / Nk
        
        # Add regularization term
            # self.sigma[k] += 1e-6 * np.identity(X.shape[1])


    def get_parameters(self):
        return self.pi, self.mu, self.sigma

    def calculate_likelihoods(self, X):
        likelihoods = np.zeros((X.shape[0], self.n_clusters))
    
        for k in range(self.n_clusters):
            try:
                self.sigma[k] += 1e-6 * np.identity(X.shape[1])
                likelihoods[:, k] = multivariate_normal.pdf(X, self.mu[k], self.sigma[k], allow_singular = True)
            except np.linalg.LinAlgError:
            # Handle the singular covariance matrix (e.g., add regularization)
                self.sigma[k] += 1e-6 * np.identity(X.shape[1])
                likelihoods[:, k] = multivariate_normal_pdf(X, self.mu[k], self.sigma[k], allow_singular=True)
    
        return likelihoods


    def get_membership_values(self, X):
        gamma = self._expectation(X)
        return gamma


# Load your dataset
data = pd.read_csv('data.csv')

# Extract the relevant features (age, income, etc.) as NumPy array
X = data[['Gender', 'Marital status', 'Age', 'Income', 'Education', 'Occupation', 'Settlement size']].values
# X = data[['Gender', 'Marital status',]].values

# Instantiate your custom GMM model
# Assuming you have your data in a DataFrame 'data'

# Initialize and fit the GMM
gmm = GaussianMixtureModel(n_clusters=10)  # Change the number of clusters as needed
gmm.fit(X)

# Get GMM parameters
pi, mu, sigma = gmm.get_parameters()

# Calculate likelihoods for a given set of samples
likelihoods = gmm.calculate_likelihoods(X)

# Get membership values for data samples
membership_values = gmm.get_membership_values(X)
print(pi)
print(mu)
print(sigma)



1798.9
372.45714285714286
478.5
171.27142857142857
452.9
445.12857142857143
259.8857142857143
302.5857142857143
358.42857142857144
290.14285714285717
326.6857142857143
171.37142857142857
321.54285714285714
292.62857142857143
181.35714285714286
189.02857142857144
152.0
132.24285714285713
281.48571428571427
295.8142857142857
160.04285714285714
244.85714285714286
149.4857142857143
217.5
154.44285714285715
195.15714285714284
188.71428571428572
268.34285714285716
300.37142857142857
217.07142857142858
162.64285714285714
138.92857142857142
154.44285714285715
157.5857142857143
188.92857142857142
281.1714285714286
175.0
142.97142857142856
199.97142857142856
300.6714285714286
361.0571428571429
223.27142857142857
243.07142857142858
331.15714285714284
145.12857142857143
211.38571428571427
147.12857142857143
168.1
183.0857142857143
172.55714285714285
172.68571428571428
119.01428571428572
118.14285714285714
192.14285714285714
169.27142857142857
213.22857142857143
164.94285714285715
150.2571428571428

In [41]:
import numpy as np
from scipy.stats import multivariate_normal

class GMM:
    def __init__(self, n_components=2, max_iter=100, tol=1e-3):
        self.n_components = n_components
        self.max_iter = max_iter
        self.tol = tol

    def fit(self, data):
        self.data = data
        self.n_samples, self.n_features = data.shape

        # Initialize parameters
        self.initialize_parameters()

        for _ in range(self.max_iter):
            self.old_weights = self.weights.copy()
            self.old_means = self.means.copy() 
            # Expectation step: Calculate responsibilities
            responsibilities = self.expectation_step()

            # Maximization step: Update parameters
            self.update_parameters(responsibilities)

            # Check for convergence
            if self.converged():
                break

    def initialize_parameters(self):
        self.weights = np.ones(self.n_components) / self.n_components
        self.means = np.random.rand(self.n_components, self.n_features)
        self.covariances = [np.eye(self.n_features) for _ in range(self.n_components)]

    def expectation_step(self):
        responsibilities = np.zeros((self.n_samples, self.n_components))

        for k in range(self.n_components):
            print(self.covariances[k])
            responsibilities[:, k] = self.weights[k] * multivariate_normal.pdf(
                self.data, self.means[k], self.covariances[k], allow_singular = True
            )

        # Normalize responsibilities
        responsibilities /= responsibilities.sum(axis=1, keepdims=True)

        return responsibilities

    def update_parameters(self, responsibilities):
        # Update weights
        self.weights = responsibilities.mean(axis=0)

        # Update means
        for k in range(self.n_components):
            weighted_sum = (responsibilities[:, k] @ self.data) / responsibilities[:, k].sum()
            self.means[k] = weighted_sum

        # Update covariances
        for k in range(self.n_components):
            weighted_diff = self.data - self.means[k]
            weighted_cov = (responsibilities[:, k][:, np.newaxis] * weighted_diff).T @ weighted_diff
            self.covariances[k] = weighted_cov / responsibilities[:, k].sum()

    def converged(self):
        # Check for convergence based on parameter updates
        return np.all(np.abs(self.old_weights - self.weights) < self.tol) and \
               np.all(np.abs(self.old_means - self.means) < self.tol)

    def predict(self, data):
        responsibilities = np.zeros((data.shape[0], self.n_components))

        for k in range(self.n_components):
            responsibilities[:, k] = self.weights[k] * multivariate_normal.pdf(
                data, self.means[k], self.covariances[k], allow_singular = True
            )

        # Normalize responsibilities
        responsibilities /= responsibilities.sum(axis=1, keepdims=True)

        return np.argmax(responsibilities, axis=1)

    def get_parameters(self):
        return self.weights, self.means, self.covariances

    def predict_proba(self, data):
        responsibilities = np.zeros((data.shape[0], self.n_components))

        for k in range(self.n_components):
            responsibilities[:, k] = self.weights[k] * multivariate_normal.pdf(
                data, self.means[k], self.covariances[k], allow_singular = True
            )

        # Normalize responsibilities
        responsibilities /= responsibilities.sum(axis=1, keepdims=True)

        return responsibilities

# Example usage:
if __name__ == "__main__":
    # Generate synthetic data for testing
    np.random.seed(0)
    n_samples = 300
    means = np.array([[-1, -1], [1, 1], [2, 3]])
    covs = np.array([[[1, 0.5], [0.5, 1]], [[1, -0.7], [-0.7, 1]], [[0.5, 0], [0, 0.5]]])
    weights = [0.3, 0.5, 0.2]

    component_indices = np.random.choice(3, size=n_samples, p=weights)
    samples = np.array([np.random.multivariate_normal(means[i], covs[i]) for i in component_indices])

    # Fit the GMM
    gmm = GMM(n_components=3)
    gmm.fit(samples)

    # Get cluster labels and probabilities
    print(samples)
    labels = gmm.predict(samples)
    probabilities = gmm.predict_proba(samples)

    print("Cluster labels:")
    print(labels)
    print("Probabilities:")
    print(probabilities)


[[1. 0.]
 [0. 1.]]
[[1. 0.]
 [0. 1.]]
[[1. 0.]
 [0. 1.]]
[[1.6302561  1.10495773]
 [1.10495773 2.58785537]]
[[1.77358218 1.09428321]
 [1.09428321 2.40526503]]
[[1.71118145 1.1606081 ]
 [1.1606081  2.60096716]]
[[1.5819085  1.09077408]
 [1.09077408 2.60102915]]
[[1.81526184 1.06606535]
 [1.06606535 2.32077766]]
[[1.71631994 1.20241288]
 [1.20241288 2.65195814]]
[[1.5466496  1.07395912]
 [1.07395912 2.59129037]]
[[1.84279467 1.03579859]
 [1.03579859 2.25309027]]
[[1.72246954 1.24897732]
 [1.24897732 2.69550032]]
[[1.51702601 1.05090879]
 [1.05090879 2.56660122]]
[[1.86162906 1.00392976]
 [1.00392976 2.19028447]]
[[1.73068687 1.30314564]
 [1.30314564 2.7380013 ]]
[[1.48866147 1.0178382 ]
 [1.0178382  2.5289005 ]]
[[1.87397603 0.97107649]
 [0.97107649 2.12619492]]
[[1.74224312 1.36720429]
 [1.36720429 2.78355203]]
[[1.45855313 0.97079654]
 [0.97079654 2.47691971]]
[[1.88054165 0.93811783]
 [0.93811783 2.05745663]]
[[1.75836011 1.44250127]
 [1.44250127 2.83450417]]
[[1.42429662 0.90612359]


In [3]:
import numpy as np
import matplotlib.pyplot as plt

class GMM:
    def __init__(self, n_clusters, n_iter=100):
        self.n_clusters = n_clusters
        self.n_iter = n_iter

    def _initialize_parameters(self, dataset):
        n_samples, n_dims = dataset.shape
        self.cluster_probs = np.random.dirichlet(np.ones(self.n_clusters))
        self.mus = np.random.normal(loc=0.0, scale=3.0, size=(self.n_clusters, n_dims))
        self.covs = np.array([np.eye(n_dims) for _ in range(self.n_clusters)])

    def _e_step(self, dataset):
        n_samples = dataset.shape[0]
        unnormalized_responsibilities = (
            np.outer(self.cluster_probs, np.ones(n_samples)) *
            np.array([self.multivariate_normal_pdf(dataset, self.mus[i], self.covs[i]) for i in range(self.n_clusters)])
        )
        self.responsibilities = unnormalized_responsibilities / np.sum(unnormalized_responsibilities, axis=0)
        class_responsibilities = np.sum(self.responsibilities, axis=1)
        return class_responsibilities

    def _m_step(self, dataset, class_responsibilities):
        n_samples = dataset.shape[0]
        self.cluster_probs = class_responsibilities / n_samples
        for i in range(self.n_clusters):
            self.mus[i] = np.sum(self.responsibilities[i] * dataset.T, axis=1) / class_responsibilities[i]
        for i in range(self.n_clusters):
            diff = dataset - self.mus[i]
            self.covs[i] = np.dot((self.responsibilities[i] * diff.T), diff) / class_responsibilities[i]

    def fit(self, dataset):
        self._initialize_parameters(dataset)
        for _ in range(self.n_iter):
            class_responsibilities = self._e_step(dataset)
            self._m_step(dataset, class_responsibilities)

    def multivariate_normal_pdf(self, x, mean, cov):
        n = x.shape[1]
        det = np.linalg.det(cov)
        inv_cov = np.linalg.inv(cov)
        norm_const = 1.0 / ((2 * np.pi) ** (n / 2) * np.sqrt(det))
        exp_term = np.exp(-0.5 * np.sum(np.dot(x - mean, inv_cov) * (x - mean), axis=1))
        return norm_const * exp_term

def main():
    N_CLUSTERS = 2
    N_SAMPLES = 1000
    np.random.seed(42)

    # Create a synthetic dataset
    CLUSTER_PROBS = [0.3, 0.7]
    MUS_TRUE = [
        [5.0, 5.0],
        [-3.0, -2.0],
    ]
    COVS_TRUE = [
        [
            [1.5, 0.5],
            [0.5, 2.0],
        ],
        [
            [1.5, 0.0],
            [0.0, 1.8],
        ]
    ]

    dataset = np.concatenate([
        np.random.multivariate_normal(MUS_TRUE[i], COVS_TRUE[i], int(CLUSTER_PROBS[i] * N_SAMPLES))
        for i in range(N_CLUSTERS)
    ])

    # Fit the GMM
    gmm = GMM(N_CLUSTERS)
    gmm.fit(dataset)

    print("------")
    print("Class Probabilities")
    print(gmm.cluster_probs)
    print("------")
    print("Mus")
    print(gmm.mus)
    print("------")
    print("Covariance Matrices")
    print(gmm.covs)

if __name__ == "__main__":
    main()


------
Class Probabilities
[0.29992592 0.70007408]
------
Mus
[[ 4.97716218  4.99627277]
 [-2.89130913 -1.92984073]]
------
Covariance Matrices
[[[ 1.42894212  0.49743838]
  [ 0.49743838  1.88183021]]

 [[ 1.60343124 -0.00480989]
  [-0.00480989  1.63691368]]]


In [None]:
import numpy as np
from scipy.stats import multivariate_normal

class GaussianMixtureModel:
    def __init__(self, n_components, max_iterations=100, tolerance=1e-4, covariance_regularization=1e-6):
        self.n_components = n_components
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.covariance_regularization = covariance_regularization
        self.log_likelihoods = []  # To store log-likelihood values during training

    def fit(self, X):
        self.n_samples, self.n_features = X.shape
        self.weights = np.ones(self.n_components) / self.n_components
        self.means = X[np.random.choice(self.n_samples, self.n_components, replace=False)]
        self.covariances = [np.cov(X.T) + np.eye(self.n_features) * self.covariance_regularization] * self.n_components

        for iteration in range(self.max_iterations):
            # Expectation step
            responsibilities = self._calculate_responsibilities(X)

            # Maximization step
            N_k = responsibilities.sum(axis=0)
            self.weights = N_k / self.n_samples
            self.means = np.dot(responsibilities.T, X) / (N_k[:, np.newaxis] + self.covariance_regularization)
            self.covariances = [self._compute_covariance(X, responsibilities[:, i], self.means[i]) for i in range(self.n_components)]
            print(self.tolerance)
            # Calculate and store the log-likelihood
            log_likelihood = self._calculate_log_likelihood(X)
            self.log_likelihoods.append(log_likelihood)

            # Check for convergence
            if iteration > 0 and abs(self.log_likelihoods[-1] - self.log_likelihoods[-2]) < self.tolerance:
                break

    def _calculate_responsibilities(self, X):
        responsibilities = np.zeros((self.n_samples, self.n_components))
        for i in range(self.n_components):
            cov_with_reg = self.covariances[i] + np.eye(self.n_features) * self.covariance_regularization
            responsibilities[:, i] = self.weights[i] * multivariate_normal.pdf(X, mean=self.means[i], cov=cov_with_reg, allow_singular = True)
        responsibilities /= responsibilities.sum(axis=1, keepdims=True)
        return responsibilities

    def _compute_covariance(self, X, resp, mean):
        weighted_sum = np.dot(resp * (X - mean).T, X - mean)
        covariance = weighted_sum / (resp.sum() + self.covariance_regularization) + np.eye(self.n_features) * self.covariance_regularization
        return covariance

    def _calculate_log_likelihood(self, X):
        log_likelihood = 0.0
        for i in range(self.n_samples):
            likelihood_i = np.sum([self.weights[j] * multivariate_normal.pdf(X[i], mean=self.means[j], cov=self.covariances[j], allow_singular = True) for j in range(self.n_components)])
            log_likelihood += np.log(likelihood_i)
        print(log_likelihood)
        return log_likelihood

    def predict(self, X):
        responsibilities = self._calculate_responsibilities(X)
        return np.argmax(responsibilities, axis=1)





n_clusters = 3
gmm_model = GaussianMixtureModel(n_components=n_clusters)
data = pd.read_csv('data.csv')
X = data[['Gender', 'Marital status', 'Age', 'Income', 'Education', 'Occupation', 'Settlement size', ]].values
# Fit the GMM to your data
gmm_model.fit(X)

# Perform predictions (cluster assignments) on your data
cluster_assignments = gmm_model.predict(X)
print("Cluster assignments:", cluster_assignments)
