# Generating a Mock Sample of a Multidimensional Space

## Importing Modules

In [38]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
import pandas as pd

## Define the Parameters
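- The number of samples
- The number of dimensions
- The number of Gaussian components
- The minimum separation between the component means
- The standard deviation of the mock errors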

In [60]:
# Size of the mock sample
num_samples = 10_000   # how many points to draw
num_dimensions = 9     # dimensionality of every point

# Shape of the mixture
num_components = 10    # how many Gaussian components make up the mixture
min_separation = 2.9   # smallest allowed distance between two component means

# Measurement noise
error_std = 0.1        # standard deviation of the mock errors added to the sample

## Generating the Data

In [61]:
import numpy as np
from sklearn.mixture import GaussianMixture

def _draw_separated_means(rng, num_dimensions, num_components, min_separation, max_attempts):
    """Rejection-sample component means in [-3, 3]^d that are pairwise at least min_separation apart."""
    means = np.empty((num_components, num_dimensions))
    accepted = 0
    attempts = 0
    while accepted < num_components:
        if attempts >= max_attempts:
            # Without this guard an unattainable separation would spin forever.
            raise RuntimeError(
                f"Could not place {num_components} means at least {min_separation} apart "
                f"in [-3, 3]^{num_dimensions} after {max_attempts} attempts "
                f"({accepted} placed); lower min_separation or num_components."
            )
        attempts += 1
        candidate = rng.uniform(-3, 3, size=num_dimensions)
        # Distances to every mean accepted so far (empty for the first candidate,
        # in which case np.all(...) is True and the candidate is kept).
        distances = np.linalg.norm(means[:accepted] - candidate, axis=1)
        if np.all(distances >= min_separation):
            means[accepted] = candidate
            accepted += 1
    return means


def _draw_covariances(rng, num_dimensions, num_components, regularization=1.2):
    """Build one well-conditioned, symmetric positive-definite covariance matrix per component."""
    identity = np.eye(num_dimensions)
    covariances = np.empty((num_components, num_dimensions, num_dimensions))
    for k in range(num_components):
        A = rng.rand(num_dimensions, num_dimensions)
        gram = A @ A.T  # symmetric positive semi-definite
        # Dividing by the Frobenius norm bounds the eigenvalues by 1; adding
        # regularization * I then keeps them in [regularization, regularization + 1].
        covariances[k] = gram / np.linalg.norm(gram) + regularization * identity
    return covariances


def generate_mock_data_with_errors(
    num_dimensions, num_components, num_samples, min_separation, error_std,
    random_state=None, max_attempts=100_000,
):
    """
    Generate mock data from a Gaussian Mixture Model with full covariance matrices and add mock errors.
    Ensures minimum separation between component means.

    Parameters:
        num_dimensions (int): Number of dimensions (must be >= 1).
        num_components (int): Number of Gaussian components (must be >= 1).
        num_samples (int): Number of samples to generate.
        min_separation (float): Minimum Euclidean distance between component means.
        error_std (float): Standard deviation of the zero-mean Gaussian errors.
        random_state (None, int or numpy.random.RandomState): Seed for every random draw.
            With None (default) the means, covariances, weights and errors come from the
            global numpy random stream and the mixture sampler is seeded with 42, exactly
            as before; pass a seed to make the whole output reproducible.
        max_attempts (int): Maximum number of candidate means drawn before giving up.

    Returns:
        X (numpy.ndarray): The sampled data points with errors (num_samples x num_dimensions).
        y (numpy.ndarray): The labels for the components (num_samples,).
        errors (numpy.ndarray): The mock errors added to the data (num_samples x num_dimensions).

    Raises:
        ValueError: If num_dimensions or num_components is smaller than 1.
        RuntimeError: If the means cannot be placed within max_attempts draws.
    """
    if num_dimensions < 1 or num_components < 1:
        raise ValueError("num_dimensions and num_components must both be at least 1")

    # Resolve the random source. The numpy.random module exposes the same
    # uniform/rand/dirichlet/normal functions as a RandomState instance, so it
    # doubles as the "global stream" generator when no seed is requested.
    if random_state is None:
        rng = np.random
        sampler_random_state = 42
    elif isinstance(random_state, np.random.RandomState):
        rng = sampler_random_state = random_state
    else:
        rng = sampler_random_state = np.random.RandomState(random_state)

    # Step 1: Generate means with minimum separation
    means = _draw_separated_means(
        rng, num_dimensions, num_components, min_separation, max_attempts
    )

    # Step 2: Generate normalized, regularized full covariance matrices
    covariances = _draw_covariances(rng, num_dimensions, num_components)

    # Step 3: Generate mixture weights for the components
    weights = rng.dirichlet(np.ones(num_components), size=1).flatten()

    # Step 4: Initialize a GaussianMixture object (used only as a sampler, never fitted)
    gmm = GaussianMixture(
        n_components=num_components,
        covariance_type='full',
        random_state=sampler_random_state,
    )

    # Step 5: Manually set the GMM parameters so the estimator looks fitted
    gmm.means_ = means
    gmm.covariances_ = covariances
    # Lower-triangular factor L of each precision matrix (L @ L.T == inverse covariance).
    # NOTE(review): scikit-learn appears to store the upper-triangular counterpart here;
    # confirm before using this estimator for scoring or prediction.
    gmm.precisions_cholesky_ = np.linalg.cholesky(np.linalg.inv(covariances))
    gmm.weights_ = weights

    # Step 6: Generate synthetic data
    X, y = gmm.sample(num_samples)

    # Step 7: Generate mock errors
    errors = rng.normal(0, error_std, size=X.shape)

    # Step 8: Add errors to the original data
    X = X + errors

    # Step 9: Return the generated data with errors, labels, and errors themselves
    return X, y, errors


## Generating a plot for a two-dimensional case

In [62]:
def plot_data(X, y):
    """
    Scatter-plot the first two columns of X, drawing each component label as its own series.

    Parameters:
        X (numpy.ndarray): The data points; only columns 0 and 1 are drawn.
        y (numpy.ndarray): The component label of every row of X.
    """
    _, ax = plt.subplots(figsize=(8, 6))

    # One scatter series per distinct label so the legend lists every component
    for component in np.unique(y):
        members = y == component
        ax.scatter(X[members, 0], X[members, 1], label=f'Component {component}', s=10)

    ax.set_title('Generated Data from Gaussian Mixture Model')
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
    ax.legend(title='Components', loc='best')
    plt.show()

## Running the Functions

In [63]:
# Draw the noisy mixture sample from the parameters configured above
X, y, errors = generate_mock_data_with_errors(
    num_dimensions=num_dimensions,
    num_components=num_components,
    num_samples=num_samples,
    min_separation=min_separation,
    error_std=error_std,
)

# Only a two-dimensional sample can be shown directly; otherwise report what was generated
if num_dimensions != 2:
    print(f"Data generated with {num_samples} samples in {num_dimensions} dimensions, with {num_components} components.")
else:
    plot_data(X, y)

Data generated with 10000 samples in 9 dimensions, with 10 components.


## Save Sampled Data to a .csv

In [64]:
def save_data_with_errors_to_csv(X, y, errors, filename="sampled_data_with_errors.csv"):
    """
    Save sampled data, errors, and their labels to a CSV file.

    The file has one row per sample with the columns
    Dimension_1..Dimension_d, Error_Dimension_1..Error_Dimension_d, Component_Label.
    The label column keeps the dtype of y, so integer labels are written as
    integers (e.g. 3) rather than being coerced to floats (3.0).

    Parameters:
        X (numpy.ndarray): The sampled data points (num_samples x num_dimensions).
        y (numpy.ndarray): The labels for the components (num_samples,).
        errors (numpy.ndarray): The errors for the data points (num_samples x num_dimensions).
        filename (str): The name of the CSV file to save the data to.

    Raises:
        ValueError: If X is not 2-D, or errors / y do not match the shape of X.
    """
    X = np.asarray(X)
    errors = np.asarray(errors)
    labels = np.asarray(y).reshape(-1)

    if X.ndim != 2:
        raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
    if errors.shape != X.shape:
        raise ValueError(f"errors has shape {errors.shape}, expected {X.shape}")
    if labels.shape[0] != X.shape[0]:
        raise ValueError(f"y has {labels.shape[0]} labels, expected {X.shape[0]}")

    # Create column names for the data dimensions and errors
    num_dimensions = X.shape[1]
    data_columns = [f"Dimension_{i+1}" for i in range(num_dimensions)]
    error_columns = [f"Error_Dimension_{i+1}" for i in range(num_dimensions)]

    # Stack only the float-valued blocks; the labels are attached as their own
    # column afterwards so they are not upcast to float by np.hstack.
    df = pd.DataFrame(np.hstack((X, errors)), columns=data_columns + error_columns)
    df["Component_Label"] = labels

    # Save the DataFrame to a CSV file
    df.to_csv(filename, index=False)
    print(f"Data with errors successfully saved to {filename}")

In [65]:
# Persist the generated sample, its errors and the component labels
save_data_with_errors_to_csv(
    X,
    y,
    errors,
    filename="generated_data_with_errors.csv",
)

Data with errors successfully saved to generated_data_with_errors.csv
