In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import pathlib


In [2]:
def generate_data(n_features, n_clusters, sigma=1, N=1000, repeat=20):
    """Generate `repeat` synthetic Gaussian-cluster datasets and write them to CSV.

    Each dataset is stored under
    ``./datasets/synthetic/<N>x<n_features>-<n_clusters> <sigma>/<ii>/`` as
    ``X.csv`` (features, 6 decimals) and ``y.csv`` (integer cluster labels),
    where ``<ii>`` is the zero-padded repeat index.

    Parameters
    ----------
    n_features : int
        Number of feature columns in X.
    n_clusters : int
        Number of clusters to draw.
    sigma : int, default 1
        Standard deviation of the normal distribution the cluster centres are
        drawn from. It also enters the seed via ``100 // sigma`` and therefore
        must be non-zero.
    N : int, default 1000
        Total number of points per dataset.
    repeat : int, default 20
        Number of independent datasets generated for this configuration.

    Returns
    -------
    None
        The function is used for its side effect of writing files.

    Notes
    -----
    Cluster sizes are drawn once and shared by all repeats. The per-repeat
    seed depends on ``n_features``, ``n_clusters``, ``N`` and the repeat index
    only, not on ``sigma``.
    """
    # Configuration-dependent seed so that the cluster sizes are reproducible.
    np.random.seed(n_features * n_clusters + N + repeat + 100 // sigma)
    configuration_name = '{}x{}-{} {}'.format(N, n_features, n_clusters, sigma)

    # Random relative weights, scaled to N points and truncated to integers;
    # the remainder lost by truncation is assigned to the first cluster so the
    # sizes sum to exactly N.
    cluster_sizes = np.random.uniform(low=1, high=5, size=n_clusters)
    cluster_sizes = (cluster_sizes * N / cluster_sizes.sum()).astype(int)
    cluster_sizes[0] += N - cluster_sizes.sum()

    parent_path = './datasets/synthetic/{}'.format(configuration_name)
    # parents=True: the intermediate ./datasets/synthetic directories may not
    # exist yet (e.g. on a fresh checkout); without it mkdir raises.
    pathlib.Path(parent_path).mkdir(parents=True, exist_ok=True)

    for i in range(repeat):
        # Re-seed per repeat so every dataset can be regenerated on its own.
        np.random.seed(n_features * n_clusters + i * N)

        child_path = '{}/{}'.format(parent_path, '%02d' % i)
        pathlib.Path(child_path).mkdir(parents=True, exist_ok=True)

        X = np.empty((N, n_features))
        y = np.empty(N)
        start = 0
        for c in range(n_clusters):
            size = cluster_sizes[c]
            stop = start + size
            centre = np.random.normal(loc=0, scale=sigma, size=n_features)
            # Per-feature variance is uniform in [0.5, 1.5); sqrt gives the std.
            std = np.sqrt(np.random.uniform(low=0.5, high=1.5, size=n_features))
            X[start:stop, :] = np.random.normal(loc=centre, scale=std, size=(size, n_features))
            y[start:stop] = c
            start = stop

        # Shuffle rows so the points are not stored grouped by cluster.
        perm = np.random.permutation(len(y))
        X = X[perm]
        y = y[perm]

        np.savetxt('{}/X.csv'.format(child_path), X, fmt='%.6f', delimiter=',')
        np.savetxt('{}/y.csv'.format(child_path), y, fmt='%i', delimiter=',')
    

In [3]:
def generate_data_with_noise_features(n_features, n_noise_features, n_clusters, sigma=1, N=1000, repeat=20):
    """Generate Gaussian-cluster datasets padded with uniform noise features.

    Works like a plain cluster generator, then appends `n_noise_features`
    extra columns drawn uniformly between the global minimum and maximum of
    the cluster features. Each dataset is stored under
    ``./datasets/synthetic/<N>x<n_features>-<n_clusters> +<n_noise_features>NF <sigma>/<ii>/``
    as ``X.csv`` (6 decimals, ``n_features + n_noise_features`` columns) and
    ``y.csv`` (integer cluster labels).

    Parameters
    ----------
    n_features : int
        Number of informative (cluster-structured) feature columns.
    n_noise_features : int
        Number of uniform-noise columns appended to the right of X.
    n_clusters : int
        Number of clusters to draw.
    sigma : int, default 1
        Standard deviation of the normal distribution the cluster centres are
        drawn from. It also enters the seed via ``100 // sigma`` and therefore
        must be non-zero.
    N : int, default 1000
        Total number of points per dataset.
    repeat : int, default 20
        Number of independent datasets generated for this configuration.

    Returns
    -------
    None
        The function is used for its side effect of writing files.
    """
    # Configuration-dependent seed so that the cluster sizes are reproducible.
    np.random.seed(n_features * n_clusters + N + repeat + 100 // sigma)
    configuration_name = '{}x{}-{} +{}NF {}'.format(N, n_features, n_clusters, n_noise_features, sigma)

    # Random relative weights, scaled to N points and truncated to integers;
    # the remainder lost by truncation is assigned to the first cluster so the
    # sizes sum to exactly N.
    cluster_sizes = np.random.uniform(low=1, high=5, size=n_clusters)
    cluster_sizes = (cluster_sizes * N / cluster_sizes.sum()).astype(int)
    cluster_sizes[0] += N - cluster_sizes.sum()

    parent_path = './datasets/synthetic/{}'.format(configuration_name)
    # parents=True: the intermediate ./datasets/synthetic directories may not
    # exist yet (e.g. on a fresh checkout); without it mkdir raises.
    pathlib.Path(parent_path).mkdir(parents=True, exist_ok=True)

    for i in range(repeat):
        # Re-seed per repeat so every dataset can be regenerated on its own.
        np.random.seed(n_features * n_clusters + i * N)

        child_path = '{}/{}'.format(parent_path, '%02d' % i)
        pathlib.Path(child_path).mkdir(parents=True, exist_ok=True)

        X = np.empty((N, n_features))
        y = np.empty(N)
        start = 0
        for c in range(n_clusters):
            size = cluster_sizes[c]
            stop = start + size
            centre = np.random.normal(loc=0, scale=sigma, size=n_features)
            # Per-feature variance is uniform in [0.5, 1.5); sqrt gives the std.
            std = np.sqrt(np.random.uniform(low=0.5, high=1.5, size=n_features))
            X[start:stop, :] = np.random.normal(loc=centre, scale=std, size=(size, n_features))
            y[start:stop] = c
            start = stop

        # Shuffle rows so the points are not stored grouped by cluster.
        perm = np.random.permutation(len(y))
        X = X[perm]
        y = y[perm]

        # Append noise columns spanning the overall value range of the
        # informative features (one global min/max, not per column).
        xmin, xmax = np.min(X), np.max(X)
        X_noise = np.random.uniform(size=(N, n_noise_features), low=xmin, high=xmax)
        X = np.hstack((X, X_noise))

        np.savetxt('{}/X.csv'.format(child_path), X, fmt='%.6f', delimiter=',')
        np.savetxt('{}/y.csv'.format(child_path), y, fmt='%i', delimiter=',')
    

In [4]:
def generate_data_with_blurred_features(n_features, noise_percent, n_clusters, sigma=1, N=1000, repeat=20):
    """Generate Gaussian-cluster datasets in which some features are blurred.

    After the clusters are drawn, every (feature, cluster) pair is selected
    independently with probability ``noise_percent / 100``; for a selected
    pair, the values of that feature for all points of that cluster are
    replaced by uniform noise over the feature's original value range.
    Each dataset is stored under
    ``./datasets/synthetic/<N>x<n_features>-<n_clusters> <noise_percent>%N <sigma>/<ii>/``
    as ``X.csv`` (6 decimals) and ``y.csv`` (integer cluster labels).

    Parameters
    ----------
    n_features : int
        Number of feature columns in X.
    noise_percent : int or float
        Percentage chance (0-100) that a given feature is blurred for a given
        cluster.
    n_clusters : int
        Number of clusters to draw.
    sigma : int, default 1
        Standard deviation of the normal distribution the cluster centres are
        drawn from. It also enters the seed via ``100 // sigma`` and therefore
        must be non-zero.
    N : int, default 1000
        Total number of points per dataset.
    repeat : int, default 20
        Number of independent datasets generated for this configuration.

    Returns
    -------
    None
        The function is used for its side effect of writing files.
    """
    # Configuration-dependent seed so that the cluster sizes are reproducible.
    np.random.seed(n_features * n_clusters + N + repeat + 100 // sigma)
    configuration_name = '{}x{}-{} {}%N {}'.format(N, n_features, n_clusters, noise_percent, sigma)

    # Random relative weights, scaled to N points and truncated to integers;
    # the remainder lost by truncation is assigned to the first cluster so the
    # sizes sum to exactly N.
    cluster_sizes = np.random.uniform(low=1, high=5, size=n_clusters)
    cluster_sizes = (cluster_sizes * N / cluster_sizes.sum()).astype(int)
    cluster_sizes[0] += N - cluster_sizes.sum()

    parent_path = './datasets/synthetic/{}'.format(configuration_name)
    # parents=True: the intermediate ./datasets/synthetic directories may not
    # exist yet (e.g. on a fresh checkout); without it mkdir raises.
    pathlib.Path(parent_path).mkdir(parents=True, exist_ok=True)

    for i in range(repeat):
        # Re-seed per repeat so every dataset can be regenerated on its own.
        np.random.seed(n_features * n_clusters + i * N)

        child_path = '{}/{}'.format(parent_path, '%02d' % i)
        pathlib.Path(child_path).mkdir(parents=True, exist_ok=True)

        X = np.empty((N, n_features))
        y = np.empty(N)
        start = 0
        for c in range(n_clusters):
            size = cluster_sizes[c]
            stop = start + size
            centre = np.random.normal(loc=0, scale=sigma, size=n_features)
            # Per-feature variance is uniform in [0.5, 1.5); sqrt gives the std.
            std = np.sqrt(np.random.uniform(low=0.5, high=1.5, size=n_features))
            X[start:stop, :] = np.random.normal(loc=centre, scale=std, size=(size, n_features))
            y[start:stop] = c
            start = stop

        # Shuffle rows so the points are not stored grouped by cluster.
        perm = np.random.permutation(len(y))
        X = X[perm]
        y = y[perm]

        # Per-feature ranges are taken once, before any blurring, so every
        # replacement uses the range of the original (unblurred) column.
        x_mins, x_maxs = np.min(X, axis=0), np.max(X, axis=0)
        # True at [f, c] means feature f is blurred for cluster c.
        feature_cluster_table = np.random.uniform(size=(n_features, n_clusters)) < noise_percent / 100
        for f in range(n_features):
            for c in range(n_clusters):
                if not feature_cluster_table[f, c]:
                    continue
                mask = y == c
                X[mask, f] = np.random.uniform(size=np.sum(mask), low=x_mins[f], high=x_maxs[f])

        np.savetxt('{}/X.csv'.format(child_path), X, fmt='%.6f', delimiter=',')
        np.savetxt('{}/y.csv'.format(child_path), y, fmt='%i', delimiter=',')
    

In [5]:
# Values of sigma (spread of the cluster centres) to sweep over.
sigmas = [1, 2, 3, 4, 5]
# Each entry is (n_features, n_clusters, N).
feature_cluster_size = [
    (2, 3, 500),
    (6, 3, 1000),
    (12, 6, 1000),
    (20, 10, 1000),
]

# For every configuration and sigma, write three dataset families: the plain
# clusters, a variant with n_features // 2 extra noise features, and a variant
# with a 50% blurring chance per (feature, cluster) pair.
for n_f, n_c, sz in feature_cluster_size:
    for s in sigmas:
        generate_data(n_features=n_f, n_clusters=n_c, sigma=s, N=sz)
        generate_data_with_noise_features(
            n_features=n_f, n_noise_features=n_f // 2, n_clusters=n_c, sigma=s, N=sz
        )
        generate_data_with_blurred_features(
            n_features=n_f, noise_percent=50, n_clusters=n_c, sigma=s, N=sz
        )
