In [6]:
import numpy as np
import pandas as pd

In [8]:
import numpy as np

def create_dataset(seed=None):
    """Generate a synthetic, noisy, partially labeled 4-class dataset.

    The dataset has 8000 samples (2000 per class, stored contiguously by
    class before splitting), 320 features and 4 classes. Features are
    organised in 16 consecutive blocks of 20 columns:

    * block ``i`` of a class-``i`` sample is always filled with integers
      drawn uniformly from 1..10;
    * every block *after* block ``i`` is, with probability 0.5, filled with
      integers from 1..10 and otherwise left at zero;
    * every block *before* block ``i`` stays zero.

    Labels start as one-hot rows. Then, in this order:

    1. 10% of the rows get one randomly chosen class column flipped
       (0 -> 1 or 1 -> 0);
    2. 50% of the rows are marked as unlabeled by setting the whole row
       to -1 (this may overwrite rows touched in step 1);
    3. 20% of the rows are drawn as the test set, the rest form the
       training set (in their original order).

    Parameters
    ----------
    seed : int, array-like, SeedSequence, BitGenerator, Generator or None
        Forwarded to ``numpy.random.default_rng``. Passing the same value
        yields identical arrays on every call. ``None`` (the default) draws
        fresh entropy, so repeated calls differ; the global ``np.random``
        state is not consulted.

    Returns
    -------
    train_dataset : numpy.ndarray, shape (6400, 320), float64
    train_labels : numpy.ndarray, shape (6400, 4), float64
    test_dataset : numpy.ndarray, shape (1600, 320), float64
    test_labels : numpy.ndarray, shape (1600, 4), float64
    """
    # Dataset parameters
    num_samples = 8000
    num_features = 320
    num_classes = 4
    block_size = 20  # width of one class-dependent feature block
    num_samples_per_class = num_samples // num_classes
    num_blocks = num_features // block_size

    rng = np.random.default_rng(seed)

    # Class of every sample: [0]*2000 + [1]*2000 + ...
    class_ids = np.repeat(np.arange(num_classes), num_samples_per_class)

    # Decide, per sample and per block, whether the block carries values.
    block_ids = np.arange(num_blocks)
    is_class_block = block_ids[None, :] == class_ids[:, None]
    is_later_block = block_ids[None, :] > class_ids[:, None]
    coin_flip = rng.random((num_samples, num_blocks)) < 0.5
    # NOTE(review): blocks before the class block are never populated, so the
    # index of the first non-zero block reveals the class. Confirm intended.
    block_is_filled = is_class_block | (is_later_block & coin_flip)

    # Draw candidate values for every cell, then zero out inactive blocks.
    block_values = rng.integers(1, 11, size=(num_samples, num_blocks, block_size))
    dataset = (
        (block_values * block_is_filled[:, :, None])
        .reshape(num_samples, num_features)
        .astype(np.float64)
    )

    # One-hot labels
    labels = np.zeros((num_samples, num_classes))
    labels[np.arange(num_samples), class_ids] = 1

    # Inject noise into labels
    noise_fraction = 0.1
    num_noisy_labels = int(noise_fraction * num_samples)
    noisy_label_indices = rng.choice(num_samples, num_noisy_labels, replace=False)
    noisy_class_indices = rng.integers(0, num_classes, size=num_noisy_labels)
    # NOTE(review): flipping a single bit leaves a row with either no active
    # class (true class picked) or two active classes (other class picked).
    # Confirm this is the intended noise model rather than a class reassignment.
    labels[noisy_label_indices, noisy_class_indices] = (
        1 - labels[noisy_label_indices, noisy_class_indices]
    )

    # Make data partially labeled: the selected rows LOSE their label.
    unlabeled_fraction = 0.5
    num_unlabeled_samples = int(unlabeled_fraction * num_samples)
    unlabeled_indices = rng.choice(num_samples, num_unlabeled_samples, replace=False)
    labels[unlabeled_indices] = -1

    # Split into train and test sets
    test_fraction = 0.2
    num_test_samples = int(test_fraction * num_samples)
    test_indices = rng.choice(num_samples, num_test_samples, replace=False)

    # Build the train mask once and reuse it for features and labels.
    is_train = np.ones(num_samples, dtype=bool)
    is_train[test_indices] = False

    # NOTE(review): the test split is drawn independently of the unlabeled
    # rows, so test_labels also contains -1 rows and noisy rows.
    train_dataset = dataset[is_train]
    train_labels = labels[is_train]
    test_dataset = dataset[test_indices]
    test_labels = labels[test_indices]

    return train_dataset, train_labels, test_dataset, test_labels

# Generate the synthetic data and unpack the four resulting splits
(
    train_dataset,
    train_labels,
    test_dataset,
    test_labels,
) = create_dataset()


In [11]:
# Write each split to its own comma-separated file in the working directory.
for csv_path, split_array in (
    ('train_dataset.csv', train_dataset),
    ('train_labels.csv', train_labels),
    ('test_dataset.csv', test_dataset),
    ('test_labels.csv', test_labels),
):
    np.savetxt(csv_path, split_array, delimiter=',')

In [9]:
# Inspect the training labels: rows of -1 are unlabeled samples, and rows with
# zero or two 1s come from the label-noise step in create_dataset.
train_labels

array([[ 1.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  0.],
       [-1., -1., -1., -1.],
       ...,
       [-1., -1., -1., -1.],
       [-1., -1., -1., -1.],
       [ 0.,  0.,  0.,  1.]])