In [25]:
import numpy as np
import scipy as sp
import torch
import matplotlib.pyplot as plt

import torch
from torchvision import datasets, transforms
import random

Toy Example

In [26]:
np.random.seed(4321)
dim=20
c = torch.randn((dim, 1), requires_grad=False)
d = torch.randn((dim, 1), requires_grad=False)
A = torch.randn((dim, dim), requires_grad=False)

eigenvalues = torch.empty(dim).uniform_(0.1, 1)
Q, _ = torch.qr(torch.randn(dim, dim))
D = torch.diag(eigenvalues)
H = Q @ D @ Q.T

torch.save(A, 'data/A.pt')
torch.save(H, 'data/H.pt')
torch.save(c, 'data/c.pt')
torch.save(d, 'data/d.pt')

Data Hyper-Cleaning

Binary MNIST

In [27]:
import torch
import numpy as np
from torchvision import datasets, transforms
import random

p = 0.5  # corruption probability


# Load MNIST dataset
transform = transforms.Compose([transforms.ToTensor()])
mnist_data = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Randomly select two different classes
classes = list(range(10))
class1 = random.choice(classes)
class2 = random.choice([c for c in classes if c != class1])

# Function to filter data for the selected classes
def filter_mnist_by_class(data, class1, class2):
    indices = (data.targets == class1) | (data.targets == class2)
    filtered_data = data.data[indices]
    filtered_targets = data.targets[indices]
    # Convert labels to binary {-1, 1}
    filtered_targets = torch.where(filtered_targets == class1, torch.tensor(0), torch.tensor(1))
    return filtered_data, filtered_targets

# Filter the data
data, targets = filter_mnist_by_class(mnist_data, class1, class2)

# Flatten the images and split into train, validation, and test sets
n_samples = data.shape[0]
n_train = int(0.5 * n_samples)
n_val = int(0.25 * n_samples)
n_test = n_samples - n_train - n_val

# Shuffle the data
indices = torch.randperm(n_samples)
data = data[indices].float().view(n_samples, -1) / 255.0  # Normalize and flatten
targets = targets[indices]

# Split the data
A_tr, B_tr_true = data[:n_train], targets[:n_train]
A_val, B_val = data[n_train:n_train+n_val], targets[n_train:n_train+n_val]
A_test, B_test = data[n_train+n_val:], targets[n_train+n_val:]

# Randomly corrupt some of the training samples
corruption_mask = np.random.rand(n_train) < p
B_tr = B_tr_true.clone()  # Clone to avoid modifying the original labels
B_tr[corruption_mask] = 1 -  B_tr_true[corruption_mask]  # Flip the labels where corruption_mask is True

B_tr = torch.nn.functional.one_hot(B_tr, num_classes=2)
B_val = torch.nn.functional.one_hot(B_val, num_classes=2)
B_test = torch.nn.functional.one_hot(B_test, num_classes=2)

# Output shapes and corruption details
print(f'Selected classes: {class1} and {class2}')
print(f'Train: {A_tr.shape}, {B_tr.shape}')
print(f'Validation: {A_val.shape}, {B_val.shape}')
print(f'Test: {A_test.shape}, {B_test.shape}')
print(f'Number of corrupted labels: {corruption_mask.sum()}')


torch.save(A_tr, 'data/A_tr.pt');
torch.save(A_val, 'data/A_val.pt');
torch.save(A_test, 'data/A_test.pt');
torch.save(B_tr, 'data/B_tr.pt');
torch.save(B_val, 'data/B_val.pt');
torch.save(B_test, 'data/B_test.pt');

Selected classes: 2 and 5
Train: torch.Size([962, 784]), torch.Size([962, 2])
Validation: torch.Size([481, 784]), torch.Size([481, 2])
Test: torch.Size([481, 784]), torch.Size([481, 2])
Number of corrupted labels: 495


In [28]:
import torch
import numpy as np
from torchvision import datasets, transforms
import random


# Load MNIST dataset
transform = transforms.Compose([transforms.ToTensor()])
mnist_data = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Randomly assign 5 classes to label 0 and 5 classes to label 1
classes = list(range(10))
random.shuffle(classes)
class_labels = {cls: 0 if i < 5 else 1 for i, cls in enumerate(classes)}

# Function to map MNIST labels to binary labels
def map_mnist_to_binary(data, class_labels):
    binary_targets = torch.tensor([class_labels[label.item()] for label in data.targets])
    return data.data, binary_targets

# Map the entire dataset to binary labels
data, targets = map_mnist_to_binary(mnist_data, class_labels)

# Flatten the images and split into train, validation, and test sets
n_samples = data.shape[0]
n_train = int(0.5 * n_samples)
n_val = int(0.25 * n_samples)
n_test = n_samples - n_train - n_val

# Shuffle the data
indices = torch.randperm(n_samples)
data = data[indices].float().view(n_samples, -1) / 255.0  # Normalize and flatten
targets = targets[indices]

# Split the data
A_tr, B_tr_true = data[:n_train], targets[:n_train]
A_val, B_val = data[n_train:n_train+n_val], targets[n_train:n_train+n_val]
A_test, B_test = data[n_train+n_val:], targets[n_train+n_val:]

# Randomly corrupt some of the training samples
corruption_mask = np.random.rand(n_train) < p
B_tr = B_tr_true.clone()  # Clone to avoid modifying the original labels
B_tr[corruption_mask] = 1 - B_tr_true[corruption_mask]  # Flip the labels where corruption_mask is True

# One-hot encode the labels
B_tr = torch.nn.functional.one_hot(B_tr, num_classes=2)
B_val = torch.nn.functional.one_hot(B_val, num_classes=2)
B_test = torch.nn.functional.one_hot(B_test, num_classes=2)

# Output shapes and corruption details
print(f'Class-to-binary mapping: {class_labels}')
print(f'Train: {A_tr.shape}, {B_tr.shape}')
print(f'Validation: {A_val.shape}, {B_val.shape}')
print(f'Test: {A_test.shape}, {B_test.shape}')
print(f'Number of corrupted labels: {corruption_mask.sum()}')

# Save the data
torch.save(torch.tensor(A_tr), 'data/A_trp' + str(p) + '.pt');
torch.save(torch.tensor(A_val), 'data/A_valp' + str(p) + '.pt');
torch.save(torch.tensor(A_test), 'data/A_testp' + str(p) + '.pt');
torch.save(torch.tensor(B_tr), 'data/B_trp' + str(p) + '.pt');
torch.save(torch.tensor(B_val), 'data/B_valp' + str(p) + '.pt');
torch.save(torch.tensor(B_test), 'data/B_testp' + str(p) + '.pt');

# torch.save(A_tr, 'data/A_tr.pt')
# torch.save(A_val, 'data/A_val.pt')
# torch.save(A_test, 'data/A_test.pt')
# torch.save(B_tr, 'data/B_tr.pt')
# torch.save(B_val, 'data/B_val.pt')
# torch.save(B_test, 'data/B_test.pt')


Class-to-binary mapping: {9: 0, 1: 0, 8: 0, 3: 0, 2: 0, 6: 1, 7: 1, 4: 1, 0: 1, 5: 1}
Train: torch.Size([5000, 784]), torch.Size([5000, 2])
Validation: torch.Size([2500, 784]), torch.Size([2500, 2])
Test: torch.Size([2500, 784]), torch.Size([2500, 2])
Number of corrupted labels: 2550


  torch.save(torch.tensor(A_tr), 'data/A_trp' + str(p) + '.pt');
  torch.save(torch.tensor(A_val), 'data/A_valp' + str(p) + '.pt');
  torch.save(torch.tensor(A_test), 'data/A_testp' + str(p) + '.pt');
  torch.save(torch.tensor(B_tr), 'data/B_trp' + str(p) + '.pt');
  torch.save(torch.tensor(B_val), 'data/B_valp' + str(p) + '.pt');
  torch.save(torch.tensor(B_test), 'data/B_testp' + str(p) + '.pt');


Full MNIST

In [29]:
import torch
import numpy as np
from torchvision import datasets, transforms
import random

p = 0.5

# Load MNIST dataset
transform = transforms.Compose([transforms.ToTensor()])
mnist_data = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

data, targets = mnist_data.data, mnist_data.targets

# Flatten the images and split into train, validation, and test sets
n_samples = data.shape[0]
n_train = int(0.5 * n_samples)
n_val = int(0.25 * n_samples)
n_test = n_samples - n_train - n_val

# Shuffle the data
indices = torch.randperm(n_samples)
data = data[indices].float().view(n_samples, -1) / 255.0  # Normalize and flatten
targets = targets[indices]

# Split the data
A_tr, B_tr_true = data[:n_train], targets[:n_train]
A_val, B_val = data[n_train:n_train+n_val], targets[n_train:n_train+n_val]
A_test, B_test = data[n_train+n_val:], targets[n_train+n_val:]

# Get unique classes in training data
unique_classes = B_tr_true.unique()

# Determine the number of classes to corrupt
n_corrupt_classes = max(1, int(p * len(unique_classes)))

# Randomly select classes to corrupt
corrupt_classes = torch.tensor(random.sample(unique_classes.tolist(), n_corrupt_classes))

# Clone the training labels
B_tr = B_tr_true.clone()

for cls in corrupt_classes:
    # Find indices of the current class
    cls_indices = B_tr_true == cls
    # Generate random labels different from the original class label
    random_labels = torch.randint(low=1, high=10, size=(1, 1))
    new_labels = (cls + random_labels) % 10
    
    # Assign new labels to all instances of the selected class
    B_tr[cls_indices] = new_labels

B_tr = torch.nn.functional.one_hot(B_tr, num_classes=10)
B_val = torch.nn.functional.one_hot(B_val, num_classes=10)
B_test = torch.nn.functional.one_hot(B_test, num_classes=10)

# Output shapes and corruption details
print(f'Train: {A_tr.shape}, {B_tr.shape}')
print(f'Validation: {A_val.shape}, {B_val.shape}')
print(f'Test: {A_test.shape}, {B_test.shape}')
print(f'Corrupted classes: {corrupt_classes.tolist()}')

# Save datasets
torch.save(torch.tensor(A_tr), 'data/A_trp' + str(p) + '.pt');
torch.save(torch.tensor(A_val), 'data/A_valp' + str(p) + '.pt');
torch.save(torch.tensor(A_test), 'data/A_testp' + str(p) + '.pt');
torch.save(torch.tensor(B_tr), 'data/B_trp' + str(p) + '.pt');
torch.save(torch.tensor(B_val), 'data/B_valp' + str(p) + '.pt');
torch.save(torch.tensor(B_test), 'data/B_testp' + str(p) + '.pt');

Train: torch.Size([5000, 784]), torch.Size([5000, 10])
Validation: torch.Size([2500, 784]), torch.Size([2500, 10])
Test: torch.Size([2500, 784]), torch.Size([2500, 10])
Corrupted classes: [7, 6, 3, 2, 5]


  torch.save(torch.tensor(A_tr), 'data/A_trp' + str(p) + '.pt');
  torch.save(torch.tensor(A_val), 'data/A_valp' + str(p) + '.pt');
  torch.save(torch.tensor(A_test), 'data/A_testp' + str(p) + '.pt');
  torch.save(torch.tensor(B_tr), 'data/B_trp' + str(p) + '.pt');
  torch.save(torch.tensor(B_val), 'data/B_valp' + str(p) + '.pt');
  torch.save(torch.tensor(B_test), 'data/B_testp' + str(p) + '.pt');


MNIST PCA

In [30]:
from sklearn.decomposition import PCA

pca = PCA()
pca.fit(A_tr)
index = np.where(np.cumsum(pca.explained_variance_ratio_) >= 0.90)[0][0]

pca = PCA(n_components=index)
pca.fit(A_tr)
A_tr2 = pca.transform(A_tr)
A_val2 = pca.transform(A_val)
A_test2 = pca.transform(A_test)

# Output shapes and corruption details
print(f'Train: {A_tr2.shape}, {B_tr.shape}')
print(f'Validation: {A_val2.shape}, {B_val.shape}')
print(f'Test: {A_test2.shape}, {B_test.shape}')
print(f'Number of corrupted labels: {corruption_mask.sum()}')

torch.save(torch.tensor(A_tr2), 'data/A_trp' + str(p) + '.pt');
torch.save(torch.tensor(A_val2), 'data/A_valp' + str(p) + '.pt');
torch.save(torch.tensor(A_test2), 'data/A_testp' + str(p) + '.pt');
torch.save(B_tr, 'data/B_trp' + str(p) + '.pt');
torch.save(B_val, 'data/B_valp' + str(p) + '.pt');
torch.save(B_test, 'data/B_testp' + str(p) + '.pt');

Train: (5000, 82), torch.Size([5000, 10])
Validation: (2500, 82), torch.Size([2500, 10])
Test: (2500, 82), torch.Size([2500, 10])
Number of corrupted labels: 2550
