In [14]:
import numpy as np
import scipy as sp
import torch
import matplotlib.pyplot as plt

import torch
from torchvision import datasets, transforms
import random

Toy Example

In [15]:
np.random.seed(4321)
dim=20
c = torch.randn((dim, 1), requires_grad=False)
d = torch.randn((dim, 1), requires_grad=False)
A = torch.randn((dim, dim), requires_grad=False)

eigenvalues = torch.empty(dim).uniform_(0.1, 1)
Q, _ = torch.qr(torch.randn(dim, dim))
D = torch.diag(eigenvalues)
H = Q @ D @ Q.T

torch.save(A, 'data/A.pt')
torch.save(H, 'data/H.pt')
torch.save(c, 'data/c.pt')
torch.save(d, 'data/d.pt')

Data Hyper-Cleaning

Binary MNIST

In [16]:
import torch
import numpy as np
from torchvision import datasets, transforms
import random

p = 0.5  # corruption probability


# Load MNIST dataset
transform = transforms.Compose([transforms.ToTensor()])
mnist_data = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Randomly select two different classes
classes = list(range(10))
class1 = random.choice(classes)
class2 = random.choice([c for c in classes if c != class1])

# Function to filter data for the selected classes
def filter_mnist_by_class(data, class1, class2):
    indices = (data.targets == class1) | (data.targets == class2)
    filtered_data = data.data[indices]
    filtered_targets = data.targets[indices]
    # Convert labels to binary {-1, 1}
    filtered_targets = torch.where(filtered_targets == class1, torch.tensor(0), torch.tensor(1))
    return filtered_data, filtered_targets

# Filter the data
data, targets = filter_mnist_by_class(mnist_data, class1, class2)

# Flatten the images and split into train, validation, and test sets
n_samples = data.shape[0]
n_train = int(0.7 * n_samples)
n_val = int(0.15 * n_samples)
n_test = n_samples - n_train - n_val

# Shuffle the data
indices = torch.randperm(n_samples)
data = data[indices].float().view(n_samples, -1) / 255.0  # Normalize and flatten
targets = targets[indices]

# Split the data
A_tr, B_tr_true = data[:n_train], targets[:n_train]
A_val, B_val = data[n_train:n_train+n_val], targets[n_train:n_train+n_val]
A_test, B_test = data[n_train+n_val:], targets[n_train+n_val:]

# Randomly corrupt some of the training samples
corruption_mask = np.random.rand(n_train) < p
B_tr = B_tr_true.clone()  # Clone to avoid modifying the original labels
B_tr[corruption_mask] = 1 -  B_tr_true[corruption_mask]  # Flip the labels where corruption_mask is True

B_tr = torch.nn.functional.one_hot(B_tr, num_classes=2)
B_val = torch.nn.functional.one_hot(B_val, num_classes=2)
B_test = torch.nn.functional.one_hot(B_test, num_classes=2)

# Output shapes and corruption details
print(f'Selected classes: {class1} and {class2}')
print(f'Train: {A_tr.shape}, {B_tr.shape}')
print(f'Validation: {A_val.shape}, {B_val.shape}')
print(f'Test: {A_test.shape}, {B_test.shape}')
print(f'Number of corrupted labels: {corruption_mask.sum()}')


torch.save(A_tr, 'data/A_tr-p' + str(p) + '.pt');
torch.save(A_val, 'data/A_val-p' + str(p) + '.pt');
torch.save(A_test, 'data/A_test-p' + str(p) + '.pt');
torch.save(B_tr, 'data/B_tr-p' + str(p) + '.pt');
torch.save(B_val, 'data/B_val-p' + str(p) + '.pt');
torch.save(B_test, 'data/B_test-p' + str(p) + '.pt');

Selected classes: 4 and 5
Train: torch.Size([1311, 784]), torch.Size([1311, 2])
Validation: torch.Size([281, 784]), torch.Size([281, 2])
Test: torch.Size([282, 784]), torch.Size([282, 2])
Number of corrupted labels: 664


In [18]:
import torch
import numpy as np
from torchvision import datasets, transforms
import random

p = 0.5  # corruption probability

# Load MNIST dataset
transform = transforms.Compose([transforms.ToTensor()])
mnist_data = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Randomly assign 5 classes to label 0 and 5 classes to label 1
classes = list(range(10))
random.shuffle(classes)
class_labels = {cls: 0 if i < 5 else 1 for i, cls in enumerate(classes)}

# Function to map MNIST labels to binary labels
def map_mnist_to_binary(data, class_labels):
    binary_targets = torch.tensor([class_labels[label.item()] for label in data.targets])
    return data.data, binary_targets

# Map the entire dataset to binary labels
data, targets = map_mnist_to_binary(mnist_data, class_labels)

# Flatten the images and split into train, validation, and test sets
n_samples = data.shape[0]
n_train = int(0.7 * n_samples)
n_val = int(0.15 * n_samples)
n_test = n_samples - n_train - n_val

# Shuffle the data
indices = torch.randperm(n_samples)
data = data[indices].float().view(n_samples, -1) / 255.0  # Normalize and flatten
targets = targets[indices]

# Split the data
A_tr, B_tr_true = data[:n_train], targets[:n_train]
A_val, B_val = data[n_train:n_train+n_val], targets[n_train:n_train+n_val]
A_test, B_test = data[n_train+n_val:], targets[n_train+n_val:]

# Randomly corrupt some of the training samples
corruption_mask = np.random.rand(n_train) < p
B_tr = B_tr_true.clone()  # Clone to avoid modifying the original labels
B_tr[corruption_mask] = 1 - B_tr_true[corruption_mask]  # Flip the labels where corruption_mask is True

# One-hot encode the labels
B_tr = torch.nn.functional.one_hot(B_tr, num_classes=2)
B_val = torch.nn.functional.one_hot(B_val, num_classes=2)
B_test = torch.nn.functional.one_hot(B_test, num_classes=2)

# Output shapes and corruption details
print(f'Class-to-binary mapping: {class_labels}')
print(f'Train: {A_tr.shape}, {B_tr.shape}')
print(f'Validation: {A_val.shape}, {B_val.shape}')
print(f'Test: {A_test.shape}, {B_test.shape}')
print(f'Number of corrupted labels: {corruption_mask.sum()}')

# Save the data
torch.save(A_tr, 'data/A_tr.pt')
torch.save(A_val, 'data/A_val.pt')
torch.save(A_test, 'data/A_test.pt')
torch.save(B_tr, 'data/B_tr.pt')
torch.save(B_val, 'data/B_val.pt')
torch.save(B_test, 'data/B_test.pt')


Class-to-binary mapping: {8: 0, 6: 0, 5: 0, 0: 0, 1: 0, 3: 1, 4: 1, 9: 1, 2: 1, 7: 1}
Train: torch.Size([7000, 784]), torch.Size([7000, 2])
Validation: torch.Size([1500, 784]), torch.Size([1500, 2])
Test: torch.Size([1500, 784]), torch.Size([1500, 2])
Number of corrupted labels: 3475


Full MNIST

In [4]:
import torch
import numpy as np
from torchvision import datasets, transforms
import random

# p = 0.4  # corruption probability


# Load MNIST dataset
transform = transforms.Compose([transforms.ToTensor()])
mnist_data = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

data, targets = mnist_data.data, mnist_data.targets

# Flatten the images and split into train, validation, and test sets
n_samples = data.shape[0]
n_train = int(0.7 * n_samples)
n_val = int(0.15 * n_samples)
n_test = n_samples - n_train - n_val

# Shuffle the data
indices = torch.randperm(n_samples)
data = data[indices].float().view(n_samples, -1) / 255.0  # Normalize and flatten
targets = targets[indices]

# Split the data
A_tr, B_tr_true = data[:n_train], targets[:n_train]
A_val, B_val = data[n_train:n_train+n_val], targets[n_train:n_train+n_val]
A_test, B_test = data[n_train+n_val:], targets[n_train+n_val:]

# # Randomly corrupt some of the training samples
corruption_mask = torch.rand(n_train) < p
B_tr = B_tr_true.clone()  # Clone to avoid modifying the original labels

# Generate random labels for the corrupted samples
random_labels = torch.randint(1, 10, (n_train,))  # Random integers from 1 to 9

# Apply the corruption, ensuring new labels are different from the original
B_tr[corruption_mask] = (B_tr_true[corruption_mask] + random_labels[corruption_mask]) % 10

B_tr = torch.nn.functional.one_hot(B_tr, num_classes=10)
B_val = torch.nn.functional.one_hot(B_val, num_classes=10)
B_test = torch.nn.functional.one_hot(B_test, num_classes=10)


# Output shapes and corruption details
print(f'Train: {A_tr.shape}, {B_tr.shape}')
print(f'Validation: {A_val.shape}, {B_val.shape}')
print(f'Test: {A_test.shape}, {B_test.shape}')
print(f'Number of corrupted labels: {corruption_mask.sum()}')


torch.save(A_tr, 'data/A_tr.pt'); 
torch.save(A_val, 'data/A_val.pt');
torch.save(A_test, 'data/A_test.pt');
torch.save(B_tr, 'data/B_tr.pt');
torch.save(B_val, 'data/B_val.pt');
torch.save(B_test, 'data/B_test.pt');

Train: torch.Size([7000, 784]), torch.Size([7000, 10])
Validation: torch.Size([1500, 784]), torch.Size([1500, 10])
Test: torch.Size([1500, 784]), torch.Size([1500, 10])
Number of corrupted labels: 3414


MNIST PCA

In [5]:
from sklearn.decomposition import PCA

pca = PCA()
pca.fit(A_tr)
index = np.where(np.cumsum(pca.explained_variance_ratio_) >= 0.75)[0][0]

pca = PCA(n_components=index)
pca.fit(A_tr)
A_tr2 = pca.transform(A_tr)
A_val2 = pca.transform(A_val)
A_test2 = pca.transform(A_test)

# Output shapes and corruption details
print(f'Train: {A_tr2.shape}, {B_tr.shape}')
print(f'Validation: {A_val2.shape}, {B_val.shape}')
print(f'Test: {A_test2.shape}, {B_test.shape}')
print(f'Number of corrupted labels: {corruption_mask.sum()}')

torch.save(torch.tensor(A_tr2), 'data/A_tr.pt');
torch.save(torch.tensor(A_val2), 'data/A_val.pt');
torch.save(torch.tensor(A_test2), 'data/A_test.pt');
torch.save(B_tr, 'data/B_tr.pt');
torch.save(B_val, 'data/B_val.pt');
torch.save(B_test, 'data/B_test.pt');

Train: (7000, 31), torch.Size([7000, 10])
Validation: (1500, 31), torch.Size([1500, 10])
Test: (1500, 31), torch.Size([1500, 10])
Number of corrupted labels: 3414


(torch.Size([7000, 784]),
 tensor([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
         [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
         [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]]))