In [2]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import torch
from torchvision import datasets, transforms

Toy Example

In [7]:
# Generate and persist the fixed problem data for the toy example:
# vectors c, d, a dense matrix A, and a symmetric positive-definite matrix H.
#
# Every random draw below comes from torch's generator, so torch itself must be
# seeded for the saved tensors to be reproducible; seeding NumPy alone does not
# affect torch.randn / Tensor.uniform_. The NumPy seed is kept for any NumPy
# draws made after this cell.
np.random.seed(4321)
torch.manual_seed(4321)

dim = 20
c = torch.randn((dim, 1), requires_grad=False)
d = torch.randn((dim, 1), requires_grad=False)
A = torch.randn((dim, dim), requires_grad=False)

# H = Q diag(eigenvalues) Q^T, with eigenvalues drawn uniformly from [0.1, 1)
# and Q the orthogonal factor of a Gaussian matrix, so H is symmetric with a
# spectrum bounded away from zero.
eigenvalues = torch.empty(dim).uniform_(0.1, 1)
Q, _ = torch.linalg.qr(torch.randn(dim, dim))  # replaces the deprecated torch.qr
D = torch.diag(eigenvalues)
H = Q @ D @ Q.T

# torch.save does not create missing parent directories.
os.makedirs('data', exist_ok=True)
torch.save(A, 'data/A.pt')
torch.save(H, 'data/H.pt')
torch.save(c, 'data/c.pt')
torch.save(d, 'data/d.pt')

Data Hyper-Cleaning

In [4]:
# NOTE: this whole cell is disabled (every statement is commented out).
# If re-enabled it would build a synthetic hyper-cleaning dataset: Gaussian
# feature matrices, random {-1, 1} labels, and training labels flipped with
# probability p (validation and test labels are left clean).
# It writes to the same data/A_tr.pt ... data/B_test.pt paths as the MNIST
# cell, so whichever of the two cells runs last determines what is on disk.

# np.random.seed(4321)

# # Define dimensions and parameters
# n_features = 20  # Number of features
# n_train = 1000  # Number of training samples
# n_val = 100   # Number of validation samples
# n_test = 200  # Number of test samples
# p = 0.1        # Corruption probability

# # Generate random feature matrices
# A_tr = np.random.randn(n_train, n_features)  # Training features
# A_val = np.random.randn(n_val, n_features)   # Validation features
# A_test = np.random.randn(n_test, n_features) # Test features

# # Generate original true labels
# B_tr_true = np.random.randint(0, 2, n_train) * 2 - 1  # Binary labels {-1, 1}
# B_val_true = np.random.randint(0, 2, n_val) * 2 - 1   # Binary labels {-1, 1}
# B_test_true = np.random.randint(0, 2, n_test) * 2 - 1 # Binary labels {-1, 1}

# # Corrupt the training labels
# corruption_mask = np.random.rand(n_train) < p
# B_tr = B_tr_true.copy()
# B_tr[corruption_mask] = -B_tr_true[corruption_mask]  # Flip labels

# # # Corrupt the validation labels
# # corruption_mask_val = np.random.rand(n_val) < p
# B_val = B_val_true.copy()
# # B_val[corruption_mask_val] = -B_val_true[corruption_mask_val]  # Flip labels

# # # Corrupt the test labels
# # corruption_mask_test = np.random.rand(n_test) < p
# B_test = B_test_true.copy()
# # B_test[corruption_mask_test] = -B_test_true[corruption_mask_test]  # Flip labels


# torch.save(torch.tensor(A_tr), 'data/A_tr.pt'); 
# torch.save(torch.tensor(A_val), 'data/A_val.pt');
# torch.save(torch.tensor(A_test), 'data/A_test.pt');
# torch.save(torch.tensor(B_tr), 'data/B_tr.pt');
# torch.save(torch.tensor(B_val), 'data/B_val.pt');
# torch.save(torch.tensor(B_test), 'data/B_test.pt');

MNIST

In [5]:
import torch
import numpy as np
from torchvision import datasets, transforms
import random

p = 0.2  # corruption probability

# Seed every RNG this cell draws from (Python's `random` for the class pair,
# torch for the shuffle, NumPy for the label-flip mask) so that the files
# written at the end of the cell are the same on every run.
seed = 4321
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Load MNIST dataset (train=False selects the test split, not the training split)
transform = transforms.Compose([transforms.ToTensor()])
mnist_data = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Randomly select two different classes; class1 is later mapped to label -1
# and class2 to label +1.
classes = list(range(10))
class1, class2 = random.sample(classes, 2)

# Function to filter data for the selected classes
def filter_mnist_by_class(data, class1, class2):
    """Keep only the samples of two classes and relabel them as -1 / +1.

    Args:
        data: object exposing ``.data`` (samples) and ``.targets`` (class
            labels) as tensors indexed along the first dimension.
        class1: class value that is mapped to label -1.
        class2: class value that is kept alongside ``class1``; every kept
            sample that is not ``class1`` is mapped to label +1.

    Returns:
        A ``(samples, labels)`` pair: the rows of ``data.data`` whose target is
        ``class1`` or ``class2``, and a tensor of matching -1 / +1 labels.
    """
    all_targets = data.targets
    keep = (all_targets == class1) | (all_targets == class2)
    kept_targets = all_targets[keep]

    # Binary relabelling: class1 -> -1, everything else that survived -> +1.
    negative, positive = torch.tensor(-1), torch.tensor(1)
    signed_targets = torch.where(kept_targets == class1, negative, positive)
    return data.data[keep], signed_targets

# Filter the data
# Filter the data down to the two selected classes (labels become -1 / +1)
data, targets = filter_mnist_by_class(mnist_data, class1, class2)

# 70% / 15% / remainder split sizes for train, validation, and test
n_samples = data.shape[0]
n_train = int(0.7 * n_samples)
n_val = int(0.15 * n_samples)
n_test = n_samples - n_train - n_val

# Shuffle samples and labels with the same permutation
indices = torch.randperm(n_samples)
data = data[indices].float().view(n_samples, -1) / 255.0  # Normalize and flatten
targets = targets[indices]

# Split the data.
# Each split is cloned because a plain slice is a view onto the full shuffled
# tensor, and torch.save on a view serialises the entire underlying storage --
# every saved file would otherwise contain all samples, not just its split.
A_tr, B_tr_true = data[:n_train].clone(), targets[:n_train].clone()
A_val, B_val = data[n_train:n_train+n_val].clone(), targets[n_train:n_train+n_val].clone()
A_test, B_test = data[n_train+n_val:].clone(), targets[n_train+n_val:].clone()

# Randomly corrupt some of the training samples: each training label is
# flipped independently with probability p. Validation and test stay clean.
corruption_mask = np.random.rand(n_train) < p
B_tr = B_tr_true.clone()  # Clone to avoid modifying the original labels
B_tr[corruption_mask] = -B_tr_true[corruption_mask]  # Flip the labels where corruption_mask is True

# Output shapes and corruption details
print(f'Selected classes: {class1} and {class2}')
print(f'Train: {A_tr.shape}, {B_tr.shape}')
print(f'Validation: {A_val.shape}, {B_val.shape}')
print(f'Test: {A_test.shape}, {B_test.shape}')
print(f'Number of corrupted labels: {corruption_mask.sum()}')


# Persist the splits; B_tr holds the corrupted training labels.
torch.save(A_tr, 'data/A_tr.pt')
torch.save(A_val, 'data/A_val.pt')
torch.save(A_test, 'data/A_test.pt')
torch.save(B_tr, 'data/B_tr.pt')
torch.save(B_val, 'data/B_val.pt')
torch.save(B_test, 'data/B_test.pt')

Selected classes: 4 and 3
Train: torch.Size([1394, 784]), torch.Size([1394])
Validation: torch.Size([298, 784]), torch.Size([298])
Test: torch.Size([300, 784]), torch.Size([300])
Number of corrupted labels: 275
