In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms
import numpy as np
import os

In [2]:
datapath = "data/"

In [3]:
def get_normalization_params(dataset_name, datapath):
    if dataset_name == "cifar10":
        # stds are different in paper wtf
        train_dataset = datasets.CIFAR10(os.path.join(datapath, dataset_name), train=True, transform=transforms.ToTensor(), download=True)
    elif dataset_name == "cifar100":
        pass
    elif dataset_name == "clothing1m":
        pass
    else:
        raise Exception
    
    means = train_dataset.data.mean(axis=(0,1,2)) / 255.0
    stds = train_dataset.data.std(axis=(0,1,2)) / 255.0
    
    return means, stds

In [4]:
def get_transforms(dataset_name, **kwargs):
    means, stds = kwargs["means"], kwargs["stds"]
    
    if dataset_name == "cifar10":
        train_transform = transforms.Compose([transforms.RandomCrop(32, padding=4), 
                                              transforms.RandomHorizontalFlip(),
                                              transforms.ToTensor(),
                                              transforms.Normalize(means, stds)])

        test_transform = transforms.Compose([transforms.ToTensor(), 
                                             transforms.Normalize(means, stds)])
    elif dataset_name == "cifar100":
        pass
    
    elif dataset_name == "clothing1m":
        pass
    
    else:
        raise Exception
    
    return train_transform, test_transform

In [5]:
def get_splits(dataset_name, datapath, **kwargs):
    train_transform, test_transform = kwargs["train_transform"], kwargs["test_transform"]
    
    if dataset_name == "cifar10":
        train_dataset = datasets.CIFAR10(os.path.join(datapath, dataset_name), train=True, transform=train_transform, download=True)
        test_dataset = datasets.CIFAR10(os.path.join(datapath, dataset_name), train=False, transform=test_transform)
    
    elif dataset_name == "cifar100":
        pass
    
    elif dataset_name == "clothing1m":
        pass
    
    else:
        raise Exception
    
    return train_dataset, test_dataset

In [6]:
def get_datasets(dataset_name, datapath):
    means, stds = get_normalization_params(dataset_name, datapath)
    
    transform_params = {"means": means, "stds": stds}
    train_transform, test_transform = get_transforms(dataset_name, **transform_params)
    
    transforms = {"train_transform": train_transform, "test_transform": test_transform}
    train_dataset, test_dataset = get_splits(dataset_name, datapath, **transforms)
    
    return train_dataset, test_dataset

In [7]:
# https://pytorch.org/docs/stable/notes/randomness.html

def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    numpy.random.seed(worker_seed)
    random.seed(worker_seed)

g = torch.Generator()
g.manual_seed(0)

def get_datta_loaders():
    pass

In [8]:
dataset_name = "cifar10"
train_dataset, test_dataset = get_datasets(dataset_name, datapath)

Files already downloaded and verified
Files already downloaded and verified


In [9]:
print(train_dataset)
print(test_dataset)

Dataset CIFAR10
    Number of datapoints: 50000
    Root location: data/cifar10
    Split: Train
    StandardTransform
Transform: Compose(
               RandomCrop(size=(32, 32), padding=4)
               RandomHorizontalFlip(p=0.5)
               ToTensor()
               Normalize(mean=[0.49139968 0.48215841 0.44653091], std=[0.24703223 0.24348513 0.26158784])
           )
Dataset CIFAR10
    Number of datapoints: 10000
    Root location: data/cifar10
    Split: Test
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=[0.49139968 0.48215841 0.44653091], std=[0.24703223 0.24348513 0.26158784])
           )


### Symmetric Noise CIFAR-10

In [10]:
targets = torch.tensor(train_dataset.targets)
noisy_targets = targets.detach().clone()

indices = torch.arange(targets.size()[0])

p_mask = torch.ones_like(targets[indices]) * 0.4
flip_mask = torch.bernoulli(input=p_mask)
flip_mask.shape

torch.Size([50000])

In [11]:
keep_mask = (flip_mask * (-1)) + 1
keep_mask

tensor([0., 1., 0.,  ..., 1., 1., 0.])

In [12]:
np.where(flip_mask.numpy() == 1.0)[0].shape

(19936,)

In [13]:
labels = torch.tensor(list(train_dataset.class_to_idx.values()))
print(labels)
labels_len = torch.tensor(labels.size(dim=0))
print(labels_len)

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
tensor(10)


In [14]:
targets = torch.tensor(train_dataset.targets)

p_mask_label = torch.ones_like(labels) / labels_len
print(p_mask_label)
flip_mask_label = torch.distributions.categorical.Categorical(p_mask_label)
flipped_targets = flip_mask_label.sample(sample_shape=targets[indices].shape)
print(flipped_targets)
print(flipped_targets.shape)
print(indices)

tensor([0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000])
tensor([8, 8, 9,  ..., 9, 6, 9])
torch.Size([50000])
tensor([    0,     1,     2,  ..., 49997, 49998, 49999])


In [15]:
masked_flipped_targets = flipped_targets * flip_mask
masked_flipped_targets

tensor([8., 0., 9.,  ..., 0., 0., 9.])

In [16]:
targets

tensor([6, 9, 9,  ..., 9, 1, 1])

In [17]:
masked_targets = targets[indices] * keep_mask
masked_targets

tensor([0., 9., 0.,  ..., 9., 1., 0.])

In [18]:
noisy_targets_sub = (masked_targets + masked_flipped_targets).long()
print(noisy_targets_sub.shape)
print(noisy_targets_sub)

torch.Size([50000])
tensor([8, 9, 9,  ..., 9, 1, 9])


In [19]:
noisy_targets[indices] = noisy_targets_sub
print(targets)
print(noisy_targets)

tensor([6, 9, 9,  ..., 9, 1, 1])
tensor([8, 9, 9,  ..., 9, 1, 9])


In [20]:
np.where(noisy_targets.numpy() == train_dataset.targets)[0].size / np.array(train_dataset.targets).size

0.64322

In [21]:
# data to cifar10, add paper noisy labels, save mine, set seed for my noisy label generation

### Asymmetric Noise CIFAR-10

In [22]:
targets

tensor([6, 9, 9,  ..., 9, 1, 1])

In [23]:
train_dataset.class_to_idx

{'airplane': 0,
 'automobile': 1,
 'bird': 2,
 'cat': 3,
 'deer': 4,
 'dog': 5,
 'frog': 6,
 'horse': 7,
 'ship': 8,
 'truck': 9}

In [24]:
a = torch.tensor(range(10))
a

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [25]:
a[[1,4]] = -1
a

tensor([ 0, -1,  2,  3, -1,  5,  6,  7,  8,  9])

In [26]:
targets = torch.tensor(train_dataset.targets)
noisy_targets = targets.detach().clone()

indices = torch.where(targets == train_dataset.class_to_idx["truck"])[0]
print(indices)
print(targets[indices])

p_mask = torch.ones_like(targets[indices]) * 0.4
flip_mask = torch.bernoulli(input=p_mask)
print(flip_mask.shape)
print(flip_mask)
keep_mask = (flip_mask * (-1)) + 1

tensor([    1,     2,    14,  ..., 49963, 49971, 49997])
tensor([9, 9, 9,  ..., 9, 9, 9])
torch.Size([5000])
tensor([0., 1., 0.,  ..., 0., 1., 1.])


In [27]:
p_mask_label = torch.zeros_like(labels)
p_mask_label[train_dataset.class_to_idx["automobile"]] = 1.0
print(p_mask_label)
flip_mask_label = torch.distributions.categorical.Categorical(p_mask_label)
flipped_targets = flip_mask_label.sample(sample_shape=targets[indices].shape)
print(flipped_targets)
print(flipped_targets.shape)

tensor([0, 1, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([1, 1, 1,  ..., 1, 1, 1])
torch.Size([5000])


In [28]:
targets[indices]

tensor([9, 9, 9,  ..., 9, 9, 9])

In [29]:
masked_flipped_targets = flipped_targets * flip_mask
print(masked_flipped_targets.shape)
print(masked_flipped_targets)

torch.Size([5000])
tensor([0., 1., 0.,  ..., 0., 1., 1.])


In [30]:
masked_targets = targets[indices] * keep_mask
masked_targets

tensor([9., 0., 9.,  ..., 9., 0., 0.])

In [31]:
noisy_targets_sub = (masked_targets + masked_flipped_targets).long()
print(noisy_targets_sub.shape)
print(noisy_targets_sub)

torch.Size([5000])
tensor([9, 1, 9,  ..., 9, 1, 1])


In [32]:
targets

tensor([6, 9, 9,  ..., 9, 1, 1])

In [33]:
noisy_targets[indices] = noisy_targets_sub
print(targets)
print(noisy_targets)

tensor([6, 9, 9,  ..., 9, 1, 1])
tensor([6, 9, 1,  ..., 1, 1, 1])


In [34]:
indices = torch.where(targets == train_dataset.class_to_idx["truck"])[0]
torch.where(targets[indices] != noisy_targets[indices])[0].size()[0] / indices.size()[0]

0.392

In [35]:
dirty_indicator_indices = torch.where(targets != noisy_targets)[0]
print(indices.size())
print(dirty_indicator_indices.size())
print(indices)
print(dirty_indicator_indices)

torch.Size([5000])
torch.Size([1960])
tensor([    1,     2,    14,  ..., 49963, 49971, 49997])
tensor([    2,    15,    67,  ..., 49945, 49971, 49997])


Function

In [37]:
train_dataset = datasets.CIFAR10(os.path.join(datapath, dataset_name), train=True, transform=transforms.ToTensor(), download=True)

targets = torch.tensor(train_dataset.targets)

src = "truck"
dsts = ["automobile"]
dataset_name = "cifar10"
p = 0.4

indices, dirty_indicator_indices, noisy_targets = apply_noise(train_dataset, src, dsts, p)
noisy_targets.size()
#targets[indices] = 

Files already downloaded and verified


torch.Size([50000])

In [38]:
dirty_indicator_indices.size()[0] / indices.size()[0]

0.4008

In [39]:
type(train_dataset.data)

numpy.ndarray

In [40]:
np.save("test.npy", train_dataset.data)

In [41]:
train_data_2 = np.load("test.npy")

In [568]:
np.all(train_dataset.data == train_data_2)

True

In [93]:
def make_inherent_label_noise(train_dataset, src, dsts, p):
    # clean targets
    targets = torch.tensor(train_dataset.targets)
    # copy clean targets to noisy targets
    noisy_targets = targets.detach().clone()

    # get all labels
    labels = torch.tensor(list(train_dataset.class_to_idx.values()))
    labels_len = torch.tensor(labels.size(dim=0))

    # get src targets indices (indices in all dataset)
    # asym: get indices of src targets
    indices = torch.where(targets == train_dataset.class_to_idx[src])[0]
    #print(indices)

    # p_mask eg 0.4 for each src target, each target flips with probability p
    p_mask = torch.ones_like(targets[indices]) * p
    # flip_mask is 0s and 1s  (flip is 1s)
    flip_mask = torch.bernoulli(input=p_mask)

    # keep_mask is inverse of flip_mask (keep is 1s)
    keep_mask = (flip_mask * (-1)) + 1

    # p_mask_label is dst label probability distribution to flip to (length is number of classes), sums to 1.0
    # asym: dst class is 1.0, all else 0.0
    p_mask_label = torch.zeros_like(labels, dtype=torch.float)

    p_mask_label[[train_dataset.class_to_idx[dst] for dst in dsts]] = 1.0 / len(dsts)
    print(p_mask_label)

    # flip_mask_label is categorical distribution with params p_mask_label for each dst class
    flip_mask_label = torch.distributions.categorical.Categorical(p_mask_label)

    # flipped_targets is dst labels for each src label that the src label can flip to
    # for now, only one dst, so all in flip_targets is dst class label
    flipped_targets = flip_mask_label.sample(sample_shape=targets[indices].shape)

    # mask the flipped_targets to get the actually flipped instances (ones not to be flipped are 0s, ones to be flipped are dst labels)
    masked_flipped_targets = flipped_targets * flip_mask

    # mask the actual targets to keep the ones not flipped (ones not to be flipped are original labels, ones to be flipped are 0s)
    masked_targets = targets[indices] * keep_mask

    # add vectors together - kept ones remain, flipped ones are flipped
    noisy_targets_sub = (masked_targets + masked_flipped_targets).long()

    # insert into noisy_targets the flipped targets
    noisy_targets[indices] = noisy_targets_sub

    # get the indices of the noisy instances (indices in all dataset)
    dirty_indicator_indices = torch.where(targets != noisy_targets)[0]
    
    return indices, dirty_indicator_indices, noisy_targets

In [94]:
print(train_dataset.class_to_idx.keys())
print(train_dataset.class_to_idx.values())

dict_keys(['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'])
dict_values([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])


Files already downloaded and verified
tensor([    1,     2,    14,  ..., 49963, 49971, 49997])
tensor([0., 1., 0., 0., 0., 0., 0., 0., 0., 0.])


(tensor([    1,     2,    14,  ..., 49963, 49971, 49997]),
 tensor([    1,     2,    50,  ..., 49911, 49917, 49926]),
 tensor([6, 1, 1,  ..., 9, 1, 1]))

In [126]:
def make_symmetric_noise_rules(dataset_name, train_dataset):
    if dataset_name == "cifar10":
        labels = list(train_dataset.class_to_idx.keys())
        noise_rules = []

        for src in labels:
            dsts = labels.copy()
            dsts.remove(src)
            p = 0.4

            noise_rule = {"src":src, "dsts":dsts, "p":p}
            noise_rules.append(noise_rule)

    elif "cifar100":
        pass
    else:
        raise Exception
        
    return noise_rules

def make_asymmetric_noise_rules(dataset_name, train_dataset):
    if dataset_name == "cifar10":
        noise_rules = [
            {"src":"truck", "dsts":["automobile"], "p":0.4},
            {"src":"bird", "dsts":["airplane"], "p":0.4},
            {"src":"cat", "dsts":["dog"], "p":0.4},
            {"src":"dog", "dsts":["cat"], "p":0.4}
        ]

    elif "cifar100":
        pass
    else:
        raise Exception
        
    return noise_rules

symmetric

In [125]:
train_dataset = datasets.CIFAR10(os.path.join(datapath, dataset_name), train=True, transform=transforms.ToTensor(), download=True)
noise_rules = make_symmetric_noise_rules(dataset_name, train_dataset)
print(noise_rules)

targets = torch.tensor(train_dataset.targets)
noisy_targets = targets.detach().clone()

for noise_rule in noise_rules:
    indices_per_rule, dirty_indicator_indices_per_rule, noisy_targets_per_rule = \
        make_inherent_label_noise(train_dataset, noise_rule["src"], noise_rule["dsts"], noise_rule["p"])
    noisy_targets[indices_per_rule] = noisy_targets_per_rule[indices_per_rule]

print(torch.where(targets != noisy_targets)[0])
print(torch.where(targets != noisy_targets)[0].size(dim=0) / targets.size(dim=0))

Files already downloaded and verified
[{'src': 'airplane', 'dsts': ['automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'], 'p': 0.4}, {'src': 'automobile', 'dsts': ['airplane', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'], 'p': 0.4}, {'src': 'bird', 'dsts': ['airplane', 'automobile', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'], 'p': 0.4}, {'src': 'cat', 'dsts': ['airplane', 'automobile', 'bird', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'], 'p': 0.4}, {'src': 'deer', 'dsts': ['airplane', 'automobile', 'bird', 'cat', 'dog', 'frog', 'horse', 'ship', 'truck'], 'p': 0.4}, {'src': 'dog', 'dsts': ['airplane', 'automobile', 'bird', 'cat', 'deer', 'frog', 'horse', 'ship', 'truck'], 'p': 0.4}, {'src': 'frog', 'dsts': ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'horse', 'ship', 'truck'], 'p': 0.4}, {'src': 'horse', 'dsts': ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'ship', 'truck'], 'p': 0.4}, {'src': '

asymmetric

In [132]:
train_dataset = datasets.CIFAR10(os.path.join(datapath, dataset_name), train=True, transform=transforms.ToTensor(), download=True)
noise_rules = make_asymmetric_noise_rules(dataset_name, train_dataset)
print(noise_rules)

targets = torch.tensor(train_dataset.targets)
noisy_targets = targets.detach().clone()

for noise_rule in noise_rules:
    indices_per_rule, dirty_indicator_indices_per_rule, noisy_targets_per_rule = \
        make_inherent_label_noise(train_dataset, noise_rule["src"], noise_rule["dsts"], noise_rule["p"])
    noisy_targets[indices_per_rule] = noisy_targets_per_rule[indices_per_rule]

print(torch.where(targets != noisy_targets)[0])
print(torch.where(targets != noisy_targets)[0].size(dim=0))
print(torch.where(targets != noisy_targets)[0].size(dim=0) / targets.size(dim=0))

Files already downloaded and verified
[{'src': 'truck', 'dsts': ['automobile'], 'p': 0.4}, {'src': 'bird', 'dsts': ['airplane'], 'p': 0.4}, {'src': 'cat', 'dsts': ['dog'], 'p': 0.4}, {'src': 'dog', 'dsts': ['cat'], 'p': 0.4}]
tensor([    1,     2,    14,  ..., 49963, 49971, 49997])
tensor([0., 1., 0., 0., 0., 0., 0., 0., 0., 0.])
tensor([    6,    13,    18,  ..., 49987, 49991, 49995])
tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
tensor([    9,    17,    21,  ..., 49979, 49982, 49983])
tensor([0., 0., 0., 0., 0., 1., 0., 0., 0., 0.])
tensor([   27,    40,    51,  ..., 49964, 49980, 49988])
tensor([0., 0., 0., 1., 0., 0., 0., 0., 0., 0.])
tensor([    2,     6,     9,  ..., 49987, 49991, 49995])
7918
0.15836
