In [40]:
# Load data (MNIST)

import torch
import torchvision
import matplotlib.pyplot as plt
import numpy as np

# ---- Loader configuration ----
# batch_size_test is larger than the test split, so the first test batch
# printed at the end of this cell contains all 10000 test images.
batch_size_train = 100000
batch_size_test = 20000
log_interval = 10  # not referenced by the cells shown in this notebook section

# ---- Dataset sizes for the downstream task ----
# Number of training samples (an earlier setting was 128) and test samples.
num_samples = 8 * 1024
num_samples_test = 100

# ---- Image geometry ----
old_dim = 28        # side length of the original MNIST images
new_dim1 = 1 * 28   # first dimension of the output images
new_dim2 = 1 * 28   # second dimension of the output images

# ---- Reproducibility ----
random_seed = 1
torch.backends.cudnn.enabled = False
torch.manual_seed(random_seed)

# Both splits use the same preprocessing: conversion to a tensor followed by
# normalisation with mean 0.1307 and standard deviation 0.3081.
train_loader = torch.utils.data.DataLoader(
    torchvision.datasets.MNIST(
        root='./',
        train=True,
        download=True,
        transform=torchvision.transforms.Compose([
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize((0.1307,), (0.3081,)),
        ]),
    ),
    batch_size=batch_size_train,
    shuffle=True,
)

test_loader = torch.utils.data.DataLoader(
    torchvision.datasets.MNIST(
        root='./',
        train=False,
        download=True,
        transform=torchvision.transforms.Compose([
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize((0.1307,), (0.3081,)),
        ]),
    ),
    batch_size=batch_size_test,
    shuffle=True,
)

# Pull the first test batch and show the image tensor shape as a sanity check.
examples = enumerate(test_loader)
batch_idx, (example_data, example_targets) = next(examples)

print(example_data.shape)



torch.Size([10000, 1, 28, 28])


In [13]:
# Grayscale version of CIFAR-10

import torch
import torchvision
import matplotlib.pyplot as plt
import numpy as np
import torchvision.transforms as transforms


# ---- Loader configuration ----
batch_size_train = 10000
batch_size_test = 20000
log_interval = 10

# ---- Dataset sizes for the downstream task ----
# Number of training samples (earlier settings: 1024 * 4 and 128).
num_samples = 512
# Number of test samples.
num_samples_test = 100

# ---- Image geometry ----
old_dim = 32        # side length of the source images (CIFAR-10 here, not MNIST)
new_dim1 = 1 * 32   # first dimension of the output images
new_dim2 = 1 * 32   # second dimension of the output images

# Preprocessing: collapse RGB to one grayscale channel, convert to a tensor,
# then normalise with mean 0.5 and standard deviation 0.5.
transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),
    transforms.ToTensor(),
    transforms.Normalize((0.5, ), (0.5, )),
])

# Training split: shuffled, loaded with a single worker process.
trainset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)
train_loader = torch.utils.data.DataLoader(
    trainset,
    batch_size=batch_size_train,
    shuffle=True,
    num_workers=1,
)

# Test split: kept in dataset order (no shuffling).
testset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform,
)
test_loader = torch.utils.data.DataLoader(
    testset,
    batch_size=batch_size_test,
    shuffle=False,
    num_workers=1,
)

# Label names by class index, kept for reference:
# classes = ('plane', 'car', 'bird', 'cat',
#            'deer', 'dog', 'frog', 'horse', 'ship', 'truck')



Files already downloaded and verified
Files already downloaded and verified


In [14]:
# Fetch the first (index, (images, labels)) item from the test loader.
examples = enumerate(test_loader)
batch_idx, (example_data, example_targets) = next(examples)

# Report the image tensor shape, then the label tensor shape, one per line.
print(example_data.shape, example_targets.shape, sep="\n")

torch.Size([10000, 1, 32, 32])
torch.Size([10000])


In [15]:
# Create training data and test data for binary classification task

from tqdm import tqdm 
# Take the first batch from each loader; the filtered datasets built below
# are drawn from these two batches only.
training_data = enumerate(train_loader)
test_data = enumerate(test_loader)
batch_id1, (data_tr_old, target_tr_old) = next(training_data)
batch_id2, (data_test_old, target_test_old) = next(test_data)
print("loaded")


# Label sets for the two sides of the binary classification task.
# Alternative split used previously:
#   class1 = [0, 1, 2, 3, 4]
#   class2 = [5, 6, 7, 8, 9]
class1, class2 = [0], [2]
# Pad the image on the right and on the bottom
def filter_digits(data_old, target_old, num, threshold):
    """Build a class-balanced, mean-centred subset for the binary task.

    Scans ``data_old`` / ``target_old`` in order and keeps samples whose
    label is in the module-level ``class1`` or ``class2`` lists until
    ``int(num / 2)`` samples of each class have been collected.  Every kept
    image has its own mean subtracted and is written into the top-left
    ``old_dim x old_dim`` corner of a zero image of size
    ``new_dim1 x new_dim2`` (so the image is zero-padded on the right and on
    the bottom whenever the new dimensions exceed ``old_dim``).

    Args:
        data_old: batch of images; channel 0 of ``data_old[k]`` is used.
        target_old: labels aligned with ``data_old``.
        num: number of slots in the returned tensors.
        threshold: maximum number of source samples to examine.  The scan is
            additionally capped at ``len(target_old)`` so that a threshold
            larger than the batch cannot index past its end.

    Returns:
        ``(data, target)`` where ``data`` has shape
        ``[num, 1, new_dim1, new_dim2]`` and ``target`` has shape ``[num]``
        with value ``1`` for labels in ``class1`` and ``-1`` otherwise.
        If the scan limit is reached before both classes are full,
        "FAILED: need more samples in batch" is printed and the partially
        filled tensors are returned early, with ``target`` still holding the
        raw labels (and zeros in unfilled slots) rather than +1 / -1.
    """
    data = torch.zeros([num, 1, new_dim1, new_dim2])
    target = torch.zeros([num])

    num_samples_per_class = int(num / 2)
    # Never scan past the end of the batch, even when `threshold` is larger
    # than the number of samples actually available.
    max_attempts = min(threshold, len(target_old))

    sample1 = 0  # samples kept so far for class1
    sample2 = 0  # samples kept so far for class2
    sample = 0   # next free slot in the output tensors
    attempt = 0  # index of the source sample being examined

    while (sample1 < num_samples_per_class) or (sample2 < num_samples_per_class):
        if attempt >= max_attempts:
            print("FAILED: need more samples in batch")
            return (data, target)

        # Balance classes: accept a sample only while its class still has room.
        label = target_old[attempt]
        is_sample_for_class1 = (label in class1) and (sample1 < num_samples_per_class)
        is_sample_for_class2 = (label in class2) and (sample2 < num_samples_per_class)

        if is_sample_for_class1 or is_sample_for_class2:
            image = data_old[attempt][0]
            # Centre the image on its own mean and copy it into the top-left
            # corner in one slice assignment; the rest of the slot stays zero.
            data[sample, 0, :old_dim, :old_dim] = image[:old_dim, :old_dim] - torch.mean(image)
            target[sample] = label
            if is_sample_for_class1:
                sample1 += 1
            if is_sample_for_class2:
                sample2 += 1
            sample += 1
        attempt += 1

    # Map raw labels to +1 (class1) / -1 (everything else).
    target.apply_(lambda x: 1 if (x in class1) else -1)

    data = data.float()
    target = target.float()
    return (data, target)


# Local import: this cell can be run without the earlier import cells' `os`.
import os

# Build the balanced training and test sets from the first loader batches.
(data_tr, target_tr) = filter_digits(data_tr_old, target_tr_old, num_samples, batch_size_train)
print("Created training data")
(data_test, target_test) = filter_digits(data_test_old, target_test_old, num_samples_test, batch_size_test)
print("Created test data")


# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs("training-test-data", exist_ok=True)

# Note: despite the .txt suffix these files hold torch.save's serialized
# format and must be read back with torch.load.
torch.save(data_tr, "training-test-data/training_data_nonlinear.txt")


torch.save(target_tr, "training-test-data/training_targets_nonlinear.txt")


torch.save(data_test, "training-test-data/test_data_nonlinear.txt")


torch.save(target_test, "training-test-data/test_targets_nonlinear.txt")



loaded
Created training data
Created test data


In [8]:
# Check that classes are balanced: print the fraction of +1 labels among the
# first num_samples training targets, then among the first num_samples_test
# test targets. A balanced set gives 0.5 for each.
counter = int((target_tr[:num_samples] == 1).sum())
print(counter / num_samples)

# The test-set fraction was previously computed but never reported.
counter = int((target_test[:num_samples_test] == 1).sum())
print(counter / num_samples_test)
    

0.5


In [44]:
# Display the current value of class1. The recorded output below reflects the
# kernel state when this cell was last run ([0, 1, 2, 3, 4]) and does not match
# the definition class1 = [0] given in the data-preparation cell.
class1

[0, 1, 2, 3, 4]