In [1]:
import os
import random
from random import randint

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data_utils
from sklearn.impute import SimpleImputer  # used to input missing values
from torch.utils.data import TensorDataset, DataLoader
from torchvision import datasets, transforms

In [2]:
# Notebook-wide settings for the MNIST section.
BATCH_SIZE = 100
MISSING_SQUARE = 13  # side length of the square that is blanked out of each image
IMAGE_SHAPE = (28, 28)  # spatial size of one MNIST image
SEED = 0
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# The positions of the missing squares are drawn with random.randint, so the
# stdlib generator must be seeded as well; seeding torch alone would leave the
# generated masks different on every run.
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fd7f4020bd0>

# MNIST

In [3]:
# Build the training split first, then the test split; both are downloaded
# into the working directory if needed and carry a ToTensor transform.
mnist_train, mnist_test = (
    datasets.MNIST("./", train=is_train, download=True, transform=transforms.ToTensor())
    for is_train in (True, False)
)

In [4]:
# Training labels as a float64 vector, one entry per image.
# They are read from the dataset's `targets` attribute: indexing
# `mnist_train[i]` would run the image transform on every sample just to
# obtain its label.
labels = mnist_train.targets.numpy().astype(np.float64)

In [5]:
# Float copy of the raw training images divided by 255 (assumes 8-bit pixel
# values, so the result lies in [0, 1] -- TODO confirm). NaNs are written
# into this array later, hence the name.
data_with_nan = np.true_divide(mnist_train.data.numpy(), 255.0)

In [6]:
# create missing squares at random positions
# mask[z] is 1 inside the square removed from image z and 0 elsewhere; the
# same pixels of data_with_nan are overwritten with NaN.
# NOTE: data_with_nan is modified in place, so re-running this cell without
# rebuilding data_with_nan first punches an additional square into every image.
mask = np.zeros((len(mnist_train), IMAGE_SHAPE[0], IMAGE_SHAPE[1]))
for z in range(len(mask)):
    # randint includes both end points, so the square always fits in the image.
    x_position = randint(0, IMAGE_SHAPE[0] - MISSING_SQUARE)
    y_position = randint(0, IMAGE_SHAPE[1] - MISSING_SQUARE)
    rows = slice(x_position, x_position + MISSING_SQUARE)
    cols = slice(y_position, y_position + MISSING_SQUARE)
    # One slice assignment per image instead of a Python loop over every pixel.
    mask[z, rows, cols] = 1
    data_with_nan[z, rows, cols] = np.nan

In [7]:
# simple imputation
# Every pixel position is treated as one feature; a missing value is replaced
# by the mean of that position over the images where it is present.
# Sizes are taken from the array itself instead of being hard-coded.
n_train = len(data_with_nan)
train_flat = data_with_nan.reshape((n_train, -1))  # one row per image
imp = SimpleImputer(missing_values=np.nan, strategy="mean")
imp.fit(train_flat)
data_after_imputation = imp.transform(train_flat)
# back to image shape, with a singleton channel axis: (N, 1, H, W)
data_after_imputation = data_after_imputation.reshape((n_train, 1) + data_with_nan.shape[1:])

In [8]:
# saving to files
# np.save does not create missing directories, so make sure ./data exists.
os.makedirs('./data', exist_ok=True)
np.save('./data/MNIST_data_imputation.npy', data_after_imputation)
np.save('./data/MNIST_labels.npy', labels)
# the mask is stored with the same (N, 1, H, W) layout as the imputed images
np.save('./data/MNIST_mask.npy', mask.reshape((len(mask), 1) + mask.shape[1:]))

In [9]:
# Test labels as a float64 vector, one entry per image.
# Read straight from `targets` so that no image has to go through the
# transform just to obtain its label.
test_labels = mnist_test.targets.numpy().astype(np.float64)

In [10]:
# Float copy of the raw test images divided by 255.
test_data_with_nan = mnist_test.data.numpy() / 255.0

# create missing squares at random positions
# test_mask[z] is 1 inside the removed square and 0 elsewhere; the same pixels
# of test_data_with_nan are set to NaN.
test_mask = np.zeros((len(mnist_test), IMAGE_SHAPE[0], IMAGE_SHAPE[1]))
for z in range(len(test_mask)):
    # randint includes both end points, so the square always fits in the image.
    x_position = randint(0, IMAGE_SHAPE[0] - MISSING_SQUARE)
    y_position = randint(0, IMAGE_SHAPE[1] - MISSING_SQUARE)
    rows = slice(x_position, x_position + MISSING_SQUARE)
    cols = slice(y_position, y_position + MISSING_SQUARE)
    # One slice assignment per image instead of a Python loop over every pixel.
    test_mask[z, rows, cols] = 1
    test_data_with_nan[z, rows, cols] = np.nan

# Sizes are derived from the arrays rather than hard-coded.
n_test = len(test_mask)
test_mask = test_mask.reshape((n_test, 1) + test_mask.shape[1:])  # (N, 1, H, W)

# `imp` is the imputer fitted on the MNIST training images, so missing test
# pixels are filled with training-set means. It must still be that imputer
# when this cell runs (the CIFAR section rebinds the name `imp`).
test_data_after_imputation = imp.transform(test_data_with_nan.reshape((n_test, -1)))
# back to image shape, with a singleton channel axis: (N, 1, H, W)
test_data_after_imputation = test_data_after_imputation.reshape((n_test, 1) + test_data_with_nan.shape[1:])

In [11]:
# saving to files
# np.save does not create missing directories, so make sure ./data exists.
os.makedirs('./data', exist_ok=True)
np.save('./data/MNIST_test_data_imputation.npy', test_data_after_imputation)
np.save('./data/MNIST_test_labels.npy', test_labels)
np.save('./data/MNIST_test_mask.npy', test_mask)

# CIFAR

In [12]:
# Settings for the CIFAR section. These rebind the constants that the MNIST
# cells above also read.
IMAGE_SHAPE = (32, 32)  # spatial size of one CIFAR image
MISSING_SQUARE = 13  # side length of the square that is blanked out

In [13]:
# Build the training split first, then the test split; both are downloaded
# into the working directory if needed and carry a ToTensor transform.
cifar_train, cifar_test = (
    datasets.CIFAR10("./", train=is_train, download=True, transform=transforms.ToTensor())
    for is_train in (True, False)
)

Files already downloaded and verified
Files already downloaded and verified


In [14]:
# Training labels as a float64 vector, one entry per image.
# They are read from the dataset's `targets` attribute: indexing
# `cifar_train[i]` would run the image transform on every sample just to
# obtain its label.
labels = np.asarray(cifar_train.targets, dtype=np.float64)

In [15]:
# Float copy of the raw training images divided by 255 (assumes 8-bit pixel
# values -- TODO confirm). NaNs are written into this array later.
data_with_nan = np.true_divide(cifar_train.data, 255.0)

In [16]:
# create missing squares at random positions
# mask is channels-last like data_with_nan: mask[z] is 1 inside the removed
# square (in all 3 channels) and 0 elsewhere; the same entries of
# data_with_nan are overwritten with NaN.
# NOTE: data_with_nan is modified in place, so re-running this cell without
# rebuilding data_with_nan first punches an additional square into every image.
mask = np.zeros((len(cifar_train), IMAGE_SHAPE[0], IMAGE_SHAPE[1], 3))
for z in range(len(mask)):
    # randint includes both end points, so the square always fits in the image.
    x_position = randint(0, IMAGE_SHAPE[0] - MISSING_SQUARE)
    y_position = randint(0, IMAGE_SHAPE[1] - MISSING_SQUARE)
    rows = slice(x_position, x_position + MISSING_SQUARE)
    cols = slice(y_position, y_position + MISSING_SQUARE)
    # The same square is removed from every channel, so one slice assignment
    # covers what would otherwise be a Python loop over channels and pixels.
    mask[z, rows, cols, :] = 1
    data_with_nan[z, rows, cols, :] = np.nan

In [17]:
# Mean imputation: every (row, column, channel) entry is one feature, and a
# missing value is replaced by that feature's mean over the training images.
n_cifar_train = len(cifar_train)
cifar_train_flat = data_with_nan.reshape((n_cifar_train, 32*32*3))  # one row per image
imp = SimpleImputer(strategy="mean", missing_values=np.nan)
imp.fit(cifar_train_flat)
# transform, then restore the channels-last image shape (N, 32, 32, 3)
data_after_imputation = imp.transform(cifar_train_flat).reshape((n_cifar_train, 32, 32, 3))

In [18]:
# saving to files
# np.save does not create missing directories, so make sure ./data exists.
os.makedirs('./data', exist_ok=True)
np.save('./data/CIFAR_data_imputation.npy', data_after_imputation)
np.save('./data/CIFAR_labels.npy', labels)
np.save('./data/CIFAR_mask.npy', mask)

In [19]:
# Test labels as a float64 vector, one entry per image.
# Read straight from `targets` so that no image has to go through the
# transform just to obtain its label.
test_labels = np.asarray(cifar_test.targets, dtype=np.float64)

In [20]:
# Float copy of the raw test images divided by 255 (assumes 8-bit pixel
# values -- TODO confirm). NaNs are written into this array later.
test_data_with_nan = np.true_divide(cifar_test.data, 255.0)

In [21]:
# create missing squares at random positions
# test_mask is channels-last like test_data_with_nan: test_mask[z] is 1 inside
# the removed square (in all 3 channels) and 0 elsewhere; the same entries of
# test_data_with_nan are overwritten with NaN.
# NOTE: test_data_with_nan is modified in place, so re-running this cell
# without rebuilding it first punches an additional square into every image.
test_mask = np.zeros((len(cifar_test), IMAGE_SHAPE[0], IMAGE_SHAPE[1], 3))
for z in range(len(test_mask)):
    # randint includes both end points, so the square always fits in the image.
    x_position = randint(0, IMAGE_SHAPE[0] - MISSING_SQUARE)
    y_position = randint(0, IMAGE_SHAPE[1] - MISSING_SQUARE)
    rows = slice(x_position, x_position + MISSING_SQUARE)
    cols = slice(y_position, y_position + MISSING_SQUARE)
    # The same square is removed from every channel, so one slice assignment
    # covers what would otherwise be a Python loop over channels and pixels.
    test_mask[z, rows, cols, :] = 1
    test_data_with_nan[z, rows, cols, :] = np.nan

In [23]:
# Fill the missing test entries with the already-fitted imputer `imp`
# (no refit on the test images), then restore the channels-last image shape
# (N, 32, 32, 3).
n_cifar_test = len(cifar_test)
cifar_test_flat = test_data_with_nan.reshape((n_cifar_test, 32*32*3))  # one row per image
test_data_after_imputation = imp.transform(cifar_test_flat).reshape((n_cifar_test, 32, 32, 3))

In [24]:
# saving to files
# np.save does not create missing directories, so make sure ./data exists.
os.makedirs('./data', exist_ok=True)
np.save('./data/CIFAR_test_data_imputation.npy', test_data_after_imputation)
np.save('./data/CIFAR_test_labels.npy', test_labels)
np.save('./data/CIFAR_test_mask.npy', test_mask)