# Loading Data - SVHN

### Load in MatLab matrices

In [62]:
import scipy.io
from sklearn.model_selection import train_test_split

In [61]:
test_mat = scipy.io.loadmat('../data/data_svhn/test_32x32.mat')
train_mat = scipy.io.loadmat('../data/data_svhn/train_32x32.mat')
extra_mat = scipy.io.loadmat('../data/data_svhn/extra_32x32.mat')

ModuleNotFoundError: No module named 'sklearn.model_selection.train_test_split'

### Get data from matrices

In [269]:
# Get 
x_test = test_mat.get('X')  #numpy arrays
y_test = test_mat.get('y')

x_train = train_mat.get('X')
y_train = train_mat.get('y')

x_extra = extra_mat.get('X')
y_extra = extra_mat.get('y')

In [270]:
x_extra.shape

(32, 32, 3, 531131)

In [271]:
x_train.shape

(32, 32, 3, 73257)

In [272]:
x_test.shape

(32, 32, 3, 26032)

### Reshape the matrices

[h,w,channels,samples] -> [samples,h,w,channels]

In [273]:
x_test = np.transpose(x_test, axes=(3,0,1,2))
x_train = np.transpose(x_train, axes=(3,0,1,2))
x_extra = np.transpose(x_extra, axes=(3,0,1,2))

In [274]:
x_train.shape

(73257, 32, 32, 3)

### Split validation data

In [295]:
# Split train data into train and validation gettting certain number of labels from each class
def train_val_split_count(x_train, y_train, size, seed):
    
    if seed != None:
        np.random.seed(seed)  # Set seed if it is stated.

    labels = set(y_train.flatten())  # Get label names
    n_labels = len(labels)  # Get number of labels

    x_val = []
    y_val = []
    split = []

    for i in labels:
        labels_i = np.where(y_train == i)[0]
        #np.random.seed(seed)
        samples = np.random.choice(labels_i, size)  # TODO: Check if enough labels in the class
        split.append(samples)

    split = np.array(split).flatten()
    print(split[:10])

    x_val = np.array(x_train[split])
    y_val = np.array(y_train[split])
    
    x_train = np.delete(x_train, split, axis=0)
    y_train = np.delete(y_train, split, axis=0)
    
    return (x_train, x_val, y_train, y_val)

In [284]:
x_train1, x_val1, y_train1, y_val1 = train_val_split_count(x_train, y_train, size = 200, seed = 333)

[43375  4994 51023 32153 39898 29024 43722 18435 45000 54415]


In [285]:
print(x_train1.shape)
print(y_train1.shape)
print(x_val1.shape)
print(y_val1.shape)

(71280, 32, 32, 3)
(71280, 1)
(2000, 32, 32, 3)
(2000, 1)


In [296]:
x_extra2, x_val2, y_extra2, y_val2 = train_val_split_count(x_extra, y_extra, size = 400, seed = 333)

[432553 178960 197836 131442 524675 416661 144427 212716 381479  59914]


In [297]:
print(x_extra.shape)
print(y_extra2.shape)
print(x_val2.shape)
print(y_val2.shape)

(531131, 32, 32, 3)
(527153, 1)
(4000, 32, 32, 3)
(4000, 1)


In [303]:
from collections import Counter

Counter(list(y_val_all.flatten()))

Counter({1: 600,
         2: 600,
         3: 600,
         4: 600,
         5: 600,
         6: 600,
         7: 600,
         8: 600,
         9: 600,
         10: 600})

### Add together train and extra data

In [299]:
x_train_all = np.concatenate([x_train1, x_extra2])
y_train_all = np.concatenate([y_train1, y_extra2])

In [300]:
x_val_all = np.concatenate([x_val1, x_val2])
y_val_all = np.concatenate([y_val1, y_val2])

In [301]:
x_train_all.shape

(598433, 32, 32, 3)

In [302]:
y_val_all.shape

(6000, 1)

### Load from script

In [1]:
from load_data_svhn import load_data_svhn

In [2]:
(x_train, y_train), (x_val, y_val), (x_test, y_test) = load_data_svhn()

In [3]:
print(x_train.shape)
print(y_train.shape)

print(x_val.shape)
print(y_val.shape)

print(x_test.shape)
print(y_test.shape)

(598433, 32, 32, 3)
(598433, 1)
(6000, 32, 32, 3)
(6000, 1)
(26032, 32, 32, 3)
(26032, 1)
