In [1]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm

In [2]:
data = pd.read_csv('data/train.csv')
y = data['label'].values

posns = []

for i in range(len(data)):
    row = data.iloc[i].values[1:]

    idx = []
    for j in range(len(row)):
        if j % 3 == 0 and row[j] != -1:
            idx.append(j)

    posn = np.array([row[idx], row[[i+1 for i in idx]], row[[i+2 for i in idx]]]).T
    posns.append(posn)


In [3]:
# Save posns
np.savez('data/posns_train', *posns)
np.save('data/y_train', y)

In [4]:
data = np.load("data/posns_train.npz")
yp = np.load("data/y_train.npy")
data = list(data.values())
yp.shape

(60000,)

In [5]:
[d.shape for d in data] == [p.shape for p in posns]

True

In [6]:
thetas = [np.radians(0),np.radians(90), np.radians(180), np.radians(270)]
cos = [np.cos(th) for th in thetas]
sin = [np.sin(th) for th in thetas]

rots = np.array([np.array([[c, -s], [s, c]]) for c, s in zip(cos, sin)] + [np.array([[-1, 0], [0, 1]])])
rots = torch.tensor(rots, dtype=torch.float32).to('cuda')

In [7]:
rot_posns = []

for d in tqdm(data):
    d = torch.tensor(d, dtype=torch.float32).to('cuda')
    rot_d = d[:, :2] @ rots
    rot_d = torch.cat([rot_d, d[:, 2].reshape(1, len(d), 1).repeat(len(rots), 1, 1)], dim=2)

    rot_d = rot_d.cpu().numpy()
    for i in range(len(rot_d)):
        rot_posns.append(rot_d[i])

np.savez('data/posns_train_rot', *rot_posns)

100%|██████████| 60000/60000 [00:13<00:00, 4452.37it/s]


In [14]:
for noise in [0.05, 0.10]:

    noise_posns = []

    for pos in tqdm(rot_posns):
        n_pts = pos.shape[0]
        n_noise = int(n_pts * noise)

        noise_pts = np.random.randint(0, 28, size=(n_noise, 2))
        vnoise_val = int(pos[:, 2].sum() * noise / n_noise)
        vnoise = np.ones((n_noise, 1)) * vnoise_val

        noise_pts = np.concatenate([noise_pts, vnoise], axis=1)

        pos = np.concatenate([pos, noise_pts], axis=0)

        noise_posns.append(pos)

    np.savez(f'data/posns_train_noise_{int(noise*100)}', *noise_posns)
    print(len(noise_posns))

100%|██████████| 300000/300000 [00:09<00:00, 31861.44it/s]


300000


100%|██████████| 300000/300000 [00:08<00:00, 35654.36it/s]


300000


In [17]:
data = pd.read_csv('data/test.csv')
y = data['label'].values

posns = []

for i in range(len(data)):
    row = data.iloc[i].values[1:]

    idx = []
    for j in range(len(row)):
        if j % 3 == 0 and row[j] != -1:
            idx.append(j)

    posn = np.array([row[idx], row[[i+1 for i in idx]], row[[i+2 for i in idx]]]).T
    posns.append(posn)


In [18]:
# Save posns
np.savez('data/posns_test', *posns)
np.save('data/y_test', y)

In [20]:
data = np.load("data/posns_test.npz")
yp = np.load("data/y_test.npy")
data = list(data.values())
yp.shape

(10000,)

In [21]:
[d.shape for d in data] == [p.shape for p in posns]

True

In [22]:
rot_posns = []

for d in tqdm(data):
    d = torch.tensor(d, dtype=torch.float32).to('cuda')
    rot_d = d[:, :2] @ rots
    rot_d = torch.cat([rot_d, d[:, 2].reshape(1, len(d), 1).repeat(len(rots), 1, 1)], dim=2)

    rot_d = rot_d.cpu().numpy()
    for i in range(len(rot_d)):
        rot_posns.append(rot_d[i])

np.savez('data/posns_test_rot', *rot_posns)

  0%|          | 0/10000 [00:00<?, ?it/s]

100%|██████████| 10000/10000 [00:02<00:00, 4740.12it/s]


In [35]:
for noise in [0.05, 0.10,0.2,0.5]:

    noise_posns = []

    for pos in tqdm(rot_posns):
        n_pts = pos.shape[0]
        n_noise = int(n_pts * noise)

        noise_pts = np.random.randint(-10, 35, size=(n_noise, 2))
        vnoise_val = int(pos[:, 2].sum() * noise / n_noise)
        vnoise = np.ones((n_noise, 1)) * vnoise_val

        noise_pts = np.concatenate([noise_pts, vnoise], axis=1)

        pos = np.concatenate([pos, noise_pts], axis=0)

        noise_posns.append(pos)

    torch.save(noise_posns,f'data/posns_test_noise_{int(noise*100)}.pt')
    print(len(noise_posns))

100%|████████████████████████████████| 300000/300000 [00:08<00:00, 34412.14it/s]


300000


100%|████████████████████████████████| 300000/300000 [00:08<00:00, 35703.49it/s]


300000


100%|████████████████████████████████| 300000/300000 [00:08<00:00, 34318.51it/s]


300000


100%|████████████████████████████████| 300000/300000 [00:09<00:00, 32769.89it/s]


300000


In [None]:
# Visualize a digit
import matplotlib.pyplot as plt
posn = noise_posns[0]
plt.figure(figsize=(5, 5))
plt.scatter(posn[:, 0], posn[:, 1], c='black', s=posn[:, 2]/10)
plt.show()