In [1]:
from dataset import Dataset, Sample
import utils
import numpy as np
import torch

In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [3]:
import pickle

In [4]:
# Location of the serialized training set, kept in one named place.
DATASET_DIR = "datasets/gds/training"

# Load the dataset from disk via the project's Dataset loader.
ds = Dataset.Load(DATASET_DIR)

In [5]:
class PreprocessedSample(object):
    """A dataset sample after the fixed preprocessing pipeline has been applied.

    The raw ``sample.trajectory`` is passed through ``utils.resample`` and then
    ``utils.scale_to_unit_square``; ``utils.vectorize`` is applied to that result.

    Attributes:
        trajectory: the resampled, unit-square-scaled trajectory.
        vecs: output of ``utils.vectorize`` on ``trajectory``.
        gname: copied unchanged from ``sample.gname`` (presumably the
            gesture/class label -- TODO confirm against ``dataset.Sample``).
    """

    def __init__(self, sample: Sample, resample_count=64):
        """Preprocess ``sample``.

        Args:
            sample: the raw sample; its ``trajectory`` and ``gname`` are read.
            resample_count: forwarded as the second argument of
                ``utils.resample`` (presumably the number of points to
                resample to -- verify in ``utils``).
        """
        resampled = utils.resample(sample.trajectory, resample_count)
        self.trajectory = utils.scale_to_unit_square(resampled)
        self.vecs = utils.vectorize(self.trajectory)
        self.gname = sample.gname

In [6]:
# Run every raw sample through PreprocessedSample (resample, scale to the unit
# square, vectorize) and collect the results in dataset order.
#
# The second argument of enumerate() is the *start value of the counter*, not a
# resample count. Counting from 1 and reporting after the sample has been
# stored makes each progress message state the true number of finished samples.
# The resample count is passed to PreprocessedSample explicitly instead.
preprocessed_samples = []
for i, s in enumerate(ds.samples, 1):
    ps = PreprocessedSample(s, resample_count=64)
    preprocessed_samples.append(ps)
    if i % 1000 == 0:
        print(f"processed {i} samples")

processed 1000 samples
processed 2000 samples
processed 3000 samples
processed 4000 samples


In [7]:
# Build the feature matrix: one flattened trajectory per row, stored as float32.
# (The recorded run printed (4800, 128) for this shape.)
flat_trajectories = [p.trajectory.flatten() for p in preprocessed_samples]
preprocessed_x = np.array(flat_trajectories).astype(np.float32)
print(preprocessed_x.shape)

(4800, 128)


In [11]:
import os
# Create the output directory for the flattened dataset. exist_ok=True keeps
# this cell re-runnable: plain os.mkdir raises FileExistsError once the
# directory exists, which breaks "Restart Kernel -> Run All".
os.makedirs("datasets/gds_as_flat_series/", exist_ok=True)

In [12]:
# One-hot encode the per-sample `gname` labels into a dense float32 matrix.
# OneHotEncoder expects a 2-D input, hence the reshape to a single column.
label_column = np.array([p.gname for p in preprocessed_samples]).reshape(-1, 1)
oh = OneHotEncoder()
preprocessed_y = oh.fit_transform(label_column).toarray().astype(np.float32)
print(preprocessed_y.shape)

# Persist the fitted encoder so the same label <-> column mapping can be reused later.
# NOTE: the target directory must already exist when this cell runs.
with open("datasets/gds_as_flat_series/oh_encoder", "wb") as f:
    pickle.dump(oh, f)

(4800, 16)


In [13]:
# Stratified, seeded 80/20 split of features and one-hot labels; the four
# resulting NumPy arrays are then wrapped as torch tensors so they can be
# saved locally for quick loads in the future.
split_arrays = train_test_split(
    preprocessed_x,
    preprocessed_y,
    test_size=0.2,
    train_size=0.8,
    random_state=42,
    shuffle=True,
    stratify=preprocessed_y,
)

# train_test_split returns (X_train, X_test, y_train, y_test) in this order.
X_train, X_test, y_train, y_test = (torch.from_numpy(arr) for arr in split_arrays)

In [14]:
# Write each split tensor to its own file; dict order matches the original
# save order (X_train, X_test, y_train, y_test).
split_files = {
    "datasets/gds_as_flat_series/X_train_ts.pt": X_train,
    "datasets/gds_as_flat_series/X_test_ts.pt": X_test,
    "datasets/gds_as_flat_series/y_train_ts.pt": y_train,
    "datasets/gds_as_flat_series/y_test_ts.pt": y_test,
}
for split_path, split_tensor in split_files.items():
    torch.save(split_tensor, split_path)