In [2]:
import sys
import os
from torch.utils.data import DataLoader
sys.path.append("..")
from tqdm import tqdm
import shutil
import torch
import numpy as np

from utils import train_test_split, create_metadata_df
from dataset_xy import CARLADatasetXY
from dataset import CARLADataset
from data_preprocessing import preprocessing, transform_lidar_bev

from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


The goal of this notebook is to develop a method that saves the preprocessed data from `df_train`, `df_test_1` and `df_test_2` in as few folders as possible. Furthermore, the data from `df_train` is stored in shuffled order, so that we no longer have to shuffle explicitly during training and thereby avoid random reads.

In [56]:
# Root folder from which the metadata frame is built and the samples are loaded.
# NOTE(review): an earlier assignment to os.path.join("..", "..", "data", "data") was
# overwritten on the very next line (dead store) and has been dropped here. The value
# kept below is the same folder the export loop of this notebook writes its
# "train_set" split to (dir_destination + "train_set"). If ../../data/data holds the
# original recordings, point path_data back to it before re-running the export.
path_data = os.path.join("..", "..", "data", "train_set")

# Dataset configuration. "used_inputs" also decides which inputs the metadata frame
# covers; the remaining keys are interpreted by CARLADatasetXY.
config_xy = {
    "used_inputs": ["rgb", "lidar", "measurements"],
    "used_measurements": ["speed", "steer", "throttle", "brake", "command"],
    "y": ["brake", "steer", "throttle"],
    "seq_len": 1,
}

df_meta_data = create_metadata_df(path_data, config_xy["used_inputs"])
dataset = CARLADatasetXY(root_dir=path_data, df_meta_data=df_meta_data, config=config_xy)

In [58]:
# Sequential loader over `dataset` (no shuffle argument is passed); num_workers=0
# keeps data loading in the main process.
dl = DataLoader(dataset=dataset, batch_size=64, num_workers=0)

In [59]:
# Smoke test / timing run: pull every batch once and discard it, so the tqdm bar
# reports how fast the loader can deliver batches.
for x,y in tqdm(dl):
    pass

100%|██████████| 14/14 [00:02<00:00,  4.79it/s]


In [48]:
# Towns assigned to the train and the test side. How these lists map onto the three
# returned frames is defined by utils.train_test_split.
train_test_config = {
    "train": ['Town00', 'Town01', 'Town02', 'Town03', 'Town04', 'Town05', 'Town07', 'Town08', 'Town09', 'Town10'],
    "test": ['Town06']
}

df_train, df_test_1, df_test_2 = train_test_split(df_meta_data, towns_intersect=train_test_config)

# Shuffle the training rows once, reproducibly, so the exported files are already in
# random order.
np.random.seed(42)
# Passing an int samples from np.arange(n); with replace=False and size=n this is a
# permutation of all row positions.
indices_rand = np.random.choice(len(df_train), size=len(df_train), replace=False)
# indices_rand holds row *positions*, so select with iloc. loc would look them up as
# index labels, which only works if df_train happens to carry a default 0..n-1 index.
df_train_shuffled = df_train.iloc[indices_rand].reset_index(drop=True)

In [49]:
# One CARLADatasetXY per split, keyed by the folder name the split is exported to.
# All splits share the same root directory and dataset configuration.
dataset_map = {
    split_name: CARLADatasetXY(root_dir=path_data, df_meta_data=df_split, config=config_xy)
    for split_name, df_split in (
        ("train_set", df_train_shuffled),
        ("test_set_1", df_test_1),
        ("test_set_2", df_test_2),
    )
}
# Keep the individual handles available under their usual names.
dataset_train = dataset_map["train_set"]
dataset_test_1 = dataset_map["test_set_1"]
dataset_test_2 = dataset_map["test_set_2"]

In [50]:
# Parent folder of the export; the export loop creates one sub-folder per split key
# below it.
dir_destination = os.path.join("..", "..", "data")
# When True, the export loop applies the matching preprocessing function to each
# loaded sample before saving it.
normalize = True

In [52]:
# Export every split into a flat layout below dir_destination:
#   <dir_destination>/<split key>/<input name>/<zero-padded sample index>.{npy,json}
for dataset_key, split_dataset in dataset_map.items():
    path_dataset = os.path.join(dir_destination, dataset_key)

    # Use dedicated names here: rebinding the notebook-level `dataset` /
    # `df_meta_data` would silently change what other cells see after the export
    # (e.g. a later len(df_meta_data) would refer to the last split only).
    df_split = split_dataset.df_meta_data
    num_samples = len(df_split)
    # Pad all file stems to a common width so lexicographic order equals sample order.
    stem_width = len(str(num_samples))

    # Create the split folder and one sub-folder per input once, up front, instead
    # of re-checking their existence for every single sample.
    os.makedirs(path_dataset, exist_ok=True)
    path_by_input = {}
    for input_name in split_dataset.used_inputs:
        path_by_input[input_name] = os.path.join(path_dataset, input_name)
        os.makedirs(path_by_input[input_name], exist_ok=True)

    for idx in tqdm(range(num_samples)):
        file_stem = str(idx).zfill(stem_width)
        # `input_name` rather than `input`, so the builtin is not shadowed kernel-wide.
        for input_name in split_dataset.used_inputs:
            path_dataset_input = path_by_input[input_name]

            # Load, optionally preprocess and save inputs that have a preprocessing function.
            if input_name in preprocessing:
                # idx is a row position, hence iloc rather than label-based lookup.
                path_data_load = os.path.join(
                    df_split["dir"].iloc[idx], input_name, df_split[input_name].iloc[idx]
                )
                data = split_dataset.load_data_from_path(path_data_load)
                if normalize:
                    if input_name == "rgb":
                        # Presumably the rgb preprocessing expects a tensor -- TODO confirm.
                        data = torch.Tensor(data)
                    data = preprocessing[input_name](data)
                path_data_save = os.path.join(path_dataset_input, f"{file_stem}.npy")
                np.save(path_data_save, data)

            # Measurements are copied unchanged.
            if input_name == "measurements":
                path_data_source = os.path.join(
                    df_split["dir"].iloc[idx], input_name, df_split[input_name].iloc[idx]
                )
                path_data_destination = os.path.join(path_dataset_input, f"{file_stem}.json")
                shutil.copy(path_data_source, path_data_destination)

  0%|          | 845/205535 [00:14<57:20, 59.50it/s]  


KeyboardInterrupt: 

In [6]:
# Estimated size of the entire preprocessed dataset in GB (lidar still needs to be
# normalized, so this is not final). The two summands are presumably per-sample
# sizes in MB -- TODO confirm which input each one belongs to.
size_per_sample = 0.461 + 1.8
(size_per_sample * len(df_meta_data)) / 1000

585.182976