In [1]:
from data_sampler import WeightedSampler
from dataset import CARLADataset, CARLADatasetMultiProcessing
from data_preprocessing import preprocessing
from torch.utils.data import DataLoader
from torchvision import transforms
import numpy as np
import torch
import os
import pandas as pd
import matplotlib.pyplot as plt
from zipfile import ZipFile, ZIP_DEFLATED
import zipfile
import shutil
import tarfile

from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Preprocess rgb files and save them as zipped/compressed torch tensors to disk

In [4]:
def compress_file(path_source, path_destination):
    """Compress a single file into a sibling ``<file name>.zip`` archive.

    The archive is created in the same directory as ``path_source`` and holds
    exactly one member, stored under the source's bare file name (no
    directory components).

    Args:
        path_source: Path of the file to compress. A bare file name (no
            directory part) is resolved against the current working directory.
        path_destination: Currently unused; kept so existing callers that pass
            a destination directory keep working.

    Raises:
        OSError: If the source file cannot be read or the archive cannot be
            written (e.g. ``FileNotFoundError`` for a missing source).
    """
    directory, file_name = os.path.split(path_source)
    path_zip = os.path.join(directory, file_name + ".zip")
    # arcname fixes the member name to the bare file name, so the process-wide
    # working directory never has to be changed (and can never be left changed
    # when zipping fails).
    with ZipFile(path_zip, 'w', ZIP_DEFLATED) as archive:
        archive.write(path_source, arcname=file_name)

In [5]:
def extract_file(file_path):
    """Extract every member of a zip archive into the archive's own directory.

    Args:
        file_path: Path of the ``.zip`` archive. A bare file name (no
            directory part) is extracted into the current working directory.

    Raises:
        OSError: If the archive cannot be opened (e.g. ``FileNotFoundError``).
        zipfile.BadZipFile: If ``file_path`` is not a valid zip archive.
    """
    directory = os.path.dirname(file_path)
    with ZipFile(file_path, 'r') as archive:
        # Passing the target explicitly avoids changing the process-wide
        # working directory, which would stay changed if extraction failed.
        archive.extractall(path=directory or os.curdir)

In [25]:
# Root folder of the recordings to preprocess (relative to the notebook).
path_data = os.path.join("..", "data", "Dataset Ege")
# path_data = os.path.join("..", "data", "data")


# Options handed to CARLADataset; presumably they select which inputs and
# measurement fields are loaded -- confirm against dataset.py.
config = {
    "used_inputs": ["rgb", "measurements"],
    "used_measurements": ["speed", "steer", "throttle"],
    "seq_len": 1,
}

dataset = CARLADataset(root_dir=path_data, config=config)
# len() is the idiomatic spelling of a direct __len__() call.
print(len(dataset))

Varying number files among input types: ../data/Dataset Ege/Town10HD_Scenario10_route16_11_28_18_26_19/rgb_prep_zip
217


In [27]:
# For every row of the dataset's meta table: load the raw rgb file, apply the
# "rgb" preprocessing, save the result as a .pt file in an "rgb_prep_zip"
# folder, zip that file and remove the uncompressed copy.
df_meta = dataset.df_meta_data
for idx in tqdm(range(len(df_meta))):
    row = df_meta.iloc[idx]
    dir_name_zip = os.path.join(row["dir"], "rgb_prep_zip")
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(dir_name_zip, exist_ok=True)
    # .iloc makes the positional column access explicit; indexing a
    # label-indexed Series with a plain integer is deprecated in pandas.
    path = os.path.join(row.iloc[0], "rgb", row.iloc[1])
    img_np = dataset.load_data_from_path(path)
    img_torch = torch.Tensor(img_np)
    img_torch_prep = preprocessing["rgb"](img_torch)
    # The output file name is everything before the first "." of the rgb
    # file name, plus ".pt".
    filename_torch = os.path.join(dir_name_zip, f"{row['rgb'].split('.')[0]}.pt")
    torch.save(img_torch_prep, filename_torch)
    compress_file(filename_torch, dir_name_zip)
    # Only the zipped tensor is kept on disk.
    os.remove(filename_torch)

100%|██████████| 217/217 [00:22<00:00,  9.68it/s]


## Preprocess rgb files and save them (high storage demand)

In [28]:
# For every row of the dataset's meta table: load the raw rgb file, apply the
# "rgb" preprocessing and save the result uncompressed as a .pt file in an
# "rgb_prep" folder.
df_meta = dataset.df_meta_data
for idx in tqdm(range(len(df_meta))):
    row = df_meta.iloc[idx]
    # NOTE: despite its name, this variable holds the uncompressed
    # "rgb_prep" output folder in this cell.
    dir_name_zip = os.path.join(row["dir"], "rgb_prep")
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(dir_name_zip, exist_ok=True)
    # .iloc makes the positional column access explicit; indexing a
    # label-indexed Series with a plain integer is deprecated in pandas.
    path = os.path.join(row.iloc[0], "rgb", row.iloc[1])
    img_np = dataset.load_data_from_path(path)
    img_torch = torch.Tensor(img_np)
    img_torch_prep = preprocessing["rgb"](img_torch)
    # The output file name is everything before the first "." of the rgb
    # file name, plus ".pt".
    filename_torch = os.path.join(dir_name_zip, f"{row['rgb'].split('.')[0]}.pt")
    torch.save(img_torch_prep, filename_torch)

100%|██████████| 217/217 [00:02<00:00, 91.49it/s]


## Compare the approaches

### Preprocessing on the fly

In [18]:
# Dataset options for the "preprocess on the fly" run: raw rgb input.
config = {
    "used_inputs": ["rgb", "measurements"],
    "used_measurements": ["speed", "steer", "throttle"],
    "seq_len": 1,
}

In [19]:
# Build the dataset from the current config, then a sampler and a batched
# loader on top of it.
dataset = CARLADataset(
    root_dir=path_data,
    config=config,
)
weighted_sampler = WeightedSampler(dataset=dataset)
dl = DataLoader(
    dataset=dataset,
    batch_size=16,
    num_workers=0,
    sampler=weighted_sampler,
)

In [20]:
# Timing loop: load each raw rgb file and apply the "rgb" preprocessing in
# memory. The result is discarded; only the tqdm throughput is of interest.
df_meta = dataset.df_meta_data
for idx in tqdm(range(len(df_meta))):
    row = df_meta.iloc[idx]
    # .iloc makes the positional column access explicit; indexing a
    # label-indexed Series with a plain integer is deprecated in pandas.
    path = os.path.join(row.iloc[0], "rgb", row.iloc[1])
    img_np = dataset.load_data_from_path(path)
    img_torch = torch.Tensor(img_np)
    img_torch_prep = preprocessing["rgb"](img_torch)

100%|██████████| 217/217 [00:01<00:00, 171.97it/s]


In [13]:
# Note: timing the (batched) DataLoader is not a fair comparison, because the
# Dataset class was not adapted for the other approaches (too time consuming).
count = 0
for batch in tqdm(dl):
    # Run every preprocessing step on the batch entry stored under its key.
    for key, transform in preprocessing.items():
        batch[key] = transform(batch[key])
    count = count + 1
print(count)

100%|██████████| 14/14 [00:01<00:00, 10.34it/s]

14





### Preprocessing from disk and compressed

In [21]:
# Dataset options for the "from disk, compressed" run: zipped tensors.
config = {
    "used_inputs": ["rgb_prep_zip", "measurements"],
    "used_measurements": ["speed", "steer", "throttle"],
    "seq_len": 1,
}

In [22]:
# Build the dataset from the current config, then a sampler and a batched
# loader on top of it.
dataset = CARLADataset(
    root_dir=path_data,
    config=config,
)
weighted_sampler = WeightedSampler(dataset=dataset)
dl = DataLoader(
    dataset=dataset,
    batch_size=16,
    num_workers=0,
    sampler=weighted_sampler,
)

In [30]:
# Timing loop: unzip each stored tensor next to its archive, load it, then
# delete the extracted copy again.
df_meta = dataset.df_meta_data
for idx in tqdm(range(len(df_meta))):
    row = df_meta.iloc[idx]
    # .iloc makes the positional column access explicit; indexing a
    # label-indexed Series with a plain integer is deprecated in pandas.
    path_compressed = os.path.join(row.iloc[0], "rgb_prep_zip", row.iloc[1])
    # The extracted file is expected under the archive path minus its last
    # extension.
    path_decompressed = os.path.splitext(path_compressed)[0]
    extract_file(path_compressed)
    try:
        img_torch = torch.load(path_decompressed)
    finally:
        # Remove the temporary extracted file even when loading fails.
        os.remove(path_decompressed)


100%|██████████| 217/217 [00:02<00:00, 81.08it/s]


### Preprocessing from disk

In [34]:
# Dataset options for the "from disk" run: uncompressed preprocessed tensors.
config = {
    "used_inputs": ["rgb_prep", "measurements"],
    "used_measurements": ["speed", "steer", "throttle"],
    "seq_len": 1,
}

dataset = CARLADataset(root_dir=path_data, config=config)
weighted_sampler = WeightedSampler(dataset=dataset)
dl = DataLoader(dataset=dataset, batch_size=16, num_workers=0, sampler=weighted_sampler)

# Timing loop: load each stored tensor directly; the DataLoader above is not
# used here.
df_meta = dataset.df_meta_data
for idx in tqdm(range(len(df_meta))):
    row = df_meta.iloc[idx]
    # .iloc makes the positional column access explicit; indexing a
    # label-indexed Series with a plain integer is deprecated in pandas.
    path = os.path.join(row.iloc[0], "rgb_prep", row.iloc[1])
    img_torch = torch.load(path)

100%|██████████| 217/217 [00:00<00:00, 685.09it/s]


## Move unused sensor data to another directory

In [55]:
# Folder whose unused sensor sub-folders get moved away.
# root_dir = os.path.join("..", "data", "Dataset Ege")
root_dir = os.path.join("..", "data", "data")
# Names of the leaf folders that stay in place.
keep_input = [
    "lidar",
    "rgb",
    "measurements",
]

In [56]:
# Move every leaf folder whose name is not listed in keep_input into a
# parallel tree: "<grandparent> unused/<parent name>/<leaf name>".
for root, dirs, files in os.walk(root_dir, topdown=True):
    # Only leaf folders (no sub-directories) are candidates. The walk root is
    # skipped so a root_dir without sub-folders is never moved as a whole.
    if dirs or root == root_dir:
        continue
    # Named parent_dir rather than dir so the builtin dir() is not shadowed
    # for the rest of the kernel session.
    parent_dir, input_type = os.path.split(root)
    if input_type in keep_input:
        continue
    grandparent_dir, parent_name = os.path.split(parent_dir)
    dir_new = os.path.join(grandparent_dir + " unused", parent_name)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(dir_new, exist_ok=True)
    # dir_new exists, so the leaf folder is moved inside it under its own name.
    shutil.move(root, dir_new)

## Outdated functions

In [None]:
def compress_file(file_path, dest_path):
    """Legacy variant: zip a single file with ``shutil.make_archive``.

    Writes ``<file name>.zip`` next to ``file_path``; the archive holds the
    file under its bare name. This cell sits under "Outdated functions" and
    reuses the name of the ZipFile-based ``compress_file`` defined earlier in
    the notebook, so running it rebinds that name.

    Args:
        file_path: Path of the file to compress (absolute or relative).
        dest_path: Currently unused; kept for signature compatibility.
    """
    directory, file_name = os.path.split(file_path)
    # No os.chdir: changing directory first made a relative ``directory``
    # resolve against itself and left the cwd changed when archiving failed.
    shutil.make_archive(
        os.path.join(directory, file_name),  # ".zip" is appended by make_archive
        'zip',
        root_dir=directory or os.curdir,
        base_dir=file_name,
    )

In [None]:
def extract_file(file_path):
    """Legacy variant: list a zip archive's contents and extract it.

    Prints the archive's table of contents, then extracts all members into
    the current working directory. This cell sits under "Outdated functions"
    and reuses the name of the ``extract_file`` defined earlier in the
    notebook, so running it rebinds that name.

    Args:
        file_path: Path of the ``.zip`` archive to extract.
    """
    # The body previously referenced an undefined name (``path_source``) and
    # raised NameError; the parameter is used instead.
    with ZipFile(file_path, 'r') as archive:
        archive.printdir()
        archive.extractall()

In [None]:
def extract_file(file_path):
    """Legacy variant: unpack a zip archive with ``shutil.unpack_archive``.

    Extracts all members into the directory that contains the archive. This
    cell sits under "Outdated functions" and reuses the name of the
    ``extract_file`` defined earlier in the notebook, so running it rebinds
    that name.

    Args:
        file_path: Path of the ``.zip`` archive (absolute or relative).
    """
    directory = os.path.dirname(file_path)
    # No os.chdir: changing directory first made a relative ``directory``
    # resolve against itself, so files landed in the wrong place.
    shutil.unpack_archive(file_path, directory or os.curdir, "zip")