# Content
This notebook compares different file formats (uncompressed and compressed) for saving preprocessed RGB input to disk (the results should carry over to lidar). <br>
--> Result: use .npy (uncompressed: larger but very fast to load) or .npz (compressed: smaller but slower to load).

In [1]:
import sys
sys.path.append("..")
from dataset import CARLADataset
from data_preprocessing import preprocessing
from torch.utils.data import DataLoader
from torchvision import transforms
import numpy as np
import torch
import os
import pandas as pd
import matplotlib.pyplot as plt
from zipfile import ZipFile, ZIP_DEFLATED
import zipfile
import shutil
import tarfile
from utils import train_test_split, create_metadata_df

from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Preprocess rgb files and save them as zipped/compressed torch tensors to disk

In [2]:
def compress_file(path_source, path_destination):
    """Zip a single file into ``<path_source>.zip`` next to the source file.

    The archive contains exactly one member, stored under the bare file name
    (no directory components), and is compressed with ZIP_DEFLATED.

    Args:
        path_source: Path of the file to compress. A bare file name refers to
            a file in the current working directory.
        path_destination: Currently ignored; kept so existing calls keep
            working. The archive is always written beside ``path_source``.

    Raises:
        FileNotFoundError: If ``path_source`` does not exist.
    """
    directory, file_name = os.path.split(path_source)
    path_zip = os.path.join(directory, file_name + ".zip")
    # Use an explicit arcname instead of os.chdir(): the working directory is
    # never touched, so a failure cannot leave the process in another folder,
    # and a bare file name (empty directory part) works as well.
    with ZipFile(path_zip, 'w', ZIP_DEFLATED) as archive:
        archive.write(path_source, arcname=file_name)

In [3]:
def extract_file(file_path):
    """Extract every member of a zip archive into the archive's own directory.

    Args:
        file_path: Path of the ``.zip`` file. A bare file name refers to an
            archive in the current working directory.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        zipfile.BadZipFile: If ``file_path`` is not a valid zip archive.
    """
    directory = os.path.dirname(file_path)
    # Pass the target folder to extractall() instead of os.chdir(): the
    # working directory stays untouched even when opening the archive fails.
    with ZipFile(file_path, 'r') as archive:
        archive.extractall(path=directory or os.curdir)

In [4]:
# "../../input/dataset-ege-1/Dataset Ege 1"

# Root directory of the recorded data, relative to this notebook.
path_data = os.path.join("..", "..", "data", "data")

# Which inputs and measurement channels the dataset should provide.
config = dict(
    used_inputs=["rgb", "measurements"],
    used_measurements=["speed", "steer", "throttle", "brake", "command"],
    seq_len=1,
)

# Build the metadata frame for the selected inputs, then the dataset on top of it.
df_meta_data = create_metadata_df(path_data, config["used_inputs"])
dataset = CARLADataset(
    root_dir=path_data,
    df_meta_data=df_meta_data,
    config=config,
)

batch_size = 64

In [27]:
# Preprocess every RGB frame and store it as a zipped torch tensor in "<dir>/rgb_prep_zip".
df_meta = dataset.df_meta_data
for idx in tqdm(range(len(df_meta))):
    row = df_meta.iloc[idx]  # look the row up once instead of on every access
    dir_name_zip = os.path.join(row["dir"], "rgb_prep_zip")
    os.makedirs(dir_name_zip, exist_ok=True)
    # Positional access has to go through .iloc: integer keys on a
    # label-indexed Series (row[0]) are deprecated in pandas.
    path = os.path.join(row.iloc[0], "rgb", row.iloc[1])
    img_np = dataset.load_data_from_path(path)
    img_torch = torch.Tensor(img_np)
    img_torch_prep = preprocessing["rgb"](img_torch)
    filename_torch = os.path.join(dir_name_zip, f"{row['rgb'].split('.')[0]}.pt")
    # Write the tensor, zip it next to itself, then drop the uncompressed copy.
    torch.save(img_torch_prep, filename_torch)
    compress_file(filename_torch, dir_name_zip)
    os.remove(filename_torch)

100%|██████████| 217/217 [00:22<00:00,  9.68it/s]


In [38]:
# Save every preprocessed RGB frame as .npy (or .npz) into a parallel "<...>_prep" directory tree.
df_meta = dataset.df_meta_data
for idx in tqdm(range(len(df_meta))):
    # Use the same positional row for every field so directory and file name always belong together.
    row = df_meta.iloc[idx]
    # Mirror the source directory, appending "_prep" to its third path component.
    # NOTE(review): index 2 is hard-coded and depends on how deeply the data root is nested — confirm when path_data changes.
    path_parts = row["dir"].split(os.sep)
    path_parts[2] += "_prep"
    dir_name_zip = os.path.join(*path_parts, "rgb")
    if not os.path.exists(dir_name_zip):
        os.makedirs(dir_name_zip)
        # Copy the measurements once, when the target rgb folder is first created.
        shutil.copytree(os.path.join(row["dir"], "measurements"), os.path.join(*path_parts, "measurements"))
    # Positional access has to go through .iloc: integer keys on a
    # label-indexed Series (row[0]) are deprecated in pandas.
    path = os.path.join(row.iloc[0], "rgb", row.iloc[1])
    img_np = dataset.load_data_from_path(path)
    img_torch = torch.Tensor(img_np)
    img_torch_prep = preprocessing["rgb"](img_torch)
    img_np_prep = img_torch_prep.numpy()
    filename_np = os.path.join(dir_name_zip, f"{row['rgb'].split('.')[0]}.npy")
    with open(filename_np, 'wb') as f:
        # Store the PREPROCESSED array; the raw img_np was written here before,
        # which left img_np_prep unused and the "_prep" tree holding raw frames.
        np.save(f, img_np_prep)
        # For the compressed variant use: np.savez_compressed(f, img_np_prep)

100%|██████████| 258866/258866 [47:20<00:00, 91.14it/s] 


In [23]:
# Display the dataset statistics returned by get_statistics()
dataset.get_statistics()

Unnamed: 0,rgb_in_GB,measurements_in_GB,time_hours
0,55.926342,1.138127,35.953611


## Preprocess rgb files and save them (high storage demand)

In [28]:
# Preprocess every RGB frame and store it as an uncompressed torch tensor in "<dir>/rgb_prep".
df_meta = dataset.df_meta_data
for idx in tqdm(range(len(df_meta))):
    row = df_meta.iloc[idx]  # look the row up once instead of on every access
    dir_name_zip = os.path.join(row["dir"], "rgb_prep")
    os.makedirs(dir_name_zip, exist_ok=True)
    # Positional access has to go through .iloc: integer keys on a
    # label-indexed Series (row[0]) are deprecated in pandas.
    path = os.path.join(row.iloc[0], "rgb", row.iloc[1])
    img_np = dataset.load_data_from_path(path)
    img_torch = torch.Tensor(img_np)
    img_torch_prep = preprocessing["rgb"](img_torch)
    filename_torch = os.path.join(dir_name_zip, f"{row['rgb'].split('.')[0]}.pt")
    torch.save(img_torch_prep, filename_torch)

100%|██████████| 217/217 [00:02<00:00, 91.49it/s]


## Compare the approaches (loading speed)

### Preprocessing on the fly

In [18]:
# Configuration for the on-the-fly run: raw rgb frames plus three measurement channels.
config = dict(
    used_inputs=["rgb", "measurements"],
    used_measurements=["speed", "steer", "throttle"],
    seq_len=1,
)

In [19]:
# Rebuild the dataset for the current config and wrap it in a single-process loader.
# NOTE(review): unlike the first dataset construction in this notebook, no df_meta_data
# is passed here — confirm that CARLADataset creates it on its own.
dataset = CARLADataset(
    root_dir=path_data,
    config=config,
)
dl = DataLoader(
    dataset=dataset,
    batch_size=16,
    num_workers=0,
    sampler=None,
)

In [20]:
# Time loading each raw RGB frame and preprocessing it on the fly (nothing is written to disk).
df_meta = dataset.df_meta_data
for idx in tqdm(range(len(df_meta))):
    row = df_meta.iloc[idx]
    # Positional access has to go through .iloc: integer keys on a
    # label-indexed Series (row[0]) are deprecated in pandas.
    path = os.path.join(row.iloc[0], "rgb", row.iloc[1])
    img_np = dataset.load_data_from_path(path)
    img_torch = torch.Tensor(img_np)
    img_torch_prep = preprocessing["rgb"](img_torch)

100%|██████████| 217/217 [00:01<00:00, 171.97it/s]


In [13]:
# Not a fair comparison: the DataLoader works on batches, while the Dataset class
# was not adapted for the other approaches (too time consuming).
count = 0
for batch in tqdm(dl):
    # Run each registered preprocessing step on the batch entry with the same key.
    for key, transform in preprocessing.items():
        batch[key] = transform(batch[key])
    count = count + 1
print(count)

100%|██████████| 14/14 [00:01<00:00, 10.34it/s]

14





### Preprocessing from disk and compressed

In [21]:
# Configuration for the compressed run: zipped preprocessed tensors plus three measurement channels.
config = dict(
    used_inputs=["rgb_prep_zip", "measurements"],
    used_measurements=["speed", "steer", "throttle"],
    seq_len=1,
)

In [22]:
# Rebuild the dataset for the current config and wrap it in a single-process loader.
dataset = CARLADataset(
    root_dir=path_data,
    config=config,
)
dl = DataLoader(
    dataset=dataset,
    batch_size=16,
    num_workers=0,
    sampler=None,
)

In [30]:
# Time loading the zipped tensors: unzip next to the archive, load, then delete the unzipped file again.
df_meta = dataset.df_meta_data
for idx in tqdm(range(len(df_meta))):
    row = df_meta.iloc[idx]
    # Positional access has to go through .iloc: integer keys on a
    # label-indexed Series (row[0]) are deprecated in pandas.
    path_compressed = os.path.join(row.iloc[0], "rgb_prep_zip", row.iloc[1])
    # Dropping the last extension of the archive name gives the name of the extracted file.
    path_decompressed = os.path.splitext(path_compressed)[0]
    extract_file(path_compressed)
    img_torch = torch.load(path_decompressed)
    os.remove(path_decompressed)


100%|██████████| 217/217 [00:02<00:00, 81.08it/s]


### Preprocessing from disk

In [34]:
# Configuration for the uncompressed run: preprocessed tensors stored as plain .pt files.
config = {"used_inputs": ["rgb_prep", "measurements"], 
        "used_measurements": ["speed", "steer", "throttle"],
        "seq_len": 1
        }

dataset = CARLADataset(root_dir=path_data, config=config)
dl = DataLoader(dataset=dataset, batch_size=16, num_workers=0)

# Time loading every preprocessed tensor straight from disk.
df_meta = dataset.df_meta_data
for idx in tqdm(range(len(df_meta))):
    row = df_meta.iloc[idx]
    # Positional access has to go through .iloc: integer keys on a
    # label-indexed Series (row[0]) are deprecated in pandas.
    path = os.path.join(row.iloc[0], "rgb_prep", row.iloc[1])
    img_torch = torch.load(path)

100%|██████████| 217/217 [00:00<00:00, 685.09it/s]


# Compare winning approaches (npz and npy) according to file size

In [5]:
ds_stats = dataset.get_statistics() # alt 55.93 (presumably the earlier rgb_in_GB value — TODO confirm)

In [6]:
# macOS Finder apparently counts 1 GB as 10**9 bytes --> 55.93 + 1.14 = 57.07
ds_stats 

Unnamed: 0,rgb_in_GB,measurements_in_GB,driving_time,%_of_entire_data
0,55.93,1.14,"1 day, 11:57:13",100.0


In [69]:
# Size of a single saved frame per format (presumably in MB — TODO confirm how these were measured).
npy_size = 0.416
npz_size = 0.380

# Size of the original rgb data, in GB and scaled by 1000.
rgb_entire_size = ds_stats["rgb_in_GB"].item()
rgb_entire_size_mb = rgb_entire_size * 1000

# Extrapolate the per-frame size to the whole dataset; dividing by 1000 mirrors the scaling above.
n_frames = len(df_meta_data)
predicted_size_npy = npy_size * n_frames / 1000
predicted_size_npz = npz_size * n_frames / 1000

In [70]:
# Predicted total size of the .npy files: per-frame size times number of frames, divided by 1000
predicted_size_npy

107.688256

## Outdated functions

In [None]:
def compress_file(file_path, dest_path):
    """Legacy variant: zip a single file into ``<file_path>.zip`` via shutil.make_archive.

    The archive is written next to the source file and contains the file
    under its bare name.

    Args:
        file_path: Path of the file to compress. A bare file name refers to
            a file in the current working directory.
        dest_path: Currently ignored; kept so existing calls keep working.
    """
    # Split the path into the containing directory and the file name
    directory, file_name = os.path.split(file_path)
    root_dir = directory or os.curdir
    # No os.chdir() here: changing into `directory` first and then passing the
    # same relative `directory` as root_dir made make_archive look in
    # "<directory>/<directory>". root_dir alone already anchors the member name.
    shutil.make_archive(os.path.join(root_dir, file_name), 'zip', root_dir, file_name)

In [None]:
def extract_file(file_path):
    """Legacy variant: list a zip archive's contents and extract it into the current working directory.

    Args:
        file_path: Path of the ``.zip`` file to extract.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        zipfile.BadZipFile: If ``file_path`` is not a valid zip archive.
    """
    # The body used to reference an undefined name `path_source` and raised
    # NameError on every call; the archive path is the `file_path` parameter.
    with ZipFile(file_path, 'r') as archive:
        archive.printdir()  # print the table of contents to stdout
        archive.extractall()  # no path given: members land in the current working directory

In [None]:
def extract_file(file_path):
    """Legacy variant: unpack a zip archive into its own directory via shutil.unpack_archive.

    Args:
        file_path: Path of the ``.zip`` file. A bare file name refers to an
            archive in the current working directory.
    """
    directory = os.path.dirname(file_path)
    # No os.chdir() here: changing into `directory` first and then passing the
    # same relative `directory` as extract_dir unpacked into
    # "<directory>/<directory>" instead of next to the archive.
    shutil.unpack_archive(file_path, directory or os.curdir, "zip")
    os.chdir(path_pwd)