# Converter
- Quick-and-dirty notebook that runs every command needed to convert all BDD100K video chunks to TFRecords and upload them to S3

In [None]:
# Housekeeping for the S3 prefix that receives the converted TFRecords.
# The recursive delete is intentionally left commented out (and carries --dryrun);
# enable it only when the prefix really has to be emptied.
#!aws s3 rm s3://s-laion/ssd-videos/ --recursive --exclude='' --dryrun
# List the current contents of the prefix.
!aws s3 ls s3://s-laion/ssd-videos/ --human-readable --summarize

Adjust `truncate_frames` to fully recover the 100M images

In [None]:
%cd /opt/awesome/data/

from subprocess import run, call, PIPE, DEVNULL
from tqdm import tqdm
from tqdm_logger import TqdmLogger
import os, sys
import tensorflow as tf
import logging
logging.basicConfig(stream=sys.stderr, level=logging.INFO)

# Root of the working tree. Paths are built by plain string concatenation,
# so the trailing `/` is mandatory.
base_path = '/opt/awesome/'
# The video parts are published as numbered zip files per split (train/test/val).
base_download_url = "http://dl.yf.io/bdd100k/video_parts/"


def urls(x, y):
    '''
    Builds the download URLs for the first `y` zip parts of split `x`.
    Part indices start at 0 and are zero-padded to two digits.
    '''
    part_urls = []
    for index in range(y):
        padded_index = str(index).zfill(2)
        part_urls.append("{}bdd100k_videos_{}_{}.zip".format(base_download_url, x, padded_index))
    return part_urls


# Number of parts generated per split.
train_urls = urls('train', 70)
test_urls = urls('test', 20)
val_urls = urls('val', 10)
# The test split is left out because it has no ground truth.
final_url_list = train_urls + val_urls

# First chunk and a starting index, kept as module-level names.
target_url = final_url_list[0]
iteration_index = 1

def _run_step(description, command, **run_kwargs):
    '''
    Runs one external command of the conversion pipeline.

    stderr is captured, and its tail is logged as a warning whenever the
    command exits with a non-zero status, so failures are no longer
    discarded silently. The pipeline itself is not interrupted.

    description -- short label used in the warning message
    command     -- argument list, or a single string when shell=True is passed
    run_kwargs  -- forwarded to subprocess.run (e.g. shell=True, stdout=DEVNULL)

    Returns the subprocess.CompletedProcess of the command.
    '''
    result = run(command, stderr=PIPE, **run_kwargs)
    if result.returncode != 0:
        stderr_tail = (result.stderr or b'').decode('utf-8', errors='replace').strip()[-500:]
        logging.warning('%s exited with status %d: %s', description, result.returncode, stderr_tail)
    return result


def convert_chunk_to_tfrecords(url, it_num):
    '''
    Converts a video part URL to TFRecords and uploads them to AWS
    bucket, ensuring minimal storage usage by deleting redundant copies

    url    -- download URL of one zipped video part
    it_num -- chunk index; zero-padded to six digits to name the uploaded TFRecord

    Relies on the module-level `base_path` and on the working directory being
    `{base_path}data/`: the archive is downloaded into the working directory
    and the relative `bdd100k` / `info` paths are resolved against it.
    '''
    archive_name = url.split('/')[-1]
    videos_dir = base_path + 'data/bdd100k/videos/'
    tfrecords_dir = base_path + 'data/bdd100k/tfrecords/'
    merged_record = base_path + 'data/{}.tfrecord'.format(str(it_num).zfill(6))

    #Downloading video part/chunk
    #(uses the `url` argument; the global `target_url` always pointed at chunk 0)
    logging.info('--- Download Started ---')
    _run_step('Download', ['wget', url, '--show-progress'])
    logging.info('Video Part Download Complete!')

    #Deleting useless files left over from the previous chunk
    _run_step(
        'Cleanup of previous chunk',
        'rm -rf bdd100k {}mytemp/*'.format(base_path),
        shell=True)

    #Unzipping the data, then dropping the archive to save space
    _run_step(
        'Unzip',
        'unzip {}data/{} && rm *.zip'.format(base_path, archive_name),
        shell=True)

    #renaming any subdirectory to train and moving info
    _run_step(
        'Directory setup',
        'mv {}* {}train; mv ./info ./bdd100k/'.format(videos_dir, videos_dir),
        shell=True)

    logging.info('Unzipping and setup completed!')

    #Processing - Indexing
    _run_step(
        'Indexing (filter.py)',
        [
            'python3',
            '{}scripts/BDD_Driving_Model/data_prepare/filter.py'.format(base_path),
            '{}data/bdd100k/'.format(base_path),
        ],
        stdout=DEVNULL)

    #Converting to TFrecords - estimated time: 3-5 mins/per video part
    _run_step(
        'TFRecord conversion',
        (
            "mkdir {bp}data/bdd100k/tfrecords; " #creating dummy folder
            "python3 {bp}scripts/BDD_Driving_Model/data_prepare/prepare_tfrecords.py "
            "--video_index={bp}data/bdd100k/video_filtered_38_60.txt "
            "--output_directory={bp}data/bdd100k/tfrecords "
            "--temp_dir_root={bp}mytemp --num_threads=40 "
            "--truncate_frames=1200; "
            "mv {bp}data/bdd100k/info {bp}data/" #moving info back, keeping only single copy
        ).format(bp=base_path),
        shell=True)

    #removing redundant files and clearing trash due to storage constraints
    _run_step('Video removal', ['rm', '-rf', base_path + 'data/bdd100k/videos'])
    #the `*` must be expanded by a shell; as a plain argument it matched nothing
    _run_step(
        'Trash cleanup',
        'rm -rf {}.local/share/Trash/*'.format(base_path),
        shell=True)

    logging.info('=== Video Part Fully Processed ===\n')

    #sorted so the record order inside the merged file is reproducible
    tfrecord_list = [tfrecords_dir + name for name in sorted(os.listdir(tfrecords_dir))]

    dataset = tf.data.TFRecordDataset(tfrecord_list)
    writer = tf.data.experimental.TFRecordWriter(merged_record)
    writer.write(dataset) #writing the final TFRecord locally
    #because write to AWS is bugged and not fixed with .io.TFRecordWriter

    #Moving final TFRecord to AWS
    _run_step('Upload to S3', ['aws', 's3', 'mv', merged_record, 's3://s-laion/ssd-videos/'])

base_path = '/opt/awesome/' #should've `/` in the end

#tqdm output is written through TqdmLogger to this log file
log_file = '{}data/tqdm_progress.log'.format(base_path)
tqdm_stream = TqdmLogger(log_file)

#setup stream for streaming TQDM logs
tqdm_stream.reset()

#Processing each part of the dataset
#tqdm wraps the list itself rather than the enumerate object (which has no
#length), so the progress log carries the total and an ETA
for it_index, chunk_url in enumerate(tqdm(final_url_list, file = tqdm_stream)):
    convert_chunk_to_tfrecords(chunk_url, it_index)

#Closing up everything and preventing execution of following cells
exit()

861G
- `540` --> `6G`
- `1200` --> `14G`
- `1080` --> `13G` [__FASTER__]

In [1]:
# Count the entries in the sample TFRecord directory, then report its disk usage.
!cd /opt/awesome/data/sample_tfrecs; ls | wc -l
!cd /opt/awesome/data/; du -h ./sample_tfrecs

729
14G	./sample_tfrecs


In [19]:
import tensorflow as tf
import json
from google.protobuf.json_format import MessageToJson

# TFRecord file to inspect; switch to the commented path to look at a merged file instead.
path = "/opt/awesome/data/sample_tfrecs/00a2e3ca-5c856cde.tfrecords"
#path = "./000001.tfrecord"

raw_dataset = tf.data.TFRecordDataset(path)

# Decode only the first serialized record and pull out its speed values.
# take(1) yields at most one element, so no explicit break is needed.
for raw_record in raw_dataset.take(1):
    example = tf.train.Example.FromString(raw_record.numpy())
    feature_map = example.features.feature
    # Other fields that can be printed the same way:
    #print(feature_map['image/height'].int64_list.value)
    #print(feature_map['image/width'].int64_list.value)
    #print(feature_map['image/format'].bytes_list.value)
    speeds = feature_map['image/speeds']
    plt_speeds = speeds.float_list.value

```py
for d in raw_dataset:
    ex = tf.train.Example()
    ex.ParseFromString(d.numpy())
    m = json.loads(MessageToJson(ex))
    break

m['features']['feature']['image/speeds']['floatList']['value']
```

In [None]:
# Manual, one-off run of prepare_tfrecords.py on an already indexed chunk.
# NOTE(review): this cell works under /home/awesome/ while the other cells use /opt/awesome/ — confirm which root is current.
# NOTE(review): no --truncate_frames is passed here, unlike the call inside convert_chunk_to_tfrecords — confirm this is intended.
%cd /home/awesome/
!mkdir /home/awesome/data/bdd100k/tfrecords
!clear; python3 scripts/BDD_Driving_Model/data_prepare/prepare_tfrecords.py --video_index='./data/bdd100k/video_filtered_38_60.txt' --output_directory='/home/awesome/data/bdd100k/tfrecords' --temp_dir_root=/home/awesome/mytemp --num_threads=40

## Sample testing of the dataset

In [21]:
#import the module "MyDataset" from /opt/awesome/scripts/BDD_Driving_Model/data_providers/nexar_large_speed.py
#I'm currently in /opt/awesome/scripts
import sys
sys.path.append('/opt/awesome/scripts/BDD_Driving_Model/')
from data_providers.nexar_large_speed import MyDataset

In [37]:
# Instantiate the dataset provider for the 'train' split.
# NOTE(review): the output recorded for this cell is `UnrecognizedFlagError: Unknown command line flag 'ip'`;
# presumably the provider's flag parsing picks up the notebook kernel's own command-line arguments — TODO confirm.
data_obj = MyDataset('train')
# NOTE(review): `out` is not defined in any cell visible in this notebook — confirm whether it is a leftover.
out

UnrecognizedFlagError: Unknown command line flag 'ip'