# NuQA Mini Dataset Creator

In [None]:
## Import preparation

from nuscenes.nuscenes import NuScenes

import numpy as np
import torch
import torch.nn as nn
from pathlib import Path
from tqdm import tqdm
from PIL import Image, ImageFilter, ImageEnhance
import datasets
import matplotlib.pyplot as pltq
import pandas as pd
import json
import io
import os
import sys
import pickle
import base64
import random
import torch
import gc
import copy
import time
import multiprocessing

#from rangevit.rangevit_extract import RangeVitEncoderAltWrapper


## NuscenesQA preparation

* Nuscenes-QA: https://github.com/qiantianwen/NuScenes-QA
* https://drive.google.com/drive/folders/1jIkICT23wZWZYPrWCa0x-ubjpClSzOuU

In [None]:
from nuscenes.nuscenes import NuScenes
from nuscenes.utils.data_classes import LidarPointCloud
from tqdm import tqdm
from PIL import Image, ImageFilter, ImageEnhance
from mini_lidar_dataset_creator import RangeProjection
import numpy as np
import datasets
import matplotlib.pyplot as plt
import pandas as pd
import json
import io
import os
import base64
import random
import pickle
import torch
import gc
import copy
from pathlib import Path
import multiprocessing
from torchvision.transforms.functional import center_crop
import pyblur3



class NuQAMiniDatasetCreator:
    """Build arrow datasets from the overlap of a nuScenes split and NuScenes-QA.

    The creator loads a nuScenes database, matches NuScenes-QA questions to the
    samples present in that database, labels every sample as 'day' or 'night',
    and finally writes camera images, LIDAR arrays, questions and answers to
    disk as a `datasets.Dataset`.
    """

    cam_directions = ['CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_RIGHT',
                      'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_FRONT_LEFT']

    # Size handed to PIL's resize() for every camera image.
    image_size = (224, 224)

    # Maps the `vartype` argument of process_image() to the implementing method.
    _image_variants = {
        None: 'process_image_nothing',
        'halfdim': 'process_image_halfdim',
        'gaussian5': 'process_image_gaussian5',
        '80dim': 'process_image_80dim',
        '80dim50contrast': 'process_image_80dim50contrast',
        'defocus': 'process_image_defocus',
        '80dimgaussian7': 'process_image_80dimgaussian7',
    }

    def __init__(self, version='v1.0-mini', path='data', lidarnpypath='data'):
        """Load the nuScenes database and index scene descriptions by token.

        Args:
            version: nuScenes version string passed to `NuScenes`.
            path: nuScenes data root passed to `NuScenes`.
            lidarnpypath: directory holding one `<LIDAR_TOP token>.npy` per sample.
        """
        self.nusc = NuScenes(version=version, dataroot=path, verbose=False)
        self.lidarnpypath = lidarnpypath
        self.scene_token_to_description = {
            scene['token']: scene['description'] for scene in self.nusc.scene
        }

    def extract_matched_samples(self, split='train'):
        """Return the NuScenes-QA entries whose sample exists in the loaded database.

        Args:
            split: name used to pick `NuScenes_{split}_questions.json`.

        Returns:
            A list of dicts with keys 'scene description', 'token', 'data',
            'question' and 'answer'.
        """
        # Walk every scene's linked list of samples. The token is recorded
        # BEFORE advancing so that the first sample of each scene is included;
        # the walk stops once 'next' is empty.
        # A set keeps the per-question membership test O(1).
        known_tokens = set()
        for scene in self.nusc.scene:
            sample_token = scene['first_sample_token']
            while sample_token:
                known_tokens.add(sample_token)
                sample_token = self.nusc.get('sample', sample_token)['next']

        matched_count = 0
        hop_0_count = 0
        hop_1_count = 0
        matched_samples = []

        with open(f"dataset/nuscenes-qa/NuScenes_{split}_questions.json", "r") as json_file:
            questions = json.load(json_file)

        for qa_sample in tqdm(questions['questions']):
            token = qa_sample['sample_token']
            if token not in known_tokens:
                continue
            sample = self.nusc.get('sample', token)
            matched_samples.append({
                'scene description': self.scene_token_to_description[sample['scene_token']],
                'token': token,
                'data': sample['data'],
                'question': qa_sample['question'],
                'answer': qa_sample['answer'],
            })
            matched_count += 1
            if qa_sample['num_hop'] == 0:
                hop_0_count += 1
            elif qa_sample['num_hop'] == 1:
                hop_1_count += 1

        print(f"#### Extracted from {split} split:")
        print(f"matched count = {matched_count}")
        print(f"hop0 count = {hop_0_count}")
        print(f"hop1 count = {hop_1_count}")
        return matched_samples

    def simplify_scene_description(self, data):
        """Replace each 'scene description' with 'night' or 'day', in place.

        A sample is labelled 'night' when its description contains the
        substring 'Night'; everything else becomes 'day'.

        Returns:
            The same list object that was passed in.
        """
        for sample in data:
            is_night = 'Night' in sample['scene description']
            sample['scene description'] = 'night' if is_night else 'day'
        return data

    def summarize_scene_info(self, data):
        """Print how many samples of `data` are labelled day and night.

        An empty `data` is reported with 0.00 % shares instead of raising
        ZeroDivisionError.
        """
        total_count = len(data)
        night_count = sum(
            1 for sample in data if sample['scene description'] == 'night')
        day_count = total_count - night_count

        def percent(count):
            # Guard the empty split: there is nothing to divide by.
            return count / total_count * 100 if total_count else 0.0

        print('#####')
        print(f"day={day_count} ({percent(day_count):.2f} %)")
        print(f"night={night_count} ({percent(night_count):.2f} %)")
        print(f"total={total_count} (100 %)")

    def merge_and_distribute(self, train, val, seed=0):
        """Pool both splits and re-split day and night samples half and half.

        Args:
            train: list of samples (already labelled 'day'/'night').
            val: list of samples (already labelled 'day'/'night').
            seed: value passed to `random.seed` before shuffling.

        Returns:
            Tuple `(day_train, day_val, night_train, night_val)`. With an odd
            number of samples the validation half receives the extra one.
        """
        random.seed(seed)

        day_samples = []
        night_samples = []
        for sample in train + val:
            if sample['scene description'] == 'night':
                night_samples.append(sample)
            else:
                day_samples.append(sample)

        # Shuffle order (day first, then night) is part of the reproducible
        # result for a given seed.
        random.shuffle(day_samples)
        random.shuffle(night_samples)

        day_half = len(day_samples) // 2
        night_half = len(night_samples) // 2
        return (day_samples[:day_half], day_samples[day_half:],
                night_samples[:night_half], night_samples[night_half:])

    # def process_img_and_lidar_v0(self, sample, cam_direction='CAM_FRONT'):
    #     lidar_top = sample['data']['LIDAR_TOP']
    #     cam = sample['data'][cam_direction]
    #     points, coloring, im = self.nusc.explorer.map_pointcloud_to_image(
    #         pointsensor_token=lidar_top,
    #         camera_token=cam,
    #     )
    #     fig, ax = plt.subplots(1, 1, figsize=(9, 16))
    #     ax.imshow(im)
    #     image = self.ghost_savefig(fig)
    #     ax.imshow(np.zeros_like(np.array(im))) # keep this line to use black background
    #     ax.scatter(points[0, :], points[1, :], c=coloring, s=5, cmap='Greys') # adjust colar map?
    #     lidar = self.ghost_savefig(fig) # lidar point clouds overlay image
    #     plt.close(fig)
    #     return image, lidar

    @classmethod
    def _resize_rgb(cls, im):
        """Resize a PIL image to `image_size` and convert it to RGB."""
        return im.resize(cls.image_size).convert('RGB')

    @classmethod
    def _save_debug_pair(cls, im_raw, im_processed, outdir, suffix):
        """Dump the processed image next to the resized original for inspection.

        Files are written as `<outdir>/<timestamp>_<suffix>.png` and
        `<outdir>/<timestamp>_orig.png`. The directory is created on demand
        because nothing else creates these debug folders.
        """
        import time
        os.makedirs(outdir, exist_ok=True)
        t = time.time()
        im_processed.save('{}/{}_{}.png'.format(outdir, t, suffix))
        cls._resize_rgb(im_raw).save('{}/{}_orig.png'.format(outdir, t))

    @classmethod
    def process_image_nothing(cls, im_raw):
        """Return the image resized and converted to RGB as a numpy array."""
        return np.array(cls._resize_rgb(im_raw))

    @classmethod
    def process_image_halfdim(cls, im_raw):
        """Return the image at brightness factor 0.5, resized, as a numpy array."""
        # https://stackoverflow.com/questions/31360526/low-the-brightness-of-an-image-using-pillow
        dimmed = ImageEnhance.Brightness(im_raw).enhance(0.5)
        return np.array(cls._resize_rgb(dimmed))

    @classmethod
    def process_image_80dim(cls, im_raw):
        """Return the image at brightness factor 0.2, resized, as a numpy array.

        Also writes a debug copy of the result and the original under 'test/'.
        """
        dimmed = ImageEnhance.Brightness(im_raw).enhance(0.2)
        result = cls._resize_rgb(dimmed)
        cls._save_debug_pair(im_raw, result, 'test', 'dim80')
        return np.array(result)

    @classmethod
    def process_image_80dim50contrast(cls, im_raw):
        """Return the image at brightness 0.2 and contrast 0.5 as a numpy array.

        Also writes a debug copy of the result and the original under 'test2/'.
        """
        dimmed = ImageEnhance.Brightness(im_raw).enhance(0.2)
        flattened = ImageEnhance.Contrast(dimmed).enhance(0.5)
        result = cls._resize_rgb(flattened)
        # The debug file must show the contrast-adjusted image that is
        # actually returned, not the merely dimmed intermediate.
        cls._save_debug_pair(im_raw, result, 'test2', 'dim80contrast50')
        return np.array(result)

    @classmethod
    def process_image_gaussian5(cls, im_raw):
        """Return the image after a radius-5 Gaussian blur as a numpy array."""
        blurred = im_raw.filter(ImageFilter.GaussianBlur(radius=5))
        return np.array(cls._resize_rgb(blurred))

    @classmethod
    def process_image_80dimgaussian7(cls, im_raw):
        """Return the image blurred (radius 7) then dimmed to 0.2 as a numpy array.

        Also writes a debug copy of the result and the original under 'test4/'.
        """
        blurred = im_raw.filter(ImageFilter.GaussianBlur(radius=7))
        dimmed = ImageEnhance.Brightness(blurred).enhance(0.2)
        result = cls._resize_rgb(dimmed)
        cls._save_debug_pair(im_raw, result, 'test4', '80dimgaussian7')
        return np.array(result)

    @classmethod
    def process_image_defocus(cls, im_raw):
        """Return the image after `pyblur3.DefocusBlur` (kernel 3) as a numpy array.

        Also writes a debug copy of the result and the original under 'test3/'.
        """
        kernelsize = 3
        defocused = pyblur3.DefocusBlur(im_raw, kernelsize)
        result = cls._resize_rgb(defocused)
        cls._save_debug_pair(im_raw, result, 'test3', 'defocus')
        return np.array(result)

    def process_image(self, sample, cam_direction='CAM_FRONT', vartype: str = None):
        """Load one camera image of `sample` and apply the requested variant.

        Args:
            sample: dict whose `sample['data'][cam_direction]` is a
                sample_data token.
            cam_direction: camera channel name, one of `cam_directions`.
            vartype: None for the plain image, or one of 'halfdim',
                'gaussian5', '80dim', '80dim50contrast', 'defocus',
                '80dimgaussian7'.

        Returns:
            The processed image as a numpy array.

        Raises:
            Exception: if `vartype` is not a recognised variant.
        """
        method_name = self._image_variants.get(vartype)
        if method_name is None:
            raise Exception('Unrecognized process_image() vartype! Aborting...')

        camera_token = sample['data'][cam_direction]
        cam = self.nusc.get('sample_data', camera_token)
        image_path = os.path.join(self.nusc.dataroot, cam['filename'])
        # The context manager closes the file even if the variant raises.
        with Image.open(image_path) as im_raw:
            return getattr(self, method_name)(im_raw)

    def process_lidar(self, sample):
        """Load the array stored as `<LIDAR_TOP token>.npy` in `lidarnpypath`."""
        pointsensor_token = sample['data']['LIDAR_TOP']
        # Read npy file, not list of list
        lidar_filename = str(Path(self.lidarnpypath) / "{}.npy".format(pointsensor_token))
        return np.load(lidar_filename)

    def ghost_savefig(self, fig):
        '''Render `fig` into memory and return it as a PIL image.

        Based on answers from https://stackoverflow.com/questions/8598673/
        '''
        with io.BytesIO() as buf:
            self.save(buf, fig=fig)
            buf.seek(0)
            # copy() detaches the image from the buffer before it is closed.
            return Image.open(buf).copy()

    @staticmethod
    def save(filepath="image.jpg", format='jpg', fig=None):
        r'''Save a figure with no surrounding whitespace.

        Example filepath: "myfig.png" or r"C:\myfig.pdf"
        Based on answers from https://stackoverflow.com/questions/11837979/

        Args:
            filepath: path or file-like object handed to `fig.savefig`.
            format: output format handed to `fig.savefig`.
            fig: figure to save; the current pyplot figure when None.
        '''
        import matplotlib.pyplot as plt
        if fig is None:
            fig = plt.gcf()

        # Adjust the figure being saved, which is not necessarily the
        # current pyplot figure.
        fig.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=0, hspace=0)
        for ax in fig.axes:
            ax.axis('off')
            ax.margins(0, 0)
            ax.xaxis.set_major_locator(plt.NullLocator())
            ax.yaxis.set_major_locator(plt.NullLocator())

        fig.savefig(filepath, format=format, pad_inches=0, bbox_inches='tight')

    def create_arrow_v2(self, dataset, basepath, c1: str, c2: str, vartype: str = None):
        """Process every sample of `dataset` and save the result to disk.

        The output is written with `datasets.Dataset.save_to_disk` to
        `<basepath>/<c1>/<c2>` and has the columns 'token', one column per
        camera in `cam_directions`, 'LIDAR_TOP', 'question' and 'answer'.

        Args:
            dataset: list of sample dicts as produced by
                `extract_matched_samples`.
            basepath: root output directory.
            c1: first path component below `basepath`.
            c2: second path component below `basepath`.
            vartype: image variant forwarded to `process_image`.
        """
        print("#### Generating nuscenses_qa_mini::{}/{} dataset... (vartype: {})".format(c1, c2, vartype))
        path = str(Path(basepath) / c1 / c2)

        # Key insertion order is kept as the column order of the saved dataset.
        data = {'token': []}
        for cam in self.cam_directions:
            data[cam] = []
        data['LIDAR_TOP'] = []
        data['question'] = []
        data['answer'] = []

        for sample in tqdm(dataset):
            data['token'].append(sample['token'])
            for cam in self.cam_directions:
                data[cam].append(self.process_image(sample, cam, vartype))
            data['question'].append(sample['question'])
            data['answer'].append(sample['answer'])
            data['LIDAR_TOP'].append(self.process_lidar(sample))

        data_arrow = datasets.Dataset.from_dict(data)
        data_arrow.save_to_disk(path)
        print("#### Completed!")


In [None]:
def load_test():
    """Load the saved 'train' dataset, decode one stored image and display it.

    Reads row 100 of the 'CAM_FRONT_LIDAR_OVERLAY' column from
    'nuscenes_qa_mini_224/train', printing the type of the stored value and
    the type of the decoded image after conversion to RGB.
    """
    dataset_root = 'nuscenes_qa_mini_224/'
    train_split = datasets.load_from_disk(dataset_root + 'train')

    encoded = train_split['CAM_FRONT_LIDAR_OVERLAY'][100]
    print(type(encoded))

    # Decode the stored entry through the datasets Image feature.
    decoded = datasets.Image(decode=True).decode_example(encoded)
    decoded.show()
    rgb = decoded.convert('RGB')
    print(type(rgb))


def make_dirs(dirs):
    """Create every directory in `dirs`, including missing parents.

    Directories that already exist are left untouched. `exist_ok=True` makes
    the check and the creation a single step, so a directory created by
    another process in between does not raise.

    Args:
        dirs: iterable of directory paths.
    """
    for d in dirs:
        os.makedirs(d, exist_ok=True)


def _mp_create_arrow_wrapper(ds, params):
    """Worker entry point: unpack one parameter tuple and build one dataset."""
    dataset, basepath, c1, c2, vartype = params[:5]
    ds.create_arrow_v2(dataset, basepath, c1=c1, c2=c2, vartype=vartype)


def create_arrow_v2_parallel(ds_instance, exec_params, blocking=False):
    '''Run `ds_instance.create_arrow_v2` once per parameter tuple, each in its own process.

    Example: ds.create_arrow_v2(night_train, out_path, c1='night', c2='train', vartype='halfdim')

    Args:
        ds_instance: object providing `create_arrow_v2`.
        exec_params: iterable of `(dataset, basepath, c1, c2, vartype)` tuples.
        blocking: when True, wait for all workers before returning.
    '''
    # The worker is a module-level function so it can be pickled by the
    # spawn/forkserver start methods; a nested function cannot.
    # No deep copies are made here: each child process already works on its
    # own copy of the arguments.
    processes = []
    for params in exec_params:
        p = multiprocessing.Process(
            target=_mp_create_arrow_wrapper, args=(ds_instance, params))
        p.start()
        processes.append(p)

    if blocking:
        for p in processes:
            p.join()
        failed = [p for p in processes if p.exitcode != 0]
        if failed:
            # Surface crashed workers instead of reporting silent success.
            print('### Warning: {} of {} create_arrow_v2 workers exited with an error'.format(
                len(failed), len(processes)))
        print('\n### Parallel create_arrow_v2 finished!\n')
    else:
        # Workers may still be running; do not claim completion.
        print('\n### Parallel create_arrow_v2 started (not waiting for completion)\n')
    return


def main():
    """Build the day/night NuScenes-QA mini datasets and write them as arrow files.

    Steps: create the output directories, match NuScenes-QA questions to the
    nuScenes mini samples, label scenes as day/night, re-split them, print a
    summary of each split and run the selected generation jobs in parallel.

    Returns:
        The `NuQAMiniDatasetCreator` instance that was used.
    """
    out_path = 'nuscenes_qa_mini_withdim/'
    make_dirs([
        out_path,
        # out_path + 'day',
        # out_path + 'night',
        # out_path + 'day_gaussian5',
        # out_path + 'night_gaussian5',
        # out_path + 'day_halfdim',
        # out_path + 'night_halfdim',
        out_path + 'night_80dim',
        out_path + 'night_80dim50contrast',
        out_path + 'night_defocus',
        out_path + 'night_80dimgaussian7',
    ])
    ds = NuQAMiniDatasetCreator(
        version='v1.0-mini', path='dataset/v1.0-mini/data/sets/nuscenes/',
        lidarnpypath='dataset/v1.0-mini/data/sets/range_projection_outputs/')
    # extract intersection between nuscenes-mini and nuscenes-qa
    ds_train = ds.extract_matched_samples('train')
    ds_val = ds.extract_matched_samples('val')
    # refactor scene description
    ds_train = ds.simplify_scene_description(ds_train)
    ds_val = ds.simplify_scene_description(ds_val)
    # merge and redistribute scenes
    day_train, day_val, night_train, night_val = ds.merge_and_distribute(ds_train, ds_val, seed=123)
    ds.summarize_scene_info(day_train)
    ds.summarize_scene_info(day_val)
    ds.summarize_scene_info(night_train)
    ds.summarize_scene_info(night_val)
    gc.collect()
    # create arrow datasets
    print("#### Generating arrows...")

    # Each entry below is (dataset, basepath, c1, c2, vartype). Only the list
    # passed to create_arrow_v2_parallel() at the end is executed; the others
    # are kept as ready-made alternatives.

    ## Regular data generation
    exec_params_orig = [
        (night_train, out_path, 'night', 'train', None),
        (night_val, out_path, 'night', 'validation', None),
        (day_train, out_path, 'day', 'train', None),
        (day_val, out_path, 'day', 'validation', None),
    ]

    ## Variation of dimming: halfdim
    exec_params_halfdim = [
        (night_train, out_path, 'night_halfdim', 'train', 'halfdim'),
        (night_val, out_path, 'night_halfdim', 'validation', 'halfdim'),
        (day_train, out_path, 'day_halfdim', 'train', 'halfdim'),
        (day_val, out_path, 'day_halfdim', 'validation', 'halfdim'),
    ]
    # create_arrow_v2_parallel(ds, exec_params_halfdim, blocking=False)

    ## Variation of blurring: gaussian5
    exec_params_gaussian5 = [
        (night_train, out_path, 'night_gaussian5', 'train', 'gaussian5'),
        (night_val, out_path, 'night_gaussian5', 'validation', 'gaussian5'),
        (day_train, out_path, 'day_gaussian5', 'train', 'gaussian5'),
        # vartype must be spelled 'gaussian5'; a misspelling is rejected by
        # process_image().
        (day_val, out_path, 'day_gaussian5', 'validation', 'gaussian5'),
    ]

    ## Variation of dimming: 80dim
    exec_params_80dim = [
        #(night_train, out_path, 'night_80dim', 'train', '80dim'),
        #(night_val, out_path, 'night_80dim', 'validation', '80dim'),
        #(night_train, out_path, 'night_80dim50contrast', 'train', '80dim50contrast'),
        #(night_val, out_path, 'night_80dim50contrast', 'validation', '80dim50contrast'),
    ]
    exec_params_80dimgaussian7 = [
        (night_train, out_path, 'night_80dimgaussian7', 'train', '80dimgaussian7'),
        (night_val, out_path, 'night_80dimgaussian7', 'validation', '80dimgaussian7'),
        (day_train, out_path, 'day', 'train', None),
        (day_val, out_path, 'day', 'validation', None),
    ]

    ## Execution (actual)
    create_arrow_v2_parallel(ds, exec_params_80dimgaussian7, blocking=True)

    # TODO: modify create_arrow() and resize() methods, option to downscale brightness
    return ds
    

# Script entry point: run the full generation pipeline and keep the creator
# instance returned by main() in `ds`.
if __name__ == "__main__":
    ds = main()

    # Optional follow-up steps, currently disabled:
    # load_test()
    # create_dataset_with_resized_images(split='train')
    # create_dataset_with_resized_images(split='validation')
    