In [1]:
import ast
import sys
import json
import timeit

import numpy as np
import pandas as pd
from brtdevkit.core.db.athena import AthenaClient
from brtdevkit.data import Dataset

Internal AWS credentials have been removed from brt-devkit.
To setup AWS credentials, Please follow instructions at : https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html


In [2]:
europa_path = '/home/li.yu/code/JupiterCVML/europa/base/src/europa'
if europa_path not in sys.path:
    sys.path.append(europa_path)

from aletheia_dataset_creator.dataset_tools import aletheia_dataset_helpers as adh
from aletheia_dataset_creator.config.dataset_config import HALO_CENTER_CAMERA_PAIRS, HALO_NON_CENTER_CAMERA_PAIRS
HALO_LEFT_CAMERAS = ['T01', 'T02', 'T05', 'T06', 'T09', 'T10', 'T13', 'T14', 'I01', 'I02']
HALO_IMPLEMENT_CAMERA_PAIRS = {"I01": "I03", "I02": "I04"}
HALO_CENTER_IMPLEMENT_CAMERA_PAIRS = {"I02": "I03"}

In [5]:
# find previously selected latest date
prev_ds = Dataset.retrieve(name="20231206_halo_rgb_al_train_to_label_pt1")
prev_df = prev_ds.to_dataframe()
prev_ds.id, prev_df.shape

('6574f5d508030defaa929ddc', (17049, 157))

In [6]:
last_max = prev_df["created_at"].sort_values()
print(last_max)

15000    2023-11-18T08:13:35.410000
10000    2023-11-18T08:29:47.912000
10001    2023-11-18T08:32:44.198000
0        2023-11-18T08:32:55.776000
5000     2023-11-18T08:33:18.731000
                    ...            
17048    2023-12-05T16:37:29.032000
14997    2023-12-05T16:42:18.177000
14998    2023-12-05T16:42:55.815000
4999     2023-12-05T16:43:56.119000
14999    2023-12-05T16:44:55.257000
Name: created_at, Length: 17049, dtype: object


In [8]:
left_cameras_no_implement = ('T01', 'T02', 'T05', 'T06', 'T09', 'T10', 'T13', 'T14') # 'I01', 'I02')
all_left_cameras = ('T01', 'T02', 'T05', 'T06', 'T09', 'T10', 'T13', 'T14', 'I01', 'I02')

# note: change the below timestamp to right after the last_max timestamp from above
# note: currently excluding rear cams from HHH_16X (162) because it is grain cart and out-of-domain
query = f"""
    SELECT collected_on, id, created_at, robot_name, gps_can_data__json, geohash, implement,
           calibration_data__json, camera_location, bag_name, operation_time, tractor_type
    FROM image_jupiter
    WHERE sensor_type = 'VD6763'
    AND created_at BETWEEN TIMESTAMP '2023-12-05 16:45:00' AND TIMESTAMP '2024-08-03 00:00:00'
    AND camera_location IN {all_left_cameras}
    AND robot_name NOT LIKE 'bedrock%'
    AND robot_name != 'halohitchhiker_162'
    AND robot_name NOT LIKE 'halohitchhiker_9%'
    AND (hard_drive_name != 'JUPD-085_2023-9-12' OR robot_name NOT LIKE 'halohitchhiker_22%')
    AND gps_can_data__json IS NOT NULL
    AND geohash NOT LIKE '7zzzz%'
    ORDER BY 'collected_on'
"""

start_time = timeit.default_timer()
athena = AthenaClient()
df = athena.get_df(query)
print(timeit.default_timer() - start_time)
df.shape

62.75927849113941


((1504325, 12), 752162)

In [9]:
df.head()

Unnamed: 0,collected_on,id,created_at,robot_name,gps_can_data__json,geohash,implement,calibration_data__json,camera_location,bag_name,operation_time,tractor_type
0,2024-01-08 21:50:48.747,65a03b9e9135244e85310f1b,2024-01-11 19:03:58.465,halohitchhiker_212,"{""mbrtk_bearing"": 511.9921875, ""jupiter_header...",9zqtjzfyygsj,NO_IMPLEMENT,"{""intrinsics"": {""k"": [1440.08643, 0.0, 972.264...",T10,01_08_2024-21_49_22,daytime,peaty
1,2023-12-15 21:04:32.859,6584af4709317d9315d8ae49,2023-12-21 21:33:59.409,halohitchhiker_202,"{""mbrtk_bearing"": 511.9921875, ""jupiter_header...",dnd2k0q4xj5w,NO_IMPLEMENT,"{""intrinsics"": {""k"": [1438.00122, 0.0, 973.356...",T13,12_15_2023-21_03_07,daytime,peaty
2,2023-12-18 22:31:39.037,6595a7ce247b2923f64fb4ae,2024-01-03 18:30:38.144,halohitchhiker_222,"{""mbrtk_bearing"": 511.9921875, ""jupiter_header...",9zn2n3679g38,NO_IMPLEMENT,"{""intrinsics"": {""k"": [1432.30676, 0.0, 968.764...",T09,12_18_2023-22_31_12,daytime,peaty
3,2023-12-07 20:05:44.897,65814bbb9f4a4feab507634e,2023-12-19 07:52:27.971,halohitchhiker_111,"{""mbrtk_bearing"": 511.9921875, ""jupiter_header...",9zmjhmnqkfbx,NO_IMPLEMENT,"{""intrinsics"": {""k"": [1433.86499, 0.0, 979.238...",T06,12_07_2023-20_05_43,daytime,8RW
4,2023-12-20 22:43:18.848,65971e3982f40946d133a44c,2024-01-04 21:08:09.633,halohitchhiker_212,"{""mbrtk_bearing"": 511.9921875, ""jupiter_header...",9zq7yhestsv9,NO_IMPLEMENT,"{""intrinsics"": {""k"": [1444.94604, 0.0, 971.625...",T14,12_20_2023-22_41_28,dawn_dusk,peaty


In [10]:
def get_calibration(x):
    try:
        return ast.literal_eval(x)
    except:
        return {}

# clean data with bad metadata
df = df[~df["gps_can_data__json"].isna()]
df.shape

(1504325, 12)

In [11]:
# get required metadata
geohash_length = 7 # 150x150 meters
df['geohash_short'] = df.geohash.apply(lambda x: x[:geohash_length])
df['collected_on_datetime'] = pd.to_datetime(df.collected_on)
df['calibration_data'] = df.calibration_data__json.apply(lambda x: get_calibration(x))
df['exposure_ms'] = df.calibration_data.apply(lambda x: x.get('exposure_ms', '-1'))
df['short_exposure'] = df.exposure_ms.apply(lambda x: float(x[1]))
df['identifier'] = df.apply(lambda x: x['geohash'] + '_' + x['camera_location'], axis=1)
df['gps_can_data'] = df.gps_can_data__json.apply(lambda x: json.loads(x))
df['speed'] = df['gps_can_data'].apply(lambda x: x.get('speed', np.nan))
df.shape

(1504325, 20)

In [12]:
dset_type = "stereo" # "image"
dset_name = "halo_rgb_1000_samples"
dset_description = "1000 stereo images from collected RGB."
downsampled_df = df.sample(1000)
print(len(downsampled_df))

if dset_type == "stereo":
    # create the stereo dataset
    adh.imageids_to_dataset(
        image_ids=list(downsampled_df.id.values),
        dataset_name=dset_name,
        dataset_description=dset_description,
        camera_location=HALO_LEFT_CAMERAS,
        dataset_kind='image',
        mode='stereo',
        camera_pairs_list=[HALO_CENTER_CAMERA_PAIRS, HALO_NON_CENTER_CAMERA_PAIRS, HALO_IMPLEMENT_CAMERA_PAIRS, HALO_CENTER_IMPLEMENT_CAMERA_PAIRS]
    )
else:
    Dataset.create(
        name=dset_name,
        description=dset_description,
        kind=Dataset.KIND_IMAGE,
        image_ids=list(downsampled_df.id.values),
    )

1000
Preparing stereo dataframe for {'T02': 'T03', 'T06': 'T07', 'T10': 'T11', 'T14': 'T15'}...
Size of left dataframe: 470
Size of stereo dataframe: 470
Preparing stereo dataframe for {'T01': 'T03', 'T02': 'T04', 'T05': 'T07', 'T06': 'T08', 'T09': 'T11', 'T10': 'T12', 'T13': 'T15', 'T14': 'T16'}...
Size of left dataframe: 945
Size of stereo dataframe: 945
Preparing stereo dataframe for {'I01': 'I03', 'I02': 'I04'}...
Size of left dataframe: 55
Size of stereo dataframe: 55
Preparing stereo dataframe for {'I02': 'I03'}...
Size of left dataframe: 24
Size of stereo dataframe: 24
Sending 2493 image ids for creating dataset
Time taken to prepare data for dataset creation job: 4.04 mins


ReadTimeout: HTTPSConnectionPool(host='tartarus-api.brtws.com', port=443): Read timed out. (read timeout=80)