In [71]:
from google.cloud import storage
from tqdm import tqdm
import json 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import geopandas
import random
from scipy.spatial import cKDTree

# Generate Object Detection Dataset

**Author:** Madhava Paliyam (madhavapaliyam@gmail.com)

**Description:** Converts the images that have been labeled so far into a dataset on Google Cloud for future training of the YOLOv5 model. 



**Inputs**: Train/validation split ratio (e.g. 0.2), distance threshold for close points (e.g. 0.001 degrees)

**Outputs**: `train.csv` and `val.csv`, updated in Google Cloud through DVC

This is done through the following steps: 

1. Obtain and parse all .jsonl files from labeled bucket 
2. Split the data into train and validation sets randomly stratified by country 
3. Filter the sets to ensure nearby points are in the same set 
4. Track updates to train.csv and val.csv in google cloud 

In [4]:
##### SET PARAMETERS HERE ######

# Fraction of each country's images held out for validation
# (passed to train_test_split as test_size).
TRAIN_VAL_SPLIT = 0.2

# Minimum separation, in degrees, between an image in train and an image in val.
DISTANCE_THRESHOLD = 0.001

#### Obtain and Parse all .jsonl files from labeled bucket

In [32]:
# Connect to cloud storage, list the label exports and load the image metadata
client = storage.Client()

gcloud_labeling_bucket_str = 'street2sat-gcloud-labeling'
# keep only the .jsonl objects found in the labeling bucket
jsonl_blobs = [
    bucket_object
    for bucket_object in client.list_blobs(gcloud_labeling_bucket_str)
    if bucket_object.name.endswith(".jsonl")
]

# metadata table for all images; rows are looked up by 'input_img' when the labels are parsed
all_paths = pd.read_csv('gs://street2sat-database-csv/database-info.csv')

print(f"Found {len(jsonl_blobs)} jsonl files in the bucket.")

Found 5 jsonl files in the bucket.


  exec(code_obj, self.user_global_ns, self.user_ns)


In [52]:
# Iterates through the jsonl files and creates two dataframes:
#   image_info_df:   one row per labeled image (path, cc, lat, lon)
#   bounding_box_df: one row per bounding box annotation, tagged with its image path
def get_image_and_bbox_df(blobs: list):
    """Parse labeled .jsonl blobs into an image table and a bounding-box table.

    Each non-blank line of every blob is decoded as JSON and must provide
    'imageGcsUri' and 'boundingBoxAnnotations'. The image location is looked
    up in the module-level ``all_paths`` table by matching its 'input_img'
    column against the image path.

    Args:
        blobs: objects exposing ``download_as_string()`` that returns the
            newline-delimited JSON content.

    Returns:
        (df, df_bounding_boxes): ``df`` has the columns 'path', 'cc', 'lat'
        and 'lon', one row per kept image; ``df_bounding_boxes`` holds every
        annotation dict of the kept images with an added 'path' key.

    Raises:
        AssertionError: if an image path does not match exactly one row of
            ``all_paths``.
    """
    images = []
    country_codes = []
    bounding_boxes = []
    lat = []
    lon = []
    for blob in blobs:
        str_rep = blob.download_as_string()
        for image_labels in tqdm(str_rep.splitlines()):
            # a blank line carries no record and would make json.loads fail
            if not image_labels.strip():
                continue

            # contains all the bounding box info and image path
            info_image = json.loads(image_labels)
            image_path = info_image['imageGcsUri']
            bounding_box_info = info_image['boundingBoxAnnotations']

            # obtain image location from database
            row = all_paths[all_paths['input_img'] == image_path]
            assert len(row) == 1, f'got {len(row)} instead of 1'
            row = row.iloc[0]

            # NaN never compares equal to anything (not even itself), so the
            # previous `== np.nan` test was always False; pd.isna is the
            # correct missing-value check.
            if pd.isna(row['cc']):
                # dont add image if no location is available
                continue

            # tag the annotations only once the image is known to be kept
            for b in bounding_box_info:
                b['path'] = image_path

            lat.append(row['latitude'])
            lon.append(row['longitude'])
            country_codes.append(row['cc'])
            images.append(image_path)
            bounding_boxes.extend(bounding_box_info)

    df = pd.DataFrame({
        'path': images,
        'cc': country_codes,
        'lat': lat,
        'lon': lon,
    })
    df_bounding_boxes = pd.DataFrame(bounding_boxes)
    return df, df_bounding_boxes

# parse every .jsonl export into the per-image and per-box tables
(image_info_df, bounding_box_df) = get_image_and_bbox_df(jsonl_blobs)

100%|██████████| 100/100 [00:08<00:00, 11.71it/s]
100%|██████████| 100/100 [00:08<00:00, 11.59it/s]
100%|██████████| 100/100 [00:08<00:00, 11.59it/s]
100%|██████████| 100/100 [00:08<00:00, 11.56it/s]
100%|██████████| 99/99 [00:08<00:00, 11.38it/s]


#### Split the data into train and validation sets randomly stratified by country 

In [53]:
# number of labeled images per country code
vc = image_info_df['cc'].value_counts()
# the country codes themselves (the index of the counts)
countries = vc.index
print(vc)

UG    264
KE    119
US     93
Name: cc, dtype: int64


In [54]:
# Fixed seed for the per-country split and the final shuffles, so that
# re-running the notebook on the same labels reproduces the same sets.
SPLIT_RANDOM_STATE = 10

# Split every country separately so both sets keep the same country mix.
train_parts = []
val_parts = []
for country in countries:
    train, test = train_test_split(
        image_info_df[image_info_df['cc'] == country],
        test_size=TRAIN_VAL_SPLIT,
        shuffle=True,
        random_state=SPLIT_RANDOM_STATE,
    )
    train_parts.append(train)
    val_parts.append(test)

# DataFrame.append was removed in pandas 2.0; concatenate all parts once.
# pd.concat rejects an empty list, so fall back to an empty frame.
train_set = pd.concat(train_parts) if train_parts else pd.DataFrame()
val_set = pd.concat(val_parts) if val_parts else pd.DataFrame()

# shuffle
train_set = train_set.sample(frac=1, random_state=SPLIT_RANDOM_STATE)
val_set = val_set.sample(frac=1, random_state=SPLIT_RANDOM_STATE)

In [55]:
# show 10 randomly chosen training rows
train_set.sample(n=10)

Unnamed: 0,path,cc,lat,lon
107,gs://street2sat-uploaded/Uganda/2021-06-27_Dri...,UG,1.144557,31.006343
83,gs://street2sat-uploaded/Uganda/1829244/2020-0...,UG,0.987803,34.172346
151,gs://street2sat-uploaded/KENYA/2021-07-17-T1/G...,KE,-0.66911,34.754026
148,gs://street2sat-uploaded/USA/2021-08-20-cropto...,US,40.522853,-85.825824
297,gs://street2sat-uploaded/KENYA/2021_07_12_T2/1...,KE,1.173325,35.110874
280,gs://street2sat-uploaded/KENYA/2021_07_13_T2/1...,KE,1.051638,34.927209
319,gs://street2sat-uploaded/Uganda/2021-06-25_Edr...,UG,0.211524,30.129432
481,gs://street2sat-uploaded/Uganda/1829244/2020-0...,UG,0.973047,34.308031
85,gs://street2sat-uploaded/Uganda/2021-06-25_Edr...,UG,0.007917,29.963238
345,gs://street2sat-uploaded/Uganda/2021-06-17_NAC...,UG,0.462347,32.618019


#### Filter the sets to ensure nearby points are in the same set 

In [56]:
def ckdnearest(gdA, gdB):
    """Returns distance and name of each neighbor in gdB from each point in gdA using a kd tree.

    Args:
        gdA: frame whose ``geometry`` column holds points with ``.x``/``.y``;
            one output row is produced per row of gdA.
        gdB: frame with the same kind of ``geometry`` column; searched for
            the single nearest point to each gdA point.

    Returns:
        A frame made of gdA (index reset), the columns of the matched gdB row
        except ``geometry``, and a ``dist`` column with the distance between
        the two points in coordinate units.
    """
    # (x, y) coordinate arrays for the query points and the searched points
    query_xy = np.array([(point.x, point.y) for point in gdA.geometry])
    target_xy = np.array([(point.x, point.y) for point in gdB.geometry])

    # nearest gdB point (k=1) for every gdA point
    nearest_dist, nearest_pos = cKDTree(target_xy).query(query_xy, k=1)

    left = gdA.reset_index(drop=True)
    right = gdB.iloc[nearest_pos].drop(columns="geometry").reset_index(drop=True)
    distances = pd.Series(nearest_dist, name='dist')

    return pd.concat([left, right, distances], axis=1)


# seed the stdlib RNG so the random train/val choices below are repeatable
random.seed(10)
def filter_closeby_points(train_set, val_set, distance_threshold=None):
    """Filters on closeby points and returns the new datasets.

    For every val image, the nearest train image is found. Whenever the two
    are closer than the threshold, one of them is moved (chosen at random) so
    both end up in the same set. A single pass can create new close pairs, so
    callers are expected to repeat the call until the returned count is 0.

    Args:
        train_set: frame with 'path', 'lat' and 'lon' columns.
        val_set: frame with 'path', 'lat' and 'lon' columns.
        distance_threshold: minimum allowed distance (in degrees) between a
            train and a val image. Defaults to the module-level
            DISTANCE_THRESHOLD.

    Returns:
        (train_set, val_set, n_close): the updated sets and the number of
        close pairs that were found at the start of this pass.

    Raises:
        AssertionError: if rows are lost or duplicated while moving them.
    """
    if distance_threshold is None:
        distance_threshold = DISTANCE_THRESHOLD

    # the nearest-neighbour query needs at least one point on each side
    if len(train_set) == 0 or len(val_set) == 0:
        print('Found {} closeby points.'.format(0))
        return train_set, val_set, 0

    train_set_g = geopandas.GeoDataFrame(train_set.copy(), geometry=geopandas.points_from_xy(train_set.lon, train_set.lat, crs='EPSG:4326'))
    val_set_g = geopandas.GeoDataFrame(val_set.copy(), geometry=geopandas.points_from_xy(val_set.lon, val_set.lat, crs='EPSG:4326'))

    # distinct column names so both paths survive the side-by-side concat in ckdnearest
    val_set_g = val_set_g.rename(columns={'path': 'val_path'})
    train_set_g = train_set_g.rename(columns={'path': 'train_path'})

    res = ckdnearest(val_set_g, train_set_g)

    # Use the SAME threshold for counting and for moving. The move loop used
    # to hard-code .001, so a larger DISTANCE_THRESHOLD reported pairs that
    # were never fixed and the caller's "repeat until 0" loop never ended.
    close_pairs = res[res['dist'] < distance_threshold]
    n_close = len(close_pairs)
    print('Found {} closeby points.'.format(n_close))

    # add images that are too close to each other to the same set randomly
    old_val_len = len(val_set)
    old_train_len = len(train_set)

    for _, pair in close_pairs.iterrows():
        # may be empty if the row was already moved earlier in this pass;
        # dropping/concatenating an empty selection is then a no-op
        val_row = val_set[val_set['path'] == pair['val_path']]
        train_row = train_set[train_set['path'] == pair['train_path']]

        which = random.choice(['train', 'val'])
        if which == 'train':
            val_set = val_set.drop(val_row.index)
            train_set = pd.concat([train_set, val_row])
        elif which == 'val':
            train_set = train_set.drop(train_row.index)
            val_set = pd.concat([val_set, train_row])

        # every row removed from one set must have been added to the other
        assert old_train_len - len(train_set) == len(val_set) - old_val_len, 'lengths are incorrect'

    return train_set, val_set, n_close

In [57]:
# run till 0 closeby points are found
# The moves are random, so convergence is expected but not guaranteed; cap the
# number of passes so a non-converging run fails loudly instead of hanging.
MAX_FILTER_PASSES = 1000

closeby = -1
for _ in range(MAX_FILTER_PASSES):
    train_set, val_set, closeby = filter_closeby_points(train_set, val_set)
    if closeby == 0:
        break
else:
    raise RuntimeError(
        f'train/val sets still had {closeby} closeby points after {MAX_FILTER_PASSES} passes'
    )

Found 13 closeby points.
Found 3 closeby points.
Found 2 closeby points.
Found 2 closeby points.
Found 3 closeby points.
Found 4 closeby points.
Found 4 closeby points.
Found 3 closeby points.
Found 2 closeby points.
Found 0 closeby points.


In [67]:
# converts google cloud labels to yolo format
def convert_to_yolo(xmin, ymin, xmax, ymax):
    """Turn a corner-based box into a center/size box.

    Args:
        xmin, ymin: coordinates of the minimum corner.
        xmax, ymax: coordinates of the maximum corner.

    Returns:
        (xcenter, ycenter, width, height) in the same units as the inputs.
    """
    box_width = xmax - xmin
    box_height = ymax - ymin

    center_x = (xmax + xmin) / 2
    center_y = (ymax + ymin) / 2

    return center_x, center_y, box_width, box_height

# build the class-name -> index lookup: the index of a class is its
# (0-based) line number in classes.txt
classes_dict = {}
with open('../street2sat_utils/crop_info/classes.txt') as classes_file: 
    classes_dict.update(
        (class_line.strip(), line_number)
        for line_number, class_line in enumerate(classes_file)
    )


# gets bounding boxes for each set in yolo format for the dataset
def get_bounding_boxes_for_df(dataset, bounding_box_df, classes_dict):
    """Build one YOLO label string per image of ``dataset``.

    Args:
        dataset: frame with a 'path' column, one row per image.
        bounding_box_df: frame with 'path', 'xMin', 'yMin', 'xMax', 'yMax'
            and 'displayName' columns, one row per annotation.
        classes_dict: maps an annotation's 'displayName' to its class index.

    Returns:
        A list aligned with the rows of ``dataset``. Each entry holds one
        "class xcenter ycenter width height" line per annotation of that
        image, and is the empty string when the image has no annotations.
    """
    def yolo_line(annotation):
        # one newline-terminated label line for a single annotation
        xcenter, ycenter, width, height = convert_to_yolo(
            annotation['xMin'], annotation['yMin'], annotation['xMax'], annotation['yMax']
        )
        obj_class = classes_dict[annotation['displayName']]
        return f'{obj_class} {xcenter} {ycenter} {width} {height}\n'

    bounding_box_list = []
    for _, image in tqdm(dataset.iterrows(), total=dataset.shape[0]):
        matching = bounding_box_df[bounding_box_df['path'] == image['path']]
        label_text = ''.join(yolo_line(annotation) for _, annotation in matching.iterrows())
        bounding_box_list.append(label_text)

    return bounding_box_list


# attach the YOLO label string of every image to both splits (train first, then val)
for split_frame in (train_set, val_set):
    split_frame['bounding_boxes'] = get_bounding_boxes_for_df(split_frame, bounding_box_df, classes_dict)


100%|██████████| 383/383 [00:00<00:00, 1702.02it/s]
100%|██████████| 93/93 [00:00<00:00, 1695.59it/s]


In [69]:
# show 5 randomly chosen training rows, now including their label strings
train_set.sample(n=5)

Unnamed: 0,path,cc,lat,lon,bounding_boxes
413,gs://street2sat-uploaded/KENYA/2021_08_05_T2/1...,KE,0.789159,34.871739,
84,gs://street2sat-uploaded/KENYA/2021-07-05-T1/G...,KE,-0.171622,35.975577,5 0.849385529756546 0.6006783843040466 0.06457...
423,gs://street2sat-uploaded/Uganda/2021-06-23_lut...,UG,-0.342029,30.113613,2 0.6035992217898832 0.3190661478599222 0.0671...
64,gs://street2sat-uploaded/USA/2021-08-20-cropto...,US,40.450689,-85.393629,
435,gs://street2sat-uploaded/KENYA/2021_07_20_T2/1...,KE,0.129701,34.313333,5 0.40175097276264593 0.5979247730220493 0.058...


#### Track updates to train.csv and val.csv in google cloud 
- After running the cell below (which writes `../data/train.csv` and `../data/val.csv`), add and commit the changes by running: 

``` 
dvc commit data/train.csv.dvc
dvc commit data/val.csv.dvc
dvc push data/train.csv.dvc
dvc push data/val.csv.dvc

git add data/train.csv.dvc
git add data/val.csv.dvc
git commit -m "changed train and val"
git push
```


In [70]:
# write both splits to the local data folder (train first, then val);
# these are the files that are then tracked with dvc
csv_outputs = {
    '../data/train.csv': train_set,
    '../data/val.csv': val_set,
}
for csv_path, split_frame in csv_outputs.items():
    split_frame.to_csv(csv_path)