## Creating the Test Set 
- The test set is sampled uniformly across the collected areas.

In [8]:
import collections
import os
import random

import geopandas
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import reverse_geocoder as rg
from google.cloud import firestore
from google.cloud import storage
from tqdm import tqdm

pd.options.display.max_colwidth = 200

#### Initialize connections to Google Cloud

In [9]:
# Firestore: `coll` is the "street2sat" collection that later cells stream from.
db = firestore.Client()
coll = db.collection("street2sat")

# Cloud Storage: `client` is used below to look up buckets and list blobs.
client = storage.Client()

In [10]:
# Handle to the 'street2sat-uploaded' bucket, which the image URIs used in
# this notebook (gs://street2sat-uploaded/...) point into.
uploaded_bucket = client.get_bucket('street2sat-uploaded')


# Disabled: list every blob name in the bucket and shuffle the resulting paths.
# all_paths = [blob.name for blob in tqdm(client.list_blobs('street2sat-uploaded', prefix=''))]  
# random.shuffle(all_paths)

In [11]:
def get_images_already_being_labelled():
    """Return the set of image paths that already belong to a labeling task.

    Every ``.csv`` object in the ``street2sat-gcloud-labeling`` bucket is read
    as a plain list of image URIs, one per line (assumed format -- the files
    were previously parsed as a single newline-separated column).

    Returns:
        set[str]: Image paths with the ``gs://street2sat-uploaded/`` prefix
        removed. The stray ``'0'`` index entry that an earlier export wrote
        into the CSVs is not included.

    Raises:
        AssertionError: If the same URI is listed more than once across the
            CSVs, i.e. two labeling tasks contain the same image.
    """
    labeling_bucket = 'street2sat-gcloud-labeling'
    uploaded_prefix = 'gs://street2sat-uploaded/'
    stray_index = '0'  # An index of 0 was erroneously output in previous csv

    csv_blobs = [blob for blob in client.list_blobs(labeling_bucket, prefix="") if blob.name.endswith(".csv")]

    images_already_being_labelled = []
    for csv_blob in tqdm(csv_blobs, desc="Get already labelled"):
        # Read the text directly rather than pd.read_csv(..., sep="\n"):
        # recent pandas versions refuse a line terminator as the separator.
        for line in csv_blob.download_as_text().splitlines():
            uri = line.strip()
            if uri and uri != stray_index:
                images_already_being_labelled.append(uri)

    # Ensure there are no duplicates in images already being labelled.
    # The stray index was filtered out above, so there is no need for a
    # list.remove('0') that raises ValueError when '0' is not a duplicate.
    uri_counts = collections.Counter(images_already_being_labelled)
    dupes = [item for item, count in uri_counts.items() if count > 1]
    assert len(dupes) == 0, "Found duplicates in images being labelled. One of the labeling tasks needs to be removed."

    return {x.replace(uploaded_prefix, '') for x in images_already_being_labelled}
already_labeled = get_images_already_being_labelled()

Get already labelled: 100%|██████████| 24/24 [00:03<00:00,  7.18it/s]


In [57]:
# Stream image documents from Firestore and collect, for each image that has
# coordinates: latitude, longitude, source URI, country, and whether it is
# already part of a labeling task.
MAX_IMAGES = 30_000  # stop after this many usable records
UPLOADED_PREFIX = 'gs://street2sat-uploaded/'

lat = []
lon = []
name = []
being_labeled = []
country = []
for image in tqdm(coll.stream()):
    d_image = image.to_dict()

    # Skip documents without a usable coordinate pair.
    coord = d_image.get('coord')
    if coord is None or coord[0] is None or coord[1] is None:
        continue

    img_uri = d_image['input_img']

    # For a URI like gs://street2sat-uploaded/<folder>/<file>, component 3 of
    # the '/'-split is <folder>. Folders starting with '2021' are date-named
    # uploads, so no country can be derived from them.
    folder = img_uri.split('/')[3]
    img_country = 'NA' if folder.startswith('2021') else folder

    # `already_labeled` stores paths with the bucket prefix removed, so strip
    # the prefix before the membership test; comparing the full URI never
    # matches.
    is_being_labeled = img_uri.replace(UPLOADED_PREFIX, '') in already_labeled

    # Append only after every value has been computed so the five lists stay
    # the same length even if a malformed record raises above.
    lat.append(coord[0])
    lon.append(coord[1])
    name.append(img_uri)
    country.append(img_country)
    being_labeled.append(is_being_labeled)

    # tqdm already reports progress, so no manual counter is needed.
    if len(name) >= MAX_IMAGES:
        break

10623it [00:14, 690.88it/s]

10000


26998it [00:30, 973.05it/s] 

20000


33370it [00:39, 851.33it/s] 


KeyboardInterrupt: 

In [53]:
# Assemble the per-image lists collected above into a single table,
# one row per image, then preview the first rows.
df = pd.DataFrame(
    {
        'name': name,
        'latitude': lat,
        'longitude': lon,
        'being_labeled': being_labeled,
        'country': country,
    }
)
df.head()

Unnamed: 0,name,latitude,longitude,being_labeled,country
0,gs://street2sat-uploaded/2021-07-08-T1/GPEJ8113.JPG,-0.004148,36.22377,False,
1,gs://street2sat-uploaded/2021-07-08-T1/GPEJ8114.JPG,-0.004141,36.223775,False,
2,gs://street2sat-uploaded/2021-07-08-T1/GPEJ8115.JPG,-0.00413,36.223784,False,
3,gs://street2sat-uploaded/2021-07-08-T1/GPEJ8116.JPG,-0.00412,36.223791,False,
4,gs://street2sat-uploaded/2021-07-08-T1/GPEJ8117.JPG,-0.004105,36.223802,False,


In [54]:
# Plot every collected image location on an interactive map.
# The Mapbox token is read from the environment so that it is not committed
# with the notebook.
# NOTE(review): the token that was previously hardcoded here has been shared
# with the notebook and should be rotated.
mapbox_token = os.environ.get('MAPBOX_ACCESS_TOKEN')
if not mapbox_token:
    raise RuntimeError("Set the MAPBOX_ACCESS_TOKEN environment variable to render the map.")
px.set_mapbox_access_token(mapbox_token)
fig = px.scatter_mapbox(df, lat='latitude', lon='longitude', size_max=15, zoom=10)
fig.show()