## Creating the Test Set 
- Sample uniformly across the collected areas
- Split by country and randomly sample 1000 images from each country
- Upload the result as test-set.csv to the CSV bucket on Google Cloud Storage

In [17]:
from google.cloud import firestore
from google.cloud import storage
from tqdm import tqdm
import geopandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.display.max_colwidth = 200
import reverse_geocoder as rg
import random
import collections
import plotly.express as px

#### Initialize Connection to gcloud

In [18]:
# Locations of the CSV exports on Google Cloud Storage
CSV_BUCKET_NAME = 'street2sat-database-csv'
DATABASE_CSV_URI = 'gs://street2sat-database-csv/database-info.csv'

# Open a storage client and fetch a handle on the CSV bucket (used later for uploads)
client = storage.Client()
csv_bucket = client.get_bucket(CSV_BUCKET_NAME)

# Read the database-info.csv export directly from the bucket into a DataFrame
db = pd.read_csv(DATABASE_CSV_URI)
# .size is the total number of cells (rows x columns), not the number of rows
db.size

14799410

In [19]:
# Keep only the rows whose 'being_labeled' flag is False, i.e. drop every
# image that is currently being labeled
not_being_labeled = db['being_labeled'].eq(False)
db = db.loc[not_being_labeled]
db.size

14776670

In [20]:
# Number of rows drawn from each country
SAMPLES_PER_COUNTRY = 1000
# Fixed seed so that Restart & Run All reproduces exactly the same test set
SAMPLE_SEED = 42


def sample_country(frame: pd.DataFrame, country: str,
                   n: int = SAMPLES_PER_COUNTRY,
                   seed: int = SAMPLE_SEED) -> pd.DataFrame:
    """Return `n` randomly sampled rows of `frame` whose 'country' equals `country`.

    Parameters:
        frame: table with a 'country' column.
        country: label to match exactly (case-sensitive) against 'country'.
        n: number of rows to draw, without replacement.
        seed: random_state passed to DataFrame.sample for reproducibility.

    Raises:
        ValueError: if fewer than `n` rows are available for `country`.
    """
    rows = frame[frame['country'] == country]
    # Fail with an explicit message instead of the generic pandas error
    if len(rows) < n:
        raise ValueError(
            f"Cannot sample {n} rows for country {country!r}: only {len(rows)} available"
        )
    return rows.sample(n, random_state=seed)


# NOTE(review): the labels below use inconsistent casing ("USA", 'KENYA', 'Uganda');
# they are matched exactly, so confirm they agree with the values stored in db['country'].
db_usa_sample = sample_country(db, "USA")
db_kenya_sample = sample_country(db, 'KENYA')
db_uganda_sample = sample_country(db, 'Uganda')

# Stack the per-country samples into a single test set
test_set_concat = pd.concat([db_usa_sample, db_kenya_sample, db_uganda_sample])

In [21]:
# Plot the sampled points on a map as a visual check of the test set.
import os

# The Mapbox access token is a credential, so it is read from the environment
# rather than being stored in the notebook. Any token that was ever committed
# to version control should be considered leaked and rotated.
mapbox_token = os.environ.get('MAPBOX_ACCESS_TOKEN')
if not mapbox_token:
    raise RuntimeError(
        'Set the MAPBOX_ACCESS_TOKEN environment variable before plotting the test set.'
    )
px.set_mapbox_access_token(mapbox_token)

fig = px.scatter_mapbox(test_set_concat, lat='latitude', lon='longitude', size_max=15, zoom=10)
fig.show()

In [22]:
# upload test set to database
import string

# Serialize once so that both uploads are guaranteed to carry identical bytes
test_set_csv = test_set_concat.to_csv()

# 8-character suffix of uppercase letters and digits, drawn from the OS
# entropy source so it does not depend on any seed set on the `random` module
secure_rng = random.SystemRandom()
random_string = ''.join(secure_rng.choice(string.ascii_uppercase + string.digits) for _ in range(8))
backup_blob_name = f'test-set-DUPLICATE-{random_string}.csv'

# upload two times, once called test-set.csv which may overwrite if another one exists
blob = csv_bucket.blob('test-set.csv')
blob.upload_from_string(test_set_csv, content_type='text/csv')

# the other with a random string so that we can roll back to old test sets if needed
blob = csv_bucket.blob(backup_blob_name)
blob.upload_from_string(test_set_csv, content_type='text/csv')

print('Uploaded test-set.csv')
# Report the backup name too; otherwise the random suffix is never recorded anywhere
print(f'Uploaded backup copy {backup_blob_name}')


Uploaded test-set.csv


In [24]:
# Sanity check: every row in the test set must have being_labeled == False
test_set_concat['being_labeled'].eq(False).all()

True