### VERGE: Vector-mode Regional Geospatial Embeddings

# Count classes

## Outputs:
* dat/class_info.csv: info on distribution of classes

## Processing setup

In [27]:
# Google colab setup
import os
from google.colab import drive
drive.mount('/content/drive')
project_home = '/content/drive/MyDrive/Projects/verge'
os.chdir(project_home)
!pip install geo-encodings

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
# Local processing setup
# project_home = '..'

## Notebook setup

In [29]:
import pandas as pd
import numpy as np
import glob
import geopandas
import os
import pickle
import json
from geo_encodings import MPPEncoder

## Parameters

In [30]:
# The name of the ROI to use.
roi_name = 'newengland'

# The name of the general-purpose data directory.
data_home = '%s/data' % (project_home)

# The name of the ROI-specific data directory.
roi_home = '%s/data/%s' % (project_home, roi_name)

# Sample this many tiles.
sample_size = 1000



## Data prep

In [31]:
# Read the list of labels.
fname = '%s/labels.csv' % data_home
labels = pd.read_csv(fname)
num_classes = len(labels)
print('%d labels' % num_classes)

label_id_lookup = {
    z['label']: z['id']
    for z in labels.to_dict('records')
}

22 labels


## Processing

In [32]:
fname = '%s/tiles.csv' % roi_home
tile_info = pd.read_csv(fname)
print('%d tiles in this ROI' % len(tile_info))
tile_info.head(3)


15793 tiles in this ROI


Unnamed: 0,aoi_tag,tile_tag,lon0,lat0,lon1,lat1,center_lon,center_lat
0,0735w-413n,005-005,-73.5,41.3,-73.476879,41.318544,-73.48844,41.309272
1,0735w-413n,005-006,-73.50037,41.308994,-73.477246,41.327538,-73.488808,41.318266
2,0735w-413n,005-007,-73.50074,41.317988,-73.477613,41.336533,-73.489176,41.32726


In [33]:
# In the loop below, we will also accumulate a histogram of class labels.
# This will be saved and used during model training to balance sampling.
class_counts = {z: 0 for z in range(num_classes)}

In [34]:

tile_records = tile_info.to_dict('records')
for k, rec in enumerate(np.random.choice(tile_records, sample_size)):
    if k % 10 == 0:
      print('sampling tile %d / %d' % (k, sample_size))
    tile_fname = '%s/tiles/%s/%s.pq' % (roi_home, rec['aoi_tag'], rec['tile_tag'])
    gdf = geopandas.read_parquet(tile_fname)
    for _, gent in gdf.iterrows():

        etype = np.zeros(num_classes)
        label_string = '%s : %s' % (gent['category'], gent['label'])
        label_id = label_id_lookup[label_string]
        class_counts[label_id] += 1

#

sampling tile 0 / 1000
sampling tile 1 / 1000
sampling tile 2 / 1000
sampling tile 3 / 1000
sampling tile 4 / 1000
sampling tile 5 / 1000
sampling tile 6 / 1000
sampling tile 7 / 1000
sampling tile 8 / 1000
sampling tile 9 / 1000
sampling tile 10 / 1000
sampling tile 11 / 1000
sampling tile 12 / 1000
sampling tile 13 / 1000
sampling tile 14 / 1000
sampling tile 15 / 1000
sampling tile 16 / 1000
sampling tile 17 / 1000
sampling tile 18 / 1000
sampling tile 19 / 1000
sampling tile 20 / 1000
sampling tile 21 / 1000
sampling tile 22 / 1000
sampling tile 23 / 1000
sampling tile 24 / 1000
sampling tile 25 / 1000
sampling tile 26 / 1000
sampling tile 27 / 1000
sampling tile 28 / 1000
sampling tile 29 / 1000
sampling tile 30 / 1000
sampling tile 31 / 1000
sampling tile 32 / 1000
sampling tile 33 / 1000
sampling tile 34 / 1000
sampling tile 35 / 1000
sampling tile 36 / 1000
sampling tile 37 / 1000
sampling tile 38 / 1000
sampling tile 39 / 1000
sampling tile 40 / 1000
sampling tile 41 / 1000
sa

In [35]:
class_counts

{0: 1024,
 1: 1423,
 2: 12499,
 3: 57,
 4: 1593,
 5: 732,
 6: 558,
 7: 672,
 8: 540,
 9: 659,
 10: 3822,
 11: 425,
 12: 2033,
 13: 150,
 14: 4458,
 15: 6261,
 16: 48741,
 17: 6914,
 18: 5119,
 19: 1716,
 20: 1000,
 21: 11196}

In [36]:
# Save the label probabilities.
total = 0.0
for key in class_counts:
    total += class_counts[key]

records = [
    {
        'label': z,
        'count': class_counts[z],
        'prob': class_counts[z] / total
    } for z in range(num_classes)
]
fname = '%s/class_info.csv' % roi_home
pd.DataFrame(records).to_csv(fname, index=False)
print('wrote %s' % fname)

wrote /content/drive/MyDrive/Projects/verge/data/newengland/class_info.csv
