### VERGE: Vector-mode Regional Geospatial Encoding
# Split definition

Divide the input instances into training and validation sets.


## Processing Setup

In [1]:
# Google colab setup
import os
from google.colab import drive
drive.mount('/content/drive')
project_home = '/content/drive/MyDrive/Projects/verge'
os.chdir(project_home)
!pip install geo_encodings

Mounted at /content/drive
Collecting geo_encodings
  Downloading geo_encodings-1.0.4-py2.py3-none-any.whl.metadata (4.0 kB)
Downloading geo_encodings-1.0.4-py2.py3-none-any.whl (6.9 kB)
Installing collected packages: geo_encodings
Successfully installed geo_encodings-1.0.4


In [2]:
# Local processing setup
# When running outside Colab, uncomment the assignment below so that
# `project_home` is defined without the Drive mount.
# NOTE(review): '..' presumably assumes the notebook is launched from a
# directory one level below the project root -- confirm.
# project_home = '..'

## Notebook setup

In [3]:
import pandas as pd
import numpy as np
import glob
import pickle
import os


## Parameters

In [4]:
# Region of interest (ROI) to process.
roi_name = 'newengland'

# General-purpose data directory, located directly under the project root.
data_home = '{}/data'.format(project_home)

# ROI-specific data directory, a subdirectory of the general data directory.
roi_home = '{}/{}'.format(data_home, roi_name)

# Unique identifier for this run; it becomes part of every output file name.
run_id = '201'

# Fraction of cases to use for training.
train_fraction = 0.8

# Seed NumPy's global random generator so the random draws made later in
# the notebook are repeatable from a fresh run.
np.random.seed(5)

## Processing


In [5]:
# Read the tile table for the selected ROI and report how many tiles it has.
fname = '/'.join([roi_home, 'tiles.csv'])
tile_info = pd.read_csv(fname)
print('%d tiles' % tile_info.shape[0])

# Sorted array of the distinct AOI tags that appear in the tile table.
aoi_tags = np.unique(tile_info['aoi_tag'].to_numpy())
print('%d unique AOIs' % aoi_tags.size)

15793 tiles
263 unique AOIs


In [6]:
# Add a column for the split, with a default value of "train".
# It will be re-set below.
tile_info['split'] = 'train'

# Keep the labels in an explicit, writable copy. This array is edited in
# place later on and then assigned back to the column. `.values` may return
# a view of the frame's own data, which is read-only under pandas
# Copy-on-Write and otherwise lets in-place edits write through to
# `tile_info` before the explicit assignment.
splits = tile_info['split'].to_numpy(copy=True)

In [7]:
# Preview the first five rows, including the newly added `split` column.
tile_info.head(n=5)

Unnamed: 0,aoi_tag,tile_tag,lon0,lat0,lon1,lat1,center_lon,center_lat,split
0,0735w-413n,005-005,-73.5,41.3,-73.476879,41.318544,-73.48844,41.309272,train
1,0735w-413n,005-006,-73.50037,41.308994,-73.477246,41.327538,-73.488808,41.318266,train
2,0735w-413n,005-007,-73.50074,41.317988,-73.477613,41.336533,-73.489176,41.32726,train
3,0735w-413n,005-008,-73.50111,41.326983,-73.47798,41.345527,-73.489545,41.336255,train
4,0735w-413n,005-009,-73.50148,41.335977,-73.478347,41.354521,-73.489914,41.345249,train


In [8]:
# Move a random subset of AOIs to the validation split. The decision is made
# once per AOI, so all tiles that share an AOI tag receive the same label.
# One scalar draw is taken per AOI, in the order of `aoi_tags`; an AOI is
# selected when its draw exceeds `train_fraction`.
is_val_aoi = np.array(
    [np.random.random() > train_fraction for _ in aoi_tags],
    dtype=bool,
)
val_aoi_tags = aoi_tags[is_val_aoi]

# Relabel every tile whose AOI tag was selected.
in_val = tile_info['aoi_tag'].isin(val_aoi_tags).to_numpy()
splits[in_val] = 'val'


In [9]:
# Store the updated labels on the frame, then count tiles for each
# (AOI tag, split) combination.
tile_info['split'] = splits
aoi_split_pairs = tile_info[['aoi_tag', 'split']]
aoi_split_pairs.value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
aoi_tag,split,Unnamed: 2_level_1
0709w-432n,val,70
0709w-431n,train,70
0714w-424n,val,70
0710w-429n,train,70
0710w-428n,train,70
...,...,...
0702w-440n,train,28
0699w-439n,train,27
0708w-422n,val,26
0732w-426n,val,22


In [10]:
# Total number of tiles assigned to each split.
tile_info.loc[:, 'split'].value_counts()

Unnamed: 0_level_0,count
split,Unnamed: 1_level_1
train,12373
val,3420


In [11]:
# Write the per-tile split assignment (AOI tag, tile tag, split) to a CSV
# in the ROI directory; the run id is part of the file name.
fname = '/'.join([roi_home, 'splits-%s.csv' % run_id])
split_records = tile_info[['aoi_tag', 'tile_tag', 'split']]
split_records.to_csv(fname, index=False)
print('saved splits to %s' % fname)

saved splits to /content/drive/MyDrive/Projects/verge/data/newengland/splits-201.csv
