### VERGE: Vector-mode Regional Geospatial Encoding
# Split definition

Divide the input instances into training and validation sets.


## Processing Setup

In [11]:
# Google Colab setup (disabled for local runs).
# Uncomment this cell when running on Colab: it mounts Google Drive and
# changes the working directory to the project folder. The local setup
# cell below assigns project_home = '..' and would overwrite the value
# set here, so disable that cell when this one is enabled.
# import os
# from google.colab import drive
# drive.mount('/content/drive')
# project_home = '/content/drive/MyDrive/Projects/verge'
# os.chdir(project_home)

In [12]:
# Optional dependency install (disabled). Uncomment if the geo_encodings
# package is not already available in the environment.
# !pip install geo_encodings

In [13]:
# Local runs: the project root is the parent of the current working
# directory. All data paths below are built relative to it.
project_home = ".."

## Notebook setup

In [14]:
import pandas as pd
import numpy as np
import glob
import pickle
import os


## Parameters

In [15]:
# Reproducibility: fix NumPy's global seed so the random split below is
# the same on every run.
np.random.seed(5)

# Region of interest (ROI) to process.
roi_name = 'ne-laptop'

# Run identifier; it becomes part of every output file name.
run_id = '102'

# Target training fraction. The split cell compares one random draw per
# AOI against this threshold, so the realized fraction is approximate.
train_fraction = 0.8

# General-purpose data directory under the project root.
data_home = '%s/data' % project_home

# ROI-specific data directory.
roi_home = '%s/data/%s' % (project_home, roi_name)

## Processing


In [16]:
# Read the tile inventory for this ROI.
fname = '%s/tiles.csv' % (roi_home,)
tile_info = pd.read_csv(fname)
n_tiles = len(tile_info)
print('%d tiles' % n_tiles)

# Distinct AOI tags. np.unique returns them sorted, which fixes the order
# in which the random draws are consumed by the split loop.
aoi_column = tile_info['aoi_tag'].values
aoi_tags = np.unique(aoi_column)
n_aois = len(aoi_tags)
print('%d unique AOIs' % n_aois)

1056 tiles
20 unique AOIs


In [17]:
# Add a "split" column with a default value of "train"; the validation
# AOIs are re-labelled in a later cell.
tile_info['split'] = 'train'

# Work on an explicit, writable copy of the labels. `.values` can hand back
# a view of the frame's own data, which is read-only under pandas
# Copy-on-Write, so the in-place edits made later would either fail or
# silently modify `tile_info`. The edited array is written back to the
# frame once all AOIs have been processed.
splits = tile_info['split'].to_numpy(dtype=object, copy=True)

In [18]:
# Preview the first five rows, including the new "split" column.
tile_info.head(n=5)

Unnamed: 0,aoi_tag,tile_tag,lon0,lat0,lon1,lat1,center_lon,center_lat,split
0,0735w-413n,005-005,-73.5,41.3,-73.476879,41.318544,-73.48844,41.309272,train
1,0735w-413n,005-006,-73.50037,41.308994,-73.477246,41.327538,-73.488808,41.318266,train
2,0735w-413n,005-007,-73.50074,41.317988,-73.477613,41.336533,-73.489176,41.32726,train
3,0735w-413n,005-008,-73.50111,41.326983,-73.47798,41.345527,-73.489545,41.336255,train
4,0735w-413n,005-009,-73.50148,41.335977,-73.478347,41.354521,-73.489914,41.345249,train


In [19]:
# Move whole AOIs to the validation split. Each AOI gets exactly one
# uniform draw; AOIs whose draw does not exceed train_fraction stay in
# training. All tiles of an AOI therefore share the same split.
for aoi_tag in aoi_tags:
    draw = np.random.random()
    if draw <= train_fraction:
        continue
    is_in_aoi = tile_info['aoi_tag'] == aoi_tag
    splits[is_in_aoi] = 'val'
        

In [20]:
# Write the edited labels back into the frame, then count tiles for each
# (AOI, split) combination.
tile_info['split'] = splits
tile_info.value_counts(subset=['aoi_tag', 'split'])

aoi_tag     split
0731w-413n  train    63
0733w-413n  train    63
0731w-415n  train    63
0732w-413n  val      63
0735w-414n  train    63
0735w-413n  train    63
0734w-414n  train    63
0731w-414n  val      63
0733w-414n  train    62
0732w-444n  train    61
0732w-415n  train    60
0733w-424n  train    59
0735w-415n  train    58
0732w-445n  train    52
0733w-445n  val      44
0733w-423n  train    41
0735w-446n  train    39
0732w-418n  train    36
0732w-426n  train    22
0733w-444n  train    18
Name: count, dtype: int64

In [21]:
# Total number of tiles in each split.
split_counts = tile_info['split'].value_counts()
split_counts

split
train    886
val      170
Name: count, dtype: int64

In [22]:
# Persist one row per tile (AOI tag, tile tag, split) to a run-specific
# CSV in the ROI directory, without the index column.
fname = '%s/splits-%s.csv' % (roi_home, run_id)
split_records = tile_info[['aoi_tag', 'tile_tag', 'split']]
split_records.to_csv(fname, index=False)
print('saved splits to %s' % fname)

saved splits to ../data/ne-laptop/splits-102.csv
