# MobiML Sampling Demo

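Randomly sample trajectories in a region of interest: a regular grid is laid over the trajectory start points and the same number of trajectories is drawn from every grid cell. Based on https://github.com/microsoft/torchgeo and https://james-brennan.github.io/posts/fast_gridding_geopandas/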

In [24]:
import numpy as np
import geopandas as gpd
import shapely
import folium

import sys

sys.path.append("..")
from mobiml.datasets import AISDK

In [None]:
# Load the AIS sample archive through MobiML's AISDK dataset class.
data = AISDK(r"../examples/data/aisdk_20180208_sample.zip")
# Preview the first rows of the loaded DataFrame.
data.df.head()

In [3]:
# Convert the dataset into trajectories
# (presumably a MovingPandas TrajectoryCollection — confirm against AISDK.to_trajs).
trajs = data.to_trajs()

In [4]:
# Start locations of the trajectories; this frame is what gets gridded and
# sampled below (it is used with total_bounds, explore and sjoin).
start = trajs.get_start_locations()

In [5]:
# Declare the CRS as WGS 84 (EPSG:4326), the same CRS the grid cells are
# created with further down. set_crs returns the frame, so rebinding the
# result is sufficient and inplace=True is not needed.
start = start.set_crs("EPSG:4326")

#### Set bounding box

In [None]:
# Extent of all start locations, unpacked as (xmin, ymin, xmax, ymax).
xmin, ymin, xmax, ymax = start.total_bounds
print(xmin, ymin, xmax, ymax)

Buffer the bounding box by 0.01&deg; on every side, which is roughly 1 km (about 1.1 km north–south; less east–west at higher latitudes)

In [7]:
# Pad the bounding box on every side so that points lying exactly on the
# original extent end up strictly inside the grid (the spatial join below uses
# the "within" predicate).
buffer_deg = 0.01  # in CRS units, i.e. degrees for EPSG:4326

# Start from the unbuffered extent again so that re-running this cell does not
# keep growing the box by another buffer_deg each time.
xmin, ymin, xmax, ymax = start.total_bounds
xmin, ymin = xmin - buffer_deg, ymin - buffer_deg
xmax, ymax = xmax + buffer_deg, ymax + buffer_deg

In [None]:
# Show the padded extent that the grid will cover.
print("Buffered bounding box:", xmin, ymin, xmax, ymax)

#### Calculate grid cell size from the number of cells

In [9]:
n_cells = 2  # number of grid cells per axis; pass a tuple (cells_x, cells_y) to use different counts for width and height


def stride(value, bounds=None):
    """Return the grid cell size for a requested number of cells.

    Parameters
    ----------
    value : int, float or tuple
        Number of cells the bounding box is divided into. A single number is
        used for both axes; a tuple is read as ``(cells_x, cells_y)``.
    bounds : tuple of float, optional
        ``(xmin, ymin, xmax, ymax)`` of the area to grid. Defaults to the
        notebook-level ``xmin``, ``ymin``, ``xmax`` and ``ymax``.

    Returns
    -------
    tuple of float
        ``(cell_size_x, cell_size_y)`` in the units of the bounding box.

    Raises
    ------
    TypeError
        If ``value`` is not a tuple, int or float.
    ValueError
        If a requested cell count is not positive.
    """
    if bounds is None:
        bounds = (xmin, ymin, xmax, ymax)
    x_lo, y_lo, x_hi, y_hi = bounds

    if isinstance(value, tuple):
        cells_x, cells_y = value[0], value[1]
    elif isinstance(value, (int, float)):
        cells_x = cells_y = value
    else:
        # Fail loudly: merely printing here would fall through to the return
        # statement with the cell sizes still unassigned.
        raise TypeError("Please provide a tuple, int or float.")

    if cells_x <= 0 or cells_y <= 0:
        raise ValueError("The number of cells must be positive.")

    return (x_hi - x_lo) / cells_x, (y_hi - y_lo) / cells_y


# Cell width and height in CRS units (degrees for EPSG:4326).
cell_size_x, cell_size_y = stride(n_cells)

In [10]:
# Build one rectangle per grid cell, column by column (x outer, y inner).
#
# The number of columns/rows is derived explicitly instead of iterating over
# np.arange(xmin, xmax, cell_size_x): with a floating-point step, arange can
# emit one element too many when (xmax - xmin) / cell_size_x rounds to slightly
# above an integer, which would add a spurious extra row or column of cells.
# Rounding before ceil removes that noise while still covering the whole box
# when the requested cell count is fractional.
n_cols = int(np.ceil(np.round((xmax - xmin) / cell_size_x, 9)))
n_rows = int(np.ceil(np.round((ymax - ymin) / cell_size_y, 9)))

grid_cells = [
    shapely.geometry.box(x0, y0, x0 + cell_size_x, y0 + cell_size_y)
    for x0 in xmin + cell_size_x * np.arange(n_cols)
    for y0 in ymin + cell_size_y * np.arange(n_rows)
]

In [11]:
# Wrap the rectangles in a GeoDataFrame whose only column is the geometry,
# using the same CRS as the start points.
cell = gpd.GeoDataFrame(geometry=grid_cells, crs="EPSG:4326")

In [12]:
# Give every grid cell an id (its row index) so points can be labelled by cell.
cell["cell"] = cell.index

In [None]:
# Base map: the grid cells.
m = cell.explore()

# Draw the start points onto the same map, coloured by trajectory id.
start.explore(
    m=m,
    column="traj_id",
    legend=False,
)

# Add the CartoDB positron tiles as a layer plus a layer control widget.
folium.TileLayer("CartoDB positron").add_to(m)
folium.LayerControl().add_to(m)

# Last expression of the cell: display the map.
m

#### Determine dataset sample

In [14]:
# Tag each start point with the grid cell that contains it. The left join
# keeps every start point; a point that matches no cell gets NaN in "cell".
merged = gpd.sjoin(start, cell, how="left", predicate="within")

In [15]:
# index_right is the join's reference to the matched row of `cell`; the
# "cell" column already carries that same index value, so it is dropped.
merged = merged.drop(columns="index_right")

In [None]:
# Number of start points that fell into each grid cell.
print("Points per cell:\n", merged.cell.value_counts())

In [17]:
n_sample = 0.1  # samples to draw from each cell: a value below 1 is read as a fraction of ALL joined start points (see calc_sample_size), any other value as an absolute count


def calc_sample_size(n_sample, population=None):
    """Turn a fractional or absolute sample specification into a count.

    Parameters
    ----------
    n_sample : int or float
        Values below 1 are interpreted as a fraction of ``population``;
        values of 1 or more are taken as an absolute count and truncated to
        an integer.
    population : int, optional
        Number of records the fraction refers to. Defaults to
        ``len(merged)``, i.e. all joined start points (not the points of a
        single cell).

    Returns
    -------
    int
        The absolute sample size.

    Raises
    ------
    ValueError
        If ``n_sample`` is negative.
    """
    if n_sample < 0:
        # A negative size would only surface later as an opaque sampling error.
        raise ValueError("n_sample must not be negative.")
    if n_sample < 1:
        if population is None:
            population = len(merged)
        return int(n_sample * population)
    return int(n_sample)


# Resolve the specification above to an absolute number of samples.
n_sample = calc_sample_size(n_sample)

In [18]:
def get_cell_sample(n_sample, points=None, random_state=None):
    """Draw the same number of random points from every grid cell.

    Parameters
    ----------
    n_sample : int
        Number of points to draw from each cell. If it exceeds the size of
        the least populated cell it is reduced to that size (a message is
        printed), so that every cell can supply the same number of points.
    points : DataFrame, optional
        Frame with at least the columns ``"traj_id"`` and ``"cell"``.
        Defaults to the notebook-level ``merged``.
    random_state : int or numpy random generator, optional
        Passed on to the sampler; set it to make the draw reproducible.

    Returns
    -------
    DataFrame
        ``points`` with an added ``"split"`` column: 2 for sampled rows,
        1 for all other rows.
    """
    if points is None:
        points = merged

    # Size of the least populated cell, computed once.
    smallest_cell = points["cell"].value_counts().min()
    if n_sample > smallest_cell:
        print(
            "Your cell sample of",
            n_sample,
            "cannot be greater than the minimum number of points in a cell:",
            smallest_cell,
        )
        print("Setting the cell sample to:", smallest_cell)
        n_sample = int(smallest_cell)

    sampled = points.groupby("cell").sample(n=n_sample, random_state=random_state)
    sampled = sampled[["traj_id"]].assign(split=2)

    # Join the labels back on the trajectory id; rows that were not drawn come
    # back with a missing "split" and are labelled 1.
    combined = points.merge(sampled, how="left", on="traj_id")
    combined["split"] = combined["split"].fillna(1)
    return combined


# Label every start point: split == 2 for sampled points, 1 for the rest.
combined = get_cell_sample(n_sample)

In [None]:
def keep_sample(frame=None):
    """Return only the sampled rows, without the ``"split"`` helper column.

    Parameters
    ----------
    frame : DataFrame, optional
        Frame with a ``"split"`` column in which sampled rows are marked
        with 2. Defaults to the notebook-level ``combined``.

    Returns
    -------
    DataFrame
        The rows whose ``"split"`` equals 2, with ``"split"`` removed. The
        number of rows is also printed.
    """
    if frame is None:
        frame = combined
    selected = frame.loc[frame["split"] == 2].drop(columns="split")
    print("Your sample contains", len(selected), "records.")
    return selected


# Keep only the rows that were drawn (split == 2).
df_sample = keep_sample()

In [None]:
# Base map: the grid cells.
m = cell.explore()

# All start points in blue ...
start.explore(
    m=m,
    color="blue",
    legend=False,
)

# ... with the sampled points drawn afterwards in red.
df_sample.explore(
    m=m,
    color="red",
    legend=False,
)

# Add the CartoDB positron tiles as a layer plus a layer control widget.
folium.TileLayer("CartoDB positron").add_to(m)
folium.LayerControl().add_to(m)

# Last expression of the cell: display the map.
m

## Dataset sampling with RandomTrajSampler

In [None]:
# NOTE(review): RandomTrajSampler presumably packages the manual grid-based
# sampling steps shown above — confirm against its implementation.
from mobiml.transforms.dataset_sampler import RandomTrajSampler

# Load the sample archive again into a fresh dataset object for the sampler.
data = AISDK(r"../examples/data/aisdk_20180208_sample.zip")
data.df.head()

In [None]:
# NOTE(review): the arguments look like (number of grid cells, samples per
# cell), mirroring n_cells and n_sample above — confirm against
# RandomTrajSampler.random_sample.
random_sample = RandomTrajSampler(data).random_sample(2, 20)
random_sample.df.head()

In [None]:
# NOTE(review): the tuple presumably gives separate cell counts for width and
# height, as in the manual stride() helper, followed by the samples per cell —
# confirm against RandomTrajSampler.get_sample_data.
sample_data = RandomTrajSampler(data).get_sample_data((2, 1), 10)
sample_data.df.head()