# MobiML Sampling Demo

Randomly sample trajectories in a region of interest. Based on: https://github.com/microsoft/torchgeo and https://james-brennan.github.io/posts/fast_gridding_geopandas/

In [None]:
import numpy as np
import geopandas as gpd
import shapely
import folium
import math
from datetime import datetime

import sys

sys.path.append("../src")
from mobiml.datasets import AISDK

### Loading AISDK data

This dataset can be downloaded from: http://web.ais.dk/aisdata/aisdk-2018-02.zip

In [None]:
data = AISDK(r"../examples/data/aisdk_20180208_sample.zip")
data.df.head()

### Use trajectory start locations for sampling

In [None]:
trajs = data.to_trajs()

In [None]:
start = trajs.get_start_locations()

### Set bounding box

In [None]:
xmin, ymin, xmax, ymax = start.total_bounds
print(xmin, ymin, xmax, ymax)

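Buffer the bounding box by 0.01&deg; on each side. This is roughly 1 km in the north–south direction; in the east–west direction it is less (about 0.6 km at Danish latitudes).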

In [None]:
# Grow the box by 0.01 degrees on every side: lower corner moves down/left,
# upper corner moves up/right.
xmin, ymin = xmin - 0.01, ymin - 0.01
xmax, ymax = xmax + 0.01, ymax + 0.01

In [None]:
print("Buffered bounding box:", xmin, ymin, xmax, ymax)

### Calculate number of grid cells

In [None]:
n_cells = 2  # an int is used for both axes; pass a tuple (cells along x, cells along y) for different values


def stride(value):
    """Return the grid cell size along x and y for the requested cell count.

    The extent is read from the notebook-level ``xmin``, ``ymin``, ``xmax``
    and ``ymax`` variables.

    Parameters
    ----------
    value : int or tuple
        Number of cells per axis. An int is used for both axes; a tuple is
        read as ``(cells along x, cells along y)``.

    Returns
    -------
    tuple
        ``(cell_size_x, cell_size_y)``.

    Raises
    ------
    TypeError
        If ``value`` is neither a tuple nor an int.
    """
    if isinstance(value, tuple):
        n_x, n_y = value[0], value[1]
    elif isinstance(value, int):
        n_x = n_y = value
    else:
        # Raise instead of printing: without a valid cell count there is no
        # cell size that could be returned.
        raise TypeError(
            f"Please provide a tuple or int, got {type(value).__name__}."
        )
    return (xmax - xmin) / n_x, (ymax - ymin) / n_y


cell_size_x, cell_size_y = stride(n_cells)

In [None]:
# Derive the number of columns/rows once and address cells by integer
# position. Stepping with np.arange and a float step can yield one element
# too many or too few because of rounding, which would add or drop a whole
# row/column of cells. The small tolerance keeps an exact multiple from being
# rounded up to an extra cell.
n_cols = math.ceil((xmax - xmin) / cell_size_x - 1e-9)
n_rows = math.ceil((ymax - ymin) / cell_size_y - 1e-9)

grid_cells = []
for col in range(n_cols):
    x0 = xmin + col * cell_size_x
    for row in range(n_rows):
        y0 = ymin + row * cell_size_y
        grid_cells.append(
            shapely.geometry.box(x0, y0, x0 + cell_size_x, y0 + cell_size_y)
        )

In [None]:
cell = gpd.GeoDataFrame(grid_cells, columns=["geometry"], crs="EPSG:4326")

In [None]:
cell["cell"] = cell.index

In [None]:
# Start the interactive map with the grid cells as the first layer.
m = cell.explore(name="cells")

# Draw the trajectory start points onto the same map (m=m), using the
# traj_id column for styling and showing attributes in a popup on click.
start.explore(
    m=m,
    column="traj_id",
    popup=True,
    legend=False,
    name="trajectory start points",
)

# Add the CartoDB positron tile layer and a layer control to the map.
folium.TileLayer("CartoDB positron").add_to(m)
folium.LayerControl().add_to(m)

# Last expression of the cell: renders the map in the notebook.
m

### Determine dataset sample

In [None]:
merged = gpd.sjoin(start, cell, how="left", predicate="within")

In [None]:
merged = merged.drop(columns="index_right")

Determine empty cells that will not be used for sampling.

In [None]:
filled_cells = merged.cell.unique()

In [None]:
# Every grid cell id, taken directly from the grid. An outer merge of the
# joined points with the grid is not needed for this and would add NaN to the
# list whenever a start point was not matched to any cell.
all_cells = cell["cell"].tolist()

In [None]:
# Ids listed in all_cells that do not occur in filled_cells.
diff = [cell_id for cell_id in all_cells if cell_id not in filled_cells]

if diff:
    print("The following cells are empty and will not be used for sampling:", diff)
else:
    print("All cells can be used for sampling.")

In [None]:
print("Number of cells used for sampling:", len(filled_cells))

In [None]:
print("Points per cell:\n", merged.cell.value_counts())

Specify how many samples you would like in total, either as a fraction of the dataset via `percent_sample` (e.g. `0.4` for 40%) or as an absolute count via `n_sample`.

In [None]:
def get_cell_sample(
    n_sample=None, percent_sample=None, random_state=None, **kwargs
):
    """Flag a random sample of the start locations, drawn cell by cell.

    Reads the notebook-level ``merged`` (start points joined with their grid
    cell id) and ``start``. The requested total is divided evenly over the
    cells that contain points (rounded up), and that many points are drawn at
    random from each cell. Cells holding fewer points contribute all of them.

    Parameters
    ----------
    n_sample : int, optional
        Total number of samples requested.
    percent_sample : float, optional
        Fraction of the dataset to sample, e.g. ``0.4``. Takes precedence
        over ``n_sample`` when both are given.
    random_state : optional
        Passed to ``sample`` so that draws can be reproduced. The default
        ``None`` leaves the draws unseeded.
    **kwargs
        Accepted for backward compatibility; not used.

    Returns
    -------
    DataFrame
        ``merged`` with an added ``split`` column: 2 for sampled rows and 1
        for all other rows.

    Raises
    ------
    ValueError
        If neither ``n_sample`` nor ``percent_sample`` is given, if no point
        is assigned to a cell, or if the requested sample is larger than the
        dataset.
    """
    if percent_sample:
        n_sample = percent_sample * len(merged)
    if n_sample is None:
        raise ValueError("Please provide n_sample or percent_sample.")
    if n_sample > len(start):
        raise ValueError(
            f"Your sample of {n_sample} cannot be greater than the dataset: "
            f"{len(start)}"
        )

    # value_counts() ignores NaN, so only cells that really contain points
    # (the same groups that groupby() iterates over) share the sample.
    points_per_cell = merged["cell"].value_counts()
    if points_per_cell.empty:
        raise ValueError("No start location is assigned to a grid cell.")

    per_cell = math.ceil(n_sample / len(points_per_cell))
    print("Number of samples per cell:", per_cell)

    n_short = int((points_per_cell < per_cell).sum())
    if n_short:
        print(
            "Not enough points in",
            n_short,
            "cell(s), so all points in the cell(s) will be used for sampling.",
        )

    # Only the trajectory ids are needed to mark the sample, so draw from
    # that column per cell; min() caps the draw at the cell's size.
    sampled_ids = merged.groupby("cell", group_keys=False)["traj_id"].apply(
        lambda ids: ids.sample(
            min(per_cell, len(ids)), random_state=random_state
        )
    )
    df_sample = sampled_ids.to_frame(name="traj_id")
    df_sample["split"] = 2

    # Left merge keeps every row of merged; rows without a match get NaN in
    # "split" and are then labelled 1.
    combined = merged.merge(df_sample, how="left", on="traj_id")
    combined.loc[combined["split"] != 2, "split"] = 1
    return combined


combined = get_cell_sample(n_sample=100)

In [None]:
combined.head()

### Separate sampled data

In [None]:
def keep_sample():
    """Return the sampled rows of the notebook-level ``combined`` frame.

    Rows whose ``split`` value equals 2 are kept, the ``split`` column is
    removed, and the number of remaining records is printed.
    """
    is_sampled = combined["split"] == 2
    sampled = combined[is_sampled].drop(columns="split")
    print("Your sample contains", len(sampled), "records.")
    return sampled


df_sample = keep_sample()

In [None]:
df_sample.head()

In [None]:
# Start a fresh interactive map with the grid cells as the first layer.
m = cell.explore(name="cells")

# All trajectory start points, drawn in blue on the same map.
start.explore(
    m=m,
    color="blue",
    popup=True,
    legend=False,
    name="all data",
)

# The sampled points, drawn in red after (and therefore over) the blue layer.
df_sample.explore(
    m=m,
    color="red",
    popup=True,
    legend=False,
    name="sample data",
)

# Add the CartoDB positron tile layer and a layer control to the map.
folium.TileLayer("CartoDB positron").add_to(m)
folium.LayerControl().add_to(m)

# Last expression of the cell: renders the map in the notebook.
m

## Dataset sampling with RandomTrajSampler

In [None]:
from mobiml.samplers import RandomTrajSampler

data = AISDK(r"../examples/data/aisdk_20180208_sample.zip")
data.to_trajs()

In [None]:
random_sample = RandomTrajSampler(data).split(n_cells=(2, 2), n_sample=100)
random_sample.df.head()

In [None]:
sample_data = RandomTrajSampler(data).sample(n_cells=(2, 1), percent_sample=0.4)
sample_data.to_trajs()

## Splitting at timestamps with TemporalSplitter

Split the dataset temporally: one timestamp yields a train/dev split, and two timestamps yield a train/dev/test split.

In [None]:
aisdk = AISDK(r"../examples/data/aisdk_20180208_sample.zip")
aisdk.df.head()

In [None]:
from mobiml.samplers import TemporalSplitter

aisdk = TemporalSplitter(aisdk).split_at_timestamp(
    timestamp=datetime(2018, 2, 8, 8, 0, 0)
)
aisdk.df

In [None]:
# Split at two timestamps (08:00 and 16:00 on 2018-02-08).
# NOTE(review): `aisdk` was already reassigned to the result of the
# single-timestamp split in the previous cell, so this call wraps that result
# rather than the freshly loaded dataset — confirm this is intended.
aisdk = TemporalSplitter(aisdk).split_at_timestamp(
    timestamp=datetime(2018, 2, 8, 8, 0, 0), timestamp_2=datetime(2018, 2, 8, 16, 0, 0)
)
aisdk.df