```
# coding: utf-8

# MIT License
#
# Copyright (c) 2018 Duong Nguyen
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# ==============================================================================
```

## Imports

In [20]:
import numpy as np
import os
import sys
import pickle
from datetime import datetime
from tqdm import tqdm
import pandas as pd
import geopandas as gpd
import movingpandas as mpd
from datetime import datetime, timedelta

sys.path.append("..")
from mobiml.datasets import AISDK
from mobiml.preprocessing import TrajectoryFilter
from mobiml.transforms import TripExtractor, TemporalSplitter

## Setup

In [21]:
# Region of interest for the AISDK sample, expressed as a lon/lat bounding box
min_lon, max_lon = 11.0, 13.0
min_lat, max_lat = 57.0, 58.0

# Speed-over-ground (SOG) bounds handed to the speed filter further down
SOG_MIN = 1.0
SOG_MAX = 30.0  # SOG is truncated to 30.0 knots max

# Input: directory and archive name of the raw AIS sample
data_path = "../examples/data/"
csv_filename = "aisdk_20180208_sample.zip"

# Output: target directory and one pickle file name per split
out_path = "../examples/data/"
pkl_filename_train = "aisdk_20180208_train.pkl"
pkl_filename_valid = "aisdk_20180208_valid.pkl"
pkl_filename_test = "aisdk_20180208_test.pkl"

## Loading data

##### Filter to ROI by specifying bounding box

In [None]:
# Locate the zipped sample, announce the load with a timestamp, then read it
# with the bounding box passed in (min_lon, min_lat, max_lon, max_lat) order.
path = os.path.join(data_path, csv_filename)
print(f"{datetime.now()} Loading data from {path}")
bbox = (min_lon, min_lat, max_lon, max_lat)
aisdk = AISDK(path, *bbox)

## Preprocessing

##### Remove missing values

In [None]:
# Discard every row that has a missing value in any column, then preview
# the first rows of what is left.
aisdk.df = aisdk.df.dropna(axis=0, how="any")
aisdk.df.head(5)

In [None]:
# Summary of the cleaned table: message count, number of distinct traj_id
# values, and the spatial (x = lon, y = lat) and temporal extent.
print("After removing missing values we have...")
print("Total number of AIS messages: ", len(aisdk.df))
print("Total number of vessels:", aisdk.df["traj_id"].nunique(dropna=False))
print("Lat min: ", aisdk.df["y"].min(), "Lat max: ", aisdk.df["y"].max())
print("Lon min: ", aisdk.df["x"].min(), "Lon max: ", aisdk.df["x"].max())
print(
    "Time min: ",
    aisdk.df["timestamp"].min(),
    "Time max: ",
    aisdk.df["timestamp"].max(),
)

##### Remove 'Moored' and 'At anchor' AIS messages

In [None]:
# Remove every message whose navigational status is 'Moored' or 'At anchor'.
#
# A boolean mask is used instead of concat + drop_duplicates(keep=False):
# that idiom also threw away *all* copies of any fully duplicated row that
# was not moored/anchored, silently losing valid messages, and it had to
# hash every column of a frame twice the size of the input.
is_moored_or_anchored = aisdk.df["nav_status"].isin(["Moored", "At anchor"])

# Kept so the removed messages can still be inspected after this cell.
moored_at_anchor = aisdk.df[is_moored_or_anchored]

# .copy() yields an independent frame (as the previous concat-based code did),
# so later column assignments do not act on a slice of the old frame.
aisdk.df = aisdk.df[~is_moored_or_anchored].copy()
print("After removing 'Moored' or 'At anchor' AIS messages we have...")
print("Total number of AIS messages: ", aisdk.df.shape[0])

##### Drop trajectories with fewer than $Points_{min}$ locations

In [None]:
# Apply the minimum-point filter with a threshold of 20 points per trajectory
# and report how many messages remain.
min_pts_filter = TrajectoryFilter(aisdk)
aisdk = min_pts_filter.filter_min_pts(min_pts=20)
print("After removing trajectories with too few points we have...")
print("Total number of AIS messages: ", len(aisdk.df))

##### Drop speed outliers

In [None]:
# Apply the speed filter with the [SOG_MIN, SOG_MAX] bounds from the setup
# cell and report how many messages remain.
speed_filter = TrajectoryFilter(aisdk)
aisdk = speed_filter.filter_speed(min_speed=SOG_MIN, max_speed=SOG_MAX)
print("After removing speed outliers we have...")
print("Total number of AIS messages: ", len(aisdk.df))

##### Temporal train/valid/test split

In [None]:
# Run the temporal splitter; the next cell reads its result back from the
# "split" column. The bare expression displays the resulting table.
splitter = TemporalSplitter(aisdk)
aisdk = splitter.split_hr()
aisdk.df

In [None]:
# Partition the table by the value of its "split" column:
# 1.0 -> training, 2.0 -> validation, 3.0 -> test.
split_label = aisdk.df["split"]
aisdk_train = aisdk.df[split_label == 1.0]
aisdk_valid = aisdk.df[split_label == 2.0]
aisdk_test = aisdk.df[split_label == 3.0]

# Report the size of the whole table and of each partition.
print("Total number of AIS messages: ", aisdk.df.shape[0])
print("Number of msgs in the training set: ", aisdk_train.shape[0])
print("Number of msgs in the validation set: ", aisdk_valid.shape[0])
print("Number of msgs in the test set: ", aisdk_test.shape[0])

##### Remove short trajectories

In [30]:
def _as_point_gdf(frame):
    """Return *frame* as a GeoDataFrame with EPSG:4326 points built from its x/y columns."""
    points = gpd.points_from_xy(frame.x, frame.y)
    return gpd.GeoDataFrame(frame, geometry=points, crs="EPSG:4326")


# Give each split a point geometry column.
aisdk_train = _as_point_gdf(aisdk_train)
aisdk_valid = _as_point_gdf(aisdk_valid)
aisdk_test = _as_point_gdf(aisdk_test)

In [None]:
# Announce the step, then run trip extraction on the training split with a
# minimum length of 20 and a minimum duration of one hour.
print(
    "Removing AIS trajectories with a length less than 20 or with a duration of less than 1h..."
)
aisdk_train = TripExtractor(
    aisdk_train,
    min_length=20,
    min_duration=timedelta(hours=1),
)

In [None]:
# Trip extraction on the validation split, using the same thresholds as the
# training split (minimum length 20, minimum duration one hour).
aisdk_valid = TripExtractor(
    aisdk_valid,
    min_length=20,
    min_duration=timedelta(hours=1),
)

In [None]:
# Trip extraction on the test split, using the same thresholds as the
# training split (minimum length 20, minimum duration one hour).
aisdk_test = TripExtractor(
    aisdk_test,
    min_length=20,
    min_duration=timedelta(hours=1),
)

In [None]:
def _to_trajectory_table(trips):
    """Collapse the extracted trips of one split into a per-trajectory table.

    Calls ``trips.tc.to_traj_gdf`` (presumably a movingpandas
    TrajectoryCollection -- confirm) with mean speed and the max of the
    Name, nav_status and ship_type columns, then renames the resulting
    ``<column>_<function>`` columns to plain lower-case names.
    """
    # Dicts are built per call, so each to_traj_gdf/rename gets fresh objects.
    aggregation = {
        "speed": "mean",
        "Name": "max",
        "nav_status": "max",
        "ship_type": "max",
    }
    tidy_names = {
        "speed_mean": "speed",
        "Name_max": "name",
        "nav_status_max": "nav_status",
        "ship_type_max": "ship_type",
    }
    traj_gdf = trips.tc.to_traj_gdf(agg=aggregation)
    return traj_gdf.rename(columns=tidy_names)


# Same aggregation for every split, applied in train / valid / test order.
aisdk_train = _to_trajectory_table(aisdk_train)
aisdk_valid = _to_trajectory_table(aisdk_valid)
aisdk_test = _to_trajectory_table(aisdk_test)

# Number of distinct trajectory ids left in each split.
print("Total number of train trajectories:", aisdk_train.traj_id.nunique(dropna=False))
print("Total number of valid trajectories:", aisdk_valid.traj_id.nunique(dropna=False))
print("Total number of test trajectories:", aisdk_test.traj_id.nunique(dropna=False))

##### Format output as ndarrays

In [35]:
# aisdk_train = np.array(aisdk_train)
# aisdk_train

##### Output pickle files

In [None]:
# Pair every split with its pickle file name and write each one under out_path.
split_outputs = (
    (pkl_filename_train, aisdk_train),
    (pkl_filename_valid, aisdk_valid),
    (pkl_filename_test, aisdk_test),
)
for filename, split_table in split_outputs:
    target = os.path.join(out_path, filename)
    print("Writing to", target)
    with open(target, "wb") as f:
        pickle.dump(split_table, f)

##### Select vessel type

In [37]:
# TYPE = "Fishing"
# aisdk_type = aisdk.df[aisdk.df["ship_type"] == TYPE]
# print("Total number of vessels:", len(aisdk.df.traj_id.unique()))
# print("Total number of", TYPE, "vessels:", len(aisdk_type.traj_id.unique()))

In [38]:
# aisdk_type = np.array(aisdk_type)