### Data preparation

The purpose of this notebook is to prepare all the features that will be used to predict ship trajectories, and to store them in a cleaned and prepared CSV file.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from geopy.distance import geodesic
from timezonefinder import TimezoneFinder
import pytz

In [4]:
# Each input file is declared next to the call that loads it, so the path and
# its delimiter can be checked together.

# AIS position reports used for training (pipe-delimited).
ais_train_data_path = '../../Project materials/ais_train.csv'
ais_data_train = pd.read_csv(ais_train_data_path, sep='|')

# AIS test set -- note that this file is comma-delimited, unlike the others.
ais_test_data_path = '../../Project materials/ais_test.csv'
ais_data_test = pd.read_csv(ais_test_data_path, sep=',')

# Port table (pipe-delimited).
ports_data_path = '../../Project materials/ports.csv'
ports = pd.read_csv(ports_data_path, sep='|')

# Vessel table (pipe-delimited).
vessels_data_path = '../../Project materials/vessels.csv'
vessels = pd.read_csv(vessels_data_path, sep='|')

# Schedule table (pipe-delimited).
schedules_data_path = '../../Project materials/schedules_to_may_2024.csv'
schedules = pd.read_csv(schedules_data_path, sep='|')

##### Relevant AIS-data features

Navstat:

In [9]:
# EXTRACT MOVEMENT STATUS FROM NAVSTAT:

def categorize_navstat_contrast(navstat):
    """Translate a raw navstat code into a signed movement score.

    Parameters
    ----------
    navstat : scalar
        The navstat value of a single AIS record.

    Returns
    -------
    int or float
        1   for codes 0 and 8      (underway)
        0.5 for codes 2, 3 and 4   (restricted movement)
        -1  for codes 1, 5 and 6   (stationary)
        0   for any other value    (unknown)
    """
    # (codes, score) pairs, checked in order; the code groups are disjoint.
    score_by_codes = (
        ((0, 8), 1),
        ((2, 3, 4), 0.5),
        ((1, 5, 6), -1),
    )
    for codes, score in score_by_codes:
        if navstat in codes:
            return score
    # Nothing matched: treat the status as unknown.
    return 0

# Score every AIS record's navstat code element-wise and store it as a new column.
ais_data_train['movement_status'] = ais_data_train['navstat'].map(categorize_navstat_contrast)


ETARAW:

In [33]:
# etaRaw carries no year, so one is prepended before parsing.
# NOTE(review): every ETA is assumed to fall in `current_year` -- confirm this
# holds for reports close to a year boundary.
current_year = 2024

# Report timestamps as proper datetimes.
ais_data_train['time'] = pd.to_datetime(ais_data_train['time'])

# "<year>-<etaRaw>" is parsed as '%Y-%m-%d %H:%M'; anything that does not match
# that layout becomes NaT instead of raising (errors='coerce').
eta_with_year = str(current_year) + '-' + ais_data_train['etaRaw'].astype(str)
ais_data_train['ETARAW_transformed'] = pd.to_datetime(
    eta_with_year,
    format='%Y-%m-%d %H:%M',
    errors='coerce',
)

In [36]:

# Put each vessel's reports in chronological order and renumber the rows.
ais_data_train = ais_data_train.sort_values(by=['vesselId', 'time'])
ais_data_train = ais_data_train.reset_index(drop=True)

grouped = ais_data_train.groupby('vesselId')

# Fill missing ETAs by linear interpolation, separately for every vessel so
# that values are never interpolated across two different ships.
interpolated_eta = grouped['ETARAW_transformed'].apply(
    lambda vessel_eta: vessel_eta.interpolate(method='linear')
)
# Drop the outer index level of the apply result (presumably the vesselId group
# key that groupby.apply prepends) so the values line up with the frame's rows.
ais_data_train['ETARAW_transformed'] = interpolated_eta.reset_index(level=0, drop=True)

# Hours between the report time and the (interpolated) ETA; the value is
# negative when the ETA lies before the report time.
time_to_eta = ais_data_train['ETARAW_transformed'] - ais_data_train['time']
ais_data_train['estimated_time_to_destination'] = time_to_eta.dt.total_seconds() / 3600



In [40]:
# Descriptive statistics of the hours-to-ETA feature, as a quick sanity check.
hours_to_eta = ais_data_train['estimated_time_to_destination']
summary = hours_to_eta.describe()
print(summary)

count    1.522065e+06
mean     1.459338e+02
std      1.110177e+03
min     -3.068984e+03
25%     -1.885750e+01
50%      3.527778e-01
75%      2.642722e+01
max      8.783835e+03
Name: estimated_time_to_destination, dtype: float64


Other:

In [35]:
# Feature columns kept for every vessel.
desiered_ais_columns = [
    'latitude',
    'longitude',
    'cog',
    'sog',
    'movement_status',
    'estimated_time_to_destination',
]

# One DataFrame per vessel, keyed by vesselId and restricted to the columns above.
ship_train_groups = ais_data_train.groupby('vesselId')
ship_train_dataframes = dict(
    (vessel_id, vessel_rows[desiered_ais_columns])
    for vessel_id, vessel_rows in ship_train_groups
)

##### Most likely destination port

Should combine:
- Last known port
- Scheduled ports and the most common next port
- The port from the AIS data
- The port from the vessel's current schedule
- If not moored, check whether the ship has gotten closer to the port than it was a day ago