In [None]:
import os

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from geopy.distance import geodesic

# Load the raw inputs. The training and schedules files are pipe-separated,
# while the test file is comma-separated.
df_train = pd.read_csv('data/ais_train.csv', sep='|')
df_test = pd.read_csv('data/ais_test.csv', sep=',')
df_schedules = pd.read_csv('data/schedules_to_may_2024.csv', sep='|')



In [None]:
# Feature extraction
#
# This cell calls get_distance() and calculate_bearing(), so the cell that
# defines those helpers has to be executed before this one.
# Every per-vessel diff()/shift() below compares a row with the row that
# precedes it inside the same vesselId group -- this assumes the rows are
# already in chronological order per vessel (TODO confirm for the raw files).

# diff() only yields timedeltas (and therefore supports .dt) when "time" is a
# datetime column; pd.to_datetime leaves an already-converted column unchanged.
df_train['time'] = pd.to_datetime(df_train['time'])
df_test['time'] = pd.to_datetime(df_test['time'])

# Seconds elapsed since the previous record of the same vessel
df_train['time_difference'] = df_train.groupby('vesselId')['time'].diff().dt.total_seconds()
df_test['time_difference'] = df_test.groupby('vesselId')['time'].diff().dt.total_seconds()

# Previous position of the same vessel (NaN on each vessel's first record)
df_train['lat_prev'] = df_train.groupby('vesselId')['latitude'].shift(1)
df_train['lon_prev'] = df_train.groupby('vesselId')['longitude'].shift(1)

# Distance in meters to the previous position (geodesic, evaluated row by row)
df_train['distance'] = df_train.apply(
    lambda row: get_distance(row['latitude'], row['longitude'], row['lat_prev'], row['lon_prev']),
    axis=1,
)

# The first record of each vessel has no predecessor: treat it as "no time
# elapsed, no distance covered".
df_train['time_difference'] = df_train['time_difference'].fillna(0)
df_train['distance'] = df_train['distance'].fillna(0)

# Average speed in meters per second. A zero time_difference produces inf
# (or NaN for 0/0); both are mapped to 0.
df_train['avg_speed'] = (
    (df_train['distance'] / df_train['time_difference'])
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
)

# Same speed expressed in knots
df_train['avg_speed_knots'] = df_train['avg_speed'] * 1.94384  # 1 m/s = 1.94384 knots

# 1 if more than 100 m were covered since the previous record, otherwise 0
df_train['moving'] = (df_train['distance'] > 100).astype(int)

# 1 if "avg_speed_knots" > 1, otherwise 0. Both column names are kept because
# the original cell created both with the same rule.
df_train['moving_speed'] = (df_train['avg_speed_knots'] > 1).astype(int)
df_train['moving_based_on_speed'] = (df_train['avg_speed_knots'] > 1).astype(int)

# Bearing from the previous position to the current one. calculate_bearing is
# built from numpy functions only, so it can take whole columns at once;
# lat_prev/lon_prev from above are reused instead of being recomputed.
df_train['bearing'] = calculate_bearing(
    df_train['lat_prev'], df_train['lon_prev'], df_train['latitude'], df_train['longitude']
)

# Change of speed over ground (sog) per second since the previous record
df_train['prev_speed'] = df_train.groupby('vesselId')['sog'].shift(1)
# time_difference contains zeros (first records and repeated timestamps), so
# the division can give inf as well as NaN; both are mapped to 0, matching the
# treatment of avg_speed.
df_train['acceleration'] = (
    ((df_train['sog'] - df_train['prev_speed']) / df_train['time_difference'])
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
)

# Heading change since the previous record, wrapped into [-180, 180) so that a
# turn across north does not show up as a jump of almost 360 degrees.
# It stays NaN on each vessel's first record.
df_train['prev_heading'] = df_train.groupby('vesselId')['heading'].shift(1)
df_train['heading_change'] = (df_train['heading'] - df_train['prev_heading'] + 180) % 360 - 180





In [None]:
# Both frames need real datetimes in "time" before the .dt accessor can be used
df_test['time'] = pd.to_datetime(df_test['time'])
df_train['time'] = pd.to_datetime(df_train['time'])

# Calendar features for the training data:
#   hour_of_day - hour component of the timestamp
#   day_of_week - 0 for Monday through 6 for Sunday
df_train['hour_of_day'] = df_train['time'].dt.hour
df_train['day_of_week'] = df_train['time'].dt.weekday

In [1]:
## Helper functions

# Get distance moved between two consecutive points
def get_distance(lat, lon, lat_prev, lon_prev):
    """Return the geodesic distance in meters between a position and the previous one.

    Parameters
    ----------
    lat, lon : float
        Current latitude and longitude.
    lat_prev, lon_prev : float
        Previous latitude and longitude.

    Returns
    -------
    float
        ``geodesic((lat, lon), (lat_prev, lon_prev)).meters``, or ``0.0`` when
        any of the four coordinates is missing (NaN/None).
    """
    # A missing coordinate on either side means there is no usable pair of
    # points, so report "no movement" instead of passing NaN on to geodesic.
    if pd.isna([lat, lon, lat_prev, lon_prev]).any():
        return 0.0
    return geodesic((lat, lon), (lat_prev, lon_prev)).meters


def calculate_bearing(lat1, lon1, lat2, lon2):
    """
    Initial bearing, in degrees, when travelling from point 1 to point 2.

    All four arguments are latitudes/longitudes in degrees. The result is
    normalized into the range [0, 360).
    """
    phi_start = np.deg2rad(lat1)
    lam_start = np.deg2rad(lon1)
    phi_end = np.deg2rad(lat2)
    lam_end = np.deg2rad(lon2)

    delta_lam = lam_end - lam_start

    # The two components handed to arctan2 (first argument, then second)
    east_component = np.sin(delta_lam) * np.cos(phi_end)
    north_component = (
        np.cos(phi_start) * np.sin(phi_end)
        - np.sin(phi_start) * np.cos(phi_end) * np.cos(delta_lam)
    )

    signed_degrees = np.rad2deg(np.arctan2(east_component, north_component))
    # arctan2 yields (-180, 180]; shift and wrap so the result is never negative
    return np.mod(signed_degrees + 360, 360)


In [None]:
## Feature selection
# Remove the helper columns that were only needed to derive other features:
# the previous position, previous speed and previous heading.
# All of them are dropped in a single call, and errors='ignore' skips columns
# that are already gone, so the cell can be re-run without raising KeyError.
df_train = df_train.drop(
    columns=['lat_prev', 'lon_prev', 'prev_speed', 'prev_heading'],
    errors='ignore',
)


In [None]:
# Combine the unique vessel IDs from both train and test
all_vessels = pd.concat([df_test['vesselId'], df_train['vesselId']]).unique()

# Give every vessel ID a consecutive integer code, in order of first appearance
# in all_vessels. NOTE(review): no visible cell created vessel_mapping, so the
# numbering scheme is an assumption -- confirm against any saved artefacts.
vessel_mapping = {vessel_id: code for code, vessel_id in enumerate(all_vessels)}

# Map the vesselId in all DataFrames using the created mapping.
# Schedule rows for vessels that occur in neither train nor test get NaN.
df_train['vesselId_mapped'] = df_train['vesselId'].map(vessel_mapping)
df_test['vesselId_mapped'] = df_test['vesselId'].map(vessel_mapping)
df_schedules['vesselId_mapped'] = df_schedules['vesselId'].map(vessel_mapping)

# Seconds since the vessel's first record within the same DataFrame.
# transform('min') broadcasts each vessel's earliest time back onto its rows,
# which avoids a Python-level lookup per row.
df_test['time_difference_since_start'] = (
    df_test['time'] - df_test.groupby('vesselId_mapped')['time'].transform('min')
).dt.total_seconds()
df_train['time_difference_since_start'] = (
    df_train['time'] - df_train.groupby('vesselId_mapped')['time'].transform('min')
).dt.total_seconds()

# Seconds between each test record and the same vessel's first *training*
# record. Vessels that never occur in the training data get NaN instead of
# raising KeyError.
first_time_train = df_train.groupby('vesselId_mapped')['time'].min()
df_test['time_difference_since_start_train'] = (
    df_test['time'] - df_test['vesselId_mapped'].map(first_time_train)
).dt.total_seconds()

# Kept under its original name: first training time per vesselId_mapped
first_time = first_time_train.to_dict()

In [None]:
## Feature extraction - Did not improve the predictions:

def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometers between two latitude/longitude points.

    The inputs are in degrees; a sphere of radius 6371 km is used.
    """
    earth_radius_km = 6371

    delta_lat = np.deg2rad(lat2 - lat1)
    delta_lon = np.deg2rad(lon2 - lon1)

    # Haversine term of the central angle
    hav = (
        np.sin(delta_lat / 2) ** 2
        + np.cos(np.deg2rad(lat1)) * np.cos(np.deg2rad(lat2)) * np.sin(delta_lon / 2) ** 2
    )
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))

    return earth_radius_km * central_angle

# Experimental features computed on the training data.

# Haversine distance (kilometers) from the vessel's previous position.
# NOTE(review): the original statement used a DataFrame named `df` and columns
# `latitude_difference` / `longitude_difference` that no visible cell creates.
# It is assumed here that they were the per-vessel coordinate differences, i.e.
# that "latitude - latitude_difference" is the previous latitude -- confirm.
prev_latitude = df_train.groupby('vesselId')['latitude'].shift(1)
prev_longitude = df_train.groupby('vesselId')['longitude'].shift(1)
# haversine() uses numpy functions only, so whole columns can be passed at once.
# The first record of each vessel has no previous position and stays NaN.
df_train['haversine_distance'] = haversine(
    df_train['latitude'], df_train['longitude'], prev_latitude, prev_longitude
)

# Running totals per vessel
df_train['cumulative_speed'] = df_train.groupby('vesselId')['avg_speed'].cumsum()
df_train['cumulative_distance'] = df_train.groupby('vesselId')['distance'].cumsum()

# Change of cog and rot relative to the vessel's previous record
df_train['delta_cog'] = df_train.groupby('vesselId')['cog'].diff()
df_train['delta_rot'] = df_train.groupby('vesselId')['rot'].diff()

# Change of cog per second. time_difference contains zeros, so the division
# can produce +/-inf; those are turned into NaN, the value the column already
# carries wherever the rate is undefined.
df_train['turn_rate'] = (
    df_train['delta_cog'] / df_train['time_difference']
).replace([np.inf, -np.inf], np.nan)
