In [1]:
# Authors:[146] The Italian Butei

# Amadori Luca, ID: 133429, lucaam@stud.ntnu.no
# Coppola Rodolfo Emanuele, ID: 133173, rodolfoc@stud.ntnu.no
# Meschieri Andrea, ID: 133527, andremes@stud.ntnu.no

In [2]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: C:\Users\lucaa\Documents\Università\Python\EPLF\venvEPLF\Scripts\python.exe -m pip install --upgrade pip


In [3]:
import numpy as np
import pandas as pd
import sklearn
import geopandas as gpd
from sklearn.ensemble import RandomForestRegressor


# FUNCTIONS:

# Function to compute the Haversine distance between two points on Earth
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance, in kilometres, between two points.

    The computation is element-wise and built only from NumPy ufuncs, so each
    argument may be a scalar, a NumPy array or a pandas Series (mixed inputs
    are broadcast by NumPy/pandas).

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point, in degrees.
    lat2, lon2 : latitude and longitude of the second point, in degrees.

    Returns
    -------
    Distance in km, computed with an average Earth radius of 6371 km. The
    result has the same shape/type as the broadcast inputs; NaN inputs
    propagate to NaN outputs.
    """
    # Average Earth radius expressed in km
    R = 6371.0

    # Convert coordinates to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Differences between coordinates
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Haversine formula
    a = np.sin(dlat / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0)**2

    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points; without clamping, sqrt(1 - a) would return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    # Final distance
    return R * c

# Function to implement the iterative model
def cycle(model, X_test, df, N):
    """Forecast vessel positions one test row at a time, feeding predictions back.

    For every row of ``X_test`` the feature vector is rebuilt from the vessel's
    latest known position (the last training observation, or the previous
    prediction for that vessel). The model's output is then appended to the
    vessel's history so that later rows of the same vessel build on it.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_names_in_`` and ``predict``;
        ``predict`` must return a 2D array whose columns are
        (latitude, longitude).
    X_test : DataFrame with at least the columns ``vesselId`` and ``time``
        (datetime). It is NOT modified. Any of its own columns listed in
        ``model.feature_names_in_`` are passed through unless they are one of
        the features computed here.
    df : DataFrame of historical observations with the columns ``vesselId``,
        ``latitude_x``, ``longitude_x``, ``time``, ``latitudePort`` and
        ``longitudePort``. Rows are used in the order given, so they are
        expected to be chronological within each vessel (the caller sorts
        them; this function does not).
    N : sequence of lags, in number of observations; the k-th entry feeds
        the ``*_ar{k}`` features.

    Returns
    -------
    (predicted_lat, predicted_lon) : two lists aligned with the row order of
        ``X_test``.

    Raises
    ------
    KeyError
        If a ``vesselId`` of ``X_test`` has no observations in ``df``.

    Notes
    -----
    Features are built the way the training columns are defined: lag ``n``
    is the observation ``n`` steps *before* the current position, and
    ``time_ar*`` is the time elapsed between that lagged observation and the
    current position. When a vessel's history is shorter than a lag, the
    oldest available observation is used instead of letting a negative index
    wrap around.
    """
    lags = [int(n) for n in N]
    feature_names = list(model.feature_names_in_)

    # Per-vessel history as a list of (latitude, longitude, time) tuples
    data = {}
    for vessel_id, group in df.groupby('vesselId'):
        data[vessel_id] = list(zip(group['latitude_x'], group['longitude_x'], group['time']))

    # Dictionary to hold the last known information for each vessel from the training set
    vessel_last_positions = df[['vesselId', 'latitude_x', 'longitude_x', 'time', 'latitudePort', 'longitudePort']].groupby('vesselId').last().to_dict(orient='index')

    # Predictions are stored by row position so the output follows X_test's order
    n_rows = len(X_test)
    predicted_lat = [np.nan] * n_rows
    predicted_lon = [np.nan] * n_rows

    # Feeding predictions back only makes sense in chronological order. The
    # stable sort keeps the given order for equal timestamps, so an already
    # sorted X_test is processed exactly in its own row order.
    chronological = np.argsort(X_test['time'].to_numpy(), kind='stable')

    for pos in chronological:
        row = X_test.iloc[pos]
        vessel_id = row['vesselId']
        if vessel_id not in data:
            raise KeyError(f"vesselId {vessel_id!r} has no observations in df")

        history = data[vessel_id]
        last = vessel_last_positions[vessel_id]
        target_time = row['time']

        # Start from the row's own columns, then overwrite the engineered ones
        feats = row.to_dict()
        feats['latitude_x'] = last['latitude_x']
        feats['longitude_x'] = last['longitude_x']
        feats['time_horizon'] = (target_time - last['time']).total_seconds()

        # history[-1] is the current position, so lag n lives at index
        # len - 1 - n; clamp at 0 when the history is shorter than the lag.
        for k, lag in enumerate(lags, start=1):
            lag_index = max(len(history) - 1 - lag, 0)
            lag_lat, lag_lon, lag_time = history[lag_index]
            feats[f'latitude_ar{k}'] = lag_lat
            feats[f'longitude_ar{k}'] = lag_lon
            feats[f'time_ar{k}'] = (last['time'] - lag_time).total_seconds()

        feats['distance_to_port'] = haversine(last['latitude_x'], last['longitude_x'], last['latitudePort'], last['longitudePort'])

        # Mean of the (up to) 10 most recent positions, current one included
        recent = history[-10:]
        feats['latitude_ma10'] = np.mean([x[0] for x in recent])
        feats['longitude_ma10'] = np.mean([x[1] for x in recent])

        # Initial bearing from the current position towards the destination port
        delta_longitude = np.radians(last['longitudePort'] - feats['longitude_x'])
        latitude1 = np.radians(feats['latitude_x'])
        latitude2 = np.radians(last['latitudePort'])
        feats['bearing_to_port'] = np.degrees(np.arctan2(np.sin(delta_longitude) * np.cos(latitude2), np.cos(latitude1) * np.sin(latitude2) - np.sin(latitude1) * np.cos(latitude2) * np.cos(delta_longitude)))

        # Single-row frame labelled with the model's own feature order
        row_df = pd.DataFrame([[feats[name] for name in feature_names]], columns=feature_names)

        # Predict latitude and longitude (pred[0][0] is latitude, pred[0][1] is longitude)
        pred = model.predict(row_df)
        lat_hat, lon_hat = pred[0][0], pred[0][1]
        predicted_lat[pos] = lat_hat
        predicted_lon[pos] = lon_hat

        # The prediction becomes the vessel's latest known position
        vessel_last_positions[vessel_id] = {'latitude_x': lat_hat, 'longitude_x': lon_hat, 'time': target_time, 'latitudePort': last['latitudePort'], 'longitudePort': last['longitudePort']}
        history.append((lat_hat, lon_hat, target_time))

    return predicted_lat, predicted_lon

In [4]:
# Dataset loading ('|'-separated files, except the comma-separated test set)
data_train = pd.read_csv("ais_train.csv", sep='|', header=0)
data_test = pd.read_csv("ais_test.csv", sep=',', header=0)
schedule_dataset = pd.read_csv("schedules_to_may_2024.csv", sep='|', header=0)
ports_dataset = pd.read_csv("ports.csv", sep='|', header=0)
vessel_dataset = pd.read_csv("vessels.csv", sep='|', header=0)

# Parse the 'time' column of both AIS frames as datetimes
for ais_frame in (data_train, data_test):
    ais_frame['time'] = pd.to_datetime(ais_frame['time'])

# Attach the destination port to every AIS record. The port coordinates come
# out of the merge with the '_y' suffix and are renamed; the descriptive port
# columns are dropped.
data_train_merged = (
    data_train
    .merge(ports_dataset, on='portId', how='left')
    .rename(columns={'latitude_y': 'latitudePort', 'longitude_y': 'longitudePort'})
    .drop(columns=['name', 'portLocation', 'UN_LOCODE', 'countryName', 'ISO'])
)

# Attach the vessel information, discard records without a destination port
# and drop the vessel columns that are not used
train_preproc = (
    data_train_merged
    .merge(vessel_dataset, on='vesselId', how='left')
    .dropna(subset=['portId'])
    .drop(columns=['DWT','NT','vesselType','breadth','depth','draft','enginePower','freshWater','fuel','homePort','maxHeight','maxSpeed','maxWidth','rampCapacity','yearBuilt','etaRaw','shippingLineId'])
)

In [5]:
# DATA CLEANING
# Keep only the records whose navigation fields lie inside the accepted ranges
valid_navigation = (
    (train_preproc['cog'] < 360.0)
    & (train_preproc['sog'] <= 102.2)
    & train_preproc['rot'].between(-127, 127)
    & (train_preproc['heading'] <= 359)
    & train_preproc['navstat'].between(0, 8)
)
train_preproc = train_preproc[valid_navigation]

# Keep only the records with geographically valid coordinates
valid_position = (
    train_preproc['latitude_x'].between(-90, 90)
    & train_preproc['longitude_x'].between(-180, 180)
)
train_preproc = train_preproc[valid_position]

# Sort the dataset by vessel and, within each vessel, by time
train_preproc = train_preproc.sort_values(by=['vesselId','time'])

# time_horizon: seconds between an observation and the same vessel's next one
# (NaN for the last observation of each vessel)
next_time = train_preproc.groupby('vesselId')['time'].shift(-1)
train_preproc['time_horizon'] = (next_time - train_preproc['time']).dt.total_seconds()

In [6]:
# FEATURE ENGINEERING ON THE TRAINING SET
# (lagged positions, time gaps, regression targets, port features, moving averages)

# Array with vesselId's for which a prediction is needed
distinct_Id_tobepred = data_test['vesselId'].unique()

# Lags, expressed in number of observations of the same vessel
lag_1 = 1
lag_2 = 50
lag_3 = 100

# Create new columns containing lagged values of latitude and longitude
train_preproc['latitude_ar1'] = train_preproc.groupby('vesselId')['latitude_x'].shift(lag_1)
train_preproc['longitude_ar1'] = train_preproc.groupby('vesselId')['longitude_x'].shift(lag_1)
train_preproc['latitude_ar2'] = train_preproc.groupby('vesselId')['latitude_x'].shift(lag_2)
train_preproc['longitude_ar2'] = train_preproc.groupby('vesselId')['longitude_x'].shift(lag_2)
train_preproc['latitude_ar3'] = train_preproc.groupby('vesselId')['latitude_x'].shift(lag_3)
train_preproc['longitude_ar3'] = train_preproc.groupby('vesselId')['longitude_x'].shift(lag_3)

# Seconds elapsed between each observation and its lagged counterpart
train_preproc['time_ar1'] = train_preproc.groupby('vesselId')['time'].diff(lag_1).dt.total_seconds()
train_preproc['time_ar2'] = train_preproc.groupby('vesselId')['time'].diff(lag_2).dt.total_seconds()
train_preproc['time_ar3'] = train_preproc.groupby('vesselId')['time'].diff(lag_3).dt.total_seconds()

# 'latitude_future' and 'longitude_future' represent the next observation's position
train_preproc['latitude_future'] = train_preproc.groupby('vesselId')['latitude_x'].shift(-1)
train_preproc['longitude_future'] = train_preproc.groupby('vesselId')['longitude_x'].shift(-1)

# Distance (km) from the current position to the destination port.
# haversine() only uses NumPy ufuncs, so it is applied to whole columns at
# once instead of calling it row by row through DataFrame.apply.
train_preproc['distance_to_port'] = haversine(
    train_preproc['latitude_x'],
    train_preproc['longitude_x'],
    train_preproc['latitudePort'],
    train_preproc['longitudePort'],
)

# Initial bearing (degrees) from the current position towards the destination port
delta_longitude = np.radians(train_preproc['longitudePort'] - train_preproc['longitude_x'])
latitude1 = np.radians(train_preproc['latitude_x'])
latitude2 = np.radians(train_preproc['latitudePort'])

train_preproc['bearing_to_port'] = np.degrees(
    np.arctan2(
        np.sin(delta_longitude) * np.cos(latitude2),
        np.cos(latitude1) * np.sin(latitude2) - np.sin(latitude1) * np.cos(latitude2) * np.cos(delta_longitude)
    )
)

# Per-vessel moving average over the last 10 observations (fewer at the start of a track)
train_preproc['latitude_ma10'] = train_preproc.groupby('vesselId')['latitude_x'].transform(lambda x: x.rolling(window=10, min_periods=1).mean())
train_preproc['longitude_ma10'] = train_preproc.groupby('vesselId')['longitude_x'].transform(lambda x: x.rolling(window=10, min_periods=1).mean())

# Unfiltered copy: it keeps every observation so that the full history of each
# vessel stays available after the NaN rows are removed below
train_preproc_original = train_preproc.copy()

# Drop every row holding a NaN in ANY column. Among others this removes the
# first lag_3 observations of each vessel (no lagged value yet) and the last
# one (no future position).
train_preproc = train_preproc.dropna()

In [7]:
# APPLICATION OF THE MODEL

# Input columns (their order is the feature order the model is fitted with)
features = [
    'latitude_x', 'longitude_x',
    'latitude_ar1', 'longitude_ar1',
    'latitude_ar2', 'longitude_ar2',
    'latitude_ar3', 'longitude_ar3',
    'time_horizon', 'time_ar1', 'time_ar2', 'time_ar3',
    'distance_to_port', 'bearing_to_port',
    'latitude_ma10', 'longitude_ma10',
]

# Two regression targets: the next observed position
target = ['latitude_future', 'longitude_future']

# Training inputs and targets
X = train_preproc.loc[:, features]
Y = train_preproc.loc[:, target]

# Random forest of 50 trees with a fixed seed
Model = RandomForestRegressor(random_state=42, n_estimators=50)

# Fit the model (left as the trailing expression so the notebook shows the estimator)
Model.fit(X, Y)

In [8]:
# Run the iterative model over the test rows, using the unfiltered training
# history and the same three lags that built the training features
N = np.asarray((lag_1, lag_2, lag_3))
lat_pred, long_pred = cycle(model=Model, X_test=data_test, df=train_preproc_original, N=N)

In [9]:
# Store the predictions next to the test rows they belong to
data_test['latitude_predicted'] = lat_pred
data_test['longitude_predicted'] = long_pred

# Order the rows by 'ID', renumber the index and write the result to 'output_file.csv'
output = data_test.sort_values(by='ID').reset_index(drop=True)
submission_columns = ['ID','longitude_predicted','latitude_predicted']
output[submission_columns].to_csv('output_file.csv', index=False)