In [4]:
# Authors:[146] The Italian Butei

# Amadori Luca, ID: 133429, lucaam@stud.ntnu.no
# Coppola Rodolfo Emanuele, ID: 133173, rodolfoc@stud.ntnu.no
# Meschieri Andrea, ID: 133527, andremes@stud.ntnu.no

In [5]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: C:\Users\lucaa\Documents\Università\Python\EPLF\venvEPLF\Scripts\python.exe -m pip install --upgrade pip


In [6]:
import numpy as np
import pandas as pd
import sklearn
import geopandas as gpd
from shapely.ops import nearest_points
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


# FUNCTIONS:

# Function for converting nav_stat in a dummy variable
def convert_navstat(value):
    if value in [0, 8]:
        return 1  # MOVING
    else:
        return 0  # STOPPED

In [7]:
# Dataset loading
data_train = pd.read_csv("ais_train.csv", sep='|', header=0)
data_test = pd.read_csv("ais_test.csv", sep=',', header=0)
schedule_dataset = pd.read_csv("schedules_to_may_2024.csv", sep='|', header=0)
ports_dataset = pd.read_csv("ports.csv", sep='|', header=0)
vessel_dataset = pd.read_csv("vessels.csv", sep='|', header=0)

# Convert the 'time' column in datetime format
data_train['time'] = pd.to_datetime(data_train['time'])
data_test['time'] = pd.to_datetime(data_test['time'])

# Add the information about destination port to ais_train
data_train_merged = pd.merge(data_train, ports_dataset, on='portId', how='left')
data_train_merged.rename(columns={'latitude_y': 'latitudePort', 'longitude_y': 'longitudePort'}, inplace=True)
# Drop of the unuseful columns
data_train_merged.drop(columns=['name', 'portLocation', 'UN_LOCODE', 'countryName', 'ISO'], inplace=True)

# Add the information about the vessel and drop useless columns
train_preproc = pd.merge(data_train_merged, vessel_dataset, on='vesselId', how='left')
train_preproc.dropna(subset=['portId'], inplace=True)
train_preproc.drop(columns=['DWT','NT','vesselType','breadth','depth','draft','enginePower','freshWater','fuel','homePort','maxHeight','maxSpeed','maxWidth','rampCapacity','yearBuilt'],inplace=True)

# Configuration and application of the IterativeImputer
train_preproc['cog'] = train_preproc['cog'].replace(360, np.nan)
imputer = IterativeImputer(max_iter=30, random_state=0)
train_preproc[['cog']] = imputer.fit_transform(train_preproc[['cog']])

In [8]:
# DATA CLEANING
train_preproc = train_preproc[(train_preproc['cog'] < 360.0) & (train_preproc['sog'] <= 102.2) & (train_preproc['rot'] >= -127) & (train_preproc['rot'] <= 127) & (train_preproc['heading'] <= 359) & (train_preproc['navstat'] >= 0) & (train_preproc['navstat'] <= 8)]
train_preproc = train_preproc[(train_preproc['latitude_x'] <= 90) & (train_preproc['latitude_x'] >= -90) & (train_preproc['longitude_x'] <= 180) & (train_preproc['longitude_x'] >= -180)]

# Trasformation of the 'nav_stat' column in a dummy variable (1 = moving, 0 = not moving)
train_preproc['navstat_dummy'] = train_preproc['navstat'].apply(convert_navstat)

# Summarize the three variables 'CEU', 'GT' and 'length' in a single feature which captures the maximum variance: 'vessel_dimensions'
pca = PCA(n_components=1)
train_preproc['vessel_dimensions'] = pca.fit_transform(train_preproc[["CEU", "GT", "length"]])

# Sorting the dataset in vessels and by time
train_preproc = train_preproc.sort_values(by=['vesselId','time'])
# Filtering the observations: we keep the vessels with more than 10 observations
train_preproc = train_preproc.groupby('vesselId').filter(lambda x: len(x) >= 10)

# Building a column with the time horizon for which the prediction is made (difference between the present timestamp and the following one)
train_preproc['time_horizon'] = -train_preproc.groupby('vesselId')['time'].diff(-1)
train_preproc['time_horizon'] = train_preproc['time_horizon'].dt.total_seconds()

# 'latitude_future' and 'longitude_future' represent the next observation's position
train_preproc['latitude_future'] = train_preproc.groupby('vesselId')['latitude_x'].shift(-1)
train_preproc['longitude_future'] = train_preproc.groupby('vesselId')['longitude_x'].shift(-1)

In [9]:
# PREPARATION OF THE TEST SET

# Array with vesselId's for which a prediction is needed
distinct_Id_tobepred = data_test['vesselId'].unique()

# Train dataset filtered for those vessels
data_vessels_tobepred = train_preproc[train_preproc['vesselId'].isin(distinct_Id_tobepred)]

# Dataset containing the last observation for each vessel
last_observations = data_vessels_tobepred.groupby('vesselId').tail(1)

# Drop 'latitude_future' and 'longitude_future' from this dataset: they're all NaN
last_observations_clean = last_observations.drop(columns=['latitude_future', 'longitude_future'])

# Creation of the final dataset for test
data_test_complete = pd.merge(last_observations_clean, data_test, on='vesselId', how='left')
data_test_complete['time_horizon'] = data_test_complete['time_y']-data_test_complete['time_x']
data_test_complete['time_horizon'] = data_test_complete['time_horizon'].dt.total_seconds()

# Delete from the train the last row for each vessel
train_preproc = train_preproc.dropna(subset=['time_horizon'])


In [10]:
# APPLICATION OF THE MODEL

# Features and target selection
features = ['latitude_x','longitude_x', 'cog', 'sog', 'heading', 'time_horizon','latitudePort','longitudePort', 'vessel_dimensions', 'rot']
target = ['latitude_future','longitude_future']

# Initialization of the scaler for normalizing the features
scaler = StandardScaler()

# Train set
X = train_preproc[features]
X_scaled = scaler.fit_transform(X)

# Target
Y = train_preproc[target]

# Definition of the model
Model = RandomForestRegressor(n_estimators= 200, min_samples_split= 2, min_samples_leaf= 1, max_features= 'sqrt', max_depth= 20, bootstrap= True, random_state=11)

# Fitting of the model
Model.fit(X_scaled,Y)

# Predictions
X_test = data_test_complete[features]
X_test_scaled = scaler.transform(X_test)
Y_pred = Model.predict(X_test_scaled)

In [11]:
# Saving of the prediction in the test dataset
data_test_complete['latitude_predicted'] = Y_pred[:,0]
data_test_complete['longitude_predicted'] = Y_pred[:,1]

# Exporting the prediction in 'output_file'
output = data_test_complete.sort_values(by='ID')
output = output.reset_index(drop=True)
output[['ID','longitude_predicted','latitude_predicted']].to_csv('output_file.csv', index=False)

In [12]:
# POSTPROCESSING

# Configuration for Pandas Display Options
#pd.set_option('display.max_columns', None)

# Read the output file and the test set and merge them
df = pd.read_csv('output_file.csv', sep=',')
test = pd.read_csv('ais_test.csv', sep=',')
test = test.drop(columns=['scaling_factor']) 
show = pd.merge(df, test, on='ID', how='left')
show['time'] = pd.to_datetime(show['time'])

In [17]:
# Get the coordinates of lands and oceans from the files
land_world = gpd.read_file('ne_10m_land/ne_10m_land.shp').to_crs(4326)
ocean_world = gpd.read_file('ne_10m_ocean/ne_10m_ocean.shp').to_crs(4326)
gdf = gpd.GeoDataFrame(show, geometry=gpd.points_from_xy(df['longitude_predicted'], df['latitude_predicted'], crs="EPSG:4326"))

In [18]:
# Identify points on land
points_on_land = gpd.sjoin(gdf, land_world, how="inner", predicate="within")

# Find the closest point in ocean
closest_longitudes = []
closest_latitudes = []
for _, row in points_on_land.iterrows():
    closest_point, _ = nearest_points(ocean_world['geometry'], row['geometry'])
    closest_longitudes.append(closest_point.x)
    closest_latitudes.append(closest_point.y)

In [19]:
# Update predictions
points_on_land['closest_longitude'] = closest_longitudes
points_on_land['closest_latitude'] = closest_latitudes

gdf.loc[points_on_land.index, 'longitude_predicted'] = points_on_land['closest_longitude']
gdf.loc[points_on_land.index, 'latitude_predicted'] = points_on_land['closest_latitude']

 dtype: float64  0    3.219418
                 dtype: float64 0    18.659067
                                dtype: float64 ... 0    11.4463
                                                   dtype: float64
 0   -97.774281
 dtype: float64 0    7.428355
                dtype: float64]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  gdf.loc[points_on_land.index, 'longitude_predicted'] = points_on_land['closest_longitude']
 dtype: float64 0    51.348853
                dtype: float64 0    54.41325
                               dtype: float64 ... 0    53.906155
                                                  dtype: float64
 0    27.468492
 dtype: float64 0    53.693088
                dtype: float64]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  gdf.loc[points_on_land.index, 'latitude_predicted'] = points_on_land['closest_latitude']


In [20]:
# Final processing and saving
gdf['longitude_predicted'] = gdf['longitude_predicted'].astype(float)
gdf['latitude_predicted'] = gdf['latitude_predicted'].astype(float)

gdf = gdf.drop(columns=['vesselId', 'time', 'geometry'])
gdf.to_csv('processed_res.csv',index=False)

  gdf['longitude_predicted'] = gdf['longitude_predicted'].astype(float)
  gdf['latitude_predicted'] = gdf['latitude_predicted'].astype(float)
