## Importing libraries


In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
#from geopy.distance import geodesic
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.models import Sequential


2024-10-09 19:18:49.349066: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-09 19:18:51.721151: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-09 19:18:52.364211: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-09 19:18:52.807332: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-09 19:18:52.966484: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-09 19:18:54.324193: I tensorflow/core/platform/cpu_feature_gu

## Data processing


In [2]:
# Load the AIS data: both splits are pipe-delimited CSV files, read in
# train-then-test order.
ais_train, ais_test = (
    pd.read_csv(csv_path, sep='|')
    for csv_path in ('data/ais_train.csv', 'data/ais_test.csv')
)

# Optionally, load vessels and ports datasets for additional features
#vessels = pd.read_csv('data/vessels.csv')
#ports = pd.read_csv('data/ports.csv')
#schedules = pd.read_csv('data/schedules_to_may_2024.csv')


## Feature engineering


In [3]:
# Coerce the raw columns to usable dtypes. With errors='coerce', any value that
# cannot be parsed becomes NaN (coordinates) or NaT (time) instead of raising.
coordinate_columns = ['latitude', 'longitude']
ais_train[coordinate_columns] = ais_train[coordinate_columns].apply(pd.to_numeric, errors='coerce')
ais_train['time'] = pd.to_datetime(ais_train['time'], errors='coerce')

# Function to calculate speed: sqrt(delta_lat^2 + delta_lon^2) / delta_time
def calculate_speed(df):
    """Add per-vessel movement columns to ``df`` in place.

    For every row, the difference to the previous row of the same
    ``vesselId`` is computed and stored in four new columns:

    * ``delta_lat`` / ``delta_lon`` -- change in latitude / longitude,
    * ``delta_time`` -- elapsed time in hours,
    * ``speed`` -- ``sqrt(delta_lat**2 + delta_lon**2) / delta_time``.

    ``speed`` is 0 wherever it cannot be computed: the first row of each
    vessel (no predecessor) and rows whose timestamp equals the previous one
    (division by zero would otherwise yield ``inf``/``NaN``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``vesselId``, ``latitude``, ``longitude`` and a
        datetime ``time`` column. The differences follow the row order
        within each vessel, so rows should already be in chronological order.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    # Compute every delta before touching df, so all of them are derived
    # from the same, unmodified frame.
    by_vessel = df.groupby('vesselId')
    delta_lat = by_vessel['latitude'].diff()
    delta_lon = by_vessel['longitude'].diff()
    delta_hours = by_vessel['time'].diff().dt.total_seconds() / 3600  # in hours

    df['delta_lat'] = delta_lat
    df['delta_lon'] = delta_lon
    df['delta_time'] = delta_hours

    raw_speed = np.sqrt(delta_lat**2 + delta_lon**2) / delta_hours
    # Assign the cleaned Series back explicitly: calling fillna(inplace=True)
    # on df['speed'] acts on a temporary and is not guaranteed to reach df.
    df['speed'] = raw_speed.replace([np.inf, -np.inf], np.nan).fillna(0)

# Derive the per-vessel movement columns (delta_lat, delta_lon, delta_time, speed).
calculate_speed(ais_train)

# Calendar features. 'time' is parsed once more so the .dt accessor below
# works even if this cell is run on its own.
ais_train['time'] = pd.to_datetime(ais_train['time'])
timestamp_parts = ais_train['time'].dt
ais_train['hour'] = timestamp_parts.hour
ais_train['day_of_week'] = timestamp_parts.dayofweek
ais_train['month'] = timestamp_parts.month


# Resample AIS data to a regular interval (e.g., every 20 minutes)
def resample_vessel_data(df, vessel_id, freq='20min'):
    """Return one vessel's AIS records on a regular time grid.

    The rows of ``df`` belonging to ``vessel_id`` are binned by ``freq`` on
    the ``time`` column. Numeric measurements are averaged per bin and
    linearly interpolated across empty bins; identifier-like columns
    (``etaRaw``, ``vesselId``, ``portId``) take the first value in the bin and
    are forward-filled across empty bins. ``hour``, ``day_of_week`` and
    ``month`` are taken from the bin timestamp itself, so they stay valid
    calendar values in empty bins instead of being interpolated.

    Parameters
    ----------
    df : pandas.DataFrame
        AIS records with a datetime ``time`` column plus every column named
        in the aggregation mapping below. ``df`` is not modified.
    vessel_id
        Value of ``vesselId`` selecting the vessel to resample.
    freq : str, default '20min'
        Bin width as a pandas offset alias.

    Returns
    -------
    pandas.DataFrame
        One row per bin, with ``time`` as the first column and a fresh
        0-based index.
    """
    # NOTE(review): 'cog' and 'heading' look like compass angles and 'navstat'
    # like a status code; a plain mean may not be meaningful for them -- confirm.
    aggregations = {
        'latitude': 'mean',
        'longitude': 'mean',
        'cog': 'mean',
        'sog': 'mean',
        'rot': 'mean',
        'heading': 'mean',
        'navstat': 'mean',
        'etaRaw': 'first',
        'vesselId': 'first',
        'portId': 'first',
        'delta_lat': 'mean',
        'delta_lon': 'mean',
        'delta_time': 'mean',
        'speed': 'mean',
        'hour': 'first',
        'day_of_week': 'first',
        'month': 'first'
    }

    # set_index returns a new frame, so neither the caller's frame nor a
    # temporary slice of it is mutated.
    vessel_df = df.loc[df['vesselId'] == vessel_id].set_index('time')
    resampled = vessel_df.resample(freq).agg(aggregations)

    # Linear interpolation is only defined for numeric data; text columns are
    # carried forward from the last bin that had a record.
    numeric_columns = resampled.select_dtypes(include='number').columns
    label_columns = resampled.columns.difference(numeric_columns)
    if len(numeric_columns):
        resampled[numeric_columns] = resampled[numeric_columns].interpolate()
    if len(label_columns):
        resampled[label_columns] = resampled[label_columns].ffill()

    # Calendar features follow directly from the bin start time.
    resampled['hour'] = resampled.index.hour
    resampled['day_of_week'] = resampled.index.dayofweek
    resampled['month'] = resampled.index.month

    return resampled.reset_index()

# Resample all vessels (rows without a vessel id cannot be attributed to a
# track, so missing ids are skipped).
resampled_vessels = [
    resample_vessel_data(ais_train, vessel)
    for vessel in ais_train['vesselId'].dropna().unique()
]

# Every per-vessel frame comes back with its own 0-based index. ignore_index
# renumbers the stacked result so each row has a unique label; otherwise
# label-based alignment against this frame would hit duplicate labels.
ais_train_resampled = pd.concat(resampled_vessels, ignore_index=True)


## Prepare data for LSTM


In [4]:
def create_sequences(data, sequence_length):
    """Cut ``data`` into overlapping windows and their next-step targets.

    Window ``i`` is ``data[i:i + sequence_length]`` and its target is the row
    that immediately follows it, ``data[i + sequence_length]``. There are
    ``len(data) - sequence_length`` windows; if that number is not positive,
    both returned arrays are empty.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The stacked windows and the stacked targets, in that order.
    """
    window_starts = range(len(data) - sequence_length)
    windows = [data[start:start + sequence_length] for start in window_starts]
    next_rows = [data[start + sequence_length] for start in window_starts]
    return np.array(windows), np.array(next_rows)

sequence_length = 20  # Use 20 historical points to predict the next one

# Prepare input features (latitude, longitude, speed, etc.) for LSTM
features = ['latitude', 'longitude', 'speed', 'cog', 'hour', 'day_of_week']
# Latitude and longitude as targets
target_columns = ['latitude', 'longitude']

# ais_train_resampled stacks the vessels one after another, so windows are cut
# per vessel: a window sliced from the stacked frame would otherwise start on
# one vessel's track and end (or take its target) on the next vessel's.
# Gap bins may carry a missing vesselId; forward-filling recovers the owner
# because rows of one vessel are contiguous (assumes each vessel's first
# resampled bin holds a real record).
vessel_of_row = ais_train_resampled['vesselId'].ffill().to_numpy()

X_parts, y_parts = [], []
for _vessel, vessel_rows in ais_train_resampled.groupby(vessel_of_row, sort=False):
    if len(vessel_rows) <= sequence_length:
        continue  # too short to yield even one (window, target) pair
    vessel_features = vessel_rows[features].to_numpy(dtype=float)
    X_parts.append(create_sequences(vessel_features, sequence_length)[0])
    # Target of window i is the position in the row right after the window.
    y_parts.append(vessel_rows[target_columns].to_numpy(dtype=float)[sequence_length:])

if not X_parts:
    raise ValueError(
        f"No vessel has more than {sequence_length} resampled rows; cannot build LSTM sequences."
    )

X_lstm = np.concatenate(X_parts)
y_lstm = np.concatenate(y_parts)

# A single NaN/inf in a batch turns the MSE loss into NaN, so windows that
# contain any non-finite input or target are dropped.
finite_windows = np.isfinite(X_lstm).all(axis=(1, 2)) & np.isfinite(y_lstm).all(axis=1)
X_lstm, y_lstm = X_lstm[finite_windows], y_lstm[finite_windows]

# NOTE(review): neighbouring windows overlap heavily, so this random split puts
# near-duplicate samples on both sides; a time- or vessel-based split would
# give a more honest validation loss.
X_train_lstm, X_val_lstm, y_train_lstm, y_val_lstm = train_test_split(X_lstm, y_lstm, test_size=0.2, random_state=42)


## LSTM model


In [None]:
def build_lstm_model(input_shape):
    """Build and compile the two-layer LSTM position regressor.

    Architecture: LSTM(100, returning the full sequence) -> Dropout(0.2) ->
    LSTM(50, returning the last state) -> Dropout(0.2) -> Dense(2, linear).
    The two linear outputs are the predicted latitude and longitude.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, without the batch dimension.

    Returns
    -------
    tensorflow.keras.models.Sequential
        Model compiled with the Adam optimizer and mean-squared-error loss.
    """
    model = Sequential([
        # Declaring the input with an explicit Input layer replaces the
        # input_shape= argument on the first LSTM, which current Keras
        # versions warn about.
        Input(shape=input_shape),
        LSTM(units=100, return_sequences=True),
        Dropout(0.2),
        LSTM(units=50, return_sequences=False),
        Dropout(0.2),
        Dense(units=2, activation='linear'),  # Predict latitude and longitude
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

# One training sample is a 2-D slice of X_train_lstm: axis 1 by axis 2.
input_shape = (X_train_lstm.shape[1], X_train_lstm.shape[2])
model = build_lstm_model(input_shape)

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(
    X_train_lstm,
    y_train_lstm,
    validation_data=(X_val_lstm, y_val_lstm),
    epochs=50,
    batch_size=64,
    callbacks=[early_stopping],
)

# Plot training and validation loss
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()
plt.show()


  super().__init__(**kwargs)


## Random Forest


In [None]:
# Features for Random Forest (LSTM predictions + additional features)
# The first-stage features must be the LSTM's *predictions*. Feeding the true
# targets (y_train_lstm) would hand the forest the answer and teach it the
# identity mapping.
# NOTE(review): these are in-sample predictions for most rows (the LSTM was fit
# on 80% of X_lstm); out-of-fold predictions would be a cleaner stacking input.
lstm_train_predictions = model.predict(X_lstm)
X_rf_train = pd.DataFrame(lstm_train_predictions, columns=['latitude_pred', 'longitude_pred'])

# Last observed speed of each window. Reading it from X_lstm keeps every row
# aligned with its prediction and target by construction.
X_rf_train['speed'] = X_lstm[:, -1, features.index('speed')]

# NOTE(review): a 'port_call' feature was intended here, but no earlier cell
# creates that column in ais_train_resampled, so it is left out until the
# feature-engineering step produces it.

# Same targets, in the same order, as the windows that produced the predictions.
y_rf_train = y_lstm

# Train-validation split for Random Forest
X_train_rf, X_val_rf, y_train_rf, y_val_rf = train_test_split(X_rf_train, y_rf_train, test_size=0.2, random_state=42)

rf_model = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42)
rf_model.fit(X_train_rf, y_train_rf)

# Validate: mean absolute error per output, in the units of the targets.
rf_predictions = rf_model.predict(X_val_rf)
rf_val_mae = pd.Series(
    np.abs(rf_predictions - y_val_rf).mean(axis=0),
    index=['latitude', 'longitude'],
    name='rf_val_mae',
)
rf_val_mae



## LSTM Predictions


In [None]:
# Build the test windows; only the inputs are needed, the next-step rows that
# create_sequences also returns are discarded.
# NOTE(review): none of the visible cells run calculate_speed or add the
# hour/day_of_week columns on ais_test -- confirm the test file already
# provides every column listed in `features`.
test_feature_matrix = ais_test[features].values
X_test_lstm, _test_next_rows = create_sequences(test_feature_matrix, sequence_length)
lstm_test_predictions = model.predict(X_test_lstm)


## Random Forest Predictions


In [None]:
X_rf_test = pd.DataFrame(lstm_test_predictions, columns=['latitude_pred', 'longitude_pred'])

# Prediction i refers to test row i + sequence_length. The slice is re-indexed
# from 0 so that it lines up row-for-row with the prediction frames; assigning
# the raw slice would align on its original labels (sequence_length, ...),
# shifting every value and leaving NaNs in the first rows.
predicted_rows = ais_test.iloc[sequence_length:].reset_index(drop=True)
if len(predicted_rows) != len(X_rf_test):
    raise ValueError(
        f"Expected {len(predicted_rows)} LSTM predictions (one per test row after the "
        f"first {sequence_length}), got {len(X_rf_test)}."
    )

# Supply exactly the extra columns the forest was fitted on. Features that are
# part of the LSTM input are read from the last step of each window; anything
# else comes from the test row being predicted.
rf_feature_names = list(rf_model.feature_names_in_)
for feature_name in rf_feature_names:
    if feature_name in X_rf_test.columns:
        continue
    if feature_name in features:
        X_rf_test[feature_name] = X_test_lstm[:, -1, features.index(feature_name)]
    else:
        X_rf_test[feature_name] = predicted_rows[feature_name].to_numpy()
X_rf_test = X_rf_test[rf_feature_names]  # same column order as during fit

rf_test_predictions = rf_model.predict(X_rf_test)


# Prepare submission file
# NOTE(review): the training frame names these columns 'vesselId' and 'time';
# confirm that ais_test really uses 'vessel_id' and 'timestamp'. Also note that
# the first `sequence_length` test rows receive no prediction.
submission = pd.DataFrame(rf_test_predictions, columns=['latitude', 'longitude'])
submission['vessel_id'] = predicted_rows['vessel_id'].to_numpy()
submission['timestamp'] = predicted_rows['timestamp'].to_numpy()

# Save to CSV for submission
submission.to_csv('submission.csv', index=False)
