In [53]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Load the training and test data
train_df = pd.read_csv('new_train.csv', parse_dates=['time'])
test_df = pd.read_csv('ais_test.csv', parse_dates=['time'])

# Ensure the data is sorted by vesselId and time
train_df = train_df.sort_values(by=['vesselId', 'time'])
test_df = test_df.sort_values(by=['vesselId', 'time'])


In [54]:


# Encode vesselId using LabelEncoder
label_encoder = LabelEncoder()
train_df['vesselId_encoded'] = label_encoder.fit_transform(train_df['vesselId'])

# Extract time features from the training data
train_df['hour'] = train_df['time'].dt.hour
train_df['day'] = train_df['time'].dt.day
train_df['month'] = train_df['time'].dt.month

# Create lag features in the training data
train_df['latitude_lag1'] = train_df.groupby('vesselId')['latitude'].shift(1)
train_df['longitude_lag1'] = train_df.groupby('vesselId')['longitude'].shift(1)
train_df['sog_lag1'] = train_df.groupby('vesselId')['sog'].shift(1)
train_df['cog_lag1'] = train_df.groupby('vesselId')['cog'].shift(1)
train_df['time_lag1'] = train_df.groupby('vesselId')['time'].shift(1)

# Calculate time difference in hours
train_df['time_diff_hours'] = (train_df['time'] - train_df['time_lag1']).dt.total_seconds() / 3600.0


In [55]:

# Drop rows with missing lag values
train_df = train_df.dropna(subset=['latitude_lag1', 'longitude_lag1', 'sog_lag1', 'cog_lag1', 'time_diff_hours'])


In [56]:

# Calculate changes in latitude and longitude
train_df['delta_latitude'] = train_df['latitude'] - train_df['latitude_lag1']
train_df['delta_longitude'] = train_df['longitude'] - train_df['longitude_lag1']
train_df['delta_longitude'] = train_df['delta_longitude'].clip(-1.0, 1.0)


In [57]:

# Define features and target variables
features = [
    'sog_lag1',
    'cog_lag1',
    'time_diff_hours',
    'vesselId_encoded',
    'hour',
    'day',
    'month'
]

target_lat = 'delta_latitude'
target_lon = 'delta_longitude'


In [58]:
data_for_split = train_df[features + ['latitude_lag1', 'longitude_lag1', target_lat, target_lon]]

In [59]:

# Split the data into training and validation sets
X_train, X_val = train_test_split(
    data_for_split,
    test_size=0.2,
    random_state=42
)

# Separate features and targets for training
X_train_features = X_train[features]
y_train_lat = X_train[target_lat]
y_train_lon = X_train[target_lon]

# Separate features and targets for validation
X_val_features = X_val[features]
y_val_lat = X_val[target_lat]
y_val_lon = X_val[target_lon]

# **Keep 'latitude_lag1' and 'longitude_lag1' for reconstructing actual positions**
latitude_lag1_val = X_val['latitude_lag1'].values
longitude_lag1_val = X_val['longitude_lag1'].values


In [60]:

# Initialize and train the XGBoost model for latitude
xgb_model_lat = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    random_state=42
)
xgb_model_lat.fit(X_train_features, y_train_lat)

# Initialize and train the XGBoost model for longitude
xgb_model_lon = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    random_state=42
)
xgb_model_lon.fit(X_train_features, y_train_lon)


In [61]:

# Evaluate the models on the validation set
# Predict changes in latitude and longitude
y_pred_lat = xgb_model_lat.predict(X_val_features)
y_pred_lon = xgb_model_lon.predict(X_val_features)

# Reconstruct predicted latitude and longitude
y_val_lat_pred = latitude_lag1_val + y_pred_lat
y_val_lon_pred = longitude_lag1_val + y_pred_lon

# Reconstruct actual latitude and longitude from validation set
y_val_lat_actual = latitude_lag1_val + y_val_lat
y_val_lon_actual = longitude_lag1_val + y_val_lon

In [62]:

# Calculate Mean Absolute Error
mae_lat = mean_absolute_error(y_val_lat_actual, y_val_lat_pred)
mae_lon = mean_absolute_error(y_val_lon_actual, y_val_lon_pred)

print(f'Mean Absolute Error for Latitude: {mae_lat}')
print(f'Mean Absolute Error for Longitude: {mae_lon}')


Mean Absolute Error for Latitude: 0.07737116765403995
Mean Absolute Error for Longitude: 0.01562623343007993


In [63]:

# Feature importance analysis (optional)
importance_lat = xgb_model_lat.feature_importances_
importance_lon = xgb_model_lon.feature_importances_

feature_importance_df = pd.DataFrame({
    'Feature': features,
    'Importance_Latitude': importance_lat,
    'Importance_Longitude': importance_lon
})

print(feature_importance_df)

            Feature  Importance_Latitude  Importance_Longitude
0          sog_lag1             0.136251              0.295325
1          cog_lag1             0.280477              0.256505
2   time_diff_hours             0.139357              0.318558
3  vesselId_encoded             0.089056              0.039313
4              hour             0.097811              0.028269
5               day             0.112882              0.030552
6             month             0.144167              0.031478


In [64]:
# SUBMISSION PREDICTION
# 
# Load the training and test data

# Get the last known data point for each vesselId from the training set
last_known_df = train_df.groupby('vesselId').last().reset_index()


In [65]:
# Merge the last known data with the test set
test_merged_df = test_df.merge(
    last_known_df,
    on='vesselId',
    suffixes=('_test', '_train'),
    how='left'  # Keeps all records from the test set
)


In [66]:
# Find vessels in test set not present in training set
missing_vessels = test_merged_df[test_merged_df['time_train'].isnull()]['vesselId'].unique()

if len(missing_vessels) > 0:
    print("Warning: The following vesselIds are in the test set but not in the training set:")
    print(missing_vessels)
    # Decide how to handle these vessels
    # Optionally, you can remove them or assign default values


In [67]:
# Calculate time difference between test time and last known time
test_merged_df['time_diff_hours'] = (test_merged_df['time_test'] - test_merged_df['time_train']).dt.total_seconds() / 3600.0

# Use the last known features as lag features
test_merged_df['latitude_lag1'] = test_merged_df['latitude']
test_merged_df['longitude_lag1'] = test_merged_df['longitude']
test_merged_df['sog_lag1'] = test_merged_df['sog']
test_merged_df['cog_lag1'] = test_merged_df['cog']
test_merged_df['vesselId_encoded'] = test_merged_df['vesselId_encoded']  # If encoded previously

# Extract time features from test time
test_merged_df['hour'] = test_merged_df['time_test'].dt.hour
test_merged_df['day'] = test_merged_df['time_test'].dt.day
test_merged_df['month'] = test_merged_df['time_test'].dt.month


In [68]:

test_merged_df['vesselId_encoded'] = label_encoder.transform(test_merged_df['vesselId'])


In [69]:
# Features used in training
features = [
    'sog_lag1',
    'cog_lag1',
    'time_diff_hours',
    'vesselId_encoded',
    'hour',
    'day',
    'month'
]

# Prepare the feature matrix for the test set
X_test = test_merged_df[features]


In [70]:
# Predict delta_latitude and delta_longitude
delta_lat_pred = xgb_model_lat.predict(X_test)
delta_lon_pred = xgb_model_lon.predict(X_test)


In [71]:
# Add the predicted changes to the last known positions
test_merged_df['latitude_predicted'] = test_merged_df['latitude_lag1'] + delta_lat_pred
test_merged_df['longitude_predicted'] = test_merged_df['longitude_lag1'] + delta_lon_pred


In [72]:
# Latitude should be between -90 and 90
test_merged_df['latitude_predicted'] = test_merged_df['latitude_predicted'].clip(-90, 90)

# Longitude should be between -180 and 180
test_merged_df['longitude_predicted'] = ((test_merged_df['longitude_predicted'] + 180) % 360) - 180


In [73]:
# Select required columns
submission_df = test_merged_df[['ID', 'longitude_predicted', 'latitude_predicted']]

# Ensure correct column names and order
submission_df.columns = ['ID', 'longitude_predicted', 'latitude_predicted']


In [74]:
submission_df.to_csv('predictions.csv', index=False)
