In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import LabelEncoder


In [2]:

# ----------------------------
# 1. Data Loading and Preparation
# ----------------------------

# Load the training dataset (pipe-separated)
df = pd.read_csv('ais_train.csv', sep='|')

# Load the test dataset (comma-separated)
test_df = pd.read_csv('ais_test.csv', sep=',')

# Parse 'time' BEFORE sorting: sorting the raw strings orders them
# lexicographically, which only matches chronological order for some
# timestamp formats.
df['time'] = pd.to_datetime(df['time'])

# Order the observations chronologically. A stable sort keeps rows that share
# a timestamp in their original file order, so later per-vessel lagging is
# deterministic across runs.
df = df.sort_values(by='time', kind='stable').reset_index(drop=True)


In [3]:

# ----------------------------
# 2. Feature Engineering
# ----------------------------
#
# The prediction targets are each row's own latitude/longitude. Every motion
# feature below is therefore built ONLY from the same vessel's earlier rows:
# a feature computed from the current row's position (or its current
# sog/cog/heading) would hand the target to the model and cannot be
# reproduced when only the last known observation is available.

# Encode vesselId
label_encoder = LabelEncoder()
df['vesselId_encoded'] = label_encoder.fit_transform(df['vesselId'])

# Define lag features
lag_features_latitude = [1, 2]
lag_features_longitude = [1]

# Number of preceding observations averaged by the rolling features.
ROLLING_WINDOW = 3


def _wrap_degrees(delta):
    """Map a difference between two angles (degrees) onto [-180, 180).

    Without wrapping, a course change from 359 to 1 degrees would be reported
    as -358 instead of +2.
    """
    return (delta + 180) % 360 - 180


def _past_rolling_mean(values, vessel_ids, window=ROLLING_WINDOW):
    """Per-vessel mean of the up-to-`window` observations preceding each row.

    The series is shifted by one step before rolling so the current row never
    contributes to its own feature.
    """
    return values.groupby(vessel_ids).transform(
        lambda s: s.shift(1).rolling(window=window, min_periods=1).mean()
    )


# Snapshot of the raw signals; grouping this copy keeps the lag computations
# independent of the columns that are added to df further down.
history = df[['vesselId', 'latitude', 'longitude', 'sog', 'cog', 'heading']]
by_vessel = history.groupby('vesselId')

# Quantities measured in degrees on a circle; their differences must be wrapped.
angular_columns = {'cog', 'heading', 'longitude'}

# New columns are collected here and attached to df in one step.
engineered = {}

# Lagged values, past-only deltas, and cyclical encodings
for lag in lag_features_latitude:
    for column in ('latitude', 'sog', 'cog', 'heading'):
        lagged = by_vessel[column].shift(lag)
        engineered[f'{column}_t-{lag}'] = lagged

        # Change over the last `lag` steps ending at the most recent PREVIOUS
        # observation (t-1), i.e. value[t-1] - value[t-1-lag].
        delta = by_vessel[column].shift(1) - by_vessel[column].shift(lag + 1)
        if column in angular_columns:
            delta = _wrap_degrees(delta)
        engineered[f'delta_{column}_t-{lag}'] = delta

        # Cyclical transformations for 'cog' and 'heading'
        if column in ('cog', 'heading'):
            lagged_rad = np.radians(lagged)
            engineered[f'{column}_t-{lag}_sin'] = np.sin(lagged_rad)
            engineered[f'{column}_t-{lag}_cos'] = np.cos(lagged_rad)

# Lagged values and past-only deltas for longitude
for lag in lag_features_longitude:
    engineered[f'longitude_t-{lag}'] = by_vessel['longitude'].shift(lag)
    engineered[f'delta_longitude_t-{lag}'] = _wrap_degrees(
        by_vessel['longitude'].shift(1) - by_vessel['longitude'].shift(lag + 1)
    )

# Rolling speed over the preceding observations
engineered['sog_rolling_avg'] = _past_rolling_mean(history['sog'], history['vesselId'])

# Rolling course as a circular mean: average the unit vectors and convert the
# result back to an angle. An arithmetic mean of 359 and 1 degrees would give
# 180, the opposite direction.
cog_rad = np.radians(history['cog'])
cog_sin_mean = _past_rolling_mean(np.sin(cog_rad), history['vesselId'])
cog_cos_mean = _past_rolling_mean(np.cos(cog_rad), history['vesselId'])
cog_rolling_avg = np.degrees(np.arctan2(cog_sin_mean, cog_cos_mean)) % 360
engineered['cog_rolling_avg'] = cog_rolling_avg
engineered['cog_rolling_avg_sin'] = np.sin(np.radians(cog_rolling_avg))
engineered['cog_rolling_avg_cos'] = np.cos(np.radians(cog_rolling_avg))

# Time-based features with cyclical transformations. These use the row's own
# timestamp, which is an input rather than a target.
hour = df['time'].dt.hour
day_of_week = df['time'].dt.dayofweek
engineered['hour'] = hour
engineered['day_of_week'] = day_of_week
engineered['hour_sin'] = np.sin(2 * np.pi * hour / 24)
engineered['hour_cos'] = np.cos(2 * np.pi * hour / 24)
engineered['day_of_week_sin'] = np.sin(2 * np.pi * day_of_week / 7)
engineered['day_of_week_cos'] = np.cos(2 * np.pi * day_of_week / 7)

df = df.assign(**engineered)

# Drop rows whose engineered features or targets are missing (the first rows
# of each vessel have no history). Restricting the check to these columns
# avoids discarding rows merely because an unrelated raw column is empty.
df = df.dropna(subset=list(engineered) + ['latitude', 'longitude'])


In [4]:

# ----------------------------
# 3. Define Features and Targets
# ----------------------------

# Each entry pairs a column-name template with the lags it is expanded over.
# The order of the entries fixes the column order of the feature matrix.
feature_templates = [
    ('latitude_t-{lag}', lag_features_latitude),
    ('longitude_t-{lag}', lag_features_longitude),
    ('sog_t-{lag}', lag_features_latitude),
    ('heading_t-{lag}', lag_features_latitude),
    ('delta_latitude_t-{lag}', lag_features_latitude),
    ('delta_longitude_t-{lag}', lag_features_longitude),
    ('delta_sog_t-{lag}', lag_features_latitude),
    ('delta_heading_t-{lag}', lag_features_latitude),
    ('cog_t-{lag}_sin', lag_features_latitude),
    ('cog_t-{lag}_cos', lag_features_latitude),
    ('heading_t-{lag}_sin', lag_features_latitude),
    ('heading_t-{lag}_cos', lag_features_latitude),
]

# Columns that do not depend on a lag: rolling statistics, time encodings and
# the encoded vessel identifier.
static_features = [
    'sog_rolling_avg', 'cog_rolling_avg_sin', 'cog_rolling_avg_cos',
    'hour_sin', 'hour_cos', 'day_of_week_sin', 'day_of_week_cos',
    'vesselId_encoded',
]

features = [
    template.format(lag=lag)
    for template, lags in feature_templates
    for lag in lags
]
features.extend(static_features)

# Feature matrix and one target vector per coordinate
X = df[features]
y_latitude = df['latitude']
y_longitude = df['longitude']


In [5]:

# ----------------------------
# 4. Prepare for Stacking Ensemble
# ----------------------------

# Positional 80/20 split: the leading 80% of the rows of df form the training
# set and the remaining rows form the validation set. No shuffling is done,
# so the split follows whatever order df is in.
split_index = int(len(df) * 0.8)

X_train, X_val = X.iloc[:split_index], X.iloc[split_index:]
y_lat_train, y_lat_val = y_latitude.iloc[:split_index], y_latitude.iloc[split_index:]
y_lon_train, y_lon_val = y_longitude.iloc[:split_index], y_longitude.iloc[split_index:]


In [6]:

# ----------------------------
# 5. Train Base Models
# ----------------------------


def _make_base_models():
    """Build a fresh list of unfitted (name, estimator) base learners.

    Latitude and longitude use the same three learners with the same
    hyperparameters, so both lists come from this one factory. Each call
    returns new estimator instances.
    """
    boosting_params = dict(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        random_state=42,
    )
    return [
        ('xgb', xgb.XGBRegressor(objective='reg:squarederror', **boosting_params)),
        ('rf', RandomForestRegressor(n_estimators=100, max_depth=6, random_state=42)),
        ('lgb', lgb.LGBMRegressor(**boosting_params)),
    ]


# One independent set of base learners per target coordinate
base_models_lat = _make_base_models()
base_models_lon = _make_base_models()

# Fit every learner on the training split of its own target
for models, target in ((base_models_lat, y_lat_train), (base_models_lon, y_lon_train)):
    for _name, estimator in models:
        estimator.fit(X_train, target)


In [None]:

# ----------------------------
# 6. Generate Base Model Predictions for Meta-Model Training
# ----------------------------

# The meta-model must be trained on predictions the base models made for rows
# they were NOT fitted on. Predicting X_train with models already fitted on
# X_train yields over-optimistic inputs and teaches the meta-model to trust
# whichever base learner overfits the most. Out-of-fold predictions avoid
# this: every training row is predicted by a clone fitted on the other folds.
# cross_val_predict clones the estimator for each fold, so the base models
# fitted on the full training split are left untouched for validation and
# test-time prediction.
oof_cv = KFold(n_splits=5, shuffle=False)

meta_X_lat_train = pd.DataFrame({
    name: cross_val_predict(model, X_train, y_lat_train, cv=oof_cv)
    for name, model in base_models_lat
})
meta_X_lon_train = pd.DataFrame({
    name: cross_val_predict(model, X_train, y_lon_train, cv=oof_cv)
    for name, model in base_models_lon
})

# Validation rows were never seen by the fitted base models, so their
# predictions can be used directly.
meta_X_lat_val = pd.DataFrame({
    name: model.predict(X_val) for name, model in base_models_lat
})
meta_X_lon_val = pd.DataFrame({
    name: model.predict(X_val) for name, model in base_models_lon
})


In [None]:

# ----------------------------
# 7. Train Meta-Model
# ----------------------------

# A plain linear regression combines the base-model predictions; one
# meta-model is fitted per coordinate. fit() returns the estimator itself,
# so construction and training are chained.
meta_model_lat = LinearRegression().fit(meta_X_lat_train, y_lat_train)
meta_model_lon = LinearRegression().fit(meta_X_lon_train, y_lon_train)


In [None]:

# ----------------------------
# 8. Evaluate Ensemble Model
# ----------------------------

# Latitude: meta-model prediction on the validation meta-features and its MAE
ensemble_lat_pred = meta_model_lat.predict(meta_X_lat_val)
ensemble_lat_mae = mean_absolute_error(y_lat_val, ensemble_lat_pred)

# Longitude: same procedure with the longitude meta-model
ensemble_lon_pred = meta_model_lon.predict(meta_X_lon_val)
ensemble_lon_mae = mean_absolute_error(y_lon_val, ensemble_lon_pred)

# Errors are in the units of the target columns
print(f"Ensemble Mean Absolute Error for Latitude: {ensemble_lat_mae}")
print(f"Ensemble Mean Absolute Error for Longitude: {ensemble_lon_mae}")


In [None]:

# ----------------------------
# 9. Prepare Test Data
# ----------------------------

# Builds the same feature columns as the training data for every test row,
# using each vessel's last row in df as its most recent known observation.

# Parse timestamps before sorting so the rows are ordered chronologically
# rather than by raw string value.
test_df['time'] = pd.to_datetime(test_df['time'])
test_df = test_df.sort_values(by='time', kind='stable').reset_index(drop=True)

# Encode vesselId with the mapping learned by label_encoder on the training
# data: a fitted LabelEncoder assigns each class its position in classes_.
# Vessels that never appeared in training have no code and receive a
# sentinel instead of raising.
UNSEEN_VESSEL_CODE = -1
vessel_code_by_id = {
    vessel_id: code for code, vessel_id in enumerate(label_encoder.classes_)
}
test_df['vesselId_encoded'] = (
    test_df['vesselId']
    .map(vessel_code_by_id)
    .fillna(UNSEEN_VESSEL_CODE)
    .astype(int)
)

# Last known row of every vessel in the training data
latest_known_data = df.groupby('vesselId').last().reset_index()

# Attach the last known state to each test row; test rows whose vessel is
# absent from the training data get NaN here.
last_known_columns = ['latitude', 'longitude', 'sog', 'cog', 'heading']
test_df = test_df.merge(
    latest_known_data[['vesselId'] + last_known_columns],
    on='vesselId',
    how='left',
    suffixes=('', '_lag1')
)

# Lag-1 features are the last known values themselves
for column in last_known_columns:
    test_df[f'{column}_t-1'] = test_df[column]

# Deeper lags reuse the lag-1 values. Looping over the configured lags
# (instead of a hard-coded [2]) keeps this cell in step with the lag lists
# that define the feature columns.
for lag in lag_features_latitude:
    if lag == 1:
        continue
    for column in ('latitude', 'sog', 'cog', 'heading'):
        test_df[f'{column}_t-{lag}'] = test_df[f'{column}_t-1']

for lag in lag_features_longitude:
    if lag != 1:
        test_df[f'longitude_t-{lag}'] = test_df['longitude_t-1']

# Delta features and cyclical transformations
for lag in lag_features_latitude:
    test_df[f'delta_latitude_t-{lag}'] = 0  # Assume no change
    test_df[f'delta_sog_t-{lag}'] = 0
    test_df[f'delta_cog_t-{lag}'] = 0
    test_df[f'delta_heading_t-{lag}'] = 0

    # Cyclical transformations
    test_df[f'cog_t-{lag}_sin'] = np.sin(np.radians(test_df[f'cog_t-{lag}']))
    test_df[f'cog_t-{lag}_cos'] = np.cos(np.radians(test_df[f'cog_t-{lag}']))
    test_df[f'heading_t-{lag}_sin'] = np.sin(np.radians(test_df[f'heading_t-{lag}']))
    test_df[f'heading_t-{lag}_cos'] = np.cos(np.radians(test_df[f'heading_t-{lag}']))

# Longitude delta features
for lag in lag_features_longitude:
    test_df[f'delta_longitude_t-{lag}'] = 0  # Assume no change

# Rolling averages: only a single known observation is available, so it
# stands in for the average.
test_df['sog_rolling_avg'] = test_df['sog']
test_df['cog_rolling_avg'] = test_df['cog']
test_df['cog_rolling_avg_sin'] = np.sin(np.radians(test_df['cog_rolling_avg']))
test_df['cog_rolling_avg_cos'] = np.cos(np.radians(test_df['cog_rolling_avg']))

# Time-based features and cyclical transformations
test_df['hour'] = test_df['time'].dt.hour
test_df['day_of_week'] = test_df['time'].dt.dayofweek
test_df['hour_sin'] = np.sin(2 * np.pi * test_df['hour'] / 24)
test_df['hour_cos'] = np.cos(2 * np.pi * test_df['hour'] / 24)
test_df['day_of_week_sin'] = np.sin(2 * np.pi * test_df['day_of_week'] / 7)
test_df['day_of_week_cos'] = np.cos(2 * np.pi * test_df['day_of_week'] / 7)

# Fill any remaining missing values with zeros.
# NOTE(review): for vessels without training history this sets the last known
# position and motion to 0; confirm that fallback is acceptable.
test_df = test_df.fillna(0)

# Prepare features for test set (same columns, same order as training)
X_test = test_df[features]


In [None]:

# ----------------------------
# 10. Generate Base Model Predictions on Test Set
# ----------------------------

# One column per base learner, named after the learner, holding its
# prediction for every test row. These frames are the meta-model inputs.
meta_X_lat_test = pd.DataFrame({
    learner_name: learner.predict(X_test)
    for learner_name, learner in base_models_lat
})

meta_X_lon_test = pd.DataFrame({
    learner_name: learner.predict(X_test)
    for learner_name, learner in base_models_lon
})


In [None]:

# ----------------------------
# 11. Make Final Predictions with Meta-Model
# ----------------------------

# Combine the base-model predictions through the fitted meta-models
ensemble_lat_pred_test = meta_model_lat.predict(meta_X_lat_test)
ensemble_lon_pred_test = meta_model_lon.predict(meta_X_lon_test)

# Latitude is clamped to [-90, 90]; longitude is wrapped onto [-180, 180)
# by shifting, taking the remainder modulo 360, and shifting back.
test_df['latitude_predicted'] = np.clip(ensemble_lat_pred_test, -90, 90)
test_df['longitude_predicted'] = np.mod(ensemble_lon_pred_test + 180, 360) - 180


In [None]:

# ----------------------------
# 12. Prepare Submission File
# ----------------------------

# Columns written to the submission, in output order
submission_columns = ['ID', 'longitude_predicted', 'latitude_predicted']
output_df = test_df[submission_columns]

# Write the predictions without the DataFrame index
output_df.to_csv('predictions.csv', index=False)
