In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Define file paths
ais_tracks_path = '../../data/tracks_ais.csv'
radar_detections_path = '../../data/detections_radar.csv'

ais_tracks = pd.read_csv(ais_tracks_path)
radar_detections = pd.read_csv(radar_detections_path)

In [25]:
BATCH_SIZE = 32
MAX_LENGTH = 256
HIDDEN_DIM = 64

## Preprocess

In [26]:
import pandas as pd
import torch
import warnings
from utilities import VesselTypeAggregator

# Set device
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Using device:", device)

# Preprocessing: add `datetime`, `time_elapsed`
def preprocess_group(group):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)
        group['datetime'] = pd.to_datetime(group['cdate'] + ' ' + group['ctime'])
        group = group.sort_values('datetime')  # Ensure chronological order
        group['time_diff'] = group['datetime'].diff().dt.total_seconds().fillna(0)
        group['time_elapsed'] = group['time_diff'].cumsum()
    return group

# Merge radar detections with AIS labels
merged_tracks = pd.merge(
    ais_tracks[['assoc_id', 'type_m2']],
    radar_detections,
    left_on='assoc_id',
    right_on='id_track',
    how='inner'
)

# Apply time-based preprocessing
merged_tracks = merged_tracks.groupby('id_track').apply(preprocess_group).reset_index(drop=True)

vessel_type_aggregator = VesselTypeAggregator()
vessel_type_aggregator.aggregate_vessel_type(merged_tracks)

# Feature and label columns
feature_cols = ['speed', 'course', 'time_elapsed', 'latitude', 'longitude']
label_col = 'type_m2_agg'

# Build label dictionary (id_track -> int class id)
label_classes = {label: i for i, label in enumerate(merged_tracks[label_col].unique())}
type_dict = merged_tracks.drop_duplicates('id_track').set_index('id_track')[label_col].map(label_classes).to_dict()

# Group by track
grouped = merged_tracks.groupby('id_track')

# Prepare tensor dataset
track_data = []
for id_track, group in grouped:
    if id_track not in type_dict:
        continue

    features = torch.tensor(group[feature_cols].values, dtype=torch.float32)
    track_data.append({
        'features': features.to(device),  # T x M
        'label': torch.tensor(type_dict[id_track], dtype=torch.long).to(device)
    })

print(f"✅ Prepared {len(track_data)} track tensors (raw features) on {device}")

Using device: mps


  merged_tracks = merged_tracks.groupby('id_track').apply(preprocess_group).reset_index(drop=True)


✅ Prepared 15252 track tensors (raw features) on mps


In [27]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    sequences = [item['features'] for item in batch]
    labels = torch.stack([item['label'] for item in batch])
    lengths = torch.tensor([seq.size(0) for seq in sequences])
    padded_sequences = pad_sequence(sequences, batch_first=True)  # B, T_max, M

    return padded_sequences, lengths, labels

In [28]:
from torch.utils.data import random_split, DataLoader, Dataset

class VesselDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Set random seed for reproducibility
seed = 42
torch.manual_seed(seed)

# Total size
total_size = len(track_data)

# Sizes for split
test_size = val_size = int(0.1 * total_size)
train_size = total_size - test_size - val_size

# Perform the split
full_dataset = VesselDataset(track_data)
train_set, val_set, test_set = random_split(full_dataset, [train_size, val_size, test_size])

# Sample 20% of the training set for lightweight training
small_train_size = int(0.2 * len(train_set))
indices = np.random.choice(len(train_set), small_train_size, replace=False)
small_train_subset = torch.utils.data.Subset(train_set, indices)

# DataLoaders
train_loader = DataLoader(small_train_subset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

print(f"✅ Data split: {len(small_train_subset)} small train, {len(val_set)} val, {len(test_set)} test")


✅ Data split: 2440 small train, 1525 val, 1525 test


In [32]:
import torch.nn as nn

class VesselRNNClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes, num_layers=1):
        super().__init__()
        self.gru = nn.GRU(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, lengths):
        #lengths: length B, indicating real length of each sequence
        x_padded = nn.utils.rnn.pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False) # N * T_max * M
        _, hidden = self.gru(x_padded) 
        logits = self.fc(hidden[-1])  # B * num_classes
        return logits


In [33]:
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR

input_dim = len(feature_cols)
num_classes = len(label_classes)

model = VesselRNNClassifier(input_dim, HIDDEN_DIM, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.05)
scheduler = StepLR(optimizer, step_size=2, gamma=0.5) 

train_losses = []
val_losses = []

for epoch in range(10):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=False)

    for x_batch, lengths, y_batch in progress_bar:
        optimizer.zero_grad()
        logits = model(x_batch, lengths) #B * num_classes
        loss = criterion(logits, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    scheduler.step()


    avg_train_loss = total_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Evaluate on validation set
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for x_val, lengths_val, y_val in val_loader:
            output_val = model(x_val, lengths_val)
            loss_val = criterion(output_val, y_val)
            val_loss += loss_val.item()

    avg_val_loss = val_loss / len(val_loader)
    val_losses.append(avg_val_loss)

    print(f"✅ Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

save_path = 'models/train_20_perc.pth'
save_path = 'models/train_20_perc.pth'
torch.save({
    'epoch': epoch + 1,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'scheduler_state_dict': scheduler.state_dict(),
    'train_losses': train_losses,
    'val_losses': val_losses,
    'input_dim': input_dim,
    'hidden_dim': HIDDEN_DIM,
    'num_classes': num_classes,
}, save_path)


                                                                   

KeyboardInterrupt: 