### Activity Model Using XGBoost & Filter Radar Dataset

* Dataset: tracks from `tagged_detections`, hand-labelled with their activity with the help of experts
* Output: an XGBoost model
    * The model turns out to be particularly good at identifying the transit and stopped vessel activities

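The activity model was then run on the preprocessed radar detections (`preprocessed_radar_detections.csv`) to obtain an `activity_inferred` label and an `activity_confidence_score` for each track; the results are stored in `radar_activity_labels_inferred.csv`.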


In [37]:
import pandas as pd
import numpy as np
import torch
import warnings
import sys, os

# Put the parent directory on the import path so the project's `core` package resolves.
sys.path.append(os.path.abspath(os.pardir))

# Raw per-detection records and the cleaned, hand-tagged track table (v6).
detections_tagged = pd.read_csv('../../data/raw_data/detections_tagged.csv')
tagged_detections_v6 = pd.read_csv('../../data/cleaned_data/tracks_tagged_v6.csv')

# Use the Apple-silicon GPU backend (MPS) when torch reports it as available, else the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: mps


In [38]:
# Extract the (track id, activity) pairs, keeping only tracks that carry an activity label.
tagged_detections_activity_labels = (
    tagged_detections_v6
    .dropna(subset=['activity'])
    [['id_track', 'activity']]
)

# Persist the labels so they can be reloaded independently of the tagged-track table.
save_path = '../../data/labels/activity_labels_from_tagged_detections.csv'
tagged_detections_activity_labels.to_csv(save_path, index=False)

#### Set up

In [39]:
# Load the activity labels from the CSV file and report how many rows it holds.
activity_label = pd.read_csv(
    '../../data/labels/activity_labels_from_tagged_detections.csv'
)
print(f'Length of activity_label: {len(activity_label)}')


Length of activity_label: 2594


In [40]:
from core.preprocess import DisruptionFilter

# Combine the separate date and time text columns into one timestamp column.
# errors='coerce' turns unparseable values into NaT instead of raising.
timestamp_text = detections_tagged['cdate'] + ' ' + detections_tagged['ctime']
detections_tagged['datetime'] = pd.to_datetime(timestamp_text, errors='coerce')

# Run the project's DisruptionFilter over the detections and keep its output.
valid_detections = DisruptionFilter(detections_tagged)()

n_unique_tracks = valid_detections["id_track"].nunique()
print(f'Number of unique tracks: {n_unique_tracks}')


Number of unique tracks: 8684


In [41]:
from core.sum_stats import SumStatsBaseline

# Build the summary-statistics table from the filtered detections by instantiating
# SumStatsBaseline and calling the instance.
# NOTE(review): this call emits RuntimeWarnings from the `std_heading` and `curviness`
# computations (see the cell output); presumably caused by degenerate tracks, e.g. a zero
# `distance_o` -- TODO confirm and check how such values are handled downstream.
summary_df = SumStatsBaseline(valid_detections)()

  std_heading = np.sqrt(-np.log(meanCos*meanCos + meanSin*meanSin))
  std_heading = np.sqrt(-np.log(meanCos*meanCos + meanSin*meanSin))
  curviness = total_distance / distance_o
  curviness = total_distance / distance_o


In [42]:
# Display the columns of the summary-statistics table produced by SumStatsBaseline.
summary_df.columns

Index(['id_track', 'duration', 'distance_o', 'detections', 'max_speed',
       'min_speed', 'avg_speed', 'curviness', 'heading_mean', 'heading_std',
       'turning_mean', 'turning_std', 'distance_total'],
      dtype='object')

In [43]:
# Attach the activity label to each track's summary statistics.
# The inner join keeps only tracks present in both tables.
detections_labeled_sumstats = summary_df.merge(
    activity_label,
    how='inner',
    on='id_track',
)
print(f'Length of detections_labeled_sumstats: {len(detections_labeled_sumstats)}')

Length of detections_labeled_sumstats: 2558


In [44]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from core.DICT import ACITIVTY2NUM, NUM2ACTIVITY

features_subset = SumStatsBaseline.FEATURE_NAMES

# Prepare the feature matrix (per-track summary statistics) and the target (activity label).
X = detections_labeled_sumstats[features_subset]
y = detections_labeled_sumstats['activity']

# Encode the string labels as integers. `Series.map` silently yields NaN for any label
# missing from the mapping, so fail loudly here rather than deep inside the training code.
y_numeric = y.map(ACITIVTY2NUM)
unmapped_labels = y[y_numeric.isna()].unique().tolist()
if unmapped_labels:
    raise ValueError(f"Activity labels missing from ACITIVTY2NUM: {unmapped_labels}")

# Train-test split (80:20), stratified by activity class so both splits keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_numeric, test_size=0.2, stratify=y_numeric, random_state=42
)

# Define the XGBoost model.
# 'multi:softprob' is used (rather than 'multi:softmax') because this notebook later calls
# `predict_proba` to obtain confidence scores; `predict` still returns the most likely class.
model = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=y_numeric.nunique(),
    eval_metric='mlogloss',
    random_state=42
)

# Hyperparameters settled on after a few manual tries; the grid holds a single candidate,
# so the search below only provides the cross-validated score for this configuration.
param_grid = {
    'max_depth': [13],
    'learning_rate': [0.1],
    'n_estimators': [350]
}

# Perform 4-fold cross-validation with grid search
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=4,
    scoring='accuracy',
    n_jobs=-1,  # Use all available cores
    verbose=1
)

grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy: {:.4f}".format(grid_search.best_score_))
best_model = grid_search.best_estimator_

# Evaluate once on the held-out test set, which took no part in the cross-validation.
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print("\nTest Set Accuracy: {:.4f}".format(test_accuracy))

Fitting 4 folds for each of 1 candidates, totalling 4 fits


Best parameters: {'learning_rate': 0.1, 'max_depth': 13, 'n_estimators': 350}
Best cross-validation accuracy: 0.9223

Test Set Accuracy: 0.9277


In [45]:
# Translate the integer class ids back to activity names.
# `y_pred` is wrapped in a Series so `.map` can be used; the confusion matrix compares the
# two sequences by position, so their differing indexes do not matter.
y_test_str = y_test.map(NUM2ACTIVITY)
y_pred_str = pd.Series(y_pred).map(NUM2ACTIVITY)

# Use every class seen in either the truth or the predictions, in a stable sorted order,
# for both the matrix axes and the tick labels.
labels_order = sorted(set(y_test_str) | set(y_pred_str))
cm = confusion_matrix(y_test_str, y_pred_str, labels=labels_order)

# Make sure the output directory exists; savefig raises if it is missing.
cm_path = 'models/xgboost_activity_cm.png'
os.makedirs(os.path.dirname(cm_path), exist_ok=True)

# Plot the confusion matrix on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels_order,
    yticklabels=labels_order,
    ax=ax,
)

ax.set_title('Confusion Matrix - Activity Classification')
ax.set_xlabel('Predicted Activity')
ax.set_ylabel('True Activity')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(ax.get_yticklabels(), rotation=0)
fig.tight_layout()

# The figure is written to disk and then closed, so it is not rendered inline.
fig.savefig(cm_path, dpi=300)
plt.close(fig)


In [46]:
import os
import joblib
import json

# Make sure the output directory exists; both writes below fail if it is missing.
os.makedirs('models', exist_ok=True)

# Persist the fitted estimator.
save_path = 'models/xgboost_activity.pkl'
joblib.dump(best_model, save_path)

# Companion metadata: records which features (and in which order) the model expects,
# plus the scores obtained in this run. Values are coerced to built-in types so that
# json.dump does not fail on array-like or numpy scalar inputs.
model_description = {
    "model_name": "xgboost_activity",
    "features": list(features_subset),
    "params": grid_search.best_params_,
    "cv_accuracy": float(grid_search.best_score_),
    "test_accuracy": float(test_accuracy),
    "notes": "trained on manually labelled tagged tracks (v6). 80:20 stratified split. 4-fold CV."
}

with open("models/xgboost_activity_desc.json", "w") as f:
    json.dump(model_description, f, indent=4)

#### Infer on Original Radar Detections

In [47]:
from core.sum_stats import SumStatsBaseline
import joblib
import json

# Restore the persisted classifier together with its JSON description.
activity_model = joblib.load("models/xgboost_activity.pkl")
with open("models/xgboost_activity_desc.json", "r") as desc_file:
    model_info = json.load(desc_file)

# Use the feature list recorded alongside the model.
features_subset = model_info['features']

# Load the preprocessed radar detections and compute their summary statistics
# with SumStatsBaseline.
radar_detections = pd.read_csv('../../data/cleaned_data/preprocessed_radar_detections.csv')
radar_detections_sumstats = SumStatsBaseline(radar_detections)()

# Display the resulting columns.
radar_detections_sumstats.columns

Index(['id_track', 'duration', 'distance_o', 'detections', 'max_speed',
       'min_speed', 'avg_speed', 'curviness', 'heading_mean', 'heading_std',
       'turning_mean', 'turning_std', 'distance_total'],
      dtype='object')

In [48]:
# Build the feature matrix using the feature list stored with the model.
X = radar_detections_sumstats[features_subset]

# Predict the integer class ids and translate them back to activity names.
y_pred_numeric = activity_model.predict(X)
y_pred = [NUM2ACTIVITY[i] for i in y_pred_numeric]

# Confidence score = the highest class probability in each row of predict_proba's output.
y_pred_prob = activity_model.predict_proba(X)
confidence_scores = y_pred_prob.max(axis=1)

radar_detections_sumstats['activity_inferred'] = y_pred
radar_detections_sumstats['activity_confidence_score'] = confidence_scores

# Kept as the last expression so the class distribution is actually rendered by the notebook.
radar_detections_sumstats['activity_inferred'].value_counts()

In [49]:
# Keep only the track id, the inferred activity and its confidence score, then export them.
# No rows are filtered here: every track is written out, whatever its inferred activity.
export_columns = ['id_track', 'activity_inferred', 'activity_confidence_score']
radar_activity_inferred = radar_detections_sumstats.loc[:, export_columns]

radar_activity_inferred.to_csv(
    '../../data/labels/radar_activity_labels_inferred.csv',
    index=False,
)