### Dataset Relation

`ais_tracks` & `radar_tracks`: both contain 27416 corresponding tracks, i.e. tracks identified by both radar and AIS. These can be treated as training data.

`tracks_tagged`: 9013 radar tracks tagged by users in M2.

`tagged_detections`: 6756272 timestamps covering 9020 individual tracks in total, including ALL 9013 tracks in `tracks_tagged`. The remaining 7 tracks are from AIS, 5 of which also appear in `ais_tracks`.

`radar_detections`: 7387790 timestamps and 19947 radar tracks (`id_track`). These reference 16591 unique `assoc_id` values, 15345 of which appear in `ais_tracks`; the other 1246 have no matching AIS track.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Relative locations of the five CSV exports used throughout the notebook
ais_tracks_path = '../data/tracks_ais.csv'
radar_tracks_path = '../data/tracks_radar.csv'
radar_detections_path = '../data/detections_radar.csv'
tagged_detections_path = '../data/detections_tagged.csv'
tracks_tagged_path = '../data/tracks_tagged.csv'

# Read every table into its own DataFrame (same order as the paths above)
ais_tracks, radar_tracks, radar_detections, tagged_detections, tracks_tagged = (
    pd.read_csv(csv_path)
    for csv_path in (
        ais_tracks_path,
        radar_tracks_path,
        radar_detections_path,
        tagged_detections_path,
        tracks_tagged_path,
    )
)

# Report the row count of each loaded table
for table_name, table in (
    ("ais_tracks", ais_tracks),
    ("radar_tracks", radar_tracks),
    ("radar_detections", radar_detections),
    ("tagged_detections", tagged_detections),
    ("tracks_tagged", tracks_tagged),
):
    print(f"Length of {table_name}:", len(table))

Length of ais_tracks: 27416
Length of radar_tracks: 27416
Length of radar_detections: 7387790
Length of tagged_detections: 6756272
Length of tracks_tagged: 9013


In [2]:
# Distinct track identifiers found in the id_track column of each table
ais_tracks_id = set(ais_tracks['id_track'])
radar_tracks_id = set(radar_tracks['id_track'])
radar_detections_id = set(radar_detections['id_track'])
tagged_detections_id = set(tagged_detections['id_track'])
tracks_tagged_id = set(tracks_tagged['id_track'])

# Print how many distinct identifiers each table holds
for id_set_name, id_set in (
    ("ais_tracks_id", ais_tracks_id),
    ("radar_tracks_id", radar_tracks_id),
    ("radar_detections_id", radar_detections_id),
    ("tagged_detections_id", tagged_detections_id),
    ("tracks_tagged_id", tracks_tagged_id),
):
    print(f"Unique {id_set_name} count:", len(id_set))

Unique ais_tracks_id count: 27416
Unique radar_tracks_id count: 27416
Unique radar_detections_id count: 19947
Unique tagged_detections_id count: 9020
Unique tracks_tagged_id count: 9013


In [3]:
# Build each identifier set once and reuse it for the counts below
detection_assoc_ids = set(radar_detections['assoc_id'])
detection_track_ids = set(radar_detections['id_track'])
ais_track_ids = set(ais_tracks['id_track'])
track_assoc_pairs = radar_detections[['id_track', 'assoc_id']].drop_duplicates()

len1 = len(detection_assoc_ids)                  # distinct assoc_id values
len2 = len(detection_track_ids)                  # distinct radar id_track values
len3 = len(detection_assoc_ids & ais_track_ids)  # assoc_id values that are AIS track ids
len4 = len(track_assoc_pairs)                    # distinct (id_track, assoc_id) rows

print(f'Number of unique assoc_id: {len1}')
print(f'Number of unique id_track: {len2}')
print(f'Number of unique assoc_id that are also in ais_tracks: {len3}')
print(f'Missing in AIS: {len1 - len3} tracks')
print(f'Total number of unique (id_track, assoc_id) pairings: {len4}')


Number of unique assoc_id: 16591
Number of unique id_track: 19947
Number of unique assoc_id that are also in ais_tracks: 15345
Missing in AIS: 1246 tracks
Total number of unique (id_track, assoc_id) pairings: 19947


In [21]:
# assoc_id values referenced by radar detections that do not occur as an
# id_track in ais_tracks. (A - (A & B) is the same set as A - B.)
missing_assoc_id = set(radar_detections['assoc_id']) - set(ais_tracks['id_track'])

# All detection rows whose assoc_id has no AIS counterpart
missing_rows = radar_detections[radar_detections['assoc_id'].isin(missing_assoc_id)]

# Show a few example pairs; the fixed seed keeps the displayed rows the same
# across kernel restarts (42 matches the seed used elsewhere in this notebook).
missing_rows[['id_track', 'assoc_id']].sample(5, ignore_index=True, random_state=42)

Unnamed: 0,id_track,assoc_id
0,36979855,36979840
1,32063462,32065809
2,31251315,31251147
3,37731466,37731774
4,38363988,38366686


Unnamed: 0,id_track,id_site,id_m2,source,duration,alarm,min_speed,max_speed,avg_speed,curviness,...,dest,eta_month,eta_day,eta_hour,eta_minute,type_m2,sdate,stime,ldate,ltime


In [None]:
import pandas as pd

# Relies on radar_detections and ais_tracks having been loaded by an earlier cell.
# Keep only the detections whose assoc_id is not an id_track in ais_tracks.
filtered_radar_detections = radar_detections[~radar_detections['assoc_id'].isin(ais_tracks['id_track'])]

# Three example assoc_id values; seeded so the cell output is reproducible
filtered_radar_detections['assoc_id'].sample(3, random_state=42).tolist()

### Radar Detections

In [None]:
# For each assoc_id, gather the set of distinct non-null id_track values
tracks_grouped_by_assoc = radar_detections.groupby('assoc_id')['id_track']
assoc_track_df = tracks_grouped_by_assoc.apply(
    lambda track_ids: set(track_ids.dropna().unique())
).reset_index()

# Overview of the resulting table
print("Number of unique associated IDs:", len(assoc_track_df))
print("\nFirst few rows:")
display(assoc_track_df.head())

# Size of every track set, computed once and reused for the statistics below
tracks_per_assoc = assoc_track_df['id_track'].apply(len)

print("\nStatistics about track sets:")
print("Average number of tracks per assoc_id:", tracks_per_assoc.mean())
print("Maximum number of tracks for an assoc_id:", tracks_per_assoc.max())
print("Number of assoc_ids with no tracks:", (tracks_per_assoc == 0).sum())

### Size EDAs

In [None]:
# Derive overall dimensions by summing the paired AIS dimension columns
ais_tracks['length'] = ais_tracks['dim_a'] + ais_tracks['dim_b']
ais_tracks['width'] = ais_tracks['dim_c'] + ais_tracks['dim_d']

# Build the mask on the same frame it is applied to, so the boolean index is
# aligned (the previous version masked a dropna() subset with a mask computed
# on the full frame). Comparisons against NaN evaluate to False, so rows with a
# missing length or width are excluded here without a separate dropna().
length_width_filter = (ais_tracks['length'] > 0) & (ais_tracks['width'] > 0)

# .copy() gives an independent frame, so later cells can add columns to it
# without pandas warning about writing to a slice of ais_tracks.
ais_tracks_with_size = ais_tracks[length_width_filter].copy()

print(f'Total number of AIS tracks with size info: {len(ais_tracks_with_size)}')

# Only the last expression of a cell is rendered automatically, so the summary
# statistics are displayed explicitly instead of being silently discarded.
display(ais_tracks_with_size[['length', 'width']].describe())

ais_tracks_with_size['length'].hist(bins=100, edgecolor='black')
plt.title('Histogram of AIS Track Lengths')
plt.xlabel('Length')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Count tracks per vessel type (value_counts orders by descending frequency)
type_counts = ais_tracks_with_size['type_m2'].value_counts()
# head(len(type_counts)) keeps every type; pass a smaller number to fold the
# rarer types into 'other'.
top_types = type_counts.head(len(type_counts))
print(top_types)

# Types that keep their own label; anything else (including missing values,
# which value_counts does not count) becomes 'other'.
kept_types = set(top_types.index)

# .assign returns a new frame, which avoids writing a column into what may be
# a slice of ais_tracks (SettingWithCopyWarning) and keeps the cell re-runnable.
ais_tracks_with_size = ais_tracks_with_size.assign(
    type_m2_agg=ais_tracks_with_size['type_m2'].apply(
        lambda vessel_type: vessel_type if vessel_type in kept_types else 'other'
    )
)

# Two vertically stacked panels: length on top, width below
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(7, 7))

panels = (
    (ax1, 'length', 'Length Distribution by Type', 'Length (meters)'),
    (ax2, 'width', 'Width Distribution by Type', 'Width (meters)'),
)
for ax, column, title, xlabel in panels:
    # One semi-transparent histogram per aggregated type on the same axes
    for type_name in ais_tracks_with_size['type_m2_agg'].unique():
        type_data = ais_tracks_with_size[ais_tracks_with_size['type_m2_agg'] == type_name]
        ax.hist(type_data[column], bins=30, alpha=0.5, label=type_name)

    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')
    ax.legend()
    ax.grid(True)

# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()

# Print summary statistics for each type
print("\nSummary Statistics:")
print("\nLength Statistics by Type:")
print(ais_tracks_with_size.groupby('type_m2_agg')['length'].describe())
print("\nWidth Statistics by Type:")
print(ais_tracks_with_size.groupby('type_m2_agg')['width'].describe())

# Print the original type counts for reference
print("\nOriginal Type Counts:")
print(type_counts)

In [None]:
# Observation recorded from the size analysis above
print("Noted that class-b vessels don't have corresponding size information in AIS_data")

### Multinomial (Multiclass) Classification with XGBoost

In [67]:
# Columns of radar_tracks used as model inputs in the cells below
features_subset = [
    'min_speed',
    'max_speed',
    'avg_speed',
    'curviness',
    'heading_mean',
    'heading_std',
    'turning_mean',
    'turning_std',
    'duration_z',
    'distance',
    'distance_o',
]

In [None]:
# Pairwise correlations between the selected radar-track feature columns
correlation_matrix = radar_tracks[features_subset].corr()

# Heatmap drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 10))
heatmap_options = dict(
    annot=True,        # write the correlation value in each cell
    cmap='coolwarm',   # diverging palette
    center=0,          # colormap centred on zero correlation
    fmt='.2f',         # two decimal places
    square=True,       # square cells
)
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)

ax.set_title('Correlation Matrix of Radar Track Features')
fig.tight_layout()  # keep tick labels from being cut off
plt.show()

# Also show the matrix as a table for reference
print("\nCorrelation Matrix:")
display(correlation_matrix)

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score


# Feature matrix: the radar_tracks columns listed in features_subset
X = radar_tracks[features_subset]

# Vessel types ordered by frequency; head(len(...)) currently keeps all of them
type_counts = radar_tracks['type_m2'].value_counts()
top_types = type_counts.head(len(type_counts)).index


def _to_aggregated_type(vessel_type):
    """Return vessel_type unchanged if it is one of top_types, else 'other'."""
    if vessel_type in top_types:
        return vessel_type
    return 'other'


# Aggregated type label stored alongside the original column
radar_tracks['type_m2_agg'] = radar_tracks['type_m2'].apply(_to_aggregated_type)

# Target: aggregated vessel type category
y = radar_tracks['type_m2_agg']
print(y.value_counts())

In [98]:
# Two-way lookup between vessel-type labels and integer class ids.
# Ids are assigned 0, 1, 2, ... in the order the labels first appear in y.
num2ship = dict(enumerate(y.unique()))
ship2num = {vessel: num for num, vessel in num2ship.items()}

# Integer-encoded target
y_numeric = y.map(ship2num)


In [None]:
# Hold out 20% of the tracks for testing, stratified on the encoded class
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_numeric,
    test_size=0.2,
    stratify=y_numeric,
    random_state=42,
)

# XGBoost hyper-parameters gathered in one place
xgb_params = dict(
    objective='multi:softmax',  # use 'multi:softprob' for probability outputs
    num_class=len(y.unique()),
    eval_metric='mlogloss',
    eta=0.1,  # learning rate
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
model = xgb.XGBClassifier(**xgb_params)

# 4-fold stratified cross-validation on the training portion only
kf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring='accuracy')
print(f"4-Fold CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Fit on the complete training set, then score the held-out test set
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

In [None]:
from sklearn.metrics import confusion_matrix

# Fix the class order explicitly. Without `labels=`, confusion_matrix only
# includes classes that occur in y_test or y_pred, so a class missing from both
# would shrink the matrix and shift every tick label after it.
class_ids = sorted(num2ship)
class_names = [num2ship[class_id] for class_id in class_ids]

# Rows are true classes, columns are predicted classes, both in class_ids order
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

# Create a figure with a larger size
plt.figure(figsize=(12, 8))

# Create heatmap of confusion matrix
sns.heatmap(cm,
            annot=True,  # Show numbers in cells
            fmt='d',     # Format as integers
            cmap='Blues',  # Use blue color scheme
            xticklabels=class_names,  # Vessel type names, aligned with the columns
            yticklabels=class_names)  # Vessel type names, aligned with the rows

# Customize the plot
plt.title('Confusion Matrix - Vessel Type Classification')
plt.xlabel('Predicted Vessel Type')
plt.ylabel('True Vessel Type')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.yticks(rotation=0)
plt.tight_layout()  # Adjust layout to prevent label cutoff

# Show the plot
plt.show()

# Print classification report for detailed metrics
# print("\nClassification Report:")
# print(classification_report(y_test, y_pred, target_names=list(ship2num.keys())))

In [None]:
# Takeaway from the confusion matrix above
print('cargo ship & tanker ship get mixed up: but it is probably ok!')

### Looking at Hand-Written Labels

In [None]:
# Attach the AIS vessel type to every radar detection whose assoc_id matches an
# AIS track id. Inner join: detections without a matching AIS track are dropped.
#
# validate='many_to_one' makes pandas raise if ais_tracks.id_track is not
# unique. The "unmatched" count below is only correct under that condition,
# because a duplicated key would multiply the matching detection rows.
#
# Both inputs have an id_track column, so the result carries id_track_x
# (the radar detection's own track id) and id_track_y (the AIS track id).
merged_detections = radar_detections.merge(
    ais_tracks[['id_track', 'type_m2']],
    left_on='assoc_id',
    right_on='id_track',
    how='inner',
    validate='many_to_one',
)

print("Original radar_detections shape:", radar_detections.shape)
print("Merged dataset shape:", merged_detections.shape)
print("\nNumber of matched entries:", len(merged_detections))
print("Number of unmatched radar detections:", len(radar_detections) - len(merged_detections))

# Display the first few rows of the merged dataset
print("\nFirst few rows of merged dataset:")
display(merged_detections.head())

# Display type distribution in merged dataset
print("\nType distribution in merged dataset:")
print(merged_detections['type_m2'].value_counts())