# Deep Data Analysis (MC2)

This notebook provides deeper analysis of the data with the aim of spotting suspicious vessels using Data Mining tools and techniques to answer Q3 and Q4 of the challenge. The main goal is assessing similarity between **SouthSeafood Express Corp vessels** and other vessels, to identify potential links or patterns of suspicious behavior. More specifically: computing **similarity scores** and rank vessels according to risk.

In [24]:
import json
import pandas as pd
import numpy as np
import seaborn as sns
import altair as alt
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

### Load .json data and convert it to CSV

In [25]:
# Load .json data and read it as pandas DataFrame
folder_path = '../data/'

with open(folder_path + 'commodities.json') as f:
    commodities = pd.json_normalize(json.load(f))

with open(folder_path + 'vessels.json') as f:
    vessels = pd.json_normalize(json.load(f))

with open(folder_path + 'locations.json') as f:
    locations = pd.json_normalize(json.load(f))

with open(folder_path + 'documents.json') as f:
    documents = pd.json_normalize(json.load(f))

with open(folder_path + 'transponder_pings.json') as f:
    transponder_pings = pd.json_normalize(json.load(f))

with open(folder_path + 'harbor_reports.json') as f:
    harbor_reports = pd.json_normalize(json.load(f))

with open(folder_path + 'transactions.json') as f:
    transactions = pd.json_normalize(json.load(f))

Read timestamps

In [26]:
# Convert timestamps to datetime
transponder_pings['time'] = pd.to_datetime(transponder_pings['time'], errors='coerce')
transactions['date'] = pd.to_datetime(transactions['date'], errors='coerce')
harbor_reports['date'] = pd.to_datetime(harbor_reports['date'], errors='coerce')

# Convert dwell to numbers
transponder_pings['dwell'] = pd.to_numeric(transponder_pings['dwell'], errors='coerce')
transponder_pings['dwell_hours'] = pd.to_numeric(transponder_pings['dwell_hours'], errors='coerce')
transponder_pings['dwell_days'] = pd.to_numeric(transponder_pings['dwell_days'], errors='coerce')

### Computing Similarity scores

In [27]:
# Sort values
transponder_pings_sorted = transponder_pings.sort_values(['target', 'time'])

trajectories = {}

for vessel, group in transponder_pings_sorted.groupby('target'):
    #traj = list(zip(group['time'], group['source'], group['dwell'])) # list of (time, location, dwell)
    traj = list(group['source'])
    trajectories[vessel] = traj

# Create a df for each vessel with its trajectory
vessel_groups = {v: g[['time','source','dwell']].reset_index(drop=True)
                 for v, g in transponder_pings_sorted.groupby('target')}

# Lists of all zones/ports
all_ports = list(set(transponder_pings_sorted['source']))
port_to_idx = {p: i for i, p in enumerate(all_ports)}

In [28]:
vessel_groups.keys()

dict_keys(['albacoreangler47d', 'albacoreassaulter482', 'amberjackassaulterd52', 'americaneelenthusiastcfa', 'anchovyassaulterb1c', 'aquaticangler5c3', 'aquaticpursuitf31', 'aquatransit6bc', 'arcticgraylingangler094', 'athenad34', 'atlanticbluemarlinmarauder0b4', 'atlanticcodcatcherca6', 'baitedbreath538', 'barracudabaiter8b3', 'barracudabandit836', 'bassbaiterb9f', 'bassbandit0d5', 'bigeyetunabanditb73', 'bigeyetunabuccaneera16', 'blackbullheadbandit801', 'blackdrumbanditc5b', 'bluecatfishcatcher468', 'bluefintunabandit177', 'bluefishbandit8ec', 'bluegillbandita5f', 'blueharbor2c1', 'bluemarlinbandit292', 'bonefishbaiter565', 'breambanditc85', 'brillbandit0a1', 'brinebuccaneer9fd', 'brooktroutbuccaneerc0b', 'brownbullheadbriganded2', 'browntroutbandite67', 'bulkcarriers6cd', 'burbotbandit7bf', 'cargocatalyst39a7', 'cargocatalystb02', 'cargocentric443e', 'cargocentric4d0', 'cargocircuit26cc', 'cargocircuit545', 'cargocosmoscde', 'cargocrestb7c', 'cargocynosure29d', 'carpcapturer993', '

In [29]:
vessel_groups.get('albacoreangler47d') # Example vessel trajectory data

Unnamed: 0,time,source,dwell
0,2035-02-01 05:29:28.653,Cod Table,62410.977112
1,2035-02-01 22:49:39.630,Nav 2,6030.279121
2,2035-02-02 00:30:09.909,Cod Table,15499.004992
3,2035-02-02 04:48:28.914,Nav 2,6276.690890
4,2035-02-02 06:33:05.605,Cod Table,31291.196472
...,...,...,...
1230,NaT,Cod Table,23402.230536
1231,NaT,Cod Table,24781.174384
1232,NaT,Cod Table,11408.257794
1233,NaT,Cod Table,11594.106956


Similarity functions

In [30]:
def jaccard_similarity(seq1, seq2):
    """Jaccard similarity based on zone/port visits"""
    set1, set2 = set(seq1), set(seq2)
    if not set1 or not set2:
        return 0.0
    return len(set1 & set2) / len(set1 | set2)

def dwell_vector(df):
    """Port dwell time vector"""
    vec = np.zeros(len(all_ports))
    for _, row in df.iterrows():
        port = row['source']
        if port is None:
            continue
        idx = port_to_idx[port]
        vec[idx] += row['dwell']
    return vec

Let's compute similarity w.r.t. *snappersnatcher7be*

In [31]:
ref_vessel = 'snappersnatcher7be'
ref_df = vessel_groups[ref_vessel]
ref_df = ref_df[ref_df['source'].notna()]
ref_dwell_vec = dwell_vector(ref_df)
ref_ports = list(ref_df['source'])

similarities_jaccard = {}
similarities_dwell = {}

for vessel, df in tqdm(vessel_groups.items(), desc="Calculating similarities"):
    if vessel == ref_vessel:
        continue
    df = df[df['source'].notna()]
    if df.empty:
        similarities_jaccard[vessel] = 0.0
        similarities_dwell[vessel] = 0.0
        continue  

    similarities_jaccard[vessel] = jaccard_similarity(list(df['source']), ref_ports) # Jaccard similarity

    vec = dwell_vector(df)
    similarities_dwell[vessel] = cosine_similarity([ref_dwell_vec], [vec])[0,0] # Cosine similarity on dwell vectors

# Combine similarities with weights
weight_jaccard = 0.6
weight_dwell = 0.4

final_similarity = {}
for vessel in similarities_jaccard:
    final_similarity[vessel] = weight_jaccard * similarities_jaccard[vessel] + weight_dwell * similarities_dwell[vessel]

# Sort by final similarity and print top 10
similar_vessels = sorted(final_similarity.items(), key=lambda x: x[1], reverse=True)
print("\nTop vessels similar to snappersnatcher7be:")
for vessel, sim in similar_vessels[:20]:
    print(f"{vessel}: {sim:.3f}")

Calculating similarities: 100%|██████████| 296/296 [00:13<00:00, 22.40it/s]


Top vessels similar to snappersnatcher7be:
swordfishsaboteur22f: 0.986
bigeyetunabuccaneera16: 0.964
fishfinderb9d: 0.963
browntroutbandite67: 0.962
salmonseeker630: 0.958
largemouthbasslooterf95: 0.956
cohosalmoncapturera7b: 0.953
prawnpredator5d7: 0.951
whitingwrangler842: 0.924
seabassbandit9ad: 0.899
whitemarlinwranglerbac: 0.877
crabcatcher1aa: 0.871
trawlertriumph31f: 0.869
aquaticpursuitf31: 0.867
codcatcher04c: 0.865
brillbandit0a1: 0.863
redfinpickerelraider744: 0.861
fishflingere29: 0.860
haddockhawkb7c: 0.857
spanishmackerelmaster037: 0.857



