# Deep Data Analysis (MC2)

This notebook provides deeper analysis of the data with the aim of spotting suspicious vessels using Data Mining tools and techniques to answer the questions of the challenge. The main goals are: 
* analytically linking cargo deliveries with vessels;
* visualizing trajectories;
* assessing similarity between **SouthSeafood Express Corp vessels** and other vessels, to identify potential links or patterns of suspicious behavior.

In [55]:
import json
import pandas as pd
import numpy as np
import seaborn as sns
import altair as alt
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

### Load .json data into pandas DataFrames

In [56]:
# Read every raw .json export into its own flattened pandas DataFrame
folder_path = '../data/'


def read_json_table(file_name):
    """Parse `file_name` (relative to `folder_path`) and flatten it with `pd.json_normalize`."""
    with open(folder_path + file_name) as f:
        return pd.json_normalize(json.load(f))


commodities = read_json_table('commodities.json')
vessels = read_json_table('vessels.json')
locations = read_json_table('locations.json')
documents = read_json_table('documents.json')
transponder_pings = read_json_table('transponder_pings.json')
harbor_reports = read_json_table('harbor_reports.json')
transactions = read_json_table('transactions.json')

Parse timestamps and dwell times

In [57]:
# Parse the timestamp column of each table in place; values that cannot be
# parsed become NaT because of errors='coerce'.
for table, timestamp_column in (
    (transponder_pings, 'time'),
    (transactions, 'date'),
    (harbor_reports, 'date'),
):
    table[timestamp_column] = pd.to_datetime(table[timestamp_column], errors='coerce')

# Dwell is turned into a numeric column; non-numeric values become NaN.
transponder_pings['dwell'] = pd.to_numeric(transponder_pings['dwell'], errors='coerce')

### Create Trajectories dataset

In [58]:
# Sort pings chronologically within each vessel
# ('target' is the vessel id, 'source' is the location that recorded the ping).
transponder_pings_sorted = transponder_pings.sort_values(['target', 'time'])

# trajectories : vessel id -> ordered list of visited locations
# vessel_groups: vessel id -> DataFrame with the columns time / source / dwell
# Both are filled in a single groupby pass instead of grouping the pings twice.
trajectories = {}
vessel_groups = {}
for vessel, group in transponder_pings_sorted.groupby('target'):
    trajectories[vessel] = list(group['source'])
    vessel_groups[vessel] = group[['time', 'source', 'dwell']].reset_index(drop=True)

# List of all zones/ports.
# Sorted so that the port -> index mapping (and every chart axis ordered by it)
# is identical on every kernel restart; iterating a plain set() of strings has
# no stable order between Python processes.
# Assumes location names are all strings - TODO confirm.
all_ports = sorted(transponder_pings_sorted['source'].dropna().unique())
port_to_idx = {p: i for i, p in enumerate(all_ports)}

In [59]:
# Vessel ids for which a per-vessel trajectory DataFrame was built
vessel_groups.keys()

dict_keys(['albacoreangler47d', 'albacoreassaulter482', 'amberjackassaulterd52', 'americaneelenthusiastcfa', 'anchovyassaulterb1c', 'aquaticangler5c3', 'aquaticpursuitf31', 'aquatransit6bc', 'arcticgraylingangler094', 'athenad34', 'atlanticbluemarlinmarauder0b4', 'atlanticcodcatcherca6', 'baitedbreath538', 'barracudabaiter8b3', 'barracudabandit836', 'bassbaiterb9f', 'bassbandit0d5', 'bigeyetunabanditb73', 'bigeyetunabuccaneera16', 'blackbullheadbandit801', 'blackdrumbanditc5b', 'bluecatfishcatcher468', 'bluefintunabandit177', 'bluefishbandit8ec', 'bluegillbandita5f', 'blueharbor2c1', 'bluemarlinbandit292', 'bonefishbaiter565', 'breambanditc85', 'brillbandit0a1', 'brinebuccaneer9fd', 'brooktroutbuccaneerc0b', 'brownbullheadbriganded2', 'browntroutbandite67', 'bulkcarriers6cd', 'burbotbandit7bf', 'cargocatalyst39a7', 'cargocatalystb02', 'cargocentric443e', 'cargocentric4d0', 'cargocircuit26cc', 'cargocircuit545', 'cargocosmoscde', 'cargocrestb7c', 'cargocynosure29d', 'carpcapturer993', '

In [60]:
# Example vessel trajectory data: one row per ping with its time, location ('source') and dwell
vessel_groups.get('albacoreangler47d') # .get() returns None instead of raising if the id is unknown

Unnamed: 0,time,source,dwell
0,2035-02-01 00:00:00.000,South Paackland,0.000000
1,2035-02-01 05:00:00.000,Nav 3,1768.653072
2,2035-02-01 05:29:28.653,Cod Table,62410.977112
3,2035-02-01 22:49:39.630,Nav 2,6030.279121
4,2035-02-02 00:30:09.909,Cod Table,15499.004992
...,...,...,...
1230,2035-11-27 00:34:17.915,Nav 3,6697.277299
1231,2035-11-27 02:25:55.193,South Paackland,164281.048365
1232,2035-11-29 05:00:00.000,Nav 3,2858.998607
1233,2035-11-29 05:47:38.998,Tuna Shelf,19885.173842


Save trajectories in a separate file 'trajectories.json'

In [61]:
# # convert in string the datetime objects
# for v in vessel_groups:
#     vessel_groups[v]['time'] = vessel_groups[v]['time'].dt.strftime('%Y-%m-%dT%H:%M:%S.%f')
    
# # save vessel_groups to a json file 'trajectories.json' where each key is a vessel id and each value is a list of dictionaries with keys 'time', 'source', 'dwell'
# with open(folder_path + 'trajectories.json', 'w') as f:
#     json.dump({v: g[['time', 'source', 'dwell']].to_dict(orient='records') for v, g in vessel_groups.items()}, 
#               f,
#               default=str,
#               indent=2
#     )

### Load new 'trajectories.json'

In [62]:
# Load the per-vessel trajectories (vessel id -> list of {'time', 'source', 'dwell'} records).
# If the cache file does not exist yet (fresh checkout / first run), rebuild it
# from `vessel_groups` and write it, so the notebook survives Restart & Run All.
trajectories_path = folder_path + 'trajectories.json'

try:
    with open(trajectories_path, 'r') as f:
        trajectories = json.load(f)
except FileNotFoundError:
    trajectories = {
        vessel: (
            visits[['time', 'source', 'dwell']]
            # datetimes are not JSON serializable: store them as ISO-like strings
            # (assign() works on a copy, so vessel_groups itself is left untouched)
            .assign(time=visits['time'].dt.strftime('%Y-%m-%dT%H:%M:%S.%f'))
            .to_dict(orient='records')
        )
        for vessel, visits in vessel_groups.items()
    }
    with open(trajectories_path, 'w') as f:
        json.dump(trajectories, f, default=str, indent=2)

## 1. Link vessels with cargo (analytically)

IF:
> A **VESSEL** is docked in a **HARBOR** where **FISH SPECIES** was exported (1 day after delivery!)...

AND:
> ...**FISH SPECIES** is present in a **REGION** where **VESSEL** had been

THEN:
> **FISH SPECIES** is probably **VESSEL**'s cargo!

In [63]:
# Merge transactions with documents: attach the commodity and quantity of each
# cargo. The transaction 'source' is matched against the document ids, which are
# passed as an array-like key so no extra 'id' column is selected from documents.
# NOTE(review): with how='left', duplicated document ids would duplicate
# transaction rows - presumably ids are unique; verify.
transactions_merged = transactions.merge(
    documents[['commodity', 'qty_tons']],
    left_on='source',
    right_on=documents['id'],
    how='left'
)

# Merge harbor reports with vessel details (the report 'source' is matched against the vessel ids)
harbor_reports_merged = harbor_reports.merge(
    vessels[['name', 'vessel_type', 'flag_country', 'company', 'tonnage', 'length_overall']],
    left_on='source',
    right_on=vessels['id'],
    how='left'
)

# Merge vessels with transponder pings to get vessel information: the ping 'target'
# is matched against the vessel ids; 'id' is not among the selected vessel columns,
# which avoids a redundant key column
transponder_pings_merged = transponder_pings.merge(
    vessels[['name', 'vessel_type', 'flag_country', 'company', 'tonnage', 'length_overall']],
    left_on='target',
    right_on=vessels['id'],
    how='left'
)

# Merge transponder pings with locations to get fish species present at each location
pings_with_species = transponder_pings_merged.merge(
    locations[['fish_species_present']],
    how='left',
    left_on='source',   # in the pings, 'source' is the location (the vessel is 'target')
    right_on=locations['id']
)

In [64]:
# For every transaction, find the vessels that could have delivered the cargo:
#   1. the vessel was at the receiving harbor the day before the transaction
#      (according to the harbor reports or to its transponder pings),
#   2. its tonnage is at least the delivered quantity,
#   3. before the transaction day it visited a location where the fish species lives.
# The result is stored in transactions_merged['suspected_vessels'] as a list of
# vessel ids, or None when no vessel qualifies.

# Vessel types considered able to deliver fish cargo
CARGO_CAPABLE_TYPES = ['FishingVessel', 'CargoVessel', 'Other']
ONE_DAY = pd.Timedelta(days=1)

# --- Loop-invariant preparation (done once instead of once per transaction) ---
eligible_reports = harbor_reports_merged[
    harbor_reports_merged['vessel_type'].isin(CARGO_CAPABLE_TYPES)
]
eligible_pings = pings_with_species[
    pings_with_species['vessel_type'].isin(CARGO_CAPABLE_TYPES)
]
eligible_ping_days = eligible_pings['time'].dt.date  # calendar day of every ping

_species_locations_cache = {}
_trajectory_cache = {}


def _locations_with_species(fish_species):
    """Ids of the locations whose 'fish_species_present' list contains `fish_species` (memoized)."""
    if fish_species not in _species_locations_cache:
        has_species = locations['fish_species_present'].apply(
            # locations without a species list (e.g. NaN) simply do not match
            lambda species_list: isinstance(species_list, (list, tuple, set))
            and fish_species in species_list
        )
        _species_locations_cache[fish_species] = locations.loc[has_species, 'id'].tolist()
    return _species_locations_cache[fish_species]


def _visits_by_day(vessel):
    """Return (locations, calendar days) of a vessel's pings, or None if it has no trajectory (memoized)."""
    if vessel not in _trajectory_cache:
        records = trajectories.get(vessel)
        if not records:
            _trajectory_cache[vessel] = None
        else:
            trajectory = pd.DataFrame(records)
            days = pd.to_datetime(trajectory['time'], errors='coerce').dt.date
            _trajectory_cache[vessel] = (trajectory['source'], days)
    return _trajectory_cache[vessel]


# --- Main loop ---
suspected_per_transaction = []

for _, row in tqdm(transactions_merged.iterrows(),
                   total=len(transactions_merged),
                   desc="Linking cargo to vessels"):
    harbor = row['target']
    date = row['date']
    qty_tonnage = row['qty_tons']
    fish_species = row['commodity']

    # A transaction without a valid date cannot be matched to any harbor visit
    if pd.isna(date):
        suspected_per_transaction.append(None)
        continue

    previous_day = date - ONE_DAY

    # Select candidate vessels from harbor reports and transponder pings
    matches_harbor = eligible_reports[
        (eligible_reports['target'] == harbor) &
        (eligible_reports['date'] == previous_day) &
        (eligible_reports['tonnage'] >= qty_tonnage)
    ]
    matches_pings = eligible_pings[
        (eligible_pings['source'] == harbor) &
        (eligible_ping_days == previous_day.date()) &  # only compare the calendar day
        (eligible_pings['tonnage'] >= qty_tonnage)
    ]

    # Combine both sources (order of first appearance is kept)
    candidate_vessels = pd.concat(
        [matches_harbor['source'], matches_pings['target']]
    ).unique().tolist()

    # Keep the candidates that, before the transaction day, were in a location
    # where the fish species is present
    locations_with_species = _locations_with_species(fish_species)
    transaction_day = date.date()
    confirmed_vessels = []
    for vessel in candidate_vessels:
        visits = _visits_by_day(vessel)
        if visits is None:  # vessel reported at the harbor but without trajectory data
            continue
        visited_locations, visit_days = visits
        if visited_locations[visit_days < transaction_day].isin(locations_with_species).any():
            confirmed_vessels.append(vessel)

    suspected_per_transaction.append(confirmed_vessels or None)

# Positional assignment: one entry per row, independent of the index labels
transactions_merged['suspected_vessels'] = pd.Series(
    suspected_per_transaction, index=transactions_merged.index, dtype=object
)

Processed 100 / 5307 transactions
Processed 200 / 5307 transactions
Processed 300 / 5307 transactions
Processed 400 / 5307 transactions
Processed 500 / 5307 transactions
Processed 600 / 5307 transactions
Processed 700 / 5307 transactions
Processed 800 / 5307 transactions
Processed 900 / 5307 transactions
Processed 1000 / 5307 transactions
Processed 1100 / 5307 transactions
Processed 1200 / 5307 transactions
Processed 1300 / 5307 transactions
Processed 1400 / 5307 transactions
Processed 1500 / 5307 transactions
Processed 1600 / 5307 transactions
Processed 1700 / 5307 transactions
Processed 1800 / 5307 transactions
Processed 1900 / 5307 transactions
Processed 2000 / 5307 transactions
Processed 2100 / 5307 transactions
Processed 2200 / 5307 transactions
Processed 2300 / 5307 transactions
Processed 2400 / 5307 transactions
Processed 2500 / 5307 transactions
Processed 2600 / 5307 transactions
Processed 2700 / 5307 transactions
Processed 2800 / 5307 transactions
Processed 2900 / 5307 transac

In [65]:
# Assign the suspected vessels back to the original transactions DataFrame
transactions['suspected_vessels'] = transactions_merged['suspected_vessels']

In [66]:
# Inspect the transactions together with their suspected vessels
transactions

Unnamed: 0,source,target,date,suspected_vessels
0,cargo_2035_2394778c,South Paackland,2035-11-03,"[albacoreangler47d, atlanticbluemarlinmarauder..."
1,cargo_2035_23956ba0,South Paackland,2035-08-16,"[roachraider7ce, barracudabandit836, laketrout..."
2,cargo_2035_23957cfd,South Paackland,2035-08-20,"[roachraider7ce, grasspickerelgangster7d1, aqu..."
3,cargo_2035_23958501,Paackland,2035-11-07,"[whitemarlinmasterfa1, wahoowrangler016, barra..."
4,cargo_2035_23959ab6,South Paackland,2035-08-24,"[squidsquad7fd, burbotbandit7bf, clamclaimer13..."
...,...,...,...,...
5302,cargo_2035_3119010f,Paackland,2035-08-24,"[tunatrawlerafd, bonefishbaiter565, wahoowrang..."
5303,cargo_2035_3119118e,Paackland,2035-08-25,"[whitemarlinmasterfa1, channelcatfishcapturer1..."
5304,cargo_2035_3119237c,Paackland,2035-08-28,
5305,cargo_2035_311936c4,Paackland,2035-08-30,"[burbotbandit7bf, dolphindasher004, cutthroatt..."


Convert to JSON and save the file

In [67]:
# Write the enriched transactions (now including 'suspected_vessels') as a JSON list of records with ISO dates.
# NOTE(review): the target path is the same 'transactions.json' that is loaded at the top of
# this notebook, so the raw input file is overwritten - confirm this is intended.
transactions.to_json("../data/transactions.json", orient='records', date_format='iso', indent=2)

Visualization Function

In [68]:
def plot_vessel_dwell_timeline(
    trajectories: dict,
    vessel_id: str,
    location_to_y: dict,
    harbor_reports: pd.DataFrame,
    transactions: pd.DataFrame,
    documents: pd.DataFrame,
    vessel_name: str = None,
    highlight_date: str = None
):
    """
    Visualizes vessel trajectory with dwell times as bars,
    and overlays full-day port report bars and transaction markers.
    Optionally highlights a specific date with a vertical red line.

    Parameters
    ----------
    trajectories : dict
        Maps a vessel id to its list of ping records; each record needs the
        keys 'time', 'source' (location) and 'dwell' (seconds).
    vessel_id : str
        Id of the vessel to plot.
    location_to_y : dict
        Its keys define which locations are shown and their order on the y axis.
    harbor_reports : pd.DataFrame
        Needs the columns 'source' (vessel id), 'target' (harbor), 'date'
        and 'data_author' (shown in the tooltip).
    transactions : pd.DataFrame
        Needs the columns 'source' (document id), 'target' (harbor), 'date'
        and 'suspected_vessels' (list of vessel ids, or a non-list value when empty).
    documents : pd.DataFrame
        Joined to the transactions on 'id'; 'qty_tons' and 'commodity' are
        shown in the tooltip.
    vessel_name : str, optional
        Name used in the chart title; defaults to `vessel_id`.
    highlight_date : str, optional
        Date at which a vertical red rule is drawn.

    Returns
    -------
    The layered, interactive Altair chart.

    Raises
    ------
    ValueError
        If the vessel is not in `trajectories` or has no ping records.
    """

    # Check if the vessel exists
    if vessel_id not in trajectories:
        raise ValueError(f"Vessel '{vessel_id}' not found in trajectories.")

    # Prepare vessel trajectory DataFrame
    df = pd.DataFrame(trajectories[vessel_id]).copy()
    if df.empty:
        # Without records the 'time'/'dwell'/'source' columns would not exist
        raise ValueError(f"Vessel '{vessel_id}' has no trajectory records.")
    df['time'] = pd.to_datetime(df['time'], errors='coerce')
    df['dwell'] = pd.to_numeric(df['dwell'], errors='coerce')
    df = df.dropna(subset=['time', 'dwell', 'source'])
    df['end_time'] = df['time'] + pd.to_timedelta(df['dwell'], unit='s')
    df['dwell_hours'] = df['dwell'] / 3600
    df['y'] = df['source'].map(location_to_y)
    if vessel_name is None:
        vessel_name = vessel_id

    # Every layer shares the same location ordering on the y axis
    location_order = list(location_to_y.keys())

    # Main bar chart for dwell times
    bars = alt.Chart(df).mark_bar().encode(
        x='time:T',
        x2='end_time:T',
        y=alt.Y('source:N', sort=location_order, title='Location'),
        color=alt.Color('source:N', legend=None),
        tooltip=['source', 'time', 'end_time', 'dwell_hours']
    )

    # --- Port reports ---
    vessel_reports = harbor_reports[harbor_reports['source'] == vessel_id].copy()
    vessel_reports['date'] = pd.to_datetime(vessel_reports['date'], errors='coerce')
    vessel_reports = vessel_reports.dropna(subset=['date'])
    vessel_reports = vessel_reports[vessel_reports['target'].isin(location_order)]

    # --- Transactions ---
    vessel_transactions = transactions[
        transactions['suspected_vessels'].apply(lambda x: vessel_id in x if isinstance(x, list) else False)
    ].copy()
    # Coerce like the harbor reports above: the dates may arrive as strings
    # (e.g. when the transactions were re-read from JSON) and `.dt` is used below
    vessel_transactions['date'] = pd.to_datetime(vessel_transactions['date'], errors='coerce')
    vessel_transactions = vessel_transactions.dropna(subset=['date'])
    vessel_transactions = vessel_transactions[vessel_transactions['target'].isin(location_order)]
    vessel_transactions['probable_cargo_transaction'] = True

    # Merge with documents to get quantity and commodity for the tooltip
    vessel_transactions = vessel_transactions.merge(
        documents,
        left_on='source',
        right_on='id',
        how='left'
    )

    charts_to_combine = [bars]

    # Add report bars and markers if reports exist
    if not vessel_reports.empty:
        # A report only carries a day, so it is drawn as a bar spanning that whole day
        vessel_reports['day_start'] = vessel_reports['date'].dt.floor('D')
        vessel_reports['day_end'] = vessel_reports['day_start'] + pd.Timedelta(days=1)
        vessel_reports['day_mid'] = vessel_reports['day_start'] + pd.Timedelta(hours=12)

        report_bars = alt.Chart(vessel_reports).mark_bar(
            color='red',
            opacity=0.3
        ).encode(
            x='day_start:T',
            x2='day_end:T',
            y=alt.Y('target:N', sort=location_order),
            tooltip=['target', 'date', 'data_author']
        )

        markers_harbor = alt.Chart(vessel_reports).mark_circle(
            color='red',
            size=150,
            opacity=0.6
        ).encode(
            x='day_mid:T',
            y=alt.Y('target:N', sort=location_order),
            tooltip=['target', 'date', 'data_author']
        )

        charts_to_combine.extend([report_bars, markers_harbor])

    # Add transaction markers if transactions exist
    if not vessel_transactions.empty:
        # Markers are placed at noon of the transaction day
        vessel_transactions['day_mid'] = vessel_transactions['date'].dt.floor('D') + pd.Timedelta(hours=12)

        # Every remaining row is a probable cargo transaction, so no further filter is needed
        markers_transactions = alt.Chart(vessel_transactions).mark_square(
            color='blue',
            size=100,
            opacity=0.6
        ).encode(
            x='day_mid:T',
            y=alt.Y('target:N', sort=location_order),
            tooltip=['target', 'date', 'qty_tons', 'commodity']
        )

        charts_to_combine.append(markers_transactions)

    # Add highlight line if a date is provided
    if highlight_date is not None:
        highlight_df = pd.DataFrame({'highlight_date': [highlight_date]})
        highlight_line = alt.Chart(highlight_df).mark_rule(
            color='red',
            strokeWidth=1.5,
            opacity=0.7
        ).encode(
            x='highlight_date:T'
        )
        charts_to_combine.append(highlight_line)

    # Combine all charts
    final_chart = alt.layer(*charts_to_combine).properties(
        width=950,
        height=300,
        title=f'Vessel Route Over Time: {vessel_name}'
    ).interactive()

    return final_chart

Let's apply the function to some examples

In [69]:
# Timeline of 'snappersnatcher7be', with a marker line on 2035-05-14
vessel_id = 'snappersnatcher7be'
# Look up the display name of the vessel from its id
vessel_name = vessels.loc[vessels['id'].eq(vessel_id), 'name'].values[0]

chart = plot_vessel_dwell_timeline(
    trajectories,
    vessel_id,
    port_to_idx,
    harbor_reports,
    transactions,
    documents,
    vessel_name=vessel_name,
    highlight_date='2035-05-14',
)
chart.display()


In [71]:
# Timeline of 'bluemarlinbandit292', with a marker line on 2035-05-14
vessel_id = 'bluemarlinbandit292'
# Look up the display name of the vessel from its id
vessel_name = vessels.loc[vessels['id'].eq(vessel_id), 'name'].values[0]

chart = plot_vessel_dwell_timeline(
    trajectories,
    vessel_id,
    port_to_idx,
    harbor_reports,
    transactions,
    documents,
    vessel_name=vessel_name,
    highlight_date='2035-05-14',
)
chart.display()


## 2. Visually associating cargo deliveries with vessels

Prepare color palette

In [72]:
# Commodities treated as illegal: they get shades of red, every other commodity a shade of blue
illegal_fish_species = {'piscessatisb87', 'piscesfoetidaae7', 'piscisosseusb6d'}

# Color palette for commodities
commodity_names = transactions_merged['commodity'].unique()
n_illegal = sum(1 for name in commodity_names if name in illegal_fish_species)
n_legal = sum(1 for name in commodity_names if name not in illegal_fish_species)
illegal_palette = sns.color_palette("Reds", n_illegal)
legal_palette = sns.color_palette("Blues", n_legal)

# Walk the commodities in their original order and hand out the next unused
# color of the matching palette
next_illegal_color = iter(illegal_palette)
next_legal_color = iter(legal_palette)
commodity_palette = {
    name: next(next_illegal_color if name in illegal_fish_species else next_legal_color)
    for name in commodity_names
}

# Altair expects hex strings: convert each (r, g, b) triple of floats in [0, 1]
commodity_color_scale = alt.Scale(
    domain=list(commodity_palette),
    range=[
        '#' + ''.join(f'{int(channel*255):02x}' for channel in (r, g, b))
        for r, g, b in commodity_palette.values()
    ]
)

Cargo deliveries per day

In [73]:
# Stacked bar chart of the individual cargo shipments received per day, filtered
# by a harbor dropdown; clicking a legend entry highlights one commodity.

# Handle large datasets: lift Altair's row limit for embedded data
alt.data_transformers.disable_max_rows()

# Transform suspected_vessels list to string for tooltip
# (this adds a column to transactions_merged itself)
transactions_merged['suspected_vessels_str'] = transactions_merged['suspected_vessels'].apply(
    lambda v: ', '.join(v) if isinstance(v, list) else str(v)
)

# Order the shipments for convenience (date, harbor, commodity)
agg = transactions_merged.sort_values(['date', 'target', 'commodity'])

# cumulative sum: each shipment is drawn as the segment [cum_qty_prev, cum_qty]
# of the stack for its (date, harbor)
agg['cum_qty'] = agg.groupby(['date', 'target'])['qty_tons'].cumsum()
agg['cum_qty_prev'] = agg['cum_qty'] - agg['qty_tons']  # the base of each shipment

# Dropdown for harbor selection
city_dropdown = alt.binding_select(
    options=agg['target'].dropna().unique().tolist(),
    name='Harbor: '
)
# The first harbor in the sorted data is used as the initial selection
city_select = alt.selection_point(
    fields=['target'],
    bind=city_dropdown,
    value=agg['target'].dropna().iloc[0]
)

# Add selection for commodity (driven by clicks on the legend)
commodity_select = alt.selection_point(
    fields=['commodity'],
    bind='legend' 
)

# One bar segment per shipment, colored by commodity
bars = alt.Chart(agg).mark_bar(size=6).encode(
    x=alt.X('date:T', title='Date'),
    y=alt.Y('cum_qty_prev:Q', title='Quantity Imported (tons)'),
    y2='cum_qty:Q',
    color=alt.Color(
        'commodity:N',
        title='Fish Type',
        legend=alt.Legend(
            orient="right",
            columns=1,
            title="Fish Type",
            labelFontSize=12,
            titleFontSize=14
        ),
        scale=commodity_color_scale
    ),
    # Non-selected commodities are faded instead of hidden
    opacity=alt.condition(commodity_select, alt.value(1.0), alt.value(0.2)),
    tooltip=[
        alt.Tooltip('date:T', title='Date'),
        alt.Tooltip('commodity:N', title='Fish Type'),
        alt.Tooltip('qty_tons:Q', title='Shipment (tons)'),
        alt.Tooltip('target:N', title='Harbor'),
        alt.Tooltip('suspected_vessels_str:N', title='Suspected Vessels')
    ]
).add_params(
    city_select,
    commodity_select
).transform_filter(
    city_select
)

# Horizontal lines between cargo blocks (a tick at the top of every shipment segment)
lines = alt.Chart(agg).mark_tick(
    orient='horizontal',
    color='black',
    size=10,
    thickness=0.7
).encode(
    x='date:T',
    y='cum_qty:Q',
    opacity=alt.condition(commodity_select, alt.value(1.0), alt.value(0.2)),
).add_params(
    city_select
).transform_filter(
    city_select
)

# Layer the ticks on top of the bars
final_chart = alt.layer(bars, lines).properties(
    width=850,
    height=300,
    title='Fish Cargoes Imported per Day (individual shipments)'
).interactive()

final_chart.display()


Visualize vessels docked in Harbor

In [74]:
# Stacked bar chart of the vessels reported in a harbor per day, stacked by
# tonnage and filtered by a harbor dropdown; clicking a legend entry highlights
# one vessel type.

# Handle large datasets: lift Altair's row limit for embedded data
alt.data_transformers.disable_max_rows()

# NOTE: `agg` (and bars / lines / final_chart below) reuse the names of the
# previous chart cell and replace those objects
agg = harbor_reports_merged.sort_values(['date', 'target', 'vessel_type'])

# cumulative sum: each vessel is drawn as the segment [cum_tonnage_prev, cum_tonnage]
# of the stack for its (date, harbor)
agg['cum_tonnage'] = agg.groupby(['date', 'target'])['tonnage'].cumsum()
agg['cum_tonnage_prev'] = agg['cum_tonnage'] - agg['tonnage']

# Dropdown for harbor selection
city_dropdown = alt.binding_select(
    options=agg['target'].dropna().unique().tolist(),
    name='Harbor: '
)
# The first harbor in the sorted data is used as the initial selection
city_select = alt.selection_point(
    fields=['target'],
    bind=city_dropdown,
    value=agg['target'].dropna().iloc[0]
)

# Selection for vessel type (driven by clicks on the legend)
vessel_select = alt.selection_point(
    fields=['vessel_type'],
    bind='legend'
)

# Every bar is a ship segment stacked by tonnage
bars = alt.Chart(agg).mark_bar(size=6).encode(
    x=alt.X('date:T', title='Date'),
    y=alt.Y('cum_tonnage_prev:Q', title='Total Tonnage (tons)'),
    y2='cum_tonnage:Q',
    color=alt.Color(
        'vessel_type:N',
        title='Vessel Type',
        scale=alt.Scale(scheme='viridis'),
        legend=alt.Legend(
            orient="right",
            columns=1,
            title="Vessel Type",
            labelFontSize=12,
            titleFontSize=14
        )
    ),
    # Non-selected vessel types are faded instead of hidden
    opacity=alt.condition(vessel_select, alt.value(1.0), alt.value(0.2)),
    tooltip=[
        alt.Tooltip('date:T', title='Date'),
        alt.Tooltip('name:N', title='Vessel Name'),
        alt.Tooltip('vessel_type:N', title='Vessel Type'),
        alt.Tooltip('flag_country:N', title='Flag Country'),
        alt.Tooltip('company:N', title='Company'),
        alt.Tooltip('tonnage:Q', title='Tonnage'),
        alt.Tooltip('length_overall:Q', title='Length Overall (m)')
    ]
).add_params(
    city_select,
    vessel_select
).transform_filter(
    city_select
)

# Horizontal lines between ship blocks (a tick at the top of every vessel segment)
lines = alt.Chart(agg).mark_tick(
    orient='horizontal',
    color='black',
    size=10,
    thickness=0.7
).encode(
    x='date:T',
    y='cum_tonnage:Q',
    opacity=alt.condition(vessel_select, alt.value(1.0), alt.value(0.2))
).add_params(
    city_select,
    vessel_select
).transform_filter(
    city_select
)

# Layer the ticks on top of the bars
final_chart = alt.layer(bars, lines).properties(
    width=850,
    height=400,
    title='Vessels Docked per Day (stacked by Tonnage)'
).interactive()

final_chart.display()


## 3. Compute similarity scores

Similarity functions

In [75]:
def jaccard_similarity(seq1, seq2):
    """Jaccard index of the distinct zones/ports found in two visit sequences.

    Duplicates and ordering inside each sequence are ignored. When either
    sequence is empty the similarity is defined as 0.0.
    """
    visited_a = frozenset(seq1)
    visited_b = frozenset(seq2)
    if not (visited_a and visited_b):
        return 0.0
    shared = visited_a.intersection(visited_b)
    combined = visited_a.union(visited_b)
    return len(shared) / len(combined)

def dwell_vector(df, port_index=None):
    """Port dwell time vector.

    Sums the 'dwell' values of `df` per location ('source') and places each
    total at the position assigned to that location.

    Parameters
    ----------
    df : pd.DataFrame
        Needs the columns 'source' (location) and 'dwell' (duration).
    port_index : dict, optional
        Maps a location to its position in the vector. Defaults to the
        notebook-level `port_to_idx` mapping.

    Returns
    -------
    np.ndarray
        One entry per location of the mapping; locations never visited stay 0.

    Raises
    ------
    KeyError
        If `df` contains a location that is not in the mapping.

    Notes
    -----
    Rows without a location are skipped, and missing / non-numeric dwell values
    count as 0, so that a single bad record cannot turn the vector into NaN.
    """
    index = port_to_idx if port_index is None else port_index
    vec = np.zeros(len(index))

    visits = df[df['source'].notna()]
    if visits.empty:
        return vec

    # One vectorized group-by instead of a Python loop over every ping
    dwell = pd.to_numeric(visits['dwell'], errors='coerce')
    dwell_per_port = dwell.groupby(visits['source']).sum()  # sum() skips NaN

    for port, total_dwell in dwell_per_port.items():
        vec[index[port]] += total_dwell
    return vec

Let's compute similarity w.r.t. *snappersnatcher7be*

In [76]:
# Rank every vessel by how similar its movements are to a reference vessel.
# Two measures are blended:
#   * Jaccard similarity of the sets of visited zones/ports
#   * cosine similarity of the per-port total dwell time vectors
ref_vessel = 'snappersnatcher7be'
weight_jaccard = 0.6   # weight of the visited-locations overlap
weight_dwell = 0.4     # weight of the dwell-time profile
top_n = 20             # number of vessels to print

ref_df = pd.DataFrame(trajectories[ref_vessel])
ref_df = ref_df[ref_df['source'].notna()]
ref_dwell_vec = dwell_vector(ref_df)
ref_ports = list(ref_df['source'])

similarities_jaccard = {}
similarities_dwell = {}

for vessel, records in tqdm(trajectories.items(), desc="Calculating similarities"):
    if vessel == ref_vessel:
        continue
    vessel_df = pd.DataFrame(records)
    # A vessel without any record yields a frame without a 'source' column,
    # so that case has to be caught before the column is accessed
    if 'source' in vessel_df.columns:
        vessel_df = vessel_df[vessel_df['source'].notna()]
    if vessel_df.empty:
        similarities_jaccard[vessel] = 0.0
        similarities_dwell[vessel] = 0.0
        continue

    # Jaccard similarity on the visited locations
    similarities_jaccard[vessel] = jaccard_similarity(list(vessel_df['source']), ref_ports)

    # Cosine similarity on dwell vectors
    vec = dwell_vector(vessel_df)
    similarities_dwell[vessel] = cosine_similarity([ref_dwell_vec], [vec])[0, 0]

# Combine similarities with weights
final_similarity = {
    vessel: weight_jaccard * similarities_jaccard[vessel] + weight_dwell * similarities_dwell[vessel]
    for vessel in similarities_jaccard
}

# Sort by final similarity and print the top_n most similar vessels
similar_vessels = sorted(final_similarity.items(), key=lambda x: x[1], reverse=True)
print(f"\nTop vessels similar to {ref_vessel}:")
for vessel, sim in similar_vessels[:top_n]:
    print(f"{vessel}: {sim:.3f}")

Calculating similarities: 100%|██████████| 296/296 [00:16<00:00, 17.59it/s]


Top vessels similar to snappersnatcher7be:
swordfishsaboteur22f: 0.986
bigeyetunabuccaneera16: 0.964
fishfinderb9d: 0.963
browntroutbandite67: 0.962
salmonseeker630: 0.958
largemouthbasslooterf95: 0.956
cohosalmoncapturera7b: 0.953
prawnpredator5d7: 0.951
whitingwrangler842: 0.924
seabassbandit9ad: 0.899
whitemarlinwranglerbac: 0.877
crabcatcher1aa: 0.871
trawlertriumph31f: 0.869
aquaticpursuitf31: 0.867
codcatcher04c: 0.865
brillbandit0a1: 0.863
redfinpickerelraider744: 0.861
fishflingere29: 0.860
haddockhawkb7c: 0.857
spanishmackerelmaster037: 0.857



