# FIRST DATASET EXTRAPOLATION

In [None]:
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
from datetime import timedelta
import seaborn as sns
import os

In [None]:
# DATA LOADING
# Read the node and link tables that every later cell works on.
try:
    nodes = pd.read_csv('data/nodes.csv')
    links = pd.read_csv('data/links.csv')
except FileNotFoundError:
    print("ERRORE: Assicurati che 'nodes.csv' e 'links.csv' siano nella cartella.")
    # Re-raise instead of calling exit(): exit() is an interactive-shell
    # helper that ends the process with status 0 (reporting success) and,
    # inside a notebook, shuts the kernel down. Re-raising stops execution
    # while preserving the original error and the name of the missing file.
    raise

# Map based on the Entity.Location.Region nodes
# Lookup table: region name -> (zone type, species label).
# Insertion order is kept: fishing grounds first, then preserves.
REGION_ECOLOGY = {
    # Legal fishing grounds.
    **{
        region: ("Fishing Ground", species)
        for region, species in (
            ("Cod Table", "Cod/Gadus n.specificatae"),
            ("Tuna Shelf", "Tuna/Thunnini n.vera"),
            ("Wrasse Beds", "Wrasse/Labridae n.refert"),
        )
    },
    # Ecological preserves (they hold mixed species; only the valuable
    # ones are listed).
    **{
        region: ("Ecological Preserve", species)
        for region, species in (
            ("Ghoti Preserve", "Wrasse/Labridae n.refert"),
            ("Nemo Reef", "Mixed (Tuna/Wrasse)"),
            ("Don Limpet Preserve", "Tuna/Thunnini n.vera"),
        )
    },
}

# Region names of interest, in the same order as the table above.
target_locations = [*REGION_ECOLOGY]

In [None]:
# VESSEL DATA PREPARATION (FISHING VESSELS ONLY)

target_vessel_type = 'Entity.Vessel.FishingVessel'
is_target_vessel = nodes['type'] == target_vessel_type
vessel_nodes = nodes[is_target_vessel][
    ['id', 'entity_name', 'company', 'flag_country', 'type']
]
vessel_ids = set(vessel_nodes['id'])

# Keep only the transponder pings whose target is one of those vessels.
is_ping = links['type'] == 'Event.TransportEvent.TransponderPing'
targets_selected_vessel = links['target'].isin(vessel_ids)
vessels_links = links[targets_selected_vessel & is_ping].copy()

# Date clean-up: unparseable timestamps become NaT and are dropped below.
vessels_links['time'] = pd.to_datetime(vessels_links['time'], errors='coerce')
vessels_geo = (
    vessels_links
    .dropna(subset=['time'])
    [['source', 'target', 'time']]
    .rename(columns={
        'source': 'location_id',
        'target': 'vessel_id',
        'time': 'vessel_time',
    })
)


In [None]:
# REPORT PREPARATION
# Collect the ids of every delivery-report node.
is_delivery_report = nodes['type'] == 'Entity.Document.DeliveryReport'
delivery_ids = set(nodes[is_delivery_report]['id'])

# Find where and when they were issued: links that start at a report and
# whose target id mentions City, Point or Region (case-insensitive).
starts_at_report = links['source'].isin(delivery_ids)
points_to_place = links['target'].str.contains('City|Point|Region', na=False, case=False)
reports_links = links[starts_at_report & points_to_place].copy()

report_columns = {'source': 'report_id', 'target': 'location_id', 'date': 'report_date'}
reports_geo = reports_links[['source', 'target', 'date']].rename(columns=report_columns)
reports_geo['report_date'] = pd.to_datetime(reports_geo['report_date'])

In [None]:
# INITIAL MATCHING
# Pair every report with every vessel ping recorded at the same location.
merged = reports_geo.merge(vessels_geo, on='location_id', how='inner')

# Gap between report date and ping time, expressed in (fractional) days.
elapsed = merged['report_date'] - merged['vessel_time']
merged['time_diff_days'] = elapsed.dt.total_seconds() / (3600 * 24)

# RULE: the vessel must arrive BEFORE the report (0-2 days of tolerance,
# both bounds included).
within_window = merged['time_diff_days'].between(0, 2)
final_matches = merged[within_window].copy()

# Attach the vessel master data, then the delivery-report details.
final_matches = pd.merge(final_matches, vessel_nodes, left_on='vessel_id', right_on='id')
is_delivery_node = nodes['type'] == 'Entity.Document.DeliveryReport'
delivery_nodes = nodes[is_delivery_node][['id', 'qty_tons', 'commodity_name']]
final_matches = pd.merge(final_matches, delivery_nodes, left_on='report_id', right_on='id')

print(f" -> Trovati {len(final_matches)} potenziali candidati per i report.")

In [None]:
# DATA ENRICHMENT: PROVENANCE LOOKUP (MERGE_ASOF)
# Right-hand table: pings recorded in one of the target regions, sorted by
# time because merge_asof needs both sides ordered on their time key.
in_target_region = vessels_geo['location_id'].isin(target_locations)
fishing_events = (
    vessels_geo[in_target_region]
    .sort_values('vessel_time')
    [['vessel_id', 'vessel_time', 'location_id']]
    .rename(columns={
        'vessel_time': 'fishing_time',
        'location_id': 'provenance',
    })
)

# Left-hand table: the candidate matches, ordered the same way.
final_matches = final_matches.sort_values('vessel_time')

# For each candidate ping, pick the same vessel's most recent target-region
# ping at or before it, looking back at most 60 days.
enriched_matches = pd.merge_asof(
    final_matches,
    fishing_events,
    left_on='vessel_time',
    right_on='fishing_time',
    by='vessel_id',
    direction='backward',
    tolerance=pd.Timedelta(days=60),
)

# Rows with no event inside the window get an explicit "Unknown" provenance.
provenance = enriched_matches['provenance']
enriched_matches['provenance'] = provenance.where(provenance.notna(), "Unknown")


In [None]:

# CLASSIFICATION LOGIC

def classify_activity(row):
    """Label a candidate row from its provenance region and vessel type.

    Reads ``row['provenance']`` and ``row['type']`` and returns:

    * ``"Unknown"`` when the provenance is not a key of ``REGION_ECOLOGY``
      (or its zone type is not one of the two handled below);
    * the region's species label when the zone is a "Fishing Ground";
    * ``"Suspect Activity (<species>)"`` when the zone is an
      "Ecological Preserve" and the vessel type contains "Fishing";
    * ``"Suspicious Transit"`` for any other vessel type in a preserve.
    """
    provenance = row['provenance']
    # The upstream filter keeps fishing vessels only, but the type is still
    # read so the function stays usable on other vessel types.
    vessel_type = str(row['type'])

    ecology = REGION_ECOLOGY.get(provenance)
    if ecology is None:
        return "Unknown"

    zone_type, fish_species = ecology

    # Case A: legal fishing ground -> the species caught there.
    if zone_type == "Fishing Ground":
        return fish_species

    if zone_type != "Ecological Preserve":
        return "Unknown"

    # Case B: ecological preserve -> a fishing vessel there is suspect.
    if "Fishing" in vessel_type:
        return f"Suspect Activity ({fish_species})"
    return "Suspicious Transit"

# Label every candidate row (row-wise apply) and store the result.
cargo_labels = enriched_matches.apply(classify_activity, axis=1)
enriched_matches['probable_cargo'] = cargo_labels

# Cargo labels that must override whatever commodity the report declares.
# classify_activity flags preserve fishing as "Suspect Activity (...)"; the
# previous check looked only for "Illegal", which that function never emits,
# so the override branch could not fire. "Illegal" is still accepted.
_FLAGGED_CARGO_MARKERS = ("Illegal", "Suspect Activity")


# If the original report has no commodity, fall back to the inferred one.
def fill_commodity(row):
    """Return the commodity to store for a candidate row.

    Reads ``row['probable_cargo']`` and ``row['commodity_name']``:

    * a flagged cargo label (one containing any of
      ``_FLAGGED_CARGO_MARKERS``) always wins;
    * otherwise, when the declared commodity is missing and the inferred
      cargo does not contain "Unknown", the inferred cargo is used;
    * otherwise the declared commodity is returned unchanged (it may
      still be missing).
    """
    cargo = row['probable_cargo']

    if any(marker in cargo for marker in _FLAGGED_CARGO_MARKERS):
        return cargo

    # Original fallback for the remaining cases.
    if pd.isna(row['commodity_name']) and "Unknown" not in cargo:
        return cargo

    return row['commodity_name']

# Resolve the commodity of every candidate row and overwrite the column.
resolved_commodities = enriched_matches.apply(fill_commodity, axis=1)
enriched_matches['commodity_name'] = resolved_commodities

In [None]:
# JSON EXPORT
def _json_safe(value):
    """Return None for a missing scalar (NaN/NaT/None), else the value.

    json.dump writes float NaN as the bare token ``NaN``, which is not valid
    JSON; mapping missing values to None makes them serialize as ``null``.
    """
    return None if pd.isna(value) else value


json_output = []

# One entry per report id; the report-level fields come from the first row
# of the group, and every row of the group becomes a candidate vessel.
for report_id, group in enriched_matches.groupby('report_id'):
    first = group.iloc[0]

    entry = {
        "report_id": str(report_id),
        "location": _json_safe(first['location_id']),
        "commodity": _json_safe(first['commodity_name']),
        "qty_tons": float(first['qty_tons']) if pd.notnull(first['qty_tons']) else 0,
        "date": str(first['report_date']),
        "candidates": [
            {
                "vessel_name": _json_safe(row['entity_name']),
                "vessel_type": _json_safe(row['type']),
                "company": _json_safe(row['company']),
                "flag": _json_safe(row['flag_country']),
                "arrival_time": str(row['vessel_time']),
                "days_before": round(row['time_diff_days'], 2),

                # Key fields for Task 1
                "provenance": _json_safe(row['provenance']),
                "probable_cargo": _json_safe(row['probable_cargo']),
            }
            for _, row in group.iterrows()
        ],
    }

    json_output.append(entry)

output_path = 'vast-challenge-project/public/candidates_enriched.json'

# Create the destination folder when it is missing so open() cannot fail
# with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(json_output, f, indent=4)