# SECOND DATASET EXTRAPOLATION

In [None]:
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
from datetime import timedelta
import seaborn as sns
import os
import random

In [None]:
# Node / link type labels selected from the input tables.
_FISHING_VESSEL_TYPE = 'Entity.Vessel.FishingVessel'
_PING_TYPE = 'Event.TransportEvent.TransponderPing'

# Location groups driving the traffic-light classification.
_RESTRICTED_ZONES = frozenset({"Ghoti Preserve", "Nemo Reef", "Don Limpet Preserve"})
_BUOY_ZONES = frozenset({
    "Nav 1", "Nav 2", "Nav 3",
    "Nav A", "Nav B", "Nav C", "Nav D", "Nav E",
    "Exit East", "Exit North", "Exit South", "Exit West",
})
_FISHING_ZONES = frozenset({"Wrasse Beds", "Cod Table", "Tuna Shelf"})

# Fallback dwell thresholds for a location that has no computed percentile.
_DEFAULT_THRESH_EXTREME = 14400
_DEFAULT_THRESH_WARNING = 900


def _classify_ping(loc, dwell, thresh_extreme, thresh_warning, recent_buoy_visit):
    """Classify one ping; return ``(point_type, score_delta, recent_buoy_visit)``."""
    if loc in _RESTRICTED_ZONES:
        # The buoy flag is deliberately left untouched inside restricted zones.
        if dwell > thresh_extreme:
            bonus = 150 if recent_buoy_visit else 0
            return "violation", 100 + bonus, recent_buoy_visit
        if dwell > thresh_warning:
            return "suspicious", 10, recent_buoy_visit
        return "transit", 0, recent_buoy_visit

    if loc in _BUOY_ZONES:
        # Any buoy ping arms the flag; only an extreme dwell adds to the score.
        if dwell > thresh_extreme:
            return "suspicious", 20, True
        return "transit", 0, True

    # Every other location clears the buoy flag.
    if loc in _FISHING_ZONES:
        return "transit", 0, False
    return "port", 0, False


def extract_tracks_statistical(
    nodes_path="data/nodes.csv",
    links_path="data/links.csv",
    output_dir="vast-challenge-project/public",
    target_company="SouthSeafood Express Corp",
    min_score=0,
):
    """Score every fishing vessel's transponder track and export the tracks as JSON.

    The function keeps the ``TransponderPing`` links whose target is a fishing
    vessel and derives, per location, the 75th and 90th percentile of the
    ``dwell`` column over all of those pings. Each vessel's pings are then
    walked in time order and labelled ``violation`` / ``suspicious`` /
    ``transit`` / ``port`` by comparing the dwell with the thresholds of the
    ping's location, accumulating a suspicion score along the way.

    Two files are written into ``output_dir`` (created if missing):

    * ``other_candidates.json`` - tracks of every retained vessel, keyed by
      vessel name and ordered by descending score.
    * ``south_seafood_pings.json`` - the subset of those tracks belonging to
      vessels of ``target_company``.

    Note: tracks are keyed by ``entity_name``, so vessels sharing a name
    overwrite one another in the output.

    Args:
        nodes_path: CSV with at least ``id``, ``type``, ``entity_name`` and
            ``company`` columns.
        links_path: CSV with at least ``type``, ``source``, ``target`` and
            ``time`` columns; ``dwell`` is optional and treated as 0 if absent.
        output_dir: Directory receiving the two JSON files.
        target_company: Company whose vessels are always retained and exported
            separately.
        min_score: Non-target vessels are retained when their score is at
            least this value. Scores are never negative, so the default of 0
            retains every vessel that has pings.

    Returns:
        None. If either CSV file is missing a message is printed and nothing
        is written.
    """
    try:
        nodes = pd.read_csv(nodes_path)
        links = pd.read_csv(links_path, low_memory=False)
    except FileNotFoundError:
        print("Errore: File csv non trovati.")
        return

    # 1. Fleet-wide statistics
    print("Calcolo soglie statistiche (75° e 90° percentile) su tutta la flotta...")

    vessels = nodes[nodes['type'] == _FISHING_VESSEL_TYPE]
    all_vessel_ids = vessels['id'].tolist()
    id_to_name = dict(zip(vessels['id'], vessels['entity_name']))
    id_to_company = dict(zip(vessels['id'], vessels['company']))

    all_pings = links[
        (links['type'] == _PING_TYPE)
        & (links['target'].isin(all_vessel_ids))
    ].copy()

    if 'dwell' in all_pings.columns:
        # Unparseable or missing dwell values count as zero.
        all_pings['dwell'] = pd.to_numeric(all_pings['dwell'], errors='coerce').fillna(0)
    else:
        all_pings['dwell'] = 0

    all_pings['loc_clean'] = (
        all_pings['source'].astype(str).str.replace("City of ", "", regex=False)
    )

    # --- Statistical thresholds: one lookup table per percentile ---
    dwell_by_loc = all_pings.groupby('loc_clean')['dwell']
    thresholds_90 = dwell_by_loc.quantile(0.90).to_dict()  # red threshold (violation)
    thresholds_75 = dwell_by_loc.quantile(0.75).to_dict()  # orange threshold (suspicious)

    # 2. Per-vessel track extraction
    all_pings['time'] = pd.to_datetime(all_pings['time'], format='mixed')
    # A ping without a time cannot be formatted for the track; drop it instead
    # of failing later on NaT.
    all_pings = all_pings.dropna(subset=['time']).sort_values(by=['target', 'time'])

    # Split the table once instead of masking it again for every vessel.
    pings_by_vessel = dict(iter(all_pings.groupby('target', sort=False)))

    suspicious_scores = []

    for vid in all_vessel_ids:
        v_pings = pings_by_vessel.get(vid)
        if v_pings is None:
            continue

        company = str(id_to_company.get(vid, "Unknown"))
        is_target_company = (company == target_company)

        score = 0
        track_data = []
        recent_buoy_visit = False

        for loc, dwell, when in zip(v_pings['loc_clean'], v_pings['dwell'], v_pings['time']):
            pt_type, points, recent_buoy_visit = _classify_ping(
                loc,
                dwell,
                thresholds_90.get(loc, _DEFAULT_THRESH_EXTREME),
                thresholds_75.get(loc, _DEFAULT_THRESH_WARNING),
                recent_buoy_visit,
            )
            score += points

            track_data.append({
                "loc": loc,
                "time": when.strftime("%Y-%m-%d %H:%M"),
                "timestamp": int(when.timestamp() * 1000),
                "type": pt_type,
                "duration": int(dwell),
                "x_offset": random.randint(-5, 5),  # random jitter for the map
                "y_offset": random.randint(-5, 5)
            })

        # Keep the vessel if it belongs to the target company or reaches the cutoff.
        if is_target_company or score >= min_score:
            suspicious_scores.append({
                "id": vid,
                "name": id_to_name[vid],
                "company": company,
                "score": score,
                "is_target": is_target_company,
                "track": track_data
            })

    # Sort by score (highest first) and save
    suspicious_scores.sort(key=lambda x: x['score'], reverse=True)

    output_tracks = {}
    for s in suspicious_scores:
        output_tracks[s['name']] = s['track']

    # The output directory may not exist yet on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)

    with open(f"{output_dir}/other_candidates.json", "w", encoding="utf-8") as f:
        json.dump(output_tracks, f, indent=2)

    # Extract only the target company's vessels
    target_names = {s['name'] for s in suspicious_scores if s['is_target']}
    ss_tracks = {
        name: track for name, track in output_tracks.items() if name in target_names
    }
    with open(f"{output_dir}/south_seafood_pings.json", "w", encoding="utf-8") as f:
        json.dump(ss_tracks, f, indent=2)

    print(f"File salvati in {output_dir}/: other_candidates.json e south_seafood_pings.json")

# Script entry point: run the extraction only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    extract_tracks_statistical()