# Procesado de datos de tracking

In [2]:
import polars as pl
import numpy as np

In [2]:
# Explicit column dtypes for the raw tracking CSV; passed as `schema` to
# pl.read_csv in the next cell. Key order is kept as in the original mapping.
schema = dict(
    individual_id=pl.String,
    abbreviated_name=pl.String,
    date=pl.Datetime,
    decimal_longitude=pl.Float64,
    decimal_latitude=pl.Float64,
    longitude_se=pl.Float64,
    latitude_se=pl.Float64,
)

In [3]:
# Load the raw tracking file with the explicit schema, then discard the
# standard-error and abbreviated-name columns right away.
data = pl.read_csv("../../data/foca.csv", schema=schema).drop(
    ["longitude_se", "latitude_se", "abbreviated_name"]
)

In [None]:
# Split the timestamp into calendar components. Every expression reads only
# "date", so they can all be added in a single with_columns call; the new
# columns are appended in the order year, month, day, hour.
data = data.with_columns(
    [
        pl.col("date").dt.year().alias("year"),
        pl.col("date").dt.month().alias("month"),
        pl.col("date").dt.day().alias("day"),
        pl.col("date").dt.hour().alias("hour"),
    ]
)

# Consecutive days

In [None]:
def longest_consecutive_days(df):
    """Find the longest run of consecutive calendar days present in ``df["date"]``.

    Two observations belong to the same run when their calendar dates are equal
    or exactly one day apart. The time of day is ignored for that comparison, so
    several fixes on the same day count as a single day, and a gap that skips a
    whole calendar day always ends the run.

    Args:
        df: Frame with a "date" column of dates or datetimes. Only
            ``df["date"].to_list()`` is used.

    Returns:
        A tuple ``(max_streak, best_start, best_end)``:
        ``max_streak`` is the number of distinct calendar days in the longest
        run; ``best_start`` / ``best_end`` are the earliest and latest original
        "date" values inside that run, so they can be used directly as inclusive
        filter bounds. When several runs tie, the earliest one is returned.
        Returns ``(0, None, None)`` when there are no non-null dates.
    """
    from datetime import datetime

    def _calendar_day(value):
        # datetime is a subclass of date; reduce it to its date part so that the
        # time of day never influences the day-gap computation.
        return value.date() if isinstance(value, datetime) else value

    # Distinct, non-null observations in chronological order.
    timestamps = sorted({ts for ts in df["date"].to_list() if ts is not None})
    if not timestamps:
        return 0, None, None

    # Find the longest sequence of consecutive days.
    max_streak = 0
    current_streak = 1
    start_date = timestamps[0]
    best_start, best_end = start_date, start_date

    for previous, current in zip(timestamps, timestamps[1:]):
        gap_days = (_calendar_day(current) - _calendar_day(previous)).days
        if gap_days == 0:
            # Another fix on the same day: the run continues but gains no day.
            continue
        if gap_days == 1:
            current_streak += 1
            continue

        # At least one calendar day is missing: close the current run.
        if current_streak > max_streak:
            max_streak = current_streak
            best_start, best_end = start_date, previous
        current_streak = 1
        start_date = current

    # The final run is never closed inside the loop.
    if current_streak > max_streak:
        max_streak = current_streak
        best_start, best_end = start_date, timestamps[-1]

    return max_streak, best_start, best_end

In [None]:
# Locate the longest run of consecutive dates in the whole data set; best_start
# and best_end are the bounds used to filter `data` in the next cell.
max_streak, best_start, best_end = longest_consecutive_days(data)

In [None]:
# Keep only the rows whose date lies inside [best_start, best_end], both ends
# inclusive.
data = data.filter(
    (pl.col("date") >= best_start) & (pl.col("date") <= best_end)
)

In [None]:
# Reduce the data to one row per individual and calendar day, keeping that
# day's latest fix.
#
# Rows are ordered by the full timestamp rather than by "hour" alone: with an
# hour-only sort, fixes that share the same hour are left in unspecified order
# (polars' sort is not stable unless maintain_order=True), so `last()` could
# return a fix that is not the final one of the day. Within each group the row
# order of the input frame is preserved, so `last()` picks the latest timestamp.
data_processed = (
    data
    .sort(by="date")
    .group_by(["individual_id", "year", "month", "day"])
    .agg(pl.all().last())
    # Ascending order is the default, so no `descending` list is needed.
    .sort(by=["year", "month", "day", "hour", "individual_id"])
)

In [None]:
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Uses the haversine formula on a sphere of radius 6371 km. All arguments are
    in decimal degrees and may be scalars or NumPy arrays (they are combined
    with NumPy's usual element-wise rules).

    Args:
        lat1: Latitude of the first point.
        lon1: Longitude of the first point.
        lat2: Latitude of the second point.
        lon2: Longitude of the second point.

    Returns:
        The distance in km, as a NumPy float or array.
    """
    R = 6371  # Earth radius in km
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Mathematically 0 <= a <= 1, but floating-point rounding can push it a hair
    # outside that range for (near-)antipodal points, which would make
    # sqrt(1 - a) NaN. Clamp before taking the square roots.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c  # distance in km


In [None]:
# Attach to every row the position found one row earlier for the same
# individual (shift by one within each individual_id partition). The first row
# of each individual has no predecessor and therefore gets nulls.
data_processed = data_processed.with_columns(
    prev_lat=pl.col("decimal_latitude").shift(1).over("individual_id"),
    prev_lon=pl.col("decimal_longitude").shift(1).over("individual_id"),
)

In [None]:
# Apply the haversine function row by row via `map_elements` on a struct that
# bundles the current and previous coordinates.
def _distance_from_previous(row):
    """Return the km between the previous and current fix, or None without a previous fix."""
    if row["prev_lat"] is None:
        return None
    return haversine(
        row["prev_lat"],
        row["prev_lon"],
        row["decimal_latitude"],
        row["decimal_longitude"],
    )


data_processed = data_processed.with_columns(
    pl.struct(["decimal_latitude", "decimal_longitude", "prev_lat", "prev_lon"])
    .map_elements(_distance_from_previous, return_dtype=pl.Float64)
    .alias("distance_km")
)

In [None]:
# Coordinate differences (in degrees) between each fix and the previous one.
data_processed = data_processed.with_columns(
    delta_lat=pl.col("decimal_latitude") - pl.col("prev_lat"),
    delta_lon=pl.col("decimal_longitude") - pl.col("prev_lon"),
)


# Compute the angle in radians and convert it to degrees.
def _step_angle_degrees(delta):
    """Return atan2(delta_lat, delta_lon) in degrees, or None if either delta is missing."""
    if delta["delta_lat"] is None or delta["delta_lon"] is None:
        return None
    return np.degrees(np.arctan2(delta["delta_lat"], delta["delta_lon"]))


data_processed = data_processed.with_columns(
    pl.struct(["delta_lat", "delta_lon"])
    .map_elements(_step_angle_degrees, return_dtype=pl.Float64)
    .alias("angle_degrees")
)

# Parse date

In [7]:
# NOTE(review): this rebinds data_processed to the filtered `data` frame, so the
# per-day aggregation and the prev_lat/prev_lon, distance_km, delta_* and
# angle_degrees columns computed above are not carried into the steps below.
# Confirm this is intended.
data_processed = data

In [8]:
from datetime import datetime

# Format the date as a "YYYY-MM-DDT00:00:00" string (time of day forced to
# midnight) so that it can be joined with the Copernicus data.
# The native `dt.strftime` expression is used instead of a per-row Python
# callback: it yields the same strings (nulls stay null) without calling into
# Python for every row.
data_processed = data_processed.with_columns(
    pl.col("date").dt.strftime("%Y-%m-%dT00:00:00")
)

# Write

In [4]:
# Column dtypes of the processed CSV.
# NOTE(review): "../../data/foca_procesado.csv" is written by the last cell of
# this notebook, so that cell must have been run before this one.
schema = dict(
    individual_id=pl.String,
    date=pl.String,
    decimal_longitude=pl.Float64,
    decimal_latitude=pl.Float64,
    year=pl.Int32,
    month=pl.Int8,
    day=pl.Int8,
    hour=pl.Int8,
)

# Re-read the processed file, remove the individual identifier and export the
# remaining columns under a new file name.
data = pl.read_csv("../../data/foca_procesado.csv", schema=schema).drop(
    "individual_id"
)
data.write_csv("../../data/datos_foca.csv")

In [11]:
# Persist the processed frame (with the formatted date strings) as CSV; this is
# the file that the "Write" cell above reads back.
data_processed.write_csv("../../data/foca_procesado.csv")