In [25]:
import os
import re
import pandas as pd
import numpy as np
from src.tools.geomap_tools import haversine

In [26]:
# --- Analysis configuration --------------------------------------------------
base_folder = "data/external/adsb/"   # root folder walked for the ADS-B parquet files
year_selected = "2023"                # compared as a string against the year=<...> folder name

# Site reference table, with its columns renamed to explicit site_* names.
sites = pd.read_csv("data/external/sites.csv").rename(
    columns={
        "id": "site_id",
        "name": "site_abrev_name",
        "basename": "site_name",
        "code": "site_code",
    }
)
# The site abbreviated "EIH" in the CSV is relabelled "BRE".
sites.loc[sites["site_abrev_name"] == "EIH", "site_abrev_name"] = "BRE"

# Thresholds used by the journey analysis.
time_threshold = pd.Timedelta(minutes=20)  # larger gaps between positions start a new journey
maximum_altitude = 1500  # feet
maximum_distance = 20  # km

In [27]:
final_table = []
site_selected = "BRU"  # only folders belonging to this site are processed


def _partition_value(key, path):
    """Return the value of the ``key=<value>`` folder found in `path`.

    Forward and back slashes both end the value, so the result is the same
    whichever separator `os.walk` produced. Returns None when `path` has no
    such folder, instead of failing on a missing regex match.
    """
    found = re.search(rf"{key}=([^/\\]+)", path)
    return found.group(1) if found else None


# Traverse through all folders and subfolders
for root, _dirs, files in os.walk(base_folder):
    parquet_files = [file for file in files if file.endswith(".parquet")]
    if not parquet_files:
        continue

    # Site, year and month come from the folder path, so they are parsed once
    # per folder rather than once per file.
    site = _partition_value("site", root)
    year = _partition_value("year", root)
    month = _partition_value("month", root)
    if site is None or year is None or month is None:
        print(root + " is outside the site=/year=/month= layout, skipped")
        continue
    if site != site_selected or year != year_selected:
        continue

    # Get site latitude and longitude from `sites` DataFrame
    site_lat, site_lon = sites.loc[
        sites.site_abrev_name == site, ["latitude", "longitude"]
    ].values[0]

    folder_dfs = []
    for file in parquet_files:
        file_path = os.path.join(root, file)

        # Read the parquet file into a pandas DataFrame
        # NOTE(review): OSError also covers I/O failures other than a damaged
        # file; they are all reported as "corrupted" here.
        try:
            df = pd.read_parquet(file_path)
        except OSError:
            print(file + " is corrupted")
            continue

        print(file)

        # Filter by geometric altitude; the final .copy() makes the result an
        # independent frame so the column assignments below do not trigger
        # SettingWithCopyWarning.
        df = df.dropna(subset=["GeometricAltitude"]).astype({"GeometricAltitude": float})
        df = df[df.GeometricAltitude < maximum_altitude].copy()

        df["Site"] = site
        df["Year"] = year
        df["Month"] = month

        # Calculate the distance to the site and keep rows within
        # `maximum_distance`; 'Distance' is only a temporary column.
        df["Distance"] = haversine(df["Latitude"], df["Longitude"], site_lat, site_lon)
        df = df[df["Distance"] <= maximum_distance].drop(columns=["Distance"])

        if not df.empty:
            folder_dfs.append(df)

    if folder_dfs:
        print("concatening...")
        # Combine DataFrames from the current folder
        df = pd.concat(folder_dfs, ignore_index=True)

        # Sort the dataframe by AircraftAddress, Callsign, and TimeRecPosition
        print("sorting values...")
        df.TimeRecPosition = pd.to_datetime(df.TimeRecPosition, format="ISO8601")
        df = df.sort_values(by=['AircraftAddress', 'Callsign', 'TimeRecPosition'])

        # Calculate the time difference within each group
        print("computing time diff...")
        df.loc[df.Callsign.isnull(), "Callsign"] = "NONE"
        df['time_diff'] = df.groupby(['AircraftAddress', 'Callsign'])['TimeRecPosition'].diff()

        # A new journey starts after a gap longer than `time_threshold`, or when
        # the Callsign, Site or AircraftAddress differs from the previous row.
        df['new_journey'] = (
            (df['time_diff'] > time_threshold)
            | (df['Callsign'] != df['Callsign'].shift(1))
            | (df['Site'] != df['Site'].shift(1))
            | (df['AircraftAddress'] != df['AircraftAddress'].shift(1))
        )

        # Use cumulative sum to assign journey IDs
        df['journey'] = df['new_journey'].cumsum() + 1

        # Calculate the time flown for each journey (last minus first position
        # time), using the built-in group reductions instead of a Python lambda.
        print("computing time flown...")
        journey_times = df.groupby('journey')['TimeRecPosition']
        df['timeFlown'] = journey_times.transform('max') - journey_times.transform('min')

        # One row per journey, then total per Site / Year / Month
        dfTime = df.drop_duplicates(subset=['journey']).reset_index(drop=True)
        final_table.append(dfTime.groupby(["Site", "Year", "Month"])["timeFlown"].sum().reset_index())
        print("")

final_table = [monthly for monthly in final_table if not monthly.empty]
if final_table:
    final_concat = pd.concat(final_table, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list; fall back to an empty,
    # correctly typed table when no file could be used.
    final_concat = pd.DataFrame(
        {
            "Site": pd.Series(dtype="object"),
            "Year": pd.Series(dtype="object"),
            "Month": pd.Series(dtype="object"),
            "timeFlown": pd.Series(dtype="timedelta64[ns]"),
        }
    )
final_by_site = final_concat.groupby("Site").timeFlown.sum()
final_by_site


Brussels_2023-06-23.parquet is corrupted
Brussels_2023-06-24.parquet is corrupted
Brussels_2023-06-25.parquet is corrupted
Brussels_2023-06-26.parquet is corrupted
Brussels_2023-06-27.parquet is corrupted
Brussels_2023-06-28.parquet is corrupted
Brussels_2023-06-29.parquet is corrupted
Brussels_2023-06-30.parquet is corrupted
Brussels_2023-07-01.parquet is corrupted
Brussels_2023-07-02.parquet is corrupted
Brussels_2023-07-03.parquet is corrupted
Brussels_2023-07-04.parquet is corrupted
Brussels_2023-07-05.parquet is corrupted
Brussels_2023-07-07.parquet
Brussels_2023-07-08.parquet is corrupted
Brussels_2023-07-09.parquet is corrupted
Brussels_2023-07-10.parquet is corrupted
Brussels_2023-07-11.parquet
Brussels_2023-07-12.parquet is corrupted
Brussels_2023-07-13.parquet is corrupted
Brussels_2023-07-14.parquet is corrupted
Brussels_2023-07-15.parquet is corrupted
Brussels_2023-07-16.parquet is corrupted
Brussels_2023-07-17.parquet is corrupted
Brussels_2023-07-18.parquet is corrupted
B

Site
BRU   37 days 03:15:29.780000
Name: timeFlown, dtype: timedelta64[ns]

In [28]:
# Show the per-site totals as a one-column DataFrame (column named after the Series).
final_by_site.to_frame()

Unnamed: 0_level_0,timeFlown
Site,Unnamed: 1_level_1
BRU,37 days 03:15:29.780000
