# 2. Data Split

Now that we have filtered out outliers and have made sanity checks on the dataset, we can split the dataset into train, test and validation.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold, GroupShuffleSplit

In [2]:
df = pd.read_csv("cleaned_trajectories.csv", index_col=False)

Dataset details:

In [3]:
def dataset_details(df: pd.DataFrame) -> None:
    print(f"Number of unique aircraft: {df['flight_id'].nunique()}")
    print(f"Data length: {len(df)}")
    print(f"Data length by 30s windows: {len(df) // 30}")
    print(f"Data length by 60s windows: {len(df) // 60}")
    print(f"Data length by 120s windows: {len(df) // 120}")

    print("\n" + 15 * "-")
    print("Dataset info:")
    print(df.info(show_counts=True))

    print("\n" + 15 * "-")
    print("Dataset na percentages:")
    na_pct = df.isna().mean().sort_values(ascending=False) * 100
    display(na_pct)
    print("Any na values in dataset:", df.isna().values.any())

    print("\n" + 15 * "-")
    print(f"Unique flight ids: {df['flight_id'].nunique()}")

    print("\n" + 15 * "-")
    dupes = df.duplicated(subset=["flight_id", "timestamp"])
    print("Exact duplicate rows:", dupes.sum())

    print("\n" + 15 * "-")
    print(
        f"Number of 7700 squawks: {df['is_7700'].sum()}, percent: {df['is_7700'].sum()/len(df)*100:.2f}%"
    )
    print(
        f"Number of flights that have the squawk: {df[df['is_7700']].flight_id.nunique()}, percent: {df[df['is_7700']].flight_id.nunique()/df['flight_id'].nunique()*100:.2f}%"
    )


dataset_details(df)

Number of unique aircraft: 803
Data length: 4175022
Data length by 30s windows: 139167
Data length by 60s windows: 69583
Data length by 120s windows: 34791

---------------
Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4175022 entries, 0 to 4175021
Data columns (total 9 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   timestamp      4175022 non-null  object 
 1   altitude       4175022 non-null  float64
 2   flight_id      4175022 non-null  object 
 3   groundspeed    4175022 non-null  float64
 4   latitude       4175022 non-null  float64
 5   longitude      4175022 non-null  float64
 6   track          4175022 non-null  float64
 7   vertical_rate  4175022 non-null  float64
 8   is_7700        4175022 non-null  bool   
dtypes: bool(1), float64(6), object(2)
memory usage: 258.8+ MB
None

---------------
Dataset na percentages:


timestamp        0.0
altitude         0.0
flight_id        0.0
groundspeed      0.0
latitude         0.0
longitude        0.0
track            0.0
vertical_rate    0.0
is_7700          0.0
dtype: float64

Any na values in dataset: False

---------------
Unique flight ids: 803

---------------
Exact duplicate rows: 0

---------------
Number of 7700 squawks: 1214765, percent: 29.10%
Number of flights that have the squawk: 803, percent: 100.00%


## Splitting options
- Flight Group
- Individual points (after window transformation)

Splitting by individual points (row-wise) introduces the risk of data-leakage, does not generalise to real-world scenarios and breaks the temporal correlation as adjacent rows are split apart.


On the other side, the split by flight group (**flight_id**) has zero risk in data-leakage, is conservative and realistic as the model can be deployed for future aircraft and preserves temporal correlation within each split (windows keep their chronology).

Moreover, for sequence taks, group-wise splitting is the standard.

The following split will be used:
- Train: 70% (approx. 562 flights)
- Validation: 15% (approx. 120 flights)
- Test: 15% (approx. 121 flights)

As seen in the dataset information, every single aircraft has at least **one 7700 row** (29% of all rows/windows), but the *ratio per flight* still varies.  

To address this matter, we will:
1. Compute per-flight emergency ratio;
2. Define emergency-rate bins for stratification
- We will use buckets which become a coarse "class" for emergency-share, so that when we split we preserve the overall distribution of **low-, mid-, and high-emergency flights**.
3. Split the data by ensuring all windows of any given **flight_id** stay together.

In [4]:
flights = (
    df.groupby("flight_id")["is_7700"]
    .mean()  # emergency share for that flight
    .reset_index()
    .rename(columns={"is_7700": "emg_ratio"})
)
flights.head(2)

Unnamed: 0,flight_id,emg_ratio
0,AAL110_20190528,0.45044
1,AAL1188_20190520,0.347334


Bin the per-flight emergency ratio into 4 buckets

In [5]:
bins = pd.cut(
    flights["emg_ratio"],
    bins=[0, 0.05, 0.5, 0.95, 1],
    labels=False,
    include_lowest=True,
)
bins

0      1
1      1
2      1
3      1
4      2
      ..
798    1
799    3
800    1
801    1
802    1
Name: emg_ratio, Length: 803, dtype: int64

Stratified split by those bins while repsecting the flight_id

In [6]:
sgkf = StratifiedGroupKFold(
    n_splits=5, shuffle=True, random_state=25
)  # 4/5 flights in train
train_idx, temp_idx = next(sgkf.split(X=flights, y=bins, groups=flights["flight_id"]))

train_flights = flights.iloc[train_idx]["flight_id"]
temp_flights = flights.iloc[temp_idx]["flight_id"]

In [7]:
gss = GroupShuffleSplit(test_size=0.5, n_splits=1, random_state=25)
val_idx, test_idx = next(gss.split(temp_flights, groups=temp_flights))
val_flights = temp_flights.iloc[val_idx]
test_flights = temp_flights.iloc[test_idx]

In [8]:
train_df = df[df["flight_id"].isin(train_flights)]
val_df = df[df["flight_id"].isin(val_flights)]
test_df = df[df["flight_id"].isin(test_flights)]

In [14]:
val_flights

5      AAL1630_20190415
10     AAL1895_20190708
15      AAL213_20180603
27     AAL2807_20180428
46      AAR762_20190807
             ...       
775    VLG3866_20190913
781     VOZ721_20200112
786    WEN3410_20191123
791     WJA439_20180904
797     WOW699_20180125
Name: flight_id, Length: 80, dtype: object

In [20]:
print(
    f"Split details: (percentages of flights - by flight_id): Train {len(train_flights)/len(flights):.4f}, Val {len(val_flights)/len(flights):.4f}, Test {len(test_flights)/len(flights):.4f}"
)
print(
    f"Split details: (percentage of split - by rows): Train {len(train_df)/len(df):.4f}, Val {len(val_df)/len(df):.4f}, Test {len(test_df)/len(df):.4f}"
)
print(
    f"Numer of train flights: {len(train_flights)}, share of emergency rows: {train_df['is_7700'].sum()/len(train_df):.4f}"
)
print(
    f"Numer of val flights: {len(val_flights)}, share of emergency rows: {val_df['is_7700'].sum()/len(val_df):.4f}"
)
print(
    f"Numer of test flights: {len(test_flights)}, share of emergency rows: {test_df['is_7700'].sum()/len(test_df):.4f}"
)

Split details: (percentages of flights - by flight_id): Train 0.8007, Val 0.0996, Test 0.0996
Split details: (percentage of split - by rows): Train 0.7993, Val 0.1037, Test 0.0970
Numer of train flights: 643, share of emergency rows: 0.2904
Numer of val flights: 80, share of emergency rows: 0.2779
Numer of test flights: 80, share of emergency rows: 0.3099


Check for the correctness of the split

In [None]:
check_train_flights = train_df["flight_id"].unique()
check_val_flights = val_df["flight_id"].unique()
check_test_flights = test_df["flight_id"].unique()

for ff in check_train_flights:
    if ff in check_val_flights:
        print(f"Train flight {ff} is also in val set")
    if ff in check_test_flights:
        print(f"Train flight {ff} is also in test set")

for ff in check_val_flights:
    if ff in check_train_flights:
        print(f"Val flight {ff} is also in train set")
    if ff in check_test_flights:
        print(f"Val flight {ff} is also in test set")

In [59]:
data_split_folder = "data_splits/"
train_df.to_csv(data_split_folder + "train.csv", index=False)
val_df.to_csv(data_split_folder + "val.csv", index=False)
test_df.to_csv(data_split_folder + "test.csv", index=False)
print(f"Train, val, and test sets saved to {data_split_folder} folder.")

Train, val, and test sets saved to data_splits/ folder.
