# 3. Data Preprocessing/Transformation

Now that we have the correct data splits, we will transform the data.

In [3]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [4]:
# Read the three pre-computed splits; "timestamp" is parsed to datetime on load.
data_splits_folder = "data_splits/"
train_df, test_df, val_df = (
    pd.read_csv(data_splits_folder + file_name, parse_dates=["timestamp"])
    for file_name in ("train.csv", "test.csv", "val.csv")
)

In [5]:
def dataset_details(df: pd.DataFrame) -> None:
    """Print a quick sanity report for one data split.

    The report covers: row and flight counts, how many non-overlapping
    30/60/120-row windows the frame could hold, ``df.info()``, the per-column
    NaN percentage, duplicated ``(flight_id, timestamp)`` pairs, and the share
    of rows / flights flagged by ``is_7700``.

    Args:
        df: Frame with at least ``flight_id``, ``timestamp`` and a boolean
            ``is_7700`` column (the latter is used as a row mask).

    Returns:
        None. Everything is written to stdout / the notebook display.
    """
    separator = "\n" + 15 * "-"
    n_rows = len(df)
    n_flights = df["flight_id"].nunique()

    def percent(part, whole):
        # An empty split would otherwise raise ZeroDivisionError.
        return part / whole * 100 if whole else float("nan")

    # NOTE: the label says "aircraft", but the count is over flight_id values.
    print(f"Number of unique aircraft: {n_flights}")
    print(f"Data length: {n_rows}")
    for window in (30, 60, 120):
        print(f"Data length by {window}s windows: {n_rows // window}")

    print(separator)
    print("Dataset info:")
    # DataFrame.info() prints its report itself and returns None; wrapping it
    # in print() would add a stray "None" line to the output.
    df.info(show_counts=True)

    print(separator)
    print("Dataset na percentages:")
    na_mask = df.isna()
    na_pct = na_mask.mean().sort_values(ascending=False) * 100
    display(na_pct)
    print("Any na values in dataset:", na_mask.values.any())

    print(separator)
    print(f"Unique flight ids: {n_flights}")

    print(separator)
    # Rows count as duplicates when they repeat a (flight_id, timestamp) pair.
    dupes = df.duplicated(subset=["flight_id", "timestamp"])
    print("Exact duplicate rows:", dupes.sum())

    print(separator)
    n_squawk_rows = df["is_7700"].sum()
    n_squawk_flights = df.loc[df["is_7700"], "flight_id"].nunique()
    print(
        f"Number of 7700 squawks: {n_squawk_rows}, percent: {percent(n_squawk_rows, n_rows):.2f}%"
    )
    print(
        f"Number of flights that have the squawk: {n_squawk_flights}, percent: {percent(n_squawk_flights, n_flights):.2f}%"
    )


# Sanity report for the training split.
dataset_details(train_df)

Number of unique aircraft: 643
Data length: 3337062
Data length by 30s windows: 111235
Data length by 60s windows: 55617
Data length by 120s windows: 27808

---------------
Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3337062 entries, 0 to 3337061
Data columns (total 9 columns):
 #   Column         Non-Null Count    Dtype              
---  ------         --------------    -----              
 0   timestamp      3337062 non-null  datetime64[ns, UTC]
 1   altitude       3337062 non-null  float64            
 2   flight_id      3337062 non-null  object             
 3   groundspeed    3337062 non-null  float64            
 4   latitude       3337062 non-null  float64            
 5   longitude      3337062 non-null  float64            
 6   track          3337062 non-null  float64            
 7   vertical_rate  3337062 non-null  float64            
 8   is_7700        3337062 non-null  bool               
dtypes: bool(1), datetime64[ns, UTC](1), float64(6), object(1)


timestamp        0.0
altitude         0.0
flight_id        0.0
groundspeed      0.0
latitude         0.0
longitude        0.0
track            0.0
vertical_rate    0.0
is_7700          0.0
dtype: float64

Any na values in dataset: False

---------------
Unique flight ids: 643

---------------
Exact duplicate rows: 0

---------------
Number of 7700 squawks: 968948, percent: 29.04%
Number of flights that have the squawk: 643, percent: 100.00%


In [6]:
# Sanity report for the test split.
dataset_details(test_df)

Number of unique aircraft: 80
Data length: 405068
Data length by 30s windows: 13502
Data length by 60s windows: 6751
Data length by 120s windows: 3375

---------------
Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 405068 entries, 0 to 405067
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype              
---  ------         --------------   -----              
 0   timestamp      405068 non-null  datetime64[ns, UTC]
 1   altitude       405068 non-null  float64            
 2   flight_id      405068 non-null  object             
 3   groundspeed    405068 non-null  float64            
 4   latitude       405068 non-null  float64            
 5   longitude      405068 non-null  float64            
 6   track          405068 non-null  float64            
 7   vertical_rate  405068 non-null  float64            
 8   is_7700        405068 non-null  bool               
dtypes: bool(1), datetime64[ns, UTC](1), float64(6), object(1)
memory usage: 25.1

timestamp        0.0
altitude         0.0
flight_id        0.0
groundspeed      0.0
latitude         0.0
longitude        0.0
track            0.0
vertical_rate    0.0
is_7700          0.0
dtype: float64

Any na values in dataset: False

---------------
Unique flight ids: 80

---------------
Exact duplicate rows: 0

---------------
Number of 7700 squawks: 125526, percent: 30.99%
Number of flights that have the squawk: 80, percent: 100.00%


In [7]:
# Sanity report for the validation split.
dataset_details(val_df)

Number of unique aircraft: 80
Data length: 432892
Data length by 30s windows: 14429
Data length by 60s windows: 7214
Data length by 120s windows: 3607

---------------
Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 432892 entries, 0 to 432891
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype              
---  ------         --------------   -----              
 0   timestamp      432892 non-null  datetime64[ns, UTC]
 1   altitude       432892 non-null  float64            
 2   flight_id      432892 non-null  object             
 3   groundspeed    432892 non-null  float64            
 4   latitude       432892 non-null  float64            
 5   longitude      432892 non-null  float64            
 6   track          432892 non-null  float64            
 7   vertical_rate  432892 non-null  float64            
 8   is_7700        432892 non-null  bool               
dtypes: bool(1), datetime64[ns, UTC](1), float64(6), object(1)
memory usage: 26.8

timestamp        0.0
altitude         0.0
flight_id        0.0
groundspeed      0.0
latitude         0.0
longitude        0.0
track            0.0
vertical_rate    0.0
is_7700          0.0
dtype: float64

Any na values in dataset: False

---------------
Unique flight ids: 80

---------------
Exact duplicate rows: 0

---------------
Number of 7700 squawks: 120291, percent: 27.79%
Number of flights that have the squawk: 80, percent: 100.00%


In [8]:
# Preview the training rows as loaded, before any transformation.
train_df.head()

Unnamed: 0,timestamp,altitude,flight_id,groundspeed,latitude,longitude,track,vertical_rate,is_7700
0,2018-01-01 12:29:49+00:00,11300.0,ARG1511_20180101,309.885195,-31.449051,-63.963318,130.441641,3008.0,True
1,2018-01-01 12:29:50+00:00,11375.0,ARG1511_20180101,310.264709,-31.450453,-63.961432,130.441641,3040.0,True
2,2018-01-01 12:29:51+00:00,11450.0,ARG1511_20180101,310.644223,-31.451856,-63.959546,130.441641,3072.0,True
3,2018-01-01 12:29:52+00:00,11500.0,ARG1511_20180101,311.192793,-31.452843,-63.958193,130.440361,3074.742857,True
4,2018-01-01 12:29:53+00:00,11550.0,ARG1511_20180101,311.741363,-31.453831,-63.95684,130.439081,3077.485714,True


In [9]:
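# The splits contain no NaNs (see the summaries above), so no imputation is needed.
# The commented-out snippet below is kept only as a fallback for future data with gaps.
# NOTE(review): as written it interpolates over the whole split rather than per flight,
# and assigns back after reset_index on a re-sorted frame - verify alignment before enabling.
# num_cols_to_fill = ["groundspeed", "track", "vertical_rate"]
# for _df in (train_df, val_df, test_df):
#     _df.sort_values("timestamp", inplace=True)
#     _df[num_cols_to_fill] = (_df.set_index("timestamp")[num_cols_to_fill]
#                                .interpolate("time")
#                                .ffill()
#                                .reset_index(drop=True))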

## 1. Angle encoding
All circular features (latitude, longitude, track) are replaced by sine & cosine pairs. This turns each degree value into two continuous components (_sin, _cos), which avoids the discontinuity at 0°/360° and lets the model learn smooth angular patterns.


In [10]:
def add_angle_features(df, col_deg, prefix):
    """Replace a degree-valued column by its sine/cosine pair, in place.

    Args:
        df: Frame to modify; it is mutated and nothing is returned.
        col_deg: Name of the column holding angles in degrees. It is removed
            from ``df`` once the two new columns have been added.
        prefix: Stem of the new columns, ``f"{prefix}_sin"`` and
            ``f"{prefix}_cos"``.
    """
    angle_rad = np.deg2rad(df[col_deg])
    for suffix, trig in (("sin", np.sin), ("cos", np.cos)):
        df[f"{prefix}_{suffix}"] = trig(angle_rad)
    # Drop the raw degree column only after both components exist.
    del df[col_deg]


# Encode every angular column of every split; the frames are modified in place,
# so this cell cannot be re-run without reloading the splits first.
for _df in (train_df, val_df, test_df):
    for col_deg, prefix in (("latitude", "lat"), ("longitude", "lon"), ("track", "trk")):
        add_angle_features(_df, col_deg, prefix)

In [11]:
# Preview: latitude/longitude/track are now replaced by their sin/cos pairs.
train_df.head()

Unnamed: 0,timestamp,altitude,flight_id,groundspeed,vertical_rate,is_7700,lat_sin,lat_cos,lon_sin,lon_cos,trk_sin,trk_cos
0,2018-01-01 12:29:49+00:00,11300.0,ARG1511_20180101,309.885195,3008.0,True,-0.52174,0.853104,-0.898513,0.438946,0.761067,-0.648673
1,2018-01-01 12:29:50+00:00,11375.0,ARG1511_20180101,310.264709,3040.0,True,-0.521761,0.853092,-0.898499,0.438976,0.761067,-0.648673
2,2018-01-01 12:29:51+00:00,11450.0,ARG1511_20180101,310.644223,3072.0,True,-0.521782,0.853079,-0.898484,0.439006,0.761067,-0.648673
3,2018-01-01 12:29:52+00:00,11500.0,ARG1511_20180101,311.192793,3074.742857,True,-0.521797,0.85307,-0.898474,0.439027,0.761082,-0.648656
4,2018-01-01 12:29:53+00:00,11550.0,ARG1511_20180101,311.741363,3077.485714,True,-0.521811,0.853061,-0.898464,0.439048,0.761096,-0.648639


## 2. Turn-rate feature
A crude d(track)/dt is computed by differencing the trk_cos column within each flight.

This new column is then standardized together with the other features by the StandardScaler in the next section (fitted on train only, then applied to val/test).

In [12]:
# Row-to-row change of trk_cos inside each flight. The first row of a flight has
# no predecessor, so its NaN difference is replaced by 0.
# NOTE(review): assumes rows of a flight are already in chronological order - confirm.
for _df in (train_df, val_df, test_df):
    trk_cos_step = _df.groupby("flight_id")["trk_cos"].diff()
    _df["turn_rate"] = trk_cos_step.fillna(0)

In [13]:
# Preview: turn_rate has been appended as the last column.
train_df.head()

Unnamed: 0,timestamp,altitude,flight_id,groundspeed,vertical_rate,is_7700,lat_sin,lat_cos,lon_sin,lon_cos,trk_sin,trk_cos,turn_rate
0,2018-01-01 12:29:49+00:00,11300.0,ARG1511_20180101,309.885195,3008.0,True,-0.52174,0.853104,-0.898513,0.438946,0.761067,-0.648673,0.0
1,2018-01-01 12:29:50+00:00,11375.0,ARG1511_20180101,310.264709,3040.0,True,-0.521761,0.853092,-0.898499,0.438976,0.761067,-0.648673,0.0
2,2018-01-01 12:29:51+00:00,11450.0,ARG1511_20180101,310.644223,3072.0,True,-0.521782,0.853079,-0.898484,0.439006,0.761067,-0.648673,0.0
3,2018-01-01 12:29:52+00:00,11500.0,ARG1511_20180101,311.192793,3074.742857,True,-0.521797,0.85307,-0.898474,0.439027,0.761082,-0.648656,1.7e-05
4,2018-01-01 12:29:53+00:00,11550.0,ARG1511_20180101,311.741363,3077.485714,True,-0.521811,0.853061,-0.898464,0.439048,0.761096,-0.648639,1.7e-05


## 3. Global z-score scaling
A StandardScaler is fitted only on the training split, using the nine core features (altitude, groundspeed, vertical_rate, and the three sin/cos pairs) plus the added turn rate, i.e. ten columns in total.

- The training data are fit_transformed.

- Validation and test are then transformed with the same scaler.

- The fitted scaler is saved to scaler.pkl for later reuse.

In [14]:
# Columns fed to the model: three kinematic values, the sin/cos pairs of
# latitude, longitude and track, and the derived turn rate.
feature_cols = [
    "altitude",
    "groundspeed",
    "vertical_rate",
    "lat_sin",
    "lat_cos",
    "lon_sin",
    "lon_cos",
    "trk_sin",
    "trk_cos",
    "turn_rate",
]

# Fit on the training split only, then reuse the same statistics for val/test.
# The frames are overwritten, so re-running this cell would scale them twice.
scaler = StandardScaler()
train_df[feature_cols] = scaler.fit_transform(train_df[feature_cols])
val_df[feature_cols] = scaler.transform(val_df[feature_cols])
test_df[feature_cols] = scaler.transform(test_df[feature_cols])

data_processing_folder = "baseline_data_processing/"
# joblib.dump does not create missing directories; make sure the output folder
# exists so the notebook also runs on a fresh checkout.
os.makedirs(data_processing_folder, exist_ok=True)
joblib.dump(scaler, data_processing_folder + "scaler.pkl")

['baseline_data_processing/scaler.pkl']

In [15]:
# Preview: the feature columns are now z-scored; timestamp, flight_id and is_7700 are unchanged.
train_df.head()

Unnamed: 0,timestamp,altitude,flight_id,groundspeed,vertical_rate,is_7700,lat_sin,lat_cos,lon_sin,lon_cos,trk_sin,trk_cos,turn_rate
0,2018-01-01 12:29:49+00:00,-1.27223,ARG1511_20180101,-0.820034,2.942125,True,-5.302007,1.133218,-0.790238,0.063286,1.170042,-1.137935,0.000706
1,2018-01-01 12:29:50+00:00,-1.266013,ARG1511_20180101,-0.81631,2.973086,True,-5.302103,1.133093,-0.790213,0.063339,1.170042,-1.137935,0.000706
2,2018-01-01 12:29:51+00:00,-1.259797,ARG1511_20180101,-0.812586,3.004047,True,-5.302199,1.132967,-0.790187,0.063392,1.170042,-1.137935,0.000706
3,2018-01-01 12:29:52+00:00,-1.255652,ARG1511_20180101,-0.807202,3.0067,True,-5.302266,1.132879,-0.790169,0.063429,1.170062,-1.137908,0.003614
4,2018-01-01 12:29:53+00:00,-1.251507,ARG1511_20180101,-0.801819,3.009354,True,-5.302334,1.132791,-0.79015,0.063467,1.170081,-1.137882,0.003614




## 4. Sliding-window tensorization
For each flight:

- Extract its feature matrix (T rows × F features).

- Slide a window of length win_len with step stride along the time axis.

- For each window, stack its win_len × F values into a sample, and set its label to 1 if any is_7700 flag occurs in that window (otherwise 0).

- Return two arrays per split:

> X with shape (N_windows, win_len, n_features)

> y with shape (N_windows,)

Stride example:

win_len = 30, stride = 10
- Window 1: rows **0–29**  
- Window 2: rows **10–39**  
- Window 3: rows **20–49**  

In [16]:
def build_windows(df, win_len, stride, feature_cols):
    """Cut every flight into fixed-length, possibly overlapping windows.

    Rows are taken in the order they appear inside each ``flight_id`` group.
    Windows never span two flights, and a trailing remainder shorter than
    ``win_len`` is dropped.

    Args:
        df: Frame holding ``flight_id``, ``is_7700`` and every column listed
            in ``feature_cols``.
        win_len: Number of rows per window (>= 1).
        stride: Number of rows between two consecutive window starts (>= 1).
        feature_cols: Names of the columns that form the feature axis.

    Returns:
        Tuple ``(X, y)``: ``X`` is a float32 array of shape
        ``(n_windows, win_len, len(feature_cols))``; ``y`` is an integer array
        of shape ``(n_windows,)`` that is 1 when any row of the window has
        ``is_7700`` set, else 0. If no flight has at least ``win_len`` rows,
        both arrays are empty but keep those shapes.

    Raises:
        ValueError: If ``win_len`` or ``stride`` is smaller than 1.
    """
    if win_len < 1 or stride < 1:
        raise ValueError("win_len and stride must be positive integers")

    data, labels = [], []
    for _, fl in df.groupby("flight_id"):
        X = fl[feature_cols].to_numpy(dtype=np.float32)
        y = fl["is_7700"].to_numpy(dtype=np.int8)  # changed from bool to int
        for i in range(0, len(X) - win_len + 1, stride):
            data.append(X[i : i + win_len])
            labels.append(int(y[i : i + win_len].any()))  # 1 if ANY emergency in window

    if not data:
        # np.stack raises on an empty list; return correctly shaped empties instead.
        empty_X = np.empty((0, win_len, len(feature_cols)), dtype=np.float32)
        return empty_X, np.array([], dtype=int)
    return np.stack(data), np.array(labels, dtype=int)

In [17]:
# Window hyper-parameters, both counted in rows (the previewed data has one row per second).
WIN = 30  # window length; can be tuned
STRIDE = 10  # rows between consecutive window starts

In [28]:
# Window every split with the same length, stride and feature set.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    build_windows(split_df, WIN, STRIDE, feature_cols)
    for split_df in (train_df, val_df, test_df)
)

In [52]:
# Expected layout: (n_windows, win_len, n_features).
X_train.shape

(332142, 30, 10)

The shape confirms the type of data - multivariate time-series.
It means that we have:
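- 332142 samples (windows); consecutive windows from the same flight overlap by 20 rows (win_len 30, stride 10), so they are not strictly independent
- an ordered sequence of 30 consecutive 1 Hz frames per sample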
- 10 parallel channels of time series

Save the dataset:

In [61]:
# Bundle the windowed arrays, the fitted scaler, the transformed frames and the
# windowing settings into one object so later notebooks can load everything at once.
payload = dict(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    X_test=X_test,
    y_test=y_test,
    scaler=scaler,
    train_df=train_df,
    val_df=val_df,
    test_df=test_df,
    feature_cols=feature_cols,
    win_len=WIN,
    stride=STRIDE,
)

joblib.dump(payload, f"{data_processing_folder}data_payload_30s.pkl")

['baseline_data_processing/data_payload_30s.pkl']