In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium

### Load Data

In [None]:
# Read the raw flight records from CSV, report their dimensions and
# preview the first rows as the cell output.
file_path = "flight_data.csv"
df = pd.read_csv(filepath_or_buffer=file_path)
data_shape = df.shape
print("Data shape: ", data_shape)
df.head(n=5)

### Inspect data

In [None]:
# Quick structural overview: column labels, per-column dtype / non-null
# counts, and summary statistics.
print(df.columns)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print(df.describe())

### Data Cleaning

In [None]:
# Remove the "sensors" column and any fully duplicated rows.
# errors="ignore" makes the drop a no-op when the column is not present, so
# the cell can be re-run (or used on a file without that column) without
# raising KeyError.
df = df.drop(columns=["sensors"], errors="ignore").drop_duplicates()
print(df.shape)

### Convert UNIX time to datetime

In [None]:
# Parse the epoch-second timestamp columns into pandas datetimes.
# Columns that are absent are skipped; values that cannot be parsed become
# NaT because of errors="coerce".
timestamp_cols = [
    name for name in ("time_position", "last_contact") if name in df.columns
]
for ts_col in timestamp_cols:
    df[ts_col] = pd.to_datetime(df[ts_col], unit="s", errors="coerce")
print(df.head())

### Handle missing values

In [None]:
# Show how many values are missing in each column, then keep only the rows
# that have both coordinates.
missing_per_column = df.isna().sum()
print(missing_per_column)
coordinate_cols = ["longitude", "latitude"]
df = df.dropna(subset=coordinate_cols)

## **EXPLORATORY ANALYSIS**

### Set Graph Styles

In [None]:
# Global display / plotting configuration used by the cells below.
# NOTE(review): the three styling calls all adjust global plot settings, so
# their relative order presumably matters - confirm before reordering them.

# Do not truncate columns when a DataFrame is displayed.
pd.set_option("display.max_columns", None)
sns.set_style("darkgrid")  # adds seaborn style to charts, e.g. grid
plt.style.use("dark_background")  # inverts colors to dark theme
sns.set_palette("colorblind")
# Uncomment the next line to undo the styling above and return to the defaults:
# sns.reset_defaults()

### Flights per country

In [None]:
# Bar chart of the ten most frequent values in "origin_country".
# Skipped entirely when the column is not available.
if "origin_country" in df.columns:
    top_country_count = df["origin_country"].value_counts().head(10)
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(
        x=top_country_count.values,
        y=top_country_count.index,
        ax=ax,
    )
    ax.set_title("Top 10 origin countries")
    ax.set_xlabel("Number of flights")
    ax.set_ylabel("Countries")
    plt.show()

### Distribution of Altitude

In [None]:
# Histogram (50 bins, with KDE overlay) of the non-missing
# "barometric_altitude" values; skipped when the column is absent.
if "barometric_altitude" in df.columns:
    altitude_values = df["barometric_altitude"].dropna()
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(altitude_values, bins=50, kde=True, ax=ax)
    ax.set_title("Distribution of Barometric Altitude")
    ax.set_xlabel("Altitude (m)")
    plt.show()

### Distribution of Flight Velocity

In [None]:
# Histogram (50 bins, with KDE overlay) of the non-missing "velocity"
# values; skipped when the column is absent.
if "velocity" in df.columns:
    velocity_values = df["velocity"].dropna()
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(velocity_values, bins=50, kde=True, ax=ax)
    ax.set_title("Distribution of Flight Velocity")
    ax.set_xlabel("Velocity (m/s)")
    plt.show()

### Flights over time

In [None]:
# Line chart of how many records fall into each hour of the day.
# Note: this stores a helper column "hour" on df, and count() tallies the
# non-null "icao24" entries within each hour group.
if "time_position" in df.columns:
    df["hour"] = df["time_position"].dt.hour
    hourly_counts = df.groupby("hour")["icao24"].count()
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.lineplot(x=hourly_counts.index, y=hourly_counts.values, ax=ax)
    ax.set_title("Flights per Hour of Day")
    ax.set_xlabel("Hour of Day")
    ax.set_ylabel("Number of flights")
    plt.show()

### Spatial Analysis

In [None]:
# Build an interactive folium map with one small circle per sampled position
# and write it to "flight_map.html".
MAP_SAMPLE_SIZE = 1000  # upper bound on the number of markers drawn
MAP_SAMPLE_SEED = 42    # fixed seed so the saved map is the same on every run

# Centre the initial view on the mean coordinate of all rows.
mean_lat, mean_lon = df["latitude"].mean(), df["longitude"].mean()
flight_map = folium.Map(location=[mean_lat, mean_lon], zoom_start=3)

# Only the two coordinate columns are needed for the markers, so sample just
# those instead of materialising full rows with iterrows().
sampled_positions = df[["latitude", "longitude"]].sample(
    n=min(MAP_SAMPLE_SIZE, len(df)),
    random_state=MAP_SAMPLE_SEED,
)

for lat, lon in zip(sampled_positions["latitude"], sampled_positions["longitude"]):
    folium.CircleMarker(
        location=[lat, lon],
        radius=2,
        color="blue",
        fill=True,
        fill_opacity=0.5
    ).add_to(flight_map)

flight_map.save("flight_map.html")
print("Map saved")

### Per-Aircraft Analysis

In [None]:
# Draw the recorded positions of a few example aircraft, one line per
# aircraft, ordered by "time_position".
N_EXAMPLE_AIRCRAFT = 5  # number of distinct aircraft to plot

# Take the first N distinct, non-missing identifiers. The previous expression
# used every unique id found in the first 100 rows, which could be far more
# than the intended five and made the plot unreadable.
icao_examples = df["icao24"].dropna().unique()[:N_EXAMPLE_AIRCRAFT]

# Explicit figure so the lines never land on a figure left over from an
# earlier cell.
fig, ax = plt.subplots()
for icao in icao_examples:
    flight_path = df[df["icao24"] == icao].sort_values("time_position")
    ax.plot(flight_path["longitude"], flight_path["latitude"], marker="o", markersize=4)

ax.set_title("Trajectory of Multiple Aircraft")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
plt.show()

### Correlation Analysis

In [None]:
# Heatmap of the pairwise correlations between the selected numeric columns.
selected_cols = ["velocity", "barometric_altitude", "vertical_rate"]

# Like the other analysis cells, tolerate missing columns instead of raising
# KeyError: correlate only the columns that actually exist.
available_cols = [name for name in selected_cols if name in df.columns]

if len(available_cols) >= 2:
    plt.figure(figsize=(6,4))
    sns.heatmap(df[available_cols].corr(), annot=True, cmap="coolwarm")
    plt.title("Correlation Matrix")
    plt.show()
else:
    # A correlation matrix needs at least two columns to be meaningful.
    print(f"Skipping correlation matrix: need at least 2 of {selected_cols}, found {available_cols}")

### **Save Processed Data in CSV**

In [None]:
# Persist the current state of df (without the index) for later use.
output_path = "cleaned_flight_data.csv"
df.to_csv(output_path, index=False)
print("Cleaned Data saved as cleaned_flight_data.csv")

# **ADVANCED ANALYSIS**

### Anomaly detection

In [80]:
# Flag rows whose "velocity" lies outside the 1.5 * IQR fences.
# Rows with a missing velocity fail both comparisons and are not flagged.
if "velocity" in df.columns:
    velocity = df["velocity"]
    speed_q1 = velocity.quantile(0.25)
    speed_q3 = velocity.quantile(0.75)
    iqr = speed_q3 - speed_q1
    lower = speed_q1 - 1.5*iqr
    upper = speed_q3 + 1.5*iqr
    below_fence = velocity < lower
    above_fence = velocity > upper
    anomalies = df[below_fence | above_fence]
    print(f"Found {len(anomalies)} anomalous flights by speed")

Found 2 anomalous flights by speed


In [83]:
# Flag rows whose "geo_altitude" is more than three standard deviations away
# from the column mean (in either direction).
if "geo_altitude" in df.columns:
    geo_altitude = df["geo_altitude"]
    altitude_mean = geo_altitude.mean()
    altitude_std = geo_altitude.std()
    altitude_high = altitude_mean + 3*altitude_std
    altitude_low = altitude_mean - 3*altitude_std
    is_unusual = (geo_altitude > altitude_high) | (geo_altitude < altitude_low)
    unusual_alt = df[is_unusual]
    print(f"Found {len(unusual_alt)} anomalous flights by altitude")

Found 0 anomalous flights by altitude
