In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Flight-level training table; the cells below add feature columns to it
df_train_prepro = pd.read_csv(
    filepath_or_buffer='data/preprocessed_train_data_with_date_hol_concat.csv'
)
# Lookup table used below to translate aircraft_code into aircraft_model
df_aircraft_model = pd.read_csv(
    filepath_or_buffer='data/full_aircraft_model_mapping_CORRECTED.csv'
)
# Passenger counts, one row per IATA_Code with one column per year (2016-2018)
df_passenger = pd.read_csv(filepath_or_buffer='data/passenger_data.csv')

# Map aircraft model to main df

In [3]:
# Lookup Series: index is aircraft_code, values are aircraft_model
aircraft_model_by_code = df_aircraft_model.set_index('aircraft_code')['aircraft_model']

# Translate every flight's aircraft_code through the lookup; codes that are
# not present in the lookup come out as NaN
df_train_prepro['aircraft_model'] = df_train_prepro['aircraft_code'].map(aircraft_model_by_code)

# Map passenger data to main df

In [4]:
# Attach to every flight the passenger count of its departure airport for the
# year of departure, as the column `num_passenger_year`.

# Ensure departure_date is datetime so the year can be read via the .dt accessor
df_train_prepro['departure_date'] = pd.to_datetime(df_train_prepro['departure_date'])

# Temporary join key: calendar year of departure (dropped again after the merge)
df_train_prepro['year'] = df_train_prepro['departure_date'].dt.year

# Melt df_passenger from one column per year to one row per (IATA_Code, year)
# so it can be joined on both keys
df_passenger_long = df_passenger.melt(
    id_vars=['IATA_Code'],
    value_vars=['2016 Passengers', '2017 Passengers', '2018 Passengers'],
    var_name='year_col',
    value_name='num_passenger_year'
)
# expand=False makes str.extract return a Series (not a one-column DataFrame),
# which is what a single-column assignment expects
df_passenger_long['year'] = (
    df_passenger_long['year_col'].str.extract(r'(\d{4})', expand=False).astype(int)
)

# Merge passenger info into flight data (use 'departure_point' instead of 'IATA_Code')
# NOTE(review): a duplicated IATA_Code in df_passenger would duplicate flight
# rows here -- consider validate='many_to_one' once the lookup is known to be unique.
df_train_prepro = (
    df_train_prepro
    # Keeps the cell re-runnable: without this, a second run merges onto a frame
    # that already has num_passenger_year and yields num_passenger_year_x / _y
    .drop(columns=['num_passenger_year'], errors='ignore')
    .merge(
        df_passenger_long[['IATA_Code', 'year', 'num_passenger_year']],
        left_on=['departure_point', 'year'],
        right_on=['IATA_Code', 'year'],
        how='left',  # keep every flight; unmatched (airport, year) pairs get NaN
    )
    # Drop the temporary join columns
    .drop(columns=['year', 'IATA_Code'])
)

# Add flight distance (km) to main df

In [5]:
import airportsdata

# Load the airports database, requesting IATA codes as the lookup key.
# get_lat_lon looks codes up with airports.get(code) and reads the
# 'lat' / 'lon' fields of the returned entry.
airports = airportsdata.load('IATA')

def get_lat_lon(iata):
    """Look up the coordinates of one airport code in `airports`.

    Returns a two-element Series indexed by 'lat' and 'lon'. Both entries are
    None when the code has no (truthy) entry in `airports`.
    """
    record = airports.get(iata)
    if not record:
        # Unknown code: keep the same shape so callers can still expand to columns
        return pd.Series({'lat': None, 'lon': None})
    return pd.Series({'lat': record['lat'], 'lon': record['lon']})

# Resolve coordinates for both endpoints of every flight; each result is a
# frame with 'lat' and 'lon' columns aligned to df_train_prepro's index
dep_coords = df_train_prepro['departure_point'].apply(get_lat_lon)
arr_coords = df_train_prepro['arrival_point'].apply(get_lat_lon)

# Copy them onto the main frame (departure first, then arrival)
df_train_prepro['dep_lat'], df_train_prepro['dep_long'] = dep_coords['lat'], dep_coords['lon']
df_train_prepro['arr_lat'], df_train_prepro['arr_long'] = arr_coords['lat'], arr_coords['lon']

# Hard-coded coordinates for SXF. They are written to every SXF row,
# replacing whatever the lookup above returned for that code.
sxf_lat, sxf_long = 52.380001, 13.522500
sxf_position = [sxf_lat, sxf_long]
departs_from_sxf = df_train_prepro['departure_point'] == 'SXF'
arrives_at_sxf = df_train_prepro['arrival_point'] == 'SXF'
df_train_prepro.loc[departs_from_sxf, ['dep_lat', 'dep_long']] = sxf_position
df_train_prepro.loc[arrives_at_sxf, ['arr_lat', 'arr_long']] = sxf_position

In [11]:
# Haversine formula to calculate distance in km
def haversine(lat1, lon1, lat2, lon2, radius_km=6371):
    """Great-circle distance between two points given in decimal degrees.

    Works element-wise through NumPy ufuncs, so the coordinates may be scalars,
    NumPy arrays or pandas Series.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point, in degrees.
    lat2, lon2 : latitude and longitude of the second point, in degrees.
    radius_km : radius of the sphere; defaults to 6371 (Earth, in kilometres).

    Returns
    -------
    The distance in the same unit as `radius_km`. NaN coordinates give NaN.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Floating-point rounding can push `a` a hair above 1 for (near-)antipodal
    # points, which would make arcsin return NaN; cap it at 1.
    c = 2 * np.arcsin(np.sqrt(np.minimum(a, 1.0)))
    return radius_km * c

# Great-circle distance in km between departure and arrival for each row
distance_km = haversine(
    df_train_prepro['dep_lat'],
    df_train_prepro['dep_long'],
    df_train_prepro['arr_lat'],
    df_train_prepro['arr_long']
)

# Store the distance as an integer number of hundredths of a kilometre.
# NOTE: despite its name, the column therefore holds km * 100, and rows whose
# distance is NaN are stored as 0.
# Scale first and round to the nearest integer before casting: the previous
# round(2) * 100 left values such as 28.999999999999996, which astype(int)
# truncated to 28 instead of 29.
df_train_prepro['distance_km'] = (distance_km * 100).round().fillna(0).astype(int)