In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from meteostat import Stations, Daily
from tqdm import tqdm
import time
warnings.filterwarnings("ignore")

In [2]:
def load_csv_file(filename):
    return pd.read_csv(filename)
fd_train = load_csv_file("data/Train.csv")
fd_test =  load_csv_file("data/Test.csv")

In [3]:
fd_train.describe()

Unnamed: 0,target
count,107833.0
mean,48.733013
std,117.135562
min,0.0
25%,0.0
50%,14.0
75%,43.0
max,3451.0


In [4]:
def get_commmon_columns():
    return ['id', 'date', 'flight_id', 'departure_point', 'arrival_point', 'departure_time', 'arrival_time', 'flight_status', 'aircraft_code']

def rename_colums(fd_train, extra_columns):
    columns = get_commmon_columns() + extra_columns
    fd_train.columns = columns
    return fd_train

fd_train = rename_colums(fd_train, ["target"])
fd_test = rename_colums(fd_test,[])

In [10]:
def add_column_duration(fd_train):
    fd_train['departure_time'] = pd.to_datetime(fd_train['departure_time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
    fd_train['arrival_time'] = pd.to_datetime(fd_train['arrival_time'], format='%Y-%m-%d %H.%M.%S', errors='coerce')
    fd_train['duration'] = (pd.to_datetime(fd_train['arrival_time']) - pd.to_datetime(fd_train['departure_time'])).dt.total_seconds()
    return fd_train

In [11]:
def get_season(month):
    """Map month to season number."""
    if month in [12, 1, 2]:
        return 1  # Winter
    elif month in [3, 4, 5]:
        return 2  # Spring
    elif month in [6, 7, 8]:
        return 3  # Summer
    else:
        return 4  # Fall

def get_time_of_day(hour):
    """Categorize hour into time of day."""
    if 5 <= hour < 12:
        return 'morning'
    elif 12 <= hour < 17:
        return 'afternoon'
    elif 17 <= hour < 21:
        return 'evening'
    else:
        return 'night'

def add_datetime_features(df):
    df['departure_time_formated'] = pd.to_datetime(df['departure_time'], errors='coerce')
    df['arrival_time_formated'] = pd.to_datetime(df['arrival_time'], errors='coerce')

    # Duration in seconds
    # df['duration'] = (df['arrival_time'] - df['departure_time']).dt.total_seconds()

    # Departure Features
    df['dep_hour'] = df['departure_time_formated'].dt.hour
    df['dep_day'] = df['departure_time_formated'].dt.day
    df['dep_month'] = df['departure_time_formated'].dt.month
    df['dep_dayofweek'] = df['departure_time_formated'].dt.dayofweek + 1  # Make it 1 (Mon) to 7 (Sun)
    df['dep_quarter'] = df['departure_time_formated'].dt.quarter
    df['dep_season'] = df['dep_month'].apply(get_season)
    df['dep_is_weekend'] = df['dep_dayofweek'].isin([6, 7]).astype(int)
    df['dep_time_of_day'] = df['dep_hour'].apply(get_time_of_day)

    # Arrival Features
    df['arr_hour'] = df['arrival_time_formated'].dt.hour
    df['arr_day'] = df['arrival_time_formated'].dt.day
    df['arr_month'] = df['arrival_time_formated'].dt.month
    df['arr_dayofweek'] = df['arrival_time_formated'].dt.dayofweek + 1
    df['arr_quarter'] = df['arrival_time_formated'].dt.quarter
    df['arr_season'] = df['arr_month'].apply(get_season)
    df['arr_is_weekend'] = df['arr_dayofweek'].isin([6, 7]).astype(int)
    df['arr_time_of_day'] = df['arr_hour'].apply(get_time_of_day)

    return df

In [12]:
def add_route_column(df):
    df['route'] = df['departure_point'].astype(str) + ' → ' + df['arrival_point'].astype(str)
    return df

In [15]:
def add_weather_features(df):
    df['departure_time'] = pd.to_datetime(df['departure_time'], errors='coerce')
    df['arrival_time'] = pd.to_datetime(df['arrival_time'], errors='coerce')

    df['departure_date'] = df['departure_time'].dt.date
    df['arrival_date'] = df['arrival_time'].dt.date

    # STEP 2: Load IATA → coordinates from OurAirports
    airports_df = pd.read_csv("data/airports.csv")  # From https://ourairports.com/data/
    iata_df = airports_df[['iata_code', 'latitude_deg', 'longitude_deg']].dropna()
    iata_coords = dict(zip(iata_df['iata_code'], zip(iata_df['latitude_deg'], iata_df['longitude_deg'])))

    # STEP 3: Find closest station for each IATA code (only once)
    unique_iatas = set(df['departure_point'].unique()) | set(df['arrival_point'].unique())
    iata_station = {}

    print("🔍 Finding nearest stations for all unique airports...")
    for iata in tqdm(unique_iatas, desc="Mapping IATA to Station"):
        coord = iata_coords.get(iata)
        if not coord:
            iata_station[iata] = None
            continue
        lat, lon = coord
        try:
            stations = Stations().nearby(lat, lon)
            station = stations.fetch(1)
            if not station.empty:
                iata_station[iata] = station.index[0]
            else:
                iata_station[iata] = None
        except:
            iata_station[iata] = None
        time.sleep(1)
    # STEP 4: Create weather fetch helper
    def fetch_weather(iata, date):
        station_id = iata_station.get(iata)
        if not station_id or pd.isna(date):
            return None
        date = pd.to_datetime(date)
        try:
            weather = Daily(station_id, date, date).fetch()
            if not weather.empty:
                w = weather.iloc[0]
                return {
                    'temp': w['tavg'],
                    'precip': w['prcp'],
                    'wind': w['wspd'],
                    'snow': w['snow'],
                    'weather_code': w.get('coco')
                }
        except:
            return None
        return None

    # STEP 5: Build unique (iata, date) pairs for both dep/arr
    dep_requests = df[['departure_point', 'departure_date']].dropna().drop_duplicates()
    arr_requests = df[['arrival_point', 'arrival_date']].dropna().drop_duplicates()

    # Convert to records and tag
    dep_requests['type'] = 'dep'
    arr_requests['type'] = 'arr'
    dep_requests.columns = ['iata', 'date', 'type']
    arr_requests.columns = ['iata', 'date', 'type']
    all_requests = pd.concat([dep_requests, arr_requests]).drop_duplicates()

    # STEP 6: Fetch weather for all combinations
    weather_records = []
    print("🌦 Fetching weather data for all airport-date combinations...")
    for _, row in tqdm(all_requests.iterrows(), total=all_requests.shape[0], desc="Weather Fetching"):
        iata = row['iata']
        date = row['date']
        w = fetch_weather(iata, date)
        if w:
            weather_records.append({
                'iata': iata,
                'date': date,
                'type': row['type'],
                **w
            })

    weather_df = pd.DataFrame(weather_records)
    # STEP 7: Merge weather back into main DataFrame
    # Separate dep and arr weather
    dep_weather = weather_df[weather_df['type'] == 'dep'].drop(columns='type')
    arr_weather = weather_df[weather_df['type'] == 'arr'].drop(columns='type')

    dep_weather.columns = ['departure_point', 'departure_date'] + [f'dep_{col}' for col in dep_weather.columns[2:]]
    arr_weather.columns = ['arrival_point', 'arrival_date'] + [f'arr_{col}' for col in arr_weather.columns[2:]]

    df = df.merge(dep_weather, on=['departure_point', 'departure_date'], how='left')
    df = df.merge(arr_weather, on=['arrival_point', 'arrival_date'], how='left')

    # STEP 8: Save output
    return df

In [16]:
def immute_weather_features(df):
    # 1. Remove weather code columns
    df = df.drop(columns=['dep_weather_code', 'arr_weather_code','dep_snow','arr_snow'], errors='ignore')

    # 2. Fill snow columns with 0 (assume no snow if not reported)
    # df['dep_snow'] = df['dep_snow'].fillna(0)
    # df['arr_snow'] = df['arr_snow'].fillna(0)

    # 3. TEMP — Impute by month and location
    df['dep_temp'] = df.groupby(['departure_point', 'dep_month'])['dep_temp'].transform(lambda x: x.fillna(x.median()))
    df['arr_temp'] = df.groupby(['arrival_point', 'arr_month'])['arr_temp'].transform(lambda x: x.fillna(x.median()))

    # 4. WIND — Impute by hour and location
    df['dep_wind'] = df.groupby(['departure_point', 'dep_hour'])['dep_wind'].transform(lambda x: x.fillna(x.median()))
    df['arr_wind'] = df.groupby(['arrival_point', 'arr_hour'])['arr_wind'].transform(lambda x: x.fillna(x.median()))

    # 5. PRECIP — Impute by time_of_day and location
    df['dep_precip'] = df.groupby(['departure_point', 'dep_time_of_day'])['dep_precip'].transform(lambda x: x.fillna(x.median()))
    df['arr_precip'] = df.groupby(['arrival_point', 'arr_time_of_day'])['arr_precip'].transform(lambda x: x.fillna(x.median()))

    # 6. Fallbacks
    df['dep_precip'] = df['dep_precip'].fillna(0)
    df['arr_precip'] = df['arr_precip'].fillna(0)

    for col in ['dep_temp', 'dep_wind', 'arr_temp', 'arr_wind']:
        df[col] = df[col].fillna(df[col].median())
    return df

In [17]:
def prepared_train_data(final_df):
    final_df = add_column_duration(fd_train)
    ## Select records which has less than 20hrs long duration
    ## Filter records which have delay(target) between 1minute to 3hrs
    final_df = add_datetime_features(final_df)
    final_df = add_route_column(final_df)
    final_df = fd_train[fd_train["duration"] < 68400]
    df_non_zeros_without_outliers = final_df[(final_df['target'] > 0) & (final_df['target'] < 165)]
    all_zeros = final_df[final_df['target'] == 0]

    # Create a copy of all_zeros and assign random negative values between -1 and -165 to 'target'
    all_zeros_negative = all_zeros.copy()
    positive_targets = df_non_zeros_without_outliers[df_non_zeros_without_outliers['target'] > 0]['target'].values

    all_zeros_negative['target'] = -np.random.choice(positive_targets, size=len(all_zeros_negative), replace=True)
    all_zeros_pos_neg = pd.concat([all_zeros_negative, df_non_zeros_without_outliers], axis=0, ignore_index=True)
    all_zeros_pos_neg.drop(["id", "flight_id", "date"], inplace=True, axis=1)
    all_zeros_pos_neg = add_weather_features(all_zeros_pos_neg)
    all_zeros_pos_neg = immute_weather_features(all_zeros_pos_neg)
    return all_zeros_pos_neg

In [13]:
df = prepared_train_data(fd_train)
df

🔍 Finding nearest stations for all unique airports...


Mapping IATA to Station: 100%|██████████| 133/133 [02:21<00:00,  1.07s/it]


🌦 Fetching weather data for all airport-date combinations...


Weather Fetching: 100%|██████████| 64625/64625 [39:06<00:00, 27.54it/s]   


Unnamed: 0,departure_point,arrival_point,departure_time,arrival_time,flight_status,aircraft_code,target,duration,dep_hour,dep_day,...,arr_time_of_day,route,departure_date,arrival_date,dep_temp,dep_precip,dep_wind,arr_temp,arr_precip,arr_wind
0,TUN,IST,2016-01-16 04:10:00,2016-01-16 06:45:00,ATA,TU 32AIMN,-37.0,9300.0,4,16,...,morning,TUN → IST,2016-01-16,2016-01-16,9.4,9.9,23.6,12.5,0.0,16.5
1,DJE,NTE,2016-01-17 14:10:00,2016-01-17 17:00:00,ATA,TU 736IOK,-15.0,10200.0,14,17,...,evening,DJE → NTE,2016-01-17,2016-01-17,11.7,0.0,41.2,2.7,2.6,8.8
2,TUN,MED,2016-01-20 19:40:00,2016-01-21 00:00:00,ATA,TU 320IMR,-115.0,15600.0,19,20,...,night,TUN → MED,2016-01-20,2016-01-21,11.1,1.3,6.8,22.3,23.9,11.8
3,IST,TUN,2016-01-21 20:10:00,2016-01-21 23:00:00,ATA,TU 320IMU,-25.0,10200.0,20,21,...,night,IST → TUN,2016-01-21,2016-01-21,5.2,0.0,10.8,10.3,0.0,5.0
4,CMN,TUN,2016-01-22 17:45:00,2016-01-22 20:10:00,ATA,TU 320IMR,-19.0,8700.0,17,22,...,evening,CMN → TUN,2016-01-22,2016-01-22,16.0,0.0,9.8,10.6,2.0,4.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99742,TUN,DJE,2018-04-18 08:20:00,2018-04-18 09:10:00,ATA,TU 31BIMQ,8.0,3000.0,8,18,...,morning,TUN → DJE,2018-04-18,2018-04-18,16.8,0.0,14.2,17.1,0.0,20.2
99743,ORY,DJE,2018-12-05 10:15:00,2018-12-05 13:05:00,ATA,TU 736IOL,20.0,10200.0,10,5,...,afternoon,ORY → DJE,2018-12-05,2018-12-05,9.6,1.2,10.7,17.3,0.0,12.5
99744,BRU,DJE,2018-12-05 09:45:00,2018-12-05 12:50:00,ATA,TU 736IOR,15.0,11100.0,9,5,...,afternoon,BRU → DJE,2018-12-05,2018-12-05,19.1,0.0,12.4,17.3,0.0,12.5
99745,ORY,TUN,2018-12-04 18:30:00,2018-12-04 20:55:00,ATA,TU 32AIMI,22.0,8700.0,18,4,...,evening,ORY → TUN,2018-12-04,2018-12-04,10.2,1.2,8.2,16.7,0.0,19.6


In [None]:
df.to_csv("data/preprocessed_train_data_with_date_new.csv", index=False)

In [18]:
def prepared_test_data(final_test_df):
    final_df = final_test_df.copy()
    final_df.drop(["flight_id", "date"], inplace=True, axis=1)
    final_df = add_column_duration(final_df)
    final_df = add_datetime_features(final_df)
    final_df = add_route_column(final_df)
    final_df = add_weather_features(final_df)
    final_df = immute_weather_features(final_df)
    return final_df

In [19]:
df_test = prepared_test_data(fd_test)

🔍 Finding nearest stations for all unique airports...


Mapping IATA to Station: 100%|██████████| 84/84 [01:26<00:00,  1.03s/it]


🌦 Fetching weather data for all airport-date combinations...


Weather Fetching: 100%|██████████| 5762/5762 [01:50<00:00, 52.03it/s]


In [20]:
df_test

Unnamed: 0,id,departure_point,arrival_point,departure_time,arrival_time,flight_status,aircraft_code,duration,departure_time_formated,arrival_time_formated,...,arr_time_of_day,route,departure_date,arrival_date,dep_temp,dep_precip,dep_wind,arr_temp,arr_precip,arr_wind
0,test_id_0,DJE,TUN,2016-05-04 06:40:00,2016-05-04 07:30:00,ATA,TU 32AIMF,3000.0,2016-05-04 06:40:00,2016-05-04 07:30:00,...,morning,DJE → TUN,2016-05-04,2016-05-04,21.2,0.0,12.7,18.7,0.0,21.10
1,test_id_1,TUN,BKO,2016-05-05 15:20:00,2016-05-05 20:05:00,ATA,TU 320IMW,17100.0,2016-05-05 15:20:00,2016-05-05 20:05:00,...,evening,TUN → BKO,2016-05-05,2016-05-05,18.4,0.0,9.7,34.4,0.0,13.10
2,test_id_2,FRA,TUN,2016-05-06 10:00:00,2016-05-06 12:25:00,ATA,TU 32AIMC,8700.0,2016-05-06 10:00:00,2016-05-06 12:25:00,...,afternoon,FRA → TUN,2016-05-06,2016-05-06,17.0,0.0,7.9,19.7,0.0,18.20
3,test_id_3,BEY,TUN,2016-05-11 09:40:00,2016-05-11 13:10:00,ATA,TU 31BIMO,12600.0,2016-05-11 09:40:00,2016-05-11 13:10:00,...,afternoon,BEY → TUN,2016-05-11,2016-05-11,21.3,0.0,10.3,23.1,0.0,15.10
4,test_id_4,ORY,MIR,2016-05-11 09:50:00,2016-05-11 12:35:00,ATA,TU 736IOQ,9900.0,2016-05-11 09:50:00,2016-05-11 12:35:00,...,afternoon,ORY → MIR,2016-05-11,2016-05-11,14.6,4.0,4.8,19.9,0.0,13.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9328,test_id_9328,TUN,NCE,2018-09-12 14:15:00,2018-09-12 15:45:00,ATA,TU 320IMV,5400.0,2018-09-12 14:15:00,2018-09-12 15:45:00,...,afternoon,TUN → NCE,2018-09-12,2018-09-12,25.9,0.0,8.6,24.4,0.0,10.50
9329,test_id_9329,TUN,TUN,2018-09-27 22:00:00,2018-09-28 01:00:00,SCH,TU 32AIMG,10800.0,2018-09-27 22:00:00,2018-09-28 01:00:00,...,night,TUN → TUN,2018-09-27,2018-09-28,21.3,14.5,18.0,21.3,0.0,16.10
9330,test_id_9330,SJJ,TUN,2018-09-03 09:20:00,2018-09-03 11:10:00,SCH,TU CR9ISA,6600.0,2018-09-03 09:20:00,2018-09-03 11:10:00,...,morning,SJJ → TUN,2018-09-03,2018-09-03,18.9,0.0,5.1,27.1,0.0,9.20
9331,test_id_9331,TUN,DJE,2018-09-15 14:30:00,2018-09-15 15:30:00,SCH,UG AT7LBD,3600.0,2018-09-15 14:30:00,2018-09-15 15:30:00,...,afternoon,TUN → DJE,2018-09-15,2018-09-15,26.4,0.8,8.4,27.9,0.0,14.80


In [29]:
test_airports = df_test['departure_point'].unique()
test_airports.sort()
test_airports = pd.DataFrame(test_airports, columns=['departure_point'])
test_airports

Unnamed: 0,departure_point
0,ABJ
1,ALG
2,AMS
3,ATH
4,AYT
...,...
77,VKO
78,VNO
79,VRN
80,YUL


In [28]:
train_airports = df['departure_point'].unique()
train_airports.sort()
train_airports = pd.DataFrame(train_airports, columns=['departure_point'])
train_airports

Unnamed: 0,departure_point
0,AAE
1,AAL
2,ABJ
3,ADB
4,AHU
...,...
123,VKO
124,VNO
125,VRN
126,YUL


In [35]:
test_airports_arrival = df_test['arrival_point'].unique()
test_airports_arrival.sort()
test_airports_arrival = pd.DataFrame(test_airports, columns=['departure_point'])
test_airports_arrival

Unnamed: 0,departure_point
0,ABJ
1,ALG
2,AMS
3,ATH
4,AYT
...,...
77,VKO
78,VNO
79,VRN
80,YUL


In [36]:
train_airports_arrival = df['arrival_point'].unique()
train_airports_arrival.sort()
train_airports_arrival = pd.DataFrame(train_airports, columns=['departure_point'])
train_airports_arrival

Unnamed: 0,departure_point
0,AAE
1,AAL
2,ABJ
3,ADB
4,AHU
...,...
123,VKO
124,VNO
125,VRN
126,YUL


In [30]:
# Check which test airports are NOT in train airports
missing_airports = set(test_airports['departure_point']) - set(train_airports['departure_point'])

print("Airports in test not in train:", missing_airports)

Airports in test not in train: {'MTS', 'BGF'}


In [37]:
# Check which test airports are NOT in train airports
missing_airports = set(test_airports_arrival['departure_point']) - set(train_airports_arrival['departure_point'])

print("Airports in test not in train:", missing_airports)

Airports in test not in train: {'MTS', 'BGF'}


In [31]:
test_airports['in_train'] = test_airports['departure_point'].isin(train_airports['departure_point'])
print(test_airports)

   departure_point  in_train
0              ABJ      True
1              ALG      True
2              AMS      True
3              ATH      True
4              AYT      True
..             ...       ...
77             VKO      True
78             VNO      True
79             VRN      True
80             YUL      True
81             ZRH      True

[82 rows x 2 columns]


In [34]:
not_in = test_airports[test_airports['in_train'] == False]
not_in

Unnamed: 0,departure_point,in_train
8,BGF,False
51,MTS,False


In [None]:
test_airports.to_csv("data/test_airports.csv", index=False)

In [21]:
df_test.to_csv("data/preprocessed_test_data_with_date_test.csv", index=False)

In [20]:
df.head()

Unnamed: 0,departure_point,arrival_point,departure_time,arrival_time,flight_status,aircraft_code,target,duration,dep_hour,dep_day,...,arr_time_of_day,route,departure_date,arrival_date,dep_temp,dep_precip,dep_wind,arr_temp,arr_precip,arr_wind
0,TUN,IST,2016-01-16 04:10:00,2016-01-16 06:45:00,ATA,TU 32AIMN,-19.0,9300.0,4,16,...,morning,TUN → IST,2016-01-16,2016-01-16,9.4,9.9,23.6,12.5,0.0,16.5
1,DJE,NTE,2016-01-17 14:10:00,2016-01-17 17:00:00,ATA,TU 736IOK,-48.0,10200.0,14,17,...,evening,DJE → NTE,2016-01-17,2016-01-17,11.7,0.0,41.2,2.7,2.6,8.8
2,TUN,MED,2016-01-20 19:40:00,2016-01-21 00:00:00,ATA,TU 320IMR,-16.0,15600.0,19,20,...,night,TUN → MED,2016-01-20,2016-01-21,11.1,1.3,6.8,22.3,23.9,11.8
3,IST,TUN,2016-01-21 20:10:00,2016-01-21 23:00:00,ATA,TU 320IMU,-8.0,10200.0,20,21,...,night,IST → TUN,2016-01-21,2016-01-21,5.2,0.0,10.8,10.3,0.0,5.0
4,CMN,TUN,2016-01-22 17:45:00,2016-01-22 20:10:00,ATA,TU 320IMR,-37.0,8700.0,17,22,...,evening,CMN → TUN,2016-01-22,2016-01-22,16.0,0.0,9.8,10.6,2.0,4.6
