In [3]:
import os
import zipfile
import pandas as pd
import joblib
from dateutil import parser
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)



# Data Preparation

In [23]:
# time categories
def get_time_of_day(dt):
    hour = dt.hour
    if hour >=5 and hour < 8:
        return 'Early Morning'
    elif hour >= 8 and hour < 11:
        return 'Morning'
    elif hour >= 11 and hour < 14:
        return 'Midday'
    elif hour >= 14 and hour < 17:
        return 'Afternoon'
    elif hour >= 17 and hour < 20:
        return 'Evening'
    elif hour >= 20 and hour < 23:
        return 'Night'
    else:
        return 'Late Night'
 
# Remove UTC offset from datetime strings
def remove_utc_offset(datetime_str):
    dt = parser.parse(datetime_str)
    return dt.strftime('%Y-%m-%dT%H:%M:%S.%f')

# function to get features
def getfeatures(df):

    df['departuretime'] = df['segmentsDepartureTimeRaw'].apply(remove_utc_offset) 
    df['departuretime'] = pd.to_datetime(df['departuretime'], utc=False)
    
    # time category
    df['time_category'] = df['departuretime'].apply(get_time_of_day)
    
    # departure date
    df['date'] = (df['departuretime'] - pd.Timedelta(hours=2)).dt.date

    # no. of days from flight
    df['days_from_flight'] = (df['date'] - pd.to_datetime(df['searchDate']).dt.date).dt.days

    return df

def datefeatures(df):
    # Extract the year
    df['year'] = df['date'].dt.year

    # Extract the month
    df['month'] = df['date'].dt.month

    # Extract the day of the week (Monday=0, Sunday=6)
    df['day_of_week'] = df['date'].dt.dayofweek

    # Extract the day of the month
    df['day_of_month'] = df['date'].dt.day

    return df

In [4]:
# Root directory containing subfolders 
root_dir = 'itineraries_csv'

dataframes = []

# Loop through each subfolder
for directory in os.listdir(root_dir):

  if os.path.isdir(os.path.join(root_dir, directory)) and not directory.startswith('.'):
    print(directory)
    folder_path = os.path.join(root_dir, directory)  

    # Loop through zip files
    for filename in os.listdir(folder_path):
      
      if filename.endswith('.zip'):
      
        # Full path of zip file
        zip_path = os.path.join(folder_path, filename)  
        
        # Extract zip contents
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
          zip_ref.extractall(folder_path)
              
          # Parse CSV file in zip 
          csv_path = os.path.join(folder_path, [x for x in zip_ref.namelist() if x.endswith('.csv')][0])      
          df = pd.read_csv(csv_path)
          
          # Apply processing 
          df = df[df['isNonStop'] == True]  
          df = getfeatures(df)
          df = datefeatures(df)

          # median fare
          df = df.groupby(['segmentsDepartureAirportCode', 'segmentsArrivalAirportCode', 'segmentsCabinCode', 'date', 'time_category', 'days_from_flight'])['totalFare'].median().reset_index(name='medianfare')
          
          # Append to list
          dataframes.append(df) 
        
# Combine dataframes
combined_df = pd.concat(dataframes, ignore_index=True)

combined_df.head()

OAK
DEN
LGA
LAX
ATL
CLT
PHL
DTW
IAD
JFK
DFW
BOS
EWR
SFO
ORD
MIA


In [11]:
combined_df.head()

Unnamed: 0,segmentsDepartureAirportCode,segmentsArrivalAirportCode,segmentsCabinCode,date,time_category,days_from_flight,medianfare
0,OAK,DEN,coach,2022-05-20,Evening,31,83.98
1,OAK,DEN,coach,2022-05-21,Evening,32,43.98
2,OAK,DEN,coach,2022-05-22,Evening,33,173.98
3,OAK,LAX,coach,2022-05-20,Afternoon,31,168.61
4,OAK,LAX,coach,2022-05-20,Early Morning,31,100.285


In [25]:
len(combined_df)

1649244

In [9]:
combined_df.to_csv('medianfares.csv', index=False)

In [18]:
combined_df = pd.read_csv('medianfares.csv')

In [19]:
print('Earliest date:', combined_df['date'].min())
print('Latest date:', combined_df['date'].max())

Earliest date: 2022-04-16
Latest date: 2022-07-18


In [20]:
# Define the split date
split_date = pd.Timestamp('2022-06-17')
combined_df['date'] = pd.to_datetime(combined_df['date'])

# Create the train and test sets
train = combined_df.loc[combined_df.date < split_date]
test = combined_df.loc[combined_df.date >= split_date]

print(len(train))
print(len(test))

# Modelling

In [43]:
def fitmodel(model, model_name, train=train, test=test):

    X_train = train.drop(columns='medianfare')
    y_train = train['medianfare']

    X_test = test.drop(columns='medianfare')
    y_test = test['medianfare']

    # Define categorical and numeric columns for preprocessing
    categorical_cols = ['segmentsDepartureAirportCode', 'segmentsArrivalAirportCode', 'segmentsCabinCode', 'time_category']
    numeric_cols = ['year',  'month', 'day_of_week', 'day_of_month', 'days_from_flight']

    # Preprocessing and modeling pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numeric_cols), 
            ('cat', OneHotEncoder(), categorical_cols)
        ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = pipeline.predict(X_test)

    # Assuming you have a scikit-learn pipeline named 'pipeline'
    joblib.dump(pipeline, f'{model_name}.pkl')

    # Evaluate the model
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)

    return pipeline, model_name, rmse, r2, mae, mape

In [44]:
cat = fitmodel(CatBoostRegressor(), 'catboost')

Learning rate set to 0.127437
0:	learn: 178.8832767	total: 38.8ms	remaining: 38.7s
1:	learn: 170.1752199	total: 74ms	remaining: 36.9s
2:	learn: 163.2071162	total: 107ms	remaining: 35.7s
3:	learn: 157.5796433	total: 142ms	remaining: 35.3s
4:	learn: 153.1336637	total: 179ms	remaining: 35.5s
5:	learn: 149.5677644	total: 213ms	remaining: 35.3s
6:	learn: 146.8398058	total: 250ms	remaining: 35.4s
7:	learn: 144.7437824	total: 285ms	remaining: 35.3s
8:	learn: 142.6029095	total: 318ms	remaining: 35s
9:	learn: 141.0711963	total: 353ms	remaining: 34.9s
10:	learn: 139.3966316	total: 390ms	remaining: 35.1s
11:	learn: 137.7504346	total: 427ms	remaining: 35.1s
12:	learn: 136.6428897	total: 464ms	remaining: 35.3s
13:	learn: 135.5752283	total: 504ms	remaining: 35.5s
14:	learn: 134.7283774	total: 536ms	remaining: 35.2s
15:	learn: 133.6428657	total: 570ms	remaining: 35s
16:	learn: 132.6737667	total: 609ms	remaining: 35.2s
17:	learn: 131.8394402	total: 643ms	remaining: 35.1s
18:	learn: 131.1369194	total: 

In [45]:
print(cat)

(Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', 'passthrough',
                                                  ['year', 'month',
                                                   'day_of_week',
                                                   'day_of_month',
                                                   'days_from_flight']),
                                                 ('cat', OneHotEncoder(),
                                                  ['segmentsDepartureAirportCode',
                                                   'segmentsArrivalAirportCode',
                                                   'segmentsCabinCode',
                                                   'time_category'])])),
                ('regressor',
                 <catboost.core.CatBoostRegressor object at 0x7f85f927f9a0>)]), 'catboost', 89.4629559312407, 0.6901487097839116, 62.53999769026451, 0.21341932118001172)


In [46]:
xgboost = fitmodel(XGBRegressor(), 'xgb')

print(xgboost)

(Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', 'passthrough',
                                                  ['year', 'month',
                                                   'day_of_week',
                                                   'day_of_month',
                                                   'days_from_flight']),
                                                 ('cat', OneHotEncoder(),
                                                  ['segmentsDepartureAirportCode',
                                                   'segmentsArrivalAirportCode',
                                                   'segmentsCabinCode',
                                                   'time_category'])])),
                ('regressor',
                 XGBRegressor(base_score=None, booster=None, callbacks=None,
                              cols...
                              feature_types=None, gamma=None, gpu_id=None,
                 

In [48]:
rforest = fitmodel(RandomForestRegressor(), 'rf')

print(rforest)

(Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', 'passthrough',
                                                  ['year', 'month',
                                                   'day_of_week',
                                                   'day_of_month',
                                                   'days_from_flight']),
                                                 ('cat', OneHotEncoder(),
                                                  ['segmentsDepartureAirportCode',
                                                   'segmentsArrivalAirportCode',
                                                   'segmentsCabinCode',
                                                   'time_category'])])),
                ('regressor', RandomForestRegressor())]), 'rf', 98.6780596513176, 0.6230289283508642, 64.68049112683988, 0.21106128587494188)


In [8]:
import json

unique_segments = combined_df[['segmentsDepartureAirportCode', 'segmentsArrivalAirportCode']].drop_duplicates()

# Create a dictionary where each departure airport has a list of arrival airports
airport_dict = {}
for index, row in unique_segments.iterrows():
    dep_airport = row['segmentsDepartureAirportCode']
    arr_airport = row['segmentsArrivalAirportCode']
    if dep_airport in airport_dict:
        airport_dict[dep_airport].append(arr_airport)
    else:
        airport_dict[dep_airport] = [arr_airport]

# Convert dictionary to JSON
json_output = json.dumps(airport_dict, indent=4)

# Printing or using the json_output as needed
print(json_output)


{
    "OAK": [
        "DEN",
        "LAX",
        "PHL",
        "DTW",
        "ORD"
    ],
    "IAD": [
        "SFO",
        "ATL",
        "BOS",
        "CLT",
        "DEN",
        "DFW",
        "DTW",
        "EWR",
        "JFK",
        "LAX",
        "LGA",
        "MIA",
        "ORD"
    ],
    "DEN": [
        "ATL",
        "BOS",
        "CLT",
        "DFW",
        "DTW",
        "EWR",
        "IAD",
        "JFK",
        "LAX",
        "LGA",
        "MIA",
        "OAK",
        "ONT",
        "ORD",
        "PHL",
        "SFO"
    ],
    "LGA": [
        "ATL",
        "BOS",
        "CLT",
        "DEN",
        "DFW",
        "DTW",
        "IAD",
        "MIA",
        "ORD",
        "LAX"
    ],
    "LAX": [
        "ATL",
        "BOS",
        "CLT",
        "DAL",
        "DEN",
        "DFW",
        "DTW",
        "EWR",
        "IAD",
        "JFK",
        "LGA",
        "MIA",
        "OAK",
        "ORD",
        "PHL",
        "SFO"
    ],
   

In [39]:
with open('flightroutes.json', 'w') as file:
    file.write(json_output)

# Catboost with Distance/Duration Features

In [11]:
# Getting all Data

# Root directory containing subfolders 
root_dir = 'itineraries_csv'

dataframes = []

# Loop through each subfolder
for directory in os.listdir(root_dir):

  if os.path.isdir(os.path.join(root_dir, directory)) and not directory.startswith('.'):
    print(directory)
    folder_path = os.path.join(root_dir, directory)  

    # Loop through zip files
    for filename in os.listdir(folder_path):
      
      if filename.endswith('.zip'):
      
        # Full path of zip file
        zip_path = os.path.join(folder_path, filename)  
        
        # Extract zip contents
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
          zip_ref.extractall(folder_path)
              
          # Parse CSV file in zip 
          csv_path = os.path.join(folder_path, [x for x in zip_ref.namelist() if x.endswith('.csv')][0])      
          df = pd.read_csv(csv_path)
          
          # Apply processing 
          df = df[df['isNonStop'] == True]  
          df = getfeatures(df)
          df = datefeatures(df)

          # Append to list
          dataframes.append(df) 
        
# Combine dataframes
all_single = pd.concat(dataframes, ignore_index=True)
all_single.head()

OAK
DEN
LGA
LAX
ATL
CLT
PHL
DTW
IAD
JFK
DFW
BOS
EWR
SFO
ORD
MIA


Unnamed: 0,legId,searchDate,flightDate,startingAirport,destinationAirport,travelDuration,isBasicEconomy,isRefundable,isNonStop,totalFare,totalTravelDistance,segmentsDepartureTimeEpochSeconds,segmentsDepartureTimeRaw,segmentsArrivalTimeEpochSeconds,segmentsArrivalTimeRaw,segmentsArrivalAirportCode,segmentsDepartureAirportCode,segmentsAirlineName,segmentsAirlineCode,segmentsEquipmentDescription,segmentsDurationInSeconds,segmentsDistance,segmentsCabinCode,departuretime,time_category,date,days_from_flight
0,ce6c79e893312e42ddd97e22637ee980,2022-04-19,2022-05-20,OAK,DEN,PT2H33M,False,False,True,83.98,943.0,1653098280,2022-05-20T18:58:00.000-07:00,1653107460,2022-05-20T22:31:00.000-06:00,DEN,OAK,Frontier Airlines,F9,,9180,943.0,coach,2022-05-20 18:58:00,Evening,2022-05-20,31
1,a0bc7fa432fd2475c3dbe17b376b1830,2022-04-19,2022-05-20,OAK,LAX,PT1H22M,False,False,True,42.79,,1653062160,2022-05-20T08:56:00.000-07:00,1653067080,2022-05-20T10:18:00.000-07:00,LAX,OAK,Spirit Airlines,NK,,4920,,coach,2022-05-20 08:56:00,Morning,2022-05-20,31
2,3e224e69b09b19ba9a2562b55e275713,2022-04-19,2022-05-20,OAK,LAX,PT1H22M,False,False,True,66.97,,1653051900,2022-05-20T06:05:00.000-07:00,1653056820,2022-05-20T07:27:00.000-07:00,LAX,OAK,Spirit Airlines,NK,AIRBUS INDUSTRIE A320 SHARKLETS,4920,,coach,2022-05-20 06:05:00,Early Morning,2022-05-20,31
3,d89f49d3dd5a77fec2fabbcc258ef0f3,2022-04-19,2022-05-20,OAK,LAX,PT1H24M,False,False,True,133.6,338.0,1653056160,2022-05-20T07:16:00.000-07:00,1653061200,2022-05-20T08:40:00.000-07:00,LAX,OAK,Delta,DL,Embraer 175 (Enhanced Winglets),5040,338.0,coach,2022-05-20 07:16:00,Early Morning,2022-05-20,31
4,b22bb0ae76e6f92840354d33d614b922,2022-04-19,2022-05-20,OAK,LAX,PT1H21M,False,False,True,168.61,338.0,1653087840,2022-05-20T16:04:00.000-07:00,1653092700,2022-05-20T17:25:00.000-07:00,LAX,OAK,Delta,DL,Embraer 175 (Enhanced Winglets),4860,338.0,coach,2022-05-20 16:04:00,Afternoon,2022-05-20,31


In [25]:
# Duration and Distance Features

median_duration = all_single.groupby(['segmentsDepartureAirportCode','segmentsArrivalAirportCode', 'time_category'])['segmentsDurationInSeconds'].median().reset_index(name='median_duration')

median_distance = all_single[all_single['segmentsDistance'] != 'None']
median_distance = median_distance.groupby(['segmentsDepartureAirportCode','segmentsArrivalAirportCode', 'time_category'])['segmentsDistance'].median().reset_index(name='median_distance')

combined_df = pd.merge(combined_df, median_duration, how='left', on=['segmentsDepartureAirportCode', 'segmentsArrivalAirportCode', 'time_category'])
combined_df = pd.merge(combined_df, median_distance, how='left', on=['segmentsDepartureAirportCode', 'segmentsArrivalAirportCode', 'time_category'])

combined_df.head()

Unnamed: 0,segmentsDepartureAirportCode,segmentsArrivalAirportCode,segmentsCabinCode,date,time_category,days_from_flight,medianfare,year,month,day_of_week,day_of_month
0,OAK,DEN,coach,2022-05-20,Evening,31,83.98,2022,5,4,20
1,OAK,DEN,coach,2022-05-21,Evening,32,43.98,2022,5,5,21
2,OAK,DEN,coach,2022-05-22,Evening,33,173.98,2022,5,6,22
3,OAK,LAX,coach,2022-05-20,Afternoon,31,168.61,2022,5,4,20
4,OAK,LAX,coach,2022-05-20,Early Morning,31,100.285,2022,5,4,20


In [31]:
# Define the split date
split_date = pd.Timestamp('2022-06-17')
combined_df['date'] = pd.to_datetime(combined_df['date'])

# Create the train and test sets
train = combined_df.loc[combined_df.date < split_date]
test = combined_df.loc[combined_df.date >= split_date]

print(len(train))
print(len(test))

1321239
328005


In [35]:
# Define categorical and numeric columns for preprocessing
categorical_cols = ['segmentsDepartureAirportCode', 'segmentsArrivalAirportCode', 'segmentsCabinCode', 'time_category']
numeric_cols = ['year',  'month', 'day_of_week', 'day_of_month', 'days_from_flight', 'median_duration', 'median_distance']

def fitmodel2(model, model_name, train=train, test=test):

    X_train = train.drop(columns='medianfare')
    y_train = train['medianfare']

    X_test = test.drop(columns='medianfare')
    y_test = test['medianfare']

    # Preprocessing and modeling pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numeric_cols), 
            ('cat', OneHotEncoder(), categorical_cols)
        ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = pipeline.predict(X_test)

    # Assuming you have a scikit-learn pipeline named 'pipeline'
    joblib.dump(pipeline, f'{model_name}.pkl')

    # Evaluate the model
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)

    return pipeline, model_name, rmse, r2, mae, mape

In [36]:
cat2 = fitmodel2(CatBoostRegressor(), 'catboost2')
print(cat2)

Learning rate set to 0.127437
0:	learn: 176.4885393	total: 163ms	remaining: 2m 42s
1:	learn: 165.6213647	total: 213ms	remaining: 1m 46s
2:	learn: 156.7693525	total: 263ms	remaining: 1m 27s
3:	learn: 149.5701692	total: 312ms	remaining: 1m 17s
4:	learn: 143.7220964	total: 373ms	remaining: 1m 14s
5:	learn: 139.0529583	total: 437ms	remaining: 1m 12s
6:	learn: 135.1760285	total: 488ms	remaining: 1m 9s
7:	learn: 132.0802373	total: 544ms	remaining: 1m 7s
8:	learn: 129.5849659	total: 595ms	remaining: 1m 5s
9:	learn: 127.4678092	total: 656ms	remaining: 1m 4s
10:	learn: 125.7560730	total: 712ms	remaining: 1m 3s
11:	learn: 124.2226336	total: 762ms	remaining: 1m 2s
12:	learn: 122.8085946	total: 826ms	remaining: 1m 2s
13:	learn: 121.6664481	total: 892ms	remaining: 1m 2s
14:	learn: 120.6415272	total: 950ms	remaining: 1m 2s
15:	learn: 119.7708667	total: 1s	remaining: 1m 1s
16:	learn: 119.0111153	total: 1.05s	remaining: 1m 1s
17:	learn: 118.3005316	total: 1.12s	remaining: 1m 1s
18:	learn: 117.6003735	