## Data Load (~10 min)

In [1]:
!pip install kaggle
!kaggle datasets download -d dilwong/flightprices

Dataset URL: https://www.kaggle.com/datasets/dilwong/flightprices
License(s): Attribution 4.0 International (CC BY 4.0)
flightprices.zip: Skipping, found more recently modified local copy (use --force to force download)


In [2]:
# Extract the downloaded archive; -n never overwrites files that already exist,
# so re-running this cell is safe.
!unzip -n flightprices.zip

Archive:  flightprices.zip


In [3]:
!pip install pyspark



In [4]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pyspark.sql.functions as F
# Restart Spark cleanly: stop a context left over from an earlier run of this
# cell before creating a new one.
try:
  sc.stop()
except NameError:
  # Fresh kernel: `sc` has not been defined yet, so there is nothing to stop.
  # Only this case is ignored; any other failure is allowed to surface.
  pass
sc = SparkContext()
sqlContext = SQLContext(sc)



In [6]:
# Obtain the SparkSession (created if none exists yet).
# NOTE(review): `ss` is not referenced by any later cell visible here —
# confirm it is still needed.
ss = SparkSession.builder.getOrCreate()

In [7]:
# Load the itineraries CSV, taking the column names from its first row.
df = sqlContext.read.option('header', True).csv('itineraries.csv')

In [8]:
# Print a preview of the loaded rows to sanity-check the header and columns.
df.show()

+--------------------+----------+----------+---------------+------------------+-------------+--------------+-----------+--------------+------------+---------+--------+---------+--------------+-------------------+---------------------------------+------------------------+-------------------------------+----------------------+--------------------------+----------------------------+--------------------+-------------------+----------------------------+-------------------------+----------------+-----------------+
|               legId|searchDate|flightDate|startingAirport|destinationAirport|fareBasisCode|travelDuration|elapsedDays|isBasicEconomy|isRefundable|isNonStop|baseFare|totalFare|seatsRemaining|totalTravelDistance|segmentsDepartureTimeEpochSeconds|segmentsDepartureTimeRaw|segmentsArrivalTimeEpochSeconds|segmentsArrivalTimeRaw|segmentsArrivalAirportCode|segmentsDepartureAirportCode| segmentsAirlineName|segmentsAirlineCode|segmentsEquipmentDescription|segmentsDurationInSeconds|segments

## Pipeline (Imputer + StringIndexer + OneHotEncoder + VectorAssembler) (~40 min)

In [9]:
# Special handling for the travel duration variable:
# Values are ISO 8601-style duration strings such as 'PT2H30M' or 'P1DT2H30M'.
# 'P' (period) and 'T' (time) are designators that carry no quantity.
# The day/hour/minute parts (marked by 'D', 'H', 'M') are each optional.

def tD_proc(entry):
  """Convert a travel-duration string into a whole number of minutes.

  Args:
    entry: duration text such as 'PT2H30M' or 'P1DT5M'. May be None.

  Returns:
    The duration in minutes as an int, or None when `entry` is not a string
    or contains no day/hour/minute component. None (rather than 0) marks the
    value as missing, so a downstream imputer can fill it instead of treating
    it as a real zero-minute trip.
  """
  # Null cells reach the function as None; report them as missing instead of
  # letting the regex call raise a TypeError.
  if not isinstance(entry, str):
    return None

  # Drop the 'P'/'T' designators (and any whitespace); only the
  # '<n>D<n>H<n>M' part carries quantities.
  compact = re.sub(r'[PT\s]', '', entry)

  # Every component is optional, so this always matches (possibly empty).
  # Anything after the minutes, e.g. a seconds part, is ignored.
  days, hours, mins = re.match(r'(?:(\d+)D)?(?:(\d+)H)?(?:(\d+)M)?',
                               compact).groups()

  # A number is only counted when its unit letter is present; a bare digit
  # string therefore no longer counts as days, hours and minutes all at once.
  if days is None and hours is None and mins is None:
    return None

  # Get consistent units (minutes).
  return 24 * 60 * int(days or 0) + 60 * int(hours or 0) + int(mins or 0)

# Register udf.
# The function is passed directly (no wrapper lambda needed) and the result is
# declared as an integer column; without an explicit return type F.udf
# produces a string column, so the minutes would round-trip through text.
tD_p = F.udf(tD_proc, 'int')

In [10]:
import pandas as pd
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.ml.feature import Imputer, OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression, RandomForestRegressor

In [11]:
# -------- Define Features and Target -----
num_feats = ['elapsedDays', 'totalTravelDistance', 'seatsRemaining']
# NOTE(review): the CSV header spells the last column 'isNonStop'; the lookup
# below appears to rely on Spark matching column names case-insensitively —
# confirm before enabling case-sensitive resolution.
cat_feats = ['startingAirport', 'destinationAirport', 'isBasicEconomy',
             'isNonstop']
targets = ['baseFare', 'totalFare']

# Numeric inputs and both fare targets, cast to float.
float_cols = [df[name].cast('float') for name in num_feats + targets]

# Calendar features derived from the flight date: day of week and month.
date_cols = [F.dayofweek(df['flightDate']).alias('DOW'),
             F.month(df['flightDate']).alias('MOY')]

# Travel duration converted to minutes by the tD_p udf.
duration_col = tD_p('travelDuration').alias('tD_p').cast('float')

# Extract flight duration & select date-based features.
fNt = df.select(float_cols + cat_feats + date_cols + [duration_col])

In [12]:
# Train-test split
# NOTE(review): only 1% of the rows go to training and 99% to testing —
# presumably to keep fitting time manageable; confirm this is intended.
split_weights = [0.01, 0.99]
train_df, test_df = fNt.randomSplit(split_weights, seed = 42)

In [13]:
# Fit the pipeline to the train set and run it through.
cat_inputs = cat_feats + ['DOW', 'MOY']
cat_feats_si = [name + '_si' for name in cat_inputs]
cat_feats_ohe = [name + '_ohe' for name in cat_inputs]
num_feats_p = num_feats + ['tD_p']

# Median-fill missing numeric values; outputs overwrite the input columns.
imputer = Imputer(strategy = 'median',
                  inputCols = num_feats_p,
                  outputCols = num_feats_p)
# Map each categorical value to an index, then one-hot encode that index.
indexer = StringIndexer(inputCols = cat_inputs, outputCols = cat_feats_si)
encoder = OneHotEncoder(inputCols = cat_feats_si, outputCols = cat_feats_ohe)
# Concatenate numeric and encoded columns into a single 'features' vector.
assembler = VectorAssembler(inputCols = num_feats_p + cat_feats_ohe,
                            outputCol = 'features')

# Despite its name, fe_pipeline holds the fitted model returned by fit().
fe_pipeline = Pipeline(stages = [imputer, indexer, encoder, assembler]).fit(train_df)

train_fe = fe_pipeline.transform(train_df)
train_fe.show()

+-----------+-------------------+--------------+--------+---------+---------------+------------------+--------------+---------+---+---+-----+------------------+---------------------+-----------------+------------+------+------+-------------------+----------------------+------------------+-------------+-------------+-------------+--------------------+
|elapsedDays|totalTravelDistance|seatsRemaining|baseFare|totalFare|startingAirport|destinationAirport|isBasicEconomy|isNonstop|DOW|MOY| tD_p|startingAirport_si|destinationAirport_si|isBasicEconomy_si|isNonstop_si|DOW_si|MOY_si|startingAirport_ohe|destinationAirport_ohe|isBasicEconomy_ohe|isNonstop_ohe|      DOW_ohe|      MOY_ohe|            features|
+-----------+-------------------+--------------+--------+---------+---------------+------------------+--------------+---------+---+---+-----+------------------+---------------------+-----------------+------------+------+------+-------------------+----------------------+------------------+-----

In [14]:
# Run the test set through the pipeline.
# fe_pipeline was fitted on train_df only, so this call applies the stages
# learned there without refitting on the test rows.
test_fe = fe_pipeline.transform(test_df)
test_fe.show()

+-----------+-------------------+--------------+--------+---------+---------------+------------------+--------------+---------+---+---+-----+------------------+---------------------+-----------------+------------+------+------+-------------------+----------------------+------------------+-------------+-------------+-------------+--------------------+
|elapsedDays|totalTravelDistance|seatsRemaining|baseFare|totalFare|startingAirport|destinationAirport|isBasicEconomy|isNonstop|DOW|MOY| tD_p|startingAirport_si|destinationAirport_si|isBasicEconomy_si|isNonstop_si|DOW_si|MOY_si|startingAirport_ohe|destinationAirport_ohe|isBasicEconomy_ohe|isNonstop_ohe|      DOW_ohe|      MOY_ohe|            features|
+-----------+-------------------+--------------+--------+---------+---------------+------------------+--------------+---------+---+---+-----+------------------+---------------------+-----------------+------------+------+------+-------------------+----------------------+------------------+-----

## Baseline: Optimal Trivial Predictor (the Mean, Which Minimizes MSE) (~1 hr)

In [15]:
# Choose target: either 'baseFare' or 'totalFare'.
# This column name is passed as labelCol to the models below and is the
# column every squared-error computation compares predictions against.
target_choice = 'baseFare'

In [16]:
# Baseline train evaluation.
# The trivial predictor always outputs the training-set mean of the target.
# The aggregate column is named after the chosen target ('avg(<target>)'), so
# the single value is read by position; a hard-coded 'avg(baseFare)' key would
# fail as soon as target_choice is switched to 'totalFare'.
train_tm = train_fe.agg({target_choice: 'mean'}).first()[0]
train_SE_tp = train_fe.select(((train_fe[target_choice] - F.lit(train_tm)) ** 2).alias('SE'))
train_MSE_tp = train_SE_tp.agg({'SE': 'mean'}).first()
train_MSE_tp

Row(avg(SE)=33589.24748531753)

In [17]:
# Baseline test evaluation.
# Error of predicting the train mean (train_tm) for every test row.
test_resid_tp = test_fe[target_choice] - F.lit(train_tm)
test_SE_tp = test_fe.select((test_resid_tp ** 2).alias('SE'))
test_MSE_tp = test_SE_tp.agg({'SE': 'mean'}).first()
test_MSE_tp

Row(avg(SE)=33557.903815543345)

## Linear Regression (~1 hr 30 min)

In [18]:
# Fit a linear regression model on the engineered train features, then
# append its predictions to the train rows.
lr_estimator = LinearRegression(featuresCol = 'features', labelCol = target_choice)
lr = lr_estimator.fit(train_fe)
train_lr = lr.transform(train_fe)
train_lr.show()

+-----------+-------------------+--------------+--------+---------+---------------+------------------+--------------+---------+---+---+-----+------------------+---------------------+-----------------+------------+------+------+-------------------+----------------------+------------------+-------------+-------------+-------------+--------------------+------------------+
|elapsedDays|totalTravelDistance|seatsRemaining|baseFare|totalFare|startingAirport|destinationAirport|isBasicEconomy|isNonstop|DOW|MOY| tD_p|startingAirport_si|destinationAirport_si|isBasicEconomy_si|isNonstop_si|DOW_si|MOY_si|startingAirport_ohe|destinationAirport_ohe|isBasicEconomy_ohe|isNonstop_ohe|      DOW_ohe|      MOY_ohe|            features|        prediction|
+-----------+-------------------+--------------+--------+---------+---------------+------------------+--------------+---------+---+---+-----+------------------+---------------------+-----------------+------------+------+------+-------------------+---------

In [19]:
# Train-set MSE of the linear regression: mean of (target - prediction)^2.
train_resid_lr = train_lr[target_choice] - train_lr['prediction']
train_SE_lr = train_lr.select((train_resid_lr ** 2).alias('SE'))
train_MSE_lr = train_SE_lr.agg({'SE': 'mean'}).first()
train_MSE_lr

Row(avg(SE)=18440.51589817419)

In [20]:
# Evaluate performance on test set.
# Same squared-error computation as for the train set, on held-out rows.
test_lr = lr.transform(test_fe)
test_resid_lr = test_lr[target_choice] - test_lr['prediction']
test_SE_lr = test_lr.select((test_resid_lr ** 2).alias('SE'))
test_MSE_lr = test_SE_lr.agg({'SE': 'mean'}).first()
test_MSE_lr

Row(avg(SE)=18471.990521863416)

## Random Forest Regressor (~2 hr)

In [21]:
# Fit a random forest regressor (fixed seed for repeatability) on the
# engineered train features, then append its predictions to the train rows.
rfr_estimator = RandomForestRegressor(featuresCol = 'features',
                                      labelCol = target_choice,
                                      seed = 42)
rfr = rfr_estimator.fit(train_fe)
train_rfr = rfr.transform(train_fe)
train_rfr.show()

+-----------+-------------------+--------------+--------+---------+---------------+------------------+--------------+---------+---+---+-----+------------------+---------------------+-----------------+------------+------+------+-------------------+----------------------+------------------+-------------+-------------+-------------+--------------------+------------------+
|elapsedDays|totalTravelDistance|seatsRemaining|baseFare|totalFare|startingAirport|destinationAirport|isBasicEconomy|isNonstop|DOW|MOY| tD_p|startingAirport_si|destinationAirport_si|isBasicEconomy_si|isNonstop_si|DOW_si|MOY_si|startingAirport_ohe|destinationAirport_ohe|isBasicEconomy_ohe|isNonstop_ohe|      DOW_ohe|      MOY_ohe|            features|        prediction|
+-----------+-------------------+--------------+--------+---------+---------------+------------------+--------------+---------+---+---+-----+------------------+---------------------+-----------------+------------+------+------+-------------------+---------

In [22]:
# Train-set MSE of the random forest: mean of (target - prediction)^2.
# Named train_MSE_rfr to match train_MSE_lr / test_MSE_rfr; the previous name
# MSE_rfr is kept as an alias so existing references keep working.
train_SE_rfr = train_rfr.select(((train_rfr[target_choice] - train_rfr['prediction']) ** 2).alias('SE'))
train_MSE_rfr = train_SE_rfr.agg({'SE': 'mean'}).first()
MSE_rfr = train_MSE_rfr
train_MSE_rfr

Row(avg(SE)=18881.717884972168)

In [23]:
# Evaluate performance on test set.
# Same squared-error computation as for the train set, on held-out rows.
test_rfr = rfr.transform(test_fe)
test_resid_rfr = test_rfr[target_choice] - test_rfr['prediction']
test_SE_rfr = test_rfr.select((test_resid_rfr ** 2).alias('SE'))
test_MSE_rfr = test_SE_rfr.agg({'SE': 'mean'}).first()
test_MSE_rfr

Row(avg(SE)=18911.338155585952)