## Data Load (~10 min)

In [1]:
# Install the Kaggle CLI, then download the flight-prices dataset archive.
# The CLI skips the download when a more recent local copy of the zip is
# present, so re-running this cell is cheap.
# NOTE(review): presumably requires Kaggle API credentials to be configured
# outside the notebook — confirm.
!pip install kaggle
!kaggle datasets download -d dilwong/flightprices

Dataset URL: https://www.kaggle.com/datasets/dilwong/flightprices
License(s): Attribution 4.0 International (CC BY 4.0)
flightprices.zip: Skipping, found more recently modified local copy (use --force to force download)


In [2]:
# Extract the downloaded archive; -n never overwrites files that already
# exist, so a re-run leaves a previous extraction untouched.
!unzip -n flightprices.zip

Archive:  flightprices.zip


In [3]:
!pip install pyspark



In [4]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pyspark.sql.functions as F
# Reset Spark so that re-running this cell always starts from a new context.
try:
  sc.stop()
except NameError:
  # Fresh kernel: there is no previous context to stop.
  pass
except Exception as err:
  # Best effort: report why the old context could not be stopped, then carry
  # on and try to create a new one. (A bare `except:` would also have hidden
  # KeyboardInterrupt and SystemExit.)
  print(f'Could not stop the previous SparkContext: {err!r}')
sc = SparkContext()
# NOTE(review): SQLContext is kept because later cells read through
# `sqlContext`; newer Spark releases presumably prefer SparkSession — confirm.
sqlContext = SQLContext(sc)



In [6]:
# Session handle: get-or-create returns the already-running session if there
# is one instead of starting a second.
ss = (
    SparkSession
    .builder
    .getOrCreate()
)

In [7]:
# Load the extracted itineraries file, taking column names from its first
# row. No schema is supplied here; the columns that are used get cast
# explicitly when they are selected.
df = sqlContext.read.option('header', True).csv('itineraries.csv')

In [8]:
# Peek at the first 20 rows (long cell values are truncated for display).
df.show(n = 20, truncate = True)

+--------------------+----------+----------+---------------+------------------+-------------+--------------+-----------+--------------+------------+---------+--------+---------+--------------+-------------------+---------------------------------+------------------------+-------------------------------+----------------------+--------------------------+----------------------------+--------------------+-------------------+----------------------------+-------------------------+----------------+-----------------+
|               legId|searchDate|flightDate|startingAirport|destinationAirport|fareBasisCode|travelDuration|elapsedDays|isBasicEconomy|isRefundable|isNonStop|baseFare|totalFare|seatsRemaining|totalTravelDistance|segmentsDepartureTimeEpochSeconds|segmentsDepartureTimeRaw|segmentsArrivalTimeEpochSeconds|segmentsArrivalTimeRaw|segmentsArrivalAirportCode|segmentsDepartureAirportCode| segmentsAirlineName|segmentsAirlineCode|segmentsEquipmentDescription|segmentsDurationInSeconds|segments

## Pipeline (Imputer + OneHotEncoder + VectorAssembler) (~20 min)

In [9]:
import pandas as pd
from pyspark.ml.tuning import TrainValidationSplit
from sklearn.metrics import mean_squared_error
from pyspark.ml.feature import Imputer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

In [10]:
# -------- Define Features and Targets --------
features = ['elapsedDays', 'totalTravelDistance', 'seatsRemaining']
targets = ['baseFare', 'totalFare']
# Start from the base features above plus the day of the week of the flight
# date (more features to be attempted soon).
# Every feature/target column is cast to float; the day of week is derived
# from 'flightDate' and exposed as 'DOW'.
casted_cols = [F.col(name).cast('float') for name in features + targets]
dow_col = F.dayofweek(F.col('flightDate')).alias('DOW')
fNt = df.select(*casted_cols, dow_col)

In [11]:
# Train-test split: 75% / 25%, with a fixed seed so the partition is
# repeatable.
train_df, test_df = fNt.randomSplit(weights = [0.75, 0.25], seed = 42)

In [12]:
from pyspark.ml.regression import LinearRegression, RandomForestRegressor

In [13]:
# Feature-engineering stages, in execution order:
#   1. median-impute the numeric columns in place (same input/output names);
#   2. one-hot encode the day-of-week column;
#   3. assemble the numeric features plus the encoded day of week into
#      a single 'features' vector column.
# NOTE(review): the imputer also covers the target columns, so a missing fare
# would be filled with a median rather than dropped — confirm this is intended.
imputed_cols = features + targets
fe_stages = [
    Imputer(strategy = 'median',
            inputCols = imputed_cols,
            outputCols = imputed_cols),
    OneHotEncoder(inputCol = 'DOW', outputCol = 'DOW_ohe'),
    VectorAssembler(inputCols = features + ['DOW_ohe'], outputCol = 'features'),
]

# `fe_pipeline` is the *fitted* pipeline: it is fit on the train split only
# and then reused unchanged on any other split.
fe_pipeline = Pipeline(stages = fe_stages).fit(train_df)

train_fe = fe_pipeline.transform(train_df)
train_fe.show()

+-----------+-------------------+--------------+--------+---------+---+-------------+--------------------+
|elapsedDays|totalTravelDistance|seatsRemaining|baseFare|totalFare|DOW|      DOW_ohe|            features|
+-----------+-------------------+--------------+--------+---------+---+-------------+--------------------+
|        0.0|             1464.0|           0.0|     5.1|    30.69|  1|(7,[1],[1.0])|(10,[1,4],[1464.0...|
|        0.0|             1464.0|           0.0|     5.1|    30.69|  1|(7,[1],[1.0])|(10,[1,4],[1464.0...|
|        0.0|             1464.0|           0.0|     5.1|    30.69|  1|(7,[1],[1.0])|(10,[1,4],[1464.0...|
|        0.0|             1464.0|           0.0|     5.1|    30.69|  1|(7,[1],[1.0])|(10,[1,4],[1464.0...|
|        0.0|             1464.0|           0.0|     5.1|    30.69|  1|(7,[1],[1.0])|(10,[1,4],[1464.0...|
|        0.0|             1464.0|           0.0|     5.1|    30.69|  1|(7,[1],[1.0])|(10,[1,4],[1464.0...|
|        0.0|             1464.0|    

In [14]:
# Push the held-out split through the pipeline that was fitted on the train
# split (nothing is re-fit here).
test_fe = fe_pipeline.transform(dataset = test_df)
test_fe.show()

+-----------+-------------------+--------------+--------+---------+---+-------------+--------------------+
|elapsedDays|totalTravelDistance|seatsRemaining|baseFare|totalFare|DOW|      DOW_ohe|            features|
+-----------+-------------------+--------------+--------+---------+---+-------------+--------------------+
|        0.0|             1464.0|           0.0|     5.1|    30.69|  1|(7,[1],[1.0])|(10,[1,4],[1464.0...|
|        0.0|             1464.0|           0.0|     5.1|    30.69|  1|(7,[1],[1.0])|(10,[1,4],[1464.0...|
|        0.0|             1464.0|           0.0|     5.1|    30.69|  1|(7,[1],[1.0])|(10,[1,4],[1464.0...|
|        0.0|             1464.0|           0.0|     5.1|    30.69|  1|(7,[1],[1.0])|(10,[1,4],[1464.0...|
|        0.0|             1464.0|           0.0|     5.1|    30.69|  1|(7,[1],[1.0])|(10,[1,4],[1464.0...|
|        0.0|             1464.0|           0.0|     5.1|    30.69|  2|(7,[2],[1.0])|(10,[1,5],[1464.0...|
|        0.0|             1464.0|    

## Baseline: Optimal Trivial Predictor (the Mean, Which Minimizes MSE) (~40 min)

In [15]:
# Choose target: either 'baseFare' or 'totalFare'.
target_choice = 'baseFare'
# Fail fast on a typo here rather than deep inside a long-running Spark job.
assert target_choice in targets, f'target_choice must be one of {targets}'

In [17]:
# Baseline train evaluation.
# The trivial predictor always outputs the train-set mean of the chosen
# target. The aggregate is aliased so the lookup works for either target;
# the auto-generated column name 'avg(baseFare)' only exists when
# target_choice is 'baseFare'.
train_tm = train_fe.agg(F.mean(target_choice).alias('target_mean')).first()['target_mean']
# Squared error of that constant prediction on every train row, then its mean.
train_SE_tp = train_fe.select(((train_fe[target_choice] - F.lit(train_tm)) ** 2).alias('SE'))
train_MSE_tp = train_SE_tp.agg({'SE': 'mean'}).first()
train_MSE_tp

Row(avg(SE)=33563.405918473094)

In [18]:
# Baseline test evaluation.
# The constant prediction is still the mean computed on the train split;
# only the rows being scored come from the test split.
test_residual_tp = test_fe[target_choice] - F.lit(train_tm)
test_SE_tp = test_fe.select((test_residual_tp ** 2).alias('SE'))
test_MSE_tp = test_SE_tp.agg({'SE': 'mean'}).first()
test_MSE_tp

Row(avg(SE)=33542.30131842294)

## Linear Regression (~1 hr)

In [25]:
# Fit a linear regression on the assembled 'features' column against the
# chosen target. Note that `lr` is the fitted model, not the estimator.
lr_estimator = LinearRegression(featuresCol = 'features', labelCol = target_choice)
lr = lr_estimator.fit(train_fe)
# Append a 'prediction' column to the train split and preview it.
train_lr = lr.transform(train_fe)
train_lr.show()

+-----------+-------------------+--------------+--------+---------+---+-------------+--------------------+------------------+
|elapsedDays|totalTravelDistance|seatsRemaining|baseFare|totalFare|DOW|      DOW_ohe|            features|        prediction|
+-----------+-------------------+--------------+--------+---------+---+-------------+--------------------+------------------+
|        0.0|             1464.0|           0.0|     5.1|    30.69|  1|(7,[1],[1.0])|(10,[1,4],[1464.0...|307.26290005477335|
|        0.0|             1464.0|           0.0|     5.1|    30.69|  1|(7,[1],[1.0])|(10,[1,4],[1464.0...|307.26290005477335|
|        0.0|             1464.0|           0.0|     5.1|    30.69|  1|(7,[1],[1.0])|(10,[1,4],[1464.0...|307.26290005477335|
|        0.0|             1464.0|           0.0|     5.1|    30.69|  1|(7,[1],[1.0])|(10,[1,4],[1464.0...|307.26290005477335|
|        0.0|             1464.0|           0.0|     5.1|    30.69|  1|(7,[1],[1.0])|(10,[1,4],[1464.0...|307.26290005

In [26]:
# Train-set MSE of the linear model: mean of (target - prediction) squared.
train_residual_lr = train_lr[target_choice] - train_lr['prediction']
train_SE_lr = train_lr.select((train_residual_lr ** 2).alias('SE'))
train_MSE_lr = train_SE_lr.agg({'SE': 'mean'}).first()
train_MSE_lr

Row(avg(SE)=24883.32504461784)

In [27]:
# Evaluate performance on test set.
# Same squared-error computation as for the train split, applied to the
# linear model's predictions on the held-out rows.
test_lr = lr.transform(test_fe)
test_residual_lr = test_lr[target_choice] - test_lr['prediction']
test_SE_lr = test_lr.select((test_residual_lr ** 2).alias('SE'))
test_MSE_lr = test_SE_lr.agg({'SE': 'mean'}).first()
test_MSE_lr

Row(avg(SE)=24863.31824310283)

## Random Forest Regressor (runtime uncertain, likely > 90 min)

In [None]:
# Fit a random forest on the same 'features' column and target as the linear
# model. Forest training is randomized, so the seed is pinned (to the same
# value used for the train/test split) to make reruns comparable.
# `rfr` is the fitted model, not the estimator.
rfr = RandomForestRegressor(featuresCol = 'features', labelCol = target_choice,
                            seed = 42).fit(train_fe)
# Append a 'prediction' column to the train split and preview it.
train_rfr = rfr.transform(train_fe)
train_rfr.show()

+-----------+-------------------+--------------+--------+---------+----------------+------------------+
|elapsedDays|totalTravelDistance|seatsRemaining|baseFare|totalFare|        features|        prediction|
+-----------+-------------------+--------------+--------+---------+----------------+------------------+
|        0.0|              947.0|           9.0|  217.67|    248.6| [0.0,947.0,9.0]| 253.1227368957738|
|        0.0|              947.0|           4.0|  217.67|    248.6| [0.0,947.0,4.0]| 276.5514499623333|
|        0.0|              947.0|           9.0|  217.67|    248.6| [0.0,947.0,9.0]| 253.1227368957738|
|        0.0|              947.0|           8.0|  217.67|    248.6| [0.0,947.0,8.0]| 256.0551399557707|
|        0.0|              947.0|           9.0|  217.67|    248.6| [0.0,947.0,9.0]| 253.1227368957738|
|        0.0|              947.0|           7.0|  217.67|    248.6| [0.0,947.0,7.0]|246.56159468142215|
|        0.0|              956.0|           3.0|  213.02|    251

In [None]:
# Train-set MSE of the random forest: mean of (target - prediction) squared.
train_SE_rfr = train_rfr.select(((train_rfr[target_choice] - train_rfr['prediction']) ** 2).alias('SE'))
# Aggregate over the squared-error frame: `train_rfr` itself has no 'SE'
# column, so aggregating on it would fail.
train_MSE_rfr = train_SE_rfr.agg({'SE': 'mean'}).first()
# Keep the original variable name available for any later reference.
MSE_rfr = train_MSE_rfr
train_MSE_rfr

In [None]:
# Evaluate performance on test set.
# Score the fitted forest on the held-out rows with the same squared-error
# computation used for the train split.
test_rfr = rfr.transform(test_fe)
test_residual_rfr = test_rfr[target_choice] - test_rfr['prediction']
test_SE_rfr = test_rfr.select((test_residual_rfr ** 2).alias('SE'))
test_MSE_rfr = test_SE_rfr.agg({'SE': 'mean'}).first()
test_MSE_rfr