## Data Load

In [None]:
!pip install kaggle
!kaggle datasets download -d dilwong/flightprices

Dataset URL: https://www.kaggle.com/datasets/dilwong/flightprices
License(s): Attribution 4.0 International (CC BY 4.0)
Downloading flightprices.zip to /content
100% 5.50G/5.51G [01:05<00:00, 158MB/s]
100% 5.51G/5.51G [01:05<00:00, 90.1MB/s]


In [None]:
# Extract the downloaded archive; `-n` never overwrites files that already
# exist, so re-running this cell does not re-extract itineraries.csv.
!unzip -n flightprices.zip

Archive:  flightprices.zip
  inflating: itineraries.csv         


In [None]:
!pip install pyspark



In [None]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pyspark.sql.functions as F
# Stop any SparkContext left over from a previous execution of this cell so
# that a fresh one can be constructed below when the cell is re-run.
try:
    sc.stop()
except NameError:
    # Fresh kernel: `sc` has not been defined yet, nothing to stop.
    pass
except Exception as exc:
    # Stopping is best-effort; report the problem instead of hiding it, but do
    # not swallow KeyboardInterrupt/SystemExit the way a bare `except:` would.
    print(f"Ignoring error while stopping previous SparkContext: {exc!r}")

sc = SparkContext()
# `sqlContext` is the entry point later cells use to read the CSV.
# NOTE: PySpark marks SQLContext as deprecated since 3.0 in favour of
# SparkSession; it is kept here so the existing `sqlContext` name keeps working.
sqlContext = SQLContext(sc)



In [None]:
# Obtain the active SparkSession, creating one if none exists yet.
# NOTE(review): `ss` is not referenced by the other cells visible here, which
# read data through `sqlContext` instead — confirm whether it is still needed.
ss = SparkSession.builder.getOrCreate()

In [None]:
# Load the extracted itineraries file, treating its first line as column names.
# No schema is supplied and schema inference is not enabled, so numeric columns
# are cast explicitly further down before modelling.
df = sqlContext.read.options(header=True).csv('itineraries.csv')

In [None]:
# Preview the raw data: first 20 rows, long cell values truncated
# (these are the defaults of DataFrame.show, spelled out explicitly).
df.show(n=20, truncate=True)

+--------------------+----------+----------+---------------+------------------+-------------+--------------+-----------+--------------+------------+---------+--------+---------+--------------+-------------------+---------------------------------+------------------------+-------------------------------+----------------------+--------------------------+----------------------------+--------------------+-------------------+----------------------------+-------------------------+----------------+-----------------+
|               legId|searchDate|flightDate|startingAirport|destinationAirport|fareBasisCode|travelDuration|elapsedDays|isBasicEconomy|isRefundable|isNonStop|baseFare|totalFare|seatsRemaining|totalTravelDistance|segmentsDepartureTimeEpochSeconds|segmentsDepartureTimeRaw|segmentsArrivalTimeEpochSeconds|segmentsArrivalTimeRaw|segmentsArrivalAirportCode|segmentsDepartureAirportCode| segmentsAirlineName|segmentsAirlineCode|segmentsEquipmentDescription|segmentsDurationInSeconds|segments

In [None]:
import pandas as pd
from pyspark.ml.tuning import TrainValidationSplit
from sklearn.metrics import mean_squared_error
from pyspark.ml.feature import Imputer, VectorAssembler
from pyspark.ml import Pipeline

In [None]:
# -------- Define Features and Target -----
# Predictor columns fed to the models.
features = ['elapsedDays', 'totalTravelDistance', 'seatsRemaining']
# Candidate label columns (fares).
targets = ['baseFare', 'totalFare']

# Keep only the modelling columns, each cast to float so the ML stages
# receive numeric input.
float_columns = [df[name].cast('float') for name in features + targets]
fNt = df.select(*float_columns)

In [None]:
from pyspark.ml.regression import LinearRegression, RandomForestRegressor

In [None]:
# Rows whose fare (the regression label) is missing cannot be used for
# supervised training. Drop them rather than median-filling the targets,
# which would fabricate labels and bias the fitted models.
labeled_df = fNt.dropna(subset=targets)

# Feature-engineering pipeline:
#   1. Imputer         - replace missing *feature* values with the column median
#                        (written back to the same column names).
#   2. VectorAssembler - pack the feature columns into the single vector column
#                        'features' expected by the Spark ML regressors.
fe_pipeline = Pipeline(stages=[
    Imputer(strategy='median',
            inputCols=features,
            outputCols=features),
    VectorAssembler(inputCols=features,
                    outputCol='features'),
])

# Fit the pipeline and apply it, producing the modelling frame used downstream.
fe_df = fe_pipeline.fit(labeled_df).transform(labeled_df)
fe_df.show()

+-----------+-------------------+--------------+--------+---------+----------------+
|elapsedDays|totalTravelDistance|seatsRemaining|baseFare|totalFare|        features|
+-----------+-------------------+--------------+--------+---------+----------------+
|        0.0|              947.0|           9.0|  217.67|    248.6| [0.0,947.0,9.0]|
|        0.0|              947.0|           4.0|  217.67|    248.6| [0.0,947.0,4.0]|
|        0.0|              947.0|           9.0|  217.67|    248.6| [0.0,947.0,9.0]|
|        0.0|              947.0|           8.0|  217.67|    248.6| [0.0,947.0,8.0]|
|        0.0|              947.0|           9.0|  217.67|    248.6| [0.0,947.0,9.0]|
|        0.0|              947.0|           7.0|  217.67|    248.6| [0.0,947.0,7.0]|
|        0.0|              956.0|           3.0|  213.02|    251.1| [0.0,956.0,3.0]|
|        0.0|              956.0|           3.0|  213.02|    251.1| [0.0,956.0,3.0]|
|        0.0|              956.0|           7.0|  213.02|    251.

In [13]:
# Configure a linear regression that predicts baseFare from the assembled
# feature vector, then fit it on the engineered frame.
lr_estimator = LinearRegression(labelCol='baseFare', featuresCol='features')
lr = lr_estimator.fit(fe_df)

# Score the same frame the model was fitted on; adds a 'prediction' column.
# NOTE(review): these are in-sample predictions — no hold-out split is made here.
lr_df = lr.transform(fe_df)
lr_df.show()

+-----------+-------------------+--------------+--------+---------+----------------+------------------+
|elapsedDays|totalTravelDistance|seatsRemaining|baseFare|totalFare|        features|        prediction|
+-----------+-------------------+--------------+--------+---------+----------------+------------------+
|        0.0|              947.0|           9.0|  217.67|    248.6| [0.0,947.0,9.0]|237.18986311092758|
|        0.0|              947.0|           4.0|  217.67|    248.6| [0.0,947.0,4.0]|215.19402961363704|
|        0.0|              947.0|           9.0|  217.67|    248.6| [0.0,947.0,9.0]|237.18986311092758|
|        0.0|              947.0|           8.0|  217.67|    248.6| [0.0,947.0,8.0]|232.79069641146947|
|        0.0|              947.0|           9.0|  217.67|    248.6| [0.0,947.0,9.0]|237.18986311092758|
|        0.0|              947.0|           7.0|  217.67|    248.6| [0.0,947.0,7.0]|228.39152971201136|
|        0.0|              956.0|           3.0|  213.02|    251

In [None]:
# Fit a random-forest regressor for baseFare on the engineered features.
# Random forests are stochastic; a fixed seed makes the fitted model and its
# predictions reproducible across kernel restarts.
rfr = RandomForestRegressor(featuresCol='features',
                            labelCol='baseFare',
                            seed=42).fit(fe_df)

# Score the training frame; adds a 'prediction' column.
# NOTE(review): these are in-sample predictions — no hold-out split is made here.
rfr_df = rfr.transform(fe_df)
rfr_df.show()