**Data Preparation**

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import warnings
# Ignore future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Import packages
import os
import zipfile
import pandas as pd
import numpy
import sys
if '..' not in sys.path:
    sys.path.append('..')

In [4]:
from src.data.generate_data import unzip_all_files
unzip_all_files(path='../data/raw/itineraries_csv')

Finished unzipping files for: ../data/raw/itineraries_csv
Finished unzipping files for: ../data/raw/itineraries_csv\itineraries_csv
Finished unzipping files for: ../data/raw/itineraries_csv\itineraries_csv\ATL
Finished unzipping files for: ../data/raw/itineraries_csv\itineraries_csv\BOS
Finished unzipping files for: ../data/raw/itineraries_csv\itineraries_csv\CLT
Finished unzipping files for: ../data/raw/itineraries_csv\itineraries_csv\DEN
Finished unzipping files for: ../data/raw/itineraries_csv\itineraries_csv\DFW
Finished unzipping files for: ../data/raw/itineraries_csv\itineraries_csv\DTW
Finished unzipping files for: ../data/raw/itineraries_csv\itineraries_csv\EWR
Finished unzipping files for: ../data/raw/itineraries_csv\itineraries_csv\IAD
Finished unzipping files for: ../data/raw/itineraries_csv\itineraries_csv\JFK
Finished unzipping files for: ../data/raw/itineraries_csv\itineraries_csv\LAX
Finished unzipping files for: ../data/raw/itineraries_csv\itineraries_csv\LGA
Finished u

In [5]:
from src.data.generate_data import load_data_into_dataframe
flights = load_data_into_dataframe(path='../data/raw/itineraries_csv')

Finished loading files for: ../data/raw/itineraries_csv
Finished loading files for: ../data/raw/itineraries_csv\itineraries_csv
Finished loading files for: ../data/raw/itineraries_csv\itineraries_csv\ATL
Finished loading files for: ../data/raw/itineraries_csv\itineraries_csv\BOS
Finished loading files for: ../data/raw/itineraries_csv\itineraries_csv\CLT
Finished loading files for: ../data/raw/itineraries_csv\itineraries_csv\DEN
Finished loading files for: ../data/raw/itineraries_csv\itineraries_csv\DFW
Finished loading files for: ../data/raw/itineraries_csv\itineraries_csv\DTW
Finished loading files for: ../data/raw/itineraries_csv\itineraries_csv\EWR
Finished loading files for: ../data/raw/itineraries_csv\itineraries_csv\IAD
Finished loading files for: ../data/raw/itineraries_csv\itineraries_csv\JFK
Finished loading files for: ../data/raw/itineraries_csv\itineraries_csv\LAX
Finished loading files for: ../data/raw/itineraries_csv\itineraries_csv\LGA
Finished loading files for: ../data/

In [6]:
flights.head()

Unnamed: 0,legId,searchDate,flightDate,startingAirport,destinationAirport,travelDuration,isBasicEconomy,isRefundable,isNonStop,totalFare,...,segmentsArrivalTimeEpochSeconds,segmentsArrivalTimeRaw,segmentsArrivalAirportCode,segmentsDepartureAirportCode,segmentsAirlineName,segmentsAirlineCode,segmentsEquipmentDescription,segmentsDurationInSeconds,segmentsDistance,segmentsCabinCode
0,9ca0e81111c683bec1012473feefd28f,2022-04-16,2022-04-17,ATL,BOS,PT2H29M,False,False,True,248.6,...,1650223560,2022-04-17T15:26:00.000-04:00,BOS,ATL,Delta,DL,Airbus A321,8940,947,coach
1,98685953630e772a098941b71906592b,2022-04-16,2022-04-17,ATL,BOS,PT2H30M,False,False,True,248.6,...,1650200400,2022-04-17T09:00:00.000-04:00,BOS,ATL,Delta,DL,Airbus A321,9000,947,coach
2,98d90cbc32bfbb05c2fc32897c7c1087,2022-04-16,2022-04-17,ATL,BOS,PT2H30M,False,False,True,248.6,...,1650218700,2022-04-17T14:05:00.000-04:00,BOS,ATL,Delta,DL,Boeing 757-200,9000,947,coach
3,969a269d38eae583f455486fa90877b4,2022-04-16,2022-04-17,ATL,BOS,PT2H32M,False,False,True,248.6,...,1650227460,2022-04-17T16:31:00.000-04:00,BOS,ATL,Delta,DL,Airbus A321,9120,947,coach
4,980370cf27c89b40d2833a1d5afc9751,2022-04-16,2022-04-17,ATL,BOS,PT2H34M,False,False,True,248.6,...,1650213180,2022-04-17T12:33:00.000-04:00,BOS,ATL,Delta,DL,Airbus A321,9240,947,coach


In [7]:
#Create empty dataframe
dataframe = []

In [8]:
flights['searchDate'] = pd.to_datetime(flights['searchDate'])

In [9]:
from dateutil import parser
# time categories
def get_time_of_day(dt):
    hour = dt.hour
    if hour >=5 and hour < 8:
        return 'Early Morning'
    elif hour >= 8 and hour < 11:
        return 'Morning'
    elif hour >= 11 and hour < 14:
        return 'Midday'
    elif hour >= 14 and hour < 17:
        return 'Afternoon'
    elif hour >= 17 and hour < 20:
        return 'Evening'
    elif hour >= 20 and hour < 23:
        return 'Night'
    else:
        return 'Late Night'

In [10]:
# Remove UTC offset from datetime strings
def remove_utc_offset(datetime_str):
    dt = parser.parse(datetime_str)
    return dt.strftime('%Y-%m-%dT%H:%M:%S.%f')

**Feature Selection**

In [11]:
# function to get features
def getfeatures(flights):

    flights['departuretime'] = flights['segmentsDepartureTimeRaw'].apply(remove_utc_offset)
    flights['departuretime'] = pd.to_datetime(flights['departuretime'], utc=False)
    
    # time category
    flights['time_category'] = flights['departuretime'].apply(get_time_of_day)

    # departure date
    flights['date'] = (flights['departuretime'] - pd.Timedelta(hours=2)).dt.date
    flights['date'] = pd.to_datetime(flights['date'])
     
    # no. of days from flight
    flights['days_from_flight'] = (flights['date'] - flights['searchDate']).dt.days

    return flights

In [12]:
# Apply processing
flights = flights[flights['isNonStop'] == True]
flights = getfeatures(flights)

In [13]:
# Group by minimum fare
df = flights.groupby(['flightDate','searchDate','startingAirport', 'destinationAirport', 'segmentsCabinCode',  'time_category','days_from_flight'])['totalFare'].min().reset_index(name='minimumfare')

In [14]:
#Append to list
dataframe.append(df)

In [15]:
#Combine dataframes
combined_df = pd.concat(dataframe, ignore_index=True)

In [16]:
len(combined_df)

1586051

In [17]:
combined_df['searchDate'] = pd.to_datetime(combined_df['searchDate'])

In [18]:
combined_df.to_csv('minimumfares.csv', index=False)

In [19]:
combined_df = pd.read_csv('minimumfares.csv')

In [20]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

combined_df.head()

Unnamed: 0,flightDate,searchDate,startingAirport,destinationAirport,segmentsCabinCode,time_category,days_from_flight,minimumfare
0,2022-04-17,2022-04-16,ATL,BOS,coach,Afternoon,1,398.6
1,2022-04-17,2022-04-16,ATL,BOS,coach,Early Morning,1,248.6
2,2022-04-17,2022-04-16,ATL,BOS,coach,Evening,1,355.09
3,2022-04-17,2022-04-16,ATL,BOS,coach,Midday,1,248.6
4,2022-04-17,2022-04-16,ATL,BOS,coach,Morning,1,248.6


In [21]:
print('Earliest date:', combined_df['searchDate'].min())
print('Latest date:', combined_df['searchDate'].max())

Earliest date: 2022-04-16
Latest date: 2022-05-19


In [22]:
# Define the split date
split_date = pd.Timestamp('2022-06-17')

# Convert df.index to DateTimeIndex
df.index = pd.to_datetime(df.index)

# Create the train and test sets
train = combined_df.loc[df.index < split_date]
test = combined_df.loc[df.index >= split_date]

In [23]:
#Display data types
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586051 entries, 0 to 1586050
Data columns (total 8 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   flightDate          1586051 non-null  object 
 1   searchDate          1586051 non-null  object 
 2   startingAirport     1586051 non-null  object 
 3   destinationAirport  1586051 non-null  object 
 4   segmentsCabinCode   1586051 non-null  object 
 5   time_category       1586051 non-null  object 
 6   days_from_flight    1586051 non-null  int64  
 7   minimumfare         1586051 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 96.8+ MB


In [24]:
#Display missing values
combined_df.isna().sum()

flightDate            0
searchDate            0
startingAirport       0
destinationAirport    0
segmentsCabinCode     0
time_category         0
days_from_flight      0
minimumfare           0
dtype: int64

**Build Pipeline**

In [25]:
#Import pipeline package
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from joblib import dump

In [26]:
#Check unique values of categorical columns
print(combined_df['startingAirport'].unique())
print(combined_df['destinationAirport'].unique())
print(combined_df['segmentsCabinCode'].unique())
print(combined_df['time_category'].unique())
print(combined_df['days_from_flight'].unique())
print(combined_df['flightDate'].unique())
print(combined_df['searchDate'].unique())

['ATL' 'BOS' 'CLT' 'DEN' 'DFW' 'DTW' 'EWR' 'IAD' 'JFK' 'LAX' 'LGA' 'MIA'
 'OAK' 'ORD' 'PHL' 'SFO']
['BOS' 'CLT' 'DEN' 'DFW' 'DTW' 'EWR' 'IAD' 'JFK' 'LAX' 'LGA' 'MIA' 'ORD'
 'PHL' 'SFO' 'ATL' 'OAK']
['coach' 'first' 'premium coach' 'business']
['Afternoon' 'Early Morning' 'Evening' 'Midday' 'Morning' 'Night'
 'Late Night']
[ 1  0  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61]
['2022-04-17' '2022-04-18' '2022-04-19' '2022-04-20' '2022-04-21'
 '2022-04-22' '2022-04-23' '2022-04-24' '2022-04-25' '2022-04-26'
 '2022-04-27' '2022-04-28' '2022-04-29' '2022-04-30' '2022-05-01'
 '2022-05-02' '2022-05-03' '2022-05-04' '2022-05-05' '2022-05-06'
 '2022-05-07' '2022-05-08' '2022-05-09' '2022-05-10' '2022-05-11'
 '2022-05-12' '2022-05-13' '2022-05-14' '2022-05-15' '2022-05-16'
 '2022-05-17' '2022-05-18' '2022-05-19' '2022-05-20' '2022-05-21'
 '2022-05-22' '2022-05-2

In [27]:
# Creating a list of categorical columns
cat_cols = ['startingAirport','destinationAirport',
             'segmentsCabinCode','time_category', 'flightDate', 'searchDate']
# Creating a list of numeric columns
num_cols = ['days_from_flight']

In [28]:
#Create column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)])

In [29]:
#Create parameters for randomforest model
rf_params = {
    'n_estimators': 100,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'random_state': 42
}

In [30]:
#Build pipeline
rfpipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(**rf_params))
])

In [31]:
#Set target
X = combined_df.drop('minimumfare', axis=1)
y = combined_df['minimumfare']

In [32]:
#Split data
x_train, x_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

In [34]:
#Train model
rfpipeline.fit(x_train, y_train)

KeyboardInterrupt: 

In [35]:
dump(rfpipeline, '../models/rfmodel.joblib')

['../models/rfmodel.joblib']

In [36]:
# Generate predictions for training and test sets
y_train_pred = rfpipeline.predict(x_train)
y_test_pred = rfpipeline.predict(x_test)

  y_hat /= len(self.estimators_)
  y_hat /= len(self.estimators_)
  y_hat /= len(self.estimators_)


In [37]:
#Print RMSE score
print(f"RMSE on train data: {mean_squared_error(y_train, y_train_pred, squared=False)}")
print(f"RMSE on test data: {mean_squared_error(y_test, y_test_pred, squared=False)}")

ValueError: Input contains NaN.

In [38]:
#Print R2 score
print(f"R2 score on train data: {r2_score(y_train, y_train_pred)}")
print(f"R2 score on test data: {r2_score(y_test, y_test_pred)}")

ValueError: Input contains NaN.

In [39]:
#Import predictions
predictions = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})
predictions.to_csv('predictions.csv', index=False)