In [1]:
##Bring in the appropriate packages
import datetime
import time
import pandas as pd
import numpy as np
import re
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
import pickle

In [2]:
##Create definition for working with date and time items
def convert_duration(duration_str):
    # Pattern for 'PT#H#M' and 'P#DT#H#M'
    pattern = re.compile(r'P(?:(\d+)D)?T(?:(\d+)H)?(?:(\d+)M)?')

    match = pattern.match(duration_str)
    if match:
        days = int(match.group(1) or 0)
        hours = int(match.group(2) or 0)
        minutes = int(match.group(3) or 0)
        total_minutes = days * 24 * 60 + hours * 60 + minutes
        return total_minutes
    else:
        return None

In [3]:
##Create split function for flights with more than one stop
def split_columns(row, column_name):
    values = row[column_name].split("||")
    return values + [0] * (4 - len(values))

In [4]:
##Create translate functions where the PCA will take place

df = pd.read_csv('Atlanta Prices.csv')

# df.info()
df.drop(['legId'],inplace=True,axis=1)
##Drop LegID as this is not needed in the PCA as it is a unique identifier
df = df.dropna()
##Drop any row with NA. In our analysis there are no seats remaining in columns with NA values
# df.info()
## Convert 'searchDate' and 'flightDate' from object to datetime-type
##df['searchDate'] = pd.to_datetime(df['searchDate'])
##df['flightDate'] = pd.to_datetime(df['flightDate'])
columns_to_process = [
    'segmentsDepartureTimeEpochSeconds',
    'segmentsDepartureTimeRaw',
    'segmentsArrivalTimeEpochSeconds',
    'segmentsArrivalTimeRaw',
    'segmentsArrivalAirportCode',
    'segmentsDepartureAirportCode',
    'segmentsAirlineName',
    'segmentsAirlineCode',
    'segmentsEquipmentDescription',
    'segmentsDurationInSeconds',
    'segmentsDistance',
    'segmentsCabinCode',
]

for column_name in columns_to_process:
    new_columns = df.apply(lambda row: split_columns(row, column_name), axis=1, result_type='expand')
    df[[f'{column_name}_1', f'{column_name}_2', f'{column_name}_3', f'{column_name}_4']] = new_columns

In [5]:
df['totalTravelDuration'] = df['travelDuration'].apply(lambda x: convert_duration(x))
# df.head()
for col in df.columns:
    if len(df[col].unique()) == 1:
        df.drop(col, inplace=True, axis=1)

df['weeknum'] = pd.to_datetime(df['segmentsDepartureTimeRaw_1'], format='%Y-%m-%dT%H:%M:%S.%f%z').dt.isocalendar().week

df['hour_of_flight']=df['segmentsDepartureTimeRaw_1'].str.split('T').str[1].str.split('.').str[0]
df['departure_min'] = pd.to_datetime(df['hour_of_flight'], format='%H:%M:%S').dt.hour * 60 + pd.to_datetime(df['hour_of_flight'], format='%H:%M:%S').dt.minute
df['time_of_departure'] = np.where(df['departure_min'] <= (8 * 60), "Morning", np.where(df['departure_min'] <= (16 * 60), "Mid-Day", "Night"))

df['last_arrival_time']=df['segmentsArrivalTimeRaw'].str[-29:]
df['last_arrival_time']=df['last_arrival_time'].str.split('T').str[1].str.split('.').str[0]
df['last_arrival_time']=pd.to_datetime(df['last_arrival_time'], format='%H:%M:%S').dt.hour * 60 + pd.to_datetime(df['last_arrival_time'], format='%H:%M:%S').dt.minute
df['time_of_arrival'] = np.where(df['last_arrival_time'] <= (8 * 60), "Morning", np.where(df['last_arrival_time'] <= (16 * 60), "Mid-Day", "Night"))


print(df[['segmentsDepartureTimeRaw_1','weeknum','time_of_departure','time_of_arrival']])

          segmentsDepartureTimeRaw_1  weeknum time_of_departure  \
0      2022-04-17T12:57:00.000-04:00       15           Mid-Day   
1      2022-04-17T06:30:00.000-04:00       15           Morning   
2      2022-04-17T11:35:00.000-04:00       15           Mid-Day   
3      2022-04-17T13:59:00.000-04:00       15           Mid-Day   
4      2022-04-17T09:59:00.000-04:00       15           Mid-Day   
...                              ...      ...               ...   
15998  2022-05-05T08:02:00.000-04:00       18           Mid-Day   
15999  2022-05-05T09:00:00.000-04:00       18           Mid-Day   
16000  2022-05-05T22:38:00.000-04:00       18             Night   
16001  2022-05-05T10:00:00.000-04:00       18           Mid-Day   
16002  2022-05-05T16:45:00.000-04:00       18             Night   

      time_of_arrival  
0             Mid-Day  
1             Mid-Day  
2             Mid-Day  
3               Night  
4             Mid-Day  
...               ...  
15998         Mid-Day  
159

In [6]:
columns_to_count = ['segmentsAirlineName_1', 'segmentsAirlineName_2', 'segmentsAirlineName_3', 'segmentsAirlineName_4']
df['no_airlines'] = df[columns_to_count].apply(lambda row: len(set(filter(lambda x: x != 0, row))), axis=1)

In [7]:
columns_to_count = ['segmentsArrivalAirportCode_1', 'segmentsArrivalAirportCode_2', 'segmentsArrivalAirportCode_3', 'segmentsArrivalAirportCode_4']
df['no_layovers'] = df[columns_to_count].apply(lambda row: len(set(filter(lambda x: x != 0, row))), axis=1)
print(df['no_layovers'])

0        1
1        1
2        1
3        1
4        1
        ..
15998    2
15999    2
16000    2
16001    1
16002    1
Name: no_layovers, Length: 14303, dtype: int64


In [8]:
columns_to_keep = ['destinationAirport','isBasicEconomy', 'isNonStop', 'baseFare', 'seatsRemaining', 
                   'totalTravelDistance', 'totalTravelDuration','weeknum', 
                   'time_of_departure', 'time_of_arrival', 'no_airlines', 'no_layovers'
                  ,'segmentsCabinCode_1']

df = df[columns_to_keep]

df.head()

Unnamed: 0,destinationAirport,isBasicEconomy,isNonStop,baseFare,seatsRemaining,totalTravelDistance,totalTravelDuration,weeknum,time_of_departure,time_of_arrival,no_airlines,no_layovers,segmentsCabinCode_1
0,BOS,False,True,217.67,9.0,947.0,149,15,Mid-Day,Mid-Day,1,1,coach
1,BOS,False,True,217.67,4.0,947.0,150,15,Morning,Mid-Day,1,1,coach
2,BOS,False,True,217.67,9.0,947.0,150,15,Mid-Day,Mid-Day,1,1,coach
3,BOS,False,True,217.67,8.0,947.0,152,15,Mid-Day,Night,1,1,coach
4,BOS,False,True,217.67,9.0,947.0,154,15,Mid-Day,Mid-Day,1,1,coach


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14303 entries, 0 to 16002
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   destinationAirport   14303 non-null  object 
 1   isBasicEconomy       14303 non-null  bool   
 2   isNonStop            14303 non-null  bool   
 3   baseFare             14303 non-null  float64
 4   seatsRemaining       14303 non-null  float64
 5   totalTravelDistance  14303 non-null  float64
 6   totalTravelDuration  14303 non-null  int64  
 7   weeknum              14303 non-null  UInt32 
 8   time_of_departure    14303 non-null  object 
 9   time_of_arrival      14303 non-null  object 
 10  no_airlines          14303 non-null  int64  
 11  no_layovers          14303 non-null  int64  
 12  segmentsCabinCode_1  14303 non-null  object 
dtypes: UInt32(1), bool(2), float64(3), int64(3), object(4)
memory usage: 1.8+ MB


In [14]:
#Import pacakges necessary for the creation of a pipeline that encodes, then scales the numerical data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Separate features (X) and target variable (y)
X = df.drop('baseFare', axis=1)
y = df['baseFare']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# Create transformers for encoding and scaling
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), X.select_dtypes(include=['float64', 'int64']).columns),
        ('cat', OneHotEncoder(), categorical_columns)
    ])

# Create the XGBoost Regressor model
model = XGBRegressor()

# Create a pipeline with encoding and model
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Fit the model
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r_squared}')

Mean Squared Error: 9714.953207683384
R-squared: 0.6404033480062912


In [15]:
#This algorithm uses gridsearch to identify the optimal hyperparameters 

from sklearn.model_selection import GridSearchCV


# Separate features (X) and target variable (y)
X = df.drop('baseFare', axis=1)
y = df['baseFare']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# Create transformers for encoding and scaling
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), X.select_dtypes(include=['float64', 'int64']).columns),
        ('cat', OneHotEncoder(), categorical_columns)
    ])

# Create the XGBoost Regressor model
xgb_model = XGBRegressor()

# Create a pipeline with encoding and model
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', xgb_model)])

# Define hyperparameters for tuning
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7],
    'model__min_child_weight': [1, 3, 5],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0],
}

# Use GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model from grid search
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

print(f'Best Hyperparameters: {grid_search.best_params_}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r_squared}')


Best Hyperparameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__min_child_weight': 1, 'model__n_estimators': 300, 'model__subsample': 0.8}
Mean Squared Error: 9670.933080739997
R-squared: 0.6420327423976775


In [16]:
#Once hyperparameters are identified we run the algorithm again with said parameters and increase the R-squared slightly

# Separate features (X) and target variable (y)
X = df.drop('baseFare', axis=1)
y = df['baseFare']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# Create transformers for encoding and scaling
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), X.select_dtypes(include=['float64', 'int64']).columns),
        ('cat', OneHotEncoder(), categorical_columns)
    ])

# Use the best hyperparameters from GridSearchCV
best_hyperparameters = {'model__colsample_bytree': 0.8,
                        'model__learning_rate': 0.1,
                        'model__max_depth': 7,
                        'model__min_child_weight': 1,
                        'model__n_estimators': 300,
                        'model__subsample': 0.8}

# Create the XGBoost Regressor model with the best hyperparameters
xgb_model = XGBRegressor(**best_hyperparameters)

# Create a pipeline with encoding and model
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', xgb_model)])

# Fit the model
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

print(f'Best Hyperparameters: {best_hyperparameters}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r_squared}')

Best Hyperparameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__min_child_weight': 1, 'model__n_estimators': 300, 'model__subsample': 0.8}
Mean Squared Error: 9315.684684960568
R-squared: 0.6612753725399783


Parameters: { "model__colsample_bytree", "model__learning_rate", "model__max_depth", "model__min_child_weight", "model__n_estimators", "model__subsample" } are not used.



In [17]:
from sklearn.ensemble import GradientBoostingRegressor  # Import GradientBoostingRegressor

# Separate features (X) and target variable (y)
X = df.drop('baseFare', axis=1)
y = df['baseFare']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# Create transformers for encoding and scaling
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), X.select_dtypes(include=['float64', 'int64']).columns),
        ('cat', OneHotEncoder(), categorical_columns)
    ])

# Create the Gradient Boosting Regressor model
model = GradientBoostingRegressor()  # Use GradientBoostingRegressor instead of XGBRegressor

# Create a pipeline with encoding and model
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Fit the model
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r_squared}')

Mean Squared Error: 13082.112062334896
R-squared: 0.5157687743774608


In [18]:
from sklearn.ensemble import AdaBoostRegressor  # Import AdaBoostRegressor

# Separate features (X) and target variable (y)
X = df.drop('baseFare', axis=1)
y = df['baseFare']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# Create transformers for encoding and scaling
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), X.select_dtypes(include=['float64', 'int64']).columns),
        ('cat', OneHotEncoder(), categorical_columns)
    ])

# Create the AdaBoost Regressor model
model = AdaBoostRegressor()  # Use AdaBoostRegressor instead of XGBRegressor

# Create a pipeline with encoding and model
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Fit the model
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r_squared}')


Mean Squared Error: 34343.87792251694
R-squared: -0.2712303655410706


In [19]:
from sklearn.ensemble import RandomForestRegressor  # Import RandomForestRegressor

# Separate features (X) and target variable (y)
X = df.drop('baseFare', axis=1)
y = df['baseFare']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# Create transformers for encoding and scaling
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), X.select_dtypes(include=['float64', 'int64']).columns),
        ('cat', OneHotEncoder(), categorical_columns)
    ])

# Create the Random Forest Regressor model
model = RandomForestRegressor()  # Use RandomForestRegressor instead of AdaBoostRegressor

# Create a pipeline with encoding and model
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Fit the model
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r_squared}')


Mean Squared Error: 10087.69064686297
R-squared: 0.6266065615126947


In [22]:
from sklearn.neural_network import MLPRegressor  # Import MLPRegressor


# Separate features (X) and target variable (y)
X = df.drop('baseFare', axis=1)
y = df['baseFare']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# Create transformers for encoding and scaling
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), X.select_dtypes(include=['float64', 'int64']).columns),
        ('cat', OneHotEncoder(drop='first'), categorical_columns)  # Use drop='first' to avoid dummy variable trap
    ])

# Create the MLP Regressor model
mlp_model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=500)  # Example: one hidden layer with 100 neurons

# Create a pipeline with encoding and model
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', mlp_model)])

# Fit the model
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r_squared}')


Mean Squared Error: 13318.532677148785
R-squared: 0.5070177222898264


