In [2]:
import os
import zipfile
import pandas as pd
from dateutil import parser

In [3]:
# time categories
def get_time_of_day(dt):
    hour = dt.hour
    if hour >=5 and hour < 8:
        return 'Early Morning'
    elif hour >= 8 and hour < 11:
        return 'Morning'
    elif hour >= 11 and hour < 14:
        return 'Midday'
    elif hour >= 14 and hour < 17:
        return 'Afternoon'
    elif hour >= 17 and hour < 20:
        return 'Evening'
    elif hour >= 20 and hour < 23:
        return 'Night'
    else:
        return 'Late Night'
 
# Remove UTC offset from datetime strings
def remove_utc_offset(datetime_str):
    dt = parser.parse(datetime_str)
    return dt.strftime('%Y-%m-%dT%H:%M:%S.%f')

# function to get features
def getfeatures(df):

    df['departuretime'] = df['segmentsDepartureTimeRaw'].apply(remove_utc_offset) 
    df['departuretime'] = pd.to_datetime(df['departuretime'], utc=False)
    
    # time category
    df['time_category'] = df['departuretime'].apply(get_time_of_day)
    
    # departure date
    df['date'] = (df['departuretime'] - pd.Timedelta(hours=2)).dt.date

    # no. of days from flight
    df['days_from_flight'] = (df['date'] - pd.to_datetime(df['searchDate']).dt.date).dt.days

    return df

In [4]:
# Root directory containing subfolders 
root_dir = 'itineraries_csv'

dataframes = []

# Loop through each subfolder
for directory in os.listdir(root_dir):

  if os.path.isdir(os.path.join(root_dir, directory)) and not directory.startswith('.'):
    print(directory)
    folder_path = os.path.join(root_dir, directory)  

    # Loop through zip files
    for filename in os.listdir(folder_path):
      
      if filename.endswith('.zip'):
      
        # Full path of zip file
        zip_path = os.path.join(folder_path, filename)  
        
        # Extract zip contents
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
          zip_ref.extractall(folder_path)
              
          # Parse CSV file in zip 
          csv_path = os.path.join(folder_path, [x for x in zip_ref.namelist() if x.endswith('.csv')][0])      
          df = pd.read_csv(csv_path)
          
          # Apply processing 
          df = df[df['isNonStop'] == True]  
          df = getfeatures(df)

          # CHANGE THIS LINE - MAYBE USE MINIMUM, MODE AND MEAN INSTEAD?
          df = df.groupby(['segmentsDepartureAirportCode', 'segmentsArrivalAirportCode', 'segmentsCabinCode', 'date', 'time_category', 'days_from_flight'])['totalFare'].median().reset_index(name='medianfare')
          
          # Append to list
          dataframes.append(df) 
        
# Combine dataframes
combined_df = pd.concat(dataframes, ignore_index=True)

OAK
DEN
LGA
LAX
ATL
CLT
PHL
DTW
IAD
JFK
DFW
BOS
EWR
SFO
ORD
MIA


In [8]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

combined_df.head()

Unnamed: 0,segmentsDepartureAirportCode,segmentsArrivalAirportCode,segmentsCabinCode,date,time_category,days_from_flight,medianfare
0,OAK,DEN,coach,2022-05-20,Evening,31,83.98
1,OAK,DEN,coach,2022-05-21,Evening,32,43.98
2,OAK,DEN,coach,2022-05-22,Evening,33,173.98
3,OAK,LAX,coach,2022-05-20,Afternoon,31,168.61
4,OAK,LAX,coach,2022-05-20,Early Morning,31,100.285


In [25]:
len(combined_df)

1649244

In [4]:
combined_df.to_csv('medianfares.csv', index=False)

In [3]:
combined_df = pd.read_csv('medianfares.csv')

In [18]:
print('Earliest date:', combined_df['date'].min())
print('Latest date:', combined_df['date'].max())

Earliest date: 2022-04-16
Latest date: 2022-07-18


In [19]:
# Define the split date
split_date = pd.Timestamp('2022-06-17')

# Create the train and test sets
train = combined_df.loc[df.index < split_date]
test = combined_df.loc[df.index >= split_date]

NameError: name 'df' is not defined