In [2]:
library(dplyr)
library(lubridate)
library(lutz)
library(data.table)

In [3]:
# Define a vectorized CST conversion function
convert_to_cst <- Vectorize(function(time, timezone) {
  if (is.na(time) || is.na(timezone)) {
    return(NA)  # Return NA if either time or timezone is NA
  }
  datetime <- force_tz(time, tzone = timezone)   # Set to the original timezone
  with_tz(datetime, tzone = "America/Chicago")  # Convert to CST
})

In [6]:
# Use data.table to speed up processing
process_flight_data_dt <- function(data_file, location_file, output_file, flight_columns = NULL) {
  
  # Read data and convert to data.table format
  data <- fread(data_file, stringsAsFactors = FALSE)
  # data <- data %>% sample_frac(0.1)  ###############################
  location <- fread(location_file, stringsAsFactors = FALSE)
  
  # If specific columns are defined, filter the data to keep only those columns
  if (!is.null(flight_columns)) {
    data <- data[, ..flight_columns]
  }
  
  # Merge timezone information for the origin airport
  data <- merge(data, location[, .(AIRPORT_ID, Timezone)], 
                by.x = "OriginAirportID", by.y = "AIRPORT_ID", all.x = TRUE,
                allow.cartesian = TRUE)
  setnames(data, "Timezone", "Origin_Timezone")
  
  # Merge timezone information for the destination airport
  data <- merge(data, location[, .(AIRPORT_ID, Timezone)], 
                by.x = "DestAirportID", by.y = "AIRPORT_ID", all.x = TRUE,
                allow.cartesian = TRUE)
  setnames(data, "Timezone", "Dest_Timezone")
  
  # Format time columns to ensure consistent 4-digit time strings
  data[, `:=` (
    CRSDepTime = sprintf("%04d", CRSDepTime),
    CRSArrTime = sprintf("%04d", CRSArrTime),
    DepTime = sprintf("%04d", DepTime),
    ArrTime = sprintf("%04d", ArrTime)
  )]
  
  # Create datetime columns by combining FlightDate with time columns
  data[, `:=` (
    #DepDatetime = ymd(FlightDate) + hours(as.numeric(substr(DepTime, 1, 2))) + minutes(as.numeric(substr(DepTime, 3, 4))),
    #ArrDatetime = ymd(FlightDate) + hours(as.numeric(substr(ArrTime, 1, 2))) + minutes(as.numeric(substr(ArrTime, 3, 4)))#,
    CRSDepDatetime = ymd(FlightDate) + hours(as.numeric(substr(CRSDepTime, 1, 2))) + minutes(as.numeric(substr(CRSDepTime, 3, 4)))
    #CRSArrDatetime = ymd(FlightDate) + hours(as.numeric(substr(CRSArrTime, 1, 2))) + minutes(as.numeric(substr(CRSArrTime, 3, 4)))
  )]
  
  # Convert to CST timezone
  data[, `:=` (
    CRSDepTime_CST = convert_to_cst(CRSDepDatetime, Origin_Timezone)
    #DepTime_CST = convert_to_cst(DepDatetime, Origin_Timezone),
    #CRSArrTime_CST = convert_to_cst(CRSArrDatetime, Dest_Timezone),
    #ArrTime_CST = convert_to_cst(ArrDatetime, Dest_Timezone)
  )]

  # Ensure the converted CST times are in POSIXct format with "America/Chicago" timezone
  data$CRSDepTime_CST <- as.POSIXct(data$CRSDepTime_CST, origin = "1970-01-01", tz = "America/Chicago")
  #data$DepTime_CST <- as.POSIXct(data$DepTime_CST, origin = "1970-01-01", tz = "America/Chicago")
  #data$CRSArrTime_CST <- as.POSIXct(data$CRSArrTime_CST, origin = "1970-01-01", tz = "America/Chicago")
  #data$ArrTime_CST <- as.POSIXct(data$ArrTime_CST, origin = "1970-01-01", tz = "America/Chicago")
    
  # Calculate the expected arrival time based on DepTime_CST + ActualElapsedTime
  data[, DepTime_CST  := CRSDepTime_CST + minutes(DepDelay)]
  data[, ArrTime_CST  := DepTime_CST + minutes(ActualElapsedTime)]
  data[, CRSArrTime_CST  := CRSDepTime_CST + minutes(CRSElapsedTime)]
    
  data <- data[, !names(data) %in% c("Origin_Timezone","Dest_Timezone"), with = FALSE]
  
  # Save the processed data to a CSV file
  fwrite(data, output_file)
  
  return(data)  # Return the processed data.table
}

In [11]:
input_folder <- "flight_holidayseason"
output_folder <- "flight_processed"
location_file <- "airport_timezones.csv"

if (!dir.exists(output_folder)) {
  dir.create(output_folder)
}

flight_files <- list.files(input_folder, pattern = "\\.csv$", full.names = TRUE)

flight_columns <- c("OriginAirportID", "DestAirportID", "FlightDate", "DayOfWeek", 
                     "Origin", "Dest", "CRSDepTime", "DepTime", 
                    "DepDelay", "CRSArrTime", "ArrTime", "ArrDelay", "Cancelled", 
                    "CancellationCode", "Diverted", "CRSElapsedTime", 
                    "ActualElapsedTime", "Distance", "WeatherDelay")

for (file in flight_files) {
  output_file <- file.path(output_folder, basename(file))
  
  processed_data <- process_flight_data_dt(
    data_file = file,
    location_file = location_file,
    output_file = output_file,
    flight_columns = flight_columns
  )
}

In [12]:
# check Column names
example_file <- list.files(output_folder, pattern = "\\.csv$", full.names = TRUE)[1]
example_data <- fread(example_file)
print(colnames(example_data))

 [1] "DestAirportID"     "OriginAirportID"   "FlightDate"       
 [4] "DayOfWeek"         "Origin"            "Dest"             
 [7] "CRSDepTime"        "DepTime"           "DepDelay"         
[10] "CRSArrTime"        "ArrTime"           "ArrDelay"         
[13] "Cancelled"         "CancellationCode"  "Diverted"         
[16] "CRSElapsedTime"    "ActualElapsedTime" "Distance"         
[19] "WeatherDelay"      "CRSDepDatetime"    "CRSDepTime_CST"   
[22] "DepTime_CST"       "ArrTime_CST"       "CRSArrTime_CST"   


In [13]:
processed_data <- read.csv("flight_processed/2018_1.csv", stringsAsFactors = FALSE)
head(processed_data)

Unnamed: 0_level_0,DestAirportID,OriginAirportID,FlightDate,DayOfWeek,Origin,Dest,CRSDepTime,DepTime,DepDelay,CRSArrTime,⋯,Diverted,CRSElapsedTime,ActualElapsedTime,Distance,WeatherDelay,CRSDepDatetime,CRSDepTime_CST,DepTime_CST,ArrTime_CST,CRSArrTime_CST
Unnamed: 0_level_1,<int>,<int>,<chr>,<int>,<chr>,<chr>,<int>,<chr>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>
1,10135,10397,2018/1/1,1,ATL,ABE,1505,1703,118,1710,⋯,0,125,117,692,0.0,2018-01-01T15:05:00Z,2018-01-01T20:05:00Z,2018-01-01T22:03:00Z,2018-01-02T00:00:00Z,2018-01-01T22:10:00Z
2,10135,10397,2018/1/2,2,ATL,ABE,1505,1530,25,1710,⋯,0,125,113,692,,2018-01-02T15:05:00Z,2018-01-02T20:05:00Z,2018-01-02T20:30:00Z,2018-01-02T22:23:00Z,2018-01-02T22:10:00Z
3,10135,10397,2018/1/3,3,ATL,ABE,1015,1022,7,1219,⋯,0,124,132,692,0.0,2018-01-03T10:15:00Z,2018-01-03T15:15:00Z,2018-01-03T15:22:00Z,2018-01-03T17:34:00Z,2018-01-03T17:19:00Z
4,10135,10397,2018/1/4,4,ATL,ABE,1015,1455,280,1219,⋯,0,124,125,692,0.0,2018-01-04T10:15:00Z,2018-01-04T15:15:00Z,2018-01-04T19:55:00Z,2018-01-04T22:00:00Z,2018-01-04T17:19:00Z
5,10135,10397,2018/1/5,5,ATL,ABE,1015,1034,19,1219,⋯,0,124,104,692,,2018-01-05T10:15:00Z,2018-01-05T15:15:00Z,2018-01-05T15:34:00Z,2018-01-05T17:18:00Z,2018-01-05T17:19:00Z
6,10135,10397,2018/1/8,1,ATL,ABE,1015,1040,25,1219,⋯,0,124,107,692,,2018-01-08T10:15:00Z,2018-01-08T15:15:00Z,2018-01-08T15:40:00Z,2018-01-08T17:27:00Z,2018-01-08T17:19:00Z


In [9]:
file <- "flight_holidayseason/2018_1.csv"
output_file <- "2018_1_processed.csv"
location_file <- "airport_timezones.csv"
flight_columns <- c("OriginAirportID", "DestAirportID", "FlightDate", "DayOfWeek", 
                     "Origin", "Dest", "CRSDepTime", "DepTime", 
                    "DepDelay", "CRSArrTime", "ArrTime", "ArrDelay", "Cancelled", 
                    "CancellationCode", "Diverted", "CRSElapsedTime", 
                    "ActualElapsedTime", "Distance", "WeatherDelay")
  
processed_data <- process_flight_data_dt(
    data_file = file,
    location_file = location_file,
    output_file = output_file,
    flight_columns = flight_columns
  )

In [10]:
head(processed_data)

DestAirportID,OriginAirportID,FlightDate,DayOfWeek,Origin,Dest,CRSDepTime,DepTime,DepDelay,CRSArrTime,⋯,Diverted,CRSElapsedTime,ActualElapsedTime,Distance,WeatherDelay,CRSDepDatetime,CRSDepTime_CST,DepTime_CST,ArrTime_CST,CRSArrTime_CST
<int>,<int>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,⋯,<int>,<int>,<int>,<int>,<int>,<dttm>,<dttm>,<dttm>,<dttm>,<dttm>
10135,10397,2018/1/1,1,ATL,ABE,1505,1703,118,1710,⋯,0,125,117,692,0.0,2018-01-01 15:05:00,2018-01-01 14:05:00,2018-01-01 16:03:00,2018-01-01 18:00:00,2018-01-01 16:10:00
10135,10397,2018/1/2,2,ATL,ABE,1505,1530,25,1710,⋯,0,125,113,692,,2018-01-02 15:05:00,2018-01-02 14:05:00,2018-01-02 14:30:00,2018-01-02 16:23:00,2018-01-02 16:10:00
10135,10397,2018/1/3,3,ATL,ABE,1015,1022,7,1219,⋯,0,124,132,692,0.0,2018-01-03 10:15:00,2018-01-03 09:15:00,2018-01-03 09:22:00,2018-01-03 11:34:00,2018-01-03 11:19:00
10135,10397,2018/1/4,4,ATL,ABE,1015,1455,280,1219,⋯,0,124,125,692,0.0,2018-01-04 10:15:00,2018-01-04 09:15:00,2018-01-04 13:55:00,2018-01-04 16:00:00,2018-01-04 11:19:00
10135,10397,2018/1/5,5,ATL,ABE,1015,1034,19,1219,⋯,0,124,104,692,,2018-01-05 10:15:00,2018-01-05 09:15:00,2018-01-05 09:34:00,2018-01-05 11:18:00,2018-01-05 11:19:00
10135,10397,2018/1/8,1,ATL,ABE,1015,1040,25,1219,⋯,0,124,107,692,,2018-01-08 10:15:00,2018-01-08 09:15:00,2018-01-08 09:40:00,2018-01-08 11:27:00,2018-01-08 11:19:00
