In [1]:
# use natsort package to sort thos missing leading zero files 
# (sorting is optional, depends on whether you want the big file to be concatenated ordered)
# pip install natsort

# in case tqdm is not installed
# pip install tqdm

In [2]:
import pandas as pd
import os
from natsort import os_sorted
from tqdm.notebook import tqdm

# Raw Data Source: https://transtats.bts.gov/PREZIP/

The ZIP files we are interested in have this naming: "On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2024_2.zip"  
Each ZIP file contains a CSV with data for one month. The last part "..._2024_2" of the file name indicates that this file contains data for February 2024.

#### working scenario: 
- download all relevant zipfiles to a folder `downloads` in our repo
- extract each csv into a folder `downloads/extracted`

In [3]:
# open a file test
flights_raw = pd.read_csv('./downloads/extracted/On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2024_2.csv', low_memory = False)
flights_raw.head(2)

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,DOT_ID_Reporting_Airline,IATA_CODE_Reporting_Airline,Tail_Number,...,Div4TailNum,Div5Airport,Div5AirportID,Div5AirportSeqID,Div5WheelsOn,Div5TotalGTime,Div5LongestGTime,Div5WheelsOff,Div5TailNum,Unnamed: 109
0,2024,1,2,17,6,2024-02-17,9E,20363,9E,N922XJ,...,,,,,,,,,,
1,2024,1,2,17,6,2024-02-17,9E,20363,9E,N922XJ,...,,,,,,,,,,


In [4]:
# select columns to keep
def cols_to_keep(flights_raw):
    columns_to_keep = [
        "FlightDate",
        "DepTime",
        "CRSDepTime",
        "DepDelay",
        "ArrTime",
        "CRSArrTime",
        "ArrDelay",
        "Reporting_Airline",
        "Tail_Number",
        "Flight_Number_Reporting_Airline",
        "Origin",
        "Dest",
        "AirTime",
        "ActualElapsedTime",
        "Distance",
        "Cancelled",
        "Diverted",
    ]
    flights = flights_raw.loc[:, columns_to_keep]
    return flights

In [5]:
# rename columns
def rename_cols(flights):
    new_column_names = {
        'FlightDate': 'flight_date',
        'DepTime': 'dep_time',
        'CRSDepTime': 'sched_dep_time',
        'DepDelay': 'dep_delay',
        'ArrTime': 'arr_time',
        'CRSArrTime': 'sched_arr_time',
        'ArrDelay': 'arr_delay',
        'Reporting_Airline': 'airline',
        'Tail_Number': 'tail_number',
        'Flight_Number_Reporting_Airline': 'flight_number',
        'Origin': 'origin',
        'Dest': 'dest',
        'AirTime': 'air_time',
        'ActualElapsedTime': 'actual_elapsed_time',
        'Distance': 'distance',
        'Cancelled': 'cancelled',
        'Diverted': 'diverted'
    }
    flights.rename(columns=new_column_names, inplace=True)
    return flights

In [6]:
# change datatype
def change_dtypes(flights):
    types_change = {
        'flight_date': 'datetime64[ns]',
        'dep_time': 'float64',
        'sched_dep_time': 'int16',
        'dep_delay': 'float64',
        'arr_time': 'float64',
        'sched_arr_time': 'int16',
        'arr_delay': 'float64',
        'airline': 'O',
        'tail_number': 'O',
        'flight_number': 'int64',
        'origin': 'O',
        'dest': 'O',
        'air_time': 'float64',
        'actual_elapsed_time': 'float64',
        'distance': 'int16',
        'cancelled': 'int16',
        'diverted': 'int16'
    }
    flights = flights.astype(types_change)
    return flights

In [7]:
# getting file names for the for-loop (sorted with function 'os_sorted' from the 'natsort' package)
data_files = os_sorted(os.listdir('./downloads/extracted/'))
data_files

['On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2023_1.csv',
 'On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2023_2.csv',
 'On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2023_3.csv',
 'On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2023_4.csv',
 'On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2023_5.csv',
 'On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2023_6.csv',
 'On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2023_7.csv',
 'On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2023_8.csv',
 'On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2023_9.csv',
 'On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2023_10.csv',
 'On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2023_11.csv',
 'On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2023_12.csv',
 'On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2024_1.csv',
 'On_Time

In [8]:
# getting the period range for the big file name
year_month = [x.split(')_')[1][:-4] for x in data_files]
f'flights_from_{year_month[0]}_until_{year_month[-1]}'

'flights_from_2023_1_until_2024_3'

In [9]:
flights_list = []

for file in tqdm(data_files):
    print(file)
    print('reading...', end=" ")
    flights_raw = pd.read_csv(f'./downloads/extracted/{file}', low_memory = False) # read as a dataframe
    
    flights_select = cols_to_keep(flights_raw) # select columns to keep
    print('filter colums...', end=" ")
    flights_rename = rename_cols(flights_select) # rename columns
    print('rename colums...', end=" ")
    flights_dtypes = change_dtypes(flights_rename) # change data types
    print('change dtypes...', end=" ")
    
    flights_list.append(flights_dtypes) # add to the list of dateframes
    print('appended to flight_list\n')
    
print(f'Done. The list has {len(flights_list)} elements')

  0%|          | 0/15 [00:00<?, ?it/s]

On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2023_1.csv
reading... filter colums... rename colums... change dtypes... appended to flight_list

On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2023_2.csv
reading... filter colums... rename colums... change dtypes... appended to flight_list

On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2023_3.csv
reading... filter colums... rename colums... change dtypes... appended to flight_list

On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2023_4.csv
reading... filter colums... rename colums... change dtypes... appended to flight_list

On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2023_5.csv
reading... filter colums... rename colums... change dtypes... appended to flight_list

On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2023_6.csv
reading... filter colums... rename colums... change dtypes... appended to flight_list

On_Time_Reporting_Carrier_On_Time_Performance_

In [10]:
flights_all = pd.concat(flights_list)

In [12]:
flights_all

Unnamed: 0,flight_date,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,airline,tail_number,flight_number,origin,dest,air_time,actual_elapsed_time,distance,cancelled,diverted
0,2023-01-02,757.0,800,-3.0,853.0,905,-12.0,9E,N605LR,4628,BDL,LGA,25.0,56.0,101,0,0
1,2023-01-03,755.0,800,-5.0,857.0,905,-8.0,9E,N605LR,4628,BDL,LGA,37.0,62.0,101,0,0
2,2023-01-04,755.0,800,-5.0,844.0,905,-21.0,9E,N331PQ,4628,BDL,LGA,28.0,49.0,101,0,0
3,2023-01-05,754.0,800,-6.0,848.0,905,-17.0,9E,N906XJ,4628,BDL,LGA,38.0,54.0,101,0,0
4,2023-01-06,759.0,800,-1.0,849.0,905,-16.0,9E,N337PQ,4628,BDL,LGA,28.0,50.0,101,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591762,2024-03-29,1312.0,1315,-3.0,1452.0,1507,-15.0,YX,N236JQ,5646,BOS,DCA,75.0,100.0,399,0,0
591763,2024-03-29,1616.0,1620,-4.0,1743.0,1802,-19.0,YX,N236JQ,5646,DCA,BOS,67.0,87.0,399,0,0
591764,2024-03-29,1418.0,1425,-7.0,1634.0,1659,-25.0,YX,N212JQ,5647,BOS,CVG,110.0,136.0,752,0,0
591765,2024-03-29,1743.0,1751,-8.0,1951.0,1958,-7.0,YX,N212JQ,5647,CVG,BOS,107.0,128.0,752,0,0


In [14]:
# flights_all.to_csv(f'../../data/flights_from_{year_month[0]}_until_{year_month[-1]}.csv', index=False)