In [1]:
# use the natsort package to sort file names whose month number has no leading zero (e.g. "..._2024_2" before "..._2024_10")
# (sorting is optional; it only matters if you want the big file to be concatenated in chronological order)
# pip install natsort

# in case tqdm is not installed
# pip install tqdm

In [2]:
import pandas as pd
import os
from natsort import os_sorted
from tqdm.notebook import tqdm

# Raw Data Source: https://transtats.bts.gov/PREZIP/

The ZIP files we are interested in have this naming: "On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2024_2.zip"  
Each ZIP file contains a CSV with data for one month. The last part "..._2024_2" of the file name indicates that this file contains data for February 2024.

> SOURCE: https://transtats.bts.gov

#### Working scenario:
- download all relevant ZIP files to a folder `downloads` in our repo
- extract each CSV into the folder `downloads/extracted`

In [None]:
# sanity check: load one monthly file on its own and look at its first two rows
flights_raw = pd.read_csv(
    './downloads/extracted/On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2024_2.csv',
    low_memory=False,
)
flights_raw.head(2)

In [4]:
# select columns to keep
def cols_to_keep(flights_raw):
    """Return only the columns of a raw monthly table that the pipeline uses.

    Parameters
    ----------
    flights_raw : pandas.DataFrame
        Frame holding (at least) every column named in the list below.

    Returns
    -------
    pandas.DataFrame
        The listed columns, in the order they are listed here.

    Raises
    ------
    KeyError
        Raised by pandas when one of the listed columns is not present.
    """
    wanted = [
        # date, departure and arrival times / delays
        "FlightDate", "DepTime", "CRSDepTime", "DepDelay",
        "ArrTime", "CRSArrTime", "ArrDelay",
        # carrier, aircraft and flight identifiers
        "Reporting_Airline", "Tail_Number", "Flight_Number_Reporting_Airline",
        # route
        "Origin", "Dest",
        # durations, distance and status flags
        "AirTime", "ActualElapsedTime", "Distance", "Cancelled", "Diverted",
    ]
    return flights_raw.loc[:, wanted]

In [5]:
# rename columns
def rename_cols(flights):
    """Return a copy of ``flights`` with the source column names mapped to snake_case.

    The input frame is left untouched: the previous implementation renamed
    with ``inplace=True``, which silently relabelled the caller's DataFrame
    as a side effect of calling this function.

    Parameters
    ----------
    flights : pandas.DataFrame
        Frame whose columns use the original source names (keys of the
        mapping below). Columns not present in the mapping keep their name.

    Returns
    -------
    pandas.DataFrame
        A new frame with the renamed columns.
    """
    new_column_names = {
        'FlightDate': 'flight_date',
        'DepTime': 'dep_time',
        'CRSDepTime': 'sched_dep_time',
        'DepDelay': 'dep_delay',
        'ArrTime': 'arr_time',
        'CRSArrTime': 'sched_arr_time',
        'ArrDelay': 'arr_delay',
        'Reporting_Airline': 'airline',
        'Tail_Number': 'tail_number',
        'Flight_Number_Reporting_Airline': 'flight_number',
        'Origin': 'origin',
        'Dest': 'dest',
        'AirTime': 'air_time',
        'ActualElapsedTime': 'actual_elapsed_time',
        'Distance': 'distance',
        'Cancelled': 'cancelled',
        'Diverted': 'diverted'
    }
    # without inplace=True, rename() hands back a new frame and leaves the argument alone
    return flights.rename(columns=new_column_names)

In [6]:
# change datatype
def change_dtypes(flights):
    """Cast the renamed flight columns to their target dtypes.

    Parameters
    ----------
    flights : pandas.DataFrame
        Frame containing every column named in the mapping below.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``astype`` with the dtypes listed below.

    Notes
    -----
    NOTE(review): the integer targets cannot represent missing values, so this
    presumably relies on those columns always being populated in the source
    files - confirm.
    """
    target_dtypes = dict(
        flight_date='datetime64[ns]',
        dep_time='float64',
        sched_dep_time='int16',
        dep_delay='float64',
        arr_time='float64',
        sched_arr_time='int16',
        arr_delay='float64',
        airline='O',
        tail_number='O',
        flight_number='int64',
        origin='O',
        dest='O',
        air_time='float64',
        actual_elapsed_time='float64',
        distance='int16',
        cancelled='int16',
        diverted='int16',
    )
    return flights.astype(target_dtypes)

In [None]:
# getting the CSV file names for the for-loop (sorted with function 'os_sorted' from the 'natsort' package)
# os.listdir returns every directory entry, so anything that is not a CSV
# (hidden files such as .DS_Store, leftover ZIPs, ...) is skipped here;
# otherwise the file-name parsing and pd.read_csv calls further down would fail on it
data_files = os_sorted(
    [name for name in os.listdir('./downloads/extracted/') if name.lower().endswith('.csv')]
)
data_files

In [None]:
# period label of every file: the text after ")_" with the last 4 characters (the extension) cut off;
# the first and last labels give the period range used in the big file's name
year_month = [file_name.split(')_')[1][:-4] for file_name in data_files]
'flights_from_{}_until_{}'.format(year_month[0], year_month[-1])

In [None]:
# Read every monthly CSV, trim/rename/cast its columns and collect the
# resulting frames in flights_list (one element per file).
flights_list = []

for file in tqdm(data_files):
    print(file)
    # each stage label is printed *before* the stage runs, so if a step raises,
    # the last label shown is the stage that failed

    print('reading...', end=" ")
    flights_raw = pd.read_csv(f'./downloads/extracted/{file}', low_memory = False) # read as a dataframe

    print('filter columns...', end=" ")
    flights_select = cols_to_keep(flights_raw) # select columns to keep

    print('rename columns...', end=" ")
    flights_rename = rename_cols(flights_select) # rename columns

    print('change dtypes...', end=" ")
    flights_dtypes = change_dtypes(flights_rename) # change data types

    flights_list.append(flights_dtypes) # add to the list of dataframes
    print('appended to flights_list\n')

print(f'Done. The list has {len(flights_list)} elements')

In [10]:
# stack all monthly frames into one table; ignore_index=True gives the result a
# fresh 0..n-1 index instead of carrying over each frame's own row labels,
# which would otherwise repeat across files
flights_all = pd.concat(flights_list, ignore_index=True)

In [None]:
# inspect the combined frame built from all monthly files
flights_all

In [14]:
# just as a backup we can save our data as a CSV file

# flights_all.to_csv(f'../../data/flights_from_{year_month[0]}_until_{year_month[-1]}.csv', index=False)

## To-Do: Load the data to your Project Schemas in our Database.

In [None]:
## add code here...