In [1]:
import glob
import os
import time
import warnings

import pandas as pd

In [2]:
# Silence only pandas' DtypeWarning -- the "mixed types ... set low_memory=False"
# message pd.read_csv emits when a column's inferred type differs between chunks.
# A blanket filterwarnings('ignore') would also hide deprecation and data-quality
# warnings from every later cell, so the filter is scoped to that one category.
warnings.filterwarnings('ignore', category=pd.errors.DtypeWarning)

In [3]:
# drop_columns.csv lists, in its 'ColNum' column, the columns to discard
# from the raw files; keep both the frame and a plain Python list of the values.
drop_col = pd.read_csv('drop_columns.csv', index_col=False)
drop_col_list = drop_col.loc[:, 'ColNum'].tolist()

# Lookup table of unique carrier IDs alongside their full names
carriers = pd.read_csv('lookup_tables/L_UNIQUE_CARRIERS.csv', index_col=False)

In [31]:
# Munge every raw CSV matching `path` into two combined files: flights that
# depart from `airport` and flights that arrive at `airport`, each with the
# carrier's full name attached. Relies on `drop_col_list` and `carriers`
# loaded in an earlier cell.
path = "raw/*.csv"
airport = 'LAX'
output_dir = 'clean'


def _select_airport_flights(flights, endpoint_col, delay_col, airport_code, carrier_names):
    """Return the rows of ``flights`` that touch ``airport_code``, with carrier names.

    Parameters
    ----------
    flights : pandas.DataFrame
        Data from one raw file; must contain ``endpoint_col``, ``delay_col``
        and a ``UniqueCarrier`` column.
    endpoint_col : str
        Column compared against ``airport_code`` ('Origin' or 'Dest').
    delay_col : str
        Column that must be non-null; rows where it is missing are dropped.
    airport_code : str
        Airport code to keep.
    carrier_names : pandas.Series
        Series named 'Description', indexed by carrier code.

    Returns
    -------
    pandas.DataFrame
        The filtered rows plus a ``CarrierName`` column (left join, so codes
        missing from the lookup yield NaN).
    """
    return (
        flights[flights[endpoint_col] == airport_code]
        .dropna(subset=[delay_col])
        .join(carrier_names, on='UniqueCarrier')
        .rename(columns={'Description': 'CarrierName'})
    )


start_time = time.perf_counter()

# Build the carrier code -> description lookup once instead of twice per file.
carrier_names = carriers.set_index('Code')['Description']

# glob returns files in arbitrary, OS-dependent order; sorting makes the row
# order of the combined output reproducible. The order is lexicographic, so
# "..._10.csv" sorts before "..._2.csv".
fnames = sorted(glob.glob(path))
if not fnames:
    # Fail early with a clear message rather than letting pd.concat choke on an empty list.
    raise FileNotFoundError(f'No input files match {path!r}')

# Make sure the destination exists before doing the expensive work.
os.makedirs(output_dir, exist_ok=True)

origin_df_list = []
dest_df_list = []

print('------------------------------\nBegin Data Munging\n------------------------------')

for counter, fname in enumerate(fnames, start=1):
    print(f'Processing file {counter:>2} of {len(fnames)} | {fname}')

    raw_df = pd.read_csv(fname, index_col=False)
    # drop_col_list holds column positions; translate them to labels and drop.
    raw_df = raw_df.drop(columns=raw_df.columns[drop_col_list])

    origin_df_list.append(
        _select_airport_flights(raw_df, 'Origin', 'DepDel15', airport, carrier_names)
    )
    dest_df_list.append(
        _select_airport_flights(raw_df, 'Dest', 'ArrDel15', airport, carrier_names)
    )

print('------------------------------\nSaving origin delay data...')
combined_origin = pd.concat(origin_df_list, ignore_index=True)
combined_origin.to_csv(f'{output_dir}/{airport}_Origin_Flight_Delays_2016.csv', encoding='utf-8', index=False)

print('Saving destination delay data...')
combined_dest = pd.concat(dest_df_list, ignore_index=True)
combined_dest.to_csv(f'{output_dir}/{airport}_Dest_Flight_Delays_2016.csv', encoding='utf-8', index=False)

print('------------------------------\nData Munging Complete')
print('Processing time: '+ str(round(time.perf_counter() - start_time, 2)) + ' seconds')
print('------------------------------')

------------------------------
Begin Data Munging
------------------------------
Processing file  1 of 12 | raw\On_Time_On_Time_Performance_2016_1.csv
Processing file  2 of 12 | raw\On_Time_On_Time_Performance_2016_10.csv
Processing file  3 of 12 | raw\On_Time_On_Time_Performance_2016_11.csv
Processing file  4 of 12 | raw\On_Time_On_Time_Performance_2016_12.csv
Processing file  5 of 12 | raw\On_Time_On_Time_Performance_2016_2.csv
Processing file  6 of 12 | raw\On_Time_On_Time_Performance_2016_3.csv
Processing file  7 of 12 | raw\On_Time_On_Time_Performance_2016_4.csv
Processing file  8 of 12 | raw\On_Time_On_Time_Performance_2016_5.csv
Processing file  9 of 12 | raw\On_Time_On_Time_Performance_2016_6.csv
Processing file 10 of 12 | raw\On_Time_On_Time_Performance_2016_7.csv
Processing file 11 of 12 | raw\On_Time_On_Time_Performance_2016_8.csv
Processing file 12 of 12 | raw\On_Time_On_Time_Performance_2016_9.csv
------------------------------
Saving origin delay data...
Saving destinatio