In [1]:
# import packages
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Load the raw flights extract into `df`; every later cell derives from this frame.
df = pd.read_csv(filepath_or_buffer='flights_raw.txt')
# Preview the first five rows as a quick check on the parsed columns.
df.head(n=5)

Unnamed: 0,row_num,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance,arr_delay
0,300,2018-01-01,B6,B6,B6,523,B6,N984JB,523,12478,...,12892,LAX,"Los Angeles, CA",1235,1550,N,375,1,2475,-6.0
1,600,2018-01-01,B6,B6,B6,1224,B6,N969JB,1224,12892,...,12478,JFK,"New York, NY",1700,110,N,310,1,2475,17.0
2,900,2018-01-01,HA,HA,HA,16,HA,N373HA,16,12173,...,14679,SAN,"San Diego, CA",1505,2220,N,315,1,2614,44.0
3,1200,2018-01-01,UA,UA_CODESHARE,UA,3443,YX,N979RP,3443,11292,...,14457,RAP,"Rapid City, SD",1130,1249,N,79,1,300,-7.0
4,1500,2018-01-01,AA,AA,AA,608,AA,N742PS,608,11298,...,10397,ATL,"Atlanta, GA",1645,1948,N,123,1,731,-7.0


In [3]:
# Derive a day-of-week feature from the flight date.
# `fl_date` arrives from the CSV as plain strings (see the first printed type),
# so it must be parsed to datetimes before the `.dt` accessor can be used.
# `.iloc[0]` picks the first row by position; a plain `[0]` is a label lookup
# and only works while the index still contains the label 0.
print(type(df['fl_date'].iloc[0]))
df['fl_date'] = pd.to_datetime(df['fl_date'])
print(type(df['fl_date'].iloc[0]))
# pandas encodes the weekday as Monday=0 through Sunday=6.
df['day_of_week'] = df['fl_date'].dt.dayofweek
# Show the distinct weekday codes that were produced.
set(df['day_of_week'])

<class 'str'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


{0, 1, 2, 3, 4, 5, 6}

In [4]:
# Separate the model inputs (X) from the target column (y, arrival delay).
# Note: this cell only selects columns; no train/test split is performed here.
features = [
    'op_unique_carrier',
    'crs_dep_time',
    'crs_arr_time',
    'crs_elapsed_time',
    'distance',
    'day_of_week',
]
X, y = df[features], df['arr_delay']
# Confirm both pieces have the same number of rows.
X.shape, y.shape

((53091, 6), (53091,))

### Create the preprocessed working file (one-hot encode the carrier, standardize the numeric features)

In [5]:
# Initialize the preprocessing transformers: a standardizer for the numeric
# columns and a one-hot encoder for the carrier code.
scaler = StandardScaler()
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output`, and later
# releases removed the old name. Try the current keyword first and fall back
# to the legacy one so this cell runs on both old and new installs. In either
# case the encoder is configured to return a dense array.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

In [31]:
# One-hot encode the operating-carrier code.
# The encoder is deliberately fitted on a bare (n_rows, 1) ndarray instead of a
# DataFrame: the notebook later calls get_feature_names_out(['carrier']) with
# its own prefix, which scikit-learn rejects when the encoder has recorded a
# different input column name from a DataFrame.
cat = X['op_unique_carrier']
cat_array = cat.to_numpy()[:, None]
cat_encoded = enc.fit_transform(cat_array)
# Display the dense indicator matrix (one column per carrier).
cat_encoded

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [32]:
# Inspect the carrier codes the encoder learned during fit_transform.
enc.categories_

[array(['9E', '9K', 'AA', 'AS', 'AX', 'B6', 'C5', 'CP', 'DL', 'EM', 'EV',
        'F9', 'G4', 'G7', 'HA', 'KS', 'MQ', 'NK', 'OH', 'OO', 'PT', 'QX',
        'UA', 'VX', 'WN', 'YV', 'YX', 'ZW'], dtype=object)]

In [33]:
# Build the output column labels by prefixing each learned carrier code with 'carrier'.
enc.get_feature_names_out(input_features=['carrier'])

array(['carrier_9E', 'carrier_9K', 'carrier_AA', 'carrier_AS',
       'carrier_AX', 'carrier_B6', 'carrier_C5', 'carrier_CP',
       'carrier_DL', 'carrier_EM', 'carrier_EV', 'carrier_F9',
       'carrier_G4', 'carrier_G7', 'carrier_HA', 'carrier_KS',
       'carrier_MQ', 'carrier_NK', 'carrier_OH', 'carrier_OO',
       'carrier_PT', 'carrier_QX', 'carrier_UA', 'carrier_VX',
       'carrier_WN', 'carrier_YV', 'carrier_YX', 'carrier_ZW'],
      dtype=object)

In [34]:
# Standardize every non-carrier column with the StandardScaler created earlier.
# NOTE(review): this treats `day_of_week` (codes 0-6) and the HHMM-style
# `crs_dep_time` / `crs_arr_time` values as continuous numbers -- confirm that
# is intended.
# NOTE(review): the scaler is fitted on all rows of X; if a train/test split
# happens downstream, the test rows have already influenced the mean/variance.
num = X.drop(columns=['op_unique_carrier'])
num_scaled = scaler.fit_transform(num)
# Display the scaled array.
num_scaled

array([[-0.19008473,  0.12852496,  3.28118944,  2.90238863, -1.47130951],
       [ 0.76082616, -2.66661571,  2.38072961,  2.90238863, -1.47130951],
       [ 0.36205708,  1.4290418 ,  2.44999576,  3.13899514, -1.47130951],
       ...,
       [-1.44774105, -1.11375978,  0.44127767,  0.40014704, -0.97024923],
       [-0.38844678, -0.48485313, -1.13799034, -1.06715381, -0.97024923],
       [-0.0571617 ,  1.25434551,  2.65779418,  3.0913334 , -0.97024923]])

In [37]:
# Recombine the processed pieces into a single feature frame.
# Both frames reuse X's index so rows stay aligned with X (and with y) even if
# the index is not the default 0..n-1 range.
cat_df = pd.DataFrame(
    cat_encoded,
    columns=enc.get_feature_names_out(['carrier']),
    index=X.index,
)
# Column labels are taken from `num` (the frame that was scaled) instead of a
# hand-typed list, so they cannot drift out of sync if `features` changes.
num_df = pd.DataFrame(num_scaled, columns=num.columns, index=X.index)
# Side-by-side concatenation: numeric columns first, then the carrier indicators.
X_scaled = pd.concat([num_df, cat_df], axis=1)
X_scaled.head()

Unnamed: 0,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,day_of_week,carrier_9E,carrier_9K,carrier_AA,carrier_AS,carrier_AX,...,carrier_OH,carrier_OO,carrier_PT,carrier_QX,carrier_UA,carrier_VX,carrier_WN,carrier_YV,carrier_YX,carrier_ZW
0,-0.190085,0.128525,3.281189,2.902389,-1.47131,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.760826,-2.666616,2.38073,2.902389,-1.47131,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.362057,1.429042,2.449996,3.138995,-1.47131,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.404807,-0.455737,-0.819366,-0.799908,-1.47131,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.648353,0.901071,-0.209824,-0.066257,-1.47131,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Sanity check: the row count should equal that of X, and the column count
# should be the numeric columns plus one indicator column per carrier.
X_scaled.shape

(53091, 33)

In [38]:
# Put the target column next to the processed features so both can be exported
# together. Rows are matched on the index, and `arr_delay` becomes the first column.
full_scaled_data = y.to_frame().merge(X_scaled, left_index=True, right_index=True)
# Display the combined frame.
full_scaled_data

Unnamed: 0,arr_delay,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,day_of_week,carrier_9E,carrier_9K,carrier_AA,carrier_AS,...,carrier_OH,carrier_OO,carrier_PT,carrier_QX,carrier_UA,carrier_VX,carrier_WN,carrier_YV,carrier_YX,carrier_ZW
0,-6.0,-0.190085,0.128525,3.281189,2.902389,-1.471310,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,17.0,0.760826,-2.666616,2.380730,2.902389,-1.471310,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,44.0,0.362057,1.429042,2.449996,3.138995,-1.471310,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-7.0,-0.404807,-0.455737,-0.819366,-0.799908,-1.471310,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-7.0,0.648353,0.901071,-0.209824,-0.066257,-1.471310,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53086,2.0,0.433631,0.904953,0.718342,1.040176,-0.970249,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
53087,21.0,-0.967173,-0.143225,0.787608,1.252952,-0.970249,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
53088,-11.0,-1.447741,-1.113760,0.441278,0.400147,-0.970249,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
53089,-13.0,-0.388447,-0.484853,-1.137990,-1.067154,-0.970249,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Write the combined target + features frame to disk as CSV.
# With pandas' default settings the row index is written as the first,
# unnamed column of the file.
full_scaled_data.to_csv(path_or_buf='full_scaled_data.csv')