In [1]:
import pandas as pd
import numpy as np
from numpy import sort
import random
import matplotlib.pyplot as plt
import seaborn as sns
import copy

import datetime as dt
from datetime import date, time

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

from scipy.stats import boxcox
from numpy import log

from sklearn.model_selection import train_test_split

from sklearn import metrics
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error

import pickle

In [48]:
# Load the raw test set.
# NOTE(review): `filepath` is an absolute, machine-specific directory; it is
# kept as a plain string with a trailing slash because a later cell builds
# another file name by concatenating onto it.
filepath = 'C:/Users/Tim/Desktop/lighthouse/w6 - midterm/'
file = 'final_test.csv'
data = pd.read_csv(f'{filepath}{file}')

In [49]:
# Reference frames used next to the test set.
#
# o_data is read further down (grouped by 'mkt_unique_carrier' and by
# 'tail_num' to average 'arr_delay'), so it has to be defined here for the
# notebook to run from a fresh kernel. The load had been commented out, which
# left those later cells dependent on leftover kernel state.
# NOTE(review): 'mini_sample.csv' is the file named in the original
# commented-out line - confirm it is the intended source and that it contains
# the 'mkt_unique_carrier', 'tail_num' and 'arr_delay' columns.
o_data = pd.read_csv(filepath+'mini_sample.csv')

# o_X is only referenced by a commented-out column-set comparison below.
o_X = pd.read_csv(filepath+'X.csv')

In [50]:
def info(x):
    """Summarise missing values, dtype and non-null count for every column.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``x`` with the columns:

        - ``number_missing``: count of null entries,
        - ``percent_missing``: null entries divided by the number of rows
          (a fraction between 0 and 1, not multiplied by 100),
        - ``type``: the column dtype,
        - ``count``: count of non-null entries.

        Rows are ordered by ``number_missing``, largest first. Columns with
        the same number of missing values keep their original relative order.
    """
    # Scan for nulls once and reuse the result for both missing-value columns.
    null_counts = x.isnull().sum()
    summary = pd.DataFrame({
        'number_missing': null_counts,
        'percent_missing': null_counts / len(x),
        'type': x.dtypes,
        'count': x.count(),
    })
    # A stable sort keeps the row order deterministic when many columns tie
    # (typically all the columns with zero missing values).
    return summary.sort_values('number_missing', ascending=False, kind='stable')

In [51]:
# Display the per-column missing-value / dtype / non-null-count summary of `data`.
info(data)

Unnamed: 0,number_missing,percent_missing,type,count
tail_num,124,0.000823,object,150499
year,0,0.0,int64,150623
season,0,0.0,int64,150623
dest_airport_size,0,0.0,int64,150623
origin_airport_size,0,0.0,int64,150623
predicted_speed,0,0.0,float64,150623
dest_fl_density,0,0.0,int64,150623
origin_fl_density,0,0.0,int64,150623
crs_dep_hour,0,0.0,int64,150623
crs_arr_hour,0,0.0,int64,150623


In [52]:
# Discard every row that contains at least one missing value.
data = data.dropna(axis=0, how='any')

In [53]:
# Columns with more than 80% of their values missing.
drop = [
    'no_name', 'total_add_gtime', 'first_dep_time',
    'longest_add_gtime', 'cancellation_code',
]

# Delay-cause columns: also 80%+ missing, but possibly still informative.
delay_cols = [
    'late_aircraft_delay', 'security_delay', 'nas_delay',
    'weather_delay', 'carrier_delay',
]

In [54]:
# Split "<city>,<rest>" place names into two columns at the FIRST comma.
#
# n=1 caps the split at one cut, so the result can never have more than two
# columns. With the previous n=2 a single name containing two commas produced
# a third column and the two-column assignment below raised a ValueError.
# NOTE(review): the part after the comma is stored as '*_country' and keeps
# any leading space from the source text; both derived columns are listed for
# removal in `place_identifiers` further down.

data[['origin_city', 'origin_country']] = data['origin_city_name'].str.split(',', expand=True, n=1)

data[['dest_city', 'dest_country']] = data['dest_city_name'].str.split(',', expand=True, n=1)

# try using just city data, groupby country data?

In [55]:
# Identifier columns that are removed from the feature set. Three related
# columns are intentionally NOT listed because they are used as features:
# 'mkt_carrier', 'origin' and 'dest'.

# Related carrier / aircraft / flight-number identifiers.
carrier_identifiers = [
    'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier_fl_num',
    'op_unique_carrier', 'tail_num', 'op_carrier_fl_num',
]

# Related airport / place identifiers: origin entries first, then destination.
place_identifiers = [
    'origin_airport_id', 'origin_city_name', 'origin_city', 'origin_country',
    'dest_airport_id', 'dest_city_name', 'dest_city', 'dest_country',
]

In [56]:
# Flight-traffic features per destination airport.


def _rows_per_distinct(values):
    """Return the number of rows in a group divided by its distinct values."""
    return len(values) / values.nunique()


# Bin the scheduled arrival time by rounding it to the nearest hundred.
# NOTE(review): if crs_arr_time is an HHMM number this is not a pure hour
# floor (e.g. 1455 lands in the 1500 bin) - confirm that is intended.
data['hrly_bin'] = data['crs_arr_time'].round(-2)

by_dest = data.groupby('dest')
# Rows per destination divided by the number of distinct bins / days seen there.
data['avg_hr_fl'] = by_dest['hrly_bin'].transform(_rows_per_distinct)
data['avg_day_fl'] = by_dest['day'].transform(_rows_per_distinct)
# data = data.drop(columns='hrly_bin')

In [57]:
# Sort the rows by fl_date, then remove that column. fl_date is sorted in
# whatever dtype it was loaded with, because the datetime conversion below is
# disabled.
#
# --- disabled experiments, kept for reference -------------------------------
# data['fl_date'] = pd.to_datetime(data['fl_date'])
#
# month and year
# data['day_of_week'] = data['fl_date'].dt.date.isowekday()
# data['month'] = data['fl_date'].dt.month
# data['year'] = data['fl_date'].dt.year
#
# dep_delay 7 day lag
# data['dep_delay_lag'] = data['dep_delay'].shift(7)
# data['arr_delay_lag'] = data['arr_delay'].shift(7)
#
# dep_delay rolling mean (the original note says "7 day", the window is 30 rows)
# data['ddl_rolling_mean'] = data['dep_delay'].rolling(window=30).mean()
#
# if we use classifier
# data['arr_delay_flag'] = data['arr_delay'].map(lambda x: 1 if x > 0 else 0)
# -----------------------------------------------------------------------------
data = data.sort_values(by=['fl_date']).drop(columns='fl_date')

In [58]:
# Highly correlated column pairs recorded by the author (pair -> correlation):
#   dep_time            / wheels_off : 0.9725230213908642
#   wheels_on           / arr_time   : 0.9630471247128861
#   crs_elapsed_time    / distance   : 0.9827710593474663
#   actual_elapsed_time / air_time   : 0.985116094313287
#
# flights = column of 1s

# Dropped for being strongly correlated with another column.
# Considered but kept: 'wheels_off' and 'wheels_on' (dep/arr time could carry
# time-of-day information; wheels off/on is probably time spent on the
# tarmac), 'timestamp', 'dist_group'.
corr_drop = ['distance']

# don't use dep_delay
#
# Additional columns to drop. Candidates that were considered but are
# currently NOT dropped: origin_num_freight, origin_num_passengers,
# crs_arr_daytime, crs_dep_daytime, actual_elapsed_time, predicted_speed,
# actual_speed, dep_early_morning, origin_num_intl_dep, dest_num_intl_dep,
# origin_num_intl_arr, dest_num_intl_arr, mean_monthly_op_carrier_delay,
# mean_mo_delay_origin_airport, mean_mo_delay_dest_airport, air_time,
# taxi_in, taxi_out, diverted, cancelled, dep_delay, dep_time, arr_time,
# day_of_year.
extra_drop = ['flights', 'crs_elapsed_time', 'dup']

data = data.drop(columns=[*corr_drop, *extra_drop])

In [59]:
# Mean arrival delay per marketing carrier, computed from o_data and attached
# to every row of `data` as 'mean_carrier_arr_delay'.
#
# 'arr_delay' is selected BEFORE aggregating: calling .mean() on the whole
# grouped frame averages every column and, on pandas 2.x, raises a TypeError
# as soon as a non-numeric column is present.
cd = (
    o_data.groupby('mkt_unique_carrier')['arr_delay']
    .mean()
    .reset_index(name='mean_carrier_arr_delay')
)
# Left merge: carriers missing from o_data get NaN in the new column.
data = data.merge(cd, on=['mkt_unique_carrier'], how='left')

In [60]:
# Mean arrival delay per aircraft tail number, computed from o_data and
# attached to every row of `data` as 'tail_num_arr_delay'.
#
# 'arr_delay' is selected BEFORE aggregating: calling .mean() on the whole
# grouped frame averages every column and, on pandas 2.x, raises a TypeError
# as soon as a non-numeric column is present.
td = (
    o_data.groupby('tail_num')['arr_delay']
    .mean()
    .reset_index(name='tail_num_arr_delay')
)
# Left merge: tail numbers missing from o_data get NaN in the new column.
data = data.merge(td, on=['tail_num'], how='left')

In [61]:
# Replace mkt_carrier, origin and dest with integer codes from pd.factorize.
# NOTE(review): the codes are derived from this frame alone - verify they line
# up with whatever encoding the training data used.
data = data.assign(**{
    name: pd.factorize(data[name])[0]
    for name in ('mkt_carrier', 'origin', 'dest')
})

In [62]:
# Remove the carrier and place identifier columns.
# Not included here: `drop` (already dropped) and `delay_cols` (not really
# sure what to do with these columns; already dropped).
data = data.drop(columns=[*carrier_identifiers, *place_identifiers])

In [63]:
# Re-express snowfall as totalSnow_cm * 10 in a new 'totalSnow_mm' column
# (appended at the end of the frame), then drop the original cm column.
data = (
    data
    .assign(totalSnow_mm=data['totalSnow_cm'] * 10)
    .drop(columns='totalSnow_cm')
)

In [64]:
# crs_dep_time / crs_arr_time hold values in the range 0-2.4k. Deriving the
# hour from them is disabled (the frame already has crs_dep_hour and
# crs_arr_hour columns), so the raw time columns are simply removed.
#
# data['crs_dep_hour'] = round(data['crs_dep_time']/60)
# data['crs_arr_hour'] = round(data['crs_arr_time']/60)
data = data.drop(['crs_dep_time', 'crs_arr_time'], axis=1)

In [73]:
# Reshuffle the rows. The fixed random_state makes the row order - and
# therefore the exported CSV - identical on every run; without it each
# execution produced a different ordering.
data = data.sample(frac=1, random_state=42)
# reset_index() is called without drop=True, so the pre-shuffle index is kept
# as a new 'index' column and is carried along with the other columns.
# NOTE(review): confirm downstream consumers really expect that column.
data = data.reset_index()
# data = data[data['arr_delay']<=200]

In [74]:
# Partition the column names by dtype: object columns are treated as
# categorical, everything else as numerical.
is_object = data.dtypes == 'object'

numerical = is_object.index[~is_object].tolist()

categorical = is_object.index[is_object].tolist()

In [75]:
# Display the names of the non-object columns.
numerical

['index',
 'mkt_carrier',
 'origin',
 'dest',
 'sunHour',
 'cloudcover',
 'precipMM',
 'day',
 'month',
 'year',
 'weekday',
 'crs_arr_hour',
 'crs_dep_hour',
 'origin_fl_density',
 'dest_fl_density',
 'predicted_speed',
 'origin_airport_size',
 'dest_airport_size',
 'season',
 'dist_group',
 'mean_op_carrier_delay',
 'mean_delay_origin_airport',
 'mean_delay_dest_airport',
 'origin_num_passengers',
 'dest_num_passengers',
 'origin_num_freight',
 'dest_num_freight',
 'hrly_bin',
 'avg_hr_fl',
 'avg_day_fl',
 'mean_carrier_arr_delay',
 'tail_num_arr_delay',
 'totalSnow_mm',
 'weather_type_Rainy',
 'weather_type_Snowy',
 'weather_type_Sunny']

In [76]:
# Display the names of the object-dtype columns.
categorical

[]

In [77]:
# One-hot encode the columns listed in `categorical`, dropping the first level
# of each encoded column.
data = pd.get_dummies(data, columns=categorical, drop_first=True)

In [81]:
# set(o_X.columns)-set(data.columns)

In [36]:
# Discard every row that still contains a missing value, including rows whose
# left-merged delay averages found no match.
# NOTE(review): this cell's recorded execution count is out of sequence with
# its neighbours - re-run the notebook top to bottom before trusting the export.
data = data.dropna(axis=0, how='any')

In [79]:
# col = ['mean_delay_origin_airport', 
#        'avg_day_fl', 
#        'origin_fl_density', 
#        'year',
#        'weather_type_Rainy', 
#        'origin_num_freight', 
#        'crs_arr_hour', 
#        'weather_type_Snowy', 
#        'dest_fl_density', 
#        'mean_op_carrier_delay', 
#        'mean_delay_dest_airport', 
#        'dist_group', 
#        'day', 
#        'dest_num_passengers', 
#        'dest_airport_size', 
#        'season', 
#        'dest', 
#        'weather_type_Sunny', 
#        'weekday', 
#        'origin_airport_size', 
#        'dest_num_freight',
#        'mkt_carrier', 
#        'mean_carrier_arr_delay', 
#        'origin_num_passengers', 
#        'origin', 
#        'month', 
#        'tail_num_arr_delay', 
#        'precipMM', 
#        'crs_dep_hour', 
#        'totalSnow_mm', 
#        'index', 
#        'sunHour', 
#        'cloudcover', 
#        'avg_hr_fl', 
#        'hrly_bin', 
#        'predicted_speed']
# data = data[col]

In [80]:
# Write the cleaned frame to the current working directory. index=True is the
# pandas default, spelled out here to make it visible that the row index is
# written as the first (unnamed) CSV column.
data.to_csv('final_test_cleaned.csv', index=True)

In [83]:
# Display the column labels of the cleaned frame.
data.columns

Index(['index', 'mkt_carrier', 'origin', 'dest', 'sunHour', 'cloudcover',
       'precipMM', 'day', 'month', 'year', 'weekday', 'crs_arr_hour',
       'crs_dep_hour', 'origin_fl_density', 'dest_fl_density',
       'predicted_speed', 'origin_airport_size', 'dest_airport_size', 'season',
       'dist_group', 'mean_op_carrier_delay', 'mean_delay_origin_airport',
       'mean_delay_dest_airport', 'origin_num_passengers',
       'dest_num_passengers', 'origin_num_freight', 'dest_num_freight',
       'hrly_bin', 'avg_hr_fl', 'avg_day_fl', 'mean_carrier_arr_delay',
       'tail_num_arr_delay', 'totalSnow_mm', 'weather_type_Rainy',
       'weather_type_Snowy', 'weather_type_Sunny'],
      dtype='object')