In [1]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import tree
import graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.display import display, HTML

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
from datetime import datetime

def to_minutes(x):
    h,m = x.split(':')
    return int(h)*60 + int(m)

def findDayOfWeek(dateStr):
    date_object = datetime.strptime(dateStr, '%m/%d/%Y')
    dayOfWeek = date_object.strftime('%A')
    return dayOfWeek 

# findDayOfWeek('01/01/2006')

def flightNoStr(x):
    return str(int(x))

def classifyDelay(delay):
    if delay < -5:
        return 0 #flight is early
    elif delay > 5:
        return 1 #flight is delayed
    else:
        return 2 #flight is ontime
# def calculateDepTime()


In [3]:

# cols_to_be_dropped=['DATE', 'DAY', 'FLIGHT NUMBER', 'CARRIER DELAY', 'WEATHER DELAY', 'precipcover', 'AIRLINE YEARLY ON-TIME ARR PERCENTAGE']
cols_to_be_dropped=['DATE', 'DAY', 'FLIGHT NUMBER', 'CARRIER DELAY', 'WEATHER DELAY', 'precipcover']
# cols_to_be_dropped=['DATE', 'DAY', 'FLIGHT NUMBER', 'CARRIER DELAY', 'WEATHER DELAY', 'precipcover', 'snowdepth']
# cols_to_be_dropped=['DATE', 'DAY', 'FLIGHT NUMBER', 'CARRIER DELAY', 'WEATHER DELAY', 'precipcover', 'tempmin']

In [4]:
data_1 = pd.read_csv('data/data.csv')

# flight_data.columns
# flight_data['FLIGHT NUMBER'].unique().shape
data_1.isna().sum()
data_1 = data_1.drop(columns = cols_to_be_dropped)
data_1.head()

DATE                                     0
DAY                                      0
FLIGHT NUMBER                            0
ORIGIN                                   0
DEPARTURE TIME                           0
ARRIVAL TIME                             0
CARRIER DELAY                            0
WEATHER DELAY                            0
AIRPORT YEARLY ON-TIME DEP PERCENTAGE    0
AIRLINE YEARLY ON-TIME ARR PERCENTAGE    0
ARRIVAL STATUS                           0
weather_code                             0
tempmin                                  0
precipcover                              0
snowdepth                                0
windspeedmax                             0
visibility                               0
cloudcover                               0
dtype: int64

Unnamed: 0,ORIGIN,DEPARTURE TIME,ARRIVAL TIME,AIRPORT YEARLY ON-TIME DEP PERCENTAGE,AIRLINE YEARLY ON-TIME ARR PERCENTAGE,ARRIVAL STATUS,weather_code,tempmin,snowdepth,windspeedmax,visibility,cloudcover
0,ORD,420,526,78.21,79.13,0,partly_cloudy_day,7.8,0.0,14.7,9.9,61.3
1,ORD,1067,1167,77.53,83.32,0,cloudy,31.9,2.1,13.9,8.6,98.7
2,ORD,871,970,74.4,83.73,2,partly_cloudy_day,20.9,0.7,17.6,9.7,63.5
3,ORD,1256,1359,76.58,85.0,2,snow,33.4,0.1,14.8,6.5,100.0
4,ORD,895,997,70.37,77.39,1,cloudy,28.9,0.2,15.9,7.8,100.0


In [5]:
data_2 = pd.read_csv('data/data_with_prev_flight_status.csv')
# data_2.head()
# flight_data.columns
# flight_data['FLIGHT NUMBER'].unique().shape
data_2.isna().sum()
data_2 = data_2.drop(columns = cols_to_be_dropped)
data_2.head()

DATE                                     0
DAY                                      0
FLIGHT NUMBER                            0
ORIGIN                                   0
DEPARTURE TIME                           0
ARRIVAL TIME                             0
CARRIER DELAY                            0
WEATHER DELAY                            0
AIRPORT YEARLY ON-TIME DEP PERCENTAGE    0
AIRLINE YEARLY ON-TIME ARR PERCENTAGE    0
ARRIVAL STATUS                           0
weather_code                             0
tempmin                                  0
precipcover                              0
snowdepth                                0
windspeedmax                             0
visibility                               0
cloudcover                               0
previous_flight_status                   0
dtype: int64

Unnamed: 0,ORIGIN,DEPARTURE TIME,ARRIVAL TIME,AIRPORT YEARLY ON-TIME DEP PERCENTAGE,AIRLINE YEARLY ON-TIME ARR PERCENTAGE,ARRIVAL STATUS,weather_code,tempmin,snowdepth,windspeedmax,visibility,cloudcover,previous_flight_status
0,MCO,776,935,81.88,77.45,2,partly_cloudy_day,7.8,0.0,14.7,9.9,61.3,0
1,MCO,1095,1253,81.88,77.45,0,partly_cloudy_day,7.8,0.0,14.7,9.9,61.3,2
2,JFK,405,475,78.98,77.45,1,partly_cloudy_day,7.8,0.0,14.7,9.9,61.3,0
3,JFK,670,745,78.98,77.45,2,partly_cloudy_day,7.8,0.0,14.7,9.9,61.3,1
4,JFK,1020,1112,78.98,77.45,2,partly_cloudy_day,7.8,0.0,14.7,9.9,61.3,2


In [6]:
#Building XGBoost model using the finalData
encoded_data_1 = pd.get_dummies(data_1.drop(columns=['ARRIVAL STATUS']), drop_first=True)
encoded_data_1.head()

Unnamed: 0,DEPARTURE TIME,ARRIVAL TIME,AIRPORT YEARLY ON-TIME DEP PERCENTAGE,AIRLINE YEARLY ON-TIME ARR PERCENTAGE,tempmin,snowdepth,windspeedmax,visibility,cloudcover,ORIGIN_MCO,ORIGIN_ORD,weather_code_cloudy,weather_code_partly_cloudy_day,weather_code_rain,weather_code_snow,weather_code_wind
0,420,526,78.21,79.13,7.8,0.0,14.7,9.9,61.3,False,True,False,True,False,False,False
1,1067,1167,77.53,83.32,31.9,2.1,13.9,8.6,98.7,False,True,True,False,False,False,False
2,871,970,74.4,83.73,20.9,0.7,17.6,9.7,63.5,False,True,False,True,False,False,False
3,1256,1359,76.58,85.0,33.4,0.1,14.8,6.5,100.0,False,True,False,False,False,True,False
4,895,997,70.37,77.39,28.9,0.2,15.9,7.8,100.0,False,True,True,False,False,False,False


In [7]:
#Building XGBoost model using the finalData
encoded_data_2 = pd.get_dummies(data_2.drop(columns=['ARRIVAL STATUS']), drop_first=True)
encoded_data_2.head()

Unnamed: 0,DEPARTURE TIME,ARRIVAL TIME,AIRPORT YEARLY ON-TIME DEP PERCENTAGE,AIRLINE YEARLY ON-TIME ARR PERCENTAGE,tempmin,snowdepth,windspeedmax,visibility,cloudcover,previous_flight_status,ORIGIN_MCO,ORIGIN_ORD,weather_code_cloudy,weather_code_partly_cloudy_day,weather_code_rain,weather_code_snow,weather_code_wind
0,776,935,81.88,77.45,7.8,0.0,14.7,9.9,61.3,0,True,False,False,True,False,False,False
1,1095,1253,81.88,77.45,7.8,0.0,14.7,9.9,61.3,2,True,False,False,True,False,False,False
2,405,475,78.98,77.45,7.8,0.0,14.7,9.9,61.3,0,False,False,False,True,False,False,False
3,670,745,78.98,77.45,7.8,0.0,14.7,9.9,61.3,1,False,False,False,True,False,False,False
4,1020,1112,78.98,77.45,7.8,0.0,14.7,9.9,61.3,2,False,False,False,True,False,False,False


In [8]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(encoded_data_1, data_1['ARRIVAL STATUS'], stratify = data_1['ARRIVAL STATUS'], test_size=0.2, random_state=42)
X_train_1.shape
X_test_1.shape

(11269, 16)

(2818, 16)

In [9]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(encoded_data_2, data_2['ARRIVAL STATUS'], stratify = data_2['ARRIVAL STATUS'], test_size=0.2, random_state=42)
X_train_2.shape
X_test_2.shape

(11269, 17)

(2818, 17)

In [10]:
from sklearn.preprocessing import StandardScaler
scaler_1 = StandardScaler()
X_train_scaled_1 = pd.DataFrame(scaler_1.fit_transform(X_train_1), columns=X_train_1.columns, index=X_train_1.index)
X_test_scaled_1 = pd.DataFrame(scaler_1.transform(X_test_1), columns=X_test_1.columns, index=X_test_1.index)

scaler_total_1 = StandardScaler()
total_Data_scaled_1 = pd.DataFrame(scaler_total_1.fit_transform(encoded_data_1), columns=encoded_data_1.columns, index=encoded_data_1.index)
total_y_1 = data_1['ARRIVAL STATUS']
# X_test_scaled

In [11]:
# from sklearn.preprocessing import StandardScaler
scaler_2 = StandardScaler()
X_train_scaled_2 = pd.DataFrame(scaler_2.fit_transform(X_train_2), columns=X_train_2.columns, index=X_train_2.index)
X_test_scaled_2 = pd.DataFrame(scaler_2.transform(X_test_2), columns=X_test_2.columns, index=X_test_2.index)

scaler_total_2 = StandardScaler()
total_Data_scaled_2 = pd.DataFrame(scaler_total_2.fit_transform(encoded_data_2), columns=encoded_data_2.columns, index=encoded_data_2.index)
total_y_2 = data_2['ARRIVAL STATUS']
# X_test_scaled

In [12]:
# X_train_scaled.rename({'DEPARTURE TIME':'DEPARTURE_TIME', 'ARRIVAL TIME':'ARRIVAL_TIME'})
# X_test_scaled.rename({'DEPARTURE TIME':'DEPARTURE_TIME', 'ARRIVAL TIME':'ARRIVAL_TIME'})

# dtrain_class = xgb.DMatrix(X_train_scaled, y_train, enable_categorical=True)
# dtest_class = xgb.DMatrix(X_test_scaled, y_test, enable_categorical=True)



In [13]:
def accuracy_score(test_output):
    total_count = len(test_output)
    wrong_count = len(test_output[test_output['Predicted Arrival Status'] != test_output['ARRIVAL STATUS']])
    accuracy = wrong_count/total_count
    return accuracy*100

In [14]:
# gridsearch for hyper parameters
# xgb_clf = xgb.XGBClassifier()

# # Define the hyperparameters grid for tuning
# param_grid = {
#     'n_estimators': [100, 150, 200],
#     'max_depth': [5, 10, 15],
#     'learning_rate': [0.05, 0.1],
#     'gamma': [0.03, 0.5, 1],
#     'booster': ['gbtree', 'dart'],
#     'objective': ['multi:softmax']
#     'colsample_bytree': [0.3, 0.5, 0.6],
#     'reg_lambda': [0, 10, 50, 150],
#     # 'reg_alpha': [0, 0.1, 150]
#     # 'C': [0.1, 1, 10],  # Regularization parameter
#     # 'gamma': ['scale', 'auto', 0.1, 0.01],  # Kernel coefficient
#     # 'kernel': ['linear', 'rbf']  # Kernel type
#     # 'learning_rates': [0.05, 0.1, 0.2, 0.3, 0.4],
#     # 'tree_depth': [4, 6, 7, 8, 10, 15]
# }

# # Perform grid search with cross-validation
# grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, cv=2, scoring='accuracy')
# grid_search.fit(X_train_scaled_1, y_train_1)

# # Get the best hyperparameters
# best_params = grid_search.best_params_
# print("Best Hyperparameters:", best_params)

In [None]:
# params = best_params #{'gamma': 0.1, 'learning_rates': 0.05, 'reg_alpha': 1, 'reg_lambda': 10, 'tree_depth': 4}
# xgbModel = xgb.XGBClassifier(**params)
# # xgbModel = GradientBoostingClassifier()
# # xgbModel = GradientBoostingClassifier()
# xgbModel.fit(X_train_scaled_1, y_train_1)
# y_pred = pd.DataFrame(xgbModel.predict(X_test_scaled_1), columns=['Predicted Arrival Status'], index=X_test_scaled_1.index)

# test_output = y_pred.merge(y_test_1, left_index=True, right_index=True)
# # test_output.head()
# accuracy = accuracy_score(test_output)
# print('Accuracy of the xgb model is {}'.format(accuracy))
# # if accuracy > best_accuracy_1:
# #     best_accuracy_1 = accuracy
# #     best_params_1 = params

In [15]:
tree_methods = ['exact', 'hist', 'approx']
tree_depth = [4, 6, 7, 8, 10, 15, 20]
learning_rates = [0.05, 0.1, 0.5, 0.9]

best_accuracy_1 = 0
best_params_1 = {}

In [16]:
for method in tree_methods:
    for depth in tree_depth:
        for lr in learning_rates:
            params = {
                "objective": "multi:softmax",
                "tree_method": method,
                "max_depth" : depth,
                "booster": "gbtree",
                "gamma": "0.5",
                "learning_rate" : lr,
                "n_estimators": 200
            }
            xgbModel = xgb.XGBClassifier(**params)
            # xgbModel = GradientBoostingClassifier()
            # xgbModel = GradientBoostingClassifier()
            xgbModel.fit(X_train_scaled_1, y_train_1)
            y_pred = pd.DataFrame(xgbModel.predict(X_test_scaled_1), columns=['Predicted Arrival Status'], index=X_test_scaled_1.index)
            
            test_output = y_pred.merge(y_test_1, left_index=True, right_index=True)
            # test_output.head()
            accuracy = accuracy_score(test_output)
            print('Accuracy of the xgb model is {}'.format(accuracy))
            if accuracy > best_accuracy_1:
                best_accuracy_1 = accuracy
                best_params_1 = params

Accuracy of the xgb model is 50.354861603974456


Accuracy of the xgb model is 49.53867991483322


Accuracy of the xgb model is 48.89992902767921


Accuracy of the xgb model is 49.43222143364088


Accuracy of the xgb model is 49.14833215046132


Accuracy of the xgb model is 49.680624556423


Accuracy of the xgb model is 48.68701206529453


Accuracy of the xgb model is 49.07735982966643


Accuracy of the xgb model is 49.57416607523066


Accuracy of the xgb model is 49.751596877217885


Accuracy of the xgb model is 49.36124911284599


Accuracy of the xgb model is 50.7097232079489


Accuracy of the xgb model is 50.0


Accuracy of the xgb model is 50.28388928317956


Accuracy of the xgb model is 50.567778566359124


Accuracy of the xgb model is 51.1000709723208


Accuracy of the xgb model is 51.38396025550035


Accuracy of the xgb model is 51.13555713271823


Accuracy of the xgb model is 51.95173882185947


Accuracy of the xgb model is 51.66784953867991


Accuracy of the xgb model is 52.696948190205816


Accuracy of the xgb model is 50.49680624556423


Accuracy of the xgb model is 49.82256919801277


Accuracy of the xgb model is 49.46770759403832


Accuracy of the xgb model is 49.82256919801277


Accuracy of the xgb model is 49.32576295244854


Accuracy of the xgb model is 49.254790631653655


Accuracy of the xgb model is 48.757984386089426


Accuracy of the xgb model is 48.86444286728176


Accuracy of the xgb model is 49.92902767920511


Accuracy of the xgb model is 49.14833215046132


Accuracy of the xgb model is 49.254790631653655


Accuracy of the xgb model is 49.92902767920511


Accuracy of the xgb model is 49.396735273243436


Accuracy of the xgb model is 50.28388928317956


Accuracy of the xgb model is 50.567778566359124


Accuracy of the xgb model is 51.31298793470547


Accuracy of the xgb model is 51.17104329311568


Accuracy of the xgb model is 51.49041873669269


Accuracy of the xgb model is 52.05819730305181


Accuracy of the xgb model is 52.55500354861604


Accuracy of the xgb model is 52.022711142654366


Accuracy of the xgb model is 49.893541518807666


Accuracy of the xgb model is 49.43222143364088


Accuracy of the xgb model is 49.21930447125621


Accuracy of the xgb model is 49.112845990063875


Accuracy of the xgb model is 48.79347054648687


Accuracy of the xgb model is 49.57416607523066


Accuracy of the xgb model is 48.86444286728176


Accuracy of the xgb model is 49.751596877217885


Accuracy of the xgb model is 50.0


Accuracy of the xgb model is 48.616039744499645


Accuracy of the xgb model is 48.65152590489709


Accuracy of the xgb model is 50.53229240596168


Accuracy of the xgb model is 49.96451383960255


Accuracy of the xgb model is 49.893541518807666


Accuracy of the xgb model is 51.63236337828248


Accuracy of the xgb model is 51.916252661462025


Accuracy of the xgb model is 51.59687721788503


Accuracy of the xgb model is 51.77430801987225


Accuracy of the xgb model is 51.20652945351313


Accuracy of the xgb model is 51.845280340667145


Accuracy of the xgb model is 52.235628105039034


In [17]:
print('Best Accuracy using xgb model is {}'.format(best_accuracy_1))
print('Best Parameter using xgb model is {}'.format(best_params_1))

Best Accuracy using xgb model is 52.696948190205816
Best Parameter using xgb model is {'objective': 'multi:softmax', 'tree_method': 'exact', 'max_depth': 20, 'learning_rate': 0.2, 'n_estimators': 200}


In [18]:
best_accuracy_2 = 0
best_params_2 = {}

In [19]:
for method in tree_methods:
    for depth in tree_depth:
        for lr in learning_rates:
            params = {
                "objective": "multi:softmax",
                "tree_method": method,
                "max_depth" : depth,
                "booster": "gbtree",
                "gamma": "0.5",
                "learning_rate" : lr,
                "n_estimators": 200
            }
            xgbModel = xgb.XGBClassifier(**params)
            # xgbModel = GradientBoostingClassifier()
            # xgbModel = GradientBoostingClassifier()
            xgbModel.fit(X_train_scaled_2, y_train_2)
            y_pred = pd.DataFrame(xgbModel.predict(X_test_scaled_2), columns=['Predicted Arrival Status'], index=X_test_scaled_2.index)
            
            test_output = y_pred.merge(y_test_2, left_index=True, right_index=True)
            # test_output.head()
            accuracy = accuracy_score(test_output)
            print('Accuracy of the xgb model is {}'.format(accuracy))
            if accuracy > best_accuracy_2:
                best_accuracy_2 = accuracy
                best_params_2 = params

Accuracy of the xgb model is 50.46132008516678


Accuracy of the xgb model is 49.46770759403832


Accuracy of the xgb model is 48.89992902767921


Accuracy of the xgb model is 50.070972320794894


Accuracy of the xgb model is 49.36124911284599


Accuracy of the xgb model is 49.254790631653655


Accuracy of the xgb model is 49.82256919801277


Accuracy of the xgb model is 49.396735273243436


Accuracy of the xgb model is 50.567778566359124


Accuracy of the xgb model is 49.254790631653655


Accuracy of the xgb model is 49.32576295244854


Accuracy of the xgb model is 51.17104329311568


Accuracy of the xgb model is 49.32576295244854


Accuracy of the xgb model is 49.64513839602555


Accuracy of the xgb model is 51.1000709723208


Accuracy of the xgb model is 50.567778566359124


Accuracy of the xgb model is 50.922640170333565


Accuracy of the xgb model is 50.49680624556423


Accuracy of the xgb model is 50.248403122782115


Accuracy of the xgb model is 50.53229240596168


Accuracy of the xgb model is 50.425833924769336


Accuracy of the xgb model is 49.96451383960255


Accuracy of the xgb model is 49.85805535841022


Accuracy of the xgb model is 49.82256919801277


Accuracy of the xgb model is 49.57416607523066


Accuracy of the xgb model is 49.92902767920511


Accuracy of the xgb model is 49.609652235628104


Accuracy of the xgb model is 49.53867991483322


Accuracy of the xgb model is 49.78708303761533


Accuracy of the xgb model is 49.112845990063875


Accuracy of the xgb model is 49.53867991483322


Accuracy of the xgb model is 49.53867991483322


Accuracy of the xgb model is 49.53867991483322


Accuracy of the xgb model is 49.254790631653655


Accuracy of the xgb model is 49.46770759403832


Accuracy of the xgb model is 49.92902767920511


Accuracy of the xgb model is 50.851667849538686


Accuracy of the xgb model is 50.0


Accuracy of the xgb model is 50.60326472675657


Accuracy of the xgb model is 50.248403122782115


Accuracy of the xgb model is 51.06458481192335


Accuracy of the xgb model is 51.31298793470547


Accuracy of the xgb model is 50.354861603974456


Accuracy of the xgb model is 49.78708303761533


Accuracy of the xgb model is 50.141944641589774


Accuracy of the xgb model is 49.50319375443577


Accuracy of the xgb model is 49.53867991483322


Accuracy of the xgb model is 49.254790631653655


Accuracy of the xgb model is 49.53867991483322


Accuracy of the xgb model is 49.78708303761533


Accuracy of the xgb model is 49.46770759403832


Accuracy of the xgb model is 49.64513839602555


Accuracy of the xgb model is 49.680624556423


Accuracy of the xgb model is 49.96451383960255


Accuracy of the xgb model is 50.03548616039745


Accuracy of the xgb model is 50.0


Accuracy of the xgb model is 49.32576295244854


Accuracy of the xgb model is 50.39034776437189


Accuracy of the xgb model is 49.751596877217885


Accuracy of the xgb model is 51.63236337828248


Accuracy of the xgb model is 50.10645848119234


Accuracy of the xgb model is 50.10645848119234


Accuracy of the xgb model is 51.13555713271823


In [20]:
print('Best Accuracy using xgb model is {}'.format(best_accuracy_2))
print('Best Parameter using xgb model is {}'.format(best_params_2))

Best Accuracy using xgb model is 51.63236337828248
Best Parameter using xgb model is {'objective': 'multi:softmax', 'tree_method': 'approx', 'max_depth': 15, 'booster': 'gbtree', 'gamma': '0.5', 'learning_rate': 0.2, 'n_estimators': 200}


In [21]:
model_1 = xgb.XGBClassifier(**best_params_1)
cols_model_1 = total_Data_scaled_1.columns
model_1.fit(total_Data_scaled_1, total_y_1)


model_2 = xgb.XGBClassifier(**best_params_2)
cols_model_2 = total_Data_scaled_2.columns
model_2.fit(total_Data_scaled_2, total_y_2)
# y_pred = pd.DataFrame(xgbModel.predict(X_test_scaled), columns=['Predicted Arrival Status'], index=X_test.index)

# test_output = y_pred.merge(y_test, left_index=True, right_index=True)
# # test_output.head()
# accuracy = accuracy_score(test_output)
# print('Accuracy of the xgb model is {}'.format(accuracy))
# if accuracy > best_accuracy:
#     best_accuracy = accuracy
#     best_params = params

In [22]:
test_data = pd.read_csv('CIS_662 _FINAL_Predictions.csv', na_values=[''], keep_default_na=False)
# test_data.drop(columns=['DATE','FLIGHT NUMBER', 'ARRIVAL STATUS_Prev_flight_early', 'ARRIVAL STATUS_Prev_flight_ontime', 'ARRIVAL STATUS_Prev_flight_late'],inplace=True)

In [23]:
internal_test_data = pd.DataFrame(test_data.drop(columns=['DAY']), index=test_data.index)
# test_data
internal_test_data['DATE'] = internal_test_data['DATE'].str.replace('/24', '/2024')
internal_test_data['DATE'] = internal_test_data['DATE'].str.replace('4/', '04/')
internal_test_data

Unnamed: 0,DATE,FLIGHT NUMBER,ORIGIN,DEPARTURE TIME,ARRIVAL TIME,ARRIVAL STATUS,ARRIVAL STATUS_Prev_flight_early,ARRIVAL STATUS_Prev_flight_ontime,ARRIVAL STATUS_Prev_flight_late
0,04/19/2024,UA 1400,ORD,6:52 PM,9:47 PM,,,,
1,04/19/2024,AA 3402,ORD,7:59 PM,10:52 PM,,,,
2,04/19/2024,B6 116,JFK,1:34 PM,2:51 PM,,,,
3,04/19/2024,DL 5182,JFK,2:55 PM,4:21 PM,,,,
4,04/19/2024,WN 5285,MCO,11:35 AM,2:20 PM,,,,
5,04/19/2024,B6 656,MCO,1:35 PM,4:25 PM,,,,
6,04/20/2024,UA 1400,ORD,6:52 PM,9:47 PM,,,,
7,04/20/2024,AA 3402,ORD,7:59 PM,10:52 PM,,,,
8,04/20/2024,B6 116,JFK,1:25 PM,2:41 PM,,,,
9,04/20/2024,DL 5182,JFK,2:55 PM,4:21 PM,,,,


In [24]:
def convert_to_24Hr(timeStr):
    if ('AM' not in  timeStr) and ('PM' not in timeStr):
        return timeStr
    time = timeStr.split(sep=' ')
    timeValue = time[0]
    hh_mm = timeValue.split(sep=':')
    hr = hh_mm[0]
    min = hh_mm[1]
    am_pm = time[1]
    if am_pm == 'PM':
        hr_int = int(hr) + 12
        hr = str(hr_int)
    return hr +':'+min

In [25]:
internal_test_data['DEPARTURE TIME'] = [convert_to_24Hr(x) for x in internal_test_data['DEPARTURE TIME']]
internal_test_data['DEPARTURE TIME'] = [to_minutes(x) for x in internal_test_data['DEPARTURE TIME']]
internal_test_data['ARRIVAL TIME'] = [convert_to_24Hr(x) for x in internal_test_data['ARRIVAL TIME']]
internal_test_data['ARRIVAL TIME'] = [to_minutes(x) for x in internal_test_data['ARRIVAL TIME']]

In [26]:
internal_test_data.head()
# internal_test_data = len(test_data)
test_weather_data = pd.read_csv('data/test_weather_data.csv')
test_weather_data
import json

airport_file = open('airport_on_time_dep_pct.json', 'r')
airport_rank_data = json.load(airport_file)
airport_file.close()

airline_file = open('airline_on_time_arr_pct.json', 'r')
airline_rank_data = json.load(airline_file)
airline_file.close()

Unnamed: 0,DATE,FLIGHT NUMBER,ORIGIN,DEPARTURE TIME,ARRIVAL TIME,ARRIVAL STATUS,ARRIVAL STATUS_Prev_flight_early,ARRIVAL STATUS_Prev_flight_ontime,ARRIVAL STATUS_Prev_flight_late
0,04/19/2024,UA 1400,ORD,1132,1307,,,,
1,04/19/2024,AA 3402,ORD,1199,1372,,,,
2,04/19/2024,B6 116,JFK,814,891,,,,
3,04/19/2024,DL 5182,JFK,895,981,,,,
4,04/19/2024,WN 5285,MCO,695,860,,,,


Unnamed: 0,DATE,weather_code,tempmin,precipcover,snowdepth,windspeedmax,visibility,cloudcover
0,01/01/2024,cloudy,22.00,0.00,0.20,11.20,9.10,98.40
1,01/02/2024,cloudy,28.10,0.00,0.00,14.50,9.90,100.00
2,01/03/2024,cloudy,34.10,0.00,0.00,13.50,9.80,100.00
3,01/04/2024,snow,23.00,20.83,0.00,16.00,8.30,95.30
4,01/05/2024,snow,21.80,8.33,0.00,18.10,8.80,100.00
...,...,...,...,...,...,...,...,...
117,04/27/2024,cloudy,35.50,4.17,0.00,13.00,15.00,90.70
118,04/28/2024,cloudy,56.00,4.17,0.00,12.10,11.70,100.00
119,04/29/2024,cloudy,56.20,0.00,0.00,15.20,13.30,100.00
120,04/30/2024,cloudy,49.20,8.33,0.00,8.90,10.50,100.00


In [27]:
def getTestDF(testrow, q):
    print(testrow)
    date = testrow['DATE']
    weather_data = test_weather_data[test_weather_data['DATE'] == date]
    print(weather_data)
    origin = testrow['ORIGIN']
    dep_time = testrow['DEPARTURE TIME']
    arr_time = testrow['ARRIVAL TIME']

    columns=X_test_1.columns

    
    # print(q)

In [28]:
#dynamic training loop
for id in internal_test_data.index:
    # day_of_flight = internal_test_data.loc[id,'DAY']
    # testrow = internal_test_data.loc[id]
    # testdf = getTestDF(testrow, 1)
    # break
    # testrow
    date = internal_test_data.loc[id, 'DATE']
    airline_code = internal_test_data.loc[id, 'FLIGHT NUMBER']
    airline_code = str(airline_code).split(' ')[0]
    # print(date)
    # test_weather_data['DATE']
    weather_data = test_weather_data[test_weather_data['DATE'] == date].copy()
    weather_dict = weather_data.iloc[0].to_dict()
    # wid = weather_data.index
    # print('##')
    # weather_data.to_dict()
    # print('sfdfds')
    # w_code = test_weather_data.loc[wid,'weather_code']
    # print(w_code)
    
    origin = internal_test_data.loc[id,'ORIGIN']
    dep_time = internal_test_data.loc[id,'DEPARTURE TIME']
    arr_time = internal_test_data.loc[id,'ARRIVAL TIME']

    
    # test_row = pd.DataFrame(test_data.loc[i].copy())
    # test_row[]
    # day_of_flight = test_row.loc[0,'DAY']
    final_data_for_this_row = {}

    
    q1 = internal_test_data.loc[id, 'ARRIVAL STATUS']
    # print(status_qwery)
    # 'windspeedmax', 'visibility', 'cloudcover', 'ORIGIN_MCO', 'ORIGIN_ORD',
    # 'weather_code_cloudy', 'weather_code_partly_cloudy_day',
    # 'weather_code_rain', 'weather_code_snow', 'weather_code_wind'],
    if q1 != 'NA':
        columns=X_test_1.columns
        # print(columns)
        data_dict = {}
        for col in columns:
            if col == 'DEPARTURE TIME':
                data_dict['DEPARTURE TIME'] = [dep_time]
            if col == 'ARRIVAL TIME':
                data_dict['ARRIVAL TIME'] = [arr_time]
            if col == 'AIRPORT YEARLY ON-TIME DEP PERCENTAGE':
                data_dict[col] = airport_rank_data['2024'][origin]
            if col == 'AIRLINE YEARLY ON-TIME ARR PERCENTAGE':
                data_dict[col] = airline_rank_data['2024'][airline_code]
            if col == 'tempmin':
                data_dict[col] = weather_data[col]
            if col == 'snowdepth':
                data_dict[col] = weather_data[col]
            if col == 'windspeedmax':
                data_dict[col] = weather_data[col]
            if col == 'visibility':
                data_dict[col] = weather_data[col]
            if col == 'cloudcover':
                data_dict[col] = weather_data[col]
            if col == 'ORIGIN_MCO':
                data_dict[col] =  True if origin == 'MCO' else False
            if col == 'ORIGIN_ORD':
                data_dict[col] =  True if origin == 'ORD' else False
            if col == 'weather_code_cloudy':
                data_dict[col] =  True if weather_dict['weather_code'] == 'cloudy' else False
            if col == 'weather_code_partly_cloudy_day':
                data_dict[col] =  True if weather_dict['weather_code'] == 'partly_cloudy_day' else False
            if col == 'weather_code_rain':
                data_dict[col] =  True if weather_dict['weather_code'] == 'rain' else False
            if col == 'weather_code_snow':
                data_dict[col] =  True if weather_dict['weather_code'] == 'snow' else False
            if col == 'weather_code_wind':
                data_dict[col] =  True if weather_dict['weather_code'] == 'wind' else False  
        # break
        test_df_1 = pd.DataFrame(data_dict) #use this dataframe to get prediction from model -1
        scaled_test_df_1 = scaler_total_1.transform(test_df_1)
        value = model_1.predict(scaled_test_df_1)
        if value == 0:
            # print('EARLY')
            test_data.loc[id,'ARRIVAL STATUS'] = 'EARLY'
        elif value == 1:
            # print('EARLY')
            test_data.loc[id,'ARRIVAL STATUS'] = 'ON-TIME'
        else:
            test_data.loc[id,'ARRIVAL STATUS'] = 'LATE'

        # test_data.loc[id]
    # break
    # continue

    q2 = internal_test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_early']
    if q2 != 'NA':
        columns=X_test_2.columns
        print(columns)
        data_dict = {}
        for col in columns:
            if col == 'DEPARTURE TIME':
                data_dict['DEPARTURE TIME'] = [dep_time]
            if col == 'ARRIVAL TIME':
                data_dict['ARRIVAL TIME'] = [arr_time]
            if col == 'AIRPORT YEARLY ON-TIME DEP PERCENTAGE':
                data_dict[col] = airport_rank_data['2024'][origin]
            if col == 'AIRLINE YEARLY ON-TIME ARR PERCENTAGE':
                data_dict[col] = airline_rank_data['2024'][airline_code]
            if col == 'tempmin':
                data_dict[col] = weather_data[col]
            if col == 'snowdepth':
                data_dict[col] = weather_data[col]
            if col == 'windspeedmax':
                data_dict[col] = weather_data[col]
            if col == 'visibility':
                data_dict[col] = weather_data[col]
            if col == 'cloudcover':
                data_dict[col] = weather_data[col]
            if col == 'previous_flight_status':
                data_dict[col] = [0]
            if col == 'ORIGIN_MCO':
                data_dict[col] =  True if origin == 'MCO' else False
            if col == 'ORIGIN_ORD':
                data_dict[col] =  True if origin == 'ORD' else False
            if col == 'weather_code_cloudy':
                data_dict[col] =  True if weather_dict['weather_code'] == 'cloudy' else False
            if col == 'weather_code_partly_cloudy_day':
                data_dict[col] =  True if weather_dict['weather_code'] == 'partly_cloudy_day' else False
            if col == 'weather_code_rain':
                data_dict[col] =  True if weather_dict['weather_code'] == 'rain' else False
            if col == 'weather_code_snow':
                data_dict[col] =  True if weather_dict['weather_code'] == 'snow' else False
            if col == 'weather_code_wind':
                data_dict[col] =  True if weather_dict['weather_code'] == 'wind' else False  
        # break
        test_df_2 = pd.DataFrame(data_dict) #use this dataframe to get prediction from model -2
        scaled_test_df_2 = scaler_total_2.transform(test_df_2)
        value = model_2.predict(scaled_test_df_2)
        if value == 0:
            # print('EARLY')
            test_data.loc[id,'ARRIVAL STATUS_Prev_flight_early'] = 'EARLY'
        elif value == 1:
            # print('EARLY')
            test_data.loc[id,'ARRIVAL STATUS_Prev_flight_early'] = 'ON-TIME'
        else:
            test_data.loc[id,'ARRIVAL STATUS_Prev_flight_early'] = 'LATE'

    q3 = internal_test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_ontime']
    if q3 != 'NA':
        columns=X_test_2.columns
        print(columns)
        data_dict = {}
        for col in columns:
            if col == 'DEPARTURE TIME':
                data_dict['DEPARTURE TIME'] = [dep_time]
            if col == 'ARRIVAL TIME':
                data_dict['ARRIVAL TIME'] = [arr_time]
            if col == 'AIRPORT YEARLY ON-TIME DEP PERCENTAGE':
                data_dict[col] = airport_rank_data['2024'][origin]
            if col == 'AIRLINE YEARLY ON-TIME ARR PERCENTAGE':
                data_dict[col] = airline_rank_data['2024'][airline_code]
            if col == 'tempmin':
                data_dict[col] = weather_data[col]
            if col == 'snowdepth':
                data_dict[col] = weather_data[col]
            if col == 'windspeedmax':
                data_dict[col] = weather_data[col]
            if col == 'visibility':
                data_dict[col] = weather_data[col]
            if col == 'cloudcover':
                data_dict[col] = weather_data[col]
            if col == 'previous_flight_status':
                data_dict[col] = [1]
            if col == 'ORIGIN_MCO':
                data_dict[col] =  True if origin == 'MCO' else False
            if col == 'ORIGIN_ORD':
                data_dict[col] =  True if origin == 'ORD' else False
            if col == 'weather_code_cloudy':
                data_dict[col] =  True if weather_dict['weather_code'] == 'cloudy' else False
            if col == 'weather_code_partly_cloudy_day':
                data_dict[col] =  True if weather_dict['weather_code'] == 'partly_cloudy_day' else False
            if col == 'weather_code_rain':
                data_dict[col] =  True if weather_dict['weather_code'] == 'rain' else False
            if col == 'weather_code_snow':
                data_dict[col] =  True if weather_dict['weather_code'] == 'snow' else False
            if col == 'weather_code_wind':
                data_dict[col] =  True if weather_dict['weather_code'] == 'wind' else False  
        # break
        test_df_3 = pd.DataFrame(data_dict) #use this dataframe to get prediction from model -2
        scaled_test_df_3 = scaler_total_2.transform(test_df_3)
        value = model_2.predict(scaled_test_df_3)
        if value == 0:
            # print('EARLY')
            test_data.loc[id,'ARRIVAL STATUS_Prev_flight_ontime'] = 'EARLY'
        elif value == 1:
            # print('EARLY')
            test_data.loc[id,'ARRIVAL STATUS_Prev_flight_ontime'] = 'ON-TIME'
        else:
            test_data.loc[id,'ARRIVAL STATUS_Prev_flight_ontime'] = 'LATE'

    q4 = internal_test_data.loc[id, 'ARRIVAL STATUS_Prev_flight_late']
    if q4 != 'NA':
        columns=X_test_2.columns
        print(columns)
        data_dict = {}
        for col in columns:
            if col == 'DEPARTURE TIME':
                data_dict['DEPARTURE TIME'] = [dep_time]
            if col == 'ARRIVAL TIME':
                data_dict['ARRIVAL TIME'] = [arr_time]
            if col == 'AIRPORT YEARLY ON-TIME DEP PERCENTAGE':
                data_dict[col] = airport_rank_data['2024'][origin]
            if col == 'AIRLINE YEARLY ON-TIME ARR PERCENTAGE':
                data_dict[col] = airline_rank_data['2024'][airline_code]
            if col == 'tempmin':
                data_dict[col] = weather_data[col]
            if col == 'snowdepth':
                data_dict[col] = weather_data[col]
            if col == 'windspeedmax':
                data_dict[col] = weather_data[col]
            if col == 'visibility':
                data_dict[col] = weather_data[col]
            if col == 'cloudcover':
                data_dict[col] = weather_data[col]
            if col == 'previous_flight_status':
                data_dict[col] = [2]
            if col == 'ORIGIN_MCO':
                data_dict[col] =  True if origin == 'MCO' else False
            if col == 'ORIGIN_ORD':
                data_dict[col] =  True if origin == 'ORD' else False
            if col == 'weather_code_cloudy':
                data_dict[col] =  True if weather_dict['weather_code'] == 'cloudy' else False
            if col == 'weather_code_partly_cloudy_day':
                data_dict[col] =  True if weather_dict['weather_code'] == 'partly_cloudy_day' else False
            if col == 'weather_code_rain':
                data_dict[col] =  True if weather_dict['weather_code'] == 'rain' else False
            if col == 'weather_code_snow':
                data_dict[col] =  True if weather_dict['weather_code'] == 'snow' else False
            if col == 'weather_code_wind':
                data_dict[col] =  True if weather_dict['weather_code'] == 'wind' else False  
        # break
        test_df_4 = pd.DataFrame(data_dict) #use this dataframe to get prediction from model -2
        scaled_test_df_4 = scaler_total_2.transform(test_df_4)
        value = model_2.predict(scaled_test_df_4)
        if value == 0:
            # print('EARLY')
            test_data.loc[id,'ARRIVAL STATUS_Prev_flight_late'] = 'EARLY'
        elif value == 1:
            # print('EARLY')
            test_data.loc[id,'ARRIVAL STATUS_Prev_flight_late'] = 'ON-TIME'
        else:
            test_data.loc[id,'ARRIVAL STATUS_Prev_flight_late'] = 'LATE'

    # break
test_data
test_data.to_csv('CIS_662 _FINAL_Predictions_Filled.csv', index=False)
# internal_test_data

Index(['DEPARTURE TIME', 'ARRIVAL TIME',
       'AIRPORT YEARLY ON-TIME DEP PERCENTAGE',
       'AIRLINE YEARLY ON-TIME ARR PERCENTAGE', 'tempmin', 'snowdepth',
       'windspeedmax', 'visibility', 'cloudcover', 'ORIGIN_MCO', 'ORIGIN_ORD',
       'weather_code_cloudy', 'weather_code_partly_cloudy_day',
       'weather_code_rain', 'weather_code_snow', 'weather_code_wind'],
      dtype='object')
Index(['DEPARTURE TIME', 'ARRIVAL TIME',
       'AIRPORT YEARLY ON-TIME DEP PERCENTAGE',
       'AIRLINE YEARLY ON-TIME ARR PERCENTAGE', 'tempmin', 'snowdepth',
       'windspeedmax', 'visibility', 'cloudcover', 'previous_flight_status',
       'ORIGIN_MCO', 'ORIGIN_ORD', 'weather_code_cloudy',
       'weather_code_partly_cloudy_day', 'weather_code_rain',
       'weather_code_snow', 'weather_code_wind'],
      dtype='object')
Index(['DEPARTURE TIME', 'ARRIVAL TIME',
       'AIRPORT YEARLY ON-TIME DEP PERCENTAGE',
       'AIRLINE YEARLY ON-TIME ARR PERCENTAGE', 'tempmin', 'snowdepth',
       'wind

Unnamed: 0,DATE,DAY,FLIGHT NUMBER,ORIGIN,DEPARTURE TIME,ARRIVAL TIME,ARRIVAL STATUS,ARRIVAL STATUS_Prev_flight_early,ARRIVAL STATUS_Prev_flight_ontime,ARRIVAL STATUS_Prev_flight_late
0,4/19/24,FRIDAY,UA 1400,ORD,6:52 PM,9:47 PM,ON-TIME,,,
1,4/19/24,FRIDAY,AA 3402,ORD,7:59 PM,10:52 PM,,EARLY,LATE,LATE
2,4/19/24,FRIDAY,B6 116,JFK,1:34 PM,2:51 PM,LATE,,,
3,4/19/24,FRIDAY,DL 5182,JFK,2:55 PM,4:21 PM,,EARLY,ON-TIME,EARLY
4,4/19/24,FRIDAY,WN 5285,MCO,11:35 AM,2:20 PM,EARLY,,,
5,4/19/24,FRIDAY,B6 656,MCO,1:35 PM,4:25 PM,,ON-TIME,EARLY,EARLY
6,4/20/24,SATURDAY,UA 1400,ORD,6:52 PM,9:47 PM,ON-TIME,,,
7,4/20/24,SATURDAY,AA 3402,ORD,7:59 PM,10:52 PM,,LATE,LATE,ON-TIME
8,4/20/24,SATURDAY,B6 116,JFK,1:25 PM,2:41 PM,EARLY,,,
9,4/20/24,SATURDAY,DL 5182,JFK,2:55 PM,4:21 PM,,EARLY,LATE,LATE
