In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sqlalchemy import create_engine

In [2]:
protocol = "postgres"
user = "postgres"
password = "0UyJ3HQUDBTs*1^4FnqX"
location = "localhost"
port = "5432"
database = "flight_delays"

connection_string = f"{protocol}://{user}:{password}@{location}:{port}/{database}"
print(connection_string)

postgres://postgres:0UyJ3HQUDBTs*1^4FnqX@localhost:5432/flight_delays


In [3]:
engine = create_engine(connection_string)

In [4]:
df = pd.read_sql("SELECT * FROM flight_data", engine)

In [5]:
df

Unnamed: 0,flight_id,fl_num,origin,dest,flight_status,weekday
0,0,7829,135,35,0,3
1,1,7829,135,35,0,4
2,2,7829,128,35,0,5
3,3,7829,128,35,0,6
4,6,7828,35,128,0,5
...,...,...,...,...,...,...
86354,328281,2522,4,45,0,0
86355,328282,2522,4,45,0,1
86356,328287,2522,4,45,2,2
86357,328288,2522,4,45,0,3


In [6]:
#oo_airlines_df = pd.read_csv('Resources/CSV/cleaned_oo_data.csv')
#oo_airlines_df.head()

In [24]:
# Define feature and target sets
X = df
X = X.drop("flight_status", axis=1)
y = df["flight_status"].ravel()

In [25]:
X = pd.get_dummies(X[['weekday', 'fl_num']].astype(str))

In [26]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [27]:
X_train

Unnamed: 0,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,fl_num_2522,fl_num_2523,fl_num_2526,...,fl_num_7803,fl_num_7804,fl_num_7811,fl_num_7813,fl_num_7814,fl_num_7815,fl_num_7816,fl_num_7826,fl_num_7828,fl_num_7829
54848,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39013,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
63996,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19624,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17623,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21440,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
73349,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
50057,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5192,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [29]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=64, random_state=1) 

In [30]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [31]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [69]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["on_time 1", "early 1", 'late 2'], columns=["Predicted on_time 1", "Predicted early 1", "Predicted late 2"])

cm_df

Unnamed: 0,Predicted on_time 1,Predicted early 1,Predicted late 2
on_time 1,3223,4717,40
early 1,3355,9683,56
late 2,195,316,5


In [32]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
acc_score

#orign, dest, weekday, fl_num: 0.5987031032885595

0.608105604446503

In [71]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted on_time 1,Predicted early 1,Predicted late 2
on_time 1,3223,4717,40
early 1,3355,9683,56
late 2,195,316,5


Accuracy Score : 0.598008337193145
Classification Report
              precision    recall  f1-score   support

     delayed       0.48      0.40      0.44      7980
       early       0.66      0.74      0.70     13094
     on_time       0.05      0.01      0.02       516

    accuracy                           0.60     21590
   macro avg       0.39      0.38      0.38     21590
weighted avg       0.58      0.60      0.58     21590



In [21]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([7.05441154e-04, 3.71613720e-04, 1.18142641e-03, ...,
       9.83135289e-05, 8.88131930e-06, 2.35217875e-04])

In [73]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.5398665650039247, 'OP_CARRIER_FL_NUM'),
 (0.01730215850406031, 'Weekday_Saturday'),
 (0.015816288404367513, 'Weekday_Monday'),
 (0.01581114379342105, 'Weekday_Sunday'),
 (0.015775522343650823, 'Weekday_Friday'),
 (0.013381649163395319, 'Weekday_Tuesday'),
 (0.01281499028530522, 'Weekday_Wednesday'),
 (0.012783040463743324, 'ORIGIN_LAX'),
 (0.011710190343425598, 'Weekday_Thursday'),
 (0.010153463776248808, 'ORIGIN_SFO'),
 (0.007092123885498744, 'DEST_LAX'),
 (0.006979989189907674, 'DEST_SFO'),
 (0.006579294124920651, 'DEST_SLC'),
 (0.005798113560497009, 'ORIGIN_ORD'),
 (0.005726601151328289, 'ORIGIN_SLC'),
 (0.005077199848930965, 'ORIGIN_ATL'),
 (0.004464547088748743, 'DEST_ORD'),
 (0.004116107456953005, 'ORIGIN_DEN'),
 (0.00374220826278664, 'DEST_SAT'),
 (0.003730329211358723, 'ORIGIN_DFW'),
 (0.0036998124442764488, 'DEST_DEN'),
 (0.003320120219658333, 'ORIGIN_SAT'),
 (0.0031312744310490396, 'DEST_COS'),
 (0.0031125838604122414, 'ORIGIN_ASE'),
 (0.002916026741117206, 'ORIGIN_MSP'),