In [59]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [60]:
oo_airlines_df = pd.read_csv('Resources/CSV/cleaned_oo_data.csv')
oo_airlines_df.head()

Unnamed: 0,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,flight_status,Weekday
0,OO,7829,TWF,SLC,delayed,Thursday
1,OO,7829,TWF,SLC,delayed,Friday
2,OO,7829,SUN,SLC,delayed,Saturday
3,OO,7829,SUN,SLC,delayed,Sunday
4,OO,7828,SLC,SUN,delayed,Saturday


In [61]:
# Define feature and target sets
X = oo_airlines_df
X = X.drop("flight_status", axis=1)
y = oo_airlines_df["flight_status"].ravel()

In [62]:
X = pd.get_dummies(X)

In [63]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [64]:
X_train

Unnamed: 0,OP_CARRIER_FL_NUM,OP_CARRIER_OO,ORIGIN_ABE,ORIGIN_ABQ,ORIGIN_ACV,ORIGIN_ASE,ORIGIN_ATL,ORIGIN_ATW,ORIGIN_AUS,ORIGIN_AVP,...,DEST_VPS,DEST_XNA,DEST_YUM,Weekday_Friday,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday
54848,5467,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
39013,5829,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
63996,4808,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
19624,6408,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
17623,6463,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21440,6356,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
73349,4580,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
50057,5558,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5192,6739,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [65]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [66]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=64, random_state=1) 

In [67]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [68]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [69]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["on_time 1", "early 1", 'late 2'], columns=["Predicted on_time 1", "Predicted early 1", "Predicted late 2"])

cm_df

Unnamed: 0,Predicted on_time 1,Predicted early 1,Predicted late 2
on_time 1,3223,4717,40
early 1,3355,9683,56
late 2,195,316,5


In [70]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [71]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted on_time 1,Predicted early 1,Predicted late 2
on_time 1,3223,4717,40
early 1,3355,9683,56
late 2,195,316,5


Accuracy Score : 0.598008337193145
Classification Report
              precision    recall  f1-score   support

     delayed       0.48      0.40      0.44      7980
       early       0.66      0.74      0.70     13094
     on_time       0.05      0.01      0.02       516

    accuracy                           0.60     21590
   macro avg       0.39      0.38      0.38     21590
weighted avg       0.58      0.60      0.58     21590



In [72]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([5.39866565e-01, 0.00000000e+00, 4.38552896e-04, 2.25892312e-03,
       1.46281297e-03, 3.11258386e-03, 5.07719985e-03, 1.04124512e-03,
       1.63414057e-03, 5.89061491e-04, 1.32640225e-03, 9.30321428e-04,
       9.37181147e-04, 6.86417819e-04, 1.49991503e-03, 2.83657233e-03,
       4.35575267e-04, 1.88140474e-03, 2.10764152e-03, 4.35424682e-04,
       3.91954584e-04, 7.37484800e-04, 8.46819680e-04, 4.57976001e-04,
       1.17692795e-03, 6.64879879e-04, 1.26518051e-03, 7.73892323e-04,
       4.62281075e-04, 2.74670854e-03, 4.82320035e-04, 5.53815304e-04,
       1.56982353e-03, 4.73021377e-04, 4.79301346e-04, 4.11610746e-03,
       3.73032921e-03, 7.65339272e-04, 1.79396867e-03, 1.05117067e-03,
       4.92673526e-04, 6.33415036e-04, 1.62733380e-03, 4.34839026e-04,
       1.36557085e-03, 2.72211334e-03, 8.63997272e-04, 4.37874798e-04,
       1.15238385e-03, 4.92284060e-04, 8.70253124e-04, 1.23245278e-03,
       1.73899974e-03, 9.71855576e-04, 1.00681130e-03, 2.82223185e-04,
      

In [73]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.5398665650039247, 'OP_CARRIER_FL_NUM'),
 (0.01730215850406031, 'Weekday_Saturday'),
 (0.015816288404367513, 'Weekday_Monday'),
 (0.01581114379342105, 'Weekday_Sunday'),
 (0.015775522343650823, 'Weekday_Friday'),
 (0.013381649163395319, 'Weekday_Tuesday'),
 (0.01281499028530522, 'Weekday_Wednesday'),
 (0.012783040463743324, 'ORIGIN_LAX'),
 (0.011710190343425598, 'Weekday_Thursday'),
 (0.010153463776248808, 'ORIGIN_SFO'),
 (0.007092123885498744, 'DEST_LAX'),
 (0.006979989189907674, 'DEST_SFO'),
 (0.006579294124920651, 'DEST_SLC'),
 (0.005798113560497009, 'ORIGIN_ORD'),
 (0.005726601151328289, 'ORIGIN_SLC'),
 (0.005077199848930965, 'ORIGIN_ATL'),
 (0.004464547088748743, 'DEST_ORD'),
 (0.004116107456953005, 'ORIGIN_DEN'),
 (0.00374220826278664, 'DEST_SAT'),
 (0.003730329211358723, 'ORIGIN_DFW'),
 (0.0036998124442764488, 'DEST_DEN'),
 (0.003320120219658333, 'ORIGIN_SAT'),
 (0.0031312744310490396, 'DEST_COS'),
 (0.0031125838604122414, 'ORIGIN_ASE'),
 (0.002916026741117206, 'ORIGIN_MSP'),