In [1]:
# Importing dependencies

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sqlalchemy import create_engine

In [2]:
# Inputing settings for sqlalchemy connection

protocol = "postgres"
user = "postgres"
password = "0UyJ3HQUDBTs*1^4FnqX"
location = "localhost"
port = "5432"
database = "flight_delays"

connection_string = f"{protocol}://{user}:{password}@{location}:{port}/{database}"
print(connection_string)

postgres://postgres:0UyJ3HQUDBTs*1^4FnqX@localhost:5432/flight_delays


In [3]:
# Creating sqlalchemy engine

engine = create_engine(connection_string)

In [4]:
# Downloading flight_data table as dataframe

flight_data_df = pd.read_sql("SELECT * FROM flight_data", engine)

In [5]:
flight_data_df.head()

Unnamed: 0,flight_id,fl_num,origin,dest,flight_status,weekday
0,0,7829,135,35,0,3
1,1,7829,135,35,0,4
2,2,7829,128,35,0,5
3,3,7829,128,35,0,6
4,6,7828,35,128,0,5


In [11]:
# Dropping flight_id, origin, and dest for second run to improve model
flight_data_df.drop(columns=['flight_id', 'origin', 'dest'], axis=1, inplace=True)

In [12]:
flight_data_df.head()

Unnamed: 0,fl_num,flight_status,weekday
0,7829,0,3
1,7829,0,4
2,7829,0,5
3,7829,0,6
4,7828,0,5


In [13]:
# Defining features and targets

X = flight_data_df
X = X.drop("flight_status", axis=1)
y = flight_data_df["flight_status"].ravel()

In [14]:
# Creating dummy variables for features

X = pd.get_dummies(X[['weekday', 'fl_num']].astype(str))

In [15]:
# Splitting into Train and Test sets

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [16]:
# Viewing x_train dataset

X_train

Unnamed: 0,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,fl_num_2522,fl_num_2523,fl_num_2526,...,fl_num_7803,fl_num_7804,fl_num_7811,fl_num_7813,fl_num_7814,fl_num_7815,fl_num_7816,fl_num_7826,fl_num_7828,fl_num_7829
54848,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39013,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
63996,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19624,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17623,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21440,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
73349,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
50057,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5192,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Creating StandardScaler instance

scaler = StandardScaler()

# Fitting Standard Scaler with training data

X_scaler = scaler.fit(X_train)

# Scaling data

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [18]:
# Creating random forest classifier
# n_estimators: 64 random_state:1


rf_model = RandomForestClassifier(n_estimators=64, random_state=1) 

In [19]:
# Fitting model

rf_model = rf_model.fit(X_train_scaled, y_train)

In [20]:
# Making predictions using testing data

predictions = rf_model.predict(X_test_scaled)

In [21]:
# Creating confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Changing confusion matrix to dataframe

cm_df = pd.DataFrame(
    cm, index=["on_time 1", "early 1", 'late 2'], columns=["Predicted on_time 1", "Predicted early 2", "Predicted late 1"])

cm_df

Unnamed: 0,Predicted on_time 1,Predicted early 2,Predicted late 1
on_time 1,3085,20,4875
early 1,168,4,344
late 2,3027,27,10040


In [22]:
# Calculating accuracy score.

acc_score = accuracy_score(y_test, predictions)
acc_score

0.608105604446503

In [23]:
# Displaying results

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted on_time 1,Predicted early 2,Predicted late 1
on_time 1,3085,20,4875
early 1,168,4,344
late 2,3027,27,10040


Accuracy Score : 0.608105604446503
Classification Report
              precision    recall  f1-score   support

           0       0.49      0.39      0.43      7980
           1       0.08      0.01      0.01       516
           2       0.66      0.77      0.71     13094

    accuracy                           0.61     21590
   macro avg       0.41      0.39      0.39     21590
weighted avg       0.58      0.61      0.59     21590



In [24]:
# Calculating feature importance in random forest model

importances = rf_model.feature_importances_
importances

array([7.95229678e-03, 5.39847524e-03, 5.35568075e-03, ...,
       5.62447037e-05, 1.75090631e-05, 2.71374292e-04])

In [25]:
# Sorting features by importance

sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.00840087259836946, 'weekday_6'),
 (0.007952296782686795, 'weekday_0'),
 (0.006039143344152642, 'weekday_5'),
 (0.005443781325206671, 'weekday_4'),
 (0.0053984752385233484, 'weekday_1'),
 (0.005355680745984526, 'weekday_2'),
 (0.004658141798658654, 'weekday_3'),
 (0.0031732934073151527, 'fl_num_7813'),
 (0.00257288002694157, 'fl_num_4377'),
 (0.0024762702645595735, 'fl_num_4481'),
 (0.0022658967661354826, 'fl_num_6075'),
 (0.0021502147417423985, 'fl_num_6320'),
 (0.00206177946730075, 'fl_num_4368'),
 (0.0019763137542277267, 'fl_num_4658'),
 (0.0019549752798110934, 'fl_num_5566'),
 (0.0018937127105526255, 'fl_num_5446'),
 (0.0018371671187323393, 'fl_num_2559'),
 (0.0017864766510733649, 'fl_num_4701'),
 (0.0017650343416896663, 'fl_num_5817'),
 (0.0017373019075526874, 'fl_num_6449'),
 (0.001669827168396594, 'fl_num_4380'),
 (0.001669176410800597, 'fl_num_4899'),
 (0.0016644100683948435, 'fl_num_5792'),
 (0.0016531418346581582, 'fl_num_5448'),
 (0.001649859061331633, 'fl_num_2751'),
 (0