In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [2]:
df = pd.read_csv("flight_data_1000.csv")
df

Unnamed: 0,FL_DATE,DEP_DELAY,ORIGIN_CITY_NAME,DEST_CITY_NAME,CRS_DEP_TIME,DISTANCE
0,2024-08-16,-3,Houston,San Francisco,1167,1161
1,2024-08-08,10,Houston,New York,941,454
2,2024-08-06,59,San Francisco,Houston,829,1204
3,2024-08-12,14,Atlanta,Atlanta,1770,1569
4,2024-08-24,1,Atlanta,Chicago,975,704
...,...,...,...,...,...,...
995,2024-08-17,16,Atlanta,Seattle,1380,2147
996,2024-08-15,55,Miami,Atlanta,1565,506
997,2024-08-12,30,San Francisco,New York,2080,2498
998,2024-08-30,2,Seattle,Houston,1993,464


In [3]:
df.columns

Index(['FL_DATE', 'DEP_DELAY', 'ORIGIN_CITY_NAME', 'DEST_CITY_NAME',
       'CRS_DEP_TIME', 'DISTANCE'],
      dtype='object')

In [4]:
df['FL_DATE'] = pd.to_datetime(df['FL_DATE'])
df.set_index('FL_DATE', inplace=True)

In [5]:
features = df.columns[0:]
df = df[features].dropna()
df

Unnamed: 0_level_0,DEP_DELAY,ORIGIN_CITY_NAME,DEST_CITY_NAME,CRS_DEP_TIME,DISTANCE
FL_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-08-16,-3,Houston,San Francisco,1167,1161
2024-08-08,10,Houston,New York,941,454
2024-08-06,59,San Francisco,Houston,829,1204
2024-08-12,14,Atlanta,Atlanta,1770,1569
2024-08-24,1,Atlanta,Chicago,975,704
...,...,...,...,...,...
2024-08-17,16,Atlanta,Seattle,1380,2147
2024-08-15,55,Miami,Atlanta,1565,506
2024-08-12,30,San Francisco,New York,2080,2498
2024-08-30,2,Seattle,Houston,1993,464


In [6]:
df = pd.get_dummies(df, columns=['ORIGIN_CITY_NAME', 'DEST_CITY_NAME'])
df

Unnamed: 0_level_0,DEP_DELAY,CRS_DEP_TIME,DISTANCE,ORIGIN_CITY_NAME_Atlanta,ORIGIN_CITY_NAME_Chicago,ORIGIN_CITY_NAME_Dallas,ORIGIN_CITY_NAME_Denver,ORIGIN_CITY_NAME_Houston,ORIGIN_CITY_NAME_Los Angeles,ORIGIN_CITY_NAME_Miami,...,DEST_CITY_NAME_Atlanta,DEST_CITY_NAME_Chicago,DEST_CITY_NAME_Dallas,DEST_CITY_NAME_Denver,DEST_CITY_NAME_Houston,DEST_CITY_NAME_Los Angeles,DEST_CITY_NAME_Miami,DEST_CITY_NAME_New York,DEST_CITY_NAME_San Francisco,DEST_CITY_NAME_Seattle
FL_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-08-16,-3,1167,1161,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,True,False
2024-08-08,10,941,454,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
2024-08-06,59,829,1204,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
2024-08-12,14,1770,1569,True,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
2024-08-24,1,975,704,True,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08-17,16,1380,2147,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2024-08-15,55,1565,506,False,False,False,False,False,False,True,...,True,False,False,False,False,False,False,False,False,False
2024-08-12,30,2080,2498,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2024-08-30,2,1993,464,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False


In [7]:
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df)
scaled_data

array([[0.10144928, 0.48558295, 0.39144288, ..., 0.        , 1.        ,
        0.        ],
       [0.28985507, 0.39114083, 0.06964042, ..., 1.        , 0.        ,
        0.        ],
       [1.        , 0.34433765, 0.41101502, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.57971014, 0.86711241, 1.        , ..., 1.        , 0.        ,
        0.        ],
       [0.17391304, 0.83075637, 0.07419208, ..., 0.        , 0.        ,
        0.        ],
       [0.11594203, 0.64437944, 0.90896677, ..., 0.        , 0.        ,
        0.        ]])

In [9]:
def create_sequences(data, seq_length):
    sequences = []
    for i in range(len(data) - seq_length):
        seq = data[i:i+seq_length]
        target = data[i + seq_length][0]
        sequences.append((seq, target))
    return sequences

seq_length = 30
sequences = create_sequences(scaled_data, seq_length)
len(sequences)

970

In [10]:
train_size = int(len(sequences) * 0.8)
train_sequences = sequences[:train_size]
test_sequences = sequences[train_size:]

X_train, y_train = zip(*train_sequences)
X_train, y_train = np.array(X_train), np.array(y_train)

X_test, y_test = zip(*test_sequences)
X_test, y_test = np.array(X_test), np.array(y_test)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(776, 30, 23) (194, 30, 23) (776,) (194,)


In [11]:
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(seq_length, X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(50))
model.add(Dropout(0.2))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mean_squared_error')

  super().__init__(**kwargs)


In [12]:
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - loss: 0.1533 - val_loss: 0.0858
Epoch 2/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 0.0928 - val_loss: 0.0843
Epoch 3/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 0.0901 - val_loss: 0.0869
Epoch 4/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 0.0856 - val_loss: 0.0827
Epoch 5/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 0.0863 - val_loss: 0.0834
Epoch 6/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - loss: 0.0861 - val_loss: 0.0847
Epoch 7/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 0.0917 - val_loss: 0.0855
Epoch 8/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 0.0836 - val_loss: 0.0877
Epoch 9/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x300a97190>

In [13]:
preds = model.predict(X_test)
preds = scaler.inverse_transform(np.concatenate((preds, np.zeros((preds.shape[0], scaled_data.shape[1] - 1))), axis=1))[:0]
from sklearn.preprocessing import MinMaxScaler

ft_data = {
    'DEP_DELAY': 0,
    'ORIGIN_CITY_NAME': 'Houston',
    'DEST_CITY_NAME': "New York",
    'CRS_DEP_TIME': 1230,
    'DISTANCE': 454
}
ft_df = pd.DataFrame([ft_data])
ft_df = pd.get_dummies(ft_df, columns=['ORIGIN_CITY_NAME', 'DEST_CITY_NAME'])
scaler_columns = list(scaler.feature_names_in_)

for col in scaler_columns:
    if col not in ft_df.columns:
        ft_df[col] = 0

ft_df = ft_df[scaler_columns]
scaled_future_flight = scaler.transform(ft_df)
seq_length = 30

future_sequence = np.array([scaled_future_flight] * seq_length)
future_sequence = future_sequence.reshape(1, seq_length, future_sequence.shape[2])
predicted_delay = model.predict(future_sequence)
predicted_delay = scaler.inverse_transform(
    np.concatenate(
        (predicted_delay, np.zeros((predicted_delay.shape[0], scaled_future_flight.shape[1] - 1))),
        axis=1
    )
)[:, 0]
print(f"Predicted delay for the specific future flight: {predicted_delay[0]:.2f} minutes")

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Predicted delay for the specific future flight: 26.52 minutes
