In [1]:
import pandas as pd

In [2]:
# Read the combined flight-delay dataset from disk
data_full = pd.read_csv(filepath_or_buffer='../data/full_data_flightdelay.csv')

# Show the last five rows of the dataframe as a quick check on the load
data_full.tail(n=5)



Unnamed: 0,MONTH,DAY_OF_WEEK,DEP_DEL15,DEP_TIME_BLK,DISTANCE_GROUP,SEGMENT_NUMBER,CONCURRENT_FLIGHTS,NUMBER_OF_SEATS,CARRIER_NAME,AIRPORT_FLIGHTS_MONTH,...,PLANE_AGE,DEPARTING_AIRPORT,LATITUDE,LONGITUDE,PREVIOUS_AIRPORT,PRCP,SNOW,SNWD,TMAX,AWND
6489057,12,7,0,2300-2359,1,11,3,123,Hawaiian Airlines Inc.,1318,...,18,Lihue Airport,21.979,-159.346,Honolulu International,0.06,0.0,0.0,84.0,15.21
6489058,12,7,0,1800-1859,1,11,2,123,Hawaiian Airlines Inc.,1318,...,16,Lihue Airport,21.979,-159.346,Honolulu International,0.06,0.0,0.0,84.0,15.21
6489059,12,7,0,2000-2059,1,11,2,123,Hawaiian Airlines Inc.,1318,...,18,Lihue Airport,21.979,-159.346,Honolulu International,0.06,0.0,0.0,84.0,15.21
6489060,12,7,0,2100-2159,1,12,3,123,Hawaiian Airlines Inc.,1318,...,18,Lihue Airport,21.979,-159.346,Honolulu International,0.06,0.0,0.0,84.0,15.21
6489061,12,7,1,2100-2159,1,12,3,123,Hawaiian Airlines Inc.,1318,...,15,Lihue Airport,21.979,-159.346,Honolulu International,0.06,0.0,0.0,84.0,15.21


In [3]:
# Preprocess `data_full` (loaded in an earlier cell): inspect it, handle
# missing values, then one-hot encode the categorical columns.

# Display the first few rows of the dataframe before any cleaning
print("Initial Data Full:")
print(data_full.head())

# Count missing values per column
print("Missing Values in Each Column:")
print(data_full.isnull().sum())

# Drop rows with missing target variable 'DEP_DEL15'
data_full = data_full.dropna(subset=['DEP_DEL15'])

# Forward-fill any remaining gaps from the preceding row.
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is
# deprecated and emits a FutureWarning on every run.
# NOTE(review): forward-filling copies values from whatever row happens to come
# before — confirm this is acceptable for these columns.
data_full = data_full.ffill()

# Columns holding category labels rather than numeric values
categorical_features = ['DEP_TIME_BLK', 'CARRIER_NAME', 'DEPARTING_AIRPORT', 'PREVIOUS_AIRPORT']

# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column so the dummy columns are not perfectly collinear
data_encoded = pd.get_dummies(data_full, columns=categorical_features, drop_first=True)

# Display the first few rows of the encoded dataframe
print("Encoded Data Full:")
print(data_encoded.head())

Initial Data Full:
   MONTH  DAY_OF_WEEK  DEP_DEL15 DEP_TIME_BLK  DISTANCE_GROUP  SEGMENT_NUMBER  \
0      1            7          0    0800-0859               2               1   
1      1            7          0    0700-0759               7               1   
2      1            7          0    0600-0659               7               1   
3      1            7          0    0600-0659               9               1   
4      1            7          0    0001-0559               7               1   

   CONCURRENT_FLIGHTS  NUMBER_OF_SEATS            CARRIER_NAME  \
0                  25              143  Southwest Airlines Co.   
1                  29              191    Delta Air Lines Inc.   
2                  27              199    Delta Air Lines Inc.   
3                  27              180    Delta Air Lines Inc.   
4                  10              182        Spirit Air Lines   

   AIRPORT_FLIGHTS_MONTH  ...  PLANE_AGE       DEPARTING_AIRPORT  LATITUDE  \
0                  

  data_full = data_full.fillna(method='ffill')


Encoded Data Full:
   MONTH  DAY_OF_WEEK  DEP_DEL15  DISTANCE_GROUP  SEGMENT_NUMBER  \
0      1            7          0               2               1   
1      1            7          0               7               1   
2      1            7          0               7               1   
3      1            7          0               9               1   
4      1            7          0               7               1   

   CONCURRENT_FLIGHTS  NUMBER_OF_SEATS  AIRPORT_FLIGHTS_MONTH  \
0                  25              143                  13056   
1                  29              191                  13056   
2                  27              199                  13056   
3                  27              180                  13056   
4                  10              182                  13056   

   AIRLINE_FLIGHTS_MONTH  AIRLINE_AIRPORT_FLIGHTS_MONTH  ...  \
0                 107363                           5873  ...   
1                  73508                           11

In [None]:
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# Rank the encoded features by Random Forest importance for predicting
# 'DEP_DEL15', then plot and print the highest-ranked ones.

# How many of the highest-ranked features to draw in the bar chart.
# data_encoded contains one dummy column per category level, so plotting every
# column would produce an unreadable chart.
n_features_to_plot = 20

# Separate features (X) and target (y)
X = data_encoded.drop(columns='DEP_DEL15')
y = data_encoded['DEP_DEL15']

# Train a Random Forest model; n_jobs=-1 builds the trees on all available
# cores, and random_state keeps the fit reproducible
model = RandomForestClassifier(random_state=42, n_jobs=-1)
model.fit(X, y)

# Get feature importance (one value per column of X, in column order)
importance = model.feature_importances_
feature_names = X.columns

# Pair each feature name with its importance and sort, most important first
feature_importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': importance})
    .sort_values(by='importance', ascending=False)
)

# Plot only the top-ranked features so the labels stay legible
top_features = feature_importance_df.head(n_features_to_plot)
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features['feature'], top_features['importance'])
ax.set_xlabel('Feature Importance')
ax.set_title('Feature Importance from Random Forest')
ax.invert_yaxis()  # put the most important feature at the top
plt.show()

# Display the top features
print("Top 10 Features by Importance:")
print(feature_importance_df.head(10))



In [10]:
# Number of rows in data_full for each PLANE_AGE value, ordered from the
# largest age down to the smallest
plane_age_counts = (
    data_full['PLANE_AGE']
    .value_counts()
    .sort_index(ascending=False)
)
plane_age_counts

PLANE_AGE
32      3322
31      4689
30       537
29     24351
28     37416
27     61716
26     25044
25     26378
24     32188
23     45374
22     49630
21    190667
20    277246
19    333757
18    433692
17    289374
16    266985
15    375176
14    325330
13    283633
12    266248
11    487576
10    161788
9     125296
8     133807
7     157795
6     225068
5     369048
4     337687
3     395306
2     334709
1     288081
0     120148
Name: count, dtype: int64

In [11]:
# Load the aircraft inventory and extract the aircraft whose manufacture year
# is at or before a cutoff, then save that subset to CSV.
data_aircraft = pd.read_csv('raw_data/B43_AIRCRAFT_INVENTORY.csv', encoding='ISO-8859-1')

# Latest manufacture year retained by the filter below.
# NOTE(review): the original rationale was "the oldest airplane was 32 as of
# 2019 based on the full dataset", but 2019 - 32 = 1987, not 1967. It is also
# unclear whether these old aircraft are meant to be kept (as the code does)
# or removed. Confirm the intended cutoff and direction before relying on this.
max_manufacture_year = 1967

# Convert the MANUFACTURE_YEAR column to numeric, if it's not already;
# values that cannot be parsed become NaN
data_aircraft['MANUFACTURE_YEAR'] = pd.to_numeric(data_aircraft['MANUFACTURE_YEAR'], errors='coerce')

# Keep only rows manufactured in or before the cutoff year. Rows whose year is
# NaN fail the comparison and are therefore excluded as well.
filtered_data_aircraft = data_aircraft[data_aircraft['MANUFACTURE_YEAR'] <= max_manufacture_year]

# Display the first few rows of the filtered dataframe
print(f"\nFiltered Data (Manufacture Year <= {max_manufacture_year}):")
print(filtered_data_aircraft.head())

# Save the filtered dataframe to a new CSV file in the working directory
filtered_data_aircraft.to_csv('filtered_B43_AIRCRAFT_INVENTORY.csv', index=False)


Filtered Data (Manufacture Year <= 1967):
   MANUFACTURE_YEAR TAIL_NUM  NUMBER_OF_SEATS
0              1944   N54514              0.0
1              1945   N1651M              0.0
2              1953   N100CE              0.0
3              1953   N141FL              0.0
4              1953   N151FL              0.0


In [16]:
# Read the 2019 T3 air-carrier airport-activity summary and display it
data_T3_2019 = pd.read_csv(
    filepath_or_buffer='raw_data/T3_AIR_CARRIER_SUMMARY_AIRPORT_ACTIVITY_2019.csv',
)
data_T3_2019

Unnamed: 0,OP_UNIQUE_CARRIER,CARRIER_NAME,ORIGIN_AIRPORT_ID,SERVICE_CLASS,REV_ACRFT_DEP_PERF_510,REV_PAX_ENP_110
0,04Q,Tradewind Aviation,15024,K,10.0,39.0
1,04Q,Tradewind Aviation,14843,K,677.0,3649.0
2,04Q,Tradewind Aviation,10257,V,4.0,6.0
3,04Q,Tradewind Aviation,15323,V,1.0,3.0
4,04Q,Tradewind Aviation,10158,V,1.0,2.0
...,...,...,...,...,...,...
27247,ZW,Air Wisconsin Airlines Corp,11637,K,122.0,4535.0
27248,ZW,Air Wisconsin Airlines Corp,11721,K,143.0,5800.0
27249,ZW,Air Wisconsin Airlines Corp,10469,K,248.0,8901.0
27250,ZW,Air Wisconsin Airlines Corp,12884,K,187.0,7923.0


In [15]:
# Read the 2020 T3 air-carrier airport-activity summary and display it
data_T3_2020 = pd.read_csv(
    filepath_or_buffer='raw_data/T3_AIR_CARRIER_SUMMARY_AIRPORT_ACTIVITY_2020.csv',
)
data_T3_2020

Unnamed: 0,OP_UNIQUE_CARRIER,CARRIER_NAME,ORIGIN_AIRPORT_ID,SERVICE_CLASS,REV_ACRFT_DEP_PERF_510,REV_PAX_ENP_110
0,04Q,Tradewind Aviation,13535,K,20,105.0
1,04Q,Tradewind Aviation,15024,K,9,38.0
2,04Q,Tradewind Aviation,13987,K,1,2.0
3,04Q,Tradewind Aviation,14843,K,626,3553.0
4,04Q,Tradewind Aviation,12197,K,21,109.0
...,...,...,...,...,...,...
6253,ZW,Air Wisconsin Airlines Corp,11721,K,119,4463.0
6254,ZW,Air Wisconsin Airlines Corp,10469,K,160,5095.0
6255,ZW,Air Wisconsin Airlines Corp,12884,K,159,5165.0
6256,ZW,Air Wisconsin Airlines Corp,15380,K,118,4011.0
