# Flight Delay Prediction
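This notebook predicts flight delays. It first tries to load the Plotly US airport traffic dataset (February 2011); because that file has no `delay_minutes` column, the analysis falls back to a synthetic flight dataset generated in the notebook.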

# Import Libraries and Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
import scipy.stats as stats
from scipy.stats import jarque_bera, normaltest, shapiro
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.stattools import durbin_watson
from statsmodels.regression.linear_model import OLS
from statsmodels.tsa.seasonal import seasonal_decompose
import requests
from io import StringIO

from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer, QuantileTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, VotingRegressor
from sklearn.ensemble import StackingRegressor, BaggingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.inspection import permutation_importance
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Global notebook setup.
# Hide every warning category so cell output stays compact; note that this
# also hides pandas / scikit-learn warnings that may point at real problems.
warnings.filterwarnings('ignore')
# Seed NumPy's legacy global RNG so random draws repeat on a fresh kernel.
np.random.seed(42)

# Plot appearance: matplotlib defaults with seaborn's "husl" colour cycle.
plt.style.use('default')
sns.set_palette("husl")

# Data Loading and Initial Exploration

In [2]:
def load_or_create_dataset(
    url="https://raw.githubusercontent.com/plotly/datasets/master/2011_february_us_airport_traffic.csv",
    target_col='delay_minutes',
):
    """Load a CSV dataset from ``url``, falling back to synthetic data.

    The synthetic generator is used when the CSV cannot be read or when it
    has no ``target_col`` column.

    Parameters
    ----------
    url : str
        Location passed to ``pd.read_csv``. Defaults to the Plotly US airport
        traffic CSV used by this notebook.
    target_col : str
        Column that must be present for the loaded frame to be usable.

    Returns
    -------
    pandas.DataFrame
        The loaded frame, or the result of ``create_synthetic_dataset()``.
    """
    # Only the read itself is guarded. The fallback call used to sit inside
    # the ``try`` as well, so an error raised while building the synthetic
    # data was reported as a download failure and the generator ran twice.
    try:
        df = pd.read_csv(url)
    except Exception as e:
        # Deliberately broad: any read/parse/network failure should fall back
        # to synthetic data instead of stopping the notebook.
        print(f"Failed to load dataset from URL: {e}")
        print("Creating synthetic dataset...")
        return create_synthetic_dataset()

    print(f"Successfully loaded dataset from URL. Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")

    if target_col not in df.columns:
        print(f"Dataset doesn't have {target_col} column. Creating synthetic dataset...")
        return create_synthetic_dataset()

    return df

def create_synthetic_dataset(n_samples=20000, seed=42):
    """Generate a synthetic flight table with a ``delay_minutes`` target.

    Parameters
    ----------
    n_samples : int
        Number of rows drawn before rows whose origin equals their
        destination are removed, so the returned frame is usually shorter.
    seed : int or None
        Passed to ``np.random.seed``. This reseeds NumPy's *global* legacy
        RNG, so random draws made after this call are affected too.

    Returns
    -------
    pandas.DataFrame
        Twelve drawn columns, the ``delay_minutes`` target (clipped at 0) and
        four derived flags/features. The original row labels are kept, so the
        index has gaps where rows were filtered out.
    """
    np.random.seed(seed)

    airlines = ['AA', 'UA', 'DL', 'WN', 'B6', 'AS', 'NK', 'F9', 'G4', 'SY']
    airports = ['JFK', 'LAX', 'ORD', 'DFW', 'DEN', 'SFO', 'SEA', 'LAS', 'PHX', 'IAH',
                'ATL', 'BOS', 'CLT', 'DTW', 'MSP', 'PHL', 'MIA', 'LGA', 'BWI', 'MDW']

    # The columns are drawn in this exact order; reordering them would change
    # the generated values for a given seed.
    df = pd.DataFrame({
        'airline': np.random.choice(airlines, n_samples),
        'origin': np.random.choice(airports, n_samples),
        'destination': np.random.choice(airports, n_samples),
        # Uniform integers in [500, 2300): read as HHMM this includes values
        # whose last two digits exceed 59 (e.g. 1275).
        'scheduled_departure': np.random.randint(500, 2300, n_samples),
        'distance': np.random.gamma(2, 500, n_samples),
        'day_of_week': np.random.randint(1, 8, n_samples),
        'month': np.random.randint(1, 13, n_samples),
        'weather_delay': np.random.exponential(5, n_samples),
        'security_delay': np.random.exponential(2, n_samples),
        'aircraft_delay': np.random.exponential(3, n_samples),
        'late_aircraft_delay': np.random.exponential(4, n_samples),
        'air_traffic_delay': np.random.exponential(3, n_samples),
    })

    # Drop same-airport routes. The explicit copy makes the column
    # assignments below write to an independent frame rather than to a slice
    # of the original (chained assignment).
    df = df[df['origin'] != df['destination']].copy()

    # NOTE(review): the delay bumps use days {1, 7} and months {6, 7, 12},
    # while the is_weekend / is_holiday_season flags below use {6, 7} and
    # {6, 7, 11, 12}. Confirm whether that mismatch is intended.
    base_delay = (
        df['distance'] / 100 +
        df['weather_delay'] +
        df['security_delay'] +
        df['aircraft_delay'] +
        df['late_aircraft_delay'] +
        df['air_traffic_delay'] +
        np.where(df['day_of_week'].isin([1, 7]), 15, 0) +
        np.where(df['month'].isin([6, 7, 12]), 10, 0) +
        np.where(df['scheduled_departure'] < 700, 5, 0) +
        np.where(df['scheduled_departure'] > 2000, 12, 0) +
        np.random.normal(0, 8, len(df))
    )

    # The Gaussian noise term can push the sum below zero; clip it at 0.
    df['delay_minutes'] = np.maximum(0, base_delay)

    df['is_weekend'] = df['day_of_week'].isin([6, 7]).astype(int)
    df['is_holiday_season'] = df['month'].isin([6, 7, 11, 12]).astype(int)
    df['departure_hour'] = df['scheduled_departure'] // 100
    df['is_peak_hour'] = df['departure_hour'].isin([7, 8, 17, 18, 19]).astype(int)

    return df

# Load the working dataset (synthetic fallback when the download is unusable).
df = load_or_create_dataset()

print(f"Final dataset shape: {df.shape}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print("\nDataset Info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() added a stray "None" line to the output.
df.info()
print("\nFirst few rows:")
print(df.head())

Successfully loaded dataset from URL. Shape: (221, 8)
Columns: ['iata', 'airport', 'city', 'state', 'country', 'lat', 'long', 'cnt']
Dataset doesn't have delay_minutes column. Creating synthetic dataset...
Final dataset shape: (18971, 17)
Memory usage: 4.98 MB

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 18971 entries, 0 to 19999
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   airline              18971 non-null  object 
 1   origin               18971 non-null  object 
 2   destination          18971 non-null  object 
 3   scheduled_departure  18971 non-null  int64  
 4   distance             18971 non-null  float64
 5   day_of_week          18971 non-null  int64  
 6   month                18971 non-null  int64  
 7   weather_delay        18971 non-null  float64
 8   security_delay       18971 non-null  float64
 9   aircraft_delay       18971 non-null  float64
 10  late_aircraft_d

# Data Quality Assessment

In [3]:
def data_quality_report(df):
    """Print and return a per-column data-quality summary of ``df``.

    Returns a 4-tuple ``(quality_report, duplicates, numerical_cols,
    categorical_cols)``:

    * ``quality_report`` - DataFrame indexed by column name with the columns
      ``Data_Type``, ``Missing_Count``, ``Missing_Percentage`` and
      ``Unique_Values``.
    * ``duplicates`` - number of rows flagged by ``df.duplicated()``.
    * ``numerical_cols`` - columns with a NumPy numeric dtype.
    * ``categorical_cols`` - columns with ``object`` dtype.
    """
    n_rows = len(df)
    null_counts = df.isnull().sum()

    quality_report = pd.DataFrame({
        'Data_Type': df.dtypes,
        'Missing_Count': null_counts,
        'Missing_Percentage': (null_counts / n_rows) * 100,
        'Unique_Values': df.nunique(),
    })

    duplicates = df.duplicated().sum()
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    categorical_cols = df.select_dtypes(include=['object']).columns

    # One print call with a newline separator emits the same text as five
    # consecutive print statements.
    print(
        "=== DATA QUALITY REPORT ===",
        quality_report,
        f"\nDuplicate rows: {duplicates}",
        f"Numerical columns: {list(numerical_cols)}",
        f"Categorical columns: {list(categorical_cols)}",
        sep="\n",
    )

    return quality_report, duplicates, numerical_cols, categorical_cols

# Run the quality report on the working frame and keep each returned piece.
(
    quality_report,
    duplicates,
    numerical_cols,
    categorical_cols,
) = data_quality_report(df)

=== DATA QUALITY REPORT ===
                    Data_Type  Missing_Count  Missing_Percentage  \
airline                object              0                 0.0   
origin                 object              0                 0.0   
destination            object              0                 0.0   
scheduled_departure     int64              0                 0.0   
distance              float64              0                 0.0   
day_of_week             int64              0                 0.0   
month                   int64              0                 0.0   
weather_delay         float64              0                 0.0   
security_delay        float64              0                 0.0   
aircraft_delay        float64              0                 0.0   
late_aircraft_delay   float64              0                 0.0   
air_traffic_delay     float64              0                 0.0   
delay_minutes         float64              0                 0.0   
is_weekend          

# Statistical Analysis and Distribution Testing


In [4]:
def comprehensive_statistical_analysis(df, target_col, sample_size=2000, alpha=0.05, random_state=42):
    """Print and return descriptive statistics and normality tests per numeric column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to analyse; every column with a NumPy numeric dtype is summarised.
    target_col : str
        Must exist in ``df``; otherwise a warning is printed and ``{}`` is
        returned. It is used only for this presence check.
    sample_size : int
        Upper bound on the number of values passed to the Shapiro-Wilk test.
    alpha : float
        Significance level both p-values must exceed for ``is_normal``.
    random_state : int
        Seed for drawing the Shapiro-Wilk subsample.

    Returns
    -------
    dict
        ``{column: {...}}`` with the keys ``count``, ``mean``, ``std``,
        ``median``, ``min``, ``max``, ``skewness``, ``kurtosis``,
        ``shapiro_p``, ``jb_p`` and ``is_normal``. Columns with no non-null
        values are omitted.
    """
    if target_col not in df.columns:
        print(f"Warning: Target column '{target_col}' not found in dataset")
        return {}

    stats_results = {}

    print("=== STATISTICAL ANALYSIS ===")
    for col in df.select_dtypes(include=[np.number]).columns:
        data = df[col].dropna()
        n_obs = len(data)
        if n_obs == 0:
            continue

        # Shapiro-Wilk runs on a capped subsample. It needs at least 3
        # observations, so smaller samples get a NaN p-value instead of an
        # error or a version-dependent warning.
        n_sample = min(sample_size, n_obs)
        if n_sample >= 3:
            sample_data = data.sample(n=n_sample, random_state=random_state)
            _, shapiro_p = shapiro(sample_data)
        else:
            shapiro_p = np.nan

        # Jarque-Bera uses the full column, not the subsample.
        _, jb_p = jarque_bera(data)

        stats_results[col] = {
            'count': n_obs,
            'mean': data.mean(),
            'std': data.std(),
            'median': data.median(),
            'min': data.min(),
            'max': data.max(),
            'skewness': stats.skew(data),
            'kurtosis': stats.kurtosis(data),
            'shapiro_p': shapiro_p,
            'jb_p': jb_p,
            # A NaN p-value compares False, so untestable columns are
            # reported as not normal.
            'is_normal': bool(shapiro_p > alpha and jb_p > alpha)
        }

    stats_df = pd.DataFrame(stats_results).T
    print(stats_df)

    return stats_results

# Summarise every numeric column; 'delay_minutes' must be present in df.
stats_results = comprehensive_statistical_analysis(
    df,
    target_col='delay_minutes',
)

=== STATISTICAL ANALYSIS ===
                     count         mean         std      median       min  \
scheduled_departure  18971  1394.387697  521.161169      1392.0       500   
distance             18971  1000.980953  706.913768  844.623697  3.218927   
day_of_week          18971      3.98271    1.998343         4.0         1   
month                18971     6.507933    3.458814         7.0         1   
weather_delay        18971     5.113063     5.11178    3.543844  0.000079   
security_delay       18971     2.001416    2.001074    1.375136  0.000013   
aircraft_delay       18971     3.019087    3.031577    2.102564  0.000014   
late_aircraft_delay  18971     3.968649    3.951653    2.757849  0.000195   
air_traffic_delay    18971     3.047075    3.040161    2.121032  0.000017   
delay_minutes        18971    36.461347   16.226402   35.315083       0.0   
is_weekend           18971     0.281588    0.449785         0.0         0   
is_holiday_season    18971     0.334721    0.47