# Flight Delay Prediction
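This notebook predicts flight delays using a U.S. airport traffic dataset, falling back to a synthetically generated dataset when the downloaded data has no `delay_minutes` column.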

# Import Libraries and Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
import scipy.stats as stats
from scipy.stats import jarque_bera, normaltest, shapiro
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.stattools import durbin_watson
from statsmodels.regression.linear_model import OLS
from statsmodels.tsa.seasonal import seasonal_decompose
import requests
from io import StringIO

from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer, QuantileTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, VotingRegressor
from sklearn.ensemble import StackingRegressor, BaggingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.inspection import permutation_importance
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Notebook-wide configuration.
# Seed NumPy's global RNG so random draws are repeatable across runs.
np.random.seed(42)
# Silence every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')
# Reset matplotlib to its default style *before* choosing the seaborn palette;
# applying the style afterwards would overwrite the palette's colour cycle.
plt.style.use('default')
sns.set_palette(palette="husl")

# Data Loading and Initial Exploration

In [None]:
def load_or_create_dataset():
    """Load the airport-traffic CSV, falling back to synthetic data.

    The CSV is downloaded with ``pd.read_csv``. The downloaded frame is
    returned only if it contains a ``delay_minutes`` column; otherwise, or
    if the download/parse fails for any reason, the result of
    ``create_synthetic_dataset()`` is returned instead.

    Returns:
        pandas.DataFrame: the downloaded dataset or the synthetic one.
    """
    url = "https://raw.githubusercontent.com/plotly/datasets/master/2011_february_us_airport_traffic.csv"

    # Guard only the download/parse step. Previously the fallback call sat
    # inside the same ``try``, so an error raised while *generating* the
    # synthetic data was reported as a URL failure and the generator was then
    # run a second time.
    try:
        df = pd.read_csv(url)
    except Exception as e:
        # Deliberately broad: network, HTTP and parser errors should all lead
        # to the same best-effort fallback.
        print(f"Failed to load dataset from URL: {e}")
        print("Creating synthetic dataset...")
        return create_synthetic_dataset()

    print(f"Successfully loaded dataset from URL. Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")

    # The rest of the notebook needs a delay target; without one the
    # downloaded data is unusable here.
    if 'delay_minutes' not in df.columns:
        print("Dataset doesn't have delay_minutes column. Creating synthetic dataset...")
        return create_synthetic_dataset()

    return df

def create_synthetic_dataset(n_samples=20000, seed=42):
    """Generate a reproducible synthetic flight-delay dataset.

    Random flights are drawn, rows whose origin equals their destination are
    dropped, and ``delay_minutes`` is computed as a sum of the individual
    delay components, distance / 100, fixed calendar/time-of-day penalties
    and Gaussian noise, clipped at zero.

    Args:
        n_samples: Number of flights drawn before same-airport routes are
            removed, so the returned frame has at most this many rows.
        seed: Value passed to ``np.random.seed``. The *global* NumPy RNG is
            seeded so that the default call yields the same values as the
            original hard-coded implementation.

    Returns:
        pandas.DataFrame with the raw columns (``airline``, ``origin``,
        ``destination``, ``scheduled_departure``, ``distance``,
        ``day_of_week``, ``month``, the five ``*_delay`` components), the
        target ``delay_minutes`` and the derived features ``is_weekend``,
        ``is_holiday_season``, ``departure_hour`` and ``is_peak_hour``.
        The index keeps the gaps left by the dropped rows.
    """
    np.random.seed(seed)

    airlines = ['AA', 'UA', 'DL', 'WN', 'B6', 'AS', 'NK', 'F9', 'G4', 'SY']
    airports = ['JFK', 'LAX', 'ORD', 'DFW', 'DEN', 'SFO', 'SEA', 'LAS', 'PHX', 'IAH',
                'ATL', 'BOS', 'CLT', 'DTW', 'MSP', 'PHL', 'MIA', 'LGA', 'BWI', 'MDW']

    # The order of the draws below determines the generated values; keep it
    # stable to preserve reproducibility for a given seed.
    df = pd.DataFrame({
        'airline': np.random.choice(airlines, n_samples),
        'origin': np.random.choice(airports, n_samples),
        'destination': np.random.choice(airports, n_samples),
        # NOTE(review): uniform integers in [500, 2300) are treated as HHMM,
        # so values with "minutes" >= 60 (e.g. 1975) can occur. Only the hour
        # part is used below; confirm whether valid clock times are needed.
        'scheduled_departure': np.random.randint(500, 2300, n_samples),
        'distance': np.random.gamma(2, 500, n_samples),
        'day_of_week': np.random.randint(1, 8, n_samples),
        'month': np.random.randint(1, 13, n_samples),
        'weather_delay': np.random.exponential(5, n_samples),
        'security_delay': np.random.exponential(2, n_samples),
        'aircraft_delay': np.random.exponential(3, n_samples),
        'late_aircraft_delay': np.random.exponential(4, n_samples),
        'air_traffic_delay': np.random.exponential(3, n_samples),
    })

    # Drop routes that start and end at the same airport. ``.copy()`` makes
    # the result an independent frame, so the column assignments below do not
    # write into a slice of the original (chained-assignment warning).
    df = df[df['origin'] != df['destination']].copy()

    # NOTE(review): the penalties here use days [1, 7] and months [6, 7, 12],
    # while the ``is_weekend`` / ``is_holiday_season`` features further down
    # use [6, 7] and [6, 7, 11, 12]. Left as-is because the intended day
    # numbering is not visible here -- confirm and align.
    base_delay = (
        df['distance'] / 100 +
        df['weather_delay'] +
        df['security_delay'] +
        df['aircraft_delay'] +
        df['late_aircraft_delay'] +
        df['air_traffic_delay'] +
        np.where(df['day_of_week'].isin([1, 7]), 15, 0) +
        np.where(df['month'].isin([6, 7, 12]), 10, 0) +
        np.where(df['scheduled_departure'] < 700, 5, 0) +
        np.where(df['scheduled_departure'] > 2000, 12, 0) +
        np.random.normal(0, 8, len(df))
    )

    # The noise term can push the sum below zero; clip so delays stay >= 0.
    df['delay_minutes'] = np.maximum(0, base_delay)

    df['is_weekend'] = df['day_of_week'].isin([6, 7]).astype(int)
    df['is_holiday_season'] = df['month'].isin([6, 7, 11, 12]).astype(int)
    # Integer-divide the HHMM-style value to get the hour.
    df['departure_hour'] = df['scheduled_departure'] // 100
    df['is_peak_hour'] = df['departure_hour'].isin([7, 8, 17, 18, 19]).astype(int)

    return df

# Build the working DataFrame (downloaded or synthetic) and print a quick
# overview of its size, memory footprint, schema and first rows.
df = load_or_create_dataset()

print(f"Final dataset shape: {df.shape}")
# deep=True also counts the contents of object (string) columns.
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() emitted a stray "None" line after the report.
df.info()
print("\nFirst few rows:")
print(df.head())