### Import Libraries

In [15]:
# Imports
import numpy as np
import pandas as pd
import plotly.express as px


### Data preview

In [16]:
# Load the raw delay-cause table (path is relative to the notebook's working directory)
flight = pd.read_csv('Airline_Delay_Cause.csv')

# Basic dataset overview
print(flight.shape)         # Rows and columns
flight.info()               # Data types and non-null counts; info() prints its report itself
                            # and returns None, so wrapping it in print() adds a stray "None"
print(flight.describe())    # Summary stats
print(flight.head())        # Preview first 5 rows
flight.columns

(171666, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171666 entries, 0 to 171665
Data columns (total 21 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   year                 171666 non-null  int64  
 1   month                171666 non-null  int64  
 2   carrier              171666 non-null  object 
 3   carrier_name         171666 non-null  object 
 4   airport              171666 non-null  object 
 5   airport_name         171666 non-null  object 
 6   arr_flights          171426 non-null  float64
 7   arr_del15            171223 non-null  float64
 8   carrier_ct           171426 non-null  float64
 9   weather_ct           171426 non-null  float64
 10  nas_ct               171426 non-null  float64
 11  security_ct          171426 non-null  float64
 12  late_aircraft_ct     171426 non-null  float64
 13  arr_cancelled        171426 non-null  float64
 14  arr_diverted         171426 non-null  float64
 15  arr_

Index(['year', 'month', 'carrier', 'carrier_name', 'airport', 'airport_name',
       'arr_flights', 'arr_del15', 'carrier_ct', 'weather_ct', 'nas_ct',
       'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted',
       'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay',
       'security_delay', 'late_aircraft_delay'],
      dtype='object')

### Check for Missing Values and feature understanding

In [17]:
# Print, in this order: per-column dtypes, per-column null counts, and the column index
for report in (flight.dtypes, flight.isnull().sum(), flight.columns):
    print(report)

# 📌 Dataset Column Analysis: Flight Delay Prediction
# The dataset contains monthly aggregated flight arrival statistics for different carriers and airports.
# Each row = unique (carrier, airport, month, year) combination.

# 🔸 Temporal Features:
#   - 'year', 'month': Track seasonal trends and yearly changes.

# 🔸 Airline Info:
#   - 'carrier': IATA code (e.g., 'AA')
#   - 'carrier_name': Full airline name (visuals only)

# 🔸 Airport Info:
#   - 'airport': IATA code (e.g., 'LAX')
#   - 'airport_name': Full airport name (visuals only)

# 🔸 Flight & Delay Counts:
#   - 'arr_flights': Total monthly arriving flights
#   - 'arr_del15': Flights delayed 15+ mins (used to calculate delay rate)

# 🔸 Delay Cause Counts (number of delayed flights attributed to each cause):
#   - 'carrier_ct', 'weather_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct'

# 🔸 Cancellation & Diversion:
#   - 'arr_cancelled': Canceled flights
#   - 'arr_diverted': Diverted flights

# 🔸 Delay Duration (in minutes):
#   - 'arr_delay': Total delay minutes
#   - 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay'

# ✅ Summary:
# This dataset supports EDA, feature engineering (e.g., delay ratios), and predictive modeling.


year                     int64
month                    int64
carrier                 object
carrier_name            object
airport                 object
airport_name            object
arr_flights            float64
arr_del15              float64
carrier_ct             float64
weather_ct             float64
nas_ct                 float64
security_ct            float64
late_aircraft_ct       float64
arr_cancelled          float64
arr_diverted           float64
arr_delay              float64
carrier_delay          float64
weather_delay          float64
nas_delay              float64
security_delay         float64
late_aircraft_delay    float64
dtype: object
year                     0
month                    0
carrier                  0
carrier_name             0
airport                  0
airport_name             0
arr_flights            240
arr_del15              443
carrier_ct             240
weather_ct             240
nas_ct                 240
security_ct            240
late_aircra

### Data Cleaning

In [None]:


# 1. Check for duplicates
print("Duplicate rows:", flight.duplicated().sum())

# 2. Missing values summary (only columns that contain NaNs, largest count first)
missing_summary = flight.isna().sum().sort_values(ascending=False)
missing_summary = missing_summary[missing_summary > 0]
print("\nMissing values per column:")
print(missing_summary)

# 3. Rows with any missing values
print("\nRows with missing values:", flight.isna().any(axis=1).sum())

# 4. Drop missing values
flight.dropna(inplace=True)

# 5. Confirm all missing values are gone
print("\nRemaining missing values:", flight.isna().sum().sum())
print("New shape after dropping missing values:", flight.shape)

# 6. Convert key categorical columns to category type
cat_cols = ['carrier', 'carrier_name', 'airport', 'airport_name']
for col in cat_cols:
    flight[col] = flight[col].astype('category')

# 7. Backup categorical columns (good practice)
flight_categorical = flight[cat_cols].copy()
print("\nSample of categorical backup (first 5 rows):")
print(flight_categorical.head())

# 8. Check for invalid month values
print("\nMonth value distribution:")
print(flight['month'].value_counts().sort_index())

# 9. Confirm arr_flights and arr_del15 should be integers
count_cols = ['arr_flights', 'arr_del15']
print("\narr_flights and arr_del15 stats:")
print(flight[count_cols].describe())

# astype(int) truncates toward zero without complaint, so report how many rows
# would actually lose a fractional part before casting.
fractional_rows = (flight[count_cols] % 1 != 0).any(axis=1).sum()
print("Rows with non-integer flight counts:", fractional_rows)

for col in count_cols:
    flight[col] = flight[col].astype(int)

# 10. Final structure check
print("\nFinal DataFrame info:")
flight.info()  # info() prints its own report and returns None; print() would add a stray "None"

print("Rows with missing values (final):", flight.isna().any(axis=1).sum())


Duplicate rows: 0

Missing values per column:
arr_del15              443
weather_ct             240
carrier_ct             240
arr_flights            240
security_ct            240
weather_delay          240
arr_diverted           240
arr_delay              240
carrier_delay          240
security_delay         240
nas_ct                 240
late_aircraft_ct       240
arr_cancelled          240
late_aircraft_delay    240
nas_delay              240
dtype: int64

Rows with missing values: 443

Remaining missing values: 0
New shape after dropping missing values: (171223, 21)

Sample of categorical backup (first 5 rows):
  carrier       carrier_name airport  \
0      9E  Endeavor Air Inc.     ABE   
1      9E  Endeavor Air Inc.     ABY   
2      9E  Endeavor Air Inc.     AEX   
3      9E  Endeavor Air Inc.     AGS   
4      9E  Endeavor Air Inc.     ALB   

                                        airport_name  
0  Allentown/Bethlehem/Easton, PA: Lehigh Valley ...  
1             Albany, GA:

###  Feature Engineering

In [None]:


# Express delayed / cancelled / diverted counts as a share of arriving flights
for rate_col, count_col in (
    ('delay_ratio', 'arr_del15'),
    ('cancellation_rate', 'arr_cancelled'),
    ('diversion_rate', 'arr_diverted'),
):
    flight[rate_col] = flight[count_col].div(flight['arr_flights'])

# Total delay minutes over the five causes, then each cause's share of that total.
# Records whose total is 0 get NaN shares (0 / 0).
minute_cols = ['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
flight['total_delay'] = flight[minute_cols].sum(axis=1)
for minute_col in minute_cols:
    flight[f'{minute_col}_pct'] = flight[minute_col].div(flight['total_delay'])

# 'YYYY-MM' key (month zero-padded to two digits)
flight['year_month'] = flight['year'].astype(str).str.cat(
    flight['month'].astype(str).str.zfill(2), sep='-'
)

# Seasonal grouping
def get_season(month):
    """Map a month number to a season name.

    12, 1, 2 -> 'Winter'; 3-5 -> 'Spring'; 6-8 -> 'Summer'.
    Every other value (including 9-11) falls through to 'Fall'.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for season, months in season_months:
        if month in months:
            return season
    return 'Fall'

flight['season'] = flight['month'].apply(get_season)

# Total flights per carrier.
# 'carrier' is a categorical column; Series.map on a categorical maps the categories
# and can return a categorical column rather than numbers, so the key is cast to
# object before the lookup to get a plain numeric result.
carrier_flight_counts = flight.groupby('carrier', observed=False)['arr_flights'].sum().to_dict()
flight['carrier_total_flights'] = flight['carrier'].astype(object).map(carrier_flight_counts)

# Airport-level delay rates: delayed arrivals / all arrivals, pooled over each airport's records
airport_totals = flight.groupby('airport', observed=False)[['arr_del15', 'arr_flights']].sum()
airport_delay_ratio = airport_totals['arr_del15'] / airport_totals['arr_flights']
flight['airport_delay_rate'] = flight['airport'].astype(object).map(airport_delay_ratio.to_dict())

# Disruption and delay flags
HIGH_DELAY_THRESHOLD = 0.3  # a record is flagged when its delay_ratio exceeds this value
flight['disrupted'] = ((flight['arr_cancelled'] > 0) | (flight['arr_diverted'] > 0)).astype(int)
flight['high_delay_flag'] = (flight['delay_ratio'] > HIGH_DELAY_THRESHOLD).astype(int)

# Flight-level delay average
flight['mean_delay_per_flight'] = flight['total_delay'] / flight['arr_flights']

# Dominant delay cause.
# idxmax returns the first column when every cause is 0, which would label records
# with no delay minutes at all as 'carrier_delay'; those get 'no_delay' instead.
delay_cols = ['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
has_delay = flight[delay_cols].sum(axis=1) > 0
flight['dominant_delay_cause'] = flight[delay_cols].idxmax(axis=1).where(has_delay, 'no_delay')

# Monthly delay trend (calendar month, pooled across years)
month_totals = flight.groupby('month', observed=False)[['arr_del15', 'arr_flights']].sum()
monthly_delay_rate = month_totals['arr_del15'] / month_totals['arr_flights']
flight['month_delay_rate'] = flight['month'].map(monthly_delay_rate)

# Carrier vs. external delay ratio (1e-6 keeps the denominator non-zero)
flight['carrier_vs_airport_ratio'] = flight['carrier_delay_pct'] / (
    flight['weather_delay_pct'] + flight['nas_delay_pct'] + 1e-6
)

# Seasonal + Airport interaction
flight['season_airport_combo'] = flight['season'].astype(str) + '_' + flight['airport'].astype(str)
combo_totals = flight.groupby('season_airport_combo', observed=False)[['arr_del15', 'arr_flights']].sum()
season_airport_delay = combo_totals['arr_del15'] / combo_totals['arr_flights']
flight['season_airport_delay_rate'] = flight['season_airport_combo'].map(season_airport_delay)

# Save final cleaned and engineered dataset
flight.to_csv('cleaned_airline_delay.csv', index=False)
flight.columns

Index(['year', 'month', 'carrier', 'carrier_name', 'airport', 'airport_name',
       'arr_flights', 'arr_del15', 'carrier_ct', 'weather_ct', 'nas_ct',
       'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted',
       'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay',
       'security_delay', 'late_aircraft_delay', 'delay_ratio',
       'cancellation_rate', 'diversion_rate', 'total_delay',
       'carrier_delay_pct', 'weather_delay_pct', 'nas_delay_pct',
       'security_delay_pct', 'late_aircraft_delay_pct', 'year_month', 'season',
       'carrier_total_flights', 'airport_delay_rate', 'disrupted',
       'high_delay_flag', 'mean_delay_per_flight', 'dominant_delay_cause',
       'month_delay_rate', 'carrier_vs_airport_ratio', 'season_airport_combo',
       'season_airport_delay_rate'],
      dtype='object')

### EDA: Monthly Trends, Peak-Month Delays, and Delay Causes

In [20]:
import plotly.express as px
import pandas as pd

# -----------------------------------------------
# 1. Monthly Flight Volumes and Delays Trends
#    - Shows overall flight counts vs delayed flights (15+ mins)
#    - Foundation for understanding seasonality and trends
# -----------------------------------------------
# Each row of `flight` is one carrier/airport/month record, so the counts are summed
# per year_month first; plotting the raw rows would draw one point per record rather
# than one per month. groupby sorts the 'YYYY-MM' keys, which is chronological order.
monthly_totals = (
    flight.groupby('year_month', as_index=False)[['arr_flights', 'arr_del15']].sum()
)

fig1 = px.line(
    monthly_totals,
    x='year_month',
    y=['arr_flights', 'arr_del15'],
    title='Monthly Trends: Total Flights vs Delayed Flights (15+ mins)',
    labels={'value': 'Count', 'year_month': 'Month'},
    markers=True
)
fig1.update_layout(yaxis_title='Flights / Delays')
#fig1.show()

# -----------------------------------------------
# 1b. Delay Ratio Over Time
#     - Normalizes delays by total flights for proportional insights
# -----------------------------------------------
# One ratio per month: delayed arrivals / all arrivals, pooled over every record in
# that month. The per-record flight['delay_ratio'] column is left untouched here.
monthly_delay_counts = flight.groupby('year_month')[['arr_del15', 'arr_flights']].sum()
monthly_delay_ratio = (
    (monthly_delay_counts['arr_del15'] / monthly_delay_counts['arr_flights'])
    .rename('delay_ratio')
    .reset_index()
)

fig1b = px.line(
    monthly_delay_ratio,
    x='year_month',
    y='delay_ratio',
    title='Delay Ratio Over Time (Delays / Total Flights)',
    labels={'delay_ratio': 'Delay Ratio', 'year_month': 'Month'},
    markers=True
)
#fig1b.show()

# -----------------------------------------------
# 2. Carrier Delays During Peak Travel Months
#    - Identifies carriers with highest average delays during busiest months (Jun, Jul, Aug, Dec)
# -----------------------------------------------
peak_months = [6, 7, 8, 12]
peak_flights = flight[flight['month'].isin(peak_months)]

# 'carrier_name' is categorical and peak_flights is a filtered subset: observed=True
# keeps only carriers that actually have peak-month records instead of emitting a
# NaN entry for every unused category.
carrier_peak_delay = (
    peak_flights.groupby('carrier_name', observed=True)['mean_delay_per_flight']
    .mean()
    .reset_index()
    .sort_values(by='mean_delay_per_flight', ascending=False)
)

fig2 = px.bar(
    carrier_peak_delay,
    x='carrier_name',
    y='mean_delay_per_flight',
    title='Carrier Delays During Peak Travel Months',
    labels={'carrier_name': 'Carrier', 'mean_delay_per_flight': 'Avg Delay per Flight'},
    color='mean_delay_per_flight',
    color_continuous_scale='Reds'
)
fig2.update_xaxes(tickangle=45)
#fig2.show()

# -----------------------------------------------
# 3. Top 10 Airports with Highest Delays During Peak Months
#    - Ranks airports by the mean of their records' mean_delay_per_flight in peak months
# -----------------------------------------------
airport_peak_means = peak_flights.groupby('airport_name')['mean_delay_per_flight'].mean()
airport_peak_delay = (
    airport_peak_means
    .reset_index()
    .sort_values(by='mean_delay_per_flight', ascending=False)
    .head(10)
)

fig3_labels = {'airport_name': 'Airport', 'mean_delay_per_flight': 'Avg Delay per Flight'}
fig3 = px.bar(
    airport_peak_delay,
    x='mean_delay_per_flight',
    y='airport_name',
    orientation='h',
    title='Top 10 Airports with Highest Delays During Peak Months',
    labels=fig3_labels,
    color='mean_delay_per_flight',
    color_continuous_scale='Oranges',
)
# Reverse the y axis so the largest value is drawn at the top
fig3.update_yaxes(autorange="reversed")
#fig3.show()

# -----------------------------------------------
# 4. Proportion of Delay Causes in Peak Travel Months
#    - Share of total peak-month delay minutes attributed to each cause
# -----------------------------------------------
peak_cause_minutes = peak_flights[
    ['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
].sum()
delay_causes = (
    peak_cause_minutes
    .rename_axis('Delay Cause')
    .reset_index(name='Total Delay (mins)')
)

fig4 = px.pie(
    delay_causes,
    names='Delay Cause',
    values='Total Delay (mins)',
    title='Proportion of Delay Causes in Peak Travel Months',
    color_discrete_sequence=px.colors.sequential.RdBu,
)
# Pull every slice out slightly
slice_pull = [0.05] * len(delay_causes)
fig4.update_traces(textinfo='percent+label', pull=slice_pull)
#fig4.show()

# -----------------------------------------------
# 5. Average Delay Percentage by Cause
#    - Mean share of a record's delay minutes attributed to each cause.
#      The *_pct columns are fractions in [0, 1]; records with no delay minutes
#      have NaN shares and are skipped by mean().
# -----------------------------------------------
avg_pct_delay = flight[['carrier_delay_pct', 'weather_delay_pct', 'nas_delay_pct',
                        'security_delay_pct', 'late_aircraft_delay_pct']].mean()

fig5 = px.bar(
    x=avg_pct_delay.index,
    y=avg_pct_delay.values,
    title='Average Delay Percentage by Cause',
    labels={'x': 'Cause', 'y': 'Average Percentage'}
)
# The values are fractions, so format the axis ticks as percentages to match the label
fig5.update_yaxes(tickformat='.0%')
#fig5.show()

# -----------------------------------------------
# 6. Dominant Delay Causes Count
#    - Counts carrier/airport/month records by their main delay cause
# -----------------------------------------------
# Records with zero delay minutes have no dominant cause, so they are left out of
# the tally rather than being counted under whichever label they were given.
delayed_records = flight[flight['total_delay'] > 0]
dominant_counts = delayed_records['dominant_delay_cause'].value_counts().reset_index()
dominant_counts.columns = ['cause', 'count']

fig6 = px.bar(
    dominant_counts,
    x='cause',
    y='count',
    title='Dominant Delay Causes Count',
    # Each row of `flight` is a monthly carrier/airport record, not a single flight
    labels={'cause': 'Dominant Cause', 'count': 'Number of Records'}
)
#fig6.show()

# -----------------------------------------------
# 7. Cancellation Rate Over Time
# 8. Diversion Rate Over Time
#    - Both charts plot the mean of a per-record rate column for each year_month
# -----------------------------------------------
def _monthly_mean_line(rate_col, title):
    """Average `rate_col` per year_month over `flight`; return (frame, line figure)."""
    monthly = flight.groupby('year_month')[rate_col].mean().reset_index()
    figure = px.line(monthly, x='year_month', y=rate_col, title=title, markers=True)
    return monthly, figure


monthly_cancel, fig7 = _monthly_mean_line('cancellation_rate', 'Cancellation Rate Over Time')
#fig7.show()

monthly_divert, fig8 = _monthly_mean_line('diversion_rate', 'Diversion Rate Over Time')
#fig8.show()








### EDA (continued): Seasonal Delay Trends

In [21]:
import plotly.express as px

# --- Seasonal Delay Trends Analysis ---

# Insight 1: Average Delay per Flight by Season
# Baseline view: the mean of the per-record mean_delay_per_flight within each season.
season_mean_delay = flight.groupby('season', as_index=False)['mean_delay_per_flight'].mean()

fig_avg_delay_season = px.bar(
    season_mean_delay,
    x='season',
    y='mean_delay_per_flight',
    color='season',
    title='Average Delay per Flight by Season',
    labels={'season': 'Season', 'mean_delay_per_flight': 'Avg Delay per Flight'},
    color_discrete_sequence=px.colors.qualitative.Set2,
)
#fig_avg_delay_season.show()


# Insight 2: Total Flights per Season
# Sum of arr_flights within each season; gives volume context for the delay figures.
season_flight_totals = flight.groupby('season', as_index=False)['arr_flights'].sum()

fig_total_flights_season = px.bar(
    season_flight_totals,
    x='season',
    y='arr_flights',
    color='season',
    title='Total Flights per Season',
    labels={'season': 'Season', 'arr_flights': 'Total Flights'},
    color_discrete_sequence=px.colors.qualitative.Set3,
)
#fig_total_flights_season.show()


# Insight 3: Carrier Performance by Season (Delay Distribution)
# One box per (season, carrier) over the per-record mean_delay_per_flight values,
# showing spread and outliers for each carrier within a season.
carrier_box_labels = {
    'season': 'Season',
    'mean_delay_per_flight': 'Avg Delay per Flight',
    'carrier_name': 'Carrier',
}
fig_carrier_delay_season = px.box(
    flight,
    x='season',
    y='mean_delay_per_flight',
    color='carrier_name',
    points='all',  # draw every record as a point next to its box
    title='Average Delay per Flight by Season and Carrier',
    labels=carrier_box_labels,
)
fig_carrier_delay_season.update_layout(boxmode='group')
#fig_carrier_delay_season.show()


# Insight 4: Main Delay Causes by Season (Stacked Bar Chart)
# Total delay minutes per cause within each season, stacked so the dominant
# causes in a season are visible at a glance.
delay_cols = ['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
seasonal_delay_causes = flight.groupby('season', as_index=False)[delay_cols].sum()
# Long format: one row per (season, cause)
seasonal_delay_causes_melted = pd.melt(
    seasonal_delay_causes,
    id_vars='season',
    var_name='Delay Cause',
    value_name='Total Delay',
)

fig_delay_causes_season = px.bar(
    seasonal_delay_causes_melted,
    x='season',
    y='Total Delay',
    color='Delay Cause',
    barmode='stack',
    title='Distribution of Delay Causes by Season',
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
#fig_delay_causes_season.show()


# Insight 5: Flight Disruption Rate by Season (Cancellations + Diversions)
# Share of arriving flights that were cancelled or diverted in each season.
# The rate is pooled from the flight counts: averaging the per-record 0/1 `disrupted`
# flag would instead give the share of carrier/airport/month records with at least
# one disruption, which is not a per-flight frequency.
season_totals = flight.groupby('season')[['arr_cancelled', 'arr_diverted', 'arr_flights']].sum()
season_disruption = (
    (season_totals['arr_cancelled'] + season_totals['arr_diverted'])
    .div(season_totals['arr_flights'])
    .rename('disrupted')
    .reset_index()
)

fig_disruption_rate_season = px.bar(
    season_disruption,
    x='season',
    y='disrupted',
    title='Flight Disruption Rate by Season',
    labels={'season': 'Season', 'disrupted': 'Disruption Rate'},
    color='season',
    color_discrete_sequence=px.colors.sequential.Viridis
)
#fig_disruption_rate_season.show()


# Insight 6: Top 5 Airports with Highest Delay Rates per Season
# Average season_airport_delay_rate per (season, airport), then keep the five
# highest-rate airports within each season.
season_airport_rates = (
    flight.groupby(['season', 'airport_name'])['season_airport_delay_rate']
    .mean()
    .reset_index()
)
# Sort once across all seasons; head(5) per group then keeps each season's top five
ranked_rates = season_airport_rates.sort_values('season_airport_delay_rate', ascending=False)
top_season_airport = ranked_rates.groupby('season').head(5)

fig_top_airports_season = px.bar(
    top_season_airport,
    x='airport_name',
    y='season_airport_delay_rate',
    color='season',
    barmode='group',
    title='Top 5 Airports with Highest Seasonal Delay Rates',
    labels={'airport_name': 'Airport', 'season_airport_delay_rate': 'Delay Rate'},
)
#fig_top_airports_season.update_xaxes(tickangle=45)
#fig_top_airports_season.show()






### EDA (continued): Carrier-Focused Delay Analysis

In [22]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# === Group 3: Carrier-Focused Flight Delay Analysis (Reduced) ===

# 1. Average Delay Ratio per Carrier
# Mean of the per-record delay_ratio for each carrier, highest first.
carrier_delay_ratio = (
    flight.groupby('carrier_name')['delay_ratio']
    .mean()
    .reset_index()
    .sort_values(by='delay_ratio', ascending=False)
)

fig_avg_delay_ratio = px.bar(
    carrier_delay_ratio,
    x='carrier_name',
    y='delay_ratio',
    title='Average Delay Ratio per Carrier',
    labels={'carrier_name': 'Carrier', 'delay_ratio': 'Average Delay Ratio'},
)
#fig_avg_delay_ratio.update_xaxes(tickangle=45)
#fig_avg_delay_ratio.show()

# 2. Heatmap: Delay Ratio by Carrier and Flight Volume Bin
# Per carrier: total arriving flights and mean per-record delay ratio.
carrier_stats = (
    flight.groupby('carrier_name')
    .agg(arr_flights=('arr_flights', 'sum'), delay_ratio=('delay_ratio', 'mean'))
    .reset_index()
)

# Bucket carriers by total flight volume (right-closed 10k-wide bins, open-ended top bin)
bins = [0, 10000, 20000, 30000, 40000, 50000, np.inf]
labels = ['0-10k', '10k-20k', '20k-30k', '30k-40k', '40k-50k', '50k+']
carrier_stats['flights_bin'] = pd.cut(carrier_stats['arr_flights'], bins=bins, labels=labels)

# Rows = volume bin, columns = carrier; each carrier falls in exactly one bin
heatmap_data = carrier_stats.pivot(index='flights_bin', columns='carrier_name', values='delay_ratio')

heatmap_axis_labels = dict(x="Carrier", y="Total Flights Bin", color="Mean Delay Ratio")
fig_heatmap = px.imshow(
    heatmap_data,
    x=heatmap_data.columns,
    y=heatmap_data.index,
    labels=heatmap_axis_labels,
    title="Heatmap of Mean Delay Ratio by Carrier and Flight Volume Bin",
)
#fig_heatmap.show()

# 3. Carrier Delay Ratio Across Seasons
# Mean per-record delay_ratio for every (carrier, season) pair.
carrier_season_means = flight.groupby(['carrier_name', 'season'])['delay_ratio'].mean()
carrier_season = carrier_season_means.reset_index()

fig_season = px.bar(
    carrier_season,
    x='season',
    y='delay_ratio',
    color='carrier_name',
    barmode='group',
    title='Carrier Delay Ratio Across Seasons',
    labels={'delay_ratio': 'Average Delay Ratio', 'season': 'Season', 'carrier_name': 'Carrier'},
)
#fig_season.show()

# 4. Dominant Delay Cause Frequency Across Carriers (Histogram)
# For each carrier, find the cause with the largest mean delay minutes, then
# count how many carriers share each dominant cause.
delay_cols = ['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']

carrier_delays = flight.groupby('carrier_name')[delay_cols].mean()
carrier_delays['dominant_cause'] = carrier_delays[delay_cols].idxmax(axis=1)
carrier_dominant = carrier_delays['dominant_cause'].reset_index()

dominant_cause_counts = carrier_dominant['dominant_cause'].value_counts().reset_index()
dominant_cause_counts.columns = ['Delay Cause', 'Number of Carriers']

fig_dominant_cause_hist = px.histogram(
    dominant_cause_counts,
    x='Delay Cause',
    y='Number of Carriers',
    title='Dominant Delay Cause Frequency Across Carriers',
    labels={'Delay Cause': 'Dominant Delay Cause', 'Number of Carriers': 'Number of Carriers'},
)
#fig_dominant_cause_hist.show()

# 5. Delay Causes Breakdown by Top 10 Carriers (Total Delay in minutes)
# 'carrier_name' is categorical: grouping the filtered frame without observed=True
# would return a row for every carrier category (all-zero sums for the ones filtered
# out), so the "Top 10" chart would list every carrier.
top_carriers = flight.groupby('carrier_name', observed=True)['arr_flights'].sum().nlargest(10).index
carrier_delay_causes = (
    flight[flight['carrier_name'].isin(top_carriers)]
    .groupby('carrier_name', observed=True)[delay_cols]
    .sum()
    .reset_index()
)

# Long format: one row per (carrier, cause)
carrier_delay_melted = carrier_delay_causes.melt(
    id_vars='carrier_name',
    value_vars=delay_cols,
    var_name='Delay Cause', value_name='Total Delay'
)

fig_delay_breakdown_top10 = px.bar(
    carrier_delay_melted,
    x='carrier_name',
    y='Total Delay',
    color='Delay Cause',
    title='Delay Causes Breakdown by Top 10 Carriers',
    labels={'carrier_name': 'Carrier', 'Total Delay': 'Total Delay (minutes)'},
    text_auto=True
)
#fig_delay_breakdown_top10.update_xaxes(tickangle=45)
#fig_delay_breakdown_top10.show()

# 6. Average Delay Minutes per Flight by Cause and Carrier (Normalized)
# Keep only the top carriers: if carrier_delay_causes carries zero-sum rows for the
# other carrier categories, they would otherwise plot as "0 minutes per flight".
delay_per_flight = carrier_delay_causes[
    carrier_delay_causes['carrier_name'].isin(top_carriers)
].copy()

# Total arriving flights per carrier, aligned to delay_per_flight's row order
flights_by_carrier = flight.groupby('carrier_name', observed=True)['arr_flights'].sum()
delay_per_flight['total_flights'] = flights_by_carrier.reindex(delay_per_flight['carrier_name']).values

# Total delay minutes per cause -> minutes per arriving flight
for cause in delay_cols:
    delay_per_flight[cause] = delay_per_flight[cause] / delay_per_flight['total_flights']

delay_per_flight_long = delay_per_flight.melt(
    id_vars='carrier_name',
    value_vars=delay_cols,
    var_name='Delay Cause', value_name='Avg Delay per Flight (min)'
)

fig_avg_delay_per_flight = px.bar(
    delay_per_flight_long,
    x='carrier_name',
    y='Avg Delay per Flight (min)',
    color='Delay Cause',
    title='Average Delay Minutes per Flight by Cause and Carrier',
    labels={'carrier_name': 'Carrier', 'Avg Delay per Flight (min)': 'Avg Delay (minutes)', 'Delay Cause': 'Cause of Delay'}
)
#fig_avg_delay_per_flight.update_xaxes(tickangle=45)
#fig_avg_delay_per_flight.show()

# 7. Correlation of Delay Causes vs Disruption Rate per Carrier
# Per carrier: total delay minutes by cause, plus the share of arriving flights that
# were cancelled or diverted. The rate is pooled from the flight counts; averaging the
# per-record 0/1 `disrupted` flag would give the share of records with any disruption,
# not a per-flight rate.
carrier_totals = flight.groupby('carrier_name', observed=True)[
    delay_cols + ['arr_cancelled', 'arr_diverted', 'arr_flights']
].sum()
carrier_summary = carrier_totals[delay_cols].copy()
carrier_summary.insert(
    0,
    'disrupted',
    (carrier_totals['arr_cancelled'] + carrier_totals['arr_diverted']) / carrier_totals['arr_flights'],
)
carrier_summary = carrier_summary.reset_index()

# One scatter trace per cause: x = that cause's total minutes, y = disruption rate
fig_correlation = go.Figure()
for cause in delay_cols:
    fig_correlation.add_trace(go.Scatter(
        x=carrier_summary[cause],
        y=carrier_summary['disrupted'],
        mode='markers',
        name=cause.replace('_', ' ').title(),
        text=carrier_summary['carrier_name']
    ))

# Without this call the figure has no title or axis labels
fig_correlation.update_layout(
    title='Correlation of Delay Causes vs Disruption Rate per Carrier',
    xaxis_title='Total Delay Minutes by Cause',
    yaxis_title='Disruption Rate',
    legend_title='Delay Cause',
    hovermode='closest'
)
#fig_correlation.show()




















### EDA (continued): Airport-Level Delay Analysis

In [23]:
import plotly.express as px
import pandas as pd
import numpy as np

# Filter top 10 airports by total flights
top_airports = flight.groupby('airport_name', observed=True)['arr_flights'].sum().nlargest(10).index

# Make sure the rate column is numeric before it is averaged.
# NOTE(review): presumably needed because the column was built by mapping a
# categorical key and can come back non-numeric — confirm against the feature cell.
flight['airport_delay_rate'] = pd.to_numeric(flight['airport_delay_rate'], errors='coerce')
flight_top_airports = flight[flight['airport_name'].isin(top_airports)].copy()

# Short label = airport code. A name truncated to 10 characters is not a safe key:
# airports in the same city share a prefix, and merging on a non-unique key would
# duplicate rows.
flight_top_airports['airport_short'] = flight_top_airports['airport'].astype(str)

# Compute avg delay per flight by airport_short
airport_delay_avg = (
    flight_top_airports.groupby('airport_short')['mean_delay_per_flight'].mean().reset_index()
)

# Compute avg delay rate for the same top airports, keyed the same way
top_airports_delay_rate = (
    flight_top_airports.groupby('airport_short')['airport_delay_rate'].mean().reset_index()
)

# Merge the two metrics on airport_short (one row per airport on each side)
merged_airport_metrics = pd.merge(
    airport_delay_avg,
    top_airports_delay_rate[['airport_short', 'airport_delay_rate']],
    on='airport_short'
)

# Grouped bar chart: Avg Delay per Flight vs Delay Rate
# NOTE(review): the two metrics share one y axis although one is in minutes and the
# other is a 0-1 rate, so the rate bars will be very small — consider a secondary axis.
fig_merged = px.bar(
    merged_airport_metrics.melt(id_vars='airport_short', var_name='Metric', value_name='Value'),
    x='airport_short', y='Value', color='Metric', barmode='group',
    title='Top Airports: Avg Delay per Flight vs Delay Rate',
    labels={'airport_short': 'Airport', 'Value': 'Minutes / Rate'}
)
fig_merged.update_xaxes(tickangle=45)
fig_merged.show()

# Scatter of per-record delay ratio against arriving flights for the top airports,
# one colour per airport_short label.
scatter_labels = {
    'arr_flights': 'Total Flights',
    'delay_ratio': 'Delay Ratio',
    'airport_short': 'Airport',
}
fig_scatter = px.scatter(
    flight_top_airports,
    x='arr_flights',
    y='delay_ratio',
    color='airport_short',
    labels=scatter_labels,
    title='Delay Ratio vs Total Flights by Top Airports',
)
fig_scatter.show()

# Delay-minute columns, one per cause
delay_causes_cols = ['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']

# Total delay minutes per cause for every airport code
airport_delay_summary = flight.groupby('airport')[delay_causes_cols].sum().reset_index()

# Long format for the heatmap: one row per (airport, cause)
airport_delay_long = pd.melt(
    airport_delay_summary,
    id_vars='airport',
    value_vars=delay_causes_cols,
    var_name='Delay Cause',
    value_name='Total Delay',
)

# Readable cause labels, e.g. 'late_aircraft_delay' -> 'Late Aircraft'
pretty_cause = {
    col: col.replace('_delay', '').replace('_', ' ').title()
    for col in delay_causes_cols
}
airport_delay_long['Delay Cause'] = airport_delay_long['Delay Cause'].map(pretty_cause)

# log(1 + x) compresses the range of the totals so the colour scale is less dominated
# by the largest airports
airport_delay_long['Total Delay Log'] = np.log1p(airport_delay_long['Total Delay'])

# Heatmap of log-transformed delay minutes: causes on x, airport codes on y
fig_heatmap = px.density_heatmap(
    airport_delay_long,
    x='Delay Cause',
    y='airport',
    z='Total Delay Log',
    title='Heatmap of Log-Transformed Delay Causes by Airport Code',
    labels={'airport': 'Airport Code', 'Total Delay Log': 'Log(Total Delay + 1)'},
    color_continuous_scale='Inferno',
)

# Order the airport codes on the y axis by category name, descending
fig_heatmap.update_yaxes(categoryorder='category descending')

fig_heatmap.show()












### EDA (continued): Cancellation and Diversion Rates Over Time

In [24]:
import plotly.express as px
import pandas as pd

# Mean per-record cancellation and diversion rate for each year_month
disruption_rate_cols = ['cancellation_rate', 'diversion_rate']
monthly_disruptions = flight.groupby('year_month', as_index=False)[disruption_rate_cols].mean()

# Long format so both rates can be drawn as separate lines of one figure
monthly_disruptions_long = pd.melt(
    monthly_disruptions,
    id_vars='year_month',
    value_vars=disruption_rate_cols,
    var_name='Disruption Type',
    value_name='Rate',
)

# Combined line plot: one line per disruption type
disrupt_labels = {
    'year_month': 'Year-Month',
    'Rate': 'Rate',
    'Disruption Type': 'Disruption Type',
}
fig_disrupt = px.line(
    monthly_disruptions_long,
    x='year_month',
    y='Rate',
    color='Disruption Type',
    markers=True,
    title='Cancellation and Diversion Rates Over Time',
    labels=disrupt_labels,
)

# Start the y axis at 0 and leave 10% headroom above the largest rate
rate_axis_top = monthly_disruptions_long['Rate'].max() * 1.1
fig_disrupt.update_layout(
    xaxis={'tickangle': 45},
    yaxis={'range': [0, rate_axis_top]},
)

fig_disrupt.show()
