# Shipment Delay Prediction - Exploratory Data Analysis

This notebook performs comprehensive exploratory data analysis on the shipment delay dataset to understand patterns, relationships, and potential features for our prediction model.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# Silence library warnings so the plot output below stays readable.
# NOTE(review): this hides every warning category, including deprecations.
warnings.filterwarnings('ignore')

# Set style for better visualizations.
# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# and newer releases no longer accept the old name, so pick whichever one
# this installation actually ships; fall back to seaborn's own theme.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    sns.set_theme()
sns.set_palette('husl')
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 100

## 1. Data Loading and Initial Inspection

In [None]:
# Read the raw shipment records into the working DataFrame
df = pd.read_csv('../data/shipment_data.csv')

# Report size and schema, then show a sample of rows as the cell output
print(f"Dataset Shape: {df.shape}")
print("\nDataset Info:")
df.info()

print("\nFirst few rows:")
df.head()

## 2. Missing Value Analysis

In [None]:
def analyze_missing_values(df):
    """Tabulate and chart how much of each column is missing.

    Draws a bar chart of the per-column missing percentage, highest first,
    and returns the table the chart is built from.

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame indexed by column name with 'Missing Values' (null count)
        and 'Percentage' (null count / number of rows * 100), sorted by
        'Percentage' in descending order.
    """
    null_counts = df.isnull().sum()
    null_share = (null_counts / len(df)) * 100
    summary = pd.DataFrame({
        'Missing Values': null_counts,
        'Percentage': null_share,
    })
    summary = summary.sort_values('Percentage', ascending=False)

    # One bar per column, labelled with the column name
    bar_positions = range(len(summary))
    plt.figure(figsize=(12, 6))
    plt.bar(bar_positions, summary['Percentage'])
    plt.xticks(bar_positions, summary.index, rotation=45, ha='right')
    plt.title('Missing Values by Feature')
    plt.xlabel('Features')
    plt.ylabel('Missing Percentage')
    plt.tight_layout()
    plt.show()

    return summary

# Build the missing-value table (this also draws the chart) and print it under its heading
missing_analysis = analyze_missing_values(df)
print("\nMissing Value Analysis:", missing_analysis, sep="\n")

## 3. Feature Distribution Analysis

In [None]:
def plot_numerical_distributions(df):
    """Plot a histogram with a KDE overlay for every int64/float64 column.

    The charts are laid out on a grid three columns wide. Grid cells left
    over when the column count is not a multiple of three are hidden rather
    than shown as empty axes.

    Args:
        df: DataFrame whose numerical columns are plotted.

    Returns:
        None. The figure is shown via ``plt.show()``. If ``df`` has no
        int64/float64 columns, a short message is printed and nothing is drawn.
    """
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
    n_cols = len(numerical_cols)
    if n_cols == 0:
        # A zero-row grid is rejected by plt.subplots, so bail out early.
        print("No numerical columns to plot.")
        return

    n_rows = (n_cols + 2) // 3  # ceiling of n_cols / 3

    # squeeze=False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(n_rows, 3, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, numerical_cols):
        sns.histplot(data=df, x=col, kde=True, ax=ax)
        ax.set_title(f'Distribution of {col}')

    # Hide the grid cells that received no plot.
    for ax in axes[n_cols:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

def plot_categorical_distributions(df):
    """Draw one frequency bar chart for each object-dtype column of ``df``.

    Args:
        df: DataFrame whose object-dtype columns are plotted.

    Returns:
        None. Each chart is shown via ``plt.show()``.
    """
    object_columns = df.select_dtypes(include=['object']).columns

    for name in object_columns:
        fig, ax = plt.subplots(figsize=(12, 6))
        frequencies = df[name].value_counts()
        sns.barplot(x=frequencies.index, y=frequencies.values, ax=ax)
        ax.set_title(f'Distribution of {name}')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

# Print each heading, then render the matching group of distribution charts
for heading, plotter in (
    ("Numerical Feature Distributions:", plot_numerical_distributions),
    ("\nCategorical Feature Distributions:", plot_categorical_distributions),
):
    print(heading)
    plotter(df)

## 4. Correlation Analysis

In [None]:
# Pairwise correlations across the int64/float64 columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
correlation_matrix = df[numerical_cols].corr()

# Annotated heatmap with the colour scale centred on zero
plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    center=0,
    annot=True,
).set_title('Correlation Matrix of Numerical Features')
plt.tight_layout()
plt.show()

## 5. Target Variable Analysis

In [None]:
def analyze_target_variable(df, target='Delayed'):
    """Chart the target column and its relationship to every other feature.

    Produces, in order:
      1. a count plot of the target's values,
      2. one box plot per int64/float64 feature, split by target value,
      3. one bar chart per object-dtype feature showing the mean of the
         target within each category (highest first).

    Args:
        df: DataFrame containing the target and the feature columns.
        target: Name of the target column. Defaults to 'Delayed'.

    Returns:
        None. Every chart is shown via ``plt.show()``.

    Raises:
        ValueError: If ``target`` is not a column of ``df``.
    """
    if target not in df.columns:
        raise ValueError(f"Target column '{target}' not found in DataFrame")

    # Target distribution
    plt.figure(figsize=(10, 6))
    sns.countplot(data=df, x=target)
    plt.title('Distribution of Shipment Delays')
    plt.show()

    # Target vs numerical features (the target itself is never a feature)
    numerical_cols = (
        df.select_dtypes(include=['int64', 'float64'])
        .columns.drop(target, errors='ignore')
    )

    for col in numerical_cols:
        plt.figure(figsize=(10, 6))
        sns.boxplot(data=df, x=target, y=col)
        plt.title(f'Delay Status vs {col}')
        plt.show()

    # The per-category rate is a mean of the target, which needs a
    # numeric or boolean column; skip the charts otherwise.
    if not pd.api.types.is_numeric_dtype(df[target]):
        print(f"Skipping delay-rate charts: '{target}' is not numeric.")
        return

    # Target vs categorical features. An object-dtype target is excluded so
    # it is never grouped by itself.
    categorical_cols = (
        df.select_dtypes(include=['object'])
        .columns.drop(target, errors='ignore')
    )

    for col in categorical_cols:
        plt.figure(figsize=(12, 6))
        delay_rates = df.groupby(col)[target].mean().sort_values(ascending=False)
        sns.barplot(x=delay_rates.index, y=delay_rates.values)
        plt.title(f'Delay Rate by {col}')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

# Render the target-variable charts for the loaded shipment DataFrame
analyze_target_variable(df)

## 6. Outlier Detection

In [None]:
def detect_outliers(df, iqr_multiplier=1.5):
    """Report IQR-based outliers for every int64/float64 column.

    For each column, a value is an outlier when it lies below
    ``Q1 - iqr_multiplier * IQR`` or above ``Q3 + iqr_multiplier * IQR``.
    The count and percentage are printed and a box plot is shown.

    Args:
        df: DataFrame to scan.
        iqr_multiplier: Width of the fences in IQR units. Defaults to 1.5.

    Returns:
        None. Results are printed and plotted. The percentage is relative
        to the total row count of ``df``, so rows where the column is
        missing still count in the denominator.
    """
    total_rows = len(df)
    if total_rows == 0:
        # The percentage below would divide by zero on an empty frame.
        print("No rows to analyze for outliers.")
        return

    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

    for col in numerical_cols:
        values = df[col]

        # Calculate IQR
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1

        # Define outlier bounds
        lower_bound = q1 - iqr_multiplier * iqr
        upper_bound = q3 + iqr_multiplier * iqr

        # Count outliers (NaN compares False on both sides, so it is not counted)
        outliers = values[(values < lower_bound) | (values > upper_bound)]

        print(f"\nOutliers in {col}:")
        print(f"Number of outliers: {len(outliers)}")
        print(f"Percentage of outliers: {(len(outliers)/total_rows)*100:.2f}%")

        # Box plot
        plt.figure(figsize=(10, 6))
        sns.boxplot(x=values)
        plt.title(f'Box Plot of {col}')
        plt.show()

# Print outlier statistics and show a box plot for each numerical column
detect_outliers(df)

## 7. Feature Engineering Insights

In [None]:
# Analyze temporal patterns.
# NOTE(review): the parsed dates come from the 'Shipment Date' column (with a
# space) and are stored in a new 'Shipment_Date' column - confirm the source
# column name against the CSV header.
df['Shipment_Date'] = pd.to_datetime(df['Shipment Date'])
df['day_of_week'] = df['Shipment_Date'].dt.day_name()
df['month'] = df['Shipment_Date'].dt.month_name()

# Plot delay patterns by day of week (explicit order keeps Monday..Sunday)
plt.figure(figsize=(12, 6))
sns.barplot(
    data=df,
    x='day_of_week',
    y='Delayed',
    order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
)
plt.title('Delay Rate by Day of Week')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Plot delay patterns by month. Without an explicit order the bars follow
# the order in which months first appear in the data, not the calendar.
plt.figure(figsize=(12, 6))
sns.barplot(
    data=df,
    x='month',
    y='Delayed',
    order=['January', 'February', 'March', 'April', 'May', 'June',
           'July', 'August', 'September', 'October', 'November', 'December'],
)
plt.title('Delay Rate by Month')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 8. Key Findings and Recommendations

The key findings from the exploratory data analysis are summarized below. **TODO:** replace each placeholder with the concrete findings from the sections above.

1. **Data Quality**:
   - [To be filled after analysis]

2. **Feature Importance**:
   - [To be filled after analysis]

3. **Patterns and Relationships**:
   - [To be filled after analysis]

4. **Recommendations for Model Development**:
   - [To be filled after analysis]

5. **Feature Engineering Suggestions**:
   - [To be filled after analysis]