# Shipment Delay Prediction - Exploratory Data Analysis

This notebook contains the exploratory data analysis for the shipment delay prediction project.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Plot styling.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and 3.8 removed the old 'seaborn' name, so use whichever name this
# installation actually provides instead of failing on newer versions.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette('husl')

# Pandas display settings: never truncate columns, print floats with 3 decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

## 1. Data Loading and Initial Inspection

In [None]:
# Read the training workbook into a DataFrame
DATA_FILE = 'AI ML Internship Training Data.xlsx'
df = pd.read_excel(DATA_FILE)

# Report the (rows, columns) shape, then the per-column summary from info()
print("Dataset Shape:", df.shape)
print("\nDataset Info:")
df.info()

# Preview the top of the table; head() is the cell's displayed output
print("\nFirst few rows:")
df.head()

## 2. Data Quality Assessment

In [None]:
# Count nulls per column and report only the columns that have at least one
missing_values = df.isna().sum()
print("Missing Values:")
print(missing_values.loc[missing_values.gt(0)])

# Number of rows that exactly repeat an earlier row
n_duplicate_rows = df.duplicated().sum()
print("\nDuplicate Rows:", n_duplicate_rows)

## 3. Feature Analysis

### 3.1 Categorical Features

In [None]:
# Columns treated as categorical in this analysis
categorical_cols = ['Origin', 'Destination', 'Vehicle Type', 'Weather Conditions', 'Traffic Conditions', 'Delayed']

for col in categorical_cols:
    # One count plot per column, drawn on its own figure
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(data=df, x=col, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()

    # Relative frequency of each category, expressed as a percentage
    category_pct = df[col].value_counts(normalize=True).round(3) * 100
    print(f"\nValue counts for {col}:")
    print(category_pct)

### 3.2 Numerical Features

In [None]:
# Histogram of the distance column (50 bins)
distance_col = 'Distance (km)'
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x=distance_col, bins=50, ax=ax)
ax.set_title('Distribution of Distance')
plt.show()

# Summary statistics from describe() for the same column
print("\nDistance Statistics:")
print(df[distance_col].describe())

## 4. Relationship Analysis

In [None]:
# Box plot of distance, split by the value of 'Delayed'
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Delayed', y='Distance (km)', ax=ax)
ax.set_title('Distance Distribution by Delay Status')
plt.show()

# Side-by-side heatmaps: for each condition column, the row-normalised share
# of every 'Delayed' outcome within each condition value
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
for ax, condition_col in zip(axes, ['Weather Conditions', 'Traffic Conditions']):
    outcome_share = pd.crosstab(df[condition_col], df['Delayed'], normalize='index')
    sns.heatmap(outcome_share, annot=True, fmt='.2%', ax=ax)
    ax.set_title(f'{condition_col} vs Delay')

plt.tight_layout()
plt.show()

## 5. Route Analysis

In [None]:
# Combine origin and destination into one route label per row
df['Route'] = df['Origin'] + ' -> ' + df['Destination']

# Share of each 'Delayed' outcome within every route (each row sums to 1).
# crosstab fills outcomes a route never had with 0, so a route with no 'Yes'
# rows gets a delay rate of 0.0 rather than the NaN that
# groupby(...).value_counts(normalize=True).unstack() would leave behind.
# NOTE(review): assumes 'Delayed' uses the label 'Yes' for delayed rows — confirm.
route_delays = pd.crosstab(df['Route'], df['Delayed'], normalize='index')
route_delays = route_delays.sort_values('Yes', ascending=False)

# Bar chart of the 'Yes' share per route, highest first
plt.figure(figsize=(15, 8))
route_delays['Yes'].plot(kind='bar')
plt.title('Delay Rate by Route')
plt.xlabel('Route')
plt.ylabel('Delay Rate')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

## 6. Key Findings

1. **Data Quality**:
   - No missing values in critical fields
   - No duplicate entries
   - All categorical variables have expected values

2. **Feature Insights**:
   - Traffic and weather conditions appear to be the most influential factors for delays (see the heatmaps in Section 4)
   - Distance shows a moderate association with delay status (see the box plot in Section 4)
   - Some routes have significantly higher delay rates

3. **Recommendations for Model Development**:
   - Use feature engineering for route-specific patterns
   - Create interaction features for weather and traffic
   - Consider bucketing (binning) distance into ranges
   - Implement route frequency encoding