# Exploratory Data Analysis - Final Traffic Accident Dataset

This notebook performs comprehensive exploratory data analysis on the final traffic accident dataset (`data/final/data.csv`).

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides pandas/matplotlib deprecation
# and dtype warnings - consider narrowing it once the notebook is stable.
warnings.simplefilter('ignore')

# Plot defaults shared by every figure below: seaborn grid theme and a
# 12x6-inch default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")

## 1. Load and Inspect Data

In [None]:
# Read the final accident dataset; the relative path is resolved against the
# kernel's working directory.
df = pd.read_csv('../../data/final/data.csv')

# Report the frame's dimensions (rows, columns).
n_rows, n_cols = df.shape
print(f"Dataset shape: {df.shape}")
print(f"Number of rows: {n_rows}")
print(f"Number of columns: {n_cols}")

In [None]:
# Preview the first 10 rows to sanity-check the load and eyeball the column contents
df.head(10)

In [None]:
# Per-column dtype and non-null count, plus the frame's overall memory usage
df.info()

In [None]:
# List every column next to its 1-based position so the full schema is
# readable even when the frame is too wide for head()/info() to show nicely.
print("Column names:")
for position, column_name in enumerate(df.columns, start=1):
    print(f"{position:3d}. {column_name}")

## 2. Missing Values Analysis

In [None]:
# Count nulls per column and express each count as a percentage of all rows.
missing_values = df.isna().sum()
missing_percent = (missing_values / len(df) * 100).round(2)

# One row per column, restricted to columns that actually have gaps and
# ordered from most to fewest missing entries.
missing_df = (
    pd.DataFrame({
        'Column': missing_values.index,
        'Missing_Count': missing_values.values,
        'Missing_Percent': missing_percent.values
    })
    .loc[lambda frame: frame['Missing_Count'] > 0]
    .sort_values('Missing_Count', ascending=False)
)

print(f"\nColumns with missing values: {len(missing_df)}")
print("\nTop 20 columns with most missing values:")
missing_df.head(20)

In [None]:
# Visualize missing values: horizontal bars for the (up to) 20 columns with
# the most missing entries, taken from `missing_df`.
if len(missing_df) > 0:
    top_missing = missing_df.head(20)
    bar_positions = range(len(top_missing))

    # The figure is created only when there is something to draw; opening it
    # before this check would leave an empty canvas behind whenever the
    # dataset has no missing values.
    fig, ax = plt.subplots(figsize=(14, 8))
    ax.barh(bar_positions, top_missing['Missing_Percent'])
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_missing['Column'])
    ax.set_xlabel('Missing Percentage (%)')
    ax.set_title('Top 20 Columns with Missing Values')
    ax.invert_yaxis()  # first row of top_missing is drawn at the top
    fig.tight_layout()
    plt.show()
else:
    print("No missing values found in the dataset!")

## 3. Basic Statistical Summary

In [None]:
# Statistical summary of numerical columns: with default arguments describe()
# reports count, mean, std, min, quartiles and max for the numeric columns only
df.describe()

In [None]:
# Statistical summary of categorical columns: for object-dtype columns
# describe() reports count, number of unique values, the most frequent value
# and that value's frequency
df.describe(include='object')

## 4. Accident Severity Analysis

In [None]:
# Tally accidents per severity class. value_counts() skips missing values, so
# the total printed below counts only rows with a recorded severity.
severity_counts = df['accident_severity'].value_counts()
print("Accident Severity Distribution:", severity_counts, sep="\n")
print(f"\nTotal accidents: {severity_counts.sum()}")

In [None]:
# Show the severity split twice, side by side: absolute counts and shares.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
bar_ax, pie_ax = axes

# Left panel: number of accidents per severity class
severity_counts.plot.bar(ax=bar_ax, color='steelblue')
bar_ax.set(
    title='Accident Severity Distribution (Bar Chart)',
    xlabel='Severity',
    ylabel='Count',
)
bar_ax.tick_params(axis='x', rotation=45)

# Right panel: share of each severity class
pie_ax.pie(
    severity_counts.values,
    labels=severity_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
pie_ax.set_title('Accident Severity Distribution (Pie Chart)')

fig.tight_layout()
plt.show()

## 5. Temporal Analysis

In [None]:
# Parse the timestamp column once, store the parsed values back, and derive
# the calendar parts used by the temporal plots from the parsed series.
accident_ts = pd.to_datetime(df['accident_datetime'])
df['accident_datetime'] = accident_ts

df['accident_year'] = accident_ts.dt.year
df['accident_month'] = accident_ts.dt.month
df['accident_day_of_week'] = accident_ts.dt.dayofweek  # pandas convention: Monday=0 ... Sunday=6
df['accident_hour'] = accident_ts.dt.hour

print("Temporal features extracted successfully!")

In [None]:
# Yearly accident totals in chronological order.
accidents_by_year = df['accident_year'].value_counts().sort_index()
print("Accidents by Year:", accidents_by_year, sep="\n")

fig, ax = plt.subplots(figsize=(12, 5))
accidents_by_year.plot.bar(ax=ax, color='coral')
ax.set(
    title='Number of Accidents by Year',
    xlabel='Year',
    ylabel='Number of Accidents',
)
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Monthly accident totals. Every month from 1 to 12 gets a bar; a month that
# never occurs in the data is drawn with height 0.
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
accidents_by_month = df['accident_month'].value_counts().sort_index()

month_numbers = range(1, 13)
monthly_totals = [accidents_by_month.get(month, 0) for month in month_numbers]

fig, ax = plt.subplots(figsize=(12, 5))
ax.bar(month_numbers, monthly_totals, color='skyblue')
ax.set(
    title='Number of Accidents by Month',
    xlabel='Month',
    ylabel='Number of Accidents',
)
ax.set_xticks(month_numbers)
ax.set_xticklabels(month_names)
fig.tight_layout()
plt.show()

In [None]:
# Accident totals per weekday. day_names is indexed by the 0-6 codes stored in
# accident_day_of_week; a weekday absent from the data is drawn with height 0.
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
accidents_by_day = df['accident_day_of_week'].value_counts().sort_index()

day_codes = range(7)
daily_totals = [accidents_by_day.get(code, 0) for code in day_codes]

fig, ax = plt.subplots(figsize=(12, 5))
ax.bar(day_codes, daily_totals, color='lightgreen')
ax.set(
    title='Number of Accidents by Day of Week',
    xlabel='Day of Week',
    ylabel='Number of Accidents',
)
ax.set_xticks(day_codes)
ax.set_xticklabels(day_names, rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Accident totals per time-of-day category, most frequent category first.
time_category_counts = df['accident_time_category'].value_counts()
print("Accidents by Time Category:", time_category_counts, sep="\n")

fig, ax = plt.subplots(figsize=(12, 5))
time_category_counts.plot.bar(ax=ax, color='orange')
ax.set(
    title='Number of Accidents by Time of Day',
    xlabel='Time Category',
    ylabel='Number of Accidents',
)
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

## 6. Location Analysis

In [None]:
# The 15 cities with the highest accident counts, largest first.
top_cities = df['city'].value_counts().head(15)
print("Top 15 Cities with Most Accidents:", top_cities, sep="\n")

fig, ax = plt.subplots(figsize=(12, 6))
top_cities.plot.barh(ax=ax, color='mediumpurple')
ax.set(
    title='Top 15 Cities with Most Accidents',
    xlabel='Number of Accidents',
    ylabel='City',
)
ax.invert_yaxis()  # put the city with the most accidents at the top
fig.tight_layout()
plt.show()

In [None]:
# Accident totals per region, most frequent region first.
region_counts = df['region'].value_counts()
print("Accidents by Region:", region_counts, sep="\n")

fig, ax = plt.subplots(figsize=(12, 5))
region_counts.plot.bar(ax=ax, color='teal')
ax.set(
    title='Number of Accidents by Region',
    xlabel='Region',
    ylabel='Number of Accidents',
)
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Accident totals per street type, most frequent type first.
street_type_counts = df['street_type'].value_counts()
print("Accidents by Street Type:", street_type_counts, sep="\n")

fig, ax = plt.subplots(figsize=(12, 5))
street_type_counts.plot.bar(ax=ax, color='salmon')
ax.set(
    title='Number of Accidents by Street Type',
    xlabel='Street Type',
    ylabel='Number of Accidents',
)
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

## 7. Driver Demographics Analysis

In [None]:
# Sum the per-accident driver counts over the whole dataset.
total_drivers = df['num_drivers_total'].sum()
print(f"Total drivers involved in accidents: {total_drivers}")

# Totals for each recorded gender bucket
male_drivers = df['num_drivers_male'].sum()
female_drivers = df['num_drivers_female'].sum()
unknown_gender = df['num_drivers_unknown'].sum()

# Shares are expressed relative to total_drivers, not to the sum of the
# three buckets.
print(f"\nDriver Gender Distribution:")
print(f"Male drivers: {male_drivers} ({male_drivers/total_drivers*100:.1f}%)")
print(f"Female drivers: {female_drivers} ({female_drivers/total_drivers*100:.1f}%)")
print(f"Unknown gender: {unknown_gender} ({unknown_gender/total_drivers*100:.1f}%)")

# Pie chart of the three gender buckets
gender_totals = [male_drivers, female_drivers, unknown_gender]
gender_labels = ['Male', 'Female', 'Unknown']
gender_colors = ['steelblue', 'pink', 'gray']

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    gender_totals,
    labels=gender_labels,
    autopct='%1.1f%%',
    startangle=90,
    colors=gender_colors,
)
ax.set_title('Driver Gender Distribution')
plt.show()

In [None]:
# Display label -> column holding the per-accident driver count for that
# age band.
age_group_columns = {
    'Under 18': 'num_drivers_under_18',
    '18-24': 'num_drivers_18_to_24',
    '25-49': 'num_drivers_25_to_49',
    '50-64': 'num_drivers_50_to_64',
    '65+': 'num_drivers_65_plus',
}
age_groups = {
    label: df[column].sum()
    for label, column in age_group_columns.items()
}

# Shares are relative to total_drivers, which the gender cell above defines.
print("Driver Age Group Distribution:")
for age_group, count in age_groups.items():
    print(f"{age_group}: {count} ({count/total_drivers*100:.1f}%)")

# Bar chart of drivers per age band, youngest band first
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(list(age_groups), list(age_groups.values()), color='lightcoral')
ax.set(
    title='Driver Age Group Distribution',
    xlabel='Age Group',
    ylabel='Number of Drivers',
)
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

## 8. Vehicle Type Analysis

In [None]:
# Vehicle type distribution: dataset-wide totals of the per-accident
# num_vehicle_* counters, keyed by display label.
vehicle_types = {
    'Car': df['num_vehicle_car'].sum(),
    'Motorbike': df['num_vehicle_motorbike'].sum(),
    'Van': df['num_vehicle_van'].sum(),
    'Bus': df['num_vehicle_bus'].sum(),
    'Bicycle': df['num_vehicle_bicycle'].sum(),
    'Pedestrian': df['num_vehicle_pedestrian'].sum(),
    'Unknown': df['num_vehicle_unknown'].sum()
}

# Denominator for the shares below. It does not change between iterations,
# so it is computed once here instead of inside the loop.
total_vehicles = sum(vehicle_types.values())

print("Vehicle Type Distribution:")
for vehicle, count in sorted(vehicle_types.items(), key=lambda x: x[1], reverse=True):
    print(f"{vehicle}: {count} ({count/total_vehicles*100:.1f}%)")

# Bar chart in dictionary order (not sorted by count)
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(list(vehicle_types), list(vehicle_types.values()), color='mediumseagreen')
ax.set_title('Vehicle Type Distribution')
ax.set_xlabel('Vehicle Type')
ax.set_ylabel('Count')
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

## 9. Injuries Analysis

In [None]:
# Headline injury figures: dataset-wide total and mean per accident.
injured_per_accident = df['total_injured']
total_injured = injured_per_accident.sum()
print(f"Total people injured: {total_injured}")
print(f"Average injuries per accident: {injured_per_accident.mean():.2f}")

# How many accidents had 0, 1, 2, ... injured people
fig, ax = plt.subplots(figsize=(12, 6))
injured_per_accident.value_counts().sort_index().plot.bar(ax=ax, color='indianred')
ax.set(
    title='Distribution of Number of Injuries per Accident',
    xlabel='Number of Injured',
    ylabel='Number of Accidents',
)
fig.tight_layout()
plt.show()

## 10. Weather Conditions Analysis

In [None]:
# Weather statistics
print("Weather Conditions Summary:")
# The three temperature figures come from three different columns: the lowest
# value of temperature_min, the highest value of temperature_max and the
# average of temperature_mean. The label says so explicitly; the previous
# "(mean)" label suggested that all three described temperature_mean.
print(f"Temperature (min/max/mean): Min={df['temperature_min'].min():.1f}°C, Max={df['temperature_max'].max():.1f}°C, Avg={df['temperature_mean'].mean():.1f}°C")
print(f"Precipitation (sum): Min={df['precipitation_sum'].min():.1f}mm, Max={df['precipitation_sum'].max():.1f}mm, Avg={df['precipitation_sum'].mean():.1f}mm")
print(f"Wind speed (max): Min={df['windspeed_max'].min():.1f}km/h, Max={df['windspeed_max'].max():.1f}km/h, Avg={df['windspeed_max'].mean():.1f}km/h")

# Accidents in rain vs no rain. Each count is looked up by its True/False
# label once and defaults to 0 when that label never occurs; the shares are
# relative to all rows.
rain_counts = df['is_raining'].value_counts()
rain_accidents = rain_counts.get(True, 0)
dry_accidents = rain_counts.get(False, 0)
print(f"\nAccidents during rain: {rain_accidents} ({rain_accidents/len(df)*100:.1f}%)")
print(f"Accidents without rain: {dry_accidents} ({dry_accidents/len(df)*100:.1f}%)")

In [None]:
# Visualize weather impact: three histograms plus a rain / no-rain pie.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Temperature distribution
axes[0, 0].hist(df['temperature_mean'], bins=30, color='orange', edgecolor='black')
axes[0, 0].set_title('Distribution of Mean Temperature')
axes[0, 0].set_xlabel('Temperature (°C)')
axes[0, 0].set_ylabel('Frequency')

# Precipitation distribution
axes[0, 1].hist(df['precipitation_sum'], bins=30, color='blue', edgecolor='black')
axes[0, 1].set_title('Distribution of Precipitation')
axes[0, 1].set_xlabel('Precipitation (mm)')
axes[0, 1].set_ylabel('Frequency')

# Wind speed distribution
axes[1, 0].hist(df['windspeed_max'], bins=30, color='green', edgecolor='black')
axes[1, 0].set_title('Distribution of Max Wind Speed')
axes[1, 0].set_xlabel('Wind Speed (km/h)')
axes[1, 0].set_ylabel('Frequency')

# Rain vs no rain.
# value_counts() orders its result by frequency, not by label, so passing
# positional labels to Series.plot would mislabel the slices whenever rainy
# accidents outnumber dry ones, and would fail when only one of the two
# labels is present. Each slice is therefore looked up by its label, using
# the same .get(..., 0) lookups as the printed rain statistics.
rain_slices = [rain_counts.get(False, 0), rain_counts.get(True, 0)]
axes[1, 1].pie(rain_slices, labels=['No Rain', 'Rain'],
               colors=['gold', 'skyblue'], autopct='%1.1f%%')
axes[1, 1].set_title('Accidents: Rain vs No Rain')

plt.tight_layout()
plt.show()

## 11. Traffic Level Analysis

In [None]:
# Accident totals per traffic level, most frequent level first.
traffic_level_counts = df['traffic_level'].value_counts()
print("Accidents by Traffic Level:", traffic_level_counts, sep="\n")

# Two panels: categorical traffic level (left) and the numeric traffic
# ratio (right).
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
level_ax, ratio_ax = axes

# Left panel: accidents per traffic level
traffic_level_counts.plot.bar(ax=level_ax, color='purple')
level_ax.set(
    title='Accidents by Traffic Level',
    xlabel='Traffic Level',
    ylabel='Number of Accidents',
)
level_ax.tick_params(axis='x', rotation=45)

# Right panel: 30-bin histogram of the traffic ratio
ratio_ax.hist(df['traffic_ratio'], bins=30, color='darkviolet', edgecolor='black')
ratio_ax.set(
    title='Distribution of Traffic Ratio',
    xlabel='Traffic Ratio',
    ylabel='Frequency',
)

fig.tight_layout()
plt.show()

## 12. Special Days Analysis (Weekend, Holidays, School Days)

In [None]:
# Tally each day-type flag column. The resulting series are looked up by
# True/False below, with 0 as the fallback when a label never occurs.
(weekend_counts,
 holiday_counts,
 school_holiday_counts,
 school_day_counts) = (
    df[flag_column].value_counts()
    for flag_column in (
        'is_weekend',
        'is_public_holiday_mt',
        'is_school_holiday_mt',
        'is_school_day_mt',
    )
)

# Every share below is relative to the total number of rows.
total_rows = len(df)

print("Accidents on Weekends:")
print(f"Weekend: {weekend_counts.get(True, 0)} ({weekend_counts.get(True, 0)/total_rows*100:.1f}%)")
print(f"Weekday: {weekend_counts.get(False, 0)} ({weekend_counts.get(False, 0)/total_rows*100:.1f}%)")

print("\nAccidents on Public Holidays:")
print(f"Public Holiday: {holiday_counts.get(True, 0)} ({holiday_counts.get(True, 0)/total_rows*100:.1f}%)")
print(f"Not Public Holiday: {holiday_counts.get(False, 0)} ({holiday_counts.get(False, 0)/total_rows*100:.1f}%)")

print("\nAccidents on School Holidays:")
print(f"School Holiday: {school_holiday_counts.get(True, 0)} ({school_holiday_counts.get(True, 0)/total_rows*100:.1f}%)")
print(f"Not School Holiday: {school_holiday_counts.get(False, 0)} ({school_holiday_counts.get(False, 0)/total_rows*100:.1f}%)")

print("\nAccidents on School Days:")
print(f"School Day: {school_day_counts.get(True, 0)} ({school_day_counts.get(True, 0)/total_rows*100:.1f}%)")
print(f"Not School Day: {school_day_counts.get(False, 0)} ({school_day_counts.get(False, 0)/total_rows*100:.1f}%)")

In [None]:
# Visualize special days: one pie per day-type flag.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# value_counts() orders its result by frequency, not by label. Passing
# positional labels to Series.plot would therefore mislabel the slices
# whenever the True group is the larger one, and would fail when only one of
# the two labels occurs. Each slice is looked up by its label instead, with
# the same .get(..., 0) lookups used for the printed statistics.
#
# Each spec: (axes, counts, title, (False label, colour), (True label, colour))
pie_specs = [
    (axes[0, 0], weekend_counts, 'Accidents: Weekend vs Weekday',
     ('Weekday', 'lightblue'), ('Weekend', 'salmon')),
    (axes[0, 1], holiday_counts, 'Accidents: Public Holiday vs Not',
     ('Not Holiday', 'lightgreen'), ('Holiday', 'orange')),
    (axes[1, 0], school_holiday_counts, 'Accidents: School Holiday vs Not',
     ('Not School Holiday', 'yellow'), ('School Holiday', 'purple')),
    (axes[1, 1], school_day_counts, 'Accidents: School Day vs Not',
     ('Not School Day', 'pink'), ('School Day', 'teal')),
]

for pie_ax, counts, title, (false_label, false_color), (true_label, true_color) in pie_specs:
    pie_ax.pie(
        [counts.get(False, 0), counts.get(True, 0)],
        labels=[false_label, true_label],
        colors=[false_color, true_color],
        autopct='%1.1f%%',
    )
    pie_ax.set_title(title)

plt.tight_layout()
plt.show()

## 13. Correlation Analysis

In [None]:
# Identifier and calendar-part columns are numeric but are left out of the
# correlation analysis.
exclude_cols = ['id', 'accident_date_id', 'accident_year', 'accident_month', 
                'accident_day_of_week', 'accident_hour']

# Keep every numeric-dtype column that is not on the exclusion list,
# preserving the frame's column order.
numerical_cols = [
    column_name
    for column_name in df.select_dtypes(include='number').columns
    if column_name not in exclude_cols
]

print(f"Number of numerical columns for correlation: {len(numerical_cols)}")

In [None]:
# Pairwise correlation between all selected numeric features
# (DataFrame.corr() with its default method).
correlation_matrix = df[numerical_cols].corr()

# Heatmap of the full matrix. Cell annotations are switched off, and the
# diverging colour map is centred on zero.
fig, ax = plt.subplots(figsize=(20, 16))
sns.heatmap(
    correlation_matrix,
    ax=ax,
    annot=False,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)
ax.set_title('Correlation Heatmap of Numerical Features', fontsize=16)
fig.tight_layout()
plt.show()

## 14. Key Insights and Summary

In [None]:
# Summary statistics: a plain-text recap of the headline numbers.
# Every aggregate that is printed more than once is computed a single time
# here instead of being re-evaluated inside each f-string. All values are
# derived directly from df, so this cell does not rely on variables created
# by the earlier analysis cells.
n_accidents = len(df)
city_ranking = df['city'].value_counts()
region_ranking = df['region'].value_counts()
weekend_total = df['is_weekend'].sum()
drivers_total = df['num_drivers_total'].sum()
drivers_male = df['num_drivers_male'].sum()
drivers_female = df['num_drivers_female'].sum()
rain_total = df['is_raining'].sum()
banner = "=" * 80

print(banner)
print("EXPLORATORY DATA ANALYSIS SUMMARY")
print(banner)
print("\n1. DATASET OVERVIEW")
print(f"   - Total accidents: {n_accidents}")
print(f"   - Total features: {df.shape[1]}")
print(f"   - Date range: {df['accident_datetime'].min()} to {df['accident_datetime'].max()}")

print("\n2. ACCIDENT SEVERITY")
for severity, count in df['accident_severity'].value_counts().items():
    print(f"   - {severity}: {count} ({count/n_accidents*100:.1f}%)")

print("\n3. CASUALTIES")
print(f"   - Total injured: {df['total_injured'].sum()}")
print(f"   - Average per accident: {df['total_injured'].mean():.2f}")

# value_counts() sorts by frequency, so position 0 is the most common value.
print("\n4. TOP LOCATIONS")
print(f"   - Top city: {city_ranking.index[0]} ({city_ranking.values[0]} accidents)")
print(f"   - Top region: {region_ranking.index[0]} ({region_ranking.values[0]} accidents)")

print("\n5. TIME PATTERNS")
print(f"   - Most accidents in: {df['accident_time_category'].value_counts().index[0]}")
print(f"   - Weekend accidents: {weekend_total} ({weekend_total/n_accidents*100:.1f}%)")

print("\n6. DRIVER DEMOGRAPHICS")
print(f"   - Total drivers: {drivers_total}")
print(f"   - Male drivers: {drivers_male} ({drivers_male/drivers_total*100:.1f}%)")
print(f"   - Female drivers: {drivers_female} ({drivers_female/drivers_total*100:.1f}%)")

print("\n7. VEHICLE TYPES")
print(f"   - Cars: {df['num_vehicle_car'].sum()}")
print(f"   - Motorbikes: {df['num_vehicle_motorbike'].sum()}")
print(f"   - Pedestrians involved: {df['num_vehicle_pedestrian'].sum()}")

print("\n8. WEATHER CONDITIONS")
print(f"   - Accidents in rain: {rain_total} ({rain_total/n_accidents*100:.1f}%)")
print(f"   - Average temperature: {df['temperature_mean'].mean():.1f}°C")

print("\n9. TRAFFIC CONDITIONS")
print(f"   - Average traffic ratio: {df['traffic_ratio'].mean():.2f}")
for level, count in df['traffic_level'].value_counts().items():
    print(f"   - {level}: {count} ({count/n_accidents*100:.1f}%)")

print("\n" + banner)