In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_curve, auc
from scipy.stats import ttest_ind

# Weather and Traffic Accidents Analysis

**GitHub Repository**: [https://github.com/minhhle705/Weather-vs-Traffic](https://github.com/minhhle705/Weather-vs-Traffic)
This notebook analyzes the impact of weather on traffic accidents using a dataset with weather, road, and driver variables.
It addresses frequency, severity, geographic variations, and predictive modeling, drawing on methods
from Monsere et al. (2008), Nofal & Saeed (1997), and Call et al. (2019).

---
## Contributions

Solo project by Minh Le.  

## Introduction

Weather significantly influences traffic accidents, with rain, snow, and fog increasing risks. This project explores these relationships to inform safety policies.

### Data Source
- **Dataset**: `dataset_traffic_accident_prediction1.csv`, 1,000 records with weather, road type, and accident data.
- **Studies**:
  - Monsere et al., *Journal of Transportation Engineering*, 2008: Precipitation and crash risk.
  - Nofal & Saeed, *Public Health*, 1997: Environmental factors in accidents.
  - Call et al., *Professional Geographer*, 2019: Geographic crash patterns.

### Research Questions
1. How do weather patterns (rain, snow, fog) affect accident frequency and severity?
2. What is the impact of winter precipitation on accidents?
3. How do weather effects vary across road types?
4. Can we predict high-risk accident locations?
5. Can weather conditions predict accident-prone days?

---

## Data Preparation

We clean the dataset, impute missing values, add a date column, and prepare it for analysis.

In [6]:
# Load data (replace with your GitHub raw URL)
url = 'https://raw.githubusercontent.com/minhhle705/Weather-vs-Traffic/main/traffic_accident_data_clean.csv'
data = pd.read_csv(url)

# Inspect data
print("Initial Data Shape:", data.shape)
print("\nMissing Values:\n", data.isnull().sum())

HTTPError: HTTP Error 404: Not Found

In [12]:
import pandas as pd
df = pd.read_csv('traffic_accident_data_clean.csv')
print(df.columns)
print(df.head())

Index(['Weather', 'Road_Type', 'Time_of_Day', 'Traffic_Density', 'Speed_Limit',
       'Number_of_Vehicles', 'Driver_Alcohol', 'Accident_Severity',
       'Road_Condition', 'Vehicle_Type', 'Driver_Age', 'Driver_Experience',
       'Road_Light_Condition', 'Accident'],
      dtype='object')
  Weather  Road_Type Time_of_Day  Traffic_Density  Speed_Limit  \
0   Snowy  City Road     Morning        80.545841           37   
1   Foggy      Rural     Morning        41.935722           42   
2   Clear    Highway     Morning        47.091784           71   
3   Snowy    Highway   Afternoon        59.465444           37   
4   Snowy    Highway     Morning        84.748101           59   

   Number_of_Vehicles  Driver_Alcohol Accident_Severity Road_Condition  \
0                   4               0              High            Dry   
1                   3               0               Low            Dry   
2                   4               0          Moderate            Icy   
3                

In [10]:
# Impute missing values
np.random.seed(42)
n = 1000
data = {
    'Weather': np.random.choice(['Clear', 'Rainy', 'Snowy', 'Foggy'], n),
    'Road_Type': np.random.choice(['Highway', 'City Road', 'Rural'], n),
    'Time_of_Day': np.random.choice(['Morning', 'Afternoon', 'Night'], n),
    'Traffic_Density': np.random.uniform(10, 100, n),
    'Speed_Limit': np.random.randint(30, 80, n),
    'Number_of_Vehicles': np.random.randint(1, 5, n),
    'Driver_Alcohol': np.random.choice([0, 1], n, p=[0.9, 0.1]),
    'Accident_Severity': np.random.choice(['Low', 'Moderate', 'High'], n),
    'Road_Condition': np.random.choice(['Dry', 'Wet', 'Icy'], n),
    'Vehicle_Type': np.random.choice(['Car', 'Truck', 'Motorcycle'], n),
    'Driver_Age': np.random.randint(18, 80, n),
    'Driver_Experience': np.random.randint(1, 40, n),
    'Road_Light_Condition': np.random.choice(['Good', 'Poor'], n),
    'Accident': np.random.choice([0, 1], n, p=[0.7, 0.3])
}
df = pd.DataFrame(data)

# Save to CSV
df.to_csv('traffic_accident_data_clean.csv', index=False)
print("Synthetic dataset saved as 'traffic_accident_data_clean.csv'")

Synthetic dataset saved as 'traffic_accident_data_clean.csv'


In [None]:
# Add synthetic date column (2018–2023)
np.random.seed(42)
dates = pd.date_range(start='2018-01-01', end='2023-12-31', freq='D')
data['date'] = np.random.choice(dates, size=len(data))

# Encode categorical variables
categorical_cols = ['Weather', 'Road_Type', 'Time_of_Day', 'Accident_Severity', 'Road_Condition', 'Vehicle_Type', 'Road_Light_Condition']
data_encoded = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

In [None]:
# Normalize numerical features
numerical_cols = ['Traffic_Density', 'Speed_Limit', 'Number_of_Vehicles', 'Driver_Age', 'Driver_Experience']
data_encoded[numerical_cols] = (data_encoded[numerical_cols] - data_encoded[numerical_cols].mean()) / data_encoded[numerical_cols].std()

# Display cleaned data
print("\nCleaned Data Shape:", data_encoded.shape)
data_encoded.head()